fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store to or load from
  35  * data shared between the kernel and application. This is done both
  36  * for ordering purposes and to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/percpu.h>
  59 #include <linux/slab.h>
  60 #include <linux/kthread.h>
  61 #include <linux/blkdev.h>
  62 #include <linux/bvec.h>
  63 #include <linux/net.h>
  64 #include <net/sock.h>
  65 #include <net/af_unix.h>
  66 #include <net/scm.h>
  67 #include <linux/anon_inodes.h>
  68 #include <linux/sched/mm.h>
  69 #include <linux/uaccess.h>
  70 #include <linux/nospec.h>
  71 #include <linux/sizes.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/highmem.h>
  74 #include <linux/namei.h>
  75 #include <linux/fsnotify.h>
  76 #include <linux/fadvise.h>
  77 #include <linux/eventpoll.h>
  78 #include <linux/fs_struct.h>
  79 #include <linux/splice.h>
  80 #include <linux/task_work.h>
  81
  82 #define CREATE_TRACE_POINTS
  83 #include <trace/events/io_uring.h>
  84
  85 #include <uapi/linux/io_uring.h>
  86
  87 #include "internal.h"
  88 #include "io-wq.h"
  89
  90 #define IORING_MAX_ENTRIES      32768   /* presumably the cap on SQ ring entries; the check is outside this view */
  91 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)        /* twice IORING_MAX_ENTRIES, i.e. 65536 */
  92
  93 /*
  94  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
  95  */
  96 #define IORING_FILE_TABLE_SHIFT 9       /* log2 of the number of entries in one table */
  97 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT) /* 512 entries per table */
  98 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)    /* 511: low-bit mask matching one table's size */
  99 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)   /* 64 tables' worth, i.e. 32768 */
 100
 101 struct io_uring {
             /*
              * Head/tail index pair for a single ring; struct io_rings embeds
              * one instance for the SQ and one for the CQ. Each index is
              * tagged ____cacheline_aligned_in_smp - presumably so that the
              * two indices sit on separate cachelines on SMP builds (confirm
              * against the macro definition).
              */
 102         u32 head ____cacheline_aligned_in_smp;
 103         u32 tail ____cacheline_aligned_in_smp;
 104 };
 105
 106 /*
 107  * This data is shared with the application through the mmap at offsets
 108  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 109  *
 110  * The offsets to the member fields are published through struct
 111  * io_sqring_offsets and struct io_cqring_offsets when calling io_uring_setup.
 112  */
 113 struct io_rings {
             /*
              * Layout: a fixed set of header fields followed by the CQE array
              * as a flexible array member, so 'cqes' has to stay the last
              * member of this struct.
              */
 114         /*
 115          * Head and tail offsets into the ring; the offsets need to be
 116          * masked to get valid indices.
 117          *
 118          * The kernel controls head of the sq ring and the tail of the cq ring,
 119          * and the application controls tail of the sq ring and the head of the
 120          * cq ring.
 121          */
 122         struct io_uring         sq, cq;
 123         /*
 124          * Bitmasks to apply to head and tail offsets (constant, equals
 125          * ring_entries - 1)
 126          */
 127         u32                     sq_ring_mask, cq_ring_mask;
 128         /* Ring sizes (constant, power of 2) */
 129         u32                     sq_ring_entries, cq_ring_entries;
 130         /*
 131          * Number of invalid entries dropped by the kernel due to
 132          * invalid index stored in array
 133          *
 134          * Written by the kernel, shouldn't be modified by the
 135          * application (i.e. get number of "new events" by comparing to
 136          * cached value).
 137          *
 138          * After a new SQ head value was read by the application this
 139          * counter includes all submissions that were dropped reaching
 140          * the new SQ head (and possibly more).
 141          */
 142         u32                     sq_dropped;
 143         /*
 144          * Runtime SQ flags
 145          *
 146          * Written by the kernel, shouldn't be modified by the
 147          * application.
 148          *
 149          * The application needs a full memory barrier before checking
 150          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 151          */
 152         u32                     sq_flags;
 153         /*
 154          * Runtime CQ flags
 155          *
 156          * Written by the application, shouldn't be modified by the
 157          * kernel.
 158          */
 159         u32                     cq_flags;
 160         /*
 161          * Number of completion events lost because the queue was full;
 162          * this should be avoided by the application by making sure
 163          * there are not more requests pending than there is space in
 164          * the completion queue.
 165          *
 166          * Written by the kernel, shouldn't be modified by the
 167          * application (i.e. get number of "new events" by comparing to
 168          * cached value).
 169          *
 170          * As completion events come in out of order this counter is not
 171          * ordered with any other data.
 172          */
 173         u32                     cq_overflow;
 174         /*
 175          * Ring buffer of completion events.
 176          *
 177          * The kernel writes completion events fresh every time they are
 178          * produced, so the application is allowed to modify pending
 179          * entries.
 180          */
 181         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;    /* flexible array member: must remain last */
 182 };
 183
 184 struct io_mapped_ubuf {
             /*
              * Element type of io_ring_ctx::user_bufs, which that struct
              * describes as the "fixed mapped user buffers".
              */
 185         u64             ubuf;           /* presumably the user-space start address - confirm at the registration site */
 186         size_t          len;            /* presumably the buffer length in bytes - confirm */
 187         struct          bio_vec *bvec;  /* bio_vec array; nr_bvecs presumably counts its entries */
 188         unsigned int    nr_bvecs;
 189 };
 190
 191 struct fixed_file_table {
             /*
              * One table of struct file pointers; fixed_file_data::table points
              * at these. NOTE(review): presumably each table holds
              * IORING_MAX_FILES_TABLE entries - confirm at the allocation site.
              */
 192         struct file             **files;
 193 };
 194
 195 struct fixed_file_ref_node {
             /*
              * Reference-counted node tied to a fixed_file_data instance via
              * 'file_data'. NOTE(review): 'node' presumably links into
              * fixed_file_data::ref_list and 'llist' into
              * io_ring_ctx::file_put_llist - confirm against the users, which
              * are outside this view.
              */
 196         struct percpu_ref               refs;
 197         struct list_head                node;
 198         struct list_head                file_list;
 199         struct fixed_file_data          *file_data;     /* back-pointer to the owning fixed-file state */
 200         struct llist_node               llist;
 201 };
 202
 203 struct fixed_file_data {
             /*
              * Fixed ("registered") file set state; io_ring_ctx::file_data
              * points at one of these, and the comment on that member spells
              * out the ->refs rules for readers and writers.
              */
 204         struct fixed_file_table         *table;         /* array of tables, each holding struct file pointers */
 205         struct io_ring_ctx              *ctx;           /* back-pointer to the ring context */
 206
 207         struct percpu_ref               *cur_refs;      /* presumably the refs of the current fixed_file_ref_node - confirm */
 208         struct percpu_ref               refs;
 209         struct completion               done;
 210         struct list_head                ref_list;
 211         spinlock_t                      lock;
 212 };
 213
 214 struct io_buffer {
             /*
              * List-linked buffer descriptor; io_sr_msg::kbuf points at one.
              * The addr/len/bid members use the same types as the same-named
              * members of struct io_provide_buf. NOTE(review): presumably one
              * entry per provided buffer, tracked through
              * io_ring_ctx::io_buffer_idr - confirm.
              */
 215         struct list_head list;
 216         __u64 addr;
 217         __s32 len;
 218         __u16 bid;
 219 };
 220
 221 struct io_ring_ctx {
             /*
              * Kernel-side context referenced by every request (io_kiocb::ctx)
              * and by the fixed-file state (fixed_file_data::ctx). Members are
              * grouped into anonymous structs, each tagged
              * ____cacheline_aligned_in_smp - presumably so that groups used
              * by different paths start on their own cacheline (confirm
              * against the macro definition).
              */
 222         struct {
 223                 struct percpu_ref       refs;
 224         } ____cacheline_aligned_in_smp;
 225
 226         struct {
 227                 unsigned int            flags;
 228                 unsigned int            compat: 1;
 229                 unsigned int            account_mem: 1;
 230                 unsigned int            cq_overflow_flushed: 1;
 231                 unsigned int            drain_next: 1;
 232                 unsigned int            eventfd_async: 1;
 233
 234                 /*
 235                  * Ring buffer of indices into array of io_uring_sqe, which is
 236                  * mmapped by the application using the IORING_OFF_SQES offset.
 237                  *
 238                  * This indirection could e.g. be used to assign fixed
 239                  * io_uring_sqe entries to operations and only submit them to
 240                  * the queue when needed.
 241                  *
 242                  * The kernel modifies neither the indices array nor the entries
 243                  * array.
 244                  */
 245                 u32                     *sq_array;
 246                 unsigned                cached_sq_head;
 247                 unsigned                sq_entries;
 248                 unsigned                sq_mask;
 249                 unsigned                sq_thread_idle;
 250                 unsigned                cached_sq_dropped;
 251                 atomic_t                cached_cq_overflow;
 252                 unsigned long           sq_check_overflow;
 253
 254                 struct list_head        defer_list;
 255                 struct list_head        timeout_list;
 256                 struct list_head        cq_overflow_list;
 257
 258                 wait_queue_head_t       inflight_wait;
 259                 struct io_uring_sqe     *sq_sqes;
 260         } ____cacheline_aligned_in_smp;
 261
 262         struct io_rings *rings;         /* ring memory laid out as struct io_rings */
 263
 264         /* IO offload */
 265         struct io_wq            *io_wq;
 266         struct task_struct      *sqo_thread;    /* if using sq thread polling */
 267         struct mm_struct        *sqo_mm;
 268         wait_queue_head_t       sqo_wait;
 269
 270         /*
 271          * If used, fixed file set. Writers must ensure that ->refs is dead,
 272          * readers must ensure that ->refs is alive as long as the file* is
 273          * used. Only updated through io_uring_register(2).
 274          */
 275         struct fixed_file_data  *file_data;
 276         unsigned                nr_user_files;
 277         int                     ring_fd;
 278         struct file             *ring_file;
 279
 280         /* if used, fixed mapped user buffers */
 281         unsigned                nr_user_bufs;
 282         struct io_mapped_ubuf   *user_bufs;
 283
 284         struct user_struct      *user;
 285
 286         const struct cred       *creds;
 287
 288         struct completion       ref_comp;
 289         struct completion       sq_thread_comp;
 290
 291         /* if all else fails... */
 292         struct io_kiocb         *fallback_req;
 293
 294 #if defined(CONFIG_UNIX)
 295         struct socket           *ring_sock;
 296 #endif
 297
 298         struct idr              io_buffer_idr;
 299
 300         struct idr              personality_idr;
 301
 302         struct {
 303                 unsigned                cached_cq_tail;
 304                 unsigned                cq_entries;
 305                 unsigned                cq_mask;
 306                 atomic_t                cq_timeouts;
 307                 unsigned long           cq_check_overflow;
 308                 struct wait_queue_head  cq_wait;
 309                 struct fasync_struct    *cq_fasync;
 310                 struct eventfd_ctx      *cq_ev_fd;
 311         } ____cacheline_aligned_in_smp;
 312
 313         struct {
 314                 struct mutex            uring_lock;
 315                 wait_queue_head_t       wait;
 316         } ____cacheline_aligned_in_smp;
 317
 318         struct {
 319                 spinlock_t              completion_lock;
 320
 321                 /*
 322                  * ->poll_list is protected by the ctx->uring_lock for
 323                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 324                  * For SQPOLL, only the single threaded io_sq_thread() will
 325                  * manipulate the list, hence no extra locking is needed there.
 326                  */
 327                 struct list_head        poll_list;
 328                 struct hlist_head       *cancel_hash;
 329                 unsigned                cancel_hash_bits;
 330                 bool                    poll_multi_file;
 331
 332                 spinlock_t              inflight_lock;
 333                 struct list_head        inflight_list;
 334         } ____cacheline_aligned_in_smp;
 335
 336         struct delayed_work             file_put_work;
 337         struct llist_head               file_put_llist;
 338
 339         struct work_struct              exit_work;
 340 };
 341
 342 /*
 343  * First field must be the file pointer in all the
 344  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 345  */
 346 struct io_poll_iocb {
             /*
              * Poll state; used as the 'poll' member of the union at the top
              * of struct io_kiocb and embedded in struct async_poll.
              */
 347         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 348         union {
 349                 struct wait_queue_head  *head;
 350                 u64                     addr;
 351         };
 352         __poll_t                        events;
 353         bool                            done;
 354         bool                            canceled;
 355         struct wait_queue_entry         wait;
 356 };
 357
 358 struct io_close {
             /*
              * Request state stored as the 'close' member of the union at the
              * top of struct io_kiocb.
              */
 359         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 360         struct file                     *put_file;
 361         int                             fd;
 362 };
 363
 364 struct io_timeout_data {
             /*
              * Timer state stored as the 'timeout' member of the union in
              * struct io_async_ctx, i.e. reached through io_kiocb::io rather
              * than through the inline request union.
              */
 365         struct io_kiocb                 *req;           /* back-pointer to the request */
 366         struct hrtimer                  timer;
 367         struct timespec64               ts;
 368         enum hrtimer_mode               mode;
 369 };
 370
 371 struct io_accept {
             /*
              * Request state stored as the 'accept' member of the union at the
              * top of struct io_kiocb.
              */
 372         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 373         struct sockaddr __user          *addr;          /* user-space pointer (__user) */
 374         int __user                      *addr_len;      /* user-space pointer (__user) */
 375         int                             flags;
 376         unsigned long                   nofile;
 377 };
 378
 379 struct io_sync {
             /*
              * Request state stored as the 'sync' member of the union at the
              * top of struct io_kiocb.
              */
 380         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 381         loff_t                          len;
 382         loff_t                          off;
 383         int                             flags;
 384         int                             mode;
 385 };
 386
 387 struct io_cancel {
             /*
              * Request state stored as the 'cancel' member of the union at the
              * top of struct io_kiocb.
              */
 388         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 389         u64                             addr;
 390 };
 391
 392 struct io_timeout {
             /*
              * Request state stored as the 'timeout' member of the union at
              * the top of struct io_kiocb. Distinct from struct
              * io_timeout_data, which holds the timer and lives in struct
              * io_async_ctx.
              */
 393         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 394         u64                             addr;
 395         int                             flags;
 396         u32                             off;
 397         u32                             target_seq;
 398 };
 399
 400 struct io_rw {
             /*
              * Request state stored as the 'rw' member of the union at the top
              * of struct io_kiocb. Unlike its siblings it has no explicit file
              * member; the embedded kiocb supplies it (see NOTE below).
              */
 401         /* NOTE: kiocb has the file as the first member, so don't do it here */
 402         struct kiocb                    kiocb;
 403         u64                             addr;
 404         u64                             len;
 405 };
 406
 407 struct io_connect {
             /*
              * Request state stored as the 'connect' member of the union at
              * the top of struct io_kiocb.
              */
 408         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 409         struct sockaddr __user          *addr;          /* user-space pointer (__user) */
 410         int                             addr_len;
 411 };
 412
 413 struct io_sr_msg {
             /*
              * Request state stored as the 'sr_msg' member of the union at the
              * top of struct io_kiocb.
              */
 414         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 415         union {
                     /* both alternatives are user-space pointers; which one is live presumably depends on the opcode - confirm */
 416                 struct user_msghdr __user *msg;
 417                 void __user             *buf;
 418         };
 419         int                             msg_flags;
 420         int                             bgid;
 421         size_t                          len;
 422         struct io_buffer                *kbuf;
 423 };
 424
 425 struct io_open {
             /*
              * Request state stored as the 'open' member of the union at the
              * top of struct io_kiocb.
              */
 426         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 427         int                             dfd;
 428         struct filename                 *filename;
 429         struct open_how                 how;
 430         unsigned long                   nofile;
 431 };
 432
 433 struct io_files_update {
             /*
              * Request state stored as the 'files_update' member of the union
              * at the top of struct io_kiocb.
              */
 434         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 435         u64                             arg;
 436         u32                             nr_args;
 437         u32                             offset;
 438 };
 439
 440 struct io_fadvise {
             /*
              * Request state stored as the 'fadvise' member of the union at
              * the top of struct io_kiocb.
              */
 441         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 442         u64                             offset;
 443         u32                             len;
 444         u32                             advice;
 445 };
 446
 447 struct io_madvise {
             /*
              * Request state stored as the 'madvise' member of the union at
              * the top of struct io_kiocb.
              */
 448         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 449         u64                             addr;
 450         u32                             len;
 451         u32                             advice;
 452 };
 453
 454 struct io_epoll {
             /*
              * Request state stored as the 'epoll' member of the union at the
              * top of struct io_kiocb.
              */
 455         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 456         int                             epfd;
 457         int                             op;
 458         int                             fd;
 459         struct epoll_event              event;
 460 };
 461
 462 struct io_splice {
             /*
              * Request state stored as the 'splice' member of the union at the
              * top of struct io_kiocb.
              */
 463         struct file                     *file_out;      /* first member, so this is the one that overlays io_kiocb::file */
 464         struct file                     *file_in;
 465         loff_t                          off_out;
 466         loff_t                          off_in;
 467         u64                             len;
 468         unsigned int                    flags;
 469 };
 470
 471 struct io_provide_buf {
             /*
              * Request state stored as the 'pbuf' member of the union at the
              * top of struct io_kiocb. addr/len/bid use the same types as the
              * same-named members of struct io_buffer.
              */
 472         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 473         __u64                           addr;
 474         __s32                           len;
 475         __u32                           bgid;
 476         __u16                           nbufs;
 477         __u16                           bid;
 478 };
 479
 480 struct io_statx {
             /*
              * Request state stored as the 'statx' member of the union at the
              * top of struct io_kiocb.
              */
 481         struct file                     *file;          /* first, so it overlays io_kiocb::file */
 482         int                             dfd;
 483         unsigned int                    mask;
 484         unsigned int                    flags;
 485         const char __user               *filename;      /* user-space pointer (__user) */
 486         struct statx __user             *buffer;        /* user-space pointer (__user) */
 487 };
 488
 489 struct io_async_connect {
             /* Stored as the 'connect' member of the union in struct io_async_ctx. */
 490         struct sockaddr_storage         address;
 491 };
 492
 493 struct io_async_msghdr {
             /* Stored as the 'msg' member of the union in struct io_async_ctx. */
 494         struct iovec                    fast_iov[UIO_FASTIOV];  /* inline iovec storage, UIO_FASTIOV entries */
 495         struct iovec                    *iov;           /* presumably fast_iov or a separately allocated array - confirm */
 496         struct sockaddr __user          *uaddr;         /* user-space pointer (__user) */
 497         struct msghdr                   msg;
 498         struct sockaddr_storage         addr;
 499 };
 500
 501 struct io_async_rw {
             /* Stored as the 'rw' member of the union in struct io_async_ctx. */
 502         struct iovec                    fast_iov[UIO_FASTIOV];  /* inline iovec storage, UIO_FASTIOV entries */
 503         struct iovec                    *iov;           /* presumably fast_iov or a separately allocated array - confirm */
 504         ssize_t                         nr_segs;
 505         ssize_t                         size;
 506 };
 507
 508 struct io_async_ctx {
             /*
              * Out-of-line per-request state; io_kiocb::io points at one of
              * these. Only one union member is meaningful for a given request.
              * The async_ctx bit in struct io_op_def marks the opcodes that
              * need it allocated.
              */
 509         union {
 510                 struct io_async_rw      rw;
 511                 struct io_async_msghdr  msg;
 512                 struct io_async_connect connect;
 513                 struct io_timeout_data  timeout;
 514         };
 515 };
 516
 517 enum {
             /*
              * Bit numbers for the REQ_F_* masks built with BIT() in the enum
              * that follows. The first six are pinned to the matching
              * IOSQE_*_BIT constants (defined outside this view); the rest
              * are auto-numbered upwards from REQ_F_BUFFER_SELECT_BIT + 1.
              */
 518         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
 519         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 520         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 521         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 522         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 523         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 524
 525         REQ_F_LINK_HEAD_BIT,
 526         REQ_F_LINK_NEXT_BIT,
 527         REQ_F_FAIL_LINK_BIT,
 528         REQ_F_INFLIGHT_BIT,
 529         REQ_F_CUR_POS_BIT,
 530         REQ_F_NOWAIT_BIT,
 531         REQ_F_LINK_TIMEOUT_BIT,
 532         REQ_F_TIMEOUT_BIT,
 533         REQ_F_ISREG_BIT,
 534         REQ_F_MUST_PUNT_BIT,
 535         REQ_F_TIMEOUT_NOSEQ_BIT,
 536         REQ_F_COMP_LOCKED_BIT,
 537         REQ_F_NEED_CLEANUP_BIT,
 538         REQ_F_OVERFLOW_BIT,
 539         REQ_F_POLLED_BIT,
 540         REQ_F_BUFFER_SELECTED_BIT,
 541         REQ_F_NO_FILE_TABLE_BIT,
 542         REQ_F_QUEUE_TIMEOUT_BIT,
 543         REQ_F_WORK_INITIALIZED_BIT,
 544
 545         /* not a real bit, just to check we're not overflowing the space */
 546         __REQ_F_LAST_BIT,       /* keep last: new bits go above this line */
 547 };
 548
 549 enum {
             /*
              * Flag masks, one per REQ_F_*_BIT number above, each formed with
              * BIT(). NOTE(review): presumably these are the values stored in
              * io_kiocb::flags - confirm against the users.
              */
 550         /* ctx owns file */
 551         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 552         /* drain existing IO first */
 553         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 554         /* linked sqes */
 555         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 556         /* doesn't sever on completion < 0 */
 557         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 558         /* IOSQE_ASYNC */
 559         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 560         /* IOSQE_BUFFER_SELECT */
 561         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 562
 563         /* head of a link */
 564         REQ_F_LINK_HEAD         = BIT(REQ_F_LINK_HEAD_BIT),
 565         /* already grabbed next link */
 566         REQ_F_LINK_NEXT         = BIT(REQ_F_LINK_NEXT_BIT),
 567         /* fail rest of links */
 568         REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
 569         /* on inflight list */
 570         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 571         /* read/write uses file position */
 572         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 573         /* must not punt to workers */
 574         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 575         /* has linked timeout */
 576         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 577         /* timeout request */
 578         REQ_F_TIMEOUT           = BIT(REQ_F_TIMEOUT_BIT),
 579         /* regular file */
 580         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 581         /* must be punted even for NONBLOCK */
 582         REQ_F_MUST_PUNT         = BIT(REQ_F_MUST_PUNT_BIT),
 583         /* no timeout sequence */
 584         REQ_F_TIMEOUT_NOSEQ     = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
 585         /* completion under lock */
 586         REQ_F_COMP_LOCKED       = BIT(REQ_F_COMP_LOCKED_BIT),
 587         /* needs cleanup */
 588         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 589         /* in overflow list */
 590         REQ_F_OVERFLOW          = BIT(REQ_F_OVERFLOW_BIT),
 591         /* already went through poll handler */
 592         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 593         /* buffer already selected */
 594         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 595         /* doesn't need file table for this request */
 596         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
 597         /* needs to queue linked timeout */
 598         REQ_F_QUEUE_TIMEOUT     = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
 599         /* io_wq_work is initialized */
 600         REQ_F_WORK_INITIALIZED  = BIT(REQ_F_WORK_INITIALIZED_BIT),
 601 };
 602
 603 struct async_poll {
             /*
              * Pointed to by io_kiocb::apoll. NOTE(review): 'work' is
              * presumably the saved copy of io_kiocb::work, since apoll shares
              * a union with that member and the comment there says the work is
              * restored if needed - confirm against the arm/complete paths.
              */
 604         struct io_poll_iocb     poll;
 605         struct io_wq_work       work;
 606 };
 607
 608 /*
 609  * NOTE! Each of the iocb union members has the file pointer
 610  * as the first entry in their struct definition. So you can
 611  * access the file pointer through any of the sub-structs,
 612  * or directly as just 'file' in this struct.
 613  */
 614 struct io_kiocb {
             /*
              * One request. The leading union holds the opcode-specific state
              * inline; larger state hangs off 'io' (struct io_async_ctx). The
              * trailing union overlays the poll bookkeeping with the io-wq
              * work item, so the two cannot be in use at the same time.
              */
 615         union {
 616                 struct file             *file;
 617                 struct io_rw            rw;
 618                 struct io_poll_iocb     poll;
 619                 struct io_accept        accept;
 620                 struct io_sync          sync;
 621                 struct io_cancel        cancel;
 622                 struct io_timeout       timeout;
 623                 struct io_connect       connect;
 624                 struct io_sr_msg        sr_msg;
 625                 struct io_open          open;
 626                 struct io_close         close;
 627                 struct io_files_update  files_update;
 628                 struct io_fadvise       fadvise;
 629                 struct io_madvise       madvise;
 630                 struct io_epoll         epoll;
 631                 struct io_splice        splice;
 632                 struct io_provide_buf   pbuf;
 633                 struct io_statx         statx;
 634         };
 635
 636         struct io_async_ctx             *io;
 637         int                             cflags;
 638         u8                              opcode;         /* presumably an IORING_OP_* value indexing io_op_defs[] - confirm */
 639         /* polled IO has completed */
 640         u8                              iopoll_completed;
 641
 642         u16                             buf_index;
 643
 644         struct io_ring_ctx      *ctx;           /* ring context this request refers to */
 645         struct list_head        list;
 646         unsigned int            flags;          /* presumably a mask of REQ_F_* values - confirm */
 647         refcount_t              refs;
 648         struct task_struct      *task;
 649         unsigned long           fsize;
 650         u64                     user_data;
 651         u32                     result;
 652         u32                     sequence;
 653
 654         struct list_head        link_list;
 655
 656         struct list_head        inflight_entry;
 657
 658         struct percpu_ref       *fixed_file_refs;
 659
 660         union {
 661                 /*
 662                  * Only commands that never go async can use the below fields,
 663                  * obviously. Right now only IORING_OP_POLL_ADD uses them, and
 664                  * async armed poll handlers for regular commands. The latter
 665                  * restore the work, if needed.
 666                  */
 667                 struct {
 668                         struct callback_head    task_work;
 669                         struct hlist_node       hash_node;
 670                         struct async_poll       *apoll;
 671                 };
 672                 struct io_wq_work       work;
 673         };
 674 };
 675
 676 #define IO_PLUG_THRESHOLD               2       /* NOTE(review): users are outside this view; presumably the batch size at which a blk_plug is started - confirm */
 677 #define IO_IOPOLL_BATCH                 8       /* also sizes the io_submit_state::reqs[] allocation cache */
 678
 679 struct io_submit_state {
             /*
              * Scratch state bundling a block plug, a small cache of
              * preallocated requests and a cached file reference.
              * NOTE(review): presumably lives for one submission batch -
              * confirm against the submit path, which is outside this view.
              */
 680         struct blk_plug         plug;
 681
 682         /*
 683          * io_kiocb alloc cache
 684          */
 685         void                    *reqs[IO_IOPOLL_BATCH];
 686         unsigned int            free_reqs;      /* presumably the number of unused entries left in reqs[] - confirm */
 687
 688         /*
 689          * File reference cache
 690          */
 691         struct file             *file;
 692         unsigned int            fd;
 693         unsigned int            has_refs;
 694         unsigned int            used_refs;
 695         unsigned int            ios_left;
 696 };
 697
 698 struct io_op_def {
             /*
              * Static per-opcode properties, all single-bit flags. The
              * io_op_defs[] table holds one of these per IORING_OP_* opcode,
              * filled in with designated initializers; any flag an entry does
              * not name is therefore zero.
              */
 699         /* needs req->io allocated for deferral/async */
 700         unsigned                async_ctx : 1;
 701         /* needs current->mm setup, does mm access */
 702         unsigned                needs_mm : 1;
 703         /* needs req->file assigned */
 704         unsigned                needs_file : 1;
 705         /* don't fail if file grab fails */
 706         unsigned                needs_file_no_error : 1;
 707         /* hash wq insertion if file is a regular file */
 708         unsigned                hash_reg_file : 1;
 709         /* unbound wq insertion if file is a non-regular file */
 710         unsigned                unbound_nonreg_file : 1;
 711         /* opcode is not supported by this kernel */
 712         unsigned                not_supported : 1;
 713         /* needs file table */
 714         unsigned                file_table : 1;
 715         /* needs ->fs */
 716         unsigned                needs_fs : 1;
 717         /* set if opcode supports polled "wait" */
 718         unsigned                pollin : 1;
 719         unsigned                pollout : 1;
 720         /* op supports buffer selection */
 721         unsigned                buffer_select : 1;
 722 };
 723
 724 static const struct io_op_def io_op_defs[] = {
 725         [IORING_OP_NOP] = {},
 726         [IORING_OP_READV] = {
 727                 .async_ctx              = 1,
 728                 .needs_mm               = 1,
 729                 .needs_file             = 1,
 730                 .unbound_nonreg_file    = 1,
 731                 .pollin                 = 1,
 732                 .buffer_select          = 1,
 733         },
 734         [IORING_OP_WRITEV] = {
 735                 .async_ctx              = 1,
 736                 .needs_mm               = 1,
 737                 .needs_file             = 1,
 738                 .hash_reg_file          = 1,
 739                 .unbound_nonreg_file    = 1,
 740                 .pollout                = 1,
 741         },
 742         [IORING_OP_FSYNC] = {
 743                 .needs_file             = 1,
 744         },
 745         [IORING_OP_READ_FIXED] = {
 746                 .needs_file             = 1,
 747                 .unbound_nonreg_file    = 1,
 748                 .pollin                 = 1,
 749         },
 750         [IORING_OP_WRITE_FIXED] = {
 751                 .needs_file             = 1,
 752                 .hash_reg_file          = 1,
 753                 .unbound_nonreg_file    = 1,
 754                 .pollout                = 1,
 755         },
 756         [IORING_OP_POLL_ADD] = {
 757                 .needs_file             = 1,
 758                 .unbound_nonreg_file    = 1,
 759         },
 760         [IORING_OP_POLL_REMOVE] = {},
 761         [IORING_OP_SYNC_FILE_RANGE] = {
 762                 .needs_file             = 1,
 763         },
 764         [IORING_OP_SENDMSG] = {
 765                 .async_ctx              = 1,
 766                 .needs_mm               = 1,
 767                 .needs_file             = 1,
 768                 .unbound_nonreg_file    = 1,
 769                 .needs_fs               = 1,
 770                 .pollout                = 1,
 771         },
 772         [IORING_OP_RECVMSG] = {
 773                 .async_ctx              = 1,
 774                 .needs_mm               = 1,
 775                 .needs_file             = 1,
 776                 .unbound_nonreg_file    = 1,
 777                 .needs_fs               = 1,
 778                 .pollin                 = 1,
 779                 .buffer_select          = 1,
 780         },
 781         [IORING_OP_TIMEOUT] = {
 782                 .async_ctx              = 1,
 783                 .needs_mm               = 1,
 784         },
 785         [IORING_OP_TIMEOUT_REMOVE] = {},
 786         [IORING_OP_ACCEPT] = {
 787                 .needs_mm               = 1,
 788                 .needs_file             = 1,
 789                 .unbound_nonreg_file    = 1,
 790                 .file_table             = 1,
 791                 .pollin                 = 1,
 792         },
 793         [IORING_OP_ASYNC_CANCEL] = {},
 794         [IORING_OP_LINK_TIMEOUT] = {
 795                 .async_ctx              = 1,
 796                 .needs_mm               = 1,
 797         },
 798         [IORING_OP_CONNECT] = {
 799                 .async_ctx              = 1,
 800                 .needs_mm               = 1,
 801                 .needs_file             = 1,
 802                 .unbound_nonreg_file    = 1,
 803                 .pollout                = 1,
 804         },
 805         [IORING_OP_FALLOCATE] = {
 806                 .needs_file             = 1,
 807         },
 808         [IORING_OP_OPENAT] = {
 809                 .file_table             = 1,
 810                 .needs_fs               = 1,
 811         },
 812         [IORING_OP_CLOSE] = {
 813                 .needs_file             = 1,
 814                 .needs_file_no_error    = 1,
 815                 .file_table             = 1,
 816         },
 817         [IORING_OP_FILES_UPDATE] = {
 818                 .needs_mm               = 1,
 819                 .file_table             = 1,
 820         },
 821         [IORING_OP_STATX] = {
 822                 .needs_mm               = 1,
 823                 .needs_fs               = 1,
 824                 .file_table             = 1,
 825         },
 826         [IORING_OP_READ] = {
 827                 .needs_mm               = 1,
 828                 .needs_file             = 1,
 829                 .unbound_nonreg_file    = 1,
 830                 .pollin                 = 1,
 831                 .buffer_select          = 1,
 832         },
 833         [IORING_OP_WRITE] = {
 834                 .needs_mm               = 1,
 835                 .needs_file             = 1,
 836                 .unbound_nonreg_file    = 1,
 837                 .pollout                = 1,
 838         },
 839         [IORING_OP_FADVISE] = {
 840                 .needs_file             = 1,
 841         },
 842         [IORING_OP_MADVISE] = {
 843                 .needs_mm               = 1,
 844         },
 845         [IORING_OP_SEND] = {
 846                 .needs_mm               = 1,
 847                 .needs_file             = 1,
 848                 .unbound_nonreg_file    = 1,
 849                 .pollout                = 1,
 850         },
 851         [IORING_OP_RECV] = {
 852                 .needs_mm               = 1,
 853                 .needs_file             = 1,
 854                 .unbound_nonreg_file    = 1,
 855                 .pollin                 = 1,
 856                 .buffer_select          = 1,
 857         },
 858         [IORING_OP_OPENAT2] = {
 859                 .file_table             = 1,
 860                 .needs_fs               = 1,
 861         },
 862         [IORING_OP_EPOLL_CTL] = {
 863                 .unbound_nonreg_file    = 1,
 864                 .file_table             = 1,
 865         },
 866         [IORING_OP_SPLICE] = {
 867                 .needs_file             = 1,
 868                 .hash_reg_file          = 1,
 869                 .unbound_nonreg_file    = 1,
 870         },
 871         [IORING_OP_PROVIDE_BUFFERS] = {},
 872         [IORING_OP_REMOVE_BUFFERS] = {},
 873         [IORING_OP_TEE] = {
 874                 .needs_file             = 1,
 875                 .hash_reg_file          = 1,
 876                 .unbound_nonreg_file    = 1,
 877         },
 878 };
 879
 880 static void io_wq_submit_work(struct io_wq_work **workptr);
 881 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 882 static void io_put_req(struct io_kiocb *req);
 883 static void __io_double_put_req(struct io_kiocb *req);
 884 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 885 static void io_queue_linked_timeout(struct io_kiocb *req);
 886 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 887                                  struct io_uring_files_update *ip,
 888                                  unsigned nr_args);
 889 static int io_grab_files(struct io_kiocb *req);
 890 static void io_cleanup_req(struct io_kiocb *req);
 891 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
 892                        int fd, struct file **out_file, bool fixed);
 893 static void __io_queue_sqe(struct io_kiocb *req,
 894                            const struct io_uring_sqe *sqe);
 895
 896 static struct kmem_cache *req_cachep;
 897
 898 static const struct file_operations io_uring_fops;
 899
 900 struct sock *io_uring_get_socket(struct file *file)
 901 {
 902 #if defined(CONFIG_UNIX)
 903         if (file->f_op == &io_uring_fops) {
 904                 struct io_ring_ctx *ctx = file->private_data;
 905
 906                 return ctx->ring_sock->sk;
 907         }
 908 #endif
 909         return NULL;
 910 }
 911 EXPORT_SYMBOL(io_uring_get_socket);
 912
 913 static void io_file_put_work(struct work_struct *work);
 914
 915 /*
 916  * Note: must call io_req_init_async() for the first time you
 917  * touch any members of io_wq_work.
 918  */
 919 static inline void io_req_init_async(struct io_kiocb *req)
 920 {
 921         if (req->flags & REQ_F_WORK_INITIALIZED)
 922                 return;
 923
 924         memset(&req->work, 0, sizeof(req->work));
 925         req->flags |= REQ_F_WORK_INITIALIZED;
 926 }
 927
 928 static inline bool io_async_submit(struct io_ring_ctx *ctx)
 929 {
 930         return ctx->flags & IORING_SETUP_SQPOLL;
 931 }
 932
 933 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 934 {
 935         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 936
 937         complete(&ctx->ref_comp);
 938 }
 939
 940 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 941 {
 942         struct io_ring_ctx *ctx;
 943         int hash_bits;
 944
 945         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 946         if (!ctx)
 947                 return NULL;
 948
 949         ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
 950         if (!ctx->fallback_req)
 951                 goto err;
 952
 953         /*
 954          * Use 5 bits less than the max cq entries, that should give us around
 955          * 32 entries per hash list if totally full and uniformly spread.
 956          */
 957         hash_bits = ilog2(p->cq_entries);
 958         hash_bits -= 5;
 959         if (hash_bits <= 0)
 960                 hash_bits = 1;
 961         ctx->cancel_hash_bits = hash_bits;
 962         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
 963                                         GFP_KERNEL);
 964         if (!ctx->cancel_hash)
 965                 goto err;
 966         __hash_init(ctx->cancel_hash, 1U << hash_bits);
 967
 968         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 969                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 970                 goto err;
 971
 972         ctx->flags = p->flags;
 973         init_waitqueue_head(&ctx->sqo_wait);
 974         init_waitqueue_head(&ctx->cq_wait);
 975         INIT_LIST_HEAD(&ctx->cq_overflow_list);
 976         init_completion(&ctx->ref_comp);
 977         init_completion(&ctx->sq_thread_comp);
 978         idr_init(&ctx->io_buffer_idr);
 979         idr_init(&ctx->personality_idr);
 980         mutex_init(&ctx->uring_lock);
 981         init_waitqueue_head(&ctx->wait);
 982         spin_lock_init(&ctx->completion_lock);
 983         INIT_LIST_HEAD(&ctx->poll_list);
 984         INIT_LIST_HEAD(&ctx->defer_list);
 985         INIT_LIST_HEAD(&ctx->timeout_list);
 986         init_waitqueue_head(&ctx->inflight_wait);
 987         spin_lock_init(&ctx->inflight_lock);
 988         INIT_LIST_HEAD(&ctx->inflight_list);
 989         INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
 990         init_llist_head(&ctx->file_put_llist);
 991         return ctx;
 992 err:
 993         if (ctx->fallback_req)
 994                 kmem_cache_free(req_cachep, ctx->fallback_req);
 995         kfree(ctx->cancel_hash);
 996         kfree(ctx);
 997         return NULL;
 998 }
 999
1000 static inline bool __req_need_defer(struct io_kiocb *req)
1001 {
1002         struct io_ring_ctx *ctx = req->ctx;
1003
1004         return req->sequence != ctx->cached_cq_tail
1005                                 + atomic_read(&ctx->cached_cq_overflow);
1006 }
1007
1008 static inline bool req_need_defer(struct io_kiocb *req)
1009 {
1010         if (unlikely(req->flags & REQ_F_IO_DRAIN))
1011                 return __req_need_defer(req);
1012
1013         return false;
1014 }
1015
1016 static void __io_commit_cqring(struct io_ring_ctx *ctx)
1017 {
1018         struct io_rings *rings = ctx->rings;
1019
1020         /* order cqe stores with ring update */
1021         smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
1022
1023         if (wq_has_sleeper(&ctx->cq_wait)) {
1024                 wake_up_interruptible(&ctx->cq_wait);
1025                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1026         }
1027 }
1028
1029 static inline void io_req_work_grab_env(struct io_kiocb *req,
1030                                         const struct io_op_def *def)
1031 {
1032         if (!req->work.mm && def->needs_mm) {
1033                 mmgrab(current->mm);
1034                 req->work.mm = current->mm;
1035         }
1036         if (!req->work.creds)
1037                 req->work.creds = get_current_cred();
1038         if (!req->work.fs && def->needs_fs) {
1039                 spin_lock(&current->fs->lock);
1040                 if (!current->fs->in_exec) {
1041                         req->work.fs = current->fs;
1042                         req->work.fs->users++;
1043                 } else {
1044                         req->work.flags |= IO_WQ_WORK_CANCEL;
1045                 }
1046                 spin_unlock(&current->fs->lock);
1047         }
1048         if (!req->work.task_pid)
1049                 req->work.task_pid = task_pid_vnr(current);
1050 }
1051
1052 static inline void io_req_work_drop_env(struct io_kiocb *req)
1053 {
1054         if (!(req->flags & REQ_F_WORK_INITIALIZED))
1055                 return;
1056
1057         if (req->work.mm) {
1058                 mmdrop(req->work.mm);
1059                 req->work.mm = NULL;
1060         }
1061         if (req->work.creds) {
1062                 put_cred(req->work.creds);
1063                 req->work.creds = NULL;
1064         }
1065         if (req->work.fs) {
1066                 struct fs_struct *fs = req->work.fs;
1067
1068                 spin_lock(&req->work.fs->lock);
1069                 if (--fs->users)
1070                         fs = NULL;
1071                 spin_unlock(&req->work.fs->lock);
1072                 if (fs)
1073                         free_fs_struct(fs);
1074         }
1075 }
1076
1077 static inline void io_prep_async_work(struct io_kiocb *req,
1078                                       struct io_kiocb **link)
1079 {
1080         const struct io_op_def *def = &io_op_defs[req->opcode];
1081
1082         if (req->flags & REQ_F_ISREG) {
1083                 if (def->hash_reg_file)
1084                         io_wq_hash_work(&req->work, file_inode(req->file));
1085         } else {
1086                 if (def->unbound_nonreg_file)
1087                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1088         }
1089
1090         io_req_work_grab_env(req, def);
1091
1092         *link = io_prep_linked_timeout(req);
1093 }
1094
1095 static inline void io_queue_async_work(struct io_kiocb *req)
1096 {
1097         struct io_ring_ctx *ctx = req->ctx;
1098         struct io_kiocb *link;
1099
1100         io_prep_async_work(req, &link);
1101
1102         trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1103                                         &req->work, req->flags);
1104         io_wq_enqueue(ctx->io_wq, &req->work);
1105
1106         if (link)
1107                 io_queue_linked_timeout(link);
1108 }
1109
1110 static void io_kill_timeout(struct io_kiocb *req)
1111 {
1112         int ret;
1113
1114         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1115         if (ret != -1) {
1116                 atomic_inc(&req->ctx->cq_timeouts);
1117                 list_del_init(&req->list);
1118                 req->flags |= REQ_F_COMP_LOCKED;
1119                 io_cqring_fill_event(req, 0);
1120                 io_put_req(req);
1121         }
1122 }
1123
1124 static void io_kill_timeouts(struct io_ring_ctx *ctx)
1125 {
1126         struct io_kiocb *req, *tmp;
1127
1128         spin_lock_irq(&ctx->completion_lock);
1129         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
1130                 io_kill_timeout(req);
1131         spin_unlock_irq(&ctx->completion_lock);
1132 }
1133
1134 static void __io_queue_deferred(struct io_ring_ctx *ctx)
1135 {
1136         do {
1137                 struct io_kiocb *req = list_first_entry(&ctx->defer_list,
1138                                                         struct io_kiocb, list);
1139
1140                 if (req_need_defer(req))
1141                         break;
1142                 list_del_init(&req->list);
1143                 io_queue_async_work(req);
1144         } while (!list_empty(&ctx->defer_list));
1145 }
1146
1147 static void io_flush_timeouts(struct io_ring_ctx *ctx)
1148 {
1149         while (!list_empty(&ctx->timeout_list)) {
1150                 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1151                                                         struct io_kiocb, list);
1152
1153                 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
1154                         break;
1155                 if (req->timeout.target_seq != ctx->cached_cq_tail
1156                                         - atomic_read(&ctx->cq_timeouts))
1157                         break;
1158
1159                 list_del_init(&req->list);
1160                 io_kill_timeout(req);
1161         }
1162 }
1163
1164 static void io_commit_cqring(struct io_ring_ctx *ctx)
1165 {
1166         io_flush_timeouts(ctx);
1167         __io_commit_cqring(ctx);
1168
1169         if (unlikely(!list_empty(&ctx->defer_list)))
1170                 __io_queue_deferred(ctx);
1171 }
1172
1173 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1174 {
1175         struct io_rings *rings = ctx->rings;
1176         unsigned tail;
1177
1178         tail = ctx->cached_cq_tail;
1179         /*
1180          * writes to the cq entry need to come after reading head; the
1181          * control dependency is enough as we're using WRITE_ONCE to
1182          * fill the cq entry
1183          */
1184         if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
1185                 return NULL;
1186
1187         ctx->cached_cq_tail++;
1188         return &rings->cqes[tail & ctx->cq_mask];
1189 }
1190
1191 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1192 {
1193         if (!ctx->cq_ev_fd)
1194                 return false;
1195         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1196                 return false;
1197         if (!ctx->eventfd_async)
1198                 return true;
1199         return io_wq_current_is_worker();
1200 }
1201
1202 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1203 {
1204         if (waitqueue_active(&ctx->wait))
1205                 wake_up(&ctx->wait);
1206         if (waitqueue_active(&ctx->sqo_wait))
1207                 wake_up(&ctx->sqo_wait);
1208         if (io_should_trigger_evfd(ctx))
1209                 eventfd_signal(ctx->cq_ev_fd, 1);
1210 }
1211
1212 /* Returns true if there are no backlogged entries after the flush */
1213 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1214 {
1215         struct io_rings *rings = ctx->rings;
1216         struct io_uring_cqe *cqe;
1217         struct io_kiocb *req;
1218         unsigned long flags;
1219         LIST_HEAD(list);
1220
1221         if (!force) {
1222                 if (list_empty_careful(&ctx->cq_overflow_list))
1223                         return true;
1224                 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1225                     rings->cq_ring_entries))
1226                         return false;
1227         }
1228
1229         spin_lock_irqsave(&ctx->completion_lock, flags);
1230
1231         /* if force is set, the ring is going away. always drop after that */
1232         if (force)
1233                 ctx->cq_overflow_flushed = 1;
1234
1235         cqe = NULL;
1236         while (!list_empty(&ctx->cq_overflow_list)) {
1237                 cqe = io_get_cqring(ctx);
1238                 if (!cqe && !force)
1239                         break;
1240
1241                 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
1242                                                 list);
1243                 list_move(&req->list, &list);
1244                 req->flags &= ~REQ_F_OVERFLOW;
1245                 if (cqe) {
1246                         WRITE_ONCE(cqe->user_data, req->user_data);
1247                         WRITE_ONCE(cqe->res, req->result);
1248                         WRITE_ONCE(cqe->flags, req->cflags);
1249                 } else {
1250                         WRITE_ONCE(ctx->rings->cq_overflow,
1251                                 atomic_inc_return(&ctx->cached_cq_overflow));
1252                 }
1253         }
1254
1255         io_commit_cqring(ctx);
1256         if (cqe) {
1257                 clear_bit(0, &ctx->sq_check_overflow);
1258                 clear_bit(0, &ctx->cq_check_overflow);
1259         }
1260         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1261         io_cqring_ev_posted(ctx);
1262
1263         while (!list_empty(&list)) {
1264                 req = list_first_entry(&list, struct io_kiocb, list);
1265                 list_del(&req->list);
1266                 io_put_req(req);
1267         }
1268
1269         return cqe != NULL;
1270 }
1271
1272 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
1273 {
1274         struct io_ring_ctx *ctx = req->ctx;
1275         struct io_uring_cqe *cqe;
1276
1277         trace_io_uring_complete(ctx, req->user_data, res);
1278
1279         /*
1280          * If we can't get a cq entry, userspace overflowed the
1281          * submission (by quite a lot). Increment the overflow count in
1282          * the ring.
1283          */
1284         cqe = io_get_cqring(ctx);
1285         if (likely(cqe)) {
1286                 WRITE_ONCE(cqe->user_data, req->user_data);
1287                 WRITE_ONCE(cqe->res, res);
1288                 WRITE_ONCE(cqe->flags, cflags);
1289         } else if (ctx->cq_overflow_flushed) {
1290                 WRITE_ONCE(ctx->rings->cq_overflow,
1291                                 atomic_inc_return(&ctx->cached_cq_overflow));
1292         } else {
1293                 if (list_empty(&ctx->cq_overflow_list)) {
1294                         set_bit(0, &ctx->sq_check_overflow);
1295                         set_bit(0, &ctx->cq_check_overflow);
1296                 }
1297                 req->flags |= REQ_F_OVERFLOW;
1298                 refcount_inc(&req->refs);
1299                 req->result = res;
1300                 req->cflags = cflags;
1301                 list_add_tail(&req->list, &ctx->cq_overflow_list);
1302         }
1303 }
1304
/* Post a completion for @req with result @res and no cqe flags. */
static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
        const long no_cflags = 0;

        __io_cqring_fill_event(req, res, no_cflags);
}
1309
1310 static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
1311 {
1312         struct io_ring_ctx *ctx = req->ctx;
1313         unsigned long flags;
1314
1315         spin_lock_irqsave(&ctx->completion_lock, flags);
1316         __io_cqring_fill_event(req, res, cflags);
1317         io_commit_cqring(ctx);
1318         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1319
1320         io_cqring_ev_posted(ctx);
1321 }
1322
/* Post, commit and signal a completion for @req with no cqe flags. */
static void io_cqring_add_event(struct io_kiocb *req, long res)
{
        const long no_cflags = 0;

        __io_cqring_add_event(req, res, no_cflags);
}
1327
1328 static inline bool io_is_fallback_req(struct io_kiocb *req)
1329 {
1330         return req == (struct io_kiocb *)
1331                         ((unsigned long) req->ctx->fallback_req & ~1UL);
1332 }
1333
1334 static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1335 {
1336         struct io_kiocb *req;
1337
1338         req = ctx->fallback_req;
1339         if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
1340                 return req;
1341
1342         return NULL;
1343 }
1344
1345 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
1346                                      struct io_submit_state *state)
1347 {
1348         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1349         struct io_kiocb *req;
1350
1351         if (!state) {
1352                 req = kmem_cache_alloc(req_cachep, gfp);
1353                 if (unlikely(!req))
1354                         goto fallback;
1355         } else if (!state->free_reqs) {
1356                 size_t sz;
1357                 int ret;
1358
1359                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
1360                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1361
1362                 /*
1363                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
1364                  * retry single alloc to be on the safe side.
1365                  */
1366                 if (unlikely(ret <= 0)) {
1367                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1368                         if (!state->reqs[0])
1369                                 goto fallback;
1370                         ret = 1;
1371                 }
1372                 state->free_reqs = ret - 1;
1373                 req = state->reqs[ret - 1];
1374         } else {
1375                 state->free_reqs--;
1376                 req = state->reqs[state->free_reqs];
1377         }
1378
1379         return req;
1380 fallback:
1381         return io_get_fallback_req(ctx);
1382 }
1383
1384 static inline void io_put_file(struct io_kiocb *req, struct file *file,
1385                           bool fixed)
1386 {
1387         if (fixed)
1388                 percpu_ref_put(req->fixed_file_refs);
1389         else
1390                 fput(file);
1391 }
1392
1393 static void __io_req_aux_free(struct io_kiocb *req)
1394 {
1395         if (req->flags & REQ_F_NEED_CLEANUP)
1396                 io_cleanup_req(req);
1397
1398         kfree(req->io);
1399         if (req->file)
1400                 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
1401         if (req->task)
1402                 put_task_struct(req->task);
1403
1404         io_req_work_drop_env(req);
1405 }
1406
1407 static void __io_free_req(struct io_kiocb *req)
1408 {
1409         __io_req_aux_free(req);
1410
1411         if (req->flags & REQ_F_INFLIGHT) {
1412                 struct io_ring_ctx *ctx = req->ctx;
1413                 unsigned long flags;
1414
1415                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1416                 list_del(&req->inflight_entry);
1417                 if (waitqueue_active(&ctx->inflight_wait))
1418                         wake_up(&ctx->inflight_wait);
1419                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1420         }
1421
1422         percpu_ref_put(&req->ctx->refs);
1423         if (likely(!io_is_fallback_req(req)))
1424                 kmem_cache_free(req_cachep, req);
1425         else
1426                 clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
1427 }
1428
/*
 * Batch of requests collected for bulk freeing by io_free_req_many().
 */
struct req_batch {
        /* request pointers, handed to kmem_cache_free_bulk() */
        void *reqs[IO_IOPOLL_BATCH];
        /* number of valid entries in reqs[] */
        int to_free;
        /* non-zero if each entry needs a per-request cleanup pass first */
        int need_iter;
};
1434
1435 static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
1436 {
1437         if (!rb->to_free)
1438                 return;
1439         if (rb->need_iter) {
1440                 int i, inflight = 0;
1441                 unsigned long flags;
1442
1443                 for (i = 0; i < rb->to_free; i++) {
1444                         struct io_kiocb *req = rb->reqs[i];
1445
1446                         if (req->flags & REQ_F_INFLIGHT)
1447                                 inflight++;
1448                         __io_req_aux_free(req);
1449                 }
1450                 if (!inflight)
1451                         goto do_free;
1452
1453                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1454                 for (i = 0; i < rb->to_free; i++) {
1455                         struct io_kiocb *req = rb->reqs[i];
1456
1457                         if (req->flags & REQ_F_INFLIGHT) {
1458                                 list_del(&req->inflight_entry);
1459                                 if (!--inflight)
1460                                         break;
1461                         }
1462                 }
1463                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1464
1465                 if (waitqueue_active(&ctx->inflight_wait))
1466                         wake_up(&ctx->inflight_wait);
1467         }
1468 do_free:
1469         kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
1470         percpu_ref_put_many(&ctx->refs, rb->to_free);
1471         rb->to_free = rb->need_iter = 0;
1472 }
1473
1474 static bool io_link_cancel_timeout(struct io_kiocb *req)
1475 {
1476         struct io_ring_ctx *ctx = req->ctx;
1477         int ret;
1478
1479         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1480         if (ret != -1) {
1481                 io_cqring_fill_event(req, -ECANCELED);
1482                 io_commit_cqring(ctx);
1483                 req->flags &= ~REQ_F_LINK_HEAD;
1484                 io_put_req(req);
1485                 return true;
1486         }
1487
1488         return false;
1489 }
1490
1491 static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1492 {
1493         struct io_ring_ctx *ctx = req->ctx;
1494         bool wake_ev = false;
1495
1496         /* Already got next link */
1497         if (req->flags & REQ_F_LINK_NEXT)
1498                 return;
1499
1500         /*
1501          * The list should never be empty when we are called here. But could
1502          * potentially happen if the chain is messed up, check to be on the
1503          * safe side.
1504          */
1505         while (!list_empty(&req->link_list)) {
1506                 struct io_kiocb *nxt = list_first_entry(&req->link_list,
1507                                                 struct io_kiocb, link_list);
1508
1509                 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
1510                              (nxt->flags & REQ_F_TIMEOUT))) {
1511                         list_del_init(&nxt->link_list);
1512                         wake_ev |= io_link_cancel_timeout(nxt);
1513                         req->flags &= ~REQ_F_LINK_TIMEOUT;
1514                         continue;
1515                 }
1516
1517                 list_del_init(&req->link_list);
1518                 if (!list_empty(&nxt->link_list))
1519                         nxt->flags |= REQ_F_LINK_HEAD;
1520                 *nxtptr = nxt;
1521                 break;
1522         }
1523
1524         req->flags |= REQ_F_LINK_NEXT;
1525         if (wake_ev)
1526                 io_cqring_ev_posted(ctx);
1527 }
1528
1529 /*
1530  * Called if REQ_F_LINK_HEAD is set, and we fail the head request
1531  */
1532 static void io_fail_links(struct io_kiocb *req)
1533 {
1534         struct io_ring_ctx *ctx = req->ctx;
1535         unsigned long flags;
1536
1537         spin_lock_irqsave(&ctx->completion_lock, flags);
1538
1539         while (!list_empty(&req->link_list)) {
1540                 struct io_kiocb *link = list_first_entry(&req->link_list,
1541                                                 struct io_kiocb, link_list);
1542
1543                 list_del_init(&link->link_list);
1544                 trace_io_uring_fail_link(req, link);
1545
1546                 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
1547                     link->opcode == IORING_OP_LINK_TIMEOUT) {
1548                         io_link_cancel_timeout(link);
1549                 } else {
1550                         io_cqring_fill_event(link, -ECANCELED);
1551                         __io_double_put_req(link);
1552                 }
1553                 req->flags &= ~REQ_F_LINK_TIMEOUT;
1554         }
1555
1556         io_commit_cqring(ctx);
1557         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1558         io_cqring_ev_posted(ctx);
1559 }
1560
1561 static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
1562 {
1563         if (likely(!(req->flags & REQ_F_LINK_HEAD)))
1564                 return;
1565
1566         /*
1567          * If LINK is set, we have dependent requests in this chain. If we
1568          * didn't fail this request, queue the first one up, moving any other
1569          * dependencies to the next request. In case of failure, fail the rest
1570          * of the chain.
1571          */
1572         if (req->flags & REQ_F_FAIL_LINK) {
1573                 io_fail_links(req);
1574         } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1575                         REQ_F_LINK_TIMEOUT) {
1576                 struct io_ring_ctx *ctx = req->ctx;
1577                 unsigned long flags;
1578
1579                 /*
1580                  * If this is a timeout link, we could be racing with the
1581                  * timeout timer. Grab the completion lock for this case to
1582                  * protect against that.
1583                  */
1584                 spin_lock_irqsave(&ctx->completion_lock, flags);
1585                 io_req_link_next(req, nxt);
1586                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1587         } else {
1588                 io_req_link_next(req, nxt);
1589         }
1590 }
1591
1592 static void io_free_req(struct io_kiocb *req)
1593 {
1594         struct io_kiocb *nxt = NULL;
1595
1596         io_req_find_next(req, &nxt);
1597         __io_free_req(req);
1598
1599         if (nxt)
1600                 io_queue_async_work(nxt);
1601 }
1602
1603 static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
1604 {
1605         struct io_kiocb *link;
1606         const struct io_op_def *def = &io_op_defs[nxt->opcode];
1607
1608         if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file)
1609                 io_wq_hash_work(&nxt->work, file_inode(nxt->file));
1610
1611         *workptr = &nxt->work;
1612         link = io_prep_linked_timeout(nxt);
1613         if (link)
1614                 nxt->flags |= REQ_F_QUEUE_TIMEOUT;
1615 }
1616
1617 /*
1618  * Drop reference to request, return next in chain (if there is one) if this
1619  * was the last reference to this request.
1620  */
1621 __attribute__((nonnull))
1622 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1623 {
1624         if (refcount_dec_and_test(&req->refs)) {
1625                 io_req_find_next(req, nxtptr);
1626                 __io_free_req(req);
1627         }
1628 }
1629
1630 static void io_put_req(struct io_kiocb *req)
1631 {
1632         if (refcount_dec_and_test(&req->refs))
1633                 io_free_req(req);
1634 }
1635
1636 static void io_steal_work(struct io_kiocb *req,
1637                           struct io_wq_work **workptr)
1638 {
1639         /*
1640          * It's in an io-wq worker, so there always should be at least
1641          * one reference, which will be dropped in io_put_work() just
1642          * after the current handler returns.
1643          *
1644          * It also means, that if the counter dropped to 1, then there is
1645          * no asynchronous users left, so it's safe to steal the next work.
1646          */
1647         if (refcount_read(&req->refs) == 1) {
1648                 struct io_kiocb *nxt = NULL;
1649
1650                 io_req_find_next(req, &nxt);
1651                 if (nxt)
1652                         io_wq_assign_next(workptr, nxt);
1653         }
1654 }
1655
1656 /*
1657  * Must only be used if we don't need to care about links, usually from
1658  * within the completion handling itself.
1659  */
1660 static void __io_double_put_req(struct io_kiocb *req)
1661 {
1662         /* drop both submit and complete references */
1663         if (refcount_sub_and_test(2, &req->refs))
1664                 __io_free_req(req);
1665 }
1666
1667 static void io_double_put_req(struct io_kiocb *req)
1668 {
1669         /* drop both submit and complete references */
1670         if (refcount_sub_and_test(2, &req->refs))
1671                 io_free_req(req);
1672 }
1673
/*
 * Return the number of completion events pending in the CQ ring, computed as
 * ctx->cached_cq_tail minus the ring's cq.head.
 *
 * If bit 0 of ctx->cq_check_overflow is set, the overflow list is flushed
 * with io_cqring_overflow_flush() first -- unless @noflush is true and the
 * overflow list is non-empty, in which case -1U is returned without flushing.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
{
        struct io_rings *rings = ctx->rings;

        if (test_bit(0, &ctx->cq_check_overflow)) {
                /*
                 * noflush == true is from the waitqueue handler, just ensure
                 * we wake up the task, and the next invocation will flush the
                 * entries. We cannot safely do it from here.
                 */
                if (noflush && !list_empty(&ctx->cq_overflow_list))
                        return -1U;

                io_cqring_overflow_flush(ctx, false);
        }

        /* See comment at the top of this file */
        smp_rmb();
        return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
}
1694
1695 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1696 {
1697         struct io_rings *rings = ctx->rings;
1698
1699         /* make sure SQ entry isn't read before tail */
1700         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1701 }
1702
1703 static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
1704 {
1705         if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
1706                 return false;
1707
1708         if (req->file || req->io)
1709                 rb->need_iter++;
1710
1711         rb->reqs[rb->to_free++] = req;
1712         if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1713                 io_free_req_many(req->ctx, rb);
1714         return true;
1715 }
1716
1717 static int io_put_kbuf(struct io_kiocb *req)
1718 {
1719         struct io_buffer *kbuf;
1720         int cflags;
1721
1722         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
1723         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
1724         cflags |= IORING_CQE_F_BUFFER;
1725         req->rw.addr = 0;
1726         kfree(kbuf);
1727         return cflags;
1728 }
1729
1730 /*
1731  * Find and free completed poll iocbs
1732  */
1733 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1734                                struct list_head *done)
1735 {
1736         struct req_batch rb;
1737         struct io_kiocb *req;
1738
1739         rb.to_free = rb.need_iter = 0;
1740         while (!list_empty(done)) {
1741                 int cflags = 0;
1742
1743                 req = list_first_entry(done, struct io_kiocb, list);
1744                 list_del(&req->list);
1745
1746                 if (req->flags & REQ_F_BUFFER_SELECTED)
1747                         cflags = io_put_kbuf(req);
1748
1749                 __io_cqring_fill_event(req, req->result, cflags);
1750                 (*nr_events)++;
1751
1752                 if (refcount_dec_and_test(&req->refs) &&
1753                     !io_req_multi_free(&rb, req))
1754                         io_free_req(req);
1755         }
1756
1757         io_commit_cqring(ctx);
1758         if (ctx->flags & IORING_SETUP_SQPOLL)
1759                 io_cqring_ev_posted(ctx);
1760         io_free_req_many(ctx, &rb);
1761 }
1762
1763 static void io_iopoll_queue(struct list_head *again)
1764 {
1765         struct io_kiocb *req;
1766
1767         do {
1768                 req = list_first_entry(again, struct io_kiocb, list);
1769                 list_del(&req->list);
1770                 refcount_inc(&req->refs);
1771                 io_queue_async_work(req);
1772         } while (!list_empty(again));
1773 }
1774
/*
 * Make one pass over ctx->poll_list.
 *
 * Requests with iopoll_completed set are moved to a local 'done' list and
 * requests whose result is -EAGAIN to a local 'again' list; for anything
 * else the file's ->iopoll() handler is invoked. After the walk, 'done' is
 * passed to io_iopoll_complete() (which updates *@nr_events) and 'again' to
 * io_iopoll_queue().
 *
 * Returns the negative value from ->iopoll() if it failed, otherwise 0.
 */
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
                        long min)
{
        struct io_kiocb *req, *tmp;
        LIST_HEAD(done);
        LIST_HEAD(again);
        bool spin;
        int ret;

        /*
         * Only spin for completions if we don't have multiple devices hanging
         * off our complete list, and we're under the requested amount.
         */
        spin = !ctx->poll_multi_file && *nr_events < min;

        ret = 0;
        list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
                struct kiocb *kiocb = &req->rw.kiocb;

                /*
                 * Move completed and retryable entries to our local lists.
                 * If we find a request that requires polling, break out
                 * and complete those lists first, if we have entries there.
                 */
                if (READ_ONCE(req->iopoll_completed)) {
                        list_move_tail(&req->list, &done);
                        continue;
                }
                if (!list_empty(&done))
                        break;

                if (req->result == -EAGAIN) {
                        list_move_tail(&req->list, &again);
                        continue;
                }
                if (!list_empty(&again))
                        break;

                ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
                if (ret < 0)
                        break;

                /* a non-zero return turns off spinning for the rest of the walk */
                if (ret && spin)
                        spin = false;
                ret = 0;
        }

        if (!list_empty(&done))
                io_iopoll_complete(ctx, nr_events, &done);

        if (!list_empty(&again))
                io_iopoll_queue(&again);

        return ret;
}
1830
1831 /*
1832  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
1833  * non-spinning poll check - we'll still enter the driver poll loop, but only
1834  * as a non-spinning completion check.
1835  */
1836 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1837                                 long min)
1838 {
1839         while (!list_empty(&ctx->poll_list) && !need_resched()) {
1840                 int ret;
1841
1842                 ret = io_do_iopoll(ctx, nr_events, min);
1843                 if (ret < 0)
1844                         return ret;
1845                 if (!min || *nr_events >= min)
1846                         return 0;
1847         }
1848
1849         return 1;
1850 }
1851
1852 /*
1853  * We can't just wait for polled events to come to us, we have to actively
1854  * find and complete them.
1855  */
1856 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1857 {
1858         if (!(ctx->flags & IORING_SETUP_IOPOLL))
1859                 return;
1860
1861         mutex_lock(&ctx->uring_lock);
1862         while (!list_empty(&ctx->poll_list)) {
1863                 unsigned int nr_events = 0;
1864
1865                 io_iopoll_getevents(ctx, &nr_events, 1);
1866
1867                 /*
1868                  * Ensure we allow local-to-the-cpu processing to take place,
1869                  * in this case we need to ensure that we reap all events.
1870                  */
1871                 cond_resched();
1872         }
1873         mutex_unlock(&ctx->uring_lock);
1874 }
1875
/*
 * Poll for completions with the uring_lock held, until events are already
 * pending in the CQ ring, io_iopoll_getevents() reports an error or that
 * enough events were found, or the loop condition (a non-zero @min, no
 * events yet, no reschedule needed) no longer holds.
 *
 * Returns the negative error from io_iopoll_getevents(), otherwise 0.
 */
static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
                           long min)
{
        int iters = 0, ret = 0;

        /*
         * We disallow the app entering submit/complete with polling, but we
         * still need to lock the ring to prevent racing with polled issue
         * that got punted to a workqueue.
         */
        mutex_lock(&ctx->uring_lock);
        do {
                int tmin = 0;

                /*
                 * Don't enter poll loop if we already have events pending.
                 * If we do, we can potentially be spinning for commands that
                 * already triggered a CQE (eg in error).
                 */
                if (io_cqring_events(ctx, false))
                        break;

                /*
                 * If a submit got punted to a workqueue, we can have the
                 * application entering polling for a command before it gets
                 * issued. That app will hold the uring_lock for the duration
                 * of the poll right here, so we need to take a breather every
                 * now and then to ensure that the issue has a chance to add
                 * the poll to the issued list. Otherwise we can spin here
                 * forever, while the workqueue is stuck trying to acquire the
                 * very same mutex.
                 */
                /* every 8th iteration: briefly drop and retake the lock */
                if (!(++iters & 7)) {
                        mutex_unlock(&ctx->uring_lock);
                        mutex_lock(&ctx->uring_lock);
                }

                /* only ask for the events still missing to reach @min */
                if (*nr_events < min)
                        tmin = min - *nr_events;

                ret = io_iopoll_getevents(ctx, nr_events, tmin);
                if (ret <= 0)
                        break;
                ret = 0;
        } while (min && !*nr_events && !need_resched());

        mutex_unlock(&ctx->uring_lock);
        return ret;
}
1925
1926 static void kiocb_end_write(struct io_kiocb *req)
1927 {
1928         /*
1929          * Tell lockdep we inherited freeze protection from submission
1930          * thread.
1931          */
1932         if (req->flags & REQ_F_ISREG) {
1933                 struct inode *inode = file_inode(req->file);
1934
1935                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
1936         }
1937         file_end_write(req->file);
1938 }
1939
1940 static inline void req_set_fail_links(struct io_kiocb *req)
1941 {
1942         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1943                 req->flags |= REQ_F_FAIL_LINK;
1944 }
1945
1946 static void io_complete_rw_common(struct kiocb *kiocb, long res)
1947 {
1948         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1949         int cflags = 0;
1950
1951         if (kiocb->ki_flags & IOCB_WRITE)
1952                 kiocb_end_write(req);
1953
1954         if (res != req->result)
1955                 req_set_fail_links(req);
1956         if (req->flags & REQ_F_BUFFER_SELECTED)
1957                 cflags = io_put_kbuf(req);
1958         __io_cqring_add_event(req, res, cflags);
1959 }
1960
1961 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1962 {
1963         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1964
1965         io_complete_rw_common(kiocb, res);
1966         io_put_req(req);
1967 }
1968
1969 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1970 {
1971         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1972
1973         if (kiocb->ki_flags & IOCB_WRITE)
1974                 kiocb_end_write(req);
1975
1976         if (res != req->result)
1977                 req_set_fail_links(req);
1978         req->result = res;
1979         if (res != -EAGAIN)
1980                 WRITE_ONCE(req->iopoll_completed, 1);
1981 }
1982
1983 /*
1984  * After the iocb has been issued, it's safe to be found on the poll list.
1985  * Adding the kiocb to the list AFTER submission ensures that we don't
1986  * find it from a io_iopoll_getevents() thread before the issuer is done
1987  * accessing the kiocb cookie.
1988  */
1989 static void io_iopoll_req_issued(struct io_kiocb *req)
1990 {
1991         struct io_ring_ctx *ctx = req->ctx;
1992
1993         /*
1994          * Track whether we have multiple files in our lists. This will impact
1995          * how we do polling eventually, not spinning if we're on potentially
1996          * different devices.
1997          */
1998         if (list_empty(&ctx->poll_list)) {
1999                 ctx->poll_multi_file = false;
2000         } else if (!ctx->poll_multi_file) {
2001                 struct io_kiocb *list_req;
2002
2003                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
2004                                                 list);
2005                 if (list_req->file != req->file)
2006                         ctx->poll_multi_file = true;
2007         }
2008
2009         /*
2010          * For fast devices, IO may have already completed. If it has, add
2011          * it to the front so we find it first.
2012          */
2013         if (READ_ONCE(req->iopoll_completed))
2014                 list_add(&req->list, &ctx->poll_list);
2015         else
2016                 list_add_tail(&req->list, &ctx->poll_list);
2017
2018         if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2019             wq_has_sleeper(&ctx->sqo_wait))
2020                 wake_up(&ctx->sqo_wait);
2021 }
2022
2023 static void __io_state_file_put(struct io_submit_state *state)
2024 {
2025         int diff = state->has_refs - state->used_refs;
2026
2027         if (diff)
2028                 fput_many(state->file, diff);
2029         state->file = NULL;
2030 }
2031
2032 static inline void io_state_file_put(struct io_submit_state *state)
2033 {
2034         if (state->file)
2035                 __io_state_file_put(state);
2036 }
2037
2038 /*
2039  * Get as many references to a file as we have IOs left in this submission,
2040  * assuming most submissions are for one file, or at least that each file
2041  * has more than one submission.
2042  */
2043 static struct file *__io_file_get(struct io_submit_state *state, int fd)
2044 {
2045         if (!state)
2046                 return fget(fd);
2047
2048         if (state->file) {
2049                 if (state->fd == fd) {
2050                         state->used_refs++;
2051                         state->ios_left--;
2052                         return state->file;
2053                 }
2054                 __io_state_file_put(state);
2055         }
2056         state->file = fget_many(fd, state->ios_left);
2057         if (!state->file)
2058                 return NULL;
2059
2060         state->fd = fd;
2061         state->has_refs = state->ios_left;
2062         state->used_refs = 1;
2063         state->ios_left--;
2064         return state->file;
2065 }
2066
2067 /*
2068  * If we tracked the file through the SCM inflight mechanism, we could support
2069  * any file. For now, just ensure that anything potentially problematic is done
2070  * inline.
2071  */
2072 static bool io_file_supports_async(struct file *file, int rw)
2073 {
2074         umode_t mode = file_inode(file)->i_mode;
2075
2076         if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
2077                 return true;
2078         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
2079                 return true;
2080
2081         /* any ->read/write should understand O_NONBLOCK */
2082         if (file->f_flags & O_NONBLOCK)
2083                 return true;
2084
2085         if (!(file->f_mode & FMODE_NOWAIT))
2086                 return false;
2087
2088         if (rw == READ)
2089                 return file->f_op->read_iter != NULL;
2090
2091         return file->f_op->write_iter != NULL;
2092 }
2093
/*
 * Initialise the kiocb embedded in @req from the fields of @sqe.
 *
 * Sets the file position (falling back to the file's f_pos when the SQE
 * offset is -1 and the file is not FMODE_STREAM), write hint, kiocb flags,
 * I/O priority and completion callback, then copies addr, len and buf_index
 * from the SQE into @req.
 *
 * Returns 0 on success, or:
 *   - the error from kiocb_set_rw_flags() or ioprio_check_cap();
 *   - -EOPNOTSUPP on an IORING_SETUP_IOPOLL ring if the kiocb is not
 *     IOCB_DIRECT or the file has no ->iopoll() method;
 *   - -EINVAL if IOCB_HIPRI is set on a ring without IORING_SETUP_IOPOLL.
 */
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                      bool force_nonblock)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct kiocb *kiocb = &req->rw.kiocb;
        unsigned ioprio;
        int ret;

        if (S_ISREG(file_inode(req->file)->i_mode))
                req->flags |= REQ_F_ISREG;

        kiocb->ki_pos = READ_ONCE(sqe->off);
        /* offset of -1 on a non-stream file: use (and later update) f_pos */
        if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
                req->flags |= REQ_F_CUR_POS;
                kiocb->ki_pos = req->file->f_pos;
        }
        kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
        kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
        ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
        if (unlikely(ret))
                return ret;

        /* a zero ioprio in the SQE means "use the current task's priority" */
        ioprio = READ_ONCE(sqe->ioprio);
        if (ioprio) {
                ret = ioprio_check_cap(ioprio);
                if (ret)
                        return ret;

                kiocb->ki_ioprio = ioprio;
        } else
                kiocb->ki_ioprio = get_current_ioprio();

        /* don't allow async punt if RWF_NOWAIT was requested */
        if (kiocb->ki_flags & IOCB_NOWAIT)
                req->flags |= REQ_F_NOWAIT;

        /*
         * Must come after the REQ_F_NOWAIT check above, so that a forced
         * non-blocking attempt is not mistaken for a user-requested one.
         */
        if (force_nonblock)
                kiocb->ki_flags |= IOCB_NOWAIT;

        if (ctx->flags & IORING_SETUP_IOPOLL) {
                if (!(kiocb->ki_flags & IOCB_DIRECT) ||
                    !kiocb->ki_filp->f_op->iopoll)
                        return -EOPNOTSUPP;

                kiocb->ki_flags |= IOCB_HIPRI;
                kiocb->ki_complete = io_complete_rw_iopoll;
                req->result = 0;
                req->iopoll_completed = 0;
        } else {
                if (kiocb->ki_flags & IOCB_HIPRI)
                        return -EINVAL;
                kiocb->ki_complete = io_complete_rw;
        }

        req->rw.addr = READ_ONCE(sqe->addr);
        req->rw.len = READ_ONCE(sqe->len);
        req->buf_index = READ_ONCE(sqe->buf_index);
        return 0;
}
2153
2154 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2155 {
2156         switch (ret) {
2157         case -EIOCBQUEUED:
2158                 break;
2159         case -ERESTARTSYS:
2160         case -ERESTARTNOINTR:
2161         case -ERESTARTNOHAND:
2162         case -ERESTART_RESTARTBLOCK:
2163                 /*
2164                  * We can't just restart the syscall, since previously
2165                  * submitted sqes may already be in progress. Just fail this
2166                  * IO with EINTR.
2167                  */
2168                 ret = -EINTR;
2169                 /* fall through */
2170         default:
2171                 kiocb->ki_complete(kiocb, ret, 0);
2172         }
2173 }
2174
2175 static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
2176 {
2177         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2178
2179         if (req->flags & REQ_F_CUR_POS)
2180                 req->file->f_pos = kiocb->ki_pos;
2181         if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
2182                 io_complete_rw(kiocb, ret, 0);
2183         else
2184                 io_rw_done(kiocb, ret);
2185 }
2186
/*
 * Set up @iter as a bvec iterator over the registered buffer selected by
 * req->buf_index, covering req->rw.len bytes starting at user address
 * req->rw.addr.
 *
 * Returns req->rw.len on success, or -EFAULT if no buffers are registered,
 * the buffer index is out of range, addr + len wraps around, or the range
 * does not lie entirely inside the mapped buffer.
 */
static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
                               struct iov_iter *iter)
{
        struct io_ring_ctx *ctx = req->ctx;
        size_t len = req->rw.len;
        struct io_mapped_ubuf *imu;
        u16 index, buf_index;
        size_t offset;
        u64 buf_addr;

        /* attempt to use fixed buffers without having provided iovecs */
        if (unlikely(!ctx->user_bufs))
                return -EFAULT;

        buf_index = req->buf_index;
        if (unlikely(buf_index >= ctx->nr_user_bufs))
                return -EFAULT;

        /* clamp the (already range-checked) index against speculation */
        index = array_index_nospec(buf_index, ctx->nr_user_bufs);
        imu = &ctx->user_bufs[index];
        buf_addr = req->rw.addr;

        /* overflow */
        if (buf_addr + len < buf_addr)
                return -EFAULT;
        /* not inside the mapped region */
        if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
                return -EFAULT;

        /*
         * May not be a start of buffer, set size appropriately
         * and advance us to the beginning.
         */
        offset = buf_addr - imu->ubuf;
        iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);

        if (offset) {
                /*
                 * Don't use iov_iter_advance() here, as it's really slow for
                 * using the latter parts of a big fixed buffer - it iterates
                 * over each segment manually. We can cheat a bit here, because
                 * we know that:
                 *
                 * 1) it's a BVEC iter, we set it up
                 * 2) all bvecs are PAGE_SIZE in size, except potentially the
                 *    first and last bvec
                 *
                 * So just find our index, and adjust the iterator afterwards.
                 * If the offset is within the first bvec (or covers the whole
                 * first bvec), just use iov_iter_advance(). This makes it
                 * easier since we can just skip the first segment, which may
                 * not be PAGE_SIZE aligned.
                 */
                const struct bio_vec *bvec = imu->bvec;

                if (offset <= bvec->bv_len) {
                        iov_iter_advance(iter, offset);
                } else {
                        unsigned long seg_skip;

                        /* skip first vec */
                        offset -= bvec->bv_len;
                        seg_skip = 1 + (offset >> PAGE_SHIFT);

                        /*
                         * Drop the skipped segments and bytes from the
                         * iterator; what is left of 'offset' is the position
                         * within the first remaining page-sized segment.
                         */
                        iter->bvec = bvec + seg_skip;
                        iter->nr_segs -= seg_skip;
                        iter->count -= bvec->bv_len + offset;
                        iter->iov_offset = offset & ~PAGE_MASK;
                }
        }

        return len;
}
2260
2261 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2262 {
2263         if (needs_lock)
2264                 mutex_unlock(&ctx->uring_lock);
2265 }
2266
2267 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2268 {
2269         /*
2270          * "Normal" inline submissions always hold the uring_lock, since we
2271          * grab it from the system call. Same is true for the SQPOLL offload.
2272          * The only exception is when we've detached the request and issue it
2273          * from an async worker thread, grab the lock for that case.
2274          */
2275         if (needs_lock)
2276                 mutex_lock(&ctx->uring_lock);
2277 }
2278
2279 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2280                                           int bgid, struct io_buffer *kbuf,
2281                                           bool needs_lock)
2282 {
2283         struct io_buffer *head;
2284
2285         if (req->flags & REQ_F_BUFFER_SELECTED)
2286                 return kbuf;
2287
2288         io_ring_submit_lock(req->ctx, needs_lock);
2289
2290         lockdep_assert_held(&req->ctx->uring_lock);
2291
2292         head = idr_find(&req->ctx->io_buffer_idr, bgid);
2293         if (head) {
2294                 if (!list_empty(&head->list)) {
2295                         kbuf = list_last_entry(&head->list, struct io_buffer,
2296                                                         list);
2297                         list_del(&kbuf->list);
2298                 } else {
2299                         kbuf = head;
2300                         idr_remove(&req->ctx->io_buffer_idr, bgid);
2301                 }
2302                 if (*len > kbuf->len)
2303                         *len = kbuf->len;
2304         } else {
2305                 kbuf = ERR_PTR(-ENOBUFS);
2306         }
2307
2308         io_ring_submit_unlock(req->ctx, needs_lock);
2309
2310         return kbuf;
2311 }
2312
2313 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2314                                         bool needs_lock)
2315 {
2316         struct io_buffer *kbuf;
2317         u16 bgid;
2318
2319         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2320         bgid = req->buf_index;
2321         kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2322         if (IS_ERR(kbuf))
2323                 return kbuf;
2324         req->rw.addr = (u64) (unsigned long) kbuf;
2325         req->flags |= REQ_F_BUFFER_SELECTED;
2326         return u64_to_user_ptr(kbuf->addr);
2327 }
2328
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of the single-iovec buffer-select import: read iov_len
 * from the compat iovec at req->rw.addr, select a provided buffer of at
 * most that size and describe it in iov[0].
 *
 * Returns 0 on success, -EFAULT if the user iovec cannot be read, -EINVAL
 * for a negative length, or the buffer selection error.
 */
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
                                bool needs_lock)
{
        struct compat_iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
        compat_ssize_t clen;
        void __user *buf;
        ssize_t len;

        if (!access_ok(uiov, sizeof(*uiov)) ||
            __get_user(clen, &uiov->iov_len))
                return -EFAULT;
        if (clen < 0)
                return -EINVAL;

        len = clen;
        buf = io_rw_buffer_select(req, &len, needs_lock);
        if (IS_ERR(buf))
                return PTR_ERR(buf);

        iov[0].iov_len = (compat_size_t) len;
        iov[0].iov_base = buf;
        return 0;
}
#endif
2355
2356 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2357                                       bool needs_lock)
2358 {
2359         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2360         void __user *buf;
2361         ssize_t len;
2362
2363         if (copy_from_user(iov, uiov, sizeof(*uiov)))
2364                 return -EFAULT;
2365
2366         len = iov[0].iov_len;
2367         if (len < 0)
2368                 return -EINVAL;
2369         buf = io_rw_buffer_select(req, &len, needs_lock);
2370         if (IS_ERR(buf))
2371                 return PTR_ERR(buf);
2372         iov[0].iov_base = buf;
2373         iov[0].iov_len = len;
2374         return 0;
2375 }
2376
2377 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2378                                     bool needs_lock)
2379 {
2380         if (req->flags & REQ_F_BUFFER_SELECTED) {
2381                 struct io_buffer *kbuf;
2382
2383                 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2384                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2385                 iov[0].iov_len = kbuf->len;
2386                 return 0;
2387         }
2388         if (!req->rw.len)
2389                 return 0;
2390         else if (req->rw.len > 1)
2391                 return -EINVAL;
2392
2393 #ifdef CONFIG_COMPAT
2394         if (req->ctx->compat)
2395                 return io_compat_import(req, iov, needs_lock);
2396 #endif
2397
2398         return __io_iov_buffer_select(req, iov, needs_lock);
2399 }
2400
/*
 * Build @iter for the data buffer(s) of read/write request @req.
 *
 * The source of the buffer depends on the opcode and request state:
 *   - READ_FIXED / WRITE_FIXED: a registered buffer, via io_import_fixed();
 *   - READ / WRITE: a single user range, optionally taken from a provided
 *     buffer when REQ_F_BUFFER_SELECT is set;
 *   - a request with saved async state (req->io): the iovec stored there;
 *   - other requests with REQ_F_BUFFER_SELECT: one iovec backed by a
 *     provided buffer;
 *   - anything else: the user iovec array, imported with import_iovec() (or
 *     its compat variant).
 *
 * Returns a negative error, or a non-negative size as produced by the path
 * taken. On every path except the final import_iovec() call and a saved
 * async iovec that is not the embedded fast_iov, *@iovec is set to NULL.
 * NOTE(review): presumably that tells the caller there is nothing to free;
 * confirm against the callers.
 */
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
                               struct iovec **iovec, struct iov_iter *iter,
                               bool needs_lock)
{
        void __user *buf = u64_to_user_ptr(req->rw.addr);
        size_t sqe_len = req->rw.len;
        ssize_t ret;
        u8 opcode;

        opcode = req->opcode;
        if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
                *iovec = NULL;
                return io_import_fixed(req, rw, iter);
        }

        /* buffer index only valid with fixed read/write, or buffer select  */
        if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
                return -EINVAL;

        if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
                if (req->flags & REQ_F_BUFFER_SELECT) {
                        buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
                        if (IS_ERR(buf)) {
                                *iovec = NULL;
                                return PTR_ERR(buf);
                        }
                        /* remember the (possibly clamped) length */
                        req->rw.len = sqe_len;
                }

                /* *iovec is used as storage for the single segment here */
                ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
                *iovec = NULL;
                return ret < 0 ? ret : sqe_len;
        }

        if (req->io) {
                struct io_async_rw *iorw = &req->io->rw;

                *iovec = iorw->iov;
                iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
                if (iorw->iov == iorw->fast_iov)
                        *iovec = NULL;
                return iorw->size;
        }

        if (req->flags & REQ_F_BUFFER_SELECT) {
                ret = io_iov_buffer_select(req, *iovec, needs_lock);
                if (!ret) {
                        /* success: iov[0] now describes the selected buffer */
                        ret = (*iovec)->iov_len;
                        iov_iter_init(iter, rw, *iovec, 1, ret);
                }
                *iovec = NULL;
                return ret;
        }

#ifdef CONFIG_COMPAT
        if (req->ctx->compat)
                return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
                                                iovec, iter);
#endif

        return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
}
2463
/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 *
 * Each iteration transfers the current segment of @iter. The loop stops at
 * the first error or short transfer.
 *
 * Returns the total number of bytes transferred; if the very first call
 * fails, its error is returned instead. Also returns -EOPNOTSUPP for
 * IOCB_HIPRI kiocbs and -EAGAIN for IOCB_NOWAIT kiocbs without doing any IO.
 */
static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
                           struct iov_iter *iter)
{
        ssize_t ret = 0;

        /*
         * Don't support polled IO through this interface, and we can't
         * support non-blocking either. For the latter, this just causes
         * the kiocb to be handled from an async context.
         */
        if (kiocb->ki_flags & IOCB_HIPRI)
                return -EOPNOTSUPP;
        if (kiocb->ki_flags & IOCB_NOWAIT)
                return -EAGAIN;

        while (iov_iter_count(iter)) {
                struct iovec iovec;
                ssize_t nr;

                if (!iov_iter_is_bvec(iter)) {
                        iovec = iov_iter_iovec(iter);
                } else {
                        /* fixed buffers import bvec */
                        /*
                         * NOTE(review): this hands a kmap()ed kernel address
                         * to ->read()/->write(); whether every such handler
                         * accepts that is not visible here -- confirm.
                         */
                        iovec.iov_base = kmap(iter->bvec->bv_page)
                                                + iter->iov_offset;
                        iovec.iov_len = min(iter->count,
                                        iter->bvec->bv_len - iter->iov_offset);
                }

                if (rw == READ) {
                        nr = file->f_op->read(file, iovec.iov_base,
                                              iovec.iov_len, &kiocb->ki_pos);
                } else {
                        nr = file->f_op->write(file, iovec.iov_base,
                                               iovec.iov_len, &kiocb->ki_pos);
                }

                /* must run before the iterator is advanced below */
                if (iov_iter_is_bvec(iter))
                        kunmap(iter->bvec->bv_page);

                if (nr < 0) {
                        /* only report the error if nothing was transferred */
                        if (!ret)
                                ret = nr;
                        break;
                }
                ret += nr;
                /* short transfer: stop here without advancing the iterator */
                if (nr != iovec.iov_len)
                        break;
                iov_iter_advance(iter, nr);
        }

        return ret;
}
2521
2522 static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
2523                           struct iovec *iovec, struct iovec *fast_iov,
2524                           struct iov_iter *iter)
2525 {
2526         req->io->rw.nr_segs = iter->nr_segs;
2527         req->io->rw.size = io_size;
2528         req->io->rw.iov = iovec;
2529         if (!req->io->rw.iov) {
2530                 req->io->rw.iov = req->io->rw.fast_iov;
2531                 if (req->io->rw.iov != fast_iov)
2532                         memcpy(req->io->rw.iov, fast_iov,
2533                                sizeof(struct iovec) * iter->nr_segs);
2534         } else {
2535                 req->flags |= REQ_F_NEED_CLEANUP;
2536         }
2537 }
2538
2539 static inline int __io_alloc_async_ctx(struct io_kiocb *req)
2540 {
2541         req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
2542         return req->io == NULL;
2543 }
2544
2545 static int io_alloc_async_ctx(struct io_kiocb *req)
2546 {
2547         if (!io_op_defs[req->opcode].async_ctx)
2548                 return 0;
2549
2550         return  __io_alloc_async_ctx(req);
2551 }
2552
2553 static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
2554                              struct iovec *iovec, struct iovec *fast_iov,
2555                              struct iov_iter *iter)
2556 {
2557         if (!io_op_defs[req->opcode].async_ctx)
2558                 return 0;
2559         if (!req->io) {
2560                 if (__io_alloc_async_ctx(req))
2561                         return -ENOMEM;
2562
2563                 io_req_map_rw(req, io_size, iovec, fast_iov, iter);
2564         }
2565         return 0;
2566 }
2567
2568 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2569                         bool force_nonblock)
2570 {
2571         struct io_async_ctx *io;
2572         struct iov_iter iter;
2573         ssize_t ret;
2574
2575         ret = io_prep_rw(req, sqe, force_nonblock);
2576         if (ret)
2577                 return ret;
2578
2579         if (unlikely(!(req->file->f_mode & FMODE_READ)))
2580                 return -EBADF;
2581
2582         /* either don't need iovec imported or already have it */
2583         if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
2584                 return 0;
2585
2586         io = req->io;
2587         io->rw.iov = io->rw.fast_iov;
2588         req->io = NULL;
2589         ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
2590         req->io = io;
2591         if (ret < 0)
2592                 return ret;
2593
2594         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2595         return 0;
2596 }
2597
2598 static int io_read(struct io_kiocb *req, bool force_nonblock)
2599 {
2600         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2601         struct kiocb *kiocb = &req->rw.kiocb;
2602         struct iov_iter iter;
2603         size_t iov_count;
2604         ssize_t io_size, ret;
2605
2606         ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
2607         if (ret < 0)
2608                 return ret;
2609
2610         /* Ensure we clear previously set non-block flag */
2611         if (!force_nonblock)
2612                 kiocb->ki_flags &= ~IOCB_NOWAIT;
2613
2614         req->result = 0;
2615         io_size = ret;
2616         if (req->flags & REQ_F_LINK_HEAD)
2617                 req->result = io_size;
2618
2619         /*
2620          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2621          * we know to async punt it even if it was opened O_NONBLOCK
2622          */
2623         if (force_nonblock && !io_file_supports_async(req->file, READ))
2624                 goto copy_iov;
2625
2626         iov_count = iov_iter_count(&iter);
2627         ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
2628         if (!ret) {
2629                 ssize_t ret2;
2630
2631                 if (req->file->f_op->read_iter)
2632                         ret2 = call_read_iter(req->file, kiocb, &iter);
2633                 else
2634                         ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
2635
2636                 /* Catch -EAGAIN return for forced non-blocking submission */
2637                 if (!force_nonblock || ret2 != -EAGAIN) {
2638                         kiocb_done(kiocb, ret2);
2639                 } else {
2640 copy_iov:
2641                         ret = io_setup_async_rw(req, io_size, iovec,
2642                                                 inline_vecs, &iter);
2643                         if (ret)
2644                                 goto out_free;
2645                         /* any defer here is final, must blocking retry */
2646                         if (!(req->flags & REQ_F_NOWAIT) &&
2647                             !file_can_poll(req->file))
2648                                 req->flags |= REQ_F_MUST_PUNT;
2649                         return -EAGAIN;
2650                 }
2651         }
2652 out_free:
2653         kfree(iovec);
2654         req->flags &= ~REQ_F_NEED_CLEANUP;
2655         return ret;
2656 }
2657
2658 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2659                          bool force_nonblock)
2660 {
2661         struct io_async_ctx *io;
2662         struct iov_iter iter;
2663         ssize_t ret;
2664
2665         ret = io_prep_rw(req, sqe, force_nonblock);
2666         if (ret)
2667                 return ret;
2668
2669         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
2670                 return -EBADF;
2671
2672         req->fsize = rlimit(RLIMIT_FSIZE);
2673
2674         /* either don't need iovec imported or already have it */
2675         if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
2676                 return 0;
2677
2678         io = req->io;
2679         io->rw.iov = io->rw.fast_iov;
2680         req->io = NULL;
2681         ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
2682         req->io = io;
2683         if (ret < 0)
2684                 return ret;
2685
2686         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2687         return 0;
2688 }
2689
2690 static int io_write(struct io_kiocb *req, bool force_nonblock)
2691 {
2692         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2693         struct kiocb *kiocb = &req->rw.kiocb;
2694         struct iov_iter iter;
2695         size_t iov_count;
2696         ssize_t ret, io_size;
2697
2698         ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
2699         if (ret < 0)
2700                 return ret;
2701
2702         /* Ensure we clear previously set non-block flag */
2703         if (!force_nonblock)
2704                 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
2705
2706         req->result = 0;
2707         io_size = ret;
2708         if (req->flags & REQ_F_LINK_HEAD)
2709                 req->result = io_size;
2710
2711         /*
2712          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2713          * we know to async punt it even if it was opened O_NONBLOCK
2714          */
2715         if (force_nonblock && !io_file_supports_async(req->file, WRITE))
2716                 goto copy_iov;
2717
2718         /* file path doesn't support NOWAIT for non-direct_IO */
2719         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
2720             (req->flags & REQ_F_ISREG))
2721                 goto copy_iov;
2722
2723         iov_count = iov_iter_count(&iter);
2724         ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
2725         if (!ret) {
2726                 ssize_t ret2;
2727
2728                 /*
2729                  * Open-code file_start_write here to grab freeze protection,
2730                  * which will be released by another thread in
2731                  * io_complete_rw().  Fool lockdep by telling it the lock got
2732                  * released so that it doesn't complain about the held lock when
2733                  * we return to userspace.
2734                  */
2735                 if (req->flags & REQ_F_ISREG) {
2736                         __sb_start_write(file_inode(req->file)->i_sb,
2737                                                 SB_FREEZE_WRITE, true);
2738                         __sb_writers_release(file_inode(req->file)->i_sb,
2739                                                 SB_FREEZE_WRITE);
2740                 }
2741                 kiocb->ki_flags |= IOCB_WRITE;
2742
2743                 if (!force_nonblock)
2744                         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
2745
2746                 if (req->file->f_op->write_iter)
2747                         ret2 = call_write_iter(req->file, kiocb, &iter);
2748                 else
2749                         ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
2750
2751                 if (!force_nonblock)
2752                         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
2753
2754                 /*
2755                  * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
2756                  * retry them without IOCB_NOWAIT.
2757                  */
2758                 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
2759                         ret2 = -EAGAIN;
2760                 if (!force_nonblock || ret2 != -EAGAIN) {
2761                         kiocb_done(kiocb, ret2);
2762                 } else {
2763 copy_iov:
2764                         ret = io_setup_async_rw(req, io_size, iovec,
2765                                                 inline_vecs, &iter);
2766                         if (ret)
2767                                 goto out_free;
2768                         /* any defer here is final, must blocking retry */
2769                         if (!(req->flags & REQ_F_NOWAIT) &&
2770                             !file_can_poll(req->file))
2771                                 req->flags |= REQ_F_MUST_PUNT;
2772                         return -EAGAIN;
2773                 }
2774         }
2775 out_free:
2776         req->flags &= ~REQ_F_NEED_CLEANUP;
2777         kfree(iovec);
2778         return ret;
2779 }
2780
2781 static int __io_splice_prep(struct io_kiocb *req,
2782                             const struct io_uring_sqe *sqe)
2783 {
2784         struct io_splice* sp = &req->splice;
2785         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
2786         int ret;
2787
2788         if (req->flags & REQ_F_NEED_CLEANUP)
2789                 return 0;
2790         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2791                 return -EINVAL;
2792
2793         sp->file_in = NULL;
2794         sp->len = READ_ONCE(sqe->len);
2795         sp->flags = READ_ONCE(sqe->splice_flags);
2796
2797         if (unlikely(sp->flags & ~valid_flags))
2798                 return -EINVAL;
2799
2800         ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
2801                           (sp->flags & SPLICE_F_FD_IN_FIXED));
2802         if (ret)
2803                 return ret;
2804         req->flags |= REQ_F_NEED_CLEANUP;
2805
2806         if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
2807                 /*
2808                  * Splice operation will be punted aync, and here need to
2809                  * modify io_wq_work.flags, so initialize io_wq_work firstly.
2810                  */
2811                 io_req_init_async(req);
2812                 req->work.flags |= IO_WQ_WORK_UNBOUND;
2813         }
2814
2815         return 0;
2816 }
2817
2818 static int io_tee_prep(struct io_kiocb *req,
2819                        const struct io_uring_sqe *sqe)
2820 {
2821         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
2822                 return -EINVAL;
2823         return __io_splice_prep(req, sqe);
2824 }
2825
2826 static int io_tee(struct io_kiocb *req, bool force_nonblock)
2827 {
2828         struct io_splice *sp = &req->splice;
2829         struct file *in = sp->file_in;
2830         struct file *out = sp->file_out;
2831         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
2832         long ret = 0;
2833
2834         if (force_nonblock)
2835                 return -EAGAIN;
2836         if (sp->len)
2837                 ret = do_tee(in, out, sp->len, flags);
2838
2839         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
2840         req->flags &= ~REQ_F_NEED_CLEANUP;
2841
2842         io_cqring_add_event(req, ret);
2843         if (ret != sp->len)
2844                 req_set_fail_links(req);
2845         io_put_req(req);
2846         return 0;
2847 }
2848
2849 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2850 {
2851         struct io_splice* sp = &req->splice;
2852
2853         sp->off_in = READ_ONCE(sqe->splice_off_in);
2854         sp->off_out = READ_ONCE(sqe->off);
2855         return __io_splice_prep(req, sqe);
2856 }
2857
2858 static int io_splice(struct io_kiocb *req, bool force_nonblock)
2859 {
2860         struct io_splice *sp = &req->splice;
2861         struct file *in = sp->file_in;
2862         struct file *out = sp->file_out;
2863         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
2864         loff_t *poff_in, *poff_out;
2865         long ret = 0;
2866
2867         if (force_nonblock)
2868                 return -EAGAIN;
2869
2870         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
2871         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
2872
2873         if (sp->len)
2874                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
2875
2876         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
2877         req->flags &= ~REQ_F_NEED_CLEANUP;
2878
2879         io_cqring_add_event(req, ret);
2880         if (ret != sp->len)
2881                 req_set_fail_links(req);
2882         io_put_req(req);
2883         return 0;
2884 }
2885
2886 /*
2887  * IORING_OP_NOP just posts a completion event, nothing else.
2888  */
2889 static int io_nop(struct io_kiocb *req)
2890 {
2891         struct io_ring_ctx *ctx = req->ctx;
2892
2893         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2894                 return -EINVAL;
2895
2896         io_cqring_add_event(req, 0);
2897         io_put_req(req);
2898         return 0;
2899 }
2900
2901 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2902 {
2903         struct io_ring_ctx *ctx = req->ctx;
2904
2905         if (!req->file)
2906                 return -EBADF;
2907
2908         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2909                 return -EINVAL;
2910         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2911                 return -EINVAL;
2912
2913         req->sync.flags = READ_ONCE(sqe->fsync_flags);
2914         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2915                 return -EINVAL;
2916
2917         req->sync.off = READ_ONCE(sqe->off);
2918         req->sync.len = READ_ONCE(sqe->len);
2919         return 0;
2920 }
2921
2922 static int io_fsync(struct io_kiocb *req, bool force_nonblock)
2923 {
2924         loff_t end = req->sync.off + req->sync.len;
2925         int ret;
2926
2927         /* fsync always requires a blocking context */
2928         if (force_nonblock)
2929                 return -EAGAIN;
2930
2931         ret = vfs_fsync_range(req->file, req->sync.off,
2932                                 end > 0 ? end : LLONG_MAX,
2933                                 req->sync.flags & IORING_FSYNC_DATASYNC);
2934         if (ret < 0)
2935                 req_set_fail_links(req);
2936         io_cqring_add_event(req, ret);
2937         io_put_req(req);
2938         return 0;
2939 }
2940
2941 static int io_fallocate_prep(struct io_kiocb *req,
2942                              const struct io_uring_sqe *sqe)
2943 {
2944         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
2945                 return -EINVAL;
2946         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2947                 return -EINVAL;
2948
2949         req->sync.off = READ_ONCE(sqe->off);
2950         req->sync.len = READ_ONCE(sqe->addr);
2951         req->sync.mode = READ_ONCE(sqe->len);
2952         req->fsize = rlimit(RLIMIT_FSIZE);
2953         return 0;
2954 }
2955
2956 static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
2957 {
2958         int ret;
2959
2960         /* fallocate always requiring blocking context */
2961         if (force_nonblock)
2962                 return -EAGAIN;
2963
2964         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
2965         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
2966                                 req->sync.len);
2967         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
2968         if (ret < 0)
2969                 req_set_fail_links(req);
2970         io_cqring_add_event(req, ret);
2971         io_put_req(req);
2972         return 0;
2973 }
2974
2975 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2976 {
2977         const char __user *fname;
2978         int ret;
2979
2980         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
2981                 return -EINVAL;
2982         if (unlikely(sqe->ioprio || sqe->buf_index))
2983                 return -EINVAL;
2984         if (unlikely(req->flags & REQ_F_FIXED_FILE))
2985                 return -EBADF;
2986
2987         /* open.how should be already initialised */
2988         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
2989                 req->open.how.flags |= O_LARGEFILE;
2990
2991         req->open.dfd = READ_ONCE(sqe->fd);
2992         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
2993         req->open.filename = getname(fname);
2994         if (IS_ERR(req->open.filename)) {
2995                 ret = PTR_ERR(req->open.filename);
2996                 req->open.filename = NULL;
2997                 return ret;
2998         }
2999         req->open.nofile = rlimit(RLIMIT_NOFILE);
3000         req->flags |= REQ_F_NEED_CLEANUP;
3001         return 0;
3002 }
3003
3004 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3005 {
3006         u64 flags, mode;
3007
3008         if (req->flags & REQ_F_NEED_CLEANUP)
3009                 return 0;
3010         mode = READ_ONCE(sqe->len);
3011         flags = READ_ONCE(sqe->open_flags);
3012         req->open.how = build_open_how(flags, mode);
3013         return __io_openat_prep(req, sqe);
3014 }
3015
3016 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3017 {
3018         struct open_how __user *how;
3019         size_t len;
3020         int ret;
3021
3022         if (req->flags & REQ_F_NEED_CLEANUP)
3023                 return 0;
3024         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3025         len = READ_ONCE(sqe->len);
3026         if (len < OPEN_HOW_SIZE_VER0)
3027                 return -EINVAL;
3028
3029         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3030                                         len);
3031         if (ret)
3032                 return ret;
3033
3034         return __io_openat_prep(req, sqe);
3035 }
3036
3037 static int io_openat2(struct io_kiocb *req, bool force_nonblock)
3038 {
3039         struct open_flags op;
3040         struct file *file;
3041         int ret;
3042
3043         if (force_nonblock)
3044                 return -EAGAIN;
3045
3046         ret = build_open_flags(&req->open.how, &op);
3047         if (ret)
3048                 goto err;
3049
3050         ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3051         if (ret < 0)
3052                 goto err;
3053
3054         file = do_filp_open(req->open.dfd, req->open.filename, &op);
3055         if (IS_ERR(file)) {
3056                 put_unused_fd(ret);
3057                 ret = PTR_ERR(file);
3058         } else {
3059                 fsnotify_open(file);
3060                 fd_install(ret, file);
3061         }
3062 err:
3063         putname(req->open.filename);
3064         req->flags &= ~REQ_F_NEED_CLEANUP;
3065         if (ret < 0)
3066                 req_set_fail_links(req);
3067         io_cqring_add_event(req, ret);
3068         io_put_req(req);
3069         return 0;
3070 }
3071
3072 static int io_openat(struct io_kiocb *req, bool force_nonblock)
3073 {
3074         return io_openat2(req, force_nonblock);
3075 }
3076
3077 static int io_remove_buffers_prep(struct io_kiocb *req,
3078                                   const struct io_uring_sqe *sqe)
3079 {
3080         struct io_provide_buf *p = &req->pbuf;
3081         u64 tmp;
3082
3083         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3084                 return -EINVAL;
3085
3086         tmp = READ_ONCE(sqe->fd);
3087         if (!tmp || tmp > USHRT_MAX)
3088                 return -EINVAL;
3089
3090         memset(p, 0, sizeof(*p));
3091         p->nbufs = tmp;
3092         p->bgid = READ_ONCE(sqe->buf_group);
3093         return 0;
3094 }
3095
3096 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3097                                int bgid, unsigned nbufs)
3098 {
3099         unsigned i = 0;
3100
3101         /* shouldn't happen */
3102         if (!nbufs)
3103                 return 0;
3104
3105         /* the head kbuf is the list itself */
3106         while (!list_empty(&buf->list)) {
3107                 struct io_buffer *nxt;
3108
3109                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3110                 list_del(&nxt->list);
3111                 kfree(nxt);
3112                 if (++i == nbufs)
3113                         return i;
3114         }
3115         i++;
3116         kfree(buf);
3117         idr_remove(&ctx->io_buffer_idr, bgid);
3118
3119         return i;
3120 }
3121
3122 static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
3123 {
3124         struct io_provide_buf *p = &req->pbuf;
3125         struct io_ring_ctx *ctx = req->ctx;
3126         struct io_buffer *head;
3127         int ret = 0;
3128
3129         io_ring_submit_lock(ctx, !force_nonblock);
3130
3131         lockdep_assert_held(&ctx->uring_lock);
3132
3133         ret = -ENOENT;
3134         head = idr_find(&ctx->io_buffer_idr, p->bgid);
3135         if (head)
3136                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3137
3138         io_ring_submit_lock(ctx, !force_nonblock);
3139         if (ret < 0)
3140                 req_set_fail_links(req);
3141         io_cqring_add_event(req, ret);
3142         io_put_req(req);
3143         return 0;
3144 }
3145
3146 static int io_provide_buffers_prep(struct io_kiocb *req,
3147                                    const struct io_uring_sqe *sqe)
3148 {
3149         struct io_provide_buf *p = &req->pbuf;
3150         u64 tmp;
3151
3152         if (sqe->ioprio || sqe->rw_flags)
3153                 return -EINVAL;
3154
3155         tmp = READ_ONCE(sqe->fd);
3156         if (!tmp || tmp > USHRT_MAX)
3157                 return -E2BIG;
3158         p->nbufs = tmp;
3159         p->addr = READ_ONCE(sqe->addr);
3160         p->len = READ_ONCE(sqe->len);
3161
3162         if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
3163                 return -EFAULT;
3164
3165         p->bgid = READ_ONCE(sqe->buf_group);
3166         tmp = READ_ONCE(sqe->off);
3167         if (tmp > USHRT_MAX)
3168                 return -E2BIG;
3169         p->bid = tmp;
3170         return 0;
3171 }
3172
3173 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3174 {
3175         struct io_buffer *buf;
3176         u64 addr = pbuf->addr;
3177         int i, bid = pbuf->bid;
3178
3179         for (i = 0; i < pbuf->nbufs; i++) {
3180                 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3181                 if (!buf)
3182                         break;
3183
3184                 buf->addr = addr;
3185                 buf->len = pbuf->len;
3186                 buf->bid = bid;
3187                 addr += pbuf->len;
3188                 bid++;
3189                 if (!*head) {
3190                         INIT_LIST_HEAD(&buf->list);
3191                         *head = buf;
3192                 } else {
3193                         list_add_tail(&buf->list, &(*head)->list);
3194                 }
3195         }
3196
3197         return i ? i : -ENOMEM;
3198 }
3199
3200 static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
3201 {
3202         struct io_provide_buf *p = &req->pbuf;
3203         struct io_ring_ctx *ctx = req->ctx;
3204         struct io_buffer *head, *list;
3205         int ret = 0;
3206
3207         io_ring_submit_lock(ctx, !force_nonblock);
3208
3209         lockdep_assert_held(&ctx->uring_lock);
3210
3211         list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
3212
3213         ret = io_add_buffers(p, &head);
3214         if (ret < 0)
3215                 goto out;
3216
3217         if (!list) {
3218                 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
3219                                         GFP_KERNEL);
3220                 if (ret < 0) {
3221                         __io_remove_buffers(ctx, head, p->bgid, -1U);
3222                         goto out;
3223                 }
3224         }
3225 out:
3226         io_ring_submit_unlock(ctx, !force_nonblock);
3227         if (ret < 0)
3228                 req_set_fail_links(req);
3229         io_cqring_add_event(req, ret);
3230         io_put_req(req);
3231         return 0;
3232 }
3233
3234 static int io_epoll_ctl_prep(struct io_kiocb *req,
3235                              const struct io_uring_sqe *sqe)
3236 {
3237 #if defined(CONFIG_EPOLL)
3238         if (sqe->ioprio || sqe->buf_index)
3239                 return -EINVAL;
3240         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3241                 return -EINVAL;
3242
3243         req->epoll.epfd = READ_ONCE(sqe->fd);
3244         req->epoll.op = READ_ONCE(sqe->len);
3245         req->epoll.fd = READ_ONCE(sqe->off);
3246
3247         if (ep_op_has_event(req->epoll.op)) {
3248                 struct epoll_event __user *ev;
3249
3250                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
3251                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
3252                         return -EFAULT;
3253         }
3254
3255         return 0;
3256 #else
3257         return -EOPNOTSUPP;
3258 #endif
3259 }
3260
3261 static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
3262 {
3263 #if defined(CONFIG_EPOLL)
3264         struct io_epoll *ie = &req->epoll;
3265         int ret;
3266
3267         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
3268         if (force_nonblock && ret == -EAGAIN)
3269                 return -EAGAIN;
3270
3271         if (ret < 0)
3272                 req_set_fail_links(req);
3273         io_cqring_add_event(req, ret);
3274         io_put_req(req);
3275         return 0;
3276 #else
3277         return -EOPNOTSUPP;
3278 #endif
3279 }
3280
3281 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3282 {
3283 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3284         if (sqe->ioprio || sqe->buf_index || sqe->off)
3285                 return -EINVAL;
3286         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3287                 return -EINVAL;
3288
3289         req->madvise.addr = READ_ONCE(sqe->addr);
3290         req->madvise.len = READ_ONCE(sqe->len);
3291         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
3292         return 0;
3293 #else
3294         return -EOPNOTSUPP;
3295 #endif
3296 }
3297
3298 static int io_madvise(struct io_kiocb *req, bool force_nonblock)
3299 {
3300 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3301         struct io_madvise *ma = &req->madvise;
3302         int ret;
3303
3304         if (force_nonblock)
3305                 return -EAGAIN;
3306
3307         ret = do_madvise(ma->addr, ma->len, ma->advice);
3308         if (ret < 0)
3309                 req_set_fail_links(req);
3310         io_cqring_add_event(req, ret);
3311         io_put_req(req);
3312         return 0;
3313 #else
3314         return -EOPNOTSUPP;
3315 #endif
3316 }
3317
3318 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3319 {
3320         if (sqe->ioprio || sqe->buf_index || sqe->addr)
3321                 return -EINVAL;
3322         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3323                 return -EINVAL;
3324
3325         req->fadvise.offset = READ_ONCE(sqe->off);
3326         req->fadvise.len = READ_ONCE(sqe->len);
3327         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
3328         return 0;
3329 }
3330
3331 static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
3332 {
3333         struct io_fadvise *fa = &req->fadvise;
3334         int ret;
3335
3336         if (force_nonblock) {
3337                 switch (fa->advice) {
3338                 case POSIX_FADV_NORMAL:
3339                 case POSIX_FADV_RANDOM:
3340                 case POSIX_FADV_SEQUENTIAL:
3341                         break;
3342                 default:
3343                         return -EAGAIN;
3344                 }
3345         }
3346
3347         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
3348         if (ret < 0)
3349                 req_set_fail_links(req);
3350         io_cqring_add_event(req, ret);
3351         io_put_req(req);
3352         return 0;
3353 }
3354
3355 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3356 {
3357         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3358                 return -EINVAL;
3359         if (sqe->ioprio || sqe->buf_index)
3360                 return -EINVAL;
3361         if (req->flags & REQ_F_FIXED_FILE)
3362                 return -EBADF;
3363
3364         req->statx.dfd = READ_ONCE(sqe->fd);
3365         req->statx.mask = READ_ONCE(sqe->len);
3366         req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
3367         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3368         req->statx.flags = READ_ONCE(sqe->statx_flags);
3369
3370         return 0;
3371 }
3372
3373 static int io_statx(struct io_kiocb *req, bool force_nonblock)
3374 {
3375         struct io_statx *ctx = &req->statx;
3376         int ret;
3377
3378         if (force_nonblock) {
3379                 /* only need file table for an actual valid fd */
3380                 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
3381                         req->flags |= REQ_F_NO_FILE_TABLE;
3382                 return -EAGAIN;
3383         }
3384
3385         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
3386                        ctx->buffer);
3387
3388         if (ret < 0)
3389                 req_set_fail_links(req);
3390         io_cqring_add_event(req, ret);
3391         io_put_req(req);
3392         return 0;
3393 }
3394
3395 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3396 {
3397         /*
3398          * If we queue this for async, it must not be cancellable. That would
3399          * leave the 'file' in an undeterminate state, and here need to modify
3400          * io_wq_work.flags, so initialize io_wq_work firstly.
3401          */
3402         io_req_init_async(req);
3403         req->work.flags |= IO_WQ_WORK_NO_CANCEL;
3404
3405         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3406                 return -EINVAL;
3407         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
3408             sqe->rw_flags || sqe->buf_index)
3409                 return -EINVAL;
3410         if (req->flags & REQ_F_FIXED_FILE)
3411                 return -EBADF;
3412
3413         req->close.fd = READ_ONCE(sqe->fd);
3414         if ((req->file && req->file->f_op == &io_uring_fops) ||
3415             req->close.fd == req->ctx->ring_fd)
3416                 return -EBADF;
3417
3418         req->close.put_file = NULL;
3419         return 0;
3420 }
3421
3422 static int io_close(struct io_kiocb *req, bool force_nonblock)
3423 {
3424         struct io_close *close = &req->close;
3425         int ret;
3426
3427         /* might be already done during nonblock submission */
3428         if (!close->put_file) {
3429                 ret = __close_fd_get_file(close->fd, &close->put_file);
3430                 if (ret < 0)
3431                         return (ret == -ENOENT) ? -EBADF : ret;
3432         }
3433
3434         /* if the file has a flush method, be safe and punt to async */
3435         if (close->put_file->f_op->flush && force_nonblock) {
3436                 /* avoid grabbing files - we don't need the files */
3437                 req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
3438                 return -EAGAIN;
3439         }
3440
3441         /* No ->flush() or already async, safely close from here */
3442         ret = filp_close(close->put_file, req->work.files);
3443         if (ret < 0)
3444                 req_set_fail_links(req);
3445         io_cqring_add_event(req, ret);
3446         fput(close->put_file);
3447         close->put_file = NULL;
3448         io_put_req(req);
3449         return 0;
3450 }
3451
3452 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3453 {
3454         struct io_ring_ctx *ctx = req->ctx;
3455
3456         if (!req->file)
3457                 return -EBADF;
3458
3459         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3460                 return -EINVAL;
3461         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3462                 return -EINVAL;
3463
3464         req->sync.off = READ_ONCE(sqe->off);
3465         req->sync.len = READ_ONCE(sqe->len);
3466         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
3467         return 0;
3468 }
3469
3470 static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
3471 {
3472         int ret;
3473
3474         /* sync_file_range always requires a blocking context */
3475         if (force_nonblock)
3476                 return -EAGAIN;
3477
3478         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
3479                                 req->sync.flags);
3480         if (ret < 0)
3481                 req_set_fail_links(req);
3482         io_cqring_add_event(req, ret);
3483         io_put_req(req);
3484         return 0;
3485 }
3486
3487 #if defined(CONFIG_NET)
3488 static int io_setup_async_msg(struct io_kiocb *req,
3489                               struct io_async_msghdr *kmsg)
3490 {
3491         if (req->io)
3492                 return -EAGAIN;
3493         if (io_alloc_async_ctx(req)) {
3494                 if (kmsg->iov != kmsg->fast_iov)
3495                         kfree(kmsg->iov);
3496                 return -ENOMEM;
3497         }
3498         req->flags |= REQ_F_NEED_CLEANUP;
3499         memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
3500         return -EAGAIN;
3501 }
3502
3503 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3504 {
3505         struct io_sr_msg *sr = &req->sr_msg;
3506         struct io_async_ctx *io = req->io;
3507         int ret;
3508
3509         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3510                 return -EINVAL;
3511
3512         sr->msg_flags = READ_ONCE(sqe->msg_flags);
3513         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
3514         sr->len = READ_ONCE(sqe->len);
3515
3516 #ifdef CONFIG_COMPAT
3517         if (req->ctx->compat)
3518                 sr->msg_flags |= MSG_CMSG_COMPAT;
3519 #endif
3520
3521         if (!io || req->opcode == IORING_OP_SEND)
3522                 return 0;
3523         /* iovec is already imported */
3524         if (req->flags & REQ_F_NEED_CLEANUP)
3525                 return 0;
3526
3527         io->msg.iov = io->msg.fast_iov;
3528         ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
3529                                         &io->msg.iov);
3530         if (!ret)
3531                 req->flags |= REQ_F_NEED_CLEANUP;
3532         return ret;
3533 }
3534
3535 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
3536 {
3537         struct io_async_msghdr *kmsg = NULL;
3538         struct socket *sock;
3539         int ret;
3540
3541         sock = sock_from_file(req->file, &ret);
3542         if (sock) {
3543                 struct io_async_ctx io;
3544                 unsigned flags;
3545
3546                 if (req->io) {
3547                         kmsg = &req->io->msg;
3548                         kmsg->msg.msg_name = &req->io->msg.addr;
3549                         /* if iov is set, it's allocated already */
3550                         if (!kmsg->iov)
3551                                 kmsg->iov = kmsg->fast_iov;
3552                         kmsg->msg.msg_iter.iov = kmsg->iov;
3553                 } else {
3554                         struct io_sr_msg *sr = &req->sr_msg;
3555
3556                         kmsg = &io.msg;
3557                         kmsg->msg.msg_name = &io.msg.addr;
3558
3559                         io.msg.iov = io.msg.fast_iov;
3560                         ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
3561                                         sr->msg_flags, &io.msg.iov);
3562                         if (ret)
3563                                 return ret;
3564                 }
3565
3566                 flags = req->sr_msg.msg_flags;
3567                 if (flags & MSG_DONTWAIT)
3568                         req->flags |= REQ_F_NOWAIT;
3569                 else if (force_nonblock)
3570                         flags |= MSG_DONTWAIT;
3571
3572                 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
3573                 if (force_nonblock && ret == -EAGAIN)
3574                         return io_setup_async_msg(req, kmsg);
3575                 if (ret == -ERESTARTSYS)
3576                         ret = -EINTR;
3577         }
3578
3579         if (kmsg && kmsg->iov != kmsg->fast_iov)
3580                 kfree(kmsg->iov);
3581         req->flags &= ~REQ_F_NEED_CLEANUP;
3582         io_cqring_add_event(req, ret);
3583         if (ret < 0)
3584                 req_set_fail_links(req);
3585         io_put_req(req);
3586         return 0;
3587 }
3588
3589 static int io_send(struct io_kiocb *req, bool force_nonblock)
3590 {
3591         struct socket *sock;
3592         int ret;
3593
3594         sock = sock_from_file(req->file, &ret);
3595         if (sock) {
3596                 struct io_sr_msg *sr = &req->sr_msg;
3597                 struct msghdr msg;
3598                 struct iovec iov;
3599                 unsigned flags;
3600
3601                 ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
3602                                                 &msg.msg_iter);
3603                 if (ret)
3604                         return ret;
3605
3606                 msg.msg_name = NULL;
3607                 msg.msg_control = NULL;
3608                 msg.msg_controllen = 0;
3609                 msg.msg_namelen = 0;
3610
3611                 flags = req->sr_msg.msg_flags;
3612                 if (flags & MSG_DONTWAIT)
3613                         req->flags |= REQ_F_NOWAIT;
3614                 else if (force_nonblock)
3615                         flags |= MSG_DONTWAIT;
3616
3617                 msg.msg_flags = flags;
3618                 ret = sock_sendmsg(sock, &msg);
3619                 if (force_nonblock && ret == -EAGAIN)
3620                         return -EAGAIN;
3621                 if (ret == -ERESTARTSYS)
3622                         ret = -EINTR;
3623         }
3624
3625         io_cqring_add_event(req, ret);
3626         if (ret < 0)
3627                 req_set_fail_links(req);
3628         io_put_req(req);
3629         return 0;
3630 }
3631
3632 static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
3633 {
3634         struct io_sr_msg *sr = &req->sr_msg;
3635         struct iovec __user *uiov;
3636         size_t iov_len;
3637         int ret;
3638
3639         ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
3640                                         &uiov, &iov_len);
3641         if (ret)
3642                 return ret;
3643
3644         if (req->flags & REQ_F_BUFFER_SELECT) {
3645                 if (iov_len > 1)
3646                         return -EINVAL;
3647                 if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
3648                         return -EFAULT;
3649                 sr->len = io->msg.iov[0].iov_len;
3650                 iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
3651                                 sr->len);
3652                 io->msg.iov = NULL;
3653         } else {
3654                 ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
3655                                         &io->msg.iov, &io->msg.msg.msg_iter);
3656                 if (ret > 0)
3657                         ret = 0;
3658         }
3659
3660         return ret;
3661 }
3662
#ifdef CONFIG_COMPAT
/*
 * Compat counterpart of __io_recvmsg_copy_hdr(): import a compat msghdr for
 * a recvmsg request into @io.
 *
 * Without buffer selection the compat iovec array is imported with
 * compat_import_iovec(). With REQ_F_BUFFER_SELECT at most one iovec is
 * accepted and only its length is fetched from userspace; that length is
 * stored both in sr->len and in the first iovec slot before io->msg.iov is
 * cleared to NULL.
 *
 * Returns 0 on success or a negative error.
 */
static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
                                        struct io_async_ctx *io)
{
        struct compat_msghdr __user *msg_compat;
        struct io_sr_msg *sr = &req->sr_msg;
        struct compat_iovec __user *uiov;
        compat_uptr_t ptr;
        compat_size_t len;
        int ret;

        msg_compat = (struct compat_msghdr __user *) sr->msg;
        ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
                                        &ptr, &len);
        if (ret)
                return ret;

        uiov = compat_ptr(ptr);
        if (req->flags & REQ_F_BUFFER_SELECT) {
                compat_ssize_t clen;

                if (len > 1)
                        return -EINVAL;
                if (!access_ok(uiov, sizeof(*uiov)))
                        return -EFAULT;
                if (__get_user(clen, &uiov->iov_len))
                        return -EFAULT;
                if (clen < 0)
                        return -EINVAL;
                /*
                 * Use the length just read from userspace. io->msg.iov[0] was
                 * never filled in on this path, so reading iov_len back from
                 * it would leave sr->len with garbage. Record the length in
                 * the iovec slot as well (io->msg.iov is expected to still
                 * point at the inline fast_iov array here, as set up by
                 * io_recvmsg_copy_hdr()).
                 */
                sr->len = clen;
                io->msg.iov[0].iov_len = clen;
                io->msg.iov = NULL;
        } else {
                ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
                                                &io->msg.iov,
                                                &io->msg.msg.msg_iter);
                if (ret < 0)
                        return ret;
        }

        return 0;
}
#endif
3705
3706 static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
3707 {
3708         io->msg.iov = io->msg.fast_iov;
3709
3710 #ifdef CONFIG_COMPAT
3711         if (req->ctx->compat)
3712                 return __io_compat_recvmsg_copy_hdr(req, io);
3713 #endif
3714
3715         return __io_recvmsg_copy_hdr(req, io);
3716 }
3717
3718 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
3719                                                int *cflags, bool needs_lock)
3720 {
3721         struct io_sr_msg *sr = &req->sr_msg;
3722         struct io_buffer *kbuf;
3723
3724         if (!(req->flags & REQ_F_BUFFER_SELECT))
3725                 return NULL;
3726
3727         kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
3728         if (IS_ERR(kbuf))
3729                 return kbuf;
3730
3731         sr->kbuf = kbuf;
3732         req->flags |= REQ_F_BUFFER_SELECTED;
3733
3734         *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
3735         *cflags |= IORING_CQE_F_BUFFER;
3736         return kbuf;
3737 }
3738
3739 static int io_recvmsg_prep(struct io_kiocb *req,
3740                            const struct io_uring_sqe *sqe)
3741 {
3742         struct io_sr_msg *sr = &req->sr_msg;
3743         struct io_async_ctx *io = req->io;
3744         int ret;
3745
3746         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3747                 return -EINVAL;
3748
3749         sr->msg_flags = READ_ONCE(sqe->msg_flags);
3750         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
3751         sr->len = READ_ONCE(sqe->len);
3752         sr->bgid = READ_ONCE(sqe->buf_group);
3753
3754 #ifdef CONFIG_COMPAT
3755         if (req->ctx->compat)
3756                 sr->msg_flags |= MSG_CMSG_COMPAT;
3757 #endif
3758
3759         if (!io || req->opcode == IORING_OP_RECV)
3760                 return 0;
3761         /* iovec is already imported */
3762         if (req->flags & REQ_F_NEED_CLEANUP)
3763                 return 0;
3764
3765         ret = io_recvmsg_copy_hdr(req, io);
3766         if (!ret)
3767                 req->flags |= REQ_F_NEED_CLEANUP;
3768         return ret;
3769 }
3770
3771 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
3772 {
3773         struct io_async_msghdr *kmsg = NULL;
3774         struct socket *sock;
3775         int ret, cflags = 0;
3776
3777         sock = sock_from_file(req->file, &ret);
3778         if (sock) {
3779                 struct io_buffer *kbuf;
3780                 struct io_async_ctx io;
3781                 unsigned flags;
3782
3783                 if (req->io) {
3784                         kmsg = &req->io->msg;
3785                         kmsg->msg.msg_name = &req->io->msg.addr;
3786                         /* if iov is set, it's allocated already */
3787                         if (!kmsg->iov)
3788                                 kmsg->iov = kmsg->fast_iov;
3789                         kmsg->msg.msg_iter.iov = kmsg->iov;
3790                 } else {
3791                         kmsg = &io.msg;
3792                         kmsg->msg.msg_name = &io.msg.addr;
3793
3794                         ret = io_recvmsg_copy_hdr(req, &io);
3795                         if (ret)
3796                                 return ret;
3797                 }
3798
3799                 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
3800                 if (IS_ERR(kbuf)) {
3801                         return PTR_ERR(kbuf);
3802                 } else if (kbuf) {
3803                         kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3804                         iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
3805                                         1, req->sr_msg.len);
3806                 }
3807
3808                 flags = req->sr_msg.msg_flags;
3809                 if (flags & MSG_DONTWAIT)
3810                         req->flags |= REQ_F_NOWAIT;
3811                 else if (force_nonblock)
3812                         flags |= MSG_DONTWAIT;
3813
3814                 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
3815                                                 kmsg->uaddr, flags);
3816                 if (force_nonblock && ret == -EAGAIN)
3817                         return io_setup_async_msg(req, kmsg);
3818                 if (ret == -ERESTARTSYS)
3819                         ret = -EINTR;
3820         }
3821
3822         if (kmsg && kmsg->iov != kmsg->fast_iov)
3823                 kfree(kmsg->iov);
3824         req->flags &= ~REQ_F_NEED_CLEANUP;
3825         __io_cqring_add_event(req, ret, cflags);
3826         if (ret < 0)
3827                 req_set_fail_links(req);
3828         io_put_req(req);
3829         return 0;
3830 }
3831
3832 static int io_recv(struct io_kiocb *req, bool force_nonblock)
3833 {
3834         struct io_buffer *kbuf = NULL;
3835         struct socket *sock;
3836         int ret, cflags = 0;
3837
3838         sock = sock_from_file(req->file, &ret);
3839         if (sock) {
3840                 struct io_sr_msg *sr = &req->sr_msg;
3841                 void __user *buf = sr->buf;
3842                 struct msghdr msg;
3843                 struct iovec iov;
3844                 unsigned flags;
3845
3846                 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
3847                 if (IS_ERR(kbuf))
3848                         return PTR_ERR(kbuf);
3849                 else if (kbuf)
3850                         buf = u64_to_user_ptr(kbuf->addr);
3851
3852                 ret = import_single_range(READ, buf, sr->len, &iov,
3853                                                 &msg.msg_iter);
3854                 if (ret) {
3855                         kfree(kbuf);
3856                         return ret;
3857                 }
3858
3859                 req->flags |= REQ_F_NEED_CLEANUP;
3860                 msg.msg_name = NULL;
3861                 msg.msg_control = NULL;
3862                 msg.msg_controllen = 0;
3863                 msg.msg_namelen = 0;
3864                 msg.msg_iocb = NULL;
3865                 msg.msg_flags = 0;
3866
3867                 flags = req->sr_msg.msg_flags;
3868                 if (flags & MSG_DONTWAIT)
3869                         req->flags |= REQ_F_NOWAIT;
3870                 else if (force_nonblock)
3871                         flags |= MSG_DONTWAIT;
3872
3873                 ret = sock_recvmsg(sock, &msg, flags);
3874                 if (force_nonblock && ret == -EAGAIN)
3875                         return -EAGAIN;
3876                 if (ret == -ERESTARTSYS)
3877                         ret = -EINTR;
3878         }
3879
3880         kfree(kbuf);
3881         req->flags &= ~REQ_F_NEED_CLEANUP;
3882         __io_cqring_add_event(req, ret, cflags);
3883         if (ret < 0)
3884                 req_set_fail_links(req);
3885         io_put_req(req);
3886         return 0;
3887 }
3888
3889 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3890 {
3891         struct io_accept *accept = &req->accept;
3892
3893         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3894                 return -EINVAL;
3895         if (sqe->ioprio || sqe->len || sqe->buf_index)
3896                 return -EINVAL;
3897
3898         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3899         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3900         accept->flags = READ_ONCE(sqe->accept_flags);
3901         accept->nofile = rlimit(RLIMIT_NOFILE);
3902         return 0;
3903 }
3904
3905 static int io_accept(struct io_kiocb *req, bool force_nonblock)
3906 {
3907         struct io_accept *accept = &req->accept;
3908         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
3909         int ret;
3910
3911         if (req->file->f_flags & O_NONBLOCK)
3912                 req->flags |= REQ_F_NOWAIT;
3913
3914         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
3915                                         accept->addr_len, accept->flags,
3916                                         accept->nofile);
3917         if (ret == -EAGAIN && force_nonblock)
3918                 return -EAGAIN;
3919         if (ret < 0) {
3920                 if (ret == -ERESTARTSYS)
3921                         ret = -EINTR;
3922                 req_set_fail_links(req);
3923         }
3924         io_cqring_add_event(req, ret);
3925         io_put_req(req);
3926         return 0;
3927 }
3928
3929 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3930 {
3931         struct io_connect *conn = &req->connect;
3932         struct io_async_ctx *io = req->io;
3933
3934         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3935                 return -EINVAL;
3936         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
3937                 return -EINVAL;
3938
3939         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3940         conn->addr_len =  READ_ONCE(sqe->addr2);
3941
3942         if (!io)
3943                 return 0;
3944
3945         return move_addr_to_kernel(conn->addr, conn->addr_len,
3946                                         &io->connect.address);
3947 }
3948
3949 static int io_connect(struct io_kiocb *req, bool force_nonblock)
3950 {
3951         struct io_async_ctx __io, *io;
3952         unsigned file_flags;
3953         int ret;
3954
3955         if (req->io) {
3956                 io = req->io;
3957         } else {
3958                 ret = move_addr_to_kernel(req->connect.addr,
3959                                                 req->connect.addr_len,
3960                                                 &__io.connect.address);
3961                 if (ret)
3962                         goto out;
3963                 io = &__io;
3964         }
3965
3966         file_flags = force_nonblock ? O_NONBLOCK : 0;
3967
3968         ret = __sys_connect_file(req->file, &io->connect.address,
3969                                         req->connect.addr_len, file_flags);
3970         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
3971                 if (req->io)
3972                         return -EAGAIN;
3973                 if (io_alloc_async_ctx(req)) {
3974                         ret = -ENOMEM;
3975                         goto out;
3976                 }
3977                 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
3978                 return -EAGAIN;
3979         }
3980         if (ret == -ERESTARTSYS)
3981                 ret = -EINTR;
3982 out:
3983         if (ret < 0)
3984                 req_set_fail_links(req);
3985         io_cqring_add_event(req, ret);
3986         io_put_req(req);
3987         return 0;
3988 }
3989 #else /* !CONFIG_NET */
/* !CONFIG_NET stub: sendmsg preparation is not supported. */
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        return -EOPNOTSUPP;
}
3994
/* !CONFIG_NET stub: sendmsg is not supported. */
static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
{
        return -EOPNOTSUPP;
}
3999
/* !CONFIG_NET stub: send is not supported. */
static int io_send(struct io_kiocb *req, bool force_nonblock)
{
        return -EOPNOTSUPP;
}
4004
/* !CONFIG_NET stub: recvmsg preparation is not supported. */
static int io_recvmsg_prep(struct io_kiocb *req,
                           const struct io_uring_sqe *sqe)
{
        return -EOPNOTSUPP;
}
4010
/* !CONFIG_NET stub: recvmsg is not supported. */
static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
{
        return -EOPNOTSUPP;
}
4015
/* !CONFIG_NET stub: recv is not supported. */
static int io_recv(struct io_kiocb *req, bool force_nonblock)
{
        return -EOPNOTSUPP;
}
4020
/* !CONFIG_NET stub: accept preparation is not supported. */
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        return -EOPNOTSUPP;
}
4025
/* !CONFIG_NET stub: accept is not supported. */
static int io_accept(struct io_kiocb *req, bool force_nonblock)
{
        return -EOPNOTSUPP;
}
4030
/* !CONFIG_NET stub: connect preparation is not supported. */
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        return -EOPNOTSUPP;
}
4035
/* !CONFIG_NET stub: connect is not supported. */
static int io_connect(struct io_kiocb *req, bool force_nonblock)
{
        return -EOPNOTSUPP;
}
4040 #endif /* CONFIG_NET */
4041
/*
 * Poll table wrapper handed to the poll queueing callbacks, which recover it
 * from the embedded poll_table_struct via container_of().
 */
struct io_poll_table {
        struct poll_table_struct pt;    /* embedded table passed to the callback */
        struct io_kiocb *req;           /* request being armed */
        int error;                      /* 0 or -errno set while queueing */
};
4047
4048 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4049                            __poll_t mask, task_work_func_t func)
4050 {
4051         struct task_struct *tsk;
4052         int ret;
4053
4054         /* for instances that support it check for an event match first: */
4055         if (mask && !(mask & poll->events))
4056                 return 0;
4057
4058         trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4059
4060         list_del_init(&poll->wait.entry);
4061
4062         tsk = req->task;
4063         req->result = mask;
4064         init_task_work(&req->task_work, func);
4065         /*
4066          * If this fails, then the task is exiting. When a task exits, the
4067          * work gets canceled, so just cancel this request as well instead
4068          * of executing it. We can't safely execute it anyway, as we may not
4069          * have the needed state needed for it anyway.
4070          */
4071         ret = task_work_add(tsk, &req->task_work, true);
4072         if (unlikely(ret)) {
4073                 WRITE_ONCE(poll->canceled, true);
4074                 tsk = io_wq_get_task(req->ctx->io_wq);
4075                 task_work_add(tsk, &req->task_work, true);
4076         }
4077         wake_up_process(tsk);
4078         return 1;
4079 }
4080
4081 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4082         __acquires(&req->ctx->completion_lock)
4083 {
4084         struct io_ring_ctx *ctx = req->ctx;
4085
4086         if (!req->result && !READ_ONCE(poll->canceled)) {
4087                 struct poll_table_struct pt = { ._key = poll->events };
4088
4089                 req->result = vfs_poll(req->file, &pt) & poll->events;
4090         }
4091
4092         spin_lock_irq(&ctx->completion_lock);
4093         if (!req->result && !READ_ONCE(poll->canceled)) {
4094                 add_wait_queue(poll->head, &poll->wait);
4095                 return true;
4096         }
4097
4098         return false;
4099 }
4100
4101 static void io_poll_remove_double(struct io_kiocb *req)
4102 {
4103         struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
4104
4105         lockdep_assert_held(&req->ctx->completion_lock);
4106
4107         if (poll && poll->head) {
4108                 struct wait_queue_head *head = poll->head;
4109
4110                 spin_lock(&head->lock);
4111                 list_del_init(&poll->wait.entry);
4112                 if (poll->wait.private)
4113                         refcount_dec(&req->refs);
4114                 poll->head = NULL;
4115                 spin_unlock(&head->lock);
4116         }
4117 }
4118
4119 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4120 {
4121         struct io_ring_ctx *ctx = req->ctx;
4122
4123         io_poll_remove_double(req);
4124         req->poll.done = true;
4125         io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4126         io_commit_cqring(ctx);
4127 }
4128
4129 static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
4130 {
4131         struct io_ring_ctx *ctx = req->ctx;
4132
4133         if (io_poll_rewait(req, &req->poll)) {
4134                 spin_unlock_irq(&ctx->completion_lock);
4135                 return;
4136         }
4137
4138         hash_del(&req->hash_node);
4139         io_poll_complete(req, req->result, 0);
4140         req->flags |= REQ_F_COMP_LOCKED;
4141         io_put_req_find_next(req, nxt);
4142         spin_unlock_irq(&ctx->completion_lock);
4143
4144         io_cqring_ev_posted(ctx);
4145 }
4146
4147 static void io_poll_task_func(struct callback_head *cb)
4148 {
4149         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4150         struct io_kiocb *nxt = NULL;
4151
4152         io_poll_task_handler(req, &nxt);
4153         if (nxt) {
4154                 struct io_ring_ctx *ctx = nxt->ctx;
4155
4156                 mutex_lock(&ctx->uring_lock);
4157                 __io_queue_sqe(nxt, NULL);
4158                 mutex_unlock(&ctx->uring_lock);
4159         }
4160 }
4161
/*
 * Wakeup callback for the second poll entry of a request (the one stored in
 * req->io by __io_queue_proc()).
 *
 * If the first entry (req->poll) is still queued on its wait queue, it is
 * removed here and the request is handed to __io_async_wake(); if the first
 * entry is already gone nothing further is scheduled. Either way the request
 * reference taken for this second entry is dropped.
 *
 * Returns 0 if @key carries a non-zero mask matching none of the requested
 * events, 1 otherwise.
 */
static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
                               int sync, void *key)
{
        struct io_kiocb *req = wait->private;
        struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
        __poll_t mask = key_to_poll(key);

        /* for instances that support it check for an event match first: */
        if (mask && !(mask & poll->events))
                return 0;

        if (req->poll.head) {
                bool done;

                /* 'done' means the first entry was already taken off its queue */
                spin_lock(&req->poll.head->lock);
                done = list_empty(&req->poll.wait.entry);
                if (!done)
                        list_del_init(&req->poll.wait.entry);
                spin_unlock(&req->poll.head->lock);
                if (!done)
                        __io_async_wake(req, poll, mask, io_poll_task_func);
        }
        /* drop the reference __io_queue_proc() took for this second entry */
        refcount_dec(&req->refs);
        return 1;
}
4187
4188 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
4189                               wait_queue_func_t wake_func)
4190 {
4191         poll->head = NULL;
4192         poll->done = false;
4193         poll->canceled = false;
4194         poll->events = events;
4195         INIT_LIST_HEAD(&poll->wait.entry);
4196         init_waitqueue_func_entry(&poll->wait, wake_func);
4197 }
4198
4199 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
4200                             struct wait_queue_head *head)
4201 {
4202         struct io_kiocb *req = pt->req;
4203
4204         /*
4205          * If poll->head is already set, it's because the file being polled
4206          * uses multiple waitqueues for poll handling (eg one for read, one
4207          * for write). Setup a separate io_poll_iocb if this happens.
4208          */
4209         if (unlikely(poll->head)) {
4210                 /* already have a 2nd entry, fail a third attempt */
4211                 if (req->io) {
4212                         pt->error = -EINVAL;
4213                         return;
4214                 }
4215                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
4216                 if (!poll) {
4217                         pt->error = -ENOMEM;
4218                         return;
4219                 }
4220                 io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
4221                 refcount_inc(&req->refs);
4222                 poll->wait.private = req;
4223                 req->io = (void *) poll;
4224         }
4225
4226         pt->error = 0;
4227         poll->head = head;
4228         add_wait_queue(head, &poll->wait);
4229 }
4230
4231 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
4232                                struct poll_table_struct *p)
4233 {
4234         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4235
4236         __io_queue_proc(&pt->req->apoll->poll, pt, head);
4237 }
4238
/*
 * Task-work callback for requests armed through async poll.
 *
 * If the poll has to be re-armed, nothing else happens. Otherwise the
 * request is unhashed (or, if it was already unhashed and marked canceled,
 * completed with -ECANCELED), the saved work item is restored, the
 * async_poll data is freed, and the request is either re-queued for issue
 * or, when canceled, failed and released.
 */
static void io_async_task_func(struct callback_head *cb)
{
        struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
        struct async_poll *apoll = req->apoll;
        struct io_ring_ctx *ctx = req->ctx;
        bool canceled = false;

        trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);

        /* io_poll_rewait() returns with ctx->completion_lock held */
        if (io_poll_rewait(req, &apoll->poll)) {
                spin_unlock_irq(&ctx->completion_lock);
                return;
        }

        /* If req is still hashed, it cannot have been canceled. Don't check. */
        if (hash_hashed(&req->hash_node)) {
                hash_del(&req->hash_node);
        } else {
                canceled = READ_ONCE(apoll->poll.canceled);
                if (canceled) {
                        io_cqring_fill_event(req, -ECANCELED);
                        io_commit_cqring(ctx);
                }
        }

        spin_unlock_irq(&ctx->completion_lock);

        /* restore ->work in case we need to retry again */
        if (req->flags & REQ_F_WORK_INITIALIZED)
                memcpy(&req->work, &apoll->work, sizeof(req->work));
        /* apoll must not be touched past this point */
        kfree(apoll);

        if (!canceled) {
                __set_current_state(TASK_RUNNING);
                mutex_lock(&ctx->uring_lock);
                __io_queue_sqe(req, NULL);
                mutex_unlock(&ctx->uring_lock);
        } else {
                /* the -ECANCELED event was filled above; finish the request */
                io_cqring_ev_posted(ctx);
                req_set_fail_links(req);
                io_double_put_req(req);
        }
}
4282
4283 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4284                         void *key)
4285 {
4286         struct io_kiocb *req = wait->private;
4287         struct io_poll_iocb *poll = &req->apoll->poll;
4288
4289         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
4290                                         key_to_poll(key));
4291
4292         return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
4293 }
4294
4295 static void io_poll_req_insert(struct io_kiocb *req)
4296 {
4297         struct io_ring_ctx *ctx = req->ctx;
4298         struct hlist_head *list;
4299
4300         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
4301         hlist_add_head(&req->hash_node, list);
4302 }
4303
4304 static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
4305                                       struct io_poll_iocb *poll,
4306                                       struct io_poll_table *ipt, __poll_t mask,
4307                                       wait_queue_func_t wake_func)
4308         __acquires(&ctx->completion_lock)
4309 {
4310         struct io_ring_ctx *ctx = req->ctx;
4311         bool cancel = false;
4312
4313         poll->file = req->file;
4314         io_init_poll_iocb(poll, mask, wake_func);
4315         poll->wait.private = req;
4316
4317         ipt->pt._key = mask;
4318         ipt->req = req;
4319         ipt->error = -EINVAL;
4320
4321         mask = vfs_poll(req->file, &ipt->pt) & poll->events;
4322
4323         spin_lock_irq(&ctx->completion_lock);
4324         if (likely(poll->head)) {
4325                 spin_lock(&poll->head->lock);
4326                 if (unlikely(list_empty(&poll->wait.entry))) {
4327                         if (ipt->error)
4328                                 cancel = true;
4329                         ipt->error = 0;
4330                         mask = 0;
4331                 }
4332                 if (mask || ipt->error)
4333                         list_del_init(&poll->wait.entry);
4334                 else if (cancel)
4335                         WRITE_ONCE(poll->canceled, true);
4336                 else if (!poll->done) /* actually waiting for an event */
4337                         io_poll_req_insert(req);
4338                 spin_unlock(&poll->head->lock);
4339         }
4340
4341         return mask;
4342 }
4343
4344 static bool io_arm_poll_handler(struct io_kiocb *req)
4345 {
4346         const struct io_op_def *def = &io_op_defs[req->opcode];
4347         struct io_ring_ctx *ctx = req->ctx;
4348         struct async_poll *apoll;
4349         struct io_poll_table ipt;
4350         __poll_t mask, ret;
4351         bool had_io;
4352
4353         if (!req->file || !file_can_poll(req->file))
4354                 return false;
4355         if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
4356                 return false;
4357         if (!def->pollin && !def->pollout)
4358                 return false;
4359
4360         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
4361         if (unlikely(!apoll))
4362                 return false;
4363
4364         req->flags |= REQ_F_POLLED;
4365         if (req->flags & REQ_F_WORK_INITIALIZED)
4366                 memcpy(&apoll->work, &req->work, sizeof(req->work));
4367         had_io = req->io != NULL;
4368
4369         get_task_struct(current);
4370         req->task = current;
4371         req->apoll = apoll;
4372         INIT_HLIST_NODE(&req->hash_node);
4373
4374         mask = 0;
4375         if (def->pollin)
4376                 mask |= POLLIN | POLLRDNORM;
4377         if (def->pollout)
4378                 mask |= POLLOUT | POLLWRNORM;
4379         mask |= POLLERR | POLLPRI;
4380
4381         ipt.pt._qproc = io_async_queue_proc;
4382
4383         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
4384                                         io_async_wake);
4385         if (ret) {
4386                 ipt.error = 0;
4387                 /* only remove double add if we did it here */
4388                 if (!had_io)
4389                         io_poll_remove_double(req);
4390                 spin_unlock_irq(&ctx->completion_lock);
4391                 if (req->flags & REQ_F_WORK_INITIALIZED)
4392                         memcpy(&req->work, &apoll->work, sizeof(req->work));
4393                 kfree(apoll);
4394                 return false;
4395         }
4396         spin_unlock_irq(&ctx->completion_lock);
4397         trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
4398                                         apoll->poll.events);
4399         return true;
4400 }
4401
4402 static bool __io_poll_remove_one(struct io_kiocb *req,
4403                                  struct io_poll_iocb *poll)
4404 {
4405         bool do_complete = false;
4406
4407         spin_lock(&poll->head->lock);
4408         WRITE_ONCE(poll->canceled, true);
4409         if (!list_empty(&poll->wait.entry)) {
4410                 list_del_init(&poll->wait.entry);
4411                 do_complete = true;
4412         }
4413         spin_unlock(&poll->head->lock);
4414         hash_del(&req->hash_node);
4415         return do_complete;
4416 }
4417
4418 static bool io_poll_remove_one(struct io_kiocb *req)
4419 {
4420         bool do_complete;
4421
4422         if (req->opcode == IORING_OP_POLL_ADD) {
4423                 io_poll_remove_double(req);
4424                 do_complete = __io_poll_remove_one(req, &req->poll);
4425         } else {
4426                 struct async_poll *apoll = req->apoll;
4427
4428                 /* non-poll requests have submit ref still */
4429                 do_complete = __io_poll_remove_one(req, &apoll->poll);
4430                 if (do_complete) {
4431                         io_put_req(req);
4432                         /*
4433                          * restore ->work because we will call
4434                          * io_req_work_drop_env below when dropping the
4435                          * final reference.
4436                          */
4437                         if (req->flags & REQ_F_WORK_INITIALIZED)
4438                                 memcpy(&req->work, &apoll->work,
4439                                        sizeof(req->work));
4440                         kfree(apoll);
4441                 }
4442         }
4443
4444         if (do_complete) {
4445                 io_cqring_fill_event(req, -ECANCELED);
4446                 io_commit_cqring(req->ctx);
4447                 req->flags |= REQ_F_COMP_LOCKED;
4448                 io_put_req(req);
4449         }
4450
4451         return do_complete;
4452 }
4453
4454 static void io_poll_remove_all(struct io_ring_ctx *ctx)
4455 {
4456         struct hlist_node *tmp;
4457         struct io_kiocb *req;
4458         int posted = 0, i;
4459
4460         spin_lock_irq(&ctx->completion_lock);
4461         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
4462                 struct hlist_head *list;
4463
4464                 list = &ctx->cancel_hash[i];
4465                 hlist_for_each_entry_safe(req, tmp, list, hash_node)
4466                         posted += io_poll_remove_one(req);
4467         }
4468         spin_unlock_irq(&ctx->completion_lock);
4469
4470         if (posted)
4471                 io_cqring_ev_posted(ctx);
4472 }
4473
4474 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
4475 {
4476         struct hlist_head *list;
4477         struct io_kiocb *req;
4478
4479         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
4480         hlist_for_each_entry(req, list, hash_node) {
4481                 if (sqe_addr != req->user_data)
4482                         continue;
4483                 if (io_poll_remove_one(req))
4484                         return 0;
4485                 return -EALREADY;
4486         }
4487
4488         return -ENOENT;
4489 }
4490
4491 static int io_poll_remove_prep(struct io_kiocb *req,
4492                                const struct io_uring_sqe *sqe)
4493 {
4494         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4495                 return -EINVAL;
4496         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
4497             sqe->poll_events)
4498                 return -EINVAL;
4499
4500         req->poll.addr = READ_ONCE(sqe->addr);
4501         return 0;
4502 }
4503
4504 /*
4505  * Find a running poll command that matches one specified in sqe->addr,
4506  * and remove it if found.
4507  */
4508 static int io_poll_remove(struct io_kiocb *req)
4509 {
4510         struct io_ring_ctx *ctx = req->ctx;
4511         u64 addr;
4512         int ret;
4513
4514         addr = req->poll.addr;
4515         spin_lock_irq(&ctx->completion_lock);
4516         ret = io_poll_cancel(ctx, addr);
4517         spin_unlock_irq(&ctx->completion_lock);
4518
4519         io_cqring_add_event(req, ret);
4520         if (ret < 0)
4521                 req_set_fail_links(req);
4522         io_put_req(req);
4523         return 0;
4524 }
4525
4526 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4527                         void *key)
4528 {
4529         struct io_kiocb *req = wait->private;
4530         struct io_poll_iocb *poll = &req->poll;
4531
4532         return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
4533 }
4534
4535 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
4536                                struct poll_table_struct *p)
4537 {
4538         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4539
4540         __io_queue_proc(&pt->req->poll, pt, head);
4541 }
4542
4543 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4544 {
4545         struct io_poll_iocb *poll = &req->poll;
4546         u16 events;
4547
4548         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4549                 return -EINVAL;
4550         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
4551                 return -EINVAL;
4552         if (!poll->file)
4553                 return -EBADF;
4554
4555         events = READ_ONCE(sqe->poll_events);
4556         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
4557
4558         get_task_struct(current);
4559         req->task = current;
4560         return 0;
4561 }
4562
4563 static int io_poll_add(struct io_kiocb *req)
4564 {
4565         struct io_poll_iocb *poll = &req->poll;
4566         struct io_ring_ctx *ctx = req->ctx;
4567         struct io_poll_table ipt;
4568         __poll_t mask;
4569
4570         INIT_HLIST_NODE(&req->hash_node);
4571         INIT_LIST_HEAD(&req->list);
4572         ipt.pt._qproc = io_poll_queue_proc;
4573
4574         mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
4575                                         io_poll_wake);
4576
4577         if (mask) { /* no async, we'd stolen it */
4578                 ipt.error = 0;
4579                 io_poll_complete(req, mask, 0);
4580         }
4581         spin_unlock_irq(&ctx->completion_lock);
4582
4583         if (mask) {
4584                 io_cqring_ev_posted(ctx);
4585                 io_put_req(req);
4586         }
4587         return ipt.error;
4588 }
4589
4590 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
4591 {
4592         struct io_timeout_data *data = container_of(timer,
4593                                                 struct io_timeout_data, timer);
4594         struct io_kiocb *req = data->req;
4595         struct io_ring_ctx *ctx = req->ctx;
4596         unsigned long flags;
4597
4598         atomic_inc(&ctx->cq_timeouts);
4599
4600         spin_lock_irqsave(&ctx->completion_lock, flags);
4601         /*
4602          * We could be racing with timeout deletion. If the list is empty,
4603          * then timeout lookup already found it and will be handling it.
4604          */
4605         if (!list_empty(&req->list))
4606                 list_del_init(&req->list);
4607
4608         io_cqring_fill_event(req, -ETIME);
4609         io_commit_cqring(ctx);
4610         spin_unlock_irqrestore(&ctx->completion_lock, flags);
4611
4612         io_cqring_ev_posted(ctx);
4613         req_set_fail_links(req);
4614         io_put_req(req);
4615         return HRTIMER_NORESTART;
4616 }
4617
4618 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
4619 {
4620         struct io_kiocb *req;
4621         int ret = -ENOENT;
4622
4623         list_for_each_entry(req, &ctx->timeout_list, list) {
4624                 if (user_data == req->user_data) {
4625                         list_del_init(&req->list);
4626                         ret = 0;
4627                         break;
4628                 }
4629         }
4630
4631         if (ret == -ENOENT)
4632                 return ret;
4633
4634         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
4635         if (ret == -1)
4636                 return -EALREADY;
4637
4638         req_set_fail_links(req);
4639         io_cqring_fill_event(req, -ECANCELED);
4640         io_put_req(req);
4641         return 0;
4642 }
4643
4644 static int io_timeout_remove_prep(struct io_kiocb *req,
4645                                   const struct io_uring_sqe *sqe)
4646 {
4647         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4648                 return -EINVAL;
4649         if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
4650                 return -EINVAL;
4651
4652         req->timeout.addr = READ_ONCE(sqe->addr);
4653         req->timeout.flags = READ_ONCE(sqe->timeout_flags);
4654         if (req->timeout.flags)
4655                 return -EINVAL;
4656
4657         return 0;
4658 }
4659
4660 /*
4661  * Remove or update an existing timeout command
4662  */
4663 static int io_timeout_remove(struct io_kiocb *req)
4664 {
4665         struct io_ring_ctx *ctx = req->ctx;
4666         int ret;
4667
4668         spin_lock_irq(&ctx->completion_lock);
4669         ret = io_timeout_cancel(ctx, req->timeout.addr);
4670
4671         io_cqring_fill_event(req, ret);
4672         io_commit_cqring(ctx);
4673         spin_unlock_irq(&ctx->completion_lock);
4674         io_cqring_ev_posted(ctx);
4675         if (ret < 0)
4676                 req_set_fail_links(req);
4677         io_put_req(req);
4678         return 0;
4679 }
4680
4681 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
4682                            bool is_timeout_link)
4683 {
4684         struct io_timeout_data *data;
4685         unsigned flags;
4686         u32 off = READ_ONCE(sqe->off);
4687
4688         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4689                 return -EINVAL;
4690         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
4691                 return -EINVAL;
4692         if (off && is_timeout_link)
4693                 return -EINVAL;
4694         flags = READ_ONCE(sqe->timeout_flags);
4695         if (flags & ~IORING_TIMEOUT_ABS)
4696                 return -EINVAL;
4697
4698         req->timeout.off = off;
4699
4700         if (!req->io && io_alloc_async_ctx(req))
4701                 return -ENOMEM;
4702
4703         data = &req->io->timeout;
4704         data->req = req;
4705         req->flags |= REQ_F_TIMEOUT;
4706
4707         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
4708                 return -EFAULT;
4709
4710         if (flags & IORING_TIMEOUT_ABS)
4711                 data->mode = HRTIMER_MODE_ABS;
4712         else
4713                 data->mode = HRTIMER_MODE_REL;
4714
4715         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
4716         return 0;
4717 }
4718
4719 static int io_timeout(struct io_kiocb *req)
4720 {
4721         struct io_ring_ctx *ctx = req->ctx;
4722         struct io_timeout_data *data = &req->io->timeout;
4723         struct list_head *entry;
4724         u32 tail, off = req->timeout.off;
4725
4726         spin_lock_irq(&ctx->completion_lock);
4727
4728         /*
4729          * sqe->off holds how many events that need to occur for this
4730          * timeout event to be satisfied. If it isn't set, then this is
4731          * a pure timeout request, sequence isn't used.
4732          */
4733         if (!off) {
4734                 req->flags |= REQ_F_TIMEOUT_NOSEQ;
4735                 entry = ctx->timeout_list.prev;
4736                 goto add;
4737         }
4738
4739         tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
4740         req->timeout.target_seq = tail + off;
4741
4742         /*
4743          * Insertion sort, ensuring the first entry in the list is always
4744          * the one we need first.
4745          */
4746         list_for_each_prev(entry, &ctx->timeout_list) {
4747                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
4748
4749                 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
4750                         continue;
4751                 /* nxt.seq is behind @tail, otherwise would've been completed */
4752                 if (off >= nxt->timeout.target_seq - tail)
4753                         break;
4754         }
4755 add:
4756         list_add(&req->list, entry);
4757         data->timer.function = io_timeout_fn;
4758         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
4759         spin_unlock_irq(&ctx->completion_lock);
4760         return 0;
4761 }
4762
4763 static bool io_cancel_cb(struct io_wq_work *work, void *data)
4764 {
4765         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4766
4767         return req->user_data == (unsigned long) data;
4768 }
4769
4770 static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
4771 {
4772         enum io_wq_cancel cancel_ret;
4773         int ret = 0;
4774
4775         cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
4776         switch (cancel_ret) {
4777         case IO_WQ_CANCEL_OK:
4778                 ret = 0;
4779                 break;
4780         case IO_WQ_CANCEL_RUNNING:
4781                 ret = -EALREADY;
4782                 break;
4783         case IO_WQ_CANCEL_NOTFOUND:
4784                 ret = -ENOENT;
4785                 break;
4786         }
4787
4788         return ret;
4789 }
4790
4791 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
4792                                      struct io_kiocb *req, __u64 sqe_addr,
4793                                      int success_ret)
4794 {
4795         unsigned long flags;
4796         int ret;
4797
4798         ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
4799         if (ret != -ENOENT) {
4800                 spin_lock_irqsave(&ctx->completion_lock, flags);
4801                 goto done;
4802         }
4803
4804         spin_lock_irqsave(&ctx->completion_lock, flags);
4805         ret = io_timeout_cancel(ctx, sqe_addr);
4806         if (ret != -ENOENT)
4807                 goto done;
4808         ret = io_poll_cancel(ctx, sqe_addr);
4809 done:
4810         if (!ret)
4811                 ret = success_ret;
4812         io_cqring_fill_event(req, ret);
4813         io_commit_cqring(ctx);
4814         spin_unlock_irqrestore(&ctx->completion_lock, flags);
4815         io_cqring_ev_posted(ctx);
4816
4817         if (ret < 0)
4818                 req_set_fail_links(req);
4819         io_put_req(req);
4820 }
4821
4822 static int io_async_cancel_prep(struct io_kiocb *req,
4823                                 const struct io_uring_sqe *sqe)
4824 {
4825         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4826                 return -EINVAL;
4827         if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
4828             sqe->cancel_flags)
4829                 return -EINVAL;
4830
4831         req->cancel.addr = READ_ONCE(sqe->addr);
4832         return 0;
4833 }
4834
4835 static int io_async_cancel(struct io_kiocb *req)
4836 {
4837         struct io_ring_ctx *ctx = req->ctx;
4838
4839         io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
4840         return 0;
4841 }
4842
4843 static int io_files_update_prep(struct io_kiocb *req,
4844                                 const struct io_uring_sqe *sqe)
4845 {
4846         if (sqe->flags || sqe->ioprio || sqe->rw_flags)
4847                 return -EINVAL;
4848
4849         req->files_update.offset = READ_ONCE(sqe->off);
4850         req->files_update.nr_args = READ_ONCE(sqe->len);
4851         if (!req->files_update.nr_args)
4852                 return -EINVAL;
4853         req->files_update.arg = READ_ONCE(sqe->addr);
4854         return 0;
4855 }
4856
4857 static int io_files_update(struct io_kiocb *req, bool force_nonblock)
4858 {
4859         struct io_ring_ctx *ctx = req->ctx;
4860         struct io_uring_files_update up;
4861         int ret;
4862
4863         if (force_nonblock)
4864                 return -EAGAIN;
4865
4866         up.offset = req->files_update.offset;
4867         up.fds = req->files_update.arg;
4868
4869         mutex_lock(&ctx->uring_lock);
4870         ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
4871         mutex_unlock(&ctx->uring_lock);
4872
4873         if (ret < 0)
4874                 req_set_fail_links(req);
4875         io_cqring_add_event(req, ret);
4876         io_put_req(req);
4877         return 0;
4878 }
4879
4880 static int io_req_defer_prep(struct io_kiocb *req,
4881                              const struct io_uring_sqe *sqe)
4882 {
4883         ssize_t ret = 0;
4884
4885         if (!sqe)
4886                 return 0;
4887
4888         io_req_init_async(req);
4889
4890         if (io_op_defs[req->opcode].file_table) {
4891                 ret = io_grab_files(req);
4892                 if (unlikely(ret))
4893                         return ret;
4894         }
4895
4896         io_req_work_grab_env(req, &io_op_defs[req->opcode]);
4897
4898         switch (req->opcode) {
4899         case IORING_OP_NOP:
4900                 break;
4901         case IORING_OP_READV:
4902         case IORING_OP_READ_FIXED:
4903         case IORING_OP_READ:
4904                 ret = io_read_prep(req, sqe, true);
4905                 break;
4906         case IORING_OP_WRITEV:
4907         case IORING_OP_WRITE_FIXED:
4908         case IORING_OP_WRITE:
4909                 ret = io_write_prep(req, sqe, true);
4910                 break;
4911         case IORING_OP_POLL_ADD:
4912                 ret = io_poll_add_prep(req, sqe);
4913                 break;
4914         case IORING_OP_POLL_REMOVE:
4915                 ret = io_poll_remove_prep(req, sqe);
4916                 break;
4917         case IORING_OP_FSYNC:
4918                 ret = io_prep_fsync(req, sqe);
4919                 break;
4920         case IORING_OP_SYNC_FILE_RANGE:
4921                 ret = io_prep_sfr(req, sqe);
4922                 break;
4923         case IORING_OP_SENDMSG:
4924         case IORING_OP_SEND:
4925                 ret = io_sendmsg_prep(req, sqe);
4926                 break;
4927         case IORING_OP_RECVMSG:
4928         case IORING_OP_RECV:
4929                 ret = io_recvmsg_prep(req, sqe);
4930                 break;
4931         case IORING_OP_CONNECT:
4932                 ret = io_connect_prep(req, sqe);
4933                 break;
4934         case IORING_OP_TIMEOUT:
4935                 ret = io_timeout_prep(req, sqe, false);
4936                 break;
4937         case IORING_OP_TIMEOUT_REMOVE:
4938                 ret = io_timeout_remove_prep(req, sqe);
4939                 break;
4940         case IORING_OP_ASYNC_CANCEL:
4941                 ret = io_async_cancel_prep(req, sqe);
4942                 break;
4943         case IORING_OP_LINK_TIMEOUT:
4944                 ret = io_timeout_prep(req, sqe, true);
4945                 break;
4946         case IORING_OP_ACCEPT:
4947                 ret = io_accept_prep(req, sqe);
4948                 break;
4949         case IORING_OP_FALLOCATE:
4950                 ret = io_fallocate_prep(req, sqe);
4951                 break;
4952         case IORING_OP_OPENAT:
4953                 ret = io_openat_prep(req, sqe);
4954                 break;
4955         case IORING_OP_CLOSE:
4956                 ret = io_close_prep(req, sqe);
4957                 break;
4958         case IORING_OP_FILES_UPDATE:
4959                 ret = io_files_update_prep(req, sqe);
4960                 break;
4961         case IORING_OP_STATX:
4962                 ret = io_statx_prep(req, sqe);
4963                 break;
4964         case IORING_OP_FADVISE:
4965                 ret = io_fadvise_prep(req, sqe);
4966                 break;
4967         case IORING_OP_MADVISE:
4968                 ret = io_madvise_prep(req, sqe);
4969                 break;
4970         case IORING_OP_OPENAT2:
4971                 ret = io_openat2_prep(req, sqe);
4972                 break;
4973         case IORING_OP_EPOLL_CTL:
4974                 ret = io_epoll_ctl_prep(req, sqe);
4975                 break;
4976         case IORING_OP_SPLICE:
4977                 ret = io_splice_prep(req, sqe);
4978                 break;
4979         case IORING_OP_PROVIDE_BUFFERS:
4980                 ret = io_provide_buffers_prep(req, sqe);
4981                 break;
4982         case IORING_OP_REMOVE_BUFFERS:
4983                 ret = io_remove_buffers_prep(req, sqe);
4984                 break;
4985         case IORING_OP_TEE:
4986                 ret = io_tee_prep(req, sqe);
4987                 break;
4988         default:
4989                 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
4990                                 req->opcode);
4991                 ret = -EINVAL;
4992                 break;
4993         }
4994
4995         return ret;
4996 }
4997
4998 static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4999 {
5000         struct io_ring_ctx *ctx = req->ctx;
5001         int ret;
5002
5003         /* Still need defer if there is pending req in defer list. */
5004         if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list))
5005                 return 0;
5006
5007         if (!req->io) {
5008                 if (io_alloc_async_ctx(req))
5009                         return -EAGAIN;
5010                 ret = io_req_defer_prep(req, sqe);
5011                 if (ret < 0)
5012                         return ret;
5013         }
5014
5015         spin_lock_irq(&ctx->completion_lock);
5016         if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
5017                 spin_unlock_irq(&ctx->completion_lock);
5018                 return 0;
5019         }
5020
5021         trace_io_uring_defer(ctx, req, req->user_data);
5022         list_add_tail(&req->list, &ctx->defer_list);
5023         spin_unlock_irq(&ctx->completion_lock);
5024         return -EIOCBQUEUED;
5025 }
5026
5027 static void io_cleanup_req(struct io_kiocb *req)
5028 {
5029         struct io_async_ctx *io = req->io;
5030
5031         switch (req->opcode) {
5032         case IORING_OP_READV:
5033         case IORING_OP_READ_FIXED:
5034         case IORING_OP_READ:
5035                 if (req->flags & REQ_F_BUFFER_SELECTED)
5036                         kfree((void *)(unsigned long)req->rw.addr);
5037                 /* fallthrough */
5038         case IORING_OP_WRITEV:
5039         case IORING_OP_WRITE_FIXED:
5040         case IORING_OP_WRITE:
5041                 if (io->rw.iov != io->rw.fast_iov)
5042                         kfree(io->rw.iov);
5043                 break;
5044         case IORING_OP_RECVMSG:
5045                 if (req->flags & REQ_F_BUFFER_SELECTED)
5046                         kfree(req->sr_msg.kbuf);
5047                 /* fallthrough */
5048         case IORING_OP_SENDMSG:
5049                 if (io->msg.iov != io->msg.fast_iov)
5050                         kfree(io->msg.iov);
5051                 break;
5052         case IORING_OP_RECV:
5053                 if (req->flags & REQ_F_BUFFER_SELECTED)
5054                         kfree(req->sr_msg.kbuf);
5055                 break;
5056         case IORING_OP_OPENAT:
5057         case IORING_OP_OPENAT2:
5058                 break;
5059         case IORING_OP_SPLICE:
5060         case IORING_OP_TEE:
5061                 io_put_file(req, req->splice.file_in,
5062                             (req->splice.flags & SPLICE_F_FD_IN_FIXED));
5063                 break;
5064         }
5065
5066         req->flags &= ~REQ_F_NEED_CLEANUP;
5067 }
5068
5069 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5070                         bool force_nonblock)
5071 {
5072         struct io_ring_ctx *ctx = req->ctx;
5073         int ret;
5074
5075         switch (req->opcode) {
5076         case IORING_OP_NOP:
5077                 ret = io_nop(req);
5078                 break;
5079         case IORING_OP_READV:
5080         case IORING_OP_READ_FIXED:
5081         case IORING_OP_READ:
5082                 if (sqe) {
5083                         ret = io_read_prep(req, sqe, force_nonblock);
5084                         if (ret < 0)
5085                                 break;
5086                 }
5087                 ret = io_read(req, force_nonblock);
5088                 break;
5089         case IORING_OP_WRITEV:
5090         case IORING_OP_WRITE_FIXED:
5091         case IORING_OP_WRITE:
5092                 if (sqe) {
5093                         ret = io_write_prep(req, sqe, force_nonblock);
5094                         if (ret < 0)
5095                                 break;
5096                 }
5097                 ret = io_write(req, force_nonblock);
5098                 break;
5099         case IORING_OP_FSYNC:
5100                 if (sqe) {
5101                         ret = io_prep_fsync(req, sqe);
5102                         if (ret < 0)
5103                                 break;
5104                 }
5105                 ret = io_fsync(req, force_nonblock);
5106                 break;
5107         case IORING_OP_POLL_ADD:
5108                 if (sqe) {
5109                         ret = io_poll_add_prep(req, sqe);
5110                         if (ret)
5111                                 break;
5112                 }
5113                 ret = io_poll_add(req);
5114                 break;
5115         case IORING_OP_POLL_REMOVE:
5116                 if (sqe) {
5117                         ret = io_poll_remove_prep(req, sqe);
5118                         if (ret < 0)
5119                                 break;
5120                 }
5121                 ret = io_poll_remove(req);
5122                 break;
5123         case IORING_OP_SYNC_FILE_RANGE:
5124                 if (sqe) {
5125                         ret = io_prep_sfr(req, sqe);
5126                         if (ret < 0)
5127                                 break;
5128                 }
5129                 ret = io_sync_file_range(req, force_nonblock);
5130                 break;
5131         case IORING_OP_SENDMSG:
5132         case IORING_OP_SEND:
5133                 if (sqe) {
5134                         ret = io_sendmsg_prep(req, sqe);
5135                         if (ret < 0)
5136                                 break;
5137                 }
5138                 if (req->opcode == IORING_OP_SENDMSG)
5139                         ret = io_sendmsg(req, force_nonblock);
5140                 else
5141                         ret = io_send(req, force_nonblock);
5142                 break;
5143         case IORING_OP_RECVMSG:
5144         case IORING_OP_RECV:
5145                 if (sqe) {
5146                         ret = io_recvmsg_prep(req, sqe);
5147                         if (ret)
5148                                 break;
5149                 }
5150                 if (req->opcode == IORING_OP_RECVMSG)
5151                         ret = io_recvmsg(req, force_nonblock);
5152                 else
5153                         ret = io_recv(req, force_nonblock);
5154                 break;
5155         case IORING_OP_TIMEOUT:
5156                 if (sqe) {
5157                         ret = io_timeout_prep(req, sqe, false);
5158                         if (ret)
5159                                 break;
5160                 }
5161                 ret = io_timeout(req);
5162                 break;
5163         case IORING_OP_TIMEOUT_REMOVE:
5164                 if (sqe) {
5165                         ret = io_timeout_remove_prep(req, sqe);
5166                         if (ret)
5167                                 break;
5168                 }
5169                 ret = io_timeout_remove(req);
5170                 break;
5171         case IORING_OP_ACCEPT:
5172                 if (sqe) {
5173                         ret = io_accept_prep(req, sqe);
5174                         if (ret)
5175                                 break;
5176                 }
5177                 ret = io_accept(req, force_nonblock);
5178                 break;
5179         case IORING_OP_CONNECT:
5180                 if (sqe) {
5181                         ret = io_connect_prep(req, sqe);
5182                         if (ret)
5183                                 break;
5184                 }
5185                 ret = io_connect(req, force_nonblock);
5186                 break;
5187         case IORING_OP_ASYNC_CANCEL:
5188                 if (sqe) {
5189                         ret = io_async_cancel_prep(req, sqe);
5190                         if (ret)
5191                                 break;
5192                 }
5193                 ret = io_async_cancel(req);
5194                 break;
5195         case IORING_OP_FALLOCATE:
5196                 if (sqe) {
5197                         ret = io_fallocate_prep(req, sqe);
5198                         if (ret)
5199                                 break;
5200                 }
5201                 ret = io_fallocate(req, force_nonblock);
5202                 break;
5203         case IORING_OP_OPENAT:
5204                 if (sqe) {
5205                         ret = io_openat_prep(req, sqe);
5206                         if (ret)
5207                                 break;
5208                 }
5209                 ret = io_openat(req, force_nonblock);
5210                 break;
5211         case IORING_OP_CLOSE:
5212                 if (sqe) {
5213                         ret = io_close_prep(req, sqe);
5214                         if (ret)
5215                                 break;
5216                 }
5217                 ret = io_close(req, force_nonblock);
5218                 break;
5219         case IORING_OP_FILES_UPDATE:
5220                 if (sqe) {
5221                         ret = io_files_update_prep(req, sqe);
5222                         if (ret)
5223                                 break;
5224                 }
5225                 ret = io_files_update(req, force_nonblock);
5226                 break;
5227         case IORING_OP_STATX:
5228                 if (sqe) {
5229                         ret = io_statx_prep(req, sqe);
5230                         if (ret)
5231                                 break;
5232                 }
5233                 ret = io_statx(req, force_nonblock);
5234                 break;
5235         case IORING_OP_FADVISE:
5236                 if (sqe) {
5237                         ret = io_fadvise_prep(req, sqe);
5238                         if (ret)
5239                                 break;
5240                 }
5241                 ret = io_fadvise(req, force_nonblock);
5242                 break;
5243         case IORING_OP_MADVISE:
5244                 if (sqe) {
5245                         ret = io_madvise_prep(req, sqe);
5246                         if (ret)
5247                                 break;
5248                 }
5249                 ret = io_madvise(req, force_nonblock);
5250                 break;
5251         case IORING_OP_OPENAT2:
5252                 if (sqe) {
5253                         ret = io_openat2_prep(req, sqe);
5254                         if (ret)
5255                                 break;
5256                 }
5257                 ret = io_openat2(req, force_nonblock);
5258                 break;
5259         case IORING_OP_EPOLL_CTL:
5260                 if (sqe) {
5261                         ret = io_epoll_ctl_prep(req, sqe);
5262                         if (ret)
5263                                 break;
5264                 }
5265                 ret = io_epoll_ctl(req, force_nonblock);
5266                 break;
5267         case IORING_OP_SPLICE:
5268                 if (sqe) {
5269                         ret = io_splice_prep(req, sqe);
5270                         if (ret < 0)
5271                                 break;
5272                 }
5273                 ret = io_splice(req, force_nonblock);
5274                 break;
5275         case IORING_OP_PROVIDE_BUFFERS:
5276                 if (sqe) {
5277                         ret = io_provide_buffers_prep(req, sqe);
5278                         if (ret)
5279                                 break;
5280                 }
5281                 ret = io_provide_buffers(req, force_nonblock);
5282                 break;
5283         case IORING_OP_REMOVE_BUFFERS:
5284                 if (sqe) {
5285                         ret = io_remove_buffers_prep(req, sqe);
5286                         if (ret)
5287                                 break;
5288                 }
5289                 ret = io_remove_buffers(req, force_nonblock);
5290                 break;
5291         case IORING_OP_TEE:
5292                 if (sqe) {
5293                         ret = io_tee_prep(req, sqe);
5294                         if (ret < 0)
5295                                 break;
5296                 }
5297                 ret = io_tee(req, force_nonblock);
5298                 break;
5299         default:
5300                 ret = -EINVAL;
5301                 break;
5302         }
5303
5304         if (ret)
5305                 return ret;
5306
5307         /* If the op doesn't have a file, we're not polling for it */
5308         if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
5309                 const bool in_async = io_wq_current_is_worker();
5310
5311                 if (req->result == -EAGAIN)
5312                         return -EAGAIN;
5313
5314                 /* workqueue context doesn't hold uring_lock, grab it now */
5315                 if (in_async)
5316                         mutex_lock(&ctx->uring_lock);
5317
5318                 io_iopoll_req_issued(req);
5319
5320                 if (in_async)
5321                         mutex_unlock(&ctx->uring_lock);
5322         }
5323
5324         return 0;
5325 }
5326
5327 static void io_arm_async_linked_timeout(struct io_kiocb *req)
5328 {
5329         struct io_kiocb *link;
5330
5331         /* link head's timeout is queued in io_queue_async_work() */
5332         if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
5333                 return;
5334
5335         link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
5336         io_queue_linked_timeout(link);
5337 }
5338
5339 static void io_wq_submit_work(struct io_wq_work **workptr)
5340 {
5341         struct io_wq_work *work = *workptr;
5342         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5343         int ret = 0;
5344
5345         io_arm_async_linked_timeout(req);
5346
5347         /* if NO_CANCEL is set, we must still run the work */
5348         if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
5349                                 IO_WQ_WORK_CANCEL) {
5350                 ret = -ECANCELED;
5351         }
5352
5353         if (!ret) {
5354                 do {
5355                         ret = io_issue_sqe(req, NULL, false);
5356                         /*
5357                          * We can get EAGAIN for polled IO even though we're
5358                          * forcing a sync submission from here, since we can't
5359                          * wait for request slots on the block side.
5360                          */
5361                         if (ret != -EAGAIN)
5362                                 break;
5363                         cond_resched();
5364                 } while (1);
5365         }
5366
5367         if (ret) {
5368                 req_set_fail_links(req);
5369                 io_cqring_add_event(req, ret);
5370                 io_put_req(req);
5371         }
5372
5373         io_steal_work(req, workptr);
5374 }
5375
5376 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
5377                                               int index)
5378 {
5379         struct fixed_file_table *table;
5380
5381         table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
5382         return table->files[index & IORING_FILE_TABLE_MASK];
5383 }
5384
5385 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
5386                         int fd, struct file **out_file, bool fixed)
5387 {
5388         struct io_ring_ctx *ctx = req->ctx;
5389         struct file *file;
5390
5391         if (fixed) {
5392                 if (unlikely(!ctx->file_data ||
5393                     (unsigned) fd >= ctx->nr_user_files))
5394                         return -EBADF;
5395                 fd = array_index_nospec(fd, ctx->nr_user_files);
5396                 file = io_file_from_index(ctx, fd);
5397                 if (file) {
5398                         req->fixed_file_refs = ctx->file_data->cur_refs;
5399                         percpu_ref_get(req->fixed_file_refs);
5400                 }
5401         } else {
5402                 trace_io_uring_file_get(ctx, fd);
5403                 file = __io_file_get(state, fd);
5404         }
5405
5406         if (file || io_op_defs[req->opcode].needs_file_no_error) {
5407                 *out_file = file;
5408                 return 0;
5409         }
5410         return -EBADF;
5411 }
5412
5413 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
5414                            int fd)
5415 {
5416         bool fixed;
5417
5418         fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
5419         if (unlikely(!fixed && io_async_submit(req->ctx)))
5420                 return -EBADF;
5421
5422         return io_file_get(state, req, fd, &req->file, fixed);
5423 }
5424
5425 static int io_grab_files(struct io_kiocb *req)
5426 {
5427         int ret = -EBADF;
5428         struct io_ring_ctx *ctx = req->ctx;
5429
5430         if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
5431                 return 0;
5432         if (!ctx->ring_file)
5433                 return -EBADF;
5434
5435         rcu_read_lock();
5436         spin_lock_irq(&ctx->inflight_lock);
5437         /*
5438          * We use the f_ops->flush() handler to ensure that we can flush
5439          * out work accessing these files if the fd is closed. Check if
5440          * the fd has changed since we started down this path, and disallow
5441          * this operation if it has.
5442          */
5443         if (fcheck(ctx->ring_fd) == ctx->ring_file) {
5444                 list_add(&req->inflight_entry, &ctx->inflight_list);
5445                 req->flags |= REQ_F_INFLIGHT;
5446                 req->work.files = current->files;
5447                 ret = 0;
5448         }
5449         spin_unlock_irq(&ctx->inflight_lock);
5450         rcu_read_unlock();
5451
5452         return ret;
5453 }
5454
5455 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
5456 {
5457         struct io_timeout_data *data = container_of(timer,
5458                                                 struct io_timeout_data, timer);
5459         struct io_kiocb *req = data->req;
5460         struct io_ring_ctx *ctx = req->ctx;
5461         struct io_kiocb *prev = NULL;
5462         unsigned long flags;
5463
5464         spin_lock_irqsave(&ctx->completion_lock, flags);
5465
5466         /*
5467          * We don't expect the list to be empty, that will only happen if we
5468          * race with the completion of the linked work.
5469          */
5470         if (!list_empty(&req->link_list)) {
5471                 prev = list_entry(req->link_list.prev, struct io_kiocb,
5472                                   link_list);
5473                 if (refcount_inc_not_zero(&prev->refs)) {
5474                         list_del_init(&req->link_list);
5475                         prev->flags &= ~REQ_F_LINK_TIMEOUT;
5476                 } else
5477                         prev = NULL;
5478         }
5479
5480         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5481
5482         if (prev) {
5483                 req_set_fail_links(prev);
5484                 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
5485                 io_put_req(prev);
5486         } else {
5487                 io_cqring_add_event(req, -ETIME);
5488                 io_put_req(req);
5489         }
5490         return HRTIMER_NORESTART;
5491 }
5492
5493 static void io_queue_linked_timeout(struct io_kiocb *req)
5494 {
5495         struct io_ring_ctx *ctx = req->ctx;
5496
5497         /*
5498          * If the list is now empty, then our linked request finished before
5499          * we got a chance to setup the timer
5500          */
5501         spin_lock_irq(&ctx->completion_lock);
5502         if (!list_empty(&req->link_list)) {
5503                 struct io_timeout_data *data = &req->io->timeout;
5504
5505                 data->timer.function = io_link_timeout_fn;
5506                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
5507                                 data->mode);
5508         }
5509         spin_unlock_irq(&ctx->completion_lock);
5510
5511         /* drop submission reference */
5512         io_put_req(req);
5513 }
5514
5515 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
5516 {
5517         struct io_kiocb *nxt;
5518
5519         if (!(req->flags & REQ_F_LINK_HEAD))
5520                 return NULL;
5521         /* for polled retry, if flag is set, we already went through here */
5522         if (req->flags & REQ_F_POLLED)
5523                 return NULL;
5524
5525         nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
5526                                         link_list);
5527         if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
5528                 return NULL;
5529
5530         req->flags |= REQ_F_LINK_TIMEOUT;
5531         return nxt;
5532 }
5533
5534 static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5535 {
5536         struct io_kiocb *linked_timeout;
5537         struct io_kiocb *nxt;
5538         const struct cred *old_creds = NULL;
5539         int ret;
5540
5541 again:
5542         linked_timeout = io_prep_linked_timeout(req);
5543
5544         if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
5545             req->work.creds != current_cred()) {
5546                 if (old_creds)
5547                         revert_creds(old_creds);
5548                 if (old_creds == req->work.creds)
5549                         old_creds = NULL; /* restored original creds */
5550                 else
5551                         old_creds = override_creds(req->work.creds);
5552         }
5553
5554         ret = io_issue_sqe(req, sqe, true);
5555
5556         /*
5557          * We async punt it if the file wasn't marked NOWAIT, or if the file
5558          * doesn't support non-blocking read/write attempts
5559          */
5560         if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
5561             (req->flags & REQ_F_MUST_PUNT))) {
5562                 if (io_arm_poll_handler(req)) {
5563                         if (linked_timeout)
5564                                 io_queue_linked_timeout(linked_timeout);
5565                         goto exit;
5566                 }
5567 punt:
5568                 io_req_init_async(req);
5569
5570                 if (io_op_defs[req->opcode].file_table) {
5571                         ret = io_grab_files(req);
5572                         if (ret)
5573                                 goto err;
5574                 }
5575
5576                 /*
5577                  * Queued up for async execution, worker will release
5578                  * submit reference when the iocb is actually submitted.
5579                  */
5580                 io_queue_async_work(req);
5581                 goto exit;
5582         }
5583
5584 err:
5585         nxt = NULL;
5586         /* drop submission reference */
5587         io_put_req_find_next(req, &nxt);
5588
5589         if (linked_timeout) {
5590                 if (!ret)
5591                         io_queue_linked_timeout(linked_timeout);
5592                 else
5593                         io_put_req(linked_timeout);
5594         }
5595
5596         /* and drop final reference, if we failed */
5597         if (ret) {
5598                 io_cqring_add_event(req, ret);
5599                 req_set_fail_links(req);
5600                 io_put_req(req);
5601         }
5602         if (nxt) {
5603                 req = nxt;
5604
5605                 if (req->flags & REQ_F_FORCE_ASYNC)
5606                         goto punt;
5607                 goto again;
5608         }
5609 exit:
5610         if (old_creds)
5611                 revert_creds(old_creds);
5612 }
5613
5614 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5615 {
5616         int ret;
5617
5618         ret = io_req_defer(req, sqe);
5619         if (ret) {
5620                 if (ret != -EIOCBQUEUED) {
5621 fail_req:
5622                         io_cqring_add_event(req, ret);
5623                         req_set_fail_links(req);
5624                         io_double_put_req(req);
5625                 }
5626         } else if (req->flags & REQ_F_FORCE_ASYNC) {
5627                 if (!req->io) {
5628                         ret = -EAGAIN;
5629                         if (io_alloc_async_ctx(req))
5630                                 goto fail_req;
5631                         ret = io_req_defer_prep(req, sqe);
5632                         if (unlikely(ret < 0))
5633                                 goto fail_req;
5634                 }
5635
5636                 /*
5637                  * Never try inline submit of IOSQE_ASYNC is set, go straight
5638                  * to async execution.
5639                  */
5640                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
5641                 io_queue_async_work(req);
5642         } else {
5643                 __io_queue_sqe(req, sqe);
5644         }
5645 }
5646
5647 static inline void io_queue_link_head(struct io_kiocb *req)
5648 {
5649         if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
5650                 io_cqring_add_event(req, -ECANCELED);
5651                 io_double_put_req(req);
5652         } else
5653                 io_queue_sqe(req, NULL);
5654 }
5655
5656 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5657                          struct io_kiocb **link)
5658 {
5659         struct io_ring_ctx *ctx = req->ctx;
5660         int ret;
5661
5662         /*
5663          * If we already have a head request, queue this one for async
5664          * submittal once the head completes. If we don't have a head but
5665          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
5666          * submitted sync once the chain is complete. If none of those
5667          * conditions are true (normal request), then just queue it.
5668          */
5669         if (*link) {
5670                 struct io_kiocb *head = *link;
5671
5672                 /*
5673                  * Taking sequential execution of a link, draining both sides
5674                  * of the link also fullfils IOSQE_IO_DRAIN semantics for all
5675                  * requests in the link. So, it drains the head and the
5676                  * next after the link request. The last one is done via
5677                  * drain_next flag to persist the effect across calls.
5678                  */
5679                 if (req->flags & REQ_F_IO_DRAIN) {
5680                         head->flags |= REQ_F_IO_DRAIN;
5681                         ctx->drain_next = 1;
5682                 }
5683                 if (io_alloc_async_ctx(req))
5684                         return -EAGAIN;
5685
5686                 ret = io_req_defer_prep(req, sqe);
5687                 if (ret) {
5688                         /* fail even hard links since we don't submit */
5689                         head->flags |= REQ_F_FAIL_LINK;
5690                         return ret;
5691                 }
5692                 trace_io_uring_link(ctx, req, head);
5693                 list_add_tail(&req->link_list, &head->link_list);
5694
5695                 /* last request of a link, enqueue the link */
5696                 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
5697                         io_queue_link_head(head);
5698                         *link = NULL;
5699                 }
5700         } else {
5701                 if (unlikely(ctx->drain_next)) {
5702                         req->flags |= REQ_F_IO_DRAIN;
5703                         ctx->drain_next = 0;
5704                 }
5705                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
5706                         req->flags |= REQ_F_LINK_HEAD;
5707                         INIT_LIST_HEAD(&req->link_list);
5708
5709                         if (io_alloc_async_ctx(req))
5710                                 return -EAGAIN;
5711
5712                         ret = io_req_defer_prep(req, sqe);
5713                         if (ret)
5714                                 req->flags |= REQ_F_FAIL_LINK;
5715                         *link = req;
5716                 } else {
5717                         io_queue_sqe(req, sqe);
5718                 }
5719         }
5720
5721         return 0;
5722 }
5723
5724 /*
5725  * Batched submission is done, ensure local IO is flushed out.
5726  */
5727 static void io_submit_state_end(struct io_submit_state *state)
5728 {
5729         blk_finish_plug(&state->plug);
5730         io_state_file_put(state);
5731         if (state->free_reqs)
5732                 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
5733 }
5734
5735 /*
5736  * Start submission side cache.
5737  */
5738 static void io_submit_state_start(struct io_submit_state *state,
5739                                   unsigned int max_ios)
5740 {
5741         blk_start_plug(&state->plug);
5742         state->free_reqs = 0;
5743         state->file = NULL;
5744         state->ios_left = max_ios;
5745 }
5746
5747 static void io_commit_sqring(struct io_ring_ctx *ctx)
5748 {
5749         struct io_rings *rings = ctx->rings;
5750
5751         /*
5752          * Ensure any loads from the SQEs are done at this point,
5753          * since once we write the new head, the application could
5754          * write new data to them.
5755          */
5756         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
5757 }
5758
5759 /*
5760  * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
5761  * that is mapped by userspace. This means that care needs to be taken to
5762  * ensure that reads are stable, as we cannot rely on userspace always
5763  * being a good citizen. If members of the sqe are validated and then later
5764  * used, it's important that those reads are done through READ_ONCE() to
5765  * prevent a re-load down the line.
5766  */
5767 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
5768 {
5769         u32 *sq_array = ctx->sq_array;
5770         unsigned head;
5771
5772         /*
5773          * The cached sq head (or cq tail) serves two purposes:
5774          *
5775          * 1) allows us to batch the cost of updating the user visible
5776          *    head updates.
5777          * 2) allows the kernel side to track the head on its own, even
5778          *    though the application is the one updating it.
5779          */
5780         head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
5781         if (likely(head < ctx->sq_entries))
5782                 return &ctx->sq_sqes[head];
5783
5784         /* drop invalid entries */
5785         ctx->cached_sq_dropped++;
5786         WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
5787         return NULL;
5788 }
5789
5790 static inline void io_consume_sqe(struct io_ring_ctx *ctx)
5791 {
5792         ctx->cached_sq_head++;
5793 }
5794
/*
 * Every IOSQE_* bit an application may set in sqe->flags; io_init_req()
 * rejects an sqe carrying any other bit with -EINVAL.
 */
#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN | IOSQE_IO_LINK | \
			 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
			 IOSQE_BUFFER_SELECT)
5798
5799 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
5800                        const struct io_uring_sqe *sqe,
5801                        struct io_submit_state *state)
5802 {
5803         unsigned int sqe_flags;
5804         int id;
5805
5806         /*
5807          * All io need record the previous position, if LINK vs DARIN,
5808          * it can be used to mark the position of the first IO in the
5809          * link list.
5810          */
5811         req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
5812         req->opcode = READ_ONCE(sqe->opcode);
5813         req->user_data = READ_ONCE(sqe->user_data);
5814         req->io = NULL;
5815         req->file = NULL;
5816         req->ctx = ctx;
5817         req->flags = 0;
5818         /* one is dropped after submission, the other at completion */
5819         refcount_set(&req->refs, 2);
5820         req->task = NULL;
5821         req->result = 0;
5822
5823         if (unlikely(req->opcode >= IORING_OP_LAST))
5824                 return -EINVAL;
5825
5826         if (io_op_defs[req->opcode].needs_mm && !current->mm) {
5827                 if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
5828                         return -EFAULT;
5829                 kthread_use_mm(ctx->sqo_mm);
5830         }
5831
5832         sqe_flags = READ_ONCE(sqe->flags);
5833         /* enforce forwards compatibility on users */
5834         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
5835                 return -EINVAL;
5836
5837         if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
5838             !io_op_defs[req->opcode].buffer_select)
5839                 return -EOPNOTSUPP;
5840
5841         id = READ_ONCE(sqe->personality);
5842         if (id) {
5843                 io_req_init_async(req);
5844                 req->work.creds = idr_find(&ctx->personality_idr, id);
5845                 if (unlikely(!req->work.creds))
5846                         return -EINVAL;
5847                 get_cred(req->work.creds);
5848         }
5849
5850         /* same numerical values with corresponding REQ_F_*, safe to copy */
5851         req->flags |= sqe_flags;
5852
5853         if (!io_op_defs[req->opcode].needs_file)
5854                 return 0;
5855
5856         return io_req_set_file(state, req, READ_ONCE(sqe->fd));
5857 }
5858
5859 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
5860                           struct file *ring_file, int ring_fd)
5861 {
5862         struct io_submit_state state, *statep = NULL;
5863         struct io_kiocb *link = NULL;
5864         int i, submitted = 0;
5865
5866         /* if we have a backlog and couldn't flush it all, return BUSY */
5867         if (test_bit(0, &ctx->sq_check_overflow)) {
5868                 if (!list_empty(&ctx->cq_overflow_list) &&
5869                     !io_cqring_overflow_flush(ctx, false))
5870                         return -EBUSY;
5871         }
5872
5873         /* make sure SQ entry isn't read before tail */
5874         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
5875
5876         if (!percpu_ref_tryget_many(&ctx->refs, nr))
5877                 return -EAGAIN;
5878
5879         if (nr > IO_PLUG_THRESHOLD) {
5880                 io_submit_state_start(&state, nr);
5881                 statep = &state;
5882         }
5883
5884         ctx->ring_fd = ring_fd;
5885         ctx->ring_file = ring_file;
5886
5887         for (i = 0; i < nr; i++) {
5888                 const struct io_uring_sqe *sqe;
5889                 struct io_kiocb *req;
5890                 int err;
5891
5892                 sqe = io_get_sqe(ctx);
5893                 if (unlikely(!sqe)) {
5894                         io_consume_sqe(ctx);
5895                         break;
5896                 }
5897                 req = io_alloc_req(ctx, statep);
5898                 if (unlikely(!req)) {
5899                         if (!submitted)
5900                                 submitted = -EAGAIN;
5901                         break;
5902                 }
5903
5904                 err = io_init_req(ctx, req, sqe, statep);
5905                 io_consume_sqe(ctx);
5906                 /* will complete beyond this point, count as submitted */
5907                 submitted++;
5908
5909                 if (unlikely(err)) {
5910 fail_req:
5911                         io_cqring_add_event(req, err);
5912                         io_double_put_req(req);
5913                         break;
5914                 }
5915
5916                 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
5917                                                 true, io_async_submit(ctx));
5918                 err = io_submit_sqe(req, sqe, &link);
5919                 if (err)
5920                         goto fail_req;
5921         }
5922
5923         if (unlikely(submitted != nr)) {
5924                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
5925
5926                 percpu_ref_put_many(&ctx->refs, nr - ref_used);
5927         }
5928         if (link)
5929                 io_queue_link_head(link);
5930         if (statep)
5931                 io_submit_state_end(&state);
5932
5933          /* Commit SQ ring head once we've consumed and submitted all SQEs */
5934         io_commit_sqring(ctx);
5935
5936         return submitted;
5937 }
5938
5939 static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
5940 {
5941         struct mm_struct *mm = current->mm;
5942
5943         if (mm) {
5944                 kthread_unuse_mm(mm);
5945                 mmput(mm);
5946         }
5947 }
5948
/*
 * Main loop of the submission queue polling thread.
 *
 * @data: the struct io_ring_ctx this thread serves.
 *
 * Runs until kthread_should_park() becomes true. Each iteration reaps
 * completions for requests on ctx->poll_list, then submits whatever is
 * pending in the SQ ring through io_submit_sqes(). When there is nothing to
 * submit (or the last submit returned -EBUSY) the thread keeps spinning for
 * ctx->sq_thread_idle jiffies and then sleeps on ctx->sqo_wait with
 * IORING_SQ_NEED_WAKEUP set in the shared SQ flags.
 *
 * Always returns 0.
 */
static int io_sq_thread(void *data)
{
        struct io_ring_ctx *ctx = data;
        const struct cred *old_cred;
        DEFINE_WAIT(wait);
        unsigned long timeout;
        int ret = 0;

        /* io_sq_thread_stop() waits for this before parking the thread */
        complete(&ctx->sq_thread_comp);

        /* run with the credentials stored in the ring context */
        old_cred = override_creds(ctx->creds);

        timeout = jiffies + ctx->sq_thread_idle;
        while (!kthread_should_park()) {
                unsigned int to_submit;

                if (!list_empty(&ctx->poll_list)) {
                        unsigned nr_events = 0;

                        /* re-check under the lock before reaping */
                        mutex_lock(&ctx->uring_lock);
                        if (!list_empty(&ctx->poll_list))
                                io_iopoll_getevents(ctx, &nr_events, 0);
                        else
                                timeout = jiffies + ctx->sq_thread_idle;
                        mutex_unlock(&ctx->uring_lock);
                }

                to_submit = io_sqring_entries(ctx);

                /*
                 * If submit got -EBUSY, flag us as needing the application
                 * to enter the kernel to reap and flush events.
                 */
                if (!to_submit || ret == -EBUSY) {
                        /*
                         * Drop cur_mm before scheduling, we can't hold it for
                         * long periods (or over schedule()). Do this before
                         * adding ourselves to the waitqueue, as the unuse/drop
                         * may sleep.
                         */
                        io_sq_thread_drop_mm(ctx);

                        /*
                         * We're polling. If we're within the defined idle
                         * period, then let us spin without work before going
                         * to sleep. The exception is if we got EBUSY doing
                         * more IO, we should wait for the application to
                         * reap events and wake us up.
                         */
                        if (!list_empty(&ctx->poll_list) ||
                            (!time_after(jiffies, timeout) && ret != -EBUSY &&
                            !percpu_ref_is_dying(&ctx->refs))) {
                                if (current->task_works)
                                        task_work_run();
                                cond_resched();
                                continue;
                        }

                        prepare_to_wait(&ctx->sqo_wait, &wait,
                                                TASK_INTERRUPTIBLE);

                        /*
                         * While doing polled IO, before going to sleep, we need
                         * to check if there are new reqs added to poll_list, it
                         * is because reqs may have been punted to io worker and
                         * will be added to poll_list later, hence check the
                         * poll_list again.
                         */
                        if ((ctx->flags & IORING_SETUP_IOPOLL) &&
                            !list_empty_careful(&ctx->poll_list)) {
                                finish_wait(&ctx->sqo_wait, &wait);
                                continue;
                        }

                        /* Tell userspace we may need a wakeup call */
                        ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
                        /* make sure to read SQ tail after writing flags */
                        smp_mb();

                        /* final check for new SQEs before actually sleeping */
                        to_submit = io_sqring_entries(ctx);
                        if (!to_submit || ret == -EBUSY) {
                                if (kthread_should_park()) {
                                        finish_wait(&ctx->sqo_wait, &wait);
                                        break;
                                }
                                /* pending task work: run it and retry the loop */
                                if (current->task_works) {
                                        task_work_run();
                                        finish_wait(&ctx->sqo_wait, &wait);
                                        continue;
                                }
                                /* discard pending signals before sleeping */
                                if (signal_pending(current))
                                        flush_signals(current);
                                schedule();
                                finish_wait(&ctx->sqo_wait, &wait);

                                ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
                                /* forget a previous -EBUSY once we were woken */
                                ret = 0;
                                continue;
                        }
                        finish_wait(&ctx->sqo_wait, &wait);

                        ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
                }

                /* skip submission once the ring context refs are dying */
                mutex_lock(&ctx->uring_lock);
                if (likely(!percpu_ref_is_dying(&ctx->refs)))
                        ret = io_submit_sqes(ctx, to_submit, NULL, -1);
                mutex_unlock(&ctx->uring_lock);
                /* restart the idle-spin window after every submit attempt */
                timeout = jiffies + ctx->sq_thread_idle;
        }

        if (current->task_works)
                task_work_run();

        io_sq_thread_drop_mm(ctx);
        revert_creds(old_cred);

        kthread_parkme();

        return 0;
}
6070
/*
 * On-stack wait state used by io_cqring_wait() while sleeping on ctx->wait.
 */
struct io_wait_queue {
        /* wait queue entry; its wake callback is io_wake_function() */
        struct wait_queue_entry wq;
        /* ring whose completion queue is being waited on */
        struct io_ring_ctx *ctx;
        /* number of CQ events the waiter asked for (min_events) */
        unsigned to_wait;
        /* value of ctx->cq_timeouts sampled just before waiting started */
        unsigned nr_timeouts;
};
6077
6078 static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
6079 {
6080         struct io_ring_ctx *ctx = iowq->ctx;
6081
6082         /*
6083          * Wake up if we have enough events, or if a timeout occurred since we
6084          * started waiting. For timeouts, we always want to return to userspace,
6085          * regardless of event count.
6086          */
6087         return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
6088                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6089 }
6090
6091 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6092                             int wake_flags, void *key)
6093 {
6094         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6095                                                         wq);
6096
6097         /* use noflush == true, as we can't safely rely on locking context */
6098         if (!io_should_wake(iowq, true))
6099                 return -1;
6100
6101         return autoremove_wake_function(curr, mode, wake_flags, key);
6102 }
6103
/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 *
 * @ctx:        ring to wait on
 * @min_events: number of CQ events to wait for
 * @sig:        optional user pointer to a signal mask to install while
 *              waiting, NULL for none
 * @sigsz:      size of the mask @sig points to
 *
 * Returns 0 once enough events are available, or whenever the CQ ring is
 * non-empty on exit. Otherwise returns the error from installing the signal
 * mask, or -EINTR if a signal became pending while waiting.
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
                          const sigset_t __user *sig, size_t sigsz)
{
        struct io_wait_queue iowq = {
                .wq = {
                        .private        = current,
                        .func           = io_wake_function,
                        .entry          = LIST_HEAD_INIT(iowq.wq.entry),
                },
                .ctx            = ctx,
                .to_wait        = min_events,
        };
        struct io_rings *rings = ctx->rings;
        int ret = 0;

        /*
         * Fast path: keep running pending task work for as long as there is
         * some, returning as soon as enough events have been posted.
         */
        do {
                if (io_cqring_events(ctx, false) >= min_events)
                        return 0;
                if (!current->task_works)
                        break;
                task_work_run();
        } while (1);

        if (sig) {
#ifdef CONFIG_COMPAT
                if (in_compat_syscall())
                        ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
                                                      sigsz);
                else
#endif
                        ret = set_user_sigmask(sig, sigsz);

                if (ret)
                        return ret;
        }

        /* snapshot the timeout count so io_should_wake() can detect changes */
        iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
        trace_io_uring_cqring_wait(ctx, min_events);
        do {
                prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
                                                TASK_INTERRUPTIBLE);
                if (current->task_works)
                        task_work_run();
                if (io_should_wake(&iowq, false))
                        break;
                schedule();
                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }
        } while (1);
        finish_wait(&ctx->wait, &iowq.wq);

        restore_saved_sigmask_unless(ret == -EINTR);

        /* report success if there is anything at all to reap in the CQ ring */
        return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
6165
6166 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6167 {
6168 #if defined(CONFIG_UNIX)
6169         if (ctx->ring_sock) {
6170                 struct sock *sock = ctx->ring_sock->sk;
6171                 struct sk_buff *skb;
6172
6173                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6174                         kfree_skb(skb);
6175         }
6176 #else
6177         int i;
6178
6179         for (i = 0; i < ctx->nr_user_files; i++) {
6180                 struct file *file;
6181
6182                 file = io_file_from_index(ctx, i);
6183                 if (file)
6184                         fput(file);
6185         }
6186 #endif
6187 }
6188
6189 static void io_file_ref_kill(struct percpu_ref *ref)
6190 {
6191         struct fixed_file_data *data;
6192
6193         data = container_of(ref, struct fixed_file_data, refs);
6194         complete(&data->done);
6195 }
6196
6197 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
6198 {
6199         struct fixed_file_data *data = ctx->file_data;
6200         struct fixed_file_ref_node *ref_node = NULL;
6201         unsigned nr_tables, i;
6202
6203         if (!data)
6204                 return -ENXIO;
6205
6206         spin_lock(&data->lock);
6207         if (!list_empty(&data->ref_list))
6208                 ref_node = list_first_entry(&data->ref_list,
6209                                 struct fixed_file_ref_node, node);
6210         spin_unlock(&data->lock);
6211         if (ref_node)
6212                 percpu_ref_kill(&ref_node->refs);
6213
6214         percpu_ref_kill(&data->refs);
6215
6216         /* wait for all refs nodes to complete */
6217         flush_delayed_work(&ctx->file_put_work);
6218         wait_for_completion(&data->done);
6219
6220         __io_sqe_files_unregister(ctx);
6221         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
6222         for (i = 0; i < nr_tables; i++)
6223                 kfree(data->table[i].files);
6224         kfree(data->table);
6225         percpu_ref_exit(&data->refs);
6226         kfree(data);
6227         ctx->file_data = NULL;
6228         ctx->nr_user_files = 0;
6229         return 0;
6230 }
6231
6232 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
6233 {
6234         if (ctx->sqo_thread) {
6235                 wait_for_completion(&ctx->sq_thread_comp);
6236                 /*
6237                  * The park is a bit of a work-around, without it we get
6238                  * warning spews on shutdown with SQPOLL set and affinity
6239                  * set to a single CPU.
6240                  */
6241                 kthread_park(ctx->sqo_thread);
6242                 kthread_stop(ctx->sqo_thread);
6243                 ctx->sqo_thread = NULL;
6244         }
6245 }
6246
6247 static void io_finish_async(struct io_ring_ctx *ctx)
6248 {
6249         io_sq_thread_stop(ctx);
6250
6251         if (ctx->io_wq) {
6252                 io_wq_destroy(ctx->io_wq);
6253                 ctx->io_wq = NULL;
6254         }
6255 }
6256
6257 #if defined(CONFIG_UNIX)
6258 /*
6259  * Ensure the UNIX gc is aware of our file set, so we are certain that
6260  * the io_uring can be safely unregistered on process exit, even if we have
6261  * loops in the file referencing.
6262  */
6263 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
6264 {
6265         struct sock *sk = ctx->ring_sock->sk;
6266         struct scm_fp_list *fpl;
6267         struct sk_buff *skb;
6268         int i, nr_files;
6269
6270         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
6271         if (!fpl)
6272                 return -ENOMEM;
6273
6274         skb = alloc_skb(0, GFP_KERNEL);
6275         if (!skb) {
6276                 kfree(fpl);
6277                 return -ENOMEM;
6278         }
6279
6280         skb->sk = sk;
6281
6282         nr_files = 0;
6283         fpl->user = get_uid(ctx->user);
6284         for (i = 0; i < nr; i++) {
6285                 struct file *file = io_file_from_index(ctx, i + offset);
6286
6287                 if (!file)
6288                         continue;
6289                 fpl->fp[nr_files] = get_file(file);
6290                 unix_inflight(fpl->user, fpl->fp[nr_files]);
6291                 nr_files++;
6292         }
6293
6294         if (nr_files) {
6295                 fpl->max = SCM_MAX_FD;
6296                 fpl->count = nr_files;
6297                 UNIXCB(skb).fp = fpl;
6298                 skb->destructor = unix_destruct_scm;
6299                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
6300                 skb_queue_head(&sk->sk_receive_queue, skb);
6301
6302                 for (i = 0; i < nr_files; i++)
6303                         fput(fpl->fp[i]);
6304         } else {
6305                 kfree_skb(skb);
6306                 kfree(fpl);
6307         }
6308
6309         return 0;
6310 }
6311
6312 /*
6313  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
6314  * causes regular reference counting to break down. We rely on the UNIX
6315  * garbage collection to take care of this problem for us.
6316  */
6317 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
6318 {
6319         unsigned left, total;
6320         int ret = 0;
6321
6322         total = 0;
6323         left = ctx->nr_user_files;
6324         while (left) {
6325                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6326
6327                 ret = __io_sqe_files_scm(ctx, this_files, total);
6328                 if (ret)
6329                         break;
6330                 left -= this_files;
6331                 total += this_files;
6332         }
6333
6334         if (!ret)
6335                 return 0;
6336
6337         while (total < ctx->nr_user_files) {
6338                 struct file *file = io_file_from_index(ctx, total);
6339
6340                 if (file)
6341                         fput(file);
6342                 total++;
6343         }
6344
6345         return ret;
6346 }
6347 #else
/* Without CONFIG_UNIX there is nothing to hand over; always returns 0. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
        return 0;
}
6352 #endif
6353
6354 static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
6355                                     unsigned nr_files)
6356 {
6357         int i;
6358
6359         for (i = 0; i < nr_tables; i++) {
6360                 struct fixed_file_table *table = &ctx->file_data->table[i];
6361                 unsigned this_files;
6362
6363                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
6364                 table->files = kcalloc(this_files, sizeof(struct file *),
6365                                         GFP_KERNEL);
6366                 if (!table->files)
6367                         break;
6368                 nr_files -= this_files;
6369         }
6370
6371         if (i == nr_tables)
6372                 return 0;
6373
6374         for (i = 0; i < nr_tables; i++) {
6375                 struct fixed_file_table *table = &ctx->file_data->table[i];
6376                 kfree(table->files);
6377         }
6378         return 1;
6379 }
6380
/*
 * Release the ring's reference on @file.
 *
 * With CONFIG_UNIX, the skbs queued on the ring socket are searched for the
 * one whose file list contains @file. The file is taken out of that list and
 * released, and the skb is freed if its list became empty. Without
 * CONFIG_UNIX this is a plain fput().
 */
static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
        struct sock *sock = ctx->ring_sock->sk;
        struct sk_buff_head list, *head = &sock->sk_receive_queue;
        struct sk_buff *skb;
        int i;

        /* local queue that collects the skbs taken off the socket */
        __skb_queue_head_init(&list);

        /*
         * Find the skb that holds this file in its SCM_RIGHTS. When found,
         * remove this entry and rearrange the file array.
         */
        skb = skb_dequeue(head);
        while (skb) {
                struct scm_fp_list *fp;

                fp = UNIXCB(skb).fp;
                for (i = 0; i < fp->count; i++) {
                        int left;

                        if (fp->fp[i] != file)
                                continue;

                        unix_notinflight(fp->user, fp->fp[i]);
                        /* close the gap left by the removed entry */
                        left = fp->count - 1 - i;
                        if (left) {
                                memmove(&fp->fp[i], &fp->fp[i + 1],
                                                left * sizeof(struct file *));
                        }
                        fp->count--;
                        if (!fp->count) {
                                kfree_skb(skb);
                                skb = NULL;
                        } else {
                                __skb_queue_tail(&list, skb);
                        }
                        fput(file);
                        /* file == NULL marks "found" for the outer loop */
                        file = NULL;
                        break;
                }

                if (!file)
                        break;

                /* not in this skb: park it on the local list and move on */
                __skb_queue_tail(&list, skb);

                skb = skb_dequeue(head);
        }

        /* put everything we took off back onto the socket's receive queue */
        if (skb_peek(&list)) {
                spin_lock_irq(&head->lock);
                while ((skb = __skb_dequeue(&list)) != NULL)
                        __skb_queue_tail(head, skb);
                spin_unlock_irq(&head->lock);
        }
#else
        fput(file);
#endif
}
6442
/* One file queued for deferred release from the fixed file set. */
struct io_file_put {
        /* entry on fixed_file_ref_node::file_list */
        struct list_head list;
        /* the file to hand to io_ring_file_put() */
        struct file *file;
};
6447
6448 static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
6449 {
6450         struct fixed_file_data *file_data = ref_node->file_data;
6451         struct io_ring_ctx *ctx = file_data->ctx;
6452         struct io_file_put *pfile, *tmp;
6453
6454         list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
6455                 list_del(&pfile->list);
6456                 io_ring_file_put(ctx, pfile->file);
6457                 kfree(pfile);
6458         }
6459
6460         spin_lock(&file_data->lock);
6461         list_del(&ref_node->node);
6462         spin_unlock(&file_data->lock);
6463
6464         percpu_ref_exit(&ref_node->refs);
6465         kfree(ref_node);
6466         percpu_ref_put(&file_data->refs);
6467 }
6468
6469 static void io_file_put_work(struct work_struct *work)
6470 {
6471         struct io_ring_ctx *ctx;
6472         struct llist_node *node;
6473
6474         ctx = container_of(work, struct io_ring_ctx, file_put_work.work);
6475         node = llist_del_all(&ctx->file_put_llist);
6476
6477         while (node) {
6478                 struct fixed_file_ref_node *ref_node;
6479                 struct llist_node *next = node->next;
6480
6481                 ref_node = llist_entry(node, struct fixed_file_ref_node, llist);
6482                 __io_file_put_work(ref_node);
6483                 node = next;
6484         }
6485 }
6486
6487 static void io_file_data_ref_zero(struct percpu_ref *ref)
6488 {
6489         struct fixed_file_ref_node *ref_node;
6490         struct io_ring_ctx *ctx;
6491         bool first_add;
6492         int delay = HZ;
6493
6494         ref_node = container_of(ref, struct fixed_file_ref_node, refs);
6495         ctx = ref_node->file_data->ctx;
6496
6497         if (percpu_ref_is_dying(&ctx->file_data->refs))
6498                 delay = 0;
6499
6500         first_add = llist_add(&ref_node->llist, &ctx->file_put_llist);
6501         if (!delay)
6502                 mod_delayed_work(system_wq, &ctx->file_put_work, 0);
6503         else if (first_add)
6504                 queue_delayed_work(system_wq, &ctx->file_put_work, delay);
6505 }
6506
6507 static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
6508                         struct io_ring_ctx *ctx)
6509 {
6510         struct fixed_file_ref_node *ref_node;
6511
6512         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
6513         if (!ref_node)
6514                 return ERR_PTR(-ENOMEM);
6515
6516         if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
6517                             0, GFP_KERNEL)) {
6518                 kfree(ref_node);
6519                 return ERR_PTR(-ENOMEM);
6520         }
6521         INIT_LIST_HEAD(&ref_node->node);
6522         INIT_LIST_HEAD(&ref_node->file_list);
6523         ref_node->file_data = ctx->file_data;
6524         return ref_node;
6525 }
6526
/*
 * Free a ref node obtained from alloc_fixed_file_ref_node(): tears down its
 * percpu ref and frees the node. Does not unlink it from any list.
 */
static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
{
        percpu_ref_exit(&ref_node->refs);
        kfree(ref_node);
}
6532
6533 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
6534                                  unsigned nr_args)
6535 {
6536         __s32 __user *fds = (__s32 __user *) arg;
6537         unsigned nr_tables;
6538         struct file *file;
6539         int fd, ret = 0;
6540         unsigned i;
6541         struct fixed_file_ref_node *ref_node;
6542
6543         if (ctx->file_data)
6544                 return -EBUSY;
6545         if (!nr_args)
6546                 return -EINVAL;
6547         if (nr_args > IORING_MAX_FIXED_FILES)
6548                 return -EMFILE;
6549
6550         ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
6551         if (!ctx->file_data)
6552                 return -ENOMEM;
6553         ctx->file_data->ctx = ctx;
6554         init_completion(&ctx->file_data->done);
6555         INIT_LIST_HEAD(&ctx->file_data->ref_list);
6556         spin_lock_init(&ctx->file_data->lock);
6557
6558         nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
6559         ctx->file_data->table = kcalloc(nr_tables,
6560                                         sizeof(struct fixed_file_table),
6561                                         GFP_KERNEL);
6562         if (!ctx->file_data->table) {
6563                 kfree(ctx->file_data);
6564                 ctx->file_data = NULL;
6565                 return -ENOMEM;
6566         }
6567
6568         if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
6569                                 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
6570                 kfree(ctx->file_data->table);
6571                 kfree(ctx->file_data);
6572                 ctx->file_data = NULL;
6573                 return -ENOMEM;
6574         }
6575
6576         if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
6577                 percpu_ref_exit(&ctx->file_data->refs);
6578                 kfree(ctx->file_data->table);
6579                 kfree(ctx->file_data);
6580                 ctx->file_data = NULL;
6581                 return -ENOMEM;
6582         }
6583
6584         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
6585                 struct fixed_file_table *table;
6586                 unsigned index;
6587
6588                 ret = -EFAULT;
6589                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
6590                         break;
6591                 /* allow sparse sets */
6592                 if (fd == -1) {
6593                         ret = 0;
6594                         continue;
6595                 }
6596
6597                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
6598                 index = i & IORING_FILE_TABLE_MASK;
6599                 file = fget(fd);
6600
6601                 ret = -EBADF;
6602                 if (!file)
6603                         break;
6604
6605                 /*
6606                  * Don't allow io_uring instances to be registered. If UNIX
6607                  * isn't enabled, then this causes a reference cycle and this
6608                  * instance can never get freed. If UNIX is enabled we'll
6609                  * handle it just fine, but there's still no point in allowing
6610                  * a ring fd as it doesn't support regular read/write anyway.
6611                  */
6612                 if (file->f_op == &io_uring_fops) {
6613                         fput(file);
6614                         break;
6615                 }
6616                 ret = 0;
6617                 table->files[index] = file;
6618         }
6619
6620         if (ret) {
6621                 for (i = 0; i < ctx->nr_user_files; i++) {
6622                         file = io_file_from_index(ctx, i);
6623                         if (file)
6624                                 fput(file);
6625                 }
6626                 for (i = 0; i < nr_tables; i++)
6627                         kfree(ctx->file_data->table[i].files);
6628
6629                 kfree(ctx->file_data->table);
6630                 kfree(ctx->file_data);
6631                 ctx->file_data = NULL;
6632                 ctx->nr_user_files = 0;
6633                 return ret;
6634         }
6635
6636         ret = io_sqe_files_scm(ctx);
6637         if (ret) {
6638                 io_sqe_files_unregister(ctx);
6639                 return ret;
6640         }
6641
6642         ref_node = alloc_fixed_file_ref_node(ctx);
6643         if (IS_ERR(ref_node)) {
6644                 io_sqe_files_unregister(ctx);
6645                 return PTR_ERR(ref_node);
6646         }
6647
6648         ctx->file_data->cur_refs = &ref_node->refs;
6649         spin_lock(&ctx->file_data->lock);
6650         list_add(&ref_node->node, &ctx->file_data->ref_list);
6651         spin_unlock(&ctx->file_data->lock);
6652         percpu_ref_get(&ctx->file_data->refs);
6653         return ret;
6654 }
6655
/*
 * Make a single newly installed fixed file known to the ring socket.
 *
 * @ctx:   ring the file is registered with
 * @file:  the file; the caller holds a reference on it
 * @index: slot of the file in the fixed file table
 *
 * With CONFIG_UNIX, the file is appended to the file list of the first skb
 * on the ring socket's receive queue if that list has room; otherwise a new
 * skb is set up through __io_sqe_files_scm(). Without CONFIG_UNIX this does
 * nothing.
 *
 * Returns 0 on success or the error from __io_sqe_files_scm().
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
                                int index)
{
#if defined(CONFIG_UNIX)
        struct sock *sock = ctx->ring_sock->sk;
        struct sk_buff_head *head = &sock->sk_receive_queue;
        struct sk_buff *skb;

        /*
         * See if we can merge this file into an existing skb SCM_RIGHTS
         * file set. If there's no room, fall back to allocating a new skb
         * and filling it in.
         */
        spin_lock_irq(&head->lock);
        skb = skb_peek(head);
        if (skb) {
                struct scm_fp_list *fpl = UNIXCB(skb).fp;

                if (fpl->count < SCM_MAX_FD) {
                        /*
                         * Take the skb off the queue and drop the lock while
                         * its file list is being extended, then requeue it.
                         */
                        __skb_unlink(skb, head);
                        spin_unlock_irq(&head->lock);
                        fpl->fp[fpl->count] = get_file(file);
                        unix_inflight(fpl->user, fpl->fp[fpl->count]);
                        fpl->count++;
                        spin_lock_irq(&head->lock);
                        __skb_queue_head(head, skb);
                } else {
                        skb = NULL;
                }
        }
        spin_unlock_irq(&head->lock);

        /* merged: the skb holds its own reference, drop the caller's */
        if (skb) {
                fput(file);
                return 0;
        }

        return __io_sqe_files_scm(ctx, 1, index);
#else
        return 0;
#endif
}
6698
6699 static int io_queue_file_removal(struct fixed_file_data *data,
6700                                  struct file *file)
6701 {
6702         struct io_file_put *pfile;
6703         struct percpu_ref *refs = data->cur_refs;
6704         struct fixed_file_ref_node *ref_node;
6705
6706         pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
6707         if (!pfile)
6708                 return -ENOMEM;
6709
6710         ref_node = container_of(refs, struct fixed_file_ref_node, refs);
6711         pfile->file = file;
6712         list_add(&pfile->list, &ref_node->file_list);
6713
6714         return 0;
6715 }
6716
6717 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
6718                                  struct io_uring_files_update *up,
6719                                  unsigned nr_args)
6720 {
6721         struct fixed_file_data *data = ctx->file_data;
6722         struct fixed_file_ref_node *ref_node;
6723         struct file *file;
6724         __s32 __user *fds;
6725         int fd, i, err;
6726         __u32 done;
6727         bool needs_switch = false;
6728
6729         if (check_add_overflow(up->offset, nr_args, &done))
6730                 return -EOVERFLOW;
6731         if (done > ctx->nr_user_files)
6732                 return -EINVAL;
6733
6734         ref_node = alloc_fixed_file_ref_node(ctx);
6735         if (IS_ERR(ref_node))
6736                 return PTR_ERR(ref_node);
6737
6738         done = 0;
6739         fds = u64_to_user_ptr(up->fds);
6740         while (nr_args) {
6741                 struct fixed_file_table *table;
6742                 unsigned index;
6743
6744                 err = 0;
6745                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
6746                         err = -EFAULT;
6747                         break;
6748                 }
6749                 i = array_index_nospec(up->offset, ctx->nr_user_files);
6750                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
6751                 index = i & IORING_FILE_TABLE_MASK;
6752                 if (table->files[index]) {
6753                         file = io_file_from_index(ctx, index);
6754                         err = io_queue_file_removal(data, file);
6755                         if (err)
6756                                 break;
6757                         table->files[index] = NULL;
6758                         needs_switch = true;
6759                 }
6760                 if (fd != -1) {
6761                         file = fget(fd);
6762                         if (!file) {
6763                                 err = -EBADF;
6764                                 break;
6765                         }
6766                         /*
6767                          * Don't allow io_uring instances to be registered. If
6768                          * UNIX isn't enabled, then this causes a reference
6769                          * cycle and this instance can never get freed. If UNIX
6770                          * is enabled we'll handle it just fine, but there's
6771                          * still no point in allowing a ring fd as it doesn't
6772                          * support regular read/write anyway.
6773                          */
6774                         if (file->f_op == &io_uring_fops) {
6775                                 fput(file);
6776                                 err = -EBADF;
6777                                 break;
6778                         }
6779                         table->files[index] = file;
6780                         err = io_sqe_file_register(ctx, file, i);
6781                         if (err)
6782                                 break;
6783                 }
6784                 nr_args--;
6785                 done++;
6786                 up->offset++;
6787         }
6788
6789         if (needs_switch) {
6790                 percpu_ref_kill(data->cur_refs);
6791                 spin_lock(&data->lock);
6792                 list_add(&ref_node->node, &data->ref_list);
6793                 data->cur_refs = &ref_node->refs;
6794                 spin_unlock(&data->lock);
6795                 percpu_ref_get(&ctx->file_data->refs);
6796         } else
6797                 destroy_fixed_file_ref_node(ref_node);
6798
6799         return done ? done : err;
6800 }
6801
6802 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
6803                                unsigned nr_args)
6804 {
6805         struct io_uring_files_update up;
6806
6807         if (!ctx->file_data)
6808                 return -ENXIO;
6809         if (!nr_args)
6810                 return -EINVAL;
6811         if (copy_from_user(&up, arg, sizeof(up)))
6812                 return -EFAULT;
6813         if (up.resv)
6814                 return -EINVAL;
6815
6816         return __io_sqe_files_update(ctx, &up, nr_args);
6817 }
6818
6819 static void io_free_work(struct io_wq_work *work)
6820 {
6821         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6822
6823         /* Consider that io_steal_work() relies on this ref */
6824         io_put_req(req);
6825 }
6826
6827 static int io_init_wq_offload(struct io_ring_ctx *ctx,
6828                               struct io_uring_params *p)
6829 {
6830         struct io_wq_data data;
6831         struct fd f;
6832         struct io_ring_ctx *ctx_attach;
6833         unsigned int concurrency;
6834         int ret = 0;
6835
6836         data.user = ctx->user;
6837         data.free_work = io_free_work;
6838         data.do_work = io_wq_submit_work;
6839
6840         if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
6841                 /* Do QD, or 4 * CPUS, whatever is smallest */
6842                 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
6843
6844                 ctx->io_wq = io_wq_create(concurrency, &data);
6845                 if (IS_ERR(ctx->io_wq)) {
6846                         ret = PTR_ERR(ctx->io_wq);
6847                         ctx->io_wq = NULL;
6848                 }
6849                 return ret;
6850         }
6851
6852         f = fdget(p->wq_fd);
6853         if (!f.file)
6854                 return -EBADF;
6855
6856         if (f.file->f_op != &io_uring_fops) {
6857                 ret = -EINVAL;
6858                 goto out_fput;
6859         }
6860
6861         ctx_attach = f.file->private_data;
6862         /* @io_wq is protected by holding the fd */
6863         if (!io_wq_get(ctx_attach->io_wq, &data)) {
6864                 ret = -EINVAL;
6865                 goto out_fput;
6866         }
6867
6868         ctx->io_wq = ctx_attach->io_wq;
6869 out_fput:
6870         fdput(f);
6871         return ret;
6872 }
6873
6874 static int io_sq_offload_start(struct io_ring_ctx *ctx,
6875                                struct io_uring_params *p)
6876 {
6877         int ret;
6878
6879         mmgrab(current->mm);
6880         ctx->sqo_mm = current->mm;
6881
6882         if (ctx->flags & IORING_SETUP_SQPOLL) {
6883                 ret = -EPERM;
6884                 if (!capable(CAP_SYS_ADMIN))
6885                         goto err;
6886
6887                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
6888                 if (!ctx->sq_thread_idle)
6889                         ctx->sq_thread_idle = HZ;
6890
6891                 if (p->flags & IORING_SETUP_SQ_AFF) {
6892                         int cpu = p->sq_thread_cpu;
6893
6894                         ret = -EINVAL;
6895                         if (cpu >= nr_cpu_ids)
6896                                 goto err;
6897                         if (!cpu_online(cpu))
6898                                 goto err;
6899
6900                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
6901                                                         ctx, cpu,
6902                                                         "io_uring-sq");
6903                 } else {
6904                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
6905                                                         "io_uring-sq");
6906                 }
6907                 if (IS_ERR(ctx->sqo_thread)) {
6908                         ret = PTR_ERR(ctx->sqo_thread);
6909                         ctx->sqo_thread = NULL;
6910                         goto err;
6911                 }
6912                 wake_up_process(ctx->sqo_thread);
6913         } else if (p->flags & IORING_SETUP_SQ_AFF) {
6914                 /* Can't have SQ_AFF without SQPOLL */
6915                 ret = -EINVAL;
6916                 goto err;
6917         }
6918
6919         ret = io_init_wq_offload(ctx, p);
6920         if (ret)
6921                 goto err;
6922
6923         return 0;
6924 err:
6925         io_finish_async(ctx);
6926         mmdrop(ctx->sqo_mm);
6927         ctx->sqo_mm = NULL;
6928         return ret;
6929 }
6930
6931 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
6932 {
6933         atomic_long_sub(nr_pages, &user->locked_vm);
6934 }
6935
6936 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
6937 {
6938         unsigned long page_limit, cur_pages, new_pages;
6939
6940         /* Don't allow more pages than we can safely lock */
6941         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
6942
6943         do {
6944                 cur_pages = atomic_long_read(&user->locked_vm);
6945                 new_pages = cur_pages + nr_pages;
6946                 if (new_pages > page_limit)
6947                         return -ENOMEM;
6948         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
6949                                         new_pages) != cur_pages);
6950
6951         return 0;
6952 }
6953
/*
 * Drop a reference on the (compound) page backing @ptr and free it when that
 * was the last reference. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
	if (ptr) {
		struct page *head = virt_to_head_page(ptr);

		if (put_page_testzero(head))
			free_compound_page(head);
	}
}
6965
6966 static void *io_mem_alloc(size_t size)
6967 {
6968         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
6969                                 __GFP_NORETRY;
6970
6971         return (void *) __get_free_pages(gfp_flags, get_order(size));
6972 }
6973
6974 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
6975                                 size_t *sq_offset)
6976 {
6977         struct io_rings *rings;
6978         size_t off, sq_array_size;
6979
6980         off = struct_size(rings, cqes, cq_entries);
6981         if (off == SIZE_MAX)
6982                 return SIZE_MAX;
6983
6984 #ifdef CONFIG_SMP
6985         off = ALIGN(off, SMP_CACHE_BYTES);
6986         if (off == 0)
6987                 return SIZE_MAX;
6988 #endif
6989
6990         sq_array_size = array_size(sizeof(u32), sq_entries);
6991         if (sq_array_size == SIZE_MAX)
6992                 return SIZE_MAX;
6993
6994         if (check_add_overflow(off, sq_array_size, &off))
6995                 return SIZE_MAX;
6996
6997         if (sq_offset)
6998                 *sq_offset = off;
6999
7000         return off;
7001 }
7002
7003 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
7004 {
7005         size_t pages;
7006
7007         pages = (size_t)1 << get_order(
7008                 rings_size(sq_entries, cq_entries, NULL));
7009         pages += (size_t)1 << get_order(
7010                 array_size(sizeof(struct io_uring_sqe), sq_entries));
7011
7012         return pages;
7013 }
7014
7015 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
7016 {
7017         int i, j;
7018
7019         if (!ctx->user_bufs)
7020                 return -ENXIO;
7021
7022         for (i = 0; i < ctx->nr_user_bufs; i++) {
7023                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
7024
7025                 for (j = 0; j < imu->nr_bvecs; j++)
7026                         unpin_user_page(imu->bvec[j].bv_page);
7027
7028                 if (ctx->account_mem)
7029                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
7030                 kvfree(imu->bvec);
7031                 imu->nr_bvecs = 0;
7032         }
7033
7034         kfree(ctx->user_bufs);
7035         ctx->user_bufs = NULL;
7036         ctx->nr_user_bufs = 0;
7037         return 0;
7038 }
7039
7040 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
7041                        void __user *arg, unsigned index)
7042 {
7043         struct iovec __user *src;
7044
7045 #ifdef CONFIG_COMPAT
7046         if (ctx->compat) {
7047                 struct compat_iovec __user *ciovs;
7048                 struct compat_iovec ciov;
7049
7050                 ciovs = (struct compat_iovec __user *) arg;
7051                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
7052                         return -EFAULT;
7053
7054                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
7055                 dst->iov_len = ciov.iov_len;
7056                 return 0;
7057         }
7058 #endif
7059         src = (struct iovec __user *) arg;
7060         if (copy_from_user(dst, &src[index], sizeof(*dst)))
7061                 return -EFAULT;
7062         return 0;
7063 }
7064
7065 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
7066                                   unsigned nr_args)
7067 {
7068         struct vm_area_struct **vmas = NULL;
7069         struct page **pages = NULL;
7070         int i, j, got_pages = 0;
7071         int ret = -EINVAL;
7072
7073         if (ctx->user_bufs)
7074                 return -EBUSY;
7075         if (!nr_args || nr_args > UIO_MAXIOV)
7076                 return -EINVAL;
7077
7078         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
7079                                         GFP_KERNEL);
7080         if (!ctx->user_bufs)
7081                 return -ENOMEM;
7082
7083         for (i = 0; i < nr_args; i++) {
7084                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
7085                 unsigned long off, start, end, ubuf;
7086                 int pret, nr_pages;
7087                 struct iovec iov;
7088                 size_t size;
7089
7090                 ret = io_copy_iov(ctx, &iov, arg, i);
7091                 if (ret)
7092                         goto err;
7093
7094                 /*
7095                  * Don't impose further limits on the size and buffer
7096                  * constraints here, we'll -EINVAL later when IO is
7097                  * submitted if they are wrong.
7098                  */
7099                 ret = -EFAULT;
7100                 if (!iov.iov_base || !iov.iov_len)
7101                         goto err;
7102
7103                 /* arbitrary limit, but we need something */
7104                 if (iov.iov_len > SZ_1G)
7105                         goto err;
7106
7107                 ubuf = (unsigned long) iov.iov_base;
7108                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
7109                 start = ubuf >> PAGE_SHIFT;
7110                 nr_pages = end - start;
7111
7112                 if (ctx->account_mem) {
7113                         ret = io_account_mem(ctx->user, nr_pages);
7114                         if (ret)
7115                                 goto err;
7116                 }
7117
7118                 ret = 0;
7119                 if (!pages || nr_pages > got_pages) {
7120                         kvfree(vmas);
7121                         kvfree(pages);
7122                         pages = kvmalloc_array(nr_pages, sizeof(struct page *),
7123                                                 GFP_KERNEL);
7124                         vmas = kvmalloc_array(nr_pages,
7125                                         sizeof(struct vm_area_struct *),
7126                                         GFP_KERNEL);
7127                         if (!pages || !vmas) {
7128                                 ret = -ENOMEM;
7129                                 if (ctx->account_mem)
7130                                         io_unaccount_mem(ctx->user, nr_pages);
7131                                 goto err;
7132                         }
7133                         got_pages = nr_pages;
7134                 }
7135
7136                 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
7137                                                 GFP_KERNEL);
7138                 ret = -ENOMEM;
7139                 if (!imu->bvec) {
7140                         if (ctx->account_mem)
7141                                 io_unaccount_mem(ctx->user, nr_pages);
7142                         goto err;
7143                 }
7144
7145                 ret = 0;
7146                 mmap_read_lock(current->mm);
7147                 pret = pin_user_pages(ubuf, nr_pages,
7148                                       FOLL_WRITE | FOLL_LONGTERM,
7149                                       pages, vmas);
7150                 if (pret == nr_pages) {
7151                         /* don't support file backed memory */
7152                         for (j = 0; j < nr_pages; j++) {
7153                                 struct vm_area_struct *vma = vmas[j];
7154
7155                                 if (vma->vm_file &&
7156                                     !is_file_hugepages(vma->vm_file)) {
7157                                         ret = -EOPNOTSUPP;
7158                                         break;
7159                                 }
7160                         }
7161                 } else {
7162                         ret = pret < 0 ? pret : -EFAULT;
7163                 }
7164                 mmap_read_unlock(current->mm);
7165                 if (ret) {
7166                         /*
7167                          * if we did partial map, or found file backed vmas,
7168                          * release any pages we did get
7169                          */
7170                         if (pret > 0)
7171                                 unpin_user_pages(pages, pret);
7172                         if (ctx->account_mem)
7173                                 io_unaccount_mem(ctx->user, nr_pages);
7174                         kvfree(imu->bvec);
7175                         goto err;
7176                 }
7177
7178                 off = ubuf & ~PAGE_MASK;
7179                 size = iov.iov_len;
7180                 for (j = 0; j < nr_pages; j++) {
7181                         size_t vec_len;
7182
7183                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
7184                         imu->bvec[j].bv_page = pages[j];
7185                         imu->bvec[j].bv_len = vec_len;
7186                         imu->bvec[j].bv_offset = off;
7187                         off = 0;
7188                         size -= vec_len;
7189                 }
7190                 /* store original address for later verification */
7191                 imu->ubuf = ubuf;
7192                 imu->len = iov.iov_len;
7193                 imu->nr_bvecs = nr_pages;
7194
7195                 ctx->nr_user_bufs++;
7196         }
7197         kvfree(pages);
7198         kvfree(vmas);
7199         return 0;
7200 err:
7201         kvfree(pages);
7202         kvfree(vmas);
7203         io_sqe_buffer_unregister(ctx);
7204         return ret;
7205 }
7206
7207 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
7208 {
7209         __s32 __user *fds = arg;
7210         int fd;
7211
7212         if (ctx->cq_ev_fd)
7213                 return -EBUSY;
7214
7215         if (copy_from_user(&fd, fds, sizeof(*fds)))
7216                 return -EFAULT;
7217
7218         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
7219         if (IS_ERR(ctx->cq_ev_fd)) {
7220                 int ret = PTR_ERR(ctx->cq_ev_fd);
7221                 ctx->cq_ev_fd = NULL;
7222                 return ret;
7223         }
7224
7225         return 0;
7226 }
7227
7228 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
7229 {
7230         if (ctx->cq_ev_fd) {
7231                 eventfd_ctx_put(ctx->cq_ev_fd);
7232                 ctx->cq_ev_fd = NULL;
7233                 return 0;
7234         }
7235
7236         return -ENXIO;
7237 }
7238
/*
 * idr_for_each() callback used by io_destroy_buffers(): @p is the
 * struct io_buffer stored under @id and @data is the owning ring context.
 * Always returns 0 so that the iteration continues.
 */
static int __io_destroy_buffers(int id, void *p, void *data)
{
	struct io_buffer *head = p;
	struct io_ring_ctx *ctx = data;

	/* -1U: no limit on the number of buffers removed */
	__io_remove_buffers(ctx, head, id, -1U);
	return 0;
}
7247
7248 static void io_destroy_buffers(struct io_ring_ctx *ctx)
7249 {
7250         idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
7251         idr_destroy(&ctx->io_buffer_idr);
7252 }
7253
7254 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
7255 {
7256         io_finish_async(ctx);
7257         if (ctx->sqo_mm)
7258                 mmdrop(ctx->sqo_mm);
7259
7260         io_iopoll_reap_events(ctx);
7261         io_sqe_buffer_unregister(ctx);
7262         io_sqe_files_unregister(ctx);
7263         io_eventfd_unregister(ctx);
7264         io_destroy_buffers(ctx);
7265         idr_destroy(&ctx->personality_idr);
7266
7267 #if defined(CONFIG_UNIX)
7268         if (ctx->ring_sock) {
7269                 ctx->ring_sock->file = NULL; /* so that iput() is called */
7270                 sock_release(ctx->ring_sock);
7271         }
7272 #endif
7273
7274         io_mem_free(ctx->rings);
7275         io_mem_free(ctx->sq_sqes);
7276
7277         percpu_ref_exit(&ctx->refs);
7278         if (ctx->account_mem)
7279                 io_unaccount_mem(ctx->user,
7280                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
7281         free_uid(ctx->user);
7282         put_cred(ctx->creds);
7283         kfree(ctx->cancel_hash);
7284         kmem_cache_free(req_cachep, ctx->fallback_req);
7285         kfree(ctx);
7286 }
7287
7288 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
7289 {
7290         struct io_ring_ctx *ctx = file->private_data;
7291         __poll_t mask = 0;
7292
7293         poll_wait(file, &ctx->cq_wait, wait);
7294         /*
7295          * synchronizes with barrier from wq_has_sleeper call in
7296          * io_commit_cqring
7297          */
7298         smp_rmb();
7299         if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
7300             ctx->rings->sq_ring_entries)
7301                 mask |= EPOLLOUT | EPOLLWRNORM;
7302         if (io_cqring_events(ctx, false))
7303                 mask |= EPOLLIN | EPOLLRDNORM;
7304
7305         return mask;
7306 }
7307
7308 static int io_uring_fasync(int fd, struct file *file, int on)
7309 {
7310         struct io_ring_ctx *ctx = file->private_data;
7311
7312         return fasync_helper(fd, file, on, &ctx->cq_fasync);
7313 }
7314
7315 static int io_remove_personalities(int id, void *p, void *data)
7316 {
7317         struct io_ring_ctx *ctx = data;
7318         const struct cred *cred;
7319
7320         cred = idr_remove(&ctx->personality_idr, id);
7321         if (cred)
7322                 put_cred(cred);
7323         return 0;
7324 }
7325
7326 static void io_ring_exit_work(struct work_struct *work)
7327 {
7328         struct io_ring_ctx *ctx;
7329
7330         ctx = container_of(work, struct io_ring_ctx, exit_work);
7331         if (ctx->rings)
7332                 io_cqring_overflow_flush(ctx, true);
7333
7334         wait_for_completion(&ctx->ref_comp);
7335         io_ring_ctx_free(ctx);
7336 }
7337
7338 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
7339 {
7340         mutex_lock(&ctx->uring_lock);
7341         percpu_ref_kill(&ctx->refs);
7342         mutex_unlock(&ctx->uring_lock);
7343
7344         io_kill_timeouts(ctx);
7345         io_poll_remove_all(ctx);
7346
7347         if (ctx->io_wq)
7348                 io_wq_cancel_all(ctx->io_wq);
7349
7350         io_iopoll_reap_events(ctx);
7351         /* if we failed setting up the ctx, we might not have any rings */
7352         if (ctx->rings)
7353                 io_cqring_overflow_flush(ctx, true);
7354         idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
7355         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
7356         queue_work(system_wq, &ctx->exit_work);
7357 }
7358
7359 static int io_uring_release(struct inode *inode, struct file *file)
7360 {
7361         struct io_ring_ctx *ctx = file->private_data;
7362
7363         file->private_data = NULL;
7364         io_ring_ctx_wait_and_kill(ctx);
7365         return 0;
7366 }
7367
7368 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
7369                                   struct files_struct *files)
7370 {
7371         while (!list_empty_careful(&ctx->inflight_list)) {
7372                 struct io_kiocb *cancel_req = NULL, *req;
7373                 DEFINE_WAIT(wait);
7374
7375                 spin_lock_irq(&ctx->inflight_lock);
7376                 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
7377                         if (req->work.files != files)
7378                                 continue;
7379                         /* req is being completed, ignore */
7380                         if (!refcount_inc_not_zero(&req->refs))
7381                                 continue;
7382                         cancel_req = req;
7383                         break;
7384                 }
7385                 if (cancel_req)
7386                         prepare_to_wait(&ctx->inflight_wait, &wait,
7387                                                 TASK_UNINTERRUPTIBLE);
7388                 spin_unlock_irq(&ctx->inflight_lock);
7389
7390                 /* We need to keep going until we don't find a matching req */
7391                 if (!cancel_req)
7392                         break;
7393
7394                 if (cancel_req->flags & REQ_F_OVERFLOW) {
7395                         spin_lock_irq(&ctx->completion_lock);
7396                         list_del(&cancel_req->list);
7397                         cancel_req->flags &= ~REQ_F_OVERFLOW;
7398                         if (list_empty(&ctx->cq_overflow_list)) {
7399                                 clear_bit(0, &ctx->sq_check_overflow);
7400                                 clear_bit(0, &ctx->cq_check_overflow);
7401                         }
7402                         spin_unlock_irq(&ctx->completion_lock);
7403
7404                         WRITE_ONCE(ctx->rings->cq_overflow,
7405                                 atomic_inc_return(&ctx->cached_cq_overflow));
7406
7407                         /*
7408                          * Put inflight ref and overflow ref. If that's
7409                          * all we had, then we're done with this request.
7410                          */
7411                         if (refcount_sub_and_test(2, &cancel_req->refs)) {
7412                                 io_free_req(cancel_req);
7413                                 finish_wait(&ctx->inflight_wait, &wait);
7414                                 continue;
7415                         }
7416                 } else {
7417                         io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
7418                         io_put_req(cancel_req);
7419                 }
7420
7421                 schedule();
7422                 finish_wait(&ctx->inflight_wait, &wait);
7423         }
7424 }
7425
7426 static int io_uring_flush(struct file *file, void *data)
7427 {
7428         struct io_ring_ctx *ctx = file->private_data;
7429
7430         io_uring_cancel_files(ctx, data);
7431
7432         /*
7433          * If the task is going away, cancel work it may have pending
7434          */
7435         if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
7436                 io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
7437
7438         return 0;
7439 }
7440
7441 static void *io_uring_validate_mmap_request(struct file *file,
7442                                             loff_t pgoff, size_t sz)
7443 {
7444         struct io_ring_ctx *ctx = file->private_data;
7445         loff_t offset = pgoff << PAGE_SHIFT;
7446         struct page *page;
7447         void *ptr;
7448
7449         switch (offset) {
7450         case IORING_OFF_SQ_RING:
7451         case IORING_OFF_CQ_RING:
7452                 ptr = ctx->rings;
7453                 break;
7454         case IORING_OFF_SQES:
7455                 ptr = ctx->sq_sqes;
7456                 break;
7457         default:
7458                 return ERR_PTR(-EINVAL);
7459         }
7460
7461         page = virt_to_head_page(ptr);
7462         if (sz > page_size(page))
7463                 return ERR_PTR(-EINVAL);
7464
7465         return ptr;
7466 }
7467
7468 #ifdef CONFIG_MMU
7469
7470 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7471 {
7472         size_t sz = vma->vm_end - vma->vm_start;
7473         unsigned long pfn;
7474         void *ptr;
7475
7476         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
7477         if (IS_ERR(ptr))
7478                 return PTR_ERR(ptr);
7479
7480         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
7481         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
7482 }
7483
7484 #else /* !CONFIG_MMU */
7485
7486 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7487 {
7488         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
7489 }
7490
7491 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
7492 {
7493         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
7494 }
7495
/*
 * ->get_unmapped_area() (!CONFIG_MMU): the "mapping" is the kernel address
 * of the requested ring region itself. Returns that address, or the negative
 * errno from io_uring_validate_mmap_request() converted to unsigned long.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *kaddr = io_uring_validate_mmap_request(file, pgoff, len);

	if (IS_ERR(kaddr))
		return PTR_ERR(kaddr);

	return (unsigned long) kaddr;
}
7508
7509 #endif /* !CONFIG_MMU */
7510
7511 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
7512                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
7513                 size_t, sigsz)
7514 {
7515         struct io_ring_ctx *ctx;
7516         long ret = -EBADF;
7517         int submitted = 0;
7518         struct fd f;
7519
7520         if (current->task_works)
7521                 task_work_run();
7522
7523         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
7524                 return -EINVAL;
7525
7526         f = fdget(fd);
7527         if (!f.file)
7528                 return -EBADF;
7529
7530         ret = -EOPNOTSUPP;
7531         if (f.file->f_op != &io_uring_fops)
7532                 goto out_fput;
7533
7534         ret = -ENXIO;
7535         ctx = f.file->private_data;
7536         if (!percpu_ref_tryget(&ctx->refs))
7537                 goto out_fput;
7538
7539         /*
7540          * For SQ polling, the thread will do all submissions and completions.
7541          * Just return the requested submit count, and wake the thread if
7542          * we were asked to.
7543          */
7544         ret = 0;
7545         if (ctx->flags & IORING_SETUP_SQPOLL) {
7546                 if (!list_empty_careful(&ctx->cq_overflow_list))
7547                         io_cqring_overflow_flush(ctx, false);
7548                 if (flags & IORING_ENTER_SQ_WAKEUP)
7549                         wake_up(&ctx->sqo_wait);
7550                 submitted = to_submit;
7551         } else if (to_submit) {
7552                 mutex_lock(&ctx->uring_lock);
7553                 submitted = io_submit_sqes(ctx, to_submit, f.file, fd);
7554                 mutex_unlock(&ctx->uring_lock);
7555
7556                 if (submitted != to_submit)
7557                         goto out;
7558         }
7559         if (flags & IORING_ENTER_GETEVENTS) {
7560                 unsigned nr_events = 0;
7561
7562                 min_complete = min(min_complete, ctx->cq_entries);
7563
7564                 /*
7565                  * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
7566                  * space applications don't need to do io completion events
7567                  * polling again, they can rely on io_sq_thread to do polling
7568                  * work, which can reduce cpu usage and uring_lock contention.
7569                  */
7570                 if (ctx->flags & IORING_SETUP_IOPOLL &&
7571                     !(ctx->flags & IORING_SETUP_SQPOLL)) {
7572                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
7573                 } else {
7574                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
7575                 }
7576         }
7577
7578 out:
7579         percpu_ref_put(&ctx->refs);
7580 out_fput:
7581         fdput(f);
7582         return submitted ? submitted : ret;
7583 }
7584
7585 #ifdef CONFIG_PROC_FS
7586 static int io_uring_show_cred(int id, void *p, void *data)
7587 {
7588         const struct cred *cred = p;
7589         struct seq_file *m = data;
7590         struct user_namespace *uns = seq_user_ns(m);
7591         struct group_info *gi;
7592         kernel_cap_t cap;
7593         unsigned __capi;
7594         int g;
7595
7596         seq_printf(m, "%5d\n", id);
7597         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
7598         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
7599         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
7600         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
7601         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
7602         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
7603         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
7604         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
7605         seq_puts(m, "\n\tGroups:\t");
7606         gi = cred->group_info;
7607         for (g = 0; g < gi->ngroups; g++) {
7608                 seq_put_decimal_ull(m, g ? " " : "",
7609                                         from_kgid_munged(uns, gi->gid[g]));
7610         }
7611         seq_puts(m, "\n\tCapEff:\t");
7612         cap = cred->cap_effective;
7613         CAP_FOR_EACH_U32(__capi)
7614                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
7615         seq_putc(m, '\n');
7616         return 0;
7617 }
7618
7619 static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
7620 {
7621         int i;
7622
7623         mutex_lock(&ctx->uring_lock);
7624         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
7625         for (i = 0; i < ctx->nr_user_files; i++) {
7626                 struct fixed_file_table *table;
7627                 struct file *f;
7628
7629                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7630                 f = table->files[i & IORING_FILE_TABLE_MASK];
7631                 if (f)
7632                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
7633                 else
7634                         seq_printf(m, "%5u: <none>\n", i);
7635         }
7636         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
7637         for (i = 0; i < ctx->nr_user_bufs; i++) {
7638                 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
7639
7640                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
7641                                                 (unsigned int) buf->len);
7642         }
7643         if (!idr_is_empty(&ctx->personality_idr)) {
7644                 seq_printf(m, "Personalities:\n");
7645                 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
7646         }
7647         seq_printf(m, "PollList:\n");
7648         spin_lock_irq(&ctx->completion_lock);
7649         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
7650                 struct hlist_head *list = &ctx->cancel_hash[i];
7651                 struct io_kiocb *req;
7652
7653                 hlist_for_each_entry(req, list, hash_node)
7654                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
7655                                         req->task->task_works != NULL);
7656         }
7657         spin_unlock_irq(&ctx->completion_lock);
7658         mutex_unlock(&ctx->uring_lock);
7659 }
7660
7661 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
7662 {
7663         struct io_ring_ctx *ctx = f->private_data;
7664
7665         if (percpu_ref_tryget(&ctx->refs)) {
7666                 __io_uring_show_fdinfo(ctx, m);
7667                 percpu_ref_put(&ctx->refs);
7668         }
7669 }
7670 #endif
7671
7672 static const struct file_operations io_uring_fops = {
7673         .release        = io_uring_release,
7674         .flush          = io_uring_flush,
7675         .mmap           = io_uring_mmap,
7676 #ifndef CONFIG_MMU
7677         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
7678         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
7679 #endif
7680         .poll           = io_uring_poll,
7681         .fasync         = io_uring_fasync,
7682 #ifdef CONFIG_PROC_FS
7683         .show_fdinfo    = io_uring_show_fdinfo,
7684 #endif
7685 };
7686
7687 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
7688                                   struct io_uring_params *p)
7689 {
7690         struct io_rings *rings;
7691         size_t size, sq_array_offset;
7692
7693         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
7694         if (size == SIZE_MAX)
7695                 return -EOVERFLOW;
7696
7697         rings = io_mem_alloc(size);
7698         if (!rings)
7699                 return -ENOMEM;
7700
7701         ctx->rings = rings;
7702         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
7703         rings->sq_ring_mask = p->sq_entries - 1;
7704         rings->cq_ring_mask = p->cq_entries - 1;
7705         rings->sq_ring_entries = p->sq_entries;
7706         rings->cq_ring_entries = p->cq_entries;
7707         ctx->sq_mask = rings->sq_ring_mask;
7708         ctx->cq_mask = rings->cq_ring_mask;
7709         ctx->sq_entries = rings->sq_ring_entries;
7710         ctx->cq_entries = rings->cq_ring_entries;
7711
7712         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
7713         if (size == SIZE_MAX) {
7714                 io_mem_free(ctx->rings);
7715                 ctx->rings = NULL;
7716                 return -EOVERFLOW;
7717         }
7718
7719         ctx->sq_sqes = io_mem_alloc(size);
7720         if (!ctx->sq_sqes) {
7721                 io_mem_free(ctx->rings);
7722                 ctx->rings = NULL;
7723                 return -ENOMEM;
7724         }
7725
7726         return 0;
7727 }
7728
7729 /*
7730  * Allocate an anonymous fd, this is what constitutes the application
7731  * visible backing of an io_uring instance. The application mmaps this
7732  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
7733  * we have to tie this fd to a socket for file garbage collection purposes.
7734  */
7735 static int io_uring_get_fd(struct io_ring_ctx *ctx)
7736 {
7737         struct file *file;
7738         int ret;
7739
7740 #if defined(CONFIG_UNIX)
7741         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
7742                                 &ctx->ring_sock);
7743         if (ret)
7744                 return ret;
7745 #endif
7746
7747         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
7748         if (ret < 0)
7749                 goto err;
7750
7751         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
7752                                         O_RDWR | O_CLOEXEC);
7753         if (IS_ERR(file)) {
7754                 put_unused_fd(ret);
7755                 ret = PTR_ERR(file);
7756                 goto err;
7757         }
7758
7759 #if defined(CONFIG_UNIX)
7760         ctx->ring_sock->file = file;
7761 #endif
7762         fd_install(ret, file);
7763         return ret;
7764 err:
7765 #if defined(CONFIG_UNIX)
7766         sock_release(ctx->ring_sock);
7767         ctx->ring_sock = NULL;
7768 #endif
7769         return ret;
7770 }
7771
7772 static int io_uring_create(unsigned entries, struct io_uring_params *p,
7773                            struct io_uring_params __user *params)
7774 {
7775         struct user_struct *user = NULL;
7776         struct io_ring_ctx *ctx;
7777         bool account_mem;
7778         int ret;
7779
7780         if (!entries)
7781                 return -EINVAL;
7782         if (entries > IORING_MAX_ENTRIES) {
7783                 if (!(p->flags & IORING_SETUP_CLAMP))
7784                         return -EINVAL;
7785                 entries = IORING_MAX_ENTRIES;
7786         }
7787
7788         /*
7789          * Use twice as many entries for the CQ ring. It's possible for the
7790          * application to drive a higher depth than the size of the SQ ring,
7791          * since the sqes are only used at submission time. This allows for
7792          * some flexibility in overcommitting a bit. If the application has
7793          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
7794          * of CQ ring entries manually.
7795          */
7796         p->sq_entries = roundup_pow_of_two(entries);
7797         if (p->flags & IORING_SETUP_CQSIZE) {
7798                 /*
7799                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
7800                  * to a power-of-two, if it isn't already. We do NOT impose
7801                  * any cq vs sq ring sizing.
7802                  */
7803                 if (p->cq_entries < p->sq_entries)
7804                         return -EINVAL;
7805                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
7806                         if (!(p->flags & IORING_SETUP_CLAMP))
7807                                 return -EINVAL;
7808                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
7809                 }
7810                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
7811         } else {
7812                 p->cq_entries = 2 * p->sq_entries;
7813         }
7814
7815         user = get_uid(current_user());
7816         account_mem = !capable(CAP_IPC_LOCK);
7817
7818         if (account_mem) {
7819                 ret = io_account_mem(user,
7820                                 ring_pages(p->sq_entries, p->cq_entries));
7821                 if (ret) {
7822                         free_uid(user);
7823                         return ret;
7824                 }
7825         }
7826
7827         ctx = io_ring_ctx_alloc(p);
7828         if (!ctx) {
7829                 if (account_mem)
7830                         io_unaccount_mem(user, ring_pages(p->sq_entries,
7831                                                                 p->cq_entries));
7832                 free_uid(user);
7833                 return -ENOMEM;
7834         }
7835         ctx->compat = in_compat_syscall();
7836         ctx->account_mem = account_mem;
7837         ctx->user = user;
7838         ctx->creds = get_current_cred();
7839
7840         ret = io_allocate_scq_urings(ctx, p);
7841         if (ret)
7842                 goto err;
7843
7844         ret = io_sq_offload_start(ctx, p);
7845         if (ret)
7846                 goto err;
7847
7848         memset(&p->sq_off, 0, sizeof(p->sq_off));
7849         p->sq_off.head = offsetof(struct io_rings, sq.head);
7850         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
7851         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
7852         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
7853         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
7854         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
7855         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
7856
7857         memset(&p->cq_off, 0, sizeof(p->cq_off));
7858         p->cq_off.head = offsetof(struct io_rings, cq.head);
7859         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
7860         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
7861         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
7862         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
7863         p->cq_off.cqes = offsetof(struct io_rings, cqes);
7864         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
7865
7866         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
7867                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
7868                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
7869
7870         if (copy_to_user(params, p, sizeof(*p))) {
7871                 ret = -EFAULT;
7872                 goto err;
7873         }
7874         /*
7875          * Install ring fd as the very last thing, so we don't risk someone
7876          * having closed it before we finish setup
7877          */
7878         ret = io_uring_get_fd(ctx);
7879         if (ret < 0)
7880                 goto err;
7881
7882         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
7883         return ret;
7884 err:
7885         io_ring_ctx_wait_and_kill(ctx);
7886         return ret;
7887 }
7888
7889 /*
7890  * Sets up an aio uring context, and returns the fd. Applications asks for a
7891  * ring size, we return the actual sq/cq ring sizes (among other things) in the
7892  * params structure passed in.
7893  */
7894 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
7895 {
7896         struct io_uring_params p;
7897         int i;
7898
7899         if (copy_from_user(&p, params, sizeof(p)))
7900                 return -EFAULT;
7901         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
7902                 if (p.resv[i])
7903                         return -EINVAL;
7904         }
7905
7906         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
7907                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
7908                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
7909                 return -EINVAL;
7910
7911         return  io_uring_create(entries, &p, params);
7912 }
7913
7914 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
7915                 struct io_uring_params __user *, params)
7916 {
7917         return io_uring_setup(entries, params);
7918 }
7919
7920 static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
7921 {
7922         struct io_uring_probe *p;
7923         size_t size;
7924         int i, ret;
7925
7926         size = struct_size(p, ops, nr_args);
7927         if (size == SIZE_MAX)
7928                 return -EOVERFLOW;
7929         p = kzalloc(size, GFP_KERNEL);
7930         if (!p)
7931                 return -ENOMEM;
7932
7933         ret = -EFAULT;
7934         if (copy_from_user(p, arg, size))
7935                 goto out;
7936         ret = -EINVAL;
7937         if (memchr_inv(p, 0, size))
7938                 goto out;
7939
7940         p->last_op = IORING_OP_LAST - 1;
7941         if (nr_args > IORING_OP_LAST)
7942                 nr_args = IORING_OP_LAST;
7943
7944         for (i = 0; i < nr_args; i++) {
7945                 p->ops[i].op = i;
7946                 if (!io_op_defs[i].not_supported)
7947                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
7948         }
7949         p->ops_len = i;
7950
7951         ret = 0;
7952         if (copy_to_user(arg, p, size))
7953                 ret = -EFAULT;
7954 out:
7955         kfree(p);
7956         return ret;
7957 }
7958
7959 static int io_register_personality(struct io_ring_ctx *ctx)
7960 {
7961         const struct cred *creds = get_current_cred();
7962         int id;
7963
7964         id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
7965                                 USHRT_MAX, GFP_KERNEL);
7966         if (id < 0)
7967                 put_cred(creds);
7968         return id;
7969 }
7970
7971 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
7972 {
7973         const struct cred *old_creds;
7974
7975         old_creds = idr_remove(&ctx->personality_idr, id);
7976         if (old_creds) {
7977                 put_cred(old_creds);
7978                 return 0;
7979         }
7980
7981         return -EINVAL;
7982 }
7983
7984 static bool io_register_op_must_quiesce(int op)
7985 {
7986         switch (op) {
7987         case IORING_UNREGISTER_FILES:
7988         case IORING_REGISTER_FILES_UPDATE:
7989         case IORING_REGISTER_PROBE:
7990         case IORING_REGISTER_PERSONALITY:
7991         case IORING_UNREGISTER_PERSONALITY:
7992                 return false;
7993         default:
7994                 return true;
7995         }
7996 }
7997
7998 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
7999                                void __user *arg, unsigned nr_args)
8000         __releases(ctx->uring_lock)
8001         __acquires(ctx->uring_lock)
8002 {
8003         int ret;
8004
8005         /*
8006          * We're inside the ring mutex, if the ref is already dying, then
8007          * someone else killed the ctx or is already going through
8008          * io_uring_register().
8009          */
8010         if (percpu_ref_is_dying(&ctx->refs))
8011                 return -ENXIO;
8012
8013         if (io_register_op_must_quiesce(opcode)) {
8014                 percpu_ref_kill(&ctx->refs);
8015
8016                 /*
8017                  * Drop uring mutex before waiting for references to exit. If
8018                  * another thread is currently inside io_uring_enter() it might
8019                  * need to grab the uring_lock to make progress. If we hold it
8020                  * here across the drain wait, then we can deadlock. It's safe
8021                  * to drop the mutex here, since no new references will come in
8022                  * after we've killed the percpu ref.
8023                  */
8024                 mutex_unlock(&ctx->uring_lock);
8025                 ret = wait_for_completion_interruptible(&ctx->ref_comp);
8026                 mutex_lock(&ctx->uring_lock);
8027                 if (ret) {
8028                         percpu_ref_resurrect(&ctx->refs);
8029                         ret = -EINTR;
8030                         goto out;
8031                 }
8032         }
8033
8034         switch (opcode) {
8035         case IORING_REGISTER_BUFFERS:
8036                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
8037                 break;
8038         case IORING_UNREGISTER_BUFFERS:
8039                 ret = -EINVAL;
8040                 if (arg || nr_args)
8041                         break;
8042                 ret = io_sqe_buffer_unregister(ctx);
8043                 break;
8044         case IORING_REGISTER_FILES:
8045                 ret = io_sqe_files_register(ctx, arg, nr_args);
8046                 break;
8047         case IORING_UNREGISTER_FILES:
8048                 ret = -EINVAL;
8049                 if (arg || nr_args)
8050                         break;
8051                 ret = io_sqe_files_unregister(ctx);
8052                 break;
8053         case IORING_REGISTER_FILES_UPDATE:
8054                 ret = io_sqe_files_update(ctx, arg, nr_args);
8055                 break;
8056         case IORING_REGISTER_EVENTFD:
8057         case IORING_REGISTER_EVENTFD_ASYNC:
8058                 ret = -EINVAL;
8059                 if (nr_args != 1)
8060                         break;
8061                 ret = io_eventfd_register(ctx, arg);
8062                 if (ret)
8063                         break;
8064                 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
8065                         ctx->eventfd_async = 1;
8066                 else
8067                         ctx->eventfd_async = 0;
8068                 break;
8069         case IORING_UNREGISTER_EVENTFD:
8070                 ret = -EINVAL;
8071                 if (arg || nr_args)
8072                         break;
8073                 ret = io_eventfd_unregister(ctx);
8074                 break;
8075         case IORING_REGISTER_PROBE:
8076                 ret = -EINVAL;
8077                 if (!arg || nr_args > 256)
8078                         break;
8079                 ret = io_probe(ctx, arg, nr_args);
8080                 break;
8081         case IORING_REGISTER_PERSONALITY:
8082                 ret = -EINVAL;
8083                 if (arg || nr_args)
8084                         break;
8085                 ret = io_register_personality(ctx);
8086                 break;
8087         case IORING_UNREGISTER_PERSONALITY:
8088                 ret = -EINVAL;
8089                 if (arg)
8090                         break;
8091                 ret = io_unregister_personality(ctx, nr_args);
8092                 break;
8093         default:
8094                 ret = -EINVAL;
8095                 break;
8096         }
8097
8098         if (io_register_op_must_quiesce(opcode)) {
8099                 /* bring the ctx back to life */
8100                 percpu_ref_reinit(&ctx->refs);
8101 out:
8102                 reinit_completion(&ctx->ref_comp);
8103         }
8104         return ret;
8105 }
8106
8107 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
8108                 void __user *, arg, unsigned int, nr_args)
8109 {
8110         struct io_ring_ctx *ctx;
8111         long ret = -EBADF;
8112         struct fd f;
8113
8114         f = fdget(fd);
8115         if (!f.file)
8116                 return -EBADF;
8117
8118         ret = -EOPNOTSUPP;
8119         if (f.file->f_op != &io_uring_fops)
8120                 goto out_fput;
8121
8122         ctx = f.file->private_data;
8123
8124         mutex_lock(&ctx->uring_lock);
8125         ret = __io_uring_register(ctx, opcode, arg, nr_args);
8126         mutex_unlock(&ctx->uring_lock);
8127         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
8128                                                         ctx->cq_ev_fd != NULL, ret);
8129 out_fput:
8130         fdput(f);
8131         return ret;
8132 }
8133
8134 static int __init io_uring_init(void)
8135 {
8136 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
8137         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
8138         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
8139 } while (0)
8140
8141 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
8142         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
8143         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
8144         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
8145         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
8146         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
8147         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
8148         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
8149         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
8150         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
8151         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
8152         BUILD_BUG_SQE_ELEM(24, __u32,  len);
8153         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
8154         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
8155         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
8156         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
8157         BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
8158         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
8159         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
8160         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
8161         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
8162         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
8163         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
8164         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
8165         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
8166         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
8167         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
8168         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
8169         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
8170         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
8171
8172         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
8173         BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
8174         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
8175         return 0;
8176 };
8177 __initcall(io_uring_init);