Merge tag 'for-5.11/block-2020-12-14' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 16 Dec 2020 20:57:51 +0000 (12:57 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 16 Dec 2020 20:57:51 +0000 (12:57 -0800)
Pull block updates from Jens Axboe:
 "Another series of killing more code than what is being added, again
  thanks to Christoph's relentless cleanups and tech debt tackling.

  This contains:

   - blk-iocost improvements (Baolin Wang)

   - part0 iostat fix (Jeffle Xu)

   - Disable iopoll for split bios (Jeffle Xu)

   - block tracepoint cleanups (Christoph Hellwig)

   - Merging of struct block_device and hd_struct (Christoph Hellwig)

   - Rework/cleanup of how block device sizes are updated (Christoph
     Hellwig)

   - Simplification of gendisk lookup and removal of block device
     aliasing (Christoph Hellwig)

   - Block device ioctl cleanups (Christoph Hellwig)

   - Removal of bdget()/blkdev_get() as exported API (Christoph Hellwig)

   - Disk change rework, avoid ->revalidate_disk() (Christoph Hellwig)

   - sbitmap improvements (Pavel Begunkov)

   - Hybrid polling fix (Pavel Begunkov)

   - bvec iteration improvements (Pavel Begunkov)

   - Zone revalidation fixes (Damien Le Moal)

   - blk-throttle limit fix (Yu Kuai)

   - Various little fixes"

* tag 'for-5.11/block-2020-12-14' of git://git.kernel.dk/linux-block: (126 commits)
  blk-mq: fix msec comment from micro to milli seconds
  blk-mq: update arg in comment of blk_mq_map_queue
  blk-mq: add helper allocating tagset->tags
  Revert "block: Fix a lockdep complaint triggered by request queue flushing"
  nvme-loop: use blk_mq_hctx_set_fq_lock_class to set loop's lock class
  blk-mq: add new API of blk_mq_hctx_set_fq_lock_class
  block: disable iopoll for split bio
  block: Improve blk_revalidate_disk_zones() checks
  sbitmap: simplify wrap check
  sbitmap: replace CAS with atomic and
  sbitmap: remove swap_lock
  sbitmap: optimise sbitmap_deferred_clear()
  blk-mq: skip hybrid polling if iopoll doesn't spin
  blk-iocost: Factor out the base vrate change into a separate function
  blk-iocost: Factor out the active iocgs' state check into a separate function
  blk-iocost: Move the usage ratio calculation to the correct place
  blk-iocost: Remove unnecessary advance declaration
  blk-iocost: Fix some typos in comments
  blktrace: fix up a kerneldoc comment
  block: remove the request_queue to argument request based tracepoints
  ...

29 files changed:
block/blk-flush.c
block/blk-merge.c
block/blk-mq.c
drivers/block/xen-blkback/common.h
drivers/block/zram/zram_drv.c
drivers/block/zram/zram_drv.h
drivers/ide/ide-probe.c
drivers/md/dm-raid.c
drivers/md/dm-table.c
drivers/md/dm.c
drivers/md/md.c
drivers/md/raid0.c
drivers/md/raid10.c
drivers/nvme/host/core.c
drivers/s390/block/dasd.c
drivers/s390/block/dasd_int.h
fs/btrfs/sysfs.c
fs/btrfs/volumes.c
fs/btrfs/zoned.c
fs/buffer.c
fs/ext4/super.c
fs/f2fs/f2fs.h
fs/internal.h
fs/io_uring.c
fs/pstore/blk.c
include/linux/blkdev.h
include/linux/fs.h
kernel/trace/blktrace.c
mm/filemap.c

diff --combined block/blk-flush.c
@@@ -69,7 -69,6 +69,6 @@@
  #include <linux/blkdev.h>
  #include <linux/gfp.h>
  #include <linux/blk-mq.h>
- #include <linux/lockdep.h>
  
  #include "blk.h"
  #include "blk-mq.h"
@@@ -139,7 -138,7 +138,7 @@@ static void blk_flush_queue_rq(struct r
  
  static void blk_account_io_flush(struct request *rq)
  {
-       struct hd_struct *part = &rq->rq_disk->part0;
+       struct block_device *part = rq->rq_disk->part0;
  
        part_stat_lock();
        part_stat_inc(part, ios[STAT_FLUSH]);
@@@ -225,18 -224,13 +224,18 @@@ static void flush_end_io(struct reques
        /* release the tag's ownership to the req cloned from */
        spin_lock_irqsave(&fq->mq_flush_lock, flags);
  
 -      WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
        if (!refcount_dec_and_test(&flush_rq->ref)) {
                fq->rq_status = error;
                spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
                return;
        }
  
 +      /*
 +       * The flush request has to be marked as IDLE when it is really
 +       * ended, because its .end_io() is called from the timeout code
 +       * path too; this ordering avoids a use-after-free.
 +       */
 +      WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
        if (fq->rq_status != BLK_STS_OK)
                error = fq->rq_status;
  
@@@ -474,9 -468,6 +473,6 @@@ struct blk_flush_queue *blk_alloc_flush
        INIT_LIST_HEAD(&fq->flush_queue[1]);
        INIT_LIST_HEAD(&fq->flush_data_in_flight);
  
-       lockdep_register_key(&fq->key);
-       lockdep_set_class(&fq->mq_flush_lock, &fq->key);
        return fq;
  
   fail_rq:
@@@ -491,7 -482,31 +487,31 @@@ void blk_free_flush_queue(struct blk_fl
        if (!fq)
                return;
  
-       lockdep_unregister_key(&fq->key);
        kfree(fq->flush_rq);
        kfree(fq);
  }
+ /*
+  * Allow a driver to set its own lock class for fq->mq_flush_lock to
+  * avoid a lockdep complaint.
+  *
+  * flush_end_io() may be called recursively from some drivers, such as
+  * nvme-loop, so lockdep may complain about 'possible recursive locking'
+  * because all 'struct blk_flush_queue' instances share the same
+  * mq_flush_lock lock class key. We need to assign a different lock
+  * class to these drivers' fq->mq_flush_lock to avoid the lockdep
+  * warning.
+  *
+  * Using a dynamically allocated lock class key for each 'blk_flush_queue'
+  * instance would be overkill, and worse, it introduces a horrible boot
+  * delay because synchronize_rcu() is implied in lockdep_unregister_key(),
+  * which is called for each hctx release. SCSI probing may synchronously
+  * create and destroy lots of MQ request_queues for non-existent devices,
+  * and some robot test kernels always enable the lockdep option. More than
+  * half an hour has been observed during SCSI MQ probe with a per-fq lock
+  * class.
+  */
+ void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
+               struct lock_class_key *key)
+ {
+       lockdep_set_class(&hctx->fq->mq_flush_lock, key);
+ }
+ EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class);
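
As context for the new export (not part of the diff above): a minimal sketch of how a driver with a recursive flush path, such as nvme-loop per the shortlog, might wire this up. Only blk_mq_hctx_set_fq_lock_class() and struct lock_class_key come from this series; the loop_* names and the ops wiring are illustrative.

static struct lock_class_key loop_hctx_fq_lock_key;

static int loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
		unsigned int hctx_idx)
{
	/* give this driver's flush queues their own lockdep class */
	blk_mq_hctx_set_fq_lock_class(hctx, &loop_hctx_fq_lock_key);
	return 0;
}

static const struct blk_mq_ops loop_mq_ops = {
	/* .queue_rq and friends elided */
	.init_hctx	= loop_init_hctx,
};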
diff --combined block/blk-merge.c
@@@ -144,7 -144,7 +144,7 @@@ static struct bio *blk_bio_write_same_s
  static inline unsigned get_max_io_size(struct request_queue *q,
                                       struct bio *bio)
  {
 -      unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector);
 +      unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0);
        unsigned max_sectors = sectors;
        unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
        unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
@@@ -279,6 -279,14 +279,14 @@@ static struct bio *blk_bio_segment_spli
        return NULL;
  split:
        *segs = nsegs;
+       /*
+        * Bio splitting may cause subtle trouble, such as a hang when doing
+        * sync iopoll in the direct IO routine. Given that the performance
+        * gain of iopoll for big IO can be trivial, disable iopoll when a
+        * split is needed.
+        */
+       bio->bi_opf &= ~REQ_HIPRI;
        return bio_split(bio, sectors, GFP_NOIO, bs);
  }
  
@@@ -338,7 -346,7 +346,7 @@@ void __blk_queue_split(struct bio **bio
                split->bi_opf |= REQ_NOMERGE;
  
                bio_chain(split, *bio);
-               trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
+               trace_block_split(split, (*bio)->bi_iter.bi_sector);
                submit_bio_noacct(*bio);
                *bio = split;
        }
@@@ -683,8 -691,6 +691,6 @@@ static void blk_account_io_merge_reques
                part_stat_lock();
                part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
                part_stat_unlock();
-               hd_struct_put(req->part);
        }
  }
  
@@@ -801,7 -807,7 +807,7 @@@ static struct request *attempt_merge(st
         */
        blk_account_io_merge_request(next);
  
-       trace_block_rq_merge(q, next);
+       trace_block_rq_merge(next);
  
        /*
         * ownership of bio passed from next to req, return 'next' for
@@@ -924,7 -930,7 +930,7 @@@ static enum bio_merge_status bio_attemp
        if (!ll_back_merge_fn(req, bio, nr_segs))
                return BIO_MERGE_FAILED;
  
-       trace_block_bio_backmerge(req->q, req, bio);
+       trace_block_bio_backmerge(bio);
        rq_qos_merge(req->q, req, bio);
  
        if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
@@@ -948,7 -954,7 +954,7 @@@ static enum bio_merge_status bio_attemp
        if (!ll_front_merge_fn(req, bio, nr_segs))
                return BIO_MERGE_FAILED;
  
-       trace_block_bio_frontmerge(req->q, req, bio);
+       trace_block_bio_frontmerge(bio);
        rq_qos_merge(req->q, req, bio);
  
        if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
diff --combined block/blk-mq.c
@@@ -95,7 -95,7 +95,7 @@@ static void blk_mq_hctx_clear_pending(s
  }
  
  struct mq_inflight {
-       struct hd_struct *part;
+       struct block_device *part;
        unsigned int inflight[2];
  };
  
@@@ -105,13 -105,15 +105,15 @@@ static bool blk_mq_check_inflight(struc
  {
        struct mq_inflight *mi = priv;
  
-       if (rq->part == mi->part && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
+       if ((!mi->part->bd_partno || rq->part == mi->part) &&
+           blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
                mi->inflight[rq_data_dir(rq)]++;
  
        return true;
  }
  
- unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
+ unsigned int blk_mq_in_flight(struct request_queue *q,
+               struct block_device *part)
  {
        struct mq_inflight mi = { .part = part };
  
        return mi.inflight[0] + mi.inflight[1];
  }
  
- void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
-                        unsigned int inflight[2])
+ void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
+               unsigned int inflight[2])
  {
        struct mq_inflight mi = { .part = part };
  
@@@ -671,7 -673,9 +673,7 @@@ bool blk_mq_complete_request_remote(str
                return false;
  
        if (blk_mq_complete_need_ipi(rq)) {
 -              rq->csd.func = __blk_mq_complete_request_remote;
 -              rq->csd.info = rq;
 -              rq->csd.flags = 0;
 +              INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
                smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
        } else {
                if (rq->q->nr_hw_queues > 1)
@@@ -729,7 -733,7 +731,7 @@@ void blk_mq_start_request(struct reques
  {
        struct request_queue *q = rq->q;
  
-       trace_block_rq_issue(q, rq);
+       trace_block_rq_issue(rq);
  
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
                rq->io_start_time_ns = ktime_get_ns();
@@@ -756,7 -760,7 +758,7 @@@ static void __blk_mq_requeue_request(st
  
        blk_mq_put_driver_tag(rq);
  
-       trace_block_rq_requeue(q, rq);
+       trace_block_rq_requeue(rq);
        rq_qos_requeue(q, rq);
  
        if (blk_mq_request_started(rq)) {
@@@ -1590,7 -1594,7 +1592,7 @@@ select_cpu
   * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
   * @hctx: Pointer to the hardware queue to run.
   * @async: If we want to run the queue asynchronously.
-  * @msecs: Microseconds of delay to wait before running the queue.
+  * @msecs: Milliseconds of delay to wait before running the queue.
   *
   * If !@async, try to run the queue now. Else, run the queue asynchronously and
   * with a delay of @msecs.
@@@ -1619,7 -1623,7 +1621,7 @@@ static void __blk_mq_delay_run_hw_queue
  /**
   * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
   * @hctx: Pointer to the hardware queue to run.
-  * @msecs: Microseconds of delay to wait before running the queue.
+  * @msecs: Milliseconds of delay to wait before running the queue.
   *
   * Run a hardware queue asynchronously with a delay of @msecs.
   */
@@@ -1683,7 -1687,7 +1685,7 @@@ EXPORT_SYMBOL(blk_mq_run_hw_queues)
  /**
   * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
   * @q: Pointer to the request queue to run.
-  * @msecs: Microseconds of delay to wait before running the queues.
+  * @msecs: Milliseconds of delay to wait before running the queues.
   */
  void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
  {
@@@ -1817,7 -1821,7 +1819,7 @@@ static inline void __blk_mq_insert_req_
  
        lockdep_assert_held(&ctx->lock);
  
-       trace_block_rq_insert(hctx->queue, rq);
+       trace_block_rq_insert(rq);
  
        if (at_head)
                list_add(&rq->queuelist, &ctx->rq_lists[type]);
@@@ -1874,7 -1878,7 +1876,7 @@@ void blk_mq_insert_requests(struct blk_
         */
        list_for_each_entry(rq, list, queuelist) {
                BUG_ON(rq->mq_ctx != ctx);
-               trace_block_rq_insert(hctx->queue, rq);
+               trace_block_rq_insert(rq);
        }
  
        spin_lock(&ctx->lock);
@@@ -2155,6 -2159,7 +2157,7 @@@ blk_qc_t blk_mq_submit_bio(struct bio *
        unsigned int nr_segs;
        blk_qc_t cookie;
        blk_status_t ret;
+       bool hipri;
  
        blk_queue_bounce(q, &bio);
        __blk_queue_split(&bio, &nr_segs);
  
        rq_qos_throttle(q, bio);
  
+       hipri = bio->bi_opf & REQ_HIPRI;
        data.cmd_flags = bio->bi_opf;
        rq = __blk_mq_alloc_request(&data);
        if (unlikely(!rq)) {
                goto queue_exit;
        }
  
-       trace_block_getrq(q, bio, bio->bi_opf);
+       trace_block_getrq(bio);
  
        rq_qos_track(q, rq, bio);
  
                blk_mq_sched_insert_request(rq, false, true, true);
        }
  
+       if (!hipri)
+               return BLK_QC_T_NONE;
        return cookie;
  queue_exit:
        blk_queue_exit(q);
@@@ -3373,6 -3382,12 +3380,12 @@@ static int blk_mq_realloc_tag_set_tags(
        return 0;
  }
  
+ static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
+                               int new_nr_hw_queues)
+ {
+       return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
+ }
  /*
   * Alloc a tag set to be associated with one or more request queues.
   * May fail with EINVAL for various error conditions. May adjust the
@@@ -3426,7 -3441,7 +3439,7 @@@ int blk_mq_alloc_tag_set(struct blk_mq_
        if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
                set->nr_hw_queues = nr_cpu_ids;
  
-       if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
+       if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
                return -ENOMEM;
  
        ret = -ENOMEM;
@@@ -3861,9 -3876,10 +3874,10 @@@ int blk_poll(struct request_queue *q, b
         * the state. Like for the other success return cases, the
         * caller is responsible for checking if the IO completed. If
         * the IO isn't complete, we'll get called again and will go
-        * straight to the busy poll loop.
+        * straight to the busy poll loop. If specified not to spin,
+        * we also should not sleep.
         */
-       if (blk_mq_poll_hybrid(q, hctx, cookie))
+       if (spin && blk_mq_poll_hybrid(q, hctx, cookie))
                return 1;
  
        hctx->poll_considered++;
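
The REQ_HIPRI/cookie handling in blk_mq_submit_bio() above pairs with the submitter's polling loop. A rough sketch of that caller side, modeled on the direct-IO completion wait (bio, bdev and the done flag are illustrative, not taken from this merge): when the bio was split, REQ_HIPRI has already been cleared in blk-merge.c, the returned cookie is BLK_QC_T_NONE, and the loop falls back to sleeping instead of polling.

	blk_qc_t qc = submit_bio(bio);

	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(done))
			break;
		/* split or non-polled bio: no valid cookie, just sleep */
		if (qc == BLK_QC_T_NONE ||
		    blk_poll(bdev_get_queue(bdev), qc, true) <= 0)
			blk_io_schedule();
	}
	__set_current_state(TASK_RUNNING);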
diff --combined drivers/block/xen-blkback/common.h
@@@ -288,7 -288,9 +288,7 @@@ struct xen_blkif_ring 
        struct work_struct      persistent_purge_work;
  
        /* Buffer of free pages to map grant refs. */
 -      spinlock_t              free_pages_lock;
 -      int                     free_pages_num;
 -      struct list_head        free_pages;
 +      struct gnttab_page_cache free_pages;
  
        struct work_struct      free_work;
        /* Thread shutdown wait queue. */
@@@ -356,9 -358,7 +356,7 @@@ struct pending_req 
  };
  
  
- #define vbd_sz(_v)    ((_v)->bdev->bd_part ? \
-                        (_v)->bdev->bd_part->nr_sects : \
-                         get_capacity((_v)->bdev->bd_disk))
+ #define vbd_sz(_v)    bdev_nr_sectors((_v)->bdev)
  
  #define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt))
  #define xen_blkif_put(_b)                             \
diff --combined drivers/block/zram/zram_drv.c
@@@ -42,7 -42,7 +42,7 @@@ static DEFINE_IDR(zram_index_idr)
  static DEFINE_MUTEX(zram_index_mutex);
  
  static int zram_major;
 -static const char *default_compressor = "lzo-rle";
 +static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
  
  /* Module params (documentation at end) */
  static unsigned int num_devices = 1;
@@@ -403,13 -403,10 +403,10 @@@ static void reset_bdev(struct zram *zra
                return;
  
        bdev = zram->bdev;
-       if (zram->old_block_size)
-               set_blocksize(bdev, zram->old_block_size);
        blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
        /* hope filp_close flush all of IO */
        filp_close(zram->backing_dev, NULL);
        zram->backing_dev = NULL;
-       zram->old_block_size = 0;
        zram->bdev = NULL;
        zram->disk->fops = &zram_devops;
        kvfree(zram->bitmap);
@@@ -454,7 -451,7 +451,7 @@@ static ssize_t backing_dev_store(struc
        struct file *backing_dev = NULL;
        struct inode *inode;
        struct address_space *mapping;
-       unsigned int bitmap_sz, old_block_size = 0;
+       unsigned int bitmap_sz;
        unsigned long nr_pages, *bitmap = NULL;
        struct block_device *bdev = NULL;
        int err;
                goto out;
        }
  
-       old_block_size = block_size(bdev);
-       err = set_blocksize(bdev, PAGE_SIZE);
-       if (err)
-               goto out;
        reset_bdev(zram);
  
-       zram->old_block_size = old_block_size;
        zram->bdev = bdev;
        zram->backing_dev = backing_dev;
        zram->bitmap = bitmap;
@@@ -620,19 -611,15 +611,19 @@@ static int read_from_bdev_async(struct 
        return 1;
  }
  
 +#define PAGE_WB_SIG "page_index="
 +
 +#define PAGE_WRITEBACK 0
  #define HUGE_WRITEBACK 1
  #define IDLE_WRITEBACK 2
  
 +
  static ssize_t writeback_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t len)
  {
        struct zram *zram = dev_to_zram(dev);
        unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
 -      unsigned long index;
 +      unsigned long index = 0;
        struct bio bio;
        struct bio_vec bio_vec;
        struct page *page;
                mode = IDLE_WRITEBACK;
        else if (sysfs_streq(buf, "huge"))
                mode = HUGE_WRITEBACK;
 -      else
 -              return -EINVAL;
 +      else {
 +              if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1))
 +                      return -EINVAL;
 +
 +              ret = kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index);
 +              if (ret || index >= nr_pages)
 +                      return -EINVAL;
 +
 +              nr_pages = 1;
 +              mode = PAGE_WRITEBACK;
 +      }
  
        down_read(&zram->init_lock);
        if (!init_done(zram)) {
                goto release_init_lock;
        }
  
 -      for (index = 0; index < nr_pages; index++) {
 +      while (nr_pages--) {
                struct bio_vec bvec;
  
                bvec.bv_page = page;
@@@ -1084,7 -1062,7 +1075,7 @@@ static ssize_t mm_stat_show(struct devi
        max_used = atomic_long_read(&zram->stats.max_used_pages);
  
        ret = scnprintf(buf, PAGE_SIZE,
 -                      "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu\n",
 +                      "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
                        orig_size << PAGE_SHIFT,
                        (u64)atomic64_read(&zram->stats.compr_data_size),
                        mem_used << PAGE_SHIFT,
                        max_used << PAGE_SHIFT,
                        (u64)atomic64_read(&zram->stats.same_pages),
                        pool_stats.pages_compacted,
 -                      (u64)atomic64_read(&zram->stats.huge_pages));
 +                      (u64)atomic64_read(&zram->stats.huge_pages),
 +                      (u64)atomic64_read(&zram->stats.huge_pages_since));
        up_read(&zram->init_lock);
  
        return ret;
@@@ -1425,7 -1402,6 +1416,7 @@@ out
        if (comp_len == PAGE_SIZE) {
                zram_set_flag(zram, index, ZRAM_HUGE);
                atomic64_inc(&zram->stats.huge_pages);
 +              atomic64_inc(&zram->stats.huge_pages_since);
        }
  
        if (flags) {
@@@ -1710,8 -1686,8 +1701,8 @@@ static void zram_reset_device(struct zr
        disksize = zram->disksize;
        zram->disksize = 0;
  
-       set_capacity(zram->disk, 0);
-       part_stat_set_all(&zram->disk->part0, 0);
+       set_capacity_and_notify(zram->disk, 0);
+       part_stat_set_all(zram->disk->part0, 0);
  
        up_write(&zram->init_lock);
        /* I/O operation under all of CPU are done so let's free */
@@@ -1756,9 -1732,7 +1747,7 @@@ static ssize_t disksize_store(struct de
  
        zram->comp = comp;
        zram->disksize = disksize;
-       set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
-       revalidate_disk_size(zram->disk, true);
+       set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT);
        up_write(&zram->init_lock);
  
        return len;
@@@ -1786,15 -1760,12 +1775,12 @@@ static ssize_t reset_store(struct devic
                return -EINVAL;
  
        zram = dev_to_zram(dev);
-       bdev = bdget_disk(zram->disk, 0);
-       if (!bdev)
-               return -ENOMEM;
+       bdev = zram->disk->part0;
  
        mutex_lock(&bdev->bd_mutex);
        /* Do not reset an active device or claimed device */
        if (bdev->bd_openers || zram->claim) {
                mutex_unlock(&bdev->bd_mutex);
-               bdput(bdev);
                return -EBUSY;
        }
  
        /* Make sure all the pending I/O are finished */
        fsync_bdev(bdev);
        zram_reset_device(zram);
-       revalidate_disk_size(zram->disk, true);
-       bdput(bdev);
  
        mutex_lock(&bdev->bd_mutex);
        zram->claim = false;
@@@ -1992,16 -1961,11 +1976,11 @@@ out_free_dev
  
  static int zram_remove(struct zram *zram)
  {
-       struct block_device *bdev;
-       bdev = bdget_disk(zram->disk, 0);
-       if (!bdev)
-               return -ENOMEM;
+       struct block_device *bdev = zram->disk->part0;
  
        mutex_lock(&bdev->bd_mutex);
        if (bdev->bd_openers || zram->claim) {
                mutex_unlock(&bdev->bd_mutex);
-               bdput(bdev);
                return -EBUSY;
        }
  
        /* Make sure all the pending I/O are finished */
        fsync_bdev(bdev);
        zram_reset_device(zram);
-       bdput(bdev);
  
        pr_info("Removed device: %s\n", zram->disk->disk_name);
  
diff --combined drivers/block/zram/zram_drv.h
@@@ -78,7 -78,6 +78,7 @@@ struct zram_stats 
        atomic64_t notify_free; /* no. of swap slot free notifications */
        atomic64_t same_pages;          /* no. of same element filled pages */
        atomic64_t huge_pages;          /* no. of huge pages */
 +      atomic64_t huge_pages_since;    /* no. of huge pages since zram set up */
        atomic64_t pages_stored;        /* no. of pages currently stored */
        atomic_long_t max_used_pages;   /* no. of maximum pages stored */
        atomic64_t writestall;          /* no. of write slow paths */
@@@ -119,7 -118,6 +119,6 @@@ struct zram 
        bool wb_limit_enable;
        u64 bd_wb_limit;
        struct block_device *bdev;
-       unsigned int old_block_size;
        unsigned long *bitmap;
        unsigned long nr_pages;
  #endif
diff --combined drivers/ide/ide-probe.c
@@@ -902,65 -902,14 +902,14 @@@ out_up
        return 1;
  }
  
- static int ata_lock(dev_t dev, void *data)
+ static void ata_probe(dev_t dev)
  {
-       /* FIXME: we want to pin hwif down */
-       return 0;
+       request_module("ide-disk");
+       request_module("ide-cd");
+       request_module("ide-tape");
+       request_module("ide-floppy");
  }
  
- static struct kobject *ata_probe(dev_t dev, int *part, void *data)
- {
-       ide_hwif_t *hwif = data;
-       int unit = *part >> PARTN_BITS;
-       ide_drive_t *drive = hwif->devices[unit];
-       if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0)
-               return NULL;
-       if (drive->media == ide_disk)
-               request_module("ide-disk");
-       if (drive->media == ide_cdrom || drive->media == ide_optical)
-               request_module("ide-cd");
-       if (drive->media == ide_tape)
-               request_module("ide-tape");
-       if (drive->media == ide_floppy)
-               request_module("ide-floppy");
-       return NULL;
- }
- static struct kobject *exact_match(dev_t dev, int *part, void *data)
- {
-       struct gendisk *p = data;
-       *part &= (1 << PARTN_BITS) - 1;
-       return &disk_to_dev(p)->kobj;
- }
- static int exact_lock(dev_t dev, void *data)
- {
-       struct gendisk *p = data;
-       if (!get_disk_and_module(p))
-               return -1;
-       return 0;
- }
- void ide_register_region(struct gendisk *disk)
- {
-       blk_register_region(MKDEV(disk->major, disk->first_minor),
-                           disk->minors, NULL, exact_match, exact_lock, disk);
- }
- EXPORT_SYMBOL_GPL(ide_register_region);
- void ide_unregister_region(struct gendisk *disk)
- {
-       blk_unregister_region(MKDEV(disk->major, disk->first_minor),
-                             disk->minors);
- }
- EXPORT_SYMBOL_GPL(ide_unregister_region);
  void ide_init_disk(struct gendisk *disk, ide_drive_t *drive)
  {
        ide_hwif_t *hwif = drive->hwif;
@@@ -999,7 -948,7 +948,7 @@@ static int hwif_init(ide_hwif_t *hwif
                return 0;
        }
  
-       if (register_blkdev(hwif->major, hwif->name))
+       if (__register_blkdev(hwif->major, hwif->name, ata_probe))
                return 0;
  
        if (!hwif->sg_max_nents)
                goto out;
        }
  
-       blk_register_region(MKDEV(hwif->major, 0), MAX_DRIVES << PARTN_BITS,
-                           THIS_MODULE, ata_probe, ata_lock, hwif);
        return 1;
  
  out:
@@@ -1592,6 -1539,9 +1539,6 @@@ EXPORT_SYMBOL_GPL(ide_port_unregister_d
  
  static void ide_unregister(ide_hwif_t *hwif)
  {
 -      BUG_ON(in_interrupt());
 -      BUG_ON(irqs_disabled());
 -
        mutex_lock(&ide_cfg_mtx);
  
        if (hwif->present) {
        /*
         * Remove us from the kernel's knowledge
         */
-       blk_unregister_region(MKDEV(hwif->major, 0), MAX_DRIVES<<PARTN_BITS);
        kfree(hwif->sg_table);
        unregister_blkdev(hwif->major, hwif->name);
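
The ide conversion above illustrates the probe-based major registration that replaces the blk_register_region()/blk_unregister_region() machinery. A generic sketch with made-up names (MYDRV_MAJOR, "mydrv-core" and the mydrv_* functions are assumptions, not from this merge); the probe callback is expected to run when an unclaimed dev_t under this major is first looked up.

static void mydrv_probe(dev_t devt)
{
	/* load whatever module can provide a gendisk for this dev_t */
	request_module("mydrv-core");
}

static int __init mydrv_init(void)
{
	int ret = __register_blkdev(MYDRV_MAJOR, "mydrv", mydrv_probe);

	if (ret < 0)
		return ret;
	return 0;
}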
  
diff --combined drivers/md/dm-raid.c
@@@ -700,8 -700,7 +700,7 @@@ static void rs_set_capacity(struct raid
  {
        struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
  
-       set_capacity(gendisk, rs->md.array_sectors);
-       revalidate_disk_size(gendisk, true);
+       set_capacity_and_notify(gendisk, rs->md.array_sectors);
  }
  
  /*
@@@ -3728,15 -3727,6 +3727,15 @@@ static void raid_io_hints(struct dm_tar
  
        blk_limits_io_min(limits, chunk_size_bytes);
        blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
 +
 +      /*
 +       * RAID1 and RAID10 personalities require bio splitting,
 +       * RAID0/4/5/6 don't and process large discard bios properly.
 +       */
 +      if (rs_is_raid1(rs) || rs_is_raid10(rs)) {
 +              limits->discard_granularity = chunk_size_bytes;
 +              limits->max_discard_sectors = rs->md.chunk_sectors;
 +      }
  }
  
  static void raid_postsuspend(struct dm_target *ti)
diff --combined drivers/md/dm-table.c
@@@ -18,6 -18,7 +18,6 @@@
  #include <linux/mutex.h>
  #include <linux/delay.h>
  #include <linux/atomic.h>
 -#include <linux/lcm.h>
  #include <linux/blk-mq.h>
  #include <linux/mount.h>
  #include <linux/dax.h>
@@@ -347,16 -348,9 +347,9 @@@ static int upgrade_mode(struct dm_dev_i
  dev_t dm_get_dev_t(const char *path)
  {
        dev_t dev;
-       struct block_device *bdev;
  
-       bdev = lookup_bdev(path);
-       if (IS_ERR(bdev))
+       if (lookup_bdev(path, &dev))
                dev = name_to_dev_t(path);
-       else {
-               dev = bdev->bd_dev;
-               bdput(bdev);
-       }
        return dev;
  }
  EXPORT_SYMBOL_GPL(dm_get_dev_t);
@@@ -1246,6 -1240,12 +1239,6 @@@ void dm_table_event_callback(struct dm_
  
  void dm_table_event(struct dm_table *t)
  {
 -      /*
 -       * You can no longer call dm_table_event() from interrupt
 -       * context, use a bottom half instead.
 -       */
 -      BUG_ON(in_interrupt());
 -
        mutex_lock(&_event_lock);
        if (t->event_fn)
                t->event_fn(t->event_context);
@@@ -1448,6 -1448,10 +1441,6 @@@ int dm_calculate_queue_limits(struct dm
                        zone_sectors = ti_limits.chunk_sectors;
                }
  
 -              /* Stack chunk_sectors if target-specific splitting is required */
 -              if (ti->max_io_len)
 -                      ti_limits.chunk_sectors = lcm_not_zero(ti->max_io_len,
 -                                                             ti_limits.chunk_sectors);
                /* Set I/O hints portion of queue limits */
                if (ti->type->io_hints)
                        ti->type->io_hints(ti, &ti_limits);
diff --combined drivers/md/dm.c
@@@ -476,10 -476,8 +476,10 @@@ static int dm_blk_report_zones(struct g
                return -EAGAIN;
  
        map = dm_get_live_table(md, &srcu_idx);
 -      if (!map)
 -              return -EIO;
 +      if (!map) {
 +              ret = -EIO;
 +              goto out;
 +      }
  
        do {
                struct dm_target *tgt;
@@@ -509,6 -507,7 +509,6 @@@ out
  
  static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
                            struct block_device **bdev)
 -      __acquires(md->io_barrier)
  {
        struct dm_target *tgt;
        struct dm_table *map;
@@@ -542,6 -541,7 +542,6 @@@ retry
  }
  
  static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
 -      __releases(md->io_barrier)
  {
        dm_put_live_table(md, srcu_idx);
  }
@@@ -570,7 -570,10 +570,10 @@@ static int dm_blk_ioctl(struct block_de
                }
        }
  
-       r =  __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+       if (!bdev->bd_disk->fops->ioctl)
+               r = -ENOTTY;
+       else
+               r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
  out:
        dm_unprepare_ioctl(md, srcu_idx);
        return r;
@@@ -1037,18 -1040,15 +1040,18 @@@ static sector_t max_io_len(struct dm_ta
        sector_t max_len;
  
        /*
 -       * Does the target need to split even further?
 -       * - q->limits.chunk_sectors reflects ti->max_io_len so
 -       *   blk_max_size_offset() provides required splitting.
 -       * - blk_max_size_offset() also respects q->limits.max_sectors
 +       * Does the target need to split IO even further?
 +       * - varied (per target) IO splitting is a tenet of DM; this
 +       *   explains why stacked chunk_sectors based splitting via
 +       *   blk_max_size_offset() isn't possible here. So pass in
 +       *   ti->max_io_len to override stacked chunk_sectors.
         */
 -      max_len = blk_max_size_offset(ti->table->md->queue,
 -                                    target_offset);
 -      if (len > max_len)
 -              len = max_len;
 +      if (ti->max_io_len) {
 +              max_len = blk_max_size_offset(ti->table->md->queue,
 +                                            target_offset, ti->max_io_len);
 +              if (len > max_len)
 +                      len = max_len;
 +      }
  
        return len;
  }
@@@ -1199,9 -1199,11 +1202,9 @@@ static int dm_dax_zero_page_range(struc
                 * ->zero_page_range() is mandatory dax operation. If we are
                 *  here, something is wrong.
                 */
 -              dm_put_live_table(md, srcu_idx);
                goto out;
        }
        ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
 -
   out:
        dm_put_live_table(md, srcu_idx);
  
@@@ -1274,8 -1276,7 +1277,7 @@@ static blk_qc_t __map_bio(struct dm_tar
                break;
        case DM_MAPIO_REMAPPED:
                /* the bio has been remapped so dispatch it */
-               trace_block_bio_remap(clone->bi_disk->queue, clone,
-                                     bio_dev(io->orig_bio), sector);
+               trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector);
                ret = submit_bio_noacct(clone);
                break;
        case DM_MAPIO_KILL:
@@@ -1420,18 -1421,12 +1422,12 @@@ static int __send_empty_flush(struct cl
         */
        bio_init(&flush_bio, NULL, 0);
        flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+       flush_bio.bi_disk = ci->io->md->disk;
+       bio_associate_blkg(&flush_bio);
        ci->bio = &flush_bio;
        ci->sector_count = 0;
  
-       /*
-        * Empty flush uses a statically initialized bio, as the base for
-        * cloning.  However, blkg association requires that a bdev is
-        * associated with a gendisk, which doesn't happen until the bdev is
-        * opened.  So, blkg association is done at issue time of the flush
-        * rather than when the device is created in alloc_dev().
-        */
-       bio_set_dev(ci->bio, ci->io->md->bdev);
        BUG_ON(bio_has_data(ci->bio));
        while ((ti = dm_table_get_target(ci->map, target_nr++)))
                __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
@@@ -1611,12 -1606,12 +1607,12 @@@ static blk_qc_t __split_and_process_bio
                                 * (by eliminating DM's splitting and just using bio_split)
                                 */
                                part_stat_lock();
-                               __dm_part_stat_sub(&dm_disk(md)->part0,
+                               __dm_part_stat_sub(dm_disk(md)->part0,
                                                   sectors[op_stat_group(bio_op(bio))], ci.sector_count);
                                part_stat_unlock();
  
                                bio_chain(b, bio);
-                               trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
+                               trace_block_split(b, bio->bi_iter.bi_sector);
                                ret = submit_bio_noacct(bio);
                                break;
                        }
@@@ -1748,11 -1743,6 +1744,6 @@@ static void cleanup_mapped_device(struc
  
        cleanup_srcu_struct(&md->io_barrier);
  
-       if (md->bdev) {
-               bdput(md->bdev);
-               md->bdev = NULL;
-       }
        mutex_destroy(&md->suspend_lock);
        mutex_destroy(&md->type_lock);
        mutex_destroy(&md->table_devices_lock);
@@@ -1844,10 -1834,6 +1835,6 @@@ static struct mapped_device *alloc_dev(
        if (!md->wq)
                goto bad;
  
-       md->bdev = bdget_disk(md->disk, 0);
-       if (!md->bdev)
-               goto bad;
        dm_stats_init(&md->stats);
  
        /* Populate the mapping, nobody knows we exist yet */
@@@ -1972,8 -1958,7 +1959,7 @@@ static struct dm_table *__bind(struct m
        if (size != dm_get_size(md))
                memset(&md->geometry, 0, sizeof(md->geometry));
  
-       set_capacity(md->disk, size);
-       bd_set_nr_sectors(md->bdev, size);
+       set_capacity_and_notify(md->disk, size);
  
        dm_table_event_callback(t, event_callback, md);
  
@@@ -2256,7 -2241,7 +2242,7 @@@ EXPORT_SYMBOL_GPL(dm_put)
  static bool md_in_flight_bios(struct mapped_device *md)
  {
        int cpu;
-       struct hd_struct *part = &dm_disk(md)->part0;
+       struct block_device *part = dm_disk(md)->part0;
        long sum = 0;
  
        for_each_possible_cpu(cpu) {
@@@ -2391,27 -2376,19 +2377,19 @@@ static int lock_fs(struct mapped_devic
  {
        int r;
  
-       WARN_ON(md->frozen_sb);
-       md->frozen_sb = freeze_bdev(md->bdev);
-       if (IS_ERR(md->frozen_sb)) {
-               r = PTR_ERR(md->frozen_sb);
-               md->frozen_sb = NULL;
-               return r;
-       }
-       set_bit(DMF_FROZEN, &md->flags);
+       WARN_ON(test_bit(DMF_FROZEN, &md->flags));
  
-       return 0;
+       r = freeze_bdev(md->disk->part0);
+       if (!r)
+               set_bit(DMF_FROZEN, &md->flags);
+       return r;
  }
  
  static void unlock_fs(struct mapped_device *md)
  {
        if (!test_bit(DMF_FROZEN, &md->flags))
                return;
-       thaw_bdev(md->bdev, md->frozen_sb);
-       md->frozen_sb = NULL;
+       thaw_bdev(md->disk->part0);
        clear_bit(DMF_FROZEN, &md->flags);
  }
  
diff --combined drivers/md/md.c
@@@ -464,7 -464,7 +464,7 @@@ struct md_io 
        bio_end_io_t *orig_bi_end_io;
        void *orig_bi_private;
        unsigned long start_time;
-       struct hd_struct *part;
+       struct block_device *part;
  };
  
  static void md_end_io(struct bio *bio)
@@@ -2414,7 -2414,6 +2414,6 @@@ EXPORT_SYMBOL(md_integrity_add_rdev)
  static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
  {
        char b[BDEVNAME_SIZE];
-       struct kobject *ko;
        int err;
  
        /* prevent duplicates */
        if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
                goto fail;
  
-       ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
        /* failure here is OK */
-       err = sysfs_create_link(&rdev->kobj, ko, "block");
+       err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block");
        rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
        rdev->sysfs_unack_badblocks =
                sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
@@@ -5355,10 -5353,9 +5353,9 @@@ array_size_store(struct mddev *mddev, c
  
        if (!err) {
                mddev->array_sectors = sectors;
-               if (mddev->pers) {
-                       set_capacity(mddev->gendisk, mddev->array_sectors);
-                       revalidate_disk_size(mddev->gendisk, true);
-               }
+               if (mddev->pers)
+                       set_capacity_and_notify(mddev->gendisk,
+                                               mddev->array_sectors);
        }
        mddev_unlock(mddev);
        return err ?: len;
@@@ -5765,11 -5762,12 +5762,12 @@@ static int md_alloc(dev_t dev, char *na
        return error;
  }
  
- static struct kobject *md_probe(dev_t dev, int *part, void *data)
+ static void md_probe(dev_t dev)
  {
+       if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
+               return;
        if (create_on_open)
                md_alloc(dev, NULL);
-       return NULL;
  }
  
  static int add_named_array(const char *val, const struct kernel_param *kp)
@@@ -6107,8 -6105,7 +6105,7 @@@ int do_md_run(struct mddev *mddev
        md_wakeup_thread(mddev->thread);
        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
  
-       set_capacity(mddev->gendisk, mddev->array_sectors);
-       revalidate_disk_size(mddev->gendisk, true);
+       set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
        clear_bit(MD_NOT_READY, &mddev->flags);
        mddev->changed = 1;
        kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
@@@ -6423,10 -6420,9 +6420,9 @@@ static int do_md_stop(struct mddev *mdd
                        if (rdev->raid_disk >= 0)
                                sysfs_unlink_rdev(mddev, rdev);
  
-               set_capacity(disk, 0);
+               set_capacity_and_notify(disk, 0);
                mutex_unlock(&mddev->open_mutex);
                mddev->changed = 1;
-               revalidate_disk_size(disk, true);
  
                if (mddev->ro)
                        mddev->ro = 0;
@@@ -6535,7 -6531,7 +6531,7 @@@ static void autorun_devices(int part
                        break;
                }
  
-               md_probe(dev, NULL, NULL);
+               md_probe(dev);
                mddev = mddev_find(dev);
                if (!mddev || !mddev->gendisk) {
                        if (mddev)
@@@ -7257,8 -7253,8 +7253,8 @@@ static int update_size(struct mddev *md
                if (mddev_is_clustered(mddev))
                        md_cluster_ops->update_size(mddev, old_dev_sectors);
                else if (mddev->queue) {
-                       set_capacity(mddev->gendisk, mddev->array_sectors);
-                       revalidate_disk_size(mddev->gendisk, true);
+                       set_capacity_and_notify(mddev->gendisk,
+                                               mddev->array_sectors);
                }
        }
        return rv;
@@@ -7480,7 -7476,6 +7476,6 @@@ static inline bool md_ioctl_valid(unsig
  {
        switch (cmd) {
        case ADD_NEW_DISK:
-       case BLKROSET:
        case GET_ARRAY_INFO:
        case GET_BITMAP_FILE:
        case GET_DISK_INFO:
@@@ -7507,7 -7502,6 +7502,6 @@@ static int md_ioctl(struct block_devic
        int err = 0;
        void __user *argp = (void __user *)arg;
        struct mddev *mddev = NULL;
-       int ro;
        bool did_set_md_closing = false;
  
        if (!md_ioctl_valid(cmd))
                        goto unlock;
                }
                break;
-       case BLKROSET:
-               if (get_user(ro, (int __user *)(arg))) {
-                       err = -EFAULT;
-                       goto unlock;
-               }
-               err = -EINVAL;
-               /* if the bdev is going readonly the value of mddev->ro
-                * does not matter, no writes are coming
-                */
-               if (ro)
-                       goto unlock;
-               /* are we are already prepared for writes? */
-               if (mddev->ro != 1)
-                       goto unlock;
-               /* transitioning to readauto need only happen for
-                * arrays that call md_write_start
-                */
-               if (mddev->pers) {
-                       err = restart_array(mddev);
-                       if (err == 0) {
-                               mddev->ro = 2;
-                               set_disk_ro(mddev->gendisk, 0);
-                       }
-               }
-               goto unlock;
        }
  
        /*
@@@ -7809,6 -7774,36 +7774,36 @@@ static int md_compat_ioctl(struct block
  }
  #endif /* CONFIG_COMPAT */
  
+ static int md_set_read_only(struct block_device *bdev, bool ro)
+ {
+       struct mddev *mddev = bdev->bd_disk->private_data;
+       int err;
+       err = mddev_lock(mddev);
+       if (err)
+               return err;
+       if (!mddev->raid_disks && !mddev->external) {
+               err = -ENODEV;
+               goto out_unlock;
+       }
+       /*
+        * Transitioning to read-auto need only happen for arrays that call
+        * md_write_start and which are not ready for writes yet.
+        */
+       if (!ro && mddev->ro == 1 && mddev->pers) {
+               err = restart_array(mddev);
+               if (err)
+                       goto out_unlock;
+               mddev->ro = 2;
+       }
+ out_unlock:
+       mddev_unlock(mddev);
+       return err;
+ }
  static int md_open(struct block_device *bdev, fmode_t mode)
  {
        /*
@@@ -7886,6 -7881,7 +7881,7 @@@ const struct block_device_operations md
  #endif
        .getgeo         = md_getgeo,
        .check_events   = md_check_events,
+       .set_read_only  = md_set_read_only,
  };
  
  static int md_thread(void *arg)
@@@ -8445,7 -8441,7 +8441,7 @@@ static int is_mddev_idle(struct mddev *
        rcu_read_lock();
        rdev_for_each_rcu(rdev, mddev) {
                struct gendisk *disk = rdev->bdev->bd_disk;
-               curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
+               curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
                              atomic_read(&disk->sync_io);
                /* sync IO will cause sync_io to increase before the disk_stats
                 * as sync_io is counted when a request starts, and
@@@ -8582,6 -8578,25 +8578,6 @@@ void md_write_end(struct mddev *mddev
  
  EXPORT_SYMBOL(md_write_end);
  
 -/* This is used by raid0 and raid10 */
 -void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 -                      struct bio *bio, sector_t start, sector_t size)
 -{
 -      struct bio *discard_bio = NULL;
 -
 -      if (__blkdev_issue_discard(rdev->bdev, start, size,
 -              GFP_NOIO, 0, &discard_bio) || !discard_bio)
 -              return;
 -
 -      bio_chain(discard_bio, bio);
 -      bio_clone_blkg_association(discard_bio, bio);
 -      if (mddev->gendisk)
 -              trace_block_bio_remap(discard_bio, disk_devt(mddev->gendisk),
 -                                    bio->bi_iter.bi_sector);
 -      submit_bio_noacct(discard_bio);
 -}
 -EXPORT_SYMBOL(md_submit_discard_bio);
 -
  /* md_allow_write(mddev)
   * Calling this ensures that the array is marked 'active' so that writes
   * may proceed without blocking.  It is important to call this before
@@@ -9015,10 -9030,9 +9011,9 @@@ void md_do_sync(struct md_thread *threa
                mddev_lock_nointr(mddev);
                md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
                mddev_unlock(mddev);
-               if (!mddev_is_clustered(mddev)) {
-                       set_capacity(mddev->gendisk, mddev->array_sectors);
-                       revalidate_disk_size(mddev->gendisk, true);
-               }
+               if (!mddev_is_clustered(mddev))
+                       set_capacity_and_notify(mddev->gendisk,
+                                               mddev->array_sectors);
        }
  
        spin_lock(&mddev->lock);
@@@ -9547,18 -9561,15 +9542,15 @@@ static int __init md_init(void
        if (!md_rdev_misc_wq)
                goto err_rdev_misc_wq;
  
-       if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
+       ret = __register_blkdev(MD_MAJOR, "md", md_probe);
+       if (ret < 0)
                goto err_md;
  
-       if ((ret = register_blkdev(0, "mdp")) < 0)
+       ret = __register_blkdev(0, "mdp", md_probe);
+       if (ret < 0)
                goto err_mdp;
        mdp_major = ret;
  
-       blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
-                           md_probe, NULL, NULL);
-       blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
-                           md_probe, NULL, NULL);
        register_reboot_notifier(&md_notifier);
        raid_table_header = register_sysctl_table(raid_root_table);
  
@@@ -9825,9 -9836,6 +9817,6 @@@ static __exit void md_exit(void
        struct list_head *tmp;
        int delay = 1;
  
-       blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
-       blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
        unregister_blkdev(MD_MAJOR,"md");
        unregister_blkdev(mdp_major, "mdp");
        unregister_reboot_notifier(&md_notifier);
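
With md's BLKROSET branch removed, read-only transitions reach the driver through the new ->set_read_only() method added to md_fops above. Roughly, the generic BLKROSET handler is then expected to look like the sketch below; this is reconstructed for illustration from the method's signature, the real code lives in block/ioctl.c and may differ in detail.

static int blkdev_roset(struct block_device *bdev, fmode_t mode,
		unsigned cmd, unsigned long arg)
{
	int ret, n;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (get_user(n, (int __user *)arg))
		return -EFAULT;

	/* let the driver (e.g. md_set_read_only) accept or reject */
	if (bdev->bd_disk->fops->set_read_only) {
		ret = bdev->bd_disk->fops->set_read_only(bdev, n);
		if (ret)
			return ret;
	}
	bdev->bd_read_only = n;
	return 0;
}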
diff --combined drivers/md/raid0.c
@@@ -477,7 -477,6 +477,7 @@@ static void raid0_handle_discard(struc
  
        for (disk = 0; disk < zone->nb_dev; disk++) {
                sector_t dev_start, dev_end;
 +              struct bio *discard_bio = NULL;
                struct md_rdev *rdev;
  
                if (disk < start_disk_index)
  
                rdev = conf->devlist[(zone - conf->strip_zone) *
                        conf->strip_zone[0].nb_dev + disk];
 -              md_submit_discard_bio(mddev, rdev, bio,
 +              if (__blkdev_issue_discard(rdev->bdev,
                        dev_start + zone->dev_start + rdev->data_offset,
 -                      dev_end - dev_start);
 +                      dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
 +                  !discard_bio)
 +                      continue;
 +              bio_chain(discard_bio, bio);
 +              bio_clone_blkg_association(discard_bio, bio);
 +              if (mddev->gendisk)
-                       trace_block_bio_remap(bdev_get_queue(rdev->bdev),
-                               discard_bio, disk_devt(mddev->gendisk),
++                      trace_block_bio_remap(discard_bio,
++                              disk_devt(mddev->gendisk),
 +                              bio->bi_iter.bi_sector);
 +              submit_bio_noacct(discard_bio);
        }
        bio_endio(bio);
  }
@@@ -581,8 -571,8 +581,8 @@@ static bool raid0_make_request(struct m
                tmp_dev->data_offset;
  
        if (mddev->gendisk)
-               trace_block_bio_remap(bio->bi_disk->queue, bio,
-                               disk_devt(mddev->gendisk), bio_sector);
+               trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
+                                     bio_sector);
        mddev_check_writesame(mddev, bio);
        mddev_check_write_zeroes(mddev, bio);
        submit_bio_noacct(bio);
diff --combined drivers/md/raid10.c
@@@ -91,7 -91,7 +91,7 @@@ static inline struct r10bio *get_resync
  static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
  {
        struct r10conf *conf = data;
 -      int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);
 +      int size = offsetof(struct r10bio, devs[conf->copies]);
  
        /* allocate a r10bio with room for raid_disks entries in the
         * bios array */
@@@ -238,7 -238,7 +238,7 @@@ static void put_all_bios(struct r10con
  {
        int i;
  
 -      for (i = 0; i < conf->geo.raid_disks; i++) {
 +      for (i = 0; i < conf->copies; i++) {
                struct bio **bio = & r10_bio->devs[i].bio;
                if (!BIO_SPECIAL(*bio))
                        bio_put(*bio);
@@@ -327,7 -327,7 +327,7 @@@ static int find_bio_disk(struct r10con
        int slot;
        int repl = 0;
  
 -      for (slot = 0; slot < conf->geo.raid_disks; slot++) {
 +      for (slot = 0; slot < conf->copies; slot++) {
                if (r10_bio->devs[slot].bio == bio)
                        break;
                if (r10_bio->devs[slot].repl_bio == bio) {
                }
        }
  
 +      BUG_ON(slot == conf->copies);
        update_head_pos(slot, r10_bio);
  
        if (slotp)
@@@ -1201,8 -1200,7 +1201,7 @@@ static void raid10_read_request(struct 
        read_bio->bi_private = r10_bio;
  
        if (mddev->gendisk)
-               trace_block_bio_remap(read_bio->bi_disk->queue,
-                                     read_bio, disk_devt(mddev->gendisk),
+               trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
                                      r10_bio->sector);
        submit_bio_noacct(read_bio);
        return;
@@@ -1251,8 -1249,7 +1250,7 @@@ static void raid10_write_one_disk(struc
        mbio->bi_private = r10_bio;
  
        if (conf->mddev->gendisk)
-               trace_block_bio_remap(mbio->bi_disk->queue,
-                                     mbio, disk_devt(conf->mddev->gendisk),
+               trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
                                      r10_bio->sector);
        /* flush_pending_writes() needs access to the rdev so...*/
        mbio->bi_disk = (void *)rdev;
        }
  }
  
 -static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
 -{
 -      int i;
 -      struct r10conf *conf = mddev->private;
 -      struct md_rdev *blocked_rdev;
 -
 -retry_wait:
 -      blocked_rdev = NULL;
 -      rcu_read_lock();
 -      for (i = 0; i < conf->copies; i++) {
 -              struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
 -              struct md_rdev *rrdev = rcu_dereference(
 -                      conf->mirrors[i].replacement);
 -              if (rdev == rrdev)
 -                      rrdev = NULL;
 -              if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 -                      atomic_inc(&rdev->nr_pending);
 -                      blocked_rdev = rdev;
 -                      break;
 -              }
 -              if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
 -                      atomic_inc(&rrdev->nr_pending);
 -                      blocked_rdev = rrdev;
 -                      break;
 -              }
 -
 -              if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
 -                      sector_t first_bad;
 -                      sector_t dev_sector = r10_bio->devs[i].addr;
 -                      int bad_sectors;
 -                      int is_bad;
 -
 -                      /* Discard request doesn't care the write result
 -                       * so it doesn't need to wait blocked disk here.
 -                       */
 -                      if (!r10_bio->sectors)
 -                              continue;
 -
 -                      is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
 -                                           &first_bad, &bad_sectors);
 -                      if (is_bad < 0) {
 -                              /* Mustn't write here until the bad block
 -                               * is acknowledged
 -                               */
 -                              atomic_inc(&rdev->nr_pending);
 -                              set_bit(BlockedBadBlocks, &rdev->flags);
 -                              blocked_rdev = rdev;
 -                              break;
 -                      }
 -              }
 -      }
 -      rcu_read_unlock();
 -
 -      if (unlikely(blocked_rdev)) {
 -              /* Have to wait for this device to get unblocked, then retry */
 -              allow_barrier(conf);
 -              raid10_log(conf->mddev, "%s wait rdev %d blocked",
 -                              __func__, blocked_rdev->raid_disk);
 -              md_wait_for_blocked_rdev(blocked_rdev, mddev);
 -              wait_barrier(conf);
 -              goto retry_wait;
 -      }
 -}
 -
  static void raid10_write_request(struct mddev *mddev, struct bio *bio,
                                 struct r10bio *r10_bio)
  {
        struct r10conf *conf = mddev->private;
        int i;
 +      struct md_rdev *blocked_rdev;
        sector_t sectors;
        int max_sectors;
  
  
        r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
        raid10_find_phys(conf, r10_bio);
 -
 -      wait_blocked_dev(mddev, r10_bio);
 -
 +retry_write:
 +      blocked_rdev = NULL;
        rcu_read_lock();
        max_sectors = r10_bio->sectors;
  
                        conf->mirrors[d].replacement);
                if (rdev == rrdev)
                        rrdev = NULL;
 +              if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 +                      atomic_inc(&rdev->nr_pending);
 +                      blocked_rdev = rdev;
 +                      break;
 +              }
 +              if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
 +                      atomic_inc(&rrdev->nr_pending);
 +                      blocked_rdev = rrdev;
 +                      break;
 +              }
                if (rdev && (test_bit(Faulty, &rdev->flags)))
                        rdev = NULL;
                if (rrdev && (test_bit(Faulty, &rrdev->flags)))
  
                        is_bad = is_badblock(rdev, dev_sector, max_sectors,
                                             &first_bad, &bad_sectors);
 +                      if (is_bad < 0) {
 +                              /* Mustn't write here until the bad block
 +                               * is acknowledged
 +                               */
 +                              atomic_inc(&rdev->nr_pending);
 +                              set_bit(BlockedBadBlocks, &rdev->flags);
 +                              blocked_rdev = rdev;
 +                              break;
 +                      }
                        if (is_bad && first_bad <= dev_sector) {
                                /* Cannot write here at all */
                                bad_sectors -= (dev_sector - first_bad);
        }
        rcu_read_unlock();
  
 +      if (unlikely(blocked_rdev)) {
 +              /* Have to wait for this device to get unblocked, then retry */
 +              int j;
 +              int d;
 +
 +              for (j = 0; j < i; j++) {
 +                      if (r10_bio->devs[j].bio) {
 +                              d = r10_bio->devs[j].devnum;
 +                              rdev_dec_pending(conf->mirrors[d].rdev, mddev);
 +                      }
 +                      if (r10_bio->devs[j].repl_bio) {
 +                              struct md_rdev *rdev;
 +                              d = r10_bio->devs[j].devnum;
 +                              rdev = conf->mirrors[d].replacement;
 +                              if (!rdev) {
 +                                      /* Race with remove_disk */
 +                                      smp_mb();
 +                                      rdev = conf->mirrors[d].rdev;
 +                              }
 +                              rdev_dec_pending(rdev, mddev);
 +                      }
 +              }
 +              allow_barrier(conf);
 +              raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
 +              md_wait_for_blocked_rdev(blocked_rdev, mddev);
 +              wait_barrier(conf);
 +              goto retry_write;
 +      }
 +
        if (max_sectors < r10_bio->sectors)
                r10_bio->sectors = max_sectors;
  
@@@ -1493,7 -1506,7 +1491,7 @@@ static void __make_request(struct mdde
        r10_bio->mddev = mddev;
        r10_bio->sector = bio->bi_iter.bi_sector;
        r10_bio->state = 0;
 -      memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks);
 +      memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
  
        if (bio_data_dir(bio) == READ)
                raid10_read_request(mddev, bio, r10_bio);
                raid10_write_request(mddev, bio, r10_bio);
  }
  
 -static struct bio *raid10_split_bio(struct r10conf *conf,
 -                      struct bio *bio, sector_t sectors, bool want_first)
 -{
 -      struct bio *split;
 -
 -      split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split);
 -      bio_chain(split, bio);
 -      allow_barrier(conf);
 -      if (want_first) {
 -              submit_bio_noacct(bio);
 -              bio = split;
 -      } else
 -              submit_bio_noacct(split);
 -      wait_barrier(conf);
 -
 -      return bio;
 -}
 -
 -static void raid_end_discard_bio(struct r10bio *r10bio)
 -{
 -      struct r10conf *conf = r10bio->mddev->private;
 -      struct r10bio *first_r10bio;
 -
 -      while (atomic_dec_and_test(&r10bio->remaining)) {
 -
 -              allow_barrier(conf);
 -
 -              if (!test_bit(R10BIO_Discard, &r10bio->state)) {
 -                      first_r10bio = (struct r10bio *)r10bio->master_bio;
 -                      free_r10bio(r10bio);
 -                      r10bio = first_r10bio;
 -              } else {
 -                      md_write_end(r10bio->mddev);
 -                      bio_endio(r10bio->master_bio);
 -                      free_r10bio(r10bio);
 -                      break;
 -              }
 -      }
 -}
 -
 -static void raid10_end_discard_request(struct bio *bio)
 -{
 -      struct r10bio *r10_bio = bio->bi_private;
 -      struct r10conf *conf = r10_bio->mddev->private;
 -      struct md_rdev *rdev = NULL;
 -      int dev;
 -      int slot, repl;
 -
 -      /*
 -       * We don't care the return value of discard bio
 -       */
 -      if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
 -              set_bit(R10BIO_Uptodate, &r10_bio->state);
 -
 -      dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 -      if (repl)
 -              rdev = conf->mirrors[dev].replacement;
 -      if (!rdev) {
 -              /* raid10_remove_disk uses smp_mb to make sure rdev is set to
 -               * replacement before setting replacement to NULL. It can read
 -               * rdev first without barrier protect even replacment is NULL
 -               */
 -              smp_rmb();
 -              rdev = conf->mirrors[dev].rdev;
 -      }
 -
 -      raid_end_discard_bio(r10_bio);
 -      rdev_dec_pending(rdev, conf->mddev);
 -}
 -
 -/* There are some limitations to handle discard bio
 - * 1st, the discard size is bigger than stripe_size*2.
 - * 2st, if the discard bio spans reshape progress, we use the old way to
 - * handle discard bio
 - */
 -static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 -{
 -      struct r10conf *conf = mddev->private;
 -      struct geom *geo = &conf->geo;
 -      struct r10bio *r10_bio, *first_r10bio;
 -      int far_copies = geo->far_copies;
 -      bool first_copy = true;
 -
 -      int disk;
 -      sector_t chunk;
 -      unsigned int stripe_size;
 -      sector_t split_size;
 -
 -      sector_t bio_start, bio_end;
 -      sector_t first_stripe_index, last_stripe_index;
 -      sector_t start_disk_offset;
 -      unsigned int start_disk_index;
 -      sector_t end_disk_offset;
 -      unsigned int end_disk_index;
 -      unsigned int remainder;
 -
 -      if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 -              return -EAGAIN;
 -
 -      wait_barrier(conf);
 -
 -      /* Check reshape again to avoid reshape happens after checking
 -       * MD_RECOVERY_RESHAPE and before wait_barrier
 -       */
 -      if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 -              goto out;
 -
 -      stripe_size = geo->raid_disks << geo->chunk_shift;
 -      bio_start = bio->bi_iter.bi_sector;
 -      bio_end = bio_end_sector(bio);
 -
 -      /* Maybe one discard bio is smaller than strip size or across one stripe
 -       * and discard region is larger than one stripe size. For far offset layout,
 -       * if the discard region is not aligned with stripe size, there is hole
 -       * when we submit discard bio to member disk. For simplicity, we only
 -       * handle discard bio which discard region is bigger than stripe_size*2
 -       */
 -      if (bio_sectors(bio) < stripe_size*2)
 -              goto out;
 -
 -      /* For far and far offset layout, if bio is not aligned with stripe size,
 -       * it splits the part that is not aligned with strip size.
 -       */
 -      div_u64_rem(bio_start, stripe_size, &remainder);
 -      if ((far_copies > 1) && remainder) {
 -              split_size = stripe_size - remainder;
 -              bio = raid10_split_bio(conf, bio, split_size, false);
 -      }
 -      div_u64_rem(bio_end, stripe_size, &remainder);
 -      if ((far_copies > 1) && remainder) {
 -              split_size = bio_sectors(bio) - remainder;
 -              bio = raid10_split_bio(conf, bio, split_size, true);
 -      }
 -
 -      bio_start = bio->bi_iter.bi_sector;
 -      bio_end = bio_end_sector(bio);
 -
 -      /* raid10 uses chunk as the unit to store data. It's similar like raid0.
 -       * One stripe contains the chunks from all member disk (one chunk from
 -       * one disk at the same HBA address). For layout detail, see 'man md 4'
 -       */
 -      chunk = bio_start >> geo->chunk_shift;
 -      chunk *= geo->near_copies;
 -      first_stripe_index = chunk;
 -      start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
 -      if (geo->far_offset)
 -              first_stripe_index *= geo->far_copies;
 -      start_disk_offset = (bio_start & geo->chunk_mask) +
 -                              (first_stripe_index << geo->chunk_shift);
 -
 -      chunk = bio_end >> geo->chunk_shift;
 -      chunk *= geo->near_copies;
 -      last_stripe_index = chunk;
 -      end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
 -      if (geo->far_offset)
 -              last_stripe_index *= geo->far_copies;
 -      end_disk_offset = (bio_end & geo->chunk_mask) +
 -                              (last_stripe_index << geo->chunk_shift);
 -
 -retry_discard:
 -      r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
 -      r10_bio->mddev = mddev;
 -      r10_bio->state = 0;
 -      r10_bio->sectors = 0;
 -      memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
 -      wait_blocked_dev(mddev, r10_bio);
 -
 -      /* For far layout it needs more than one r10bio to cover all regions.
 -       * Inspired by raid10_sync_request, we can use the first r10bio->master_bio
 -       * to record the discard bio. Other r10bio->master_bio record the first
 -       * r10bio. The first r10bio only release after all other r10bios finish.
 -       * The discard bio returns only first r10bio finishes
 -       */
 -      if (first_copy) {
 -              r10_bio->master_bio = bio;
 -              set_bit(R10BIO_Discard, &r10_bio->state);
 -              first_copy = false;
 -              first_r10bio = r10_bio;
 -      } else
 -              r10_bio->master_bio = (struct bio *)first_r10bio;
 -
 -      rcu_read_lock();
 -      for (disk = 0; disk < geo->raid_disks; disk++) {
 -              struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
 -              struct md_rdev *rrdev = rcu_dereference(
 -                      conf->mirrors[disk].replacement);
 -
 -              r10_bio->devs[disk].bio = NULL;
 -              r10_bio->devs[disk].repl_bio = NULL;
 -
 -              if (rdev && (test_bit(Faulty, &rdev->flags)))
 -                      rdev = NULL;
 -              if (rrdev && (test_bit(Faulty, &rrdev->flags)))
 -                      rrdev = NULL;
 -              if (!rdev && !rrdev)
 -                      continue;
 -
 -              if (rdev) {
 -                      r10_bio->devs[disk].bio = bio;
 -                      atomic_inc(&rdev->nr_pending);
 -              }
 -              if (rrdev) {
 -                      r10_bio->devs[disk].repl_bio = bio;
 -                      atomic_inc(&rrdev->nr_pending);
 -              }
 -      }
 -      rcu_read_unlock();
 -
 -      atomic_set(&r10_bio->remaining, 1);
 -      for (disk = 0; disk < geo->raid_disks; disk++) {
 -              sector_t dev_start, dev_end;
 -              struct bio *mbio, *rbio = NULL;
 -              struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
 -              struct md_rdev *rrdev = rcu_dereference(
 -                      conf->mirrors[disk].replacement);
 -
 -              /*
 -               * Now start to calculate the start and end address for each disk.
 -               * The space between dev_start and dev_end is the discard region.
 -               *
 -               * For dev_start, it needs to consider three conditions:
 -               * 1st, the disk is before start_disk, you can imagine the disk in
 -               * the next stripe. So the dev_start is the start address of next
 -               * stripe.
 -               * 2st, the disk is after start_disk, it means the disk is at the
 -               * same stripe of first disk
 -               * 3st, the first disk itself, we can use start_disk_offset directly
 -               */
 -              if (disk < start_disk_index)
 -                      dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
 -              else if (disk > start_disk_index)
 -                      dev_start = first_stripe_index * mddev->chunk_sectors;
 -              else
 -                      dev_start = start_disk_offset;
 -
 -              if (disk < end_disk_index)
 -                      dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
 -              else if (disk > end_disk_index)
 -                      dev_end = last_stripe_index * mddev->chunk_sectors;
 -              else
 -                      dev_end = end_disk_offset;
 -
 -              /* It only handles discard bio which size is >= stripe size, so
 -               * dev_end > dev_start all the time
 -               */
 -              if (r10_bio->devs[disk].bio) {
 -                      mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
 -                      mbio->bi_end_io = raid10_end_discard_request;
 -                      mbio->bi_private = r10_bio;
 -                      r10_bio->devs[disk].bio = mbio;
 -                      r10_bio->devs[disk].devnum = disk;
 -                      atomic_inc(&r10_bio->remaining);
 -                      md_submit_discard_bio(mddev, rdev, mbio,
 -                                      dev_start + choose_data_offset(r10_bio, rdev),
 -                                      dev_end - dev_start);
 -                      bio_endio(mbio);
 -              }
 -              if (r10_bio->devs[disk].repl_bio) {
 -                      rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
 -                      rbio->bi_end_io = raid10_end_discard_request;
 -                      rbio->bi_private = r10_bio;
 -                      r10_bio->devs[disk].repl_bio = rbio;
 -                      r10_bio->devs[disk].devnum = disk;
 -                      atomic_inc(&r10_bio->remaining);
 -                      md_submit_discard_bio(mddev, rrdev, rbio,
 -                                      dev_start + choose_data_offset(r10_bio, rrdev),
 -                                      dev_end - dev_start);
 -                      bio_endio(rbio);
 -              }
 -      }
 -
 -      if (!geo->far_offset && --far_copies) {
 -              first_stripe_index += geo->stride >> geo->chunk_shift;
 -              start_disk_offset += geo->stride;
 -              last_stripe_index += geo->stride >> geo->chunk_shift;
 -              end_disk_offset += geo->stride;
 -              atomic_inc(&first_r10bio->remaining);
 -              raid_end_discard_bio(r10_bio);
 -              wait_barrier(conf);
 -              goto retry_discard;
 -      }
 -
 -      raid_end_discard_bio(r10_bio);
 -
 -      return 0;
 -out:
 -      allow_barrier(conf);
 -      return -EAGAIN;
 -}
 -
  static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
  {
        struct r10conf *conf = mddev->private;
        if (!md_write_start(mddev, bio))
                return false;
  
 -      if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
 -              if (!raid10_handle_discard(mddev, bio))
 -                      return true;
 -
        /*
         * If this request crosses a chunk boundary, we need to split
         * it.
@@@ -3754,7 -4061,7 +3752,7 @@@ static int raid10_run(struct mddev *mdd
  
        if (mddev->queue) {
                blk_queue_max_discard_sectors(mddev->queue,
 -                                            UINT_MAX);
 +                                            mddev->chunk_sectors);
                blk_queue_max_write_same_sectors(mddev->queue, 0);
                blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
                blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
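
A note on the raid10 hunks above: with the discard rework reverted, the queue's maximum discard size is again capped at mddev->chunk_sectors, and raid10_make_request splits any request that crosses a chunk boundary (see the "crosses a chunk boundary" comment above). The standalone sketch below only illustrates that split arithmetic for a power-of-two chunk size; first_split_sectors() and the sample numbers are illustrative, not kernel code.

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch of the chunk-boundary split: a bio starting at 'sector' with
 * 'nr_sectors' sectors may only run up to the next chunk boundary, so the
 * first piece is at most chunk_sectors - (sector % chunk_sectors).
 * Assumes chunk_sectors is a power of two.
 */
static uint64_t first_split_sectors(uint64_t sector, uint64_t nr_sectors,
                                    uint64_t chunk_sectors)
{
	uint64_t to_boundary = chunk_sectors - (sector & (chunk_sectors - 1));

	return nr_sectors < to_boundary ? nr_sectors : to_boundary;
}

int main(void)
{
	/* 512 KiB chunks = 1024 sectors; a 3000-sector discard at sector 500 */
	uint64_t chunk = 1024, sector = 500, left = 3000;

	while (left) {
		uint64_t n = first_split_sectors(sector, left, chunk);

		printf("bio: sector %llu, %llu sectors\n",
		       (unsigned long long)sector, (unsigned long long)n);
		sector += n;
		left -= n;
	}
	return 0;
}
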
diff --combined drivers/nvme/host/core.c
@@@ -93,16 -93,6 +93,6 @@@ static void nvme_put_subsystem(struct n
  static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
                                           unsigned nsid);
  
- static void nvme_update_bdev_size(struct gendisk *disk)
- {
-       struct block_device *bdev = bdget_disk(disk, 0);
-       if (bdev) {
-               bd_set_nr_sectors(bdev, get_capacity(disk));
-               bdput(bdev);
-       }
- }
  /*
   * Prepare a queue for teardown.
   *
@@@ -119,8 -109,7 +109,7 @@@ static void nvme_set_queue_dying(struc
        blk_set_queue_dying(ns->queue);
        blk_mq_unquiesce_queue(ns->queue);
  
-       set_capacity(ns->disk, 0);
-       nvme_update_bdev_size(ns->disk);
+       set_capacity_and_notify(ns->disk, 0);
  }
  
  static void nvme_queue_scan(struct nvme_ctrl *ctrl)
@@@ -2053,7 -2042,7 +2042,7 @@@ static void nvme_update_disk_info(struc
                        capacity = 0;
        }
  
-       set_capacity_revalidate_and_notify(disk, capacity, false);
+       set_capacity_and_notify(disk, capacity);
  
        nvme_config_discard(disk, ns);
        nvme_config_write_zeroes(disk, ns);
@@@ -2134,7 -2123,6 +2123,6 @@@ static int nvme_update_ns_info(struct n
                blk_stack_limits(&ns->head->disk->queue->limits,
                                 &ns->queue->limits, 0);
                blk_queue_update_readahead(ns->head->disk->queue);
-               nvme_update_bdev_size(ns->head->disk);
                blk_mq_unfreeze_queue(ns->head->disk->queue);
        }
  #endif
@@@ -2929,7 -2917,7 +2917,7 @@@ int nvme_get_log(struct nvme_ctrl *ctrl
  static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
                                struct nvme_effects_log **log)
  {
 -      struct nvme_cel *cel = xa_load(&ctrl->cels, csi);
 +      struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
        int ret;
  
        if (cel)
                return -ENOMEM;
  
        ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
 -                      &cel->log, sizeof(cel->log), 0);
 +                      cel, sizeof(*cel), 0);
        if (ret) {
                kfree(cel);
                return ret;
        }
  
 -      cel->csi = csi;
 -      xa_store(&ctrl->cels, cel->csi, cel, GFP_KERNEL);
 +      xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
  out:
 -      *log = &cel->log;
 +      *log = cel;
        return 0;
  }
  
@@@ -3962,8 -3951,6 +3950,6 @@@ out
         */
        if (ret && ret != -ENOMEM && !(ret > 0 && !(ret & NVME_SC_DNR)))
                nvme_ns_remove(ns);
-       else
-               revalidate_disk_size(ns->disk, true);
  }
  
  static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
@@@ -4373,19 -4360,6 +4359,19 @@@ void nvme_uninit_ctrl(struct nvme_ctrl 
  }
  EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
  
 +static void nvme_free_cels(struct nvme_ctrl *ctrl)
 +{
 +      struct nvme_effects_log *cel;
 +      unsigned long i;
 +
 +      xa_for_each (&ctrl->cels, i, cel) {
 +              xa_erase(&ctrl->cels, i);
 +              kfree(cel);
 +      }
 +
 +      xa_destroy(&ctrl->cels);
 +}
 +
  static void nvme_free_ctrl(struct device *dev)
  {
        struct nvme_ctrl *ctrl =
        if (!subsys || ctrl->instance != subsys->instance)
                ida_simple_remove(&nvme_instance_ida, ctrl->instance);
  
 -      xa_destroy(&ctrl->cels);
 -
 +      nvme_free_cels(ctrl);
        nvme_mpath_uninit(ctrl);
        __free_page(ctrl->discard_page);
  
diff --combined drivers/s390/block/dasd.c
@@@ -75,6 -75,7 +75,6 @@@ static int dasd_flush_block_queue(struc
  static void dasd_device_tasklet(unsigned long);
  static void dasd_block_tasklet(unsigned long);
  static void do_kick_device(struct work_struct *);
 -static void do_restore_device(struct work_struct *);
  static void do_reload_device(struct work_struct *);
  static void do_requeue_requests(struct work_struct *);
  static void dasd_return_cqr_cb(struct dasd_ccw_req *, void *);
@@@ -137,6 -138,7 +137,6 @@@ struct dasd_device *dasd_alloc_device(v
        INIT_LIST_HEAD(&device->ccw_queue);
        timer_setup(&device->timer, dasd_device_timeout, 0);
        INIT_WORK(&device->kick_work, do_kick_device);
 -      INIT_WORK(&device->restore_device, do_restore_device);
        INIT_WORK(&device->reload_device, do_reload_device);
        INIT_WORK(&device->requeue_requests, do_requeue_requests);
        device->state = DASD_STATE_NEW;
@@@ -430,7 -432,7 +430,7 @@@ dasd_state_ready_to_online(struct dasd_
  {
        struct gendisk *disk;
        struct disk_part_iter piter;
-       struct hd_struct *part;
+       struct block_device *part;
  
        device->state = DASD_STATE_ONLINE;
        if (device->block) {
                disk = device->block->bdev->bd_disk;
                disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
                while ((part = disk_part_iter_next(&piter)))
-                       kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE);
+                       kobject_uevent(bdev_kobj(part), KOBJ_CHANGE);
                disk_part_iter_exit(&piter);
        }
        return 0;
@@@ -457,7 -459,7 +457,7 @@@ static int dasd_state_online_to_ready(s
        int rc;
        struct gendisk *disk;
        struct disk_part_iter piter;
-       struct hd_struct *part;
+       struct block_device *part;
  
        if (device->discipline->online_to_ready) {
                rc = device->discipline->online_to_ready(device);
                disk = device->block->bdev->bd_disk;
                disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
                while ((part = disk_part_iter_next(&piter)))
-                       kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE);
+                       kobject_uevent(bdev_kobj(part), KOBJ_CHANGE);
                disk_part_iter_exit(&piter);
        }
        return 0;
@@@ -619,6 -621,26 +619,6 @@@ void dasd_reload_device(struct dasd_dev
  EXPORT_SYMBOL(dasd_reload_device);
  
  /*
 - * dasd_restore_device will schedule a call do do_restore_device to the kernel
 - * event daemon.
 - */
 -static void do_restore_device(struct work_struct *work)
 -{
 -      struct dasd_device *device = container_of(work, struct dasd_device,
 -                                                restore_device);
 -      device->cdev->drv->restore(device->cdev);
 -      dasd_put_device(device);
 -}
 -
 -void dasd_restore_device(struct dasd_device *device)
 -{
 -      dasd_get_device(device);
 -      /* queue call to dasd_restore_device to the kernel event daemon. */
 -      if (!schedule_work(&device->restore_device))
 -              dasd_put_device(device);
 -}
 -
 -/*
   * Set the target state for a device and starts the state change.
   */
  void dasd_set_target_state(struct dasd_device *device, int target)
@@@ -1492,6 -1514,7 +1492,6 @@@ int dasd_start_IO(struct dasd_ccw_req *
                              "start_IO: -EIO device gone, retry");
                break;
        case -EINVAL:
 -              /* most likely caused in power management context */
                DBF_DEV_EVENT(DBF_WARNING, device, "%s",
                              "start_IO: -EINVAL device currently "
                              "not accessible");
@@@ -2025,7 -2048,7 +2025,7 @@@ static void __dasd_device_check_expire(
  static int __dasd_device_is_unusable(struct dasd_device *device,
                                     struct dasd_ccw_req *cqr)
  {
 -      int mask = ~(DASD_STOPPED_DC_WAIT | DASD_UNRESUMED_PM | DASD_STOPPED_NOSPC);
 +      int mask = ~(DASD_STOPPED_DC_WAIT | DASD_STOPPED_NOSPC);
  
        if (test_bit(DASD_FLAG_OFFLINE, &device->flags) &&
            !test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) {
@@@ -2089,7 -2112,8 +2089,7 @@@ static void __dasd_device_check_path_ev
        if (!dasd_path_get_tbvpm(device))
                return;
  
 -      if (device->stopped &
 -          ~(DASD_STOPPED_DC_WAIT | DASD_UNRESUMED_PM))
 +      if (device->stopped & ~(DASD_STOPPED_DC_WAIT))
                return;
        rc = device->discipline->verify_path(device,
                                             dasd_path_get_tbvpm(device));
@@@ -2956,12 -2980,6 +2956,12 @@@ static int _dasd_requeue_request(struc
  
        if (!block)
                return -EINVAL;
 +      /*
 +       * If the request is an ERP request there is nothing to requeue.
 +       * This will be done with the remaining original request.
 +       */
 +      if (cqr->refers)
 +              return 0;
        spin_lock_irq(&cqr->dq->lock);
        req = (struct request *) cqr->callback_data;
        blk_mq_requeue_request(req, false);
@@@ -3376,6 -3394,7 +3376,7 @@@ dasd_device_operations = 
        .ioctl          = dasd_ioctl,
        .compat_ioctl   = dasd_ioctl,
        .getgeo         = dasd_getgeo,
+       .set_read_only  = dasd_set_read_only,
  };
  
  /*******************************************************************************
@@@ -3770,6 -3789,11 +3771,6 @@@ int dasd_generic_path_operational(struc
                 "operational\n");
        DBF_DEV_EVENT(DBF_WARNING, device, "%s", "path operational");
        dasd_device_remove_stop_bits(device, DASD_STOPPED_DC_WAIT);
 -      if (device->stopped & DASD_UNRESUMED_PM) {
 -              dasd_device_remove_stop_bits(device, DASD_UNRESUMED_PM);
 -              dasd_restore_device(device);
 -              return 1;
 -      }
        dasd_schedule_device_bh(device);
        if (device->block) {
                dasd_schedule_block_bh(device->block);
@@@ -4029,6 -4053,66 +4030,6 @@@ void dasd_schedule_requeue(struct dasd_
  }
  EXPORT_SYMBOL(dasd_schedule_requeue);
  
 -int dasd_generic_pm_freeze(struct ccw_device *cdev)
 -{
 -      struct dasd_device *device = dasd_device_from_cdev(cdev);
 -
 -      if (IS_ERR(device))
 -              return PTR_ERR(device);
 -
 -      /* mark device as suspended */
 -      set_bit(DASD_FLAG_SUSPENDED, &device->flags);
 -
 -      if (device->discipline->freeze)
 -              device->discipline->freeze(device);
 -
 -      /* disallow new I/O  */
 -      dasd_device_set_stop_bits(device, DASD_STOPPED_PM);
 -
 -      return dasd_generic_requeue_all_requests(device);
 -}
 -EXPORT_SYMBOL_GPL(dasd_generic_pm_freeze);
 -
 -int dasd_generic_restore_device(struct ccw_device *cdev)
 -{
 -      struct dasd_device *device = dasd_device_from_cdev(cdev);
 -      int rc = 0;
 -
 -      if (IS_ERR(device))
 -              return PTR_ERR(device);
 -
 -      /* allow new IO again */
 -      dasd_device_remove_stop_bits(device,
 -                                   (DASD_STOPPED_PM | DASD_UNRESUMED_PM));
 -
 -      dasd_schedule_device_bh(device);
 -
 -      /*
 -       * call discipline restore function
 -       * if device is stopped do nothing e.g. for disconnected devices
 -       */
 -      if (device->discipline->restore && !(device->stopped))
 -              rc = device->discipline->restore(device);
 -      if (rc || device->stopped)
 -              /*
 -               * if the resume failed for the DASD we put it in
 -               * an UNRESUMED stop state
 -               */
 -              device->stopped |= DASD_UNRESUMED_PM;
 -
 -      if (device->block) {
 -              dasd_schedule_block_bh(device->block);
 -              if (device->block->request_queue)
 -                      blk_mq_run_hw_queues(device->block->request_queue,
 -                                           true);
 -      }
 -
 -      clear_bit(DASD_FLAG_SUSPENDED, &device->flags);
 -      dasd_put_device(device);
 -      return 0;
 -}
 -EXPORT_SYMBOL_GPL(dasd_generic_restore_device);
 -
  static struct dasd_ccw_req *dasd_generic_build_rdc(struct dasd_device *device,
                                                   int rdc_buffer_size,
                                                   int magic)
diff --combined drivers/s390/block/dasd_int.h
@@@ -355,6 -355,10 +355,6 @@@ struct dasd_discipline 
        int (*fill_info) (struct dasd_device *, struct dasd_information2_t *);
        int (*ioctl) (struct dasd_block *, unsigned int, void __user *);
  
 -      /* suspend/resume functions */
 -      int (*freeze) (struct dasd_device *);
 -      int (*restore) (struct dasd_device *);
 -
        /* reload device after state change */
        int (*reload) (struct dasd_device *);
  
@@@ -516,6 -520,7 +516,6 @@@ struct dasd_device 
        atomic_t tasklet_scheduled;
          struct tasklet_struct tasklet;
        struct work_struct kick_work;
 -      struct work_struct restore_device;
        struct work_struct reload_device;
        struct work_struct kick_validate;
        struct work_struct suc_work;
@@@ -587,6 -592,8 +587,6 @@@ struct dasd_queue 
  #define DASD_STOPPED_PENDING 4         /* long busy */
  #define DASD_STOPPED_DC_WAIT 8         /* disconnected, wait */
  #define DASD_STOPPED_SU      16        /* summary unit check handling */
 -#define DASD_STOPPED_PM      32        /* pm state transition */
 -#define DASD_UNRESUMED_PM    64        /* pm resume failed state */
  #define DASD_STOPPED_NOSPC   128       /* no space left */
  
  /* per device flags */
@@@ -746,6 -753,7 +746,6 @@@ enum blk_eh_timer_return dasd_times_out
  void dasd_enable_device(struct dasd_device *);
  void dasd_set_target_state(struct dasd_device *, int);
  void dasd_kick_device(struct dasd_device *);
 -void dasd_restore_device(struct dasd_device *);
  void dasd_reload_device(struct dasd_device *);
  void dasd_schedule_requeue(struct dasd_device *);
  
@@@ -777,6 -785,8 +777,6 @@@ int dasd_generic_path_operational(struc
  void dasd_generic_shutdown(struct ccw_device *);
  
  void dasd_generic_handle_state_change(struct dasd_device *);
 -int dasd_generic_pm_freeze(struct ccw_device *);
 -int dasd_generic_restore_device(struct ccw_device *);
  enum uc_todo dasd_generic_uc_handler(struct ccw_device *, struct irb *);
  void dasd_generic_path_event(struct ccw_device *, int *);
  int dasd_generic_verify_path(struct dasd_device *, __u8);
@@@ -834,7 -844,8 +834,8 @@@ int dasd_scan_partitions(struct dasd_bl
  void dasd_destroy_partitions(struct dasd_block *);
  
  /* externals in dasd_ioctl.c */
- int  dasd_ioctl(struct block_device *, fmode_t, unsigned int, unsigned long);
+ int dasd_ioctl(struct block_device *, fmode_t, unsigned int, unsigned long);
+ int dasd_set_read_only(struct block_device *bdev, bool ro);
  
  /* externals in dasd_proc.c */
  int dasd_proc_init(void);
diff --combined fs/btrfs/sysfs.c
@@@ -263,10 -263,6 +263,10 @@@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_H
  BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
  BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
  BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
 +/* Remove once support for zoned allocation is feature complete */
 +#ifdef CONFIG_BTRFS_DEBUG
 +BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
 +#endif
  
  static struct attribute *btrfs_supported_feature_attrs[] = {
        BTRFS_FEAT_ATTR_PTR(mixed_backref),
        BTRFS_FEAT_ATTR_PTR(metadata_uuid),
        BTRFS_FEAT_ATTR_PTR(free_space_tree),
        BTRFS_FEAT_ATTR_PTR(raid1c34),
 +#ifdef CONFIG_BTRFS_DEBUG
 +      BTRFS_FEAT_ATTR_PTR(zoned),
 +#endif
        NULL
  };
  
@@@ -336,35 -329,10 +336,35 @@@ static ssize_t send_stream_version_show
  }
  BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
  
 +static const char *rescue_opts[] = {
 +      "usebackuproot",
 +      "nologreplay",
 +      "ignorebadroots",
 +      "ignoredatacsums",
 +      "all",
 +};
 +
 +static ssize_t supported_rescue_options_show(struct kobject *kobj,
 +                                           struct kobj_attribute *a,
 +                                           char *buf)
 +{
 +      ssize_t ret = 0;
 +      int i;
 +
 +      for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
 +              ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
 +                               (i ? " " : ""), rescue_opts[i]);
 +      ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
 +      return ret;
 +}
 +BTRFS_ATTR(static_feature, supported_rescue_options,
 +         supported_rescue_options_show);
 +
  static struct attribute *btrfs_supported_static_feature_attrs[] = {
        BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
        BTRFS_ATTR_PTR(static_feature, supported_checksums),
        BTRFS_ATTR_PTR(static_feature, send_stream_version),
 +      BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
        NULL
  };
  
@@@ -465,8 -433,7 +465,8 @@@ static ssize_t btrfs_discard_iops_limit
                return -EINVAL;
  
        WRITE_ONCE(discard_ctl->iops_limit, iops_limit);
 -
 +      btrfs_discard_calc_delay(discard_ctl);
 +      btrfs_discard_schedule_work(discard_ctl, true);
        return len;
  }
  BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show,
@@@ -496,7 -463,7 +496,7 @@@ static ssize_t btrfs_discard_kbps_limit
                return -EINVAL;
  
        WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit);
 -
 +      btrfs_discard_schedule_work(discard_ctl, true);
        return len;
  }
  BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show,
@@@ -887,82 -854,6 +887,82 @@@ static ssize_t btrfs_exclusive_operatio
  }
  BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
  
 +static ssize_t btrfs_generation_show(struct kobject *kobj,
 +                                   struct kobj_attribute *a, char *buf)
 +{
 +      struct btrfs_fs_info *fs_info = to_fs_info(kobj);
 +
 +      return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation);
 +}
 +BTRFS_ATTR(, generation, btrfs_generation_show);
 +
 +/*
 + * Look for an exact string @string in @buffer with possible leading or
 + * trailing whitespace
 + */
 +static bool strmatch(const char *buffer, const char *string)
 +{
 +      const size_t len = strlen(string);
 +
 +      /* Skip leading whitespace */
 +      buffer = skip_spaces(buffer);
 +
 +      /* Match entire string, check if the rest is whitespace or empty */
 +      if (strncmp(string, buffer, len) == 0 &&
 +          strlen(skip_spaces(buffer + len)) == 0)
 +              return true;
 +
 +      return false;
 +}
 +
 +static const char * const btrfs_read_policy_name[] = { "pid" };
 +
 +static ssize_t btrfs_read_policy_show(struct kobject *kobj,
 +                                    struct kobj_attribute *a, char *buf)
 +{
 +      struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
 +      ssize_t ret = 0;
 +      int i;
 +
 +      for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
 +              if (fs_devices->read_policy == i)
 +                      ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]",
 +                                       (ret == 0 ? "" : " "),
 +                                       btrfs_read_policy_name[i]);
 +              else
 +                      ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
 +                                       (ret == 0 ? "" : " "),
 +                                       btrfs_read_policy_name[i]);
 +      }
 +
 +      ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
 +
 +      return ret;
 +}
 +
 +static ssize_t btrfs_read_policy_store(struct kobject *kobj,
 +                                     struct kobj_attribute *a,
 +                                     const char *buf, size_t len)
 +{
 +      struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
 +      int i;
 +
 +      for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
 +              if (strmatch(buf, btrfs_read_policy_name[i])) {
 +                      if (i != fs_devices->read_policy) {
 +                              fs_devices->read_policy = i;
 +                              btrfs_info(fs_devices->fs_info,
 +                                         "read policy set to '%s'",
 +                                         btrfs_read_policy_name[i]);
 +                      }
 +                      return len;
 +              }
 +      }
 +
 +      return -EINVAL;
 +}
 +BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
 +
  static const struct attribute *btrfs_attrs[] = {
        BTRFS_ATTR_PTR(, label),
        BTRFS_ATTR_PTR(, nodesize),
        BTRFS_ATTR_PTR(, metadata_uuid),
        BTRFS_ATTR_PTR(, checksum),
        BTRFS_ATTR_PTR(, exclusive_operation),
 +      BTRFS_ATTR_PTR(, generation),
 +      BTRFS_ATTR_PTR(, read_policy),
        NULL,
  };
  
@@@ -1318,7 -1207,7 +1318,7 @@@ static const char *alloc_name(u64 flags
        default:
                WARN_ON(1);
                return "invalid-combination";
 -      };
 +      }
  }
  
  /*
@@@ -1343,8 -1232,6 +1343,6 @@@ int btrfs_sysfs_add_space_info_type(str
  
  void btrfs_sysfs_remove_device(struct btrfs_device *device)
  {
-       struct hd_struct *disk;
-       struct kobject *disk_kobj;
        struct kobject *devices_kobj;
  
        /*
        devices_kobj = device->fs_info->fs_devices->devices_kobj;
        ASSERT(devices_kobj);
  
-       if (device->bdev) {
-               disk = device->bdev->bd_part;
-               disk_kobj = &part_to_dev(disk)->kobj;
-               sysfs_remove_link(devices_kobj, disk_kobj->name);
-       }
+       if (device->bdev)
+               sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name);
  
        if (device->devid_kobj.state_initialized) {
                kobject_del(&device->devid_kobj);
@@@ -1464,11 -1348,7 +1459,7 @@@ int btrfs_sysfs_add_device(struct btrfs
        nofs_flag = memalloc_nofs_save();
  
        if (device->bdev) {
-               struct hd_struct *disk;
-               struct kobject *disk_kobj;
-               disk = device->bdev->bd_part;
-               disk_kobj = &part_to_dev(disk)->kobj;
+               struct kobject *disk_kobj = bdev_kobj(device->bdev);
  
                ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
                if (ret) {
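
The strmatch() helper added to fs/btrfs/sysfs.c above accepts a sysfs write only when it contains exactly the expected token, allowing surrounding whitespace such as a trailing newline. A rough userspace equivalent is sketched below; it uses isspace() in place of the kernel's skip_spaces(), so it is illustrative only.

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Match 'string' exactly, allowing leading/trailing whitespace in 'buffer'. */
static bool strmatch(const char *buffer, const char *string)
{
	size_t len = strlen(string);

	while (isspace((unsigned char)*buffer))	/* skip leading whitespace */
		buffer++;
	if (strncmp(buffer, string, len) != 0)
		return false;
	buffer += len;
	while (isspace((unsigned char)*buffer))	/* rest must be whitespace */
		buffer++;
	return *buffer == '\0';
}

int main(void)
{
	printf("%d %d %d\n",
	       strmatch(" pid\n", "pid"),	/* 1: surrounding whitespace ok */
	       strmatch("pidx", "pid"),		/* 0: token must end here */
	       strmatch("pi d", "pid"));	/* 0: no partial matches */
	return 0;
}
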
diff --combined fs/btrfs/volumes.c
@@@ -31,7 -31,6 +31,7 @@@
  #include "space-info.h"
  #include "block-group.h"
  #include "discard.h"
 +#include "zoned.h"
  
  const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
@@@ -375,7 -374,6 +375,7 @@@ void btrfs_free_device(struct btrfs_dev
        rcu_string_free(device->name);
        extent_io_tree_release(&device->alloc_state);
        bio_put(device->flush_bio);
 +      btrfs_destroy_dev_zone_info(device);
        kfree(device);
  }
  
@@@ -669,10 -667,6 +669,10 @@@ static int btrfs_open_one_device(struc
        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        device->mode = flags;
  
 +      ret = btrfs_get_dev_zone_info(device);
 +      if (ret != 0)
 +              goto error_free_page;
 +
        fs_devices->open_devices++;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@@ -828,7 -822,7 +828,7 @@@ static noinline struct btrfs_device *de
        } else {
                mutex_lock(&fs_devices->device_list_mutex);
                device = btrfs_find_device(fs_devices, devid,
 -                              disk_super->dev_item.uuid, NULL, false);
 +                              disk_super->dev_item.uuid, NULL);
  
                /*
                 * If this disk has been pulled into an fs devices created by
                 * make sure it's the same device if the device is mounted
                 */
                if (device->bdev) {
-                       struct block_device *path_bdev;
+                       int error;
+                       dev_t path_dev;
  
-                       path_bdev = lookup_bdev(path);
-                       if (IS_ERR(path_bdev)) {
+                       error = lookup_bdev(path, &path_dev);
+                       if (error) {
                                mutex_unlock(&fs_devices->device_list_mutex);
-                               return ERR_CAST(path_bdev);
+                               return ERR_PTR(error);
                        }
  
-                       if (device->bdev != path_bdev) {
-                               bdput(path_bdev);
+                       if (device->bdev->bd_dev != path_dev) {
                                mutex_unlock(&fs_devices->device_list_mutex);
 -                              btrfs_warn_in_rcu(device->fs_info,
 +                              /*
 +                               * device->fs_info may not be reliable here, so
 +                               * pass in a NULL instead. This avoids a
 +                               * possible use-after-free when the fs_info and
 +                               * fs_info->sb are already torn down.
 +                               */
 +                              btrfs_warn_in_rcu(NULL,
        "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
                                                  path, devid, found_transid,
                                                  current->comm,
                                                  task_pid_nr(current));
                                return ERR_PTR(-EEXIST);
                        }
-                       bdput(path_bdev);
                        btrfs_info_in_rcu(device->fs_info,
        "devid %llu device path %s changed to %s scanned by %s (%d)",
                                          devid, rcu_str_deref(device->name),
@@@ -1050,7 -1037,7 +1049,7 @@@ error
  }
  
  static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
 -                                    int step, struct btrfs_device **latest_dev)
 +                                    struct btrfs_device **latest_dev)
  {
        struct btrfs_device *device, *next;
  
   * After we have read the system tree and know devids belonging to this
   * filesystem, remove the device which does not belong there.
   */
 -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
 +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
  {
        struct btrfs_device *latest_dev = NULL;
        struct btrfs_fs_devices *seed_dev;
  
        mutex_lock(&uuid_mutex);
 -      __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
 +      __btrfs_free_extra_devids(fs_devices, &latest_dev);
  
        list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
 -              __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
 +              __btrfs_free_extra_devids(seed_dev, &latest_dev);
  
        fs_devices->latest_bdev = latest_dev->bdev;
  
@@@ -1143,7 -1130,6 +1142,7 @@@ static void btrfs_close_one_device(stru
                device->bdev = NULL;
        }
        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 +      btrfs_destroy_dev_zone_info(device);
  
        device->fs_info = NULL;
        atomic_set(&device->dev_stats_ccnt, 0);
@@@ -1224,7 -1210,6 +1223,7 @@@ static int open_fs_devices(struct btrfs
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
        fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
 +      fs_devices->read_policy = BTRFS_READ_POLICY_PID;
  
        return 0;
  }
@@@ -1276,7 -1261,7 +1275,7 @@@ void btrfs_release_disk_super(struct bt
  }
  
  static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
 -                                                     u64 bytenr)
 +                                                     u64 bytenr, u64 bytenr_orig)
  {
        struct btrfs_super_block *disk_super;
        struct page *page;
        /* align our pointer to the offset of the super block */
        disk_super = p + offset_in_page(bytenr);
  
 -      if (btrfs_super_bytenr(disk_super) != bytenr ||
 +      if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
            btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
                btrfs_release_disk_super(p);
                return ERR_PTR(-EINVAL);
@@@ -1342,8 -1327,7 +1341,8 @@@ struct btrfs_device *btrfs_scan_one_dev
        bool new_device_added = false;
        struct btrfs_device *device = NULL;
        struct block_device *bdev;
 -      u64 bytenr;
 +      u64 bytenr, bytenr_orig;
 +      int ret;
  
        lockdep_assert_held(&uuid_mutex);
  
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
 -      bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;
  
        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev))
                return ERR_CAST(bdev);
  
 -      disk_super = btrfs_read_disk_super(bdev, bytenr);
 +      bytenr_orig = btrfs_sb_offset(0);
 +      ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
 +      if (ret)
 +              return ERR_PTR(ret);
 +
 +      disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
        if (IS_ERR(disk_super)) {
                device = ERR_CAST(disk_super);
                goto error_bdev_put;
@@@ -2028,11 -2008,6 +2027,11 @@@ void btrfs_scratch_superblocks(struct b
                if (IS_ERR(disk_super))
                        continue;
  
 +              if (bdev_is_zoned(bdev)) {
 +                      btrfs_reset_sb_log_zones(bdev, copy_num);
 +                      continue;
 +              }
 +
                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
  
                page = virt_to_page(disk_super);
@@@ -2311,10 -2286,10 +2310,10 @@@ static struct btrfs_device *btrfs_find_
        dev_uuid = disk_super->dev_item.uuid;
        if (btrfs_fs_incompat(fs_info, METADATA_UUID))
                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
 -                                         disk_super->metadata_uuid, true);
 +                                         disk_super->metadata_uuid);
        else
                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
 -                                         disk_super->fsid, true);
 +                                         disk_super->fsid);
  
        btrfs_release_disk_super(disk_super);
        if (!device)
@@@ -2334,7 -2309,7 +2333,7 @@@ struct btrfs_device *btrfs_find_device_
  
        if (devid) {
                device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
 -                                         NULL, true);
 +                                         NULL);
                if (!device)
                        return ERR_PTR(-ENOENT);
                return device;
@@@ -2483,7 -2458,7 +2482,7 @@@ next_slot
                read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
                                   BTRFS_FSID_SIZE);
                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
 -                                         fs_uuid, true);
 +                                         fs_uuid);
                BUG_ON(!device); /* Logic error */
  
                if (device->fs_devices->seeding) {
@@@ -2525,11 -2500,6 +2524,11 @@@ int btrfs_init_new_device(struct btrfs_
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
  
 +      if (!btrfs_check_device_zone_type(fs_info, bdev)) {
 +              ret = -EINVAL;
 +              goto error;
 +      }
 +
        if (fs_devices->seeding) {
                seeding_dev = 1;
                down_write(&sb->s_umount);
        }
        rcu_assign_pointer(device->name, name);
  
 +      device->fs_info = fs_info;
 +      device->bdev = bdev;
 +
 +      ret = btrfs_get_dev_zone_info(device);
 +      if (ret)
 +              goto error_free_device;
 +
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
 -              goto error_free_device;
 +              goto error_free_zone;
        }
  
        q = bdev_get_queue(bdev);
                                         fs_info->sectorsize);
        device->disk_total_bytes = device->total_bytes;
        device->commit_total_bytes = device->total_bytes;
 -      device->fs_info = fs_info;
 -      device->bdev = bdev;
        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
        device->mode = FMODE_EXCL;
@@@ -2732,8 -2697,6 +2731,8 @@@ error_trans
                sb->s_flags |= SB_RDONLY;
        if (trans)
                btrfs_end_transaction(trans);
 +error_free_zone:
 +      btrfs_destroy_dev_zone_info(device);
  error_free_device:
        btrfs_free_device(device);
  error:
@@@ -5509,18 -5472,7 +5508,18 @@@ static int find_live_mirror(struct btrf
        else
                num_stripes = map->num_stripes;
  
 -      preferred_mirror = first + current->pid % num_stripes;
 +      switch (fs_info->fs_devices->read_policy) {
 +      default:
 +              /* Shouldn't happen, just warn and use pid instead of failing */
 +              btrfs_warn_rl(fs_info,
 +                            "unknown read_policy type %u, reset to pid",
 +                            fs_info->fs_devices->read_policy);
 +              fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
 +              fallthrough;
 +      case BTRFS_READ_POLICY_PID:
 +              preferred_mirror = first + (current->pid % num_stripes);
 +              break;
 +      }
  
        if (dev_replace_is_ongoing &&
            fs_info->dev_replace.cont_reading_from_srcdev_mode ==
@@@ -6376,7 -6328,7 +6375,7 @@@ static void submit_stripe_bio(struct bt
        bio->bi_iter.bi_sector = physical >> 9;
        btrfs_debug_in_rcu(fs_info,
        "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
 -              bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
 +              bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
                (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
                dev->devid, bio->bi_iter.bi_size);
        bio_set_dev(bio, dev->bdev);
@@@ -6408,7 -6360,7 +6407,7 @@@ blk_status_t btrfs_map_bio(struct btrfs
  {
        struct btrfs_device *dev;
        struct bio *first_bio = bio;
 -      u64 logical = (u64)bio->bi_iter.bi_sector << 9;
 +      u64 logical = bio->bi_iter.bi_sector << 9;
        u64 length = 0;
        u64 map_length;
        int ret;
   * If @seed is true, traverse through the seed devices.
   */
  struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
 -                                     u64 devid, u8 *uuid, u8 *fsid,
 -                                     bool seed)
 +                                     u64 devid, u8 *uuid, u8 *fsid)
  {
        struct btrfs_device *device;
        struct btrfs_fs_devices *seed_devs;
@@@ -6695,7 -6648,7 +6694,7 @@@ static int read_one_chunk(struct btrfs_
                                   btrfs_stripe_dev_uuid_nr(chunk, i),
                                   BTRFS_UUID_SIZE);
                map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
 -                                                      devid, uuid, NULL, true);
 +                                                      devid, uuid, NULL);
                if (!map->stripes[i].dev &&
                    !btrfs_test_opt(fs_info, DEGRADED)) {
                        free_extent_map(em);
@@@ -6834,7 -6787,7 +6833,7 @@@ static int read_one_dev(struct extent_b
        }
  
        device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
 -                                 fs_uuid, true);
 +                                 fs_uuid);
        if (!device) {
                if (!btrfs_test_opt(fs_info, DEGRADED)) {
                        btrfs_report_missing_device(fs_info, devid,
        }
  
        fill_device_from_item(leaf, dev_item, device);
 +      if (device->bdev) {
 +              u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
 +
 +              if (device->total_bytes > max_total_bytes) {
 +                      btrfs_err(fs_info,
 +                      "device total_bytes should be at most %llu but found %llu",
 +                                max_total_bytes, device->total_bytes);
 +                      return -EINVAL;
 +              }
 +      }
        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
           !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
@@@ -6941,11 -6884,11 +6940,11 @@@ int btrfs_read_sys_array(struct btrfs_f
         * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
         * overallocate but we can keep it as-is, only the first page is used.
         */
 -      sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
 +      sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
 +                                        root->root_key.objectid, 0);
        if (IS_ERR(sb))
                return PTR_ERR(sb);
        set_extent_buffer_uptodate(sb);
 -      btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
        /*
         * The sb extent buffer is artificial and just used to read the system array.
         * set_extent_buffer_uptodate() call does not properly mark all it's
@@@ -7109,8 -7052,12 +7108,8 @@@ static void readahead_tree_node_childre
        int i;
        const int nr_items = btrfs_header_nritems(node);
  
 -      for (i = 0; i < nr_items; i++) {
 -              u64 start;
 -
 -              start = btrfs_node_blockptr(node, i);
 -              readahead_tree_block(node->fs_info, start);
 -      }
 +      for (i = 0; i < nr_items; i++)
 +              btrfs_readahead_node_child(node, i);
  }
  
  int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
@@@ -7497,7 -7444,8 +7496,7 @@@ int btrfs_get_dev_stats(struct btrfs_fs
        int i;
  
        mutex_lock(&fs_devices->device_list_mutex);
 -      dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
 -                              true);
 +      dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
        mutex_unlock(&fs_devices->device_list_mutex);
  
        if (!dev) {
@@@ -7628,13 -7576,28 +7627,13 @@@ static int verify_one_dev_extent(struc
        }
  
        /* Make sure no dev extent is beyond device boundary */
 -      dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
 +      dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
        if (!dev) {
                btrfs_err(fs_info, "failed to find devid %llu", devid);
                ret = -EUCLEAN;
                goto out;
        }
  
 -      /* It's possible this device is a dummy for seed device */
 -      if (dev->disk_total_bytes == 0) {
 -              struct btrfs_fs_devices *devs;
 -
 -              devs = list_first_entry(&fs_info->fs_devices->seed_list,
 -                                      struct btrfs_fs_devices, seed_list);
 -              dev = btrfs_find_device(devs, devid, NULL, NULL, false);
 -              if (!dev) {
 -                      btrfs_err(fs_info, "failed to find seed devid %llu",
 -                                devid);
 -                      ret = -EUCLEAN;
 -                      goto out;
 -              }
 -      }
 -
        if (physical_offset + physical_len > dev->disk_total_bytes) {
                btrfs_err(fs_info,
  "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
@@@ -7689,19 -7652,6 +7688,19 @@@ int btrfs_verify_dev_extents(struct btr
        u64 prev_dev_ext_end = 0;
        int ret = 0;
  
 +      /*
 +       * We don't have a dev_root because we mounted with ignorebadroots and
 +       * failed to load the root, so we want to skip the verification in this
 +       * case for sure.
 +       *
 +       * However, if the dev root is fine but the tree itself is corrupted,
 +       * we'd still fail to mount.  This verification is only to make sure
 +       * writes can happen safely, so instead just bypass this check
 +       * completely in the case of IGNOREBADROOTS.
 +       */
 +      if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
 +              return 0;
 +
        key.objectid = 1;
        key.type = BTRFS_DEV_EXTENT_KEY;
        key.offset = 0;
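
find_live_mirror() above now routes mirror selection through fs_devices->read_policy; the only policy wired up in this series, BTRFS_READ_POLICY_PID, keeps the existing behaviour of spreading readers across stripes by pid. A minimal sketch of that selection follows, with made-up pid and stripe counts; pick_mirror() is illustrative, not a kernel function.

#include <stdio.h>

/* BTRFS_READ_POLICY_PID: preferred mirror = first + (pid % num_stripes) */
static int pick_mirror(int first, int num_stripes, int pid)
{
	return first + (pid % num_stripes);
}

int main(void)
{
	int pids[] = { 1000, 1001, 1002, 1003 };
	int i;

	/* e.g. RAID1 with two copies, stripes starting at index 0 */
	for (i = 0; i < 4; i++)
		printf("pid %d -> mirror %d\n", pids[i],
		       pick_mirror(0, 2, pids[i]));
	return 0;
}
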
diff --combined fs/btrfs/zoned.c
index 1555451,0000000..c388466
mode 100644,000000..100644
--- /dev/null
@@@ -1,616 -1,0 +1,616 @@@
 +// SPDX-License-Identifier: GPL-2.0
 +
 +#include <linux/slab.h>
 +#include <linux/blkdev.h>
 +#include "ctree.h"
 +#include "volumes.h"
 +#include "zoned.h"
 +#include "rcu-string.h"
 +
 +/* Maximum number of zones to report per blkdev_report_zones() call */
 +#define BTRFS_REPORT_NR_ZONES   4096
 +
 +/* Number of superblock log zones */
 +#define BTRFS_NR_SB_LOG_ZONES 2
 +
 +static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
 +{
 +      struct blk_zone *zones = data;
 +
 +      memcpy(&zones[idx], zone, sizeof(*zone));
 +
 +      return 0;
 +}
 +
 +static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
 +                          u64 *wp_ret)
 +{
 +      bool empty[BTRFS_NR_SB_LOG_ZONES];
 +      bool full[BTRFS_NR_SB_LOG_ZONES];
 +      sector_t sector;
 +
 +      ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
 +             zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
 +
 +      empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
 +      empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
 +      full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
 +      full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
 +
 +      /*
 +       * Possible states of log buffer zones
 +       *
 +       *           Empty[0]  In use[0]  Full[0]
 +       * Empty[1]         *          x        0
 +       * In use[1]        0          x        0
 +       * Full[1]          1          1        C
 +       *
 +       * Log position:
 +       *   *: Special case, no superblock is written
 +       *   0: Use write pointer of zones[0]
 +       *   1: Use write pointer of zones[1]
 +       *   C: Compare super blocks from zones[0] and zones[1], use the latest
 +       *      one determined by generation
 +       *   x: Invalid state
 +       */
 +
 +      if (empty[0] && empty[1]) {
 +              /* Special case to distinguish no superblock to read */
 +              *wp_ret = zones[0].start << SECTOR_SHIFT;
 +              return -ENOENT;
 +      } else if (full[0] && full[1]) {
 +              /* Compare two super blocks */
 +              struct address_space *mapping = bdev->bd_inode->i_mapping;
 +              struct page *page[BTRFS_NR_SB_LOG_ZONES];
 +              struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
 +              int i;
 +
 +              for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
 +                      u64 bytenr;
 +
 +                      bytenr = ((zones[i].start + zones[i].len)
 +                                 << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;
 +
 +                      page[i] = read_cache_page_gfp(mapping,
 +                                      bytenr >> PAGE_SHIFT, GFP_NOFS);
 +                      if (IS_ERR(page[i])) {
 +                              if (i == 1)
 +                                      btrfs_release_disk_super(super[0]);
 +                              return PTR_ERR(page[i]);
 +                      }
 +                      super[i] = page_address(page[i]);
 +              }
 +
 +              if (super[0]->generation > super[1]->generation)
 +                      sector = zones[1].start;
 +              else
 +                      sector = zones[0].start;
 +
 +              for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
 +                      btrfs_release_disk_super(super[i]);
 +      } else if (!full[0] && (empty[1] || full[1])) {
 +              sector = zones[0].wp;
 +      } else if (full[0]) {
 +              sector = zones[1].wp;
 +      } else {
 +              return -EUCLEAN;
 +      }
 +      *wp_ret = sector << SECTOR_SHIFT;
 +      return 0;
 +}
 +
 +/*
 + * The following zones are reserved as the circular buffer on ZONED btrfs.
 + *  - The primary superblock: zones 0 and 1
 + *  - The first copy: zones 16 and 17
 + *  - The second copy: zone 1024 or the zone at 256GB, whichever comes
 + *                     first, and the following one
 + */
 +static inline u32 sb_zone_number(int shift, int mirror)
 +{
 +      ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
 +
 +      switch (mirror) {
 +      case 0: return 0;
 +      case 1: return 16;
 +      case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024);
 +      }
 +
 +      return 0;
 +}
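 +
 +/*
 + * For example, with 256MB zones (shift == 28) mirror 2 resolves to
 + * min_t(u64, 256GB >> 28, 1024) = zone 1024, while with 1GB zones
 + * (shift == 30) it falls on the zone at 256GB, i.e. zone 256.
 + */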
 +
 +static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 +                             struct blk_zone *zones, unsigned int *nr_zones)
 +{
 +      int ret;
 +
 +      if (!*nr_zones)
 +              return 0;
 +
 +      ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
 +                                copy_zone_info_cb, zones);
 +      if (ret < 0) {
 +              btrfs_err_in_rcu(device->fs_info,
 +                               "zoned: failed to read zone %llu on %s (devid %llu)",
 +                               pos, rcu_str_deref(device->name),
 +                               device->devid);
 +              return ret;
 +      }
 +      *nr_zones = ret;
 +      if (!ret)
 +              return -EIO;
 +
 +      return 0;
 +}
 +
 +int btrfs_get_dev_zone_info(struct btrfs_device *device)
 +{
 +      struct btrfs_zoned_device_info *zone_info = NULL;
 +      struct block_device *bdev = device->bdev;
 +      struct request_queue *queue = bdev_get_queue(bdev);
 +      sector_t nr_sectors;
 +      sector_t sector = 0;
 +      struct blk_zone *zones = NULL;
 +      unsigned int i, nreported = 0, nr_zones;
 +      unsigned int zone_sectors;
 +      int ret;
 +
 +      if (!bdev_is_zoned(bdev))
 +              return 0;
 +
 +      if (device->zone_info)
 +              return 0;
 +
 +      zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
 +      if (!zone_info)
 +              return -ENOMEM;
 +
-       nr_sectors = bdev->bd_part->nr_sects;
++      nr_sectors = bdev_nr_sectors(bdev);
 +      zone_sectors = bdev_zone_sectors(bdev);
 +      /* Check if it's a power of 2 (see is_power_of_2) */
 +      ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
 +      zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
 +      zone_info->zone_size_shift = ilog2(zone_info->zone_size);
 +      zone_info->max_zone_append_size =
 +              (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
 +      zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
 +      if (!IS_ALIGNED(nr_sectors, zone_sectors))
 +              zone_info->nr_zones++;
 +
 +      zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
 +      if (!zone_info->seq_zones) {
 +              ret = -ENOMEM;
 +              goto out;
 +      }
 +
 +      zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
 +      if (!zone_info->empty_zones) {
 +              ret = -ENOMEM;
 +              goto out;
 +      }
 +
 +      zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
 +      if (!zones) {
 +              ret = -ENOMEM;
 +              goto out;
 +      }
 +
 +      /* Get zone types */
 +      while (sector < nr_sectors) {
 +              nr_zones = BTRFS_REPORT_NR_ZONES;
 +              ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
 +                                        &nr_zones);
 +              if (ret)
 +                      goto out;
 +
 +              for (i = 0; i < nr_zones; i++) {
 +                      if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
 +                              __set_bit(nreported, zone_info->seq_zones);
 +                      if (zones[i].cond == BLK_ZONE_COND_EMPTY)
 +                              __set_bit(nreported, zone_info->empty_zones);
 +                      nreported++;
 +              }
 +              sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
 +      }
 +
 +      if (nreported != zone_info->nr_zones) {
 +              btrfs_err_in_rcu(device->fs_info,
 +                               "inconsistent number of zones on %s (%u/%u)",
 +                               rcu_str_deref(device->name), nreported,
 +                               zone_info->nr_zones);
 +              ret = -EIO;
 +              goto out;
 +      }
 +
 +      /* Validate superblock log */
 +      nr_zones = BTRFS_NR_SB_LOG_ZONES;
 +      for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 +              u32 sb_zone;
 +              u64 sb_wp;
 +              int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
 +
 +              sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
 +              if (sb_zone + 1 >= zone_info->nr_zones)
 +                      continue;
 +
 +              sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
 +              ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
 +                                        &zone_info->sb_zones[sb_pos],
 +                                        &nr_zones);
 +              if (ret)
 +                      goto out;
 +
 +              if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
 +                      btrfs_err_in_rcu(device->fs_info,
 +      "zoned: failed to read super block log zone info at devid %llu zone %u",
 +                                       device->devid, sb_zone);
 +                      ret = -EUCLEAN;
 +                      goto out;
 +              }
 +
 +              /*
 +               * If zones[0] is conventional, always use the beginning of the
 +               * zone to record superblock. No need to validate in that case.
 +               */
 +              if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
 +                  BLK_ZONE_TYPE_CONVENTIONAL)
 +                      continue;
 +
 +              ret = sb_write_pointer(device->bdev,
 +                                     &zone_info->sb_zones[sb_pos], &sb_wp);
 +              if (ret != -ENOENT && ret) {
 +                      btrfs_err_in_rcu(device->fs_info,
 +                      "zoned: super block log zone corrupted devid %llu zone %u",
 +                                       device->devid, sb_zone);
 +                      ret = -EUCLEAN;
 +                      goto out;
 +              }
 +      }
 +
 +      kfree(zones);
 +
 +      device->zone_info = zone_info;
 +
 +      /* device->fs_info is not safe to use for printing messages */
 +      btrfs_info_in_rcu(NULL,
 +                      "host-%s zoned block device %s, %u zones of %llu bytes",
 +                      bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware",
 +                      rcu_str_deref(device->name), zone_info->nr_zones,
 +                      zone_info->zone_size);
 +
 +      return 0;
 +
 +out:
 +      kfree(zones);
 +      bitmap_free(zone_info->empty_zones);
 +      bitmap_free(zone_info->seq_zones);
 +      kfree(zone_info);
 +
 +      return ret;
 +}
 +
 +void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
 +{
 +      struct btrfs_zoned_device_info *zone_info = device->zone_info;
 +
 +      if (!zone_info)
 +              return;
 +
 +      bitmap_free(zone_info->seq_zones);
 +      bitmap_free(zone_info->empty_zones);
 +      kfree(zone_info);
 +      device->zone_info = NULL;
 +}
 +
 +int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 +                     struct blk_zone *zone)
 +{
 +      unsigned int nr_zones = 1;
 +      int ret;
 +
 +      ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
 +      if (ret != 0 || !nr_zones)
 +              return ret ? ret : -EIO;
 +
 +      return 0;
 +}
 +
 +int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 +{
 +      struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 +      struct btrfs_device *device;
 +      u64 zoned_devices = 0;
 +      u64 nr_devices = 0;
 +      u64 zone_size = 0;
 +      u64 max_zone_append_size = 0;
 +      const bool incompat_zoned = btrfs_is_zoned(fs_info);
 +      int ret = 0;
 +
 +      /* Count zoned devices */
 +      list_for_each_entry(device, &fs_devices->devices, dev_list) {
 +              enum blk_zoned_model model;
 +
 +              if (!device->bdev)
 +                      continue;
 +
 +              model = bdev_zoned_model(device->bdev);
 +              if (model == BLK_ZONED_HM ||
 +                  (model == BLK_ZONED_HA && incompat_zoned)) {
 +                      struct btrfs_zoned_device_info *zone_info;
 +
 +                      zone_info = device->zone_info;
 +                      zoned_devices++;
 +                      if (!zone_size) {
 +                              zone_size = zone_info->zone_size;
 +                      } else if (zone_info->zone_size != zone_size) {
 +                              btrfs_err(fs_info,
 +              "zoned: unequal block device zone sizes: have %llu found %llu",
 +                                        device->zone_info->zone_size,
 +                                        zone_size);
 +                              ret = -EINVAL;
 +                              goto out;
 +                      }
 +                      if (!max_zone_append_size ||
 +                          (zone_info->max_zone_append_size &&
 +                           zone_info->max_zone_append_size < max_zone_append_size))
 +                              max_zone_append_size =
 +                                      zone_info->max_zone_append_size;
 +              }
 +              nr_devices++;
 +      }
 +
 +      if (!zoned_devices && !incompat_zoned)
 +              goto out;
 +
 +      if (!zoned_devices && incompat_zoned) {
 +              /* No zoned block device found on ZONED filesystem */
 +              btrfs_err(fs_info,
 +                        "zoned: no zoned devices found on a zoned filesystem");
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
 +      if (zoned_devices && !incompat_zoned) {
 +              btrfs_err(fs_info,
 +                        "zoned: mode not enabled but zoned device found");
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
 +      if (zoned_devices != nr_devices) {
 +              btrfs_err(fs_info,
 +                        "zoned: cannot mix zoned and regular devices");
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
 +      /*
 +       * stripe_size is always aligned to BTRFS_STRIPE_LEN in
 +       * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
 +       * check the alignment here.
 +       */
 +      if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
 +              btrfs_err(fs_info,
 +                        "zoned: zone size %llu not aligned to stripe %u",
 +                        zone_size, BTRFS_STRIPE_LEN);
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
 +      if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 +              btrfs_err(fs_info, "zoned: mixed block groups not supported");
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
 +      fs_info->zone_size = zone_size;
 +      fs_info->max_zone_append_size = max_zone_append_size;
 +
 +      btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
 +out:
 +      return ret;
 +}
 +
 +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
 +{
 +      if (!btrfs_is_zoned(info))
 +              return 0;
 +
 +      /*
 +       * Space cache writing is not COWed. Disable that to avoid write errors
 +       * in sequential zones.
 +       */
 +      if (btrfs_test_opt(info, SPACE_CACHE)) {
 +              btrfs_err(info, "zoned: space cache v1 is not supported");
 +              return -EINVAL;
 +      }
 +
 +      if (btrfs_test_opt(info, NODATACOW)) {
 +              btrfs_err(info, "zoned: NODATACOW not supported");
 +              return -EINVAL;
 +      }
 +
 +      return 0;
 +}
 +
 +static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
 +                         int rw, u64 *bytenr_ret)
 +{
 +      u64 wp;
 +      int ret;
 +
 +      if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
 +              *bytenr_ret = zones[0].start << SECTOR_SHIFT;
 +              return 0;
 +      }
 +
 +      ret = sb_write_pointer(bdev, zones, &wp);
 +      if (ret != -ENOENT && ret < 0)
 +              return ret;
 +
 +      if (rw == WRITE) {
 +              struct blk_zone *reset = NULL;
 +
 +              if (wp == zones[0].start << SECTOR_SHIFT)
 +                      reset = &zones[0];
 +              else if (wp == zones[1].start << SECTOR_SHIFT)
 +                      reset = &zones[1];
 +
 +              if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
 +                      ASSERT(reset->cond == BLK_ZONE_COND_FULL);
 +
 +                      ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
 +                                             reset->start, reset->len,
 +                                             GFP_NOFS);
 +                      if (ret)
 +                              return ret;
 +
 +                      reset->cond = BLK_ZONE_COND_EMPTY;
 +                      reset->wp = reset->start;
 +              }
 +      } else if (ret != -ENOENT) {
 +              /* For READ, we want the previous one */
 +              if (wp == zones[0].start << SECTOR_SHIFT)
 +                      wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
 +              wp -= BTRFS_SUPER_INFO_SIZE;
 +      }
 +
 +      *bytenr_ret = wp;
 +      return 0;
 +}
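 +
 +/*
 + * The two superblock log zones are reported back to back, so when the
 + * write pointer computed above sits at zones[1].start, the most recent
 + * superblock occupies the last BTRFS_SUPER_INFO_SIZE bytes of zones[0],
 + * which is exactly what the READ branch returns.
 + */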
 +
 +int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
 +                             u64 *bytenr_ret)
 +{
 +      struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
 +      unsigned int zone_sectors;
 +      u32 sb_zone;
 +      int ret;
 +      u64 zone_size;
 +      u8 zone_sectors_shift;
 +      sector_t nr_sectors;
 +      u32 nr_zones;
 +
 +      if (!bdev_is_zoned(bdev)) {
 +              *bytenr_ret = btrfs_sb_offset(mirror);
 +              return 0;
 +      }
 +
 +      ASSERT(rw == READ || rw == WRITE);
 +
 +      zone_sectors = bdev_zone_sectors(bdev);
 +      if (!is_power_of_2(zone_sectors))
 +              return -EINVAL;
 +      zone_size = zone_sectors << SECTOR_SHIFT;
 +      zone_sectors_shift = ilog2(zone_sectors);
-       nr_sectors = bdev->bd_part->nr_sects;
++      nr_sectors = bdev_nr_sectors(bdev);
 +      nr_zones = nr_sectors >> zone_sectors_shift;
 +
 +      sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
 +      if (sb_zone + 1 >= nr_zones)
 +              return -ENOENT;
 +
 +      ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift,
 +                                BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
 +                                zones);
 +      if (ret < 0)
 +              return ret;
 +      if (ret != BTRFS_NR_SB_LOG_ZONES)
 +              return -EIO;
 +
 +      return sb_log_location(bdev, zones, rw, bytenr_ret);
 +}
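 +
 +/*
 + * On a regular (non-zoned) block device this falls back to the fixed
 + * superblock offsets, e.g. 64K for mirror 0, so callers need not
 + * distinguish the two layouts.
 + */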
 +
 +int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
 +                        u64 *bytenr_ret)
 +{
 +      struct btrfs_zoned_device_info *zinfo = device->zone_info;
 +      u32 zone_num;
 +
 +      if (!zinfo) {
 +              *bytenr_ret = btrfs_sb_offset(mirror);
 +              return 0;
 +      }
 +
 +      zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
 +      if (zone_num + 1 >= zinfo->nr_zones)
 +              return -ENOENT;
 +
 +      return sb_log_location(device->bdev,
 +                             &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
 +                             rw, bytenr_ret);
 +}
 +
 +static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
 +                                int mirror)
 +{
 +      u32 zone_num;
 +
 +      if (!zinfo)
 +              return false;
 +
 +      zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
 +      if (zone_num + 1 >= zinfo->nr_zones)
 +              return false;
 +
 +      if (!test_bit(zone_num, zinfo->seq_zones))
 +              return false;
 +
 +      return true;
 +}
 +
 +void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
 +{
 +      struct btrfs_zoned_device_info *zinfo = device->zone_info;
 +      struct blk_zone *zone;
 +
 +      if (!is_sb_log_zone(zinfo, mirror))
 +              return;
 +
 +      zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
 +      if (zone->cond != BLK_ZONE_COND_FULL) {
 +              if (zone->cond == BLK_ZONE_COND_EMPTY)
 +                      zone->cond = BLK_ZONE_COND_IMP_OPEN;
 +
 +              zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
 +
 +              if (zone->wp == zone->start + zone->len)
 +                      zone->cond = BLK_ZONE_COND_FULL;
 +
 +              return;
 +      }
 +
 +      zone++;
 +      ASSERT(zone->cond != BLK_ZONE_COND_FULL);
 +      if (zone->cond == BLK_ZONE_COND_EMPTY)
 +              zone->cond = BLK_ZONE_COND_IMP_OPEN;
 +
 +      zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
 +
 +      if (zone->wp == zone->start + zone->len)
 +              zone->cond = BLK_ZONE_COND_FULL;
 +}
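 +
 +/*
 + * Each superblock write advances the cached write pointer by
 + * BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT (8) sectors, so for example a
 + * 256MB zone pair absorbs 131072 updates before both zones become full.
 + */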
 +
 +int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
 +{
 +      sector_t zone_sectors;
 +      sector_t nr_sectors;
 +      u8 zone_sectors_shift;
 +      u32 sb_zone;
 +      u32 nr_zones;
 +
 +      zone_sectors = bdev_zone_sectors(bdev);
 +      zone_sectors_shift = ilog2(zone_sectors);
++      nr_sectors = bdev_nr_sectors(bdev);
 +      nr_zones = nr_sectors >> zone_sectors_shift;
 +
 +      sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
 +      if (sb_zone + 1 >= nr_zones)
 +              return -ENOENT;
 +
 +      return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
 +                              sb_zone << zone_sectors_shift,
 +                              zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
 +}
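 +
 +/*
 + * For example, with 256MB zones (524288 sectors) resetting mirror 1 clears
 + * the two zones starting at zone 16, i.e. 1048576 sectors beginning at
 + * sector 8388608.
 + */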
diff --combined fs/buffer.c
@@@ -523,7 -523,7 +523,7 @@@ repeat
  
  void emergency_thaw_bdev(struct super_block *sb)
  {
-       while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+       while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
                printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
  }
  
@@@ -657,7 -657,7 +657,7 @@@ int __set_page_dirty_buffers(struct pag
                } while (bh != head);
        }
        /*
 -       * Lock out page->mem_cgroup migration to keep PageDirty
 +       * Lock out page's memcg migration to keep PageDirty
         * synchronized with per-memcg dirty page counters.
         */
        lock_page_memcg(page);
diff --combined fs/ext4/super.c
@@@ -2638,6 -2638,10 +2638,6 @@@ static int _ext4_show_options(struct se
        } else if (test_opt2(sb, DAX_INODE)) {
                SEQ_OPTS_PUTS("dax=inode");
        }
 -
 -      if (test_opt2(sb, JOURNAL_FAST_COMMIT))
 -              SEQ_OPTS_PUTS("fast_commit");
 -
        ext4_show_quota_options(seq, sb);
        return 0;
  }
@@@ -4044,9 -4048,8 +4044,8 @@@ static int ext4_fill_super(struct super
        sbi->s_sb = sb;
        sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
        sbi->s_sb_block = sb_block;
-       if (sb->s_bdev->bd_part)
-               sbi->s_sectors_written_start =
-                       part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
+       sbi->s_sectors_written_start =
+               part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
  
        /* Cleanup superblock name */
        strreplace(sb->s_id, '/', '!');
@@@ -5505,15 -5508,10 +5504,10 @@@ static int ext4_commit_super(struct sup
         */
        if (!(sb->s_flags & SB_RDONLY))
                ext4_update_tstamp(es, s_wtime);
-       if (sb->s_bdev->bd_part)
-               es->s_kbytes_written =
-                       cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
-                           ((part_stat_read(sb->s_bdev->bd_part,
-                                            sectors[STAT_WRITE]) -
-                             EXT4_SB(sb)->s_sectors_written_start) >> 1));
-       else
-               es->s_kbytes_written =
-                       cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
+       es->s_kbytes_written =
+               cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+                   ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
+                     EXT4_SB(sb)->s_sectors_written_start) >> 1));
        if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
                ext4_free_blocks_count_set(es,
                        EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
diff --combined fs/f2fs/f2fs.h
@@@ -1675,7 -1675,7 +1675,7 @@@ static inline bool f2fs_is_multi_device
   * and the return value is in kbytes. s is of struct f2fs_sb_info.
   */
  #define BD_PART_WRITTEN(s)                                             \
- (((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) -   \
+       (((u64)part_stat_read((s)->sb->s_bdev, sectors[STAT_WRITE]) -   \
                (s)->sectors_written_start) >> 1)
  
  static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
@@@ -3251,8 -3251,6 +3251,8 @@@ bool f2fs_empty_dir(struct inode *dir)
  
  static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
  {
 +      if (fscrypt_is_nokey_name(dentry))
 +              return -ENOKEY;
        return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name,
                                inode, inode->i_ino, inode->i_mode);
  }
diff --combined fs/internal.h
@@@ -25,7 -25,6 +25,6 @@@ extern void __init bdev_cache_init(void
  extern int __sync_blockdev(struct block_device *bdev, int wait);
  void iterate_bdevs(void (*)(struct block_device *, void *), void *);
  void emergency_thaw_bdev(struct super_block *sb);
- void bd_forget(struct inode *inode);
  #else
  static inline void bdev_cache_init(void)
  {
@@@ -43,9 -42,6 +42,6 @@@ static inline int emergency_thaw_bdev(s
  {
        return 0;
  }
- static inline void bd_forget(struct inode *inode)
- {
- }
  #endif /* CONFIG_BLOCK */
  
  /*
@@@ -78,8 -74,6 +74,8 @@@ extern int vfs_path_lookup(struct dentr
  long do_rmdir(int dfd, struct filename *name);
  long do_unlinkat(int dfd, struct filename *name);
  int may_linkat(struct path *link);
 +int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
 +               struct filename *newname, unsigned int flags);
  
  /*
   * namespace.c
@@@ -116,7 -110,8 +112,8 @@@ extern struct file *alloc_empty_file_no
   */
  extern int reconfigure_super(struct fs_context *);
  extern bool trylock_super(struct super_block *sb);
- extern struct super_block *user_get_super(dev_t);
+ struct super_block *user_get_super(dev_t, bool excl);
+ void put_super(struct super_block *sb);
  extern bool mount_capable(struct fs_context *);
  
  /*
diff --combined fs/io_uring.c
@@@ -205,7 -205,6 +205,7 @@@ struct fixed_file_ref_node 
        struct list_head                file_list;
        struct fixed_file_data          *file_data;
        struct llist_node               llist;
 +      bool                            done;
  };
  
  struct fixed_file_data {
@@@ -245,8 -244,6 +245,8 @@@ struct io_sq_data 
  
        struct task_struct      *thread;
        struct wait_queue_head  wait;
 +
 +      unsigned                sq_thread_idle;
  };
  
  struct io_ring_ctx {
                struct list_head        timeout_list;
                struct list_head        cq_overflow_list;
  
 -              wait_queue_head_t       inflight_wait;
                struct io_uring_sqe     *sq_sqes;
        } ____cacheline_aligned_in_smp;
  
        struct io_sq_data       *sq_data;       /* if using sq thread polling */
  
        struct wait_queue_head  sqo_sq_wait;
 -      struct wait_queue_entry sqo_wait_entry;
        struct list_head        sqd_list;
  
        /*
   */
  struct io_poll_iocb {
        struct file                     *file;
 -      union {
 -              struct wait_queue_head  *head;
 -              u64                     addr;
 -      };
 +      struct wait_queue_head          *head;
        __poll_t                        events;
        bool                            done;
        bool                            canceled;
        struct wait_queue_entry         wait;
  };
  
 +struct io_poll_remove {
 +      struct file                     *file;
 +      u64                             addr;
 +};
 +
  struct io_close {
        struct file                     *file;
        struct file                     *put_file;
@@@ -446,17 -443,11 +446,17 @@@ struct io_timeout 
        u32                             off;
        u32                             target_seq;
        struct list_head                list;
 +      /* head of the link, used by linked timeouts only */
 +      struct io_kiocb                 *head;
  };
  
  struct io_timeout_rem {
        struct file                     *file;
        u64                             addr;
 +
 +      /* timeout update */
 +      struct timespec64               ts;
 +      u32                             flags;
  };
  
  struct io_rw {
@@@ -487,7 -478,6 +487,7 @@@ struct io_sr_msg 
  struct io_open {
        struct file                     *file;
        int                             dfd;
 +      bool                            ignore_nonblock;
        struct filename                 *filename;
        struct open_how                 how;
        unsigned long                   nofile;
@@@ -549,27 -539,6 +549,27 @@@ struct io_statx 
        struct statx __user             *buffer;
  };
  
 +struct io_shutdown {
 +      struct file                     *file;
 +      int                             how;
 +};
 +
 +struct io_rename {
 +      struct file                     *file;
 +      int                             old_dfd;
 +      int                             new_dfd;
 +      struct filename                 *oldpath;
 +      struct filename                 *newpath;
 +      int                             flags;
 +};
 +
 +struct io_unlink {
 +      struct file                     *file;
 +      int                             dfd;
 +      int                             flags;
 +      struct filename                 *filename;
 +};
 +
  struct io_completion {
        struct file                     *file;
        struct list_head                list;
@@@ -604,6 -573,7 +604,6 @@@ enum 
        REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
        REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
  
 -      REQ_F_LINK_HEAD_BIT,
        REQ_F_FAIL_LINK_BIT,
        REQ_F_INFLIGHT_BIT,
        REQ_F_CUR_POS_BIT,
@@@ -635,6 -605,8 +635,6 @@@ enum 
        /* IOSQE_BUFFER_SELECT */
        REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
  
 -      /* head of a link */
 -      REQ_F_LINK_HEAD         = BIT(REQ_F_LINK_HEAD_BIT),
        /* fail rest of links */
        REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
        /* on inflight list */
@@@ -677,7 -649,6 +677,7 @@@ struct io_kiocb 
                struct file             *file;
                struct io_rw            rw;
                struct io_poll_iocb     poll;
 +              struct io_poll_remove   poll_remove;
                struct io_accept        accept;
                struct io_sync          sync;
                struct io_cancel        cancel;
                struct io_splice        splice;
                struct io_provide_buf   pbuf;
                struct io_statx         statx;
 +              struct io_shutdown      shutdown;
 +              struct io_rename        rename;
 +              struct io_unlink        unlink;
                /* use only after cleaning per-op data, see io_clean_op() */
                struct io_completion    compl;
        };
        struct task_struct              *task;
        u64                             user_data;
  
 -      struct list_head                link_list;
 +      struct io_kiocb                 *link;
 +      struct percpu_ref               *fixed_file_refs;
  
        /*
         * 1. used with ctx->iopoll_list with reads/writes
         * 2. to track reqs with ->files (see io_op_def::file_table)
         */
        struct list_head                inflight_entry;
 -
 -      struct percpu_ref               *fixed_file_refs;
        struct callback_head            task_work;
        /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
        struct hlist_node               hash_node;
@@@ -754,8 -723,6 +754,8 @@@ struct io_submit_state 
        void                    *reqs[IO_IOPOLL_BATCH];
        unsigned int            free_reqs;
  
 +      bool                    plug_started;
 +
        /*
         * Batch completion logic
         */
         */
        struct file             *file;
        unsigned int            fd;
 -      unsigned int            has_refs;
 +      unsigned int            file_refs;
        unsigned int            ios_left;
  };
  
@@@ -788,8 -755,6 +788,8 @@@ struct io_op_def 
        unsigned                buffer_select : 1;
        /* must always have async data allocated */
        unsigned                needs_async_data : 1;
 +      /* should block plug */
 +      unsigned                plug : 1;
        /* size of async data needed, if any */
        unsigned short          async_size;
        unsigned                work_flags;
@@@ -803,7 -768,6 +803,7 @@@ static const struct io_op_def io_op_def
                .pollin                 = 1,
                .buffer_select          = 1,
                .needs_async_data       = 1,
 +              .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
                .needs_async_data       = 1,
 +              .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
                                                IO_WQ_WORK_FSIZE,
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
 +              .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
        },
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
 +              .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
                                                IO_WQ_WORK_MM,
                .pollout                = 1,
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_async_msghdr),
 -              .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
 -                                              IO_WQ_WORK_FS,
 +              .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_RECVMSG] = {
                .needs_file             = 1,
                .buffer_select          = 1,
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_async_msghdr),
 -              .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
 -                                              IO_WQ_WORK_FS,
 +              .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_TIMEOUT] = {
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_timeout_data),
                .work_flags             = IO_WQ_WORK_MM,
        },
 -      [IORING_OP_TIMEOUT_REMOVE] = {},
 +      [IORING_OP_TIMEOUT_REMOVE] = {
 +              /* used by timeout updates' prep() */
 +              .work_flags             = IO_WQ_WORK_MM,
 +      },
        [IORING_OP_ACCEPT] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
        },
        [IORING_OP_OPENAT] = {
                .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
 -                                              IO_WQ_WORK_FS,
 +                                              IO_WQ_WORK_FS | IO_WQ_WORK_MM,
        },
        [IORING_OP_CLOSE] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
                .buffer_select          = 1,
 +              .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
 +              .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
                                                IO_WQ_WORK_FSIZE,
        },
        [IORING_OP_OPENAT2] = {
                .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
 -                                              IO_WQ_WORK_BLKCG,
 +                                              IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
        },
        [IORING_OP_EPOLL_CTL] = {
                .unbound_nonreg_file    = 1,
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
        },
 +      [IORING_OP_SHUTDOWN] = {
 +              .needs_file             = 1,
 +      },
 +      [IORING_OP_RENAMEAT] = {
 +              .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
 +                                              IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
 +      },
 +      [IORING_OP_UNLINKAT] = {
 +              .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
 +                                              IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
 +      },
  };
  
  enum io_mem_account {
@@@ -1034,9 -981,6 +1034,9 @@@ struct sock *io_uring_get_socket(struc
  }
  EXPORT_SYMBOL(io_uring_get_socket);
  
 +#define io_for_each_link(pos, head) \
 +      for (pos = (head); pos; pos = pos->link)
 +
  static inline void io_clean_op(struct io_kiocb *req)
  {
        if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
                __io_clean_op(req);
  }
  
 -static void io_sq_thread_drop_mm(void)
 +static inline void io_set_resource_node(struct io_kiocb *req)
 +{
 +      struct io_ring_ctx *ctx = req->ctx;
 +
 +      if (!req->fixed_file_refs) {
 +              req->fixed_file_refs = &ctx->file_data->node->refs;
 +              percpu_ref_get(req->fixed_file_refs);
 +      }
 +}
 +
 +static bool io_match_task(struct io_kiocb *head,
 +                        struct task_struct *task,
 +                        struct files_struct *files)
 +{
 +      struct io_kiocb *req;
 +
 +      if (task && head->task != task)
 +              return false;
 +      if (!files)
 +              return true;
 +
 +      io_for_each_link(req, head) {
 +              if ((req->flags & REQ_F_WORK_INITIALIZED) &&
 +                  (req->work.flags & IO_WQ_WORK_FILES) &&
 +                  req->work.identity->files == files)
 +                      return true;
 +      }
 +      return false;
 +}
 +
 +static void io_sq_thread_drop_mm_files(void)
  {
 +      struct files_struct *files = current->files;
        struct mm_struct *mm = current->mm;
  
        if (mm) {
                mmput(mm);
                current->mm = NULL;
        }
 +      if (files) {
 +              struct nsproxy *nsproxy = current->nsproxy;
 +
 +              task_lock(current);
 +              current->files = NULL;
 +              current->nsproxy = NULL;
 +              task_unlock(current);
 +              put_files_struct(files);
 +              put_nsproxy(nsproxy);
 +      }
 +}
 +
 +static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
 +{
 +      if (!current->files) {
 +              struct files_struct *files;
 +              struct nsproxy *nsproxy;
 +
 +              task_lock(ctx->sqo_task);
 +              files = ctx->sqo_task->files;
 +              if (!files) {
 +                      task_unlock(ctx->sqo_task);
 +                      return -EOWNERDEAD;
 +              }
 +              atomic_inc(&files->count);
 +              get_nsproxy(ctx->sqo_task->nsproxy);
 +              nsproxy = ctx->sqo_task->nsproxy;
 +              task_unlock(ctx->sqo_task);
 +
 +              task_lock(current);
 +              current->files = files;
 +              current->nsproxy = nsproxy;
 +              task_unlock(current);
 +      }
 +      return 0;
  }
  
  static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
        return -EFAULT;
  }
  
 -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
 -                                 struct io_kiocb *req)
 +static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
 +                                       struct io_kiocb *req)
  {
 -      if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
 -              return 0;
 -      return __io_sq_thread_acquire_mm(ctx);
 +      const struct io_op_def *def = &io_op_defs[req->opcode];
 +      int ret;
 +
 +      if (def->work_flags & IO_WQ_WORK_MM) {
 +              ret = __io_sq_thread_acquire_mm(ctx);
 +              if (unlikely(ret))
 +                      return ret;
 +      }
 +
 +      if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
 +              ret = __io_sq_thread_acquire_files(ctx);
 +              if (unlikely(ret))
 +                      return ret;
 +      }
 +
 +      return 0;
  }
  
  static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
@@@ -1307,6 -1172,7 +1307,6 @@@ static struct io_ring_ctx *io_ring_ctx_
        INIT_LIST_HEAD(&ctx->iopoll_list);
        INIT_LIST_HEAD(&ctx->defer_list);
        INIT_LIST_HEAD(&ctx->timeout_list);
 -      init_waitqueue_head(&ctx->inflight_wait);
        spin_lock_init(&ctx->inflight_lock);
        INIT_LIST_HEAD(&ctx->inflight_list);
        INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
@@@ -1416,7 -1282,7 +1416,7 @@@ static bool io_identity_cow(struct io_k
         */
        io_init_identity(id);
        if (creds)
 -              req->work.identity->creds = creds;
 +              id->creds = creds;
  
        /* add one for this request */
        refcount_inc(&id->count);
@@@ -1445,6 -1311,22 +1445,6 @@@ static bool io_grab_identity(struct io_
                        return false;
                req->work.flags |= IO_WQ_WORK_FSIZE;
        }
 -
 -      if (!(req->work.flags & IO_WQ_WORK_FILES) &&
 -          (def->work_flags & IO_WQ_WORK_FILES) &&
 -          !(req->flags & REQ_F_NO_FILE_TABLE)) {
 -              if (id->files != current->files ||
 -                  id->nsproxy != current->nsproxy)
 -                      return false;
 -              atomic_inc(&id->files->count);
 -              get_nsproxy(id->nsproxy);
 -              req->flags |= REQ_F_INFLIGHT;
 -
 -              spin_lock_irq(&ctx->inflight_lock);
 -              list_add(&req->inflight_entry, &ctx->inflight_list);
 -              spin_unlock_irq(&ctx->inflight_lock);
 -              req->work.flags |= IO_WQ_WORK_FILES;
 -      }
  #ifdef CONFIG_BLK_CGROUP
        if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
            (def->work_flags & IO_WQ_WORK_BLKCG)) {
                }
                spin_unlock(&current->fs->lock);
        }
 +      if (!(req->work.flags & IO_WQ_WORK_FILES) &&
 +          (def->work_flags & IO_WQ_WORK_FILES) &&
 +          !(req->flags & REQ_F_NO_FILE_TABLE)) {
 +              if (id->files != current->files ||
 +                  id->nsproxy != current->nsproxy)
 +                      return false;
 +              atomic_inc(&id->files->count);
 +              get_nsproxy(id->nsproxy);
 +              req->flags |= REQ_F_INFLIGHT;
 +
 +              spin_lock_irq(&ctx->inflight_lock);
 +              list_add(&req->inflight_entry, &ctx->inflight_list);
 +              spin_unlock_irq(&ctx->inflight_lock);
 +              req->work.flags |= IO_WQ_WORK_FILES;
 +      }
  
        return true;
  }
@@@ -1548,8 -1415,10 +1548,8 @@@ static void io_prep_async_link(struct i
  {
        struct io_kiocb *cur;
  
 -      io_prep_async_work(req);
 -      if (req->flags & REQ_F_LINK_HEAD)
 -              list_for_each_entry(cur, &req->link_list, link_list)
 -                      io_prep_async_work(cur);
 +      io_for_each_link(cur, req)
 +              io_prep_async_work(cur);
  }
  
  static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
@@@ -1590,18 -1459,30 +1590,18 @@@ static void io_kill_timeout(struct io_k
        }
  }
  
 -static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
 -{
 -      struct io_ring_ctx *ctx = req->ctx;
 -
 -      if (!tsk || req->task == tsk)
 -              return true;
 -      if (ctx->flags & IORING_SETUP_SQPOLL) {
 -              if (ctx->sq_data && req->task == ctx->sq_data->thread)
 -                      return true;
 -      }
 -      return false;
 -}
 -
  /*
   * Returns true if we found and killed one or more timeouts
   */
 -static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
 +static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
 +                           struct files_struct *files)
  {
        struct io_kiocb *req, *tmp;
        int canceled = 0;
  
        spin_lock_irq(&ctx->completion_lock);
        list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
 -              if (io_task_match(req, tsk)) {
 +              if (io_match_task(req, tsk, files)) {
                        io_kill_timeout(req);
                        canceled++;
                }
@@@ -1712,6 -1593,32 +1712,6 @@@ static void io_cqring_mark_overflow(str
        }
  }
  
 -static inline bool __io_match_files(struct io_kiocb *req,
 -                                  struct files_struct *files)
 -{
 -      return ((req->flags & REQ_F_WORK_INITIALIZED) &&
 -              (req->work.flags & IO_WQ_WORK_FILES)) &&
 -              req->work.identity->files == files;
 -}
 -
 -static bool io_match_files(struct io_kiocb *req,
 -                         struct files_struct *files)
 -{
 -      struct io_kiocb *link;
 -
 -      if (!files)
 -              return true;
 -      if (__io_match_files(req, files))
 -              return true;
 -      if (req->flags & REQ_F_LINK_HEAD) {
 -              list_for_each_entry(link, &req->link_list, link_list) {
 -                      if (__io_match_files(link, files))
 -                              return true;
 -              }
 -      }
 -      return false;
 -}
 -
  /* Returns true if there are no backlogged entries after the flush */
  static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
                                     struct task_struct *tsk,
  
        cqe = NULL;
        list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
 -              if (tsk && req->task != tsk)
 -                      continue;
 -              if (!io_match_files(req, files))
 +              if (!io_match_task(req, tsk, files))
                        continue;
  
                cqe = io_get_cqring(ctx);
@@@ -1935,7 -1844,9 +1935,7 @@@ fallback
  static inline void io_put_file(struct io_kiocb *req, struct file *file,
                          bool fixed)
  {
 -      if (fixed)
 -              percpu_ref_put(req->fixed_file_refs);
 -      else
 +      if (!fixed)
                fput(file);
  }
  
@@@ -1947,8 -1858,7 +1947,8 @@@ static void io_dismantle_req(struct io_
                kfree(req->async_data);
        if (req->file)
                io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 -
 +      if (req->fixed_file_refs)
 +              percpu_ref_put(req->fixed_file_refs);
        io_req_clean_work(req);
  }
  
@@@ -1971,14 -1881,6 +1971,14 @@@ static void __io_free_req(struct io_kio
        percpu_ref_put(&ctx->refs);
  }
  
 +static inline void io_remove_next_linked(struct io_kiocb *req)
 +{
 +      struct io_kiocb *nxt = req->link;
 +
 +      req->link = nxt->link;
 +      nxt->link = NULL;
 +}
 +
  static void io_kill_linked_timeout(struct io_kiocb *req)
  {
        struct io_ring_ctx *ctx = req->ctx;
        unsigned long flags;
  
        spin_lock_irqsave(&ctx->completion_lock, flags);
 -      link = list_first_entry_or_null(&req->link_list, struct io_kiocb,
 -                                      link_list);
 +      link = req->link;
 +
        /*
         * Can happen if a linked timeout fired and link had been like
         * req -> link t-out -> link t-out [-> ...]
                struct io_timeout_data *io = link->async_data;
                int ret;
  
 -              list_del_init(&link->link_list);
 +              io_remove_next_linked(req);
 +              link->timeout.head = NULL;
                ret = hrtimer_try_to_cancel(&io->timer);
                if (ret != -1) {
                        io_cqring_fill_event(link, -ECANCELED);
        }
  }
  
 -static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
 -{
 -      struct io_kiocb *nxt;
 -
 -      /*
 -       * The list should never be empty when we are called here. But could
 -       * potentially happen if the chain is messed up, check to be on the
 -       * safe side.
 -       */
 -      if (unlikely(list_empty(&req->link_list)))
 -              return NULL;
 -
 -      nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
 -      list_del_init(&req->link_list);
 -      if (!list_empty(&nxt->link_list))
 -              nxt->flags |= REQ_F_LINK_HEAD;
 -      return nxt;
 -}
  
 -/*
 - * Called if REQ_F_LINK_HEAD is set, and we fail the head request
 - */
  static void io_fail_links(struct io_kiocb *req)
  {
 +      struct io_kiocb *link, *nxt;
        struct io_ring_ctx *ctx = req->ctx;
        unsigned long flags;
  
        spin_lock_irqsave(&ctx->completion_lock, flags);
 -      while (!list_empty(&req->link_list)) {
 -              struct io_kiocb *link = list_first_entry(&req->link_list,
 -                                              struct io_kiocb, link_list);
 +      link = req->link;
 +      req->link = NULL;
  
 -              list_del_init(&link->link_list);
 -              trace_io_uring_fail_link(req, link);
 +      while (link) {
 +              nxt = link->link;
 +              link->link = NULL;
  
 +              trace_io_uring_fail_link(req, link);
                io_cqring_fill_event(link, -ECANCELED);
  
                /*
                        io_put_req_deferred(link, 2);
                else
                        io_double_put_req(link);
 +              link = nxt;
        }
 -
        io_commit_cqring(ctx);
        spin_unlock_irqrestore(&ctx->completion_lock, flags);
  
  
  static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
  {
 -      req->flags &= ~REQ_F_LINK_HEAD;
        if (req->flags & REQ_F_LINK_TIMEOUT)
                io_kill_linked_timeout(req);
  
         * dependencies to the next request. In case of failure, fail the rest
         * of the chain.
         */
 -      if (likely(!(req->flags & REQ_F_FAIL_LINK)))
 -              return io_req_link_next(req);
 +      if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
 +              struct io_kiocb *nxt = req->link;
 +
 +              req->link = NULL;
 +              return nxt;
 +      }
        io_fail_links(req);
        return NULL;
  }
  
 -static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 +static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
  {
 -      if (likely(!(req->flags & REQ_F_LINK_HEAD)))
 +      if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT)))
                return NULL;
        return __io_req_find_next(req);
  }
  
 -static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
 +static int io_req_task_work_add(struct io_kiocb *req)
  {
        struct task_struct *tsk = req->task;
        struct io_ring_ctx *ctx = req->ctx;
         * will do the job.
         */
        notify = TWA_NONE;
 -      if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
 +      if (!(ctx->flags & IORING_SETUP_SQPOLL))
                notify = TWA_SIGNAL;
  
        ret = task_work_add(tsk, &req->task_work, notify);
@@@ -2132,8 -2049,7 +2132,8 @@@ static void __io_req_task_submit(struc
  {
        struct io_ring_ctx *ctx = req->ctx;
  
 -      if (!__io_sq_thread_acquire_mm(ctx)) {
 +      if (!__io_sq_thread_acquire_mm(ctx) &&
 +          !__io_sq_thread_acquire_files(ctx)) {
                mutex_lock(&ctx->uring_lock);
                __io_queue_sqe(req, NULL);
                mutex_unlock(&ctx->uring_lock);
@@@ -2158,7 -2074,7 +2158,7 @@@ static void io_req_task_queue(struct io
        init_task_work(&req->task_work, io_req_task_submit);
        percpu_ref_get(&req->ctx->refs);
  
 -      ret = io_req_task_work_add(req, true);
 +      ret = io_req_task_work_add(req);
        if (unlikely(ret)) {
                struct task_struct *tsk;
  
        }
  }
  
 -static void io_queue_next(struct io_kiocb *req)
 +static inline void io_queue_next(struct io_kiocb *req)
  {
        struct io_kiocb *nxt = io_req_find_next(req);
  
@@@ -2226,7 -2142,8 +2226,7 @@@ static void io_req_free_batch(struct re
                io_free_req(req);
                return;
        }
 -      if (req->flags & REQ_F_LINK_HEAD)
 -              io_queue_next(req);
 +      io_queue_next(req);
  
        if (req->task != rb->task) {
                if (rb->task) {
@@@ -2279,7 -2196,7 +2279,7 @@@ static void io_free_req_deferred(struc
        int ret;
  
        init_task_work(&req->task_work, io_put_req_deferred_cb);
 -      ret = io_req_task_work_add(req, true);
 +      ret = io_req_task_work_add(req);
        if (unlikely(ret)) {
                struct task_struct *tsk;
  
@@@ -2328,7 -2245,7 +2328,7 @@@ static unsigned io_cqring_events(struc
                 * we wake up the task, and the next invocation will flush the
                 * entries. We cannot safely do it from here.
                 */
 -              if (noflush && !list_empty(&ctx->cq_overflow_list))
 +              if (noflush)
                        return -1U;
  
                io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@@ -2660,6 -2577,7 +2660,6 @@@ static bool io_resubmit_prep(struct io_
        }
  end_req:
        req_set_fail_links(req);
 -      io_req_complete(req, ret);
        return false;
  }
  #endif
@@@ -2675,7 -2593,7 +2675,7 @@@ static bool io_rw_reissue(struct io_kio
        if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
                return false;
  
 -      ret = io_sq_thread_acquire_mm(req->ctx, req);
 +      ret = io_sq_thread_acquire_mm_files(req->ctx, req);
  
        if (io_resubmit_prep(req, ret)) {
                refcount_inc(&req->refs);
@@@ -2723,7 -2641,7 +2723,7 @@@ static void io_complete_rw_iopoll(struc
   * find it from a io_iopoll_getevents() thread before the issuer is done
   * accessing the kiocb cookie.
   */
 -static void io_iopoll_req_issued(struct io_kiocb *req)
 +static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
  {
        struct io_ring_ctx *ctx = req->ctx;
  
        else
                list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
  
 -      if ((ctx->flags & IORING_SETUP_SQPOLL) &&
 +      /*
 +       * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
 +       * task context or in io worker task context. If current task context is
 +       * sq thread, we don't need to check whether we should wake up the sq thread.
 +       */
 +      if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
            wq_has_sleeper(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
  }
  
 -static void __io_state_file_put(struct io_submit_state *state)
 +static inline void __io_state_file_put(struct io_submit_state *state)
  {
 -      if (state->has_refs)
 -              fput_many(state->file, state->has_refs);
 -      state->file = NULL;
 +      fput_many(state->file, state->file_refs);
 +      state->file_refs = 0;
  }
  
  static inline void io_state_file_put(struct io_submit_state *state)
  {
 -      if (state->file)
 +      if (state->file_refs)
                __io_state_file_put(state);
  }
  
@@@ -2784,29 -2698,25 +2784,25 @@@ static struct file *__io_file_get(struc
        if (!state)
                return fget(fd);
  
 -      if (state->file) {
 +      if (state->file_refs) {
                if (state->fd == fd) {
 -                      state->has_refs--;
 +                      state->file_refs--;
                        return state->file;
                }
                __io_state_file_put(state);
        }
        state->file = fget_many(fd, state->ios_left);
 -      if (!state->file)
 +      if (unlikely(!state->file))
                return NULL;
  
        state->fd = fd;
 -      state->has_refs = state->ios_left - 1;
 +      state->file_refs = state->ios_left - 1;
        return state->file;
  }
  
  static bool io_bdev_nowait(struct block_device *bdev)
  {
- #ifdef CONFIG_BLOCK
        return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
- #else
-       return true;
- #endif
  }
  
  /*
@@@ -2819,14 -2729,16 +2815,16 @@@ static bool io_file_supports_async(stru
        umode_t mode = file_inode(file)->i_mode;
  
        if (S_ISBLK(mode)) {
-               if (io_bdev_nowait(file->f_inode->i_bdev))
+               if (IS_ENABLED(CONFIG_BLOCK) &&
+                   io_bdev_nowait(I_BDEV(file->f_mapping->host)))
                        return true;
                return false;
        }
        if (S_ISCHR(mode) || S_ISSOCK(mode))
                return true;
        if (S_ISREG(mode)) {
-               if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
+               if (IS_ENABLED(CONFIG_BLOCK) &&
+                   io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
                    file->f_op != &io_uring_fops)
                        return true;
                return false;
@@@ -3151,7 -3063,7 +3149,7 @@@ static ssize_t io_iov_buffer_select(str
        return __io_iov_buffer_select(req, iov, needs_lock);
  }
  
 -static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
 +static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
                                 struct iovec **iovec, struct iov_iter *iter,
                                 bool needs_lock)
  {
  
                ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
                *iovec = NULL;
 -              return ret < 0 ? ret : sqe_len;
 +              return ret;
        }
  
        if (req->flags & REQ_F_BUFFER_SELECT) {
                              req->ctx->compat);
  }
  
 -static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 -                             struct iovec **iovec, struct iov_iter *iter,
 -                             bool needs_lock)
 -{
 -      struct io_async_rw *iorw = req->async_data;
 -
 -      if (!iorw)
 -              return __io_import_iovec(rw, req, iovec, iter, needs_lock);
 -      *iovec = NULL;
 -      return iov_iter_count(&iorw->iter);
 -}
 -
  static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
  {
        return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
@@@ -3266,7 -3190,7 +3264,7 @@@ static void io_req_map_rw(struct io_kio
        rw->free_iovec = iovec;
        rw->bytes_done = 0;
        /* can only be fixed buffers, no need to do anything */
 -      if (iter->type == ITER_BVEC)
 +      if (iov_iter_is_bvec(iter))
                return;
        if (!iovec) {
                unsigned iov_off = 0;
@@@ -3320,7 -3244,7 +3318,7 @@@ static inline int io_rw_prep_async(stru
        struct iovec *iov = iorw->fast_iov;
        ssize_t ret;
  
 -      ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false);
 +      ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
        if (unlikely(ret < 0))
                return ret;
  
@@@ -3379,7 -3303,7 +3377,7 @@@ static int io_async_buf_func(struct wai
  
        /* submit ref gets dropped, acquire a new one */
        refcount_inc(&req->refs);
 -      ret = io_req_task_work_add(req, true);
 +      ret = io_req_task_work_add(req);
        if (unlikely(ret)) {
                struct task_struct *tsk;
  
@@@ -3453,17 -3377,17 +3451,17 @@@ static int io_read(struct io_kiocb *req
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
        ssize_t io_size, ret, ret2;
 -      size_t iov_count;
        bool no_async;
  
 -      if (rw)
 +      if (rw) {
                iter = &rw->iter;
 -
 -      ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
 -      if (ret < 0)
 -              return ret;
 -      iov_count = iov_iter_count(iter);
 -      io_size = ret;
 +              iovec = NULL;
 +      } else {
 +              ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
 +              if (ret < 0)
 +                      return ret;
 +      }
 +      io_size = iov_iter_count(iter);
        req->result = io_size;
        ret = 0;
  
        if (no_async)
                goto copy_iov;
  
 -      ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
 +      ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
        if (unlikely(ret))
                goto out_free;
  
                if (req->file->f_flags & O_NONBLOCK)
                        goto done;
                /* some cases will consume bytes even on error returns */
 -              iov_iter_revert(iter, iov_count - iov_iter_count(iter));
 +              iov_iter_revert(iter, io_size - iov_iter_count(iter));
                ret = 0;
                goto copy_iov;
        } else if (ret < 0) {
@@@ -3581,17 -3505,17 +3579,17 @@@ static int io_write(struct io_kiocb *re
        struct kiocb *kiocb = &req->rw.kiocb;
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
 -      size_t iov_count;
        ssize_t ret, ret2, io_size;
  
 -      if (rw)
 +      if (rw) {
                iter = &rw->iter;
 -
 -      ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
 -      if (ret < 0)
 -              return ret;
 -      iov_count = iov_iter_count(iter);
 -      io_size = ret;
 +              iovec = NULL;
 +      } else {
 +              ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
 +              if (ret < 0)
 +                      return ret;
 +      }
 +      io_size = iov_iter_count(iter);
        req->result = io_size;
  
        /* Ensure we clear previously set non-block flag */
            (req->flags & REQ_F_ISREG))
                goto copy_iov;
  
 -      ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
 +      ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
        if (unlikely(ret))
                goto out_free;
  
@@@ -3652,7 -3576,7 +3650,7 @@@ done
        } else {
  copy_iov:
                /* some cases will consume bytes even on error returns */
 -              iov_iter_revert(iter, iov_count - iov_iter_count(iter));
 +              iov_iter_revert(iter, io_size - iov_iter_count(iter));
                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
                if (!ret)
                        return -EAGAIN;
@@@ -3664,209 -3588,80 +3662,209 @@@ out_free
        return ret;
  }
  
 -static int __io_splice_prep(struct io_kiocb *req,
 +static int io_renameat_prep(struct io_kiocb *req,
                            const struct io_uring_sqe *sqe)
  {
 -      struct io_splice* sp = &req->splice;
 -      unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
 -
 -      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 -              return -EINVAL;
 +      struct io_rename *ren = &req->rename;
 +      const char __user *oldf, *newf;
  
 -      sp->file_in = NULL;
 -      sp->len = READ_ONCE(sqe->len);
 -      sp->flags = READ_ONCE(sqe->splice_flags);
 +      if (unlikely(req->flags & REQ_F_FIXED_FILE))
 +              return -EBADF;
  
 -      if (unlikely(sp->flags & ~valid_flags))
 -              return -EINVAL;
 +      ren->old_dfd = READ_ONCE(sqe->fd);
 +      oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 +      newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 +      ren->new_dfd = READ_ONCE(sqe->len);
 +      ren->flags = READ_ONCE(sqe->rename_flags);
  
 -      sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
 -                                (sp->flags & SPLICE_F_FD_IN_FIXED));
 -      if (!sp->file_in)
 -              return -EBADF;
 -      req->flags |= REQ_F_NEED_CLEANUP;
 +      ren->oldpath = getname(oldf);
 +      if (IS_ERR(ren->oldpath))
 +              return PTR_ERR(ren->oldpath);
  
 -      if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
 -              /*
 -               * Splice operation will be punted aync, and here need to
 -               * modify io_wq_work.flags, so initialize io_wq_work firstly.
 -               */
 -              io_req_init_async(req);
 -              req->work.flags |= IO_WQ_WORK_UNBOUND;
 +      ren->newpath = getname(newf);
 +      if (IS_ERR(ren->newpath)) {
 +              putname(ren->oldpath);
 +              return PTR_ERR(ren->newpath);
        }
  
 +      req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
  }
  
 -static int io_tee_prep(struct io_kiocb *req,
 -                     const struct io_uring_sqe *sqe)
 -{
 -      if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
 -              return -EINVAL;
 -      return __io_splice_prep(req, sqe);
 -}
 -
 -static int io_tee(struct io_kiocb *req, bool force_nonblock)
 +static int io_renameat(struct io_kiocb *req, bool force_nonblock)
  {
 -      struct io_splice *sp = &req->splice;
 -      struct file *in = sp->file_in;
 -      struct file *out = sp->file_out;
 -      unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 -      long ret = 0;
 +      struct io_rename *ren = &req->rename;
 +      int ret;
  
        if (force_nonblock)
                return -EAGAIN;
 -      if (sp->len)
 -              ret = do_tee(in, out, sp->len, flags);
  
 -      io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
 -      req->flags &= ~REQ_F_NEED_CLEANUP;
 +      ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
 +                              ren->newpath, ren->flags);
  
 -      if (ret != sp->len)
 +      req->flags &= ~REQ_F_NEED_CLEANUP;
 +      if (ret < 0)
                req_set_fail_links(req);
        io_req_complete(req, ret);
        return 0;
  }
  
 -static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 +static int io_unlinkat_prep(struct io_kiocb *req,
 +                          const struct io_uring_sqe *sqe)
  {
 -      struct io_splice* sp = &req->splice;
 +      struct io_unlink *un = &req->unlink;
 +      const char __user *fname;
  
 -      sp->off_in = READ_ONCE(sqe->splice_off_in);
 -      sp->off_out = READ_ONCE(sqe->off);
 -      return __io_splice_prep(req, sqe);
 +      if (unlikely(req->flags & REQ_F_FIXED_FILE))
 +              return -EBADF;
 +
 +      un->dfd = READ_ONCE(sqe->fd);
 +
 +      un->flags = READ_ONCE(sqe->unlink_flags);
 +      if (un->flags & ~AT_REMOVEDIR)
 +              return -EINVAL;
 +
 +      fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
 +      un->filename = getname(fname);
 +      if (IS_ERR(un->filename))
 +              return PTR_ERR(un->filename);
 +
 +      req->flags |= REQ_F_NEED_CLEANUP;
 +      return 0;
  }
  
 -static int io_splice(struct io_kiocb *req, bool force_nonblock)
 +static int io_unlinkat(struct io_kiocb *req, bool force_nonblock)
 +{
 +      struct io_unlink *un = &req->unlink;
 +      int ret;
 +
 +      if (force_nonblock)
 +              return -EAGAIN;
 +
 +      if (un->flags & AT_REMOVEDIR)
 +              ret = do_rmdir(un->dfd, un->filename);
 +      else
 +              ret = do_unlinkat(un->dfd, un->filename);
 +
 +      req->flags &= ~REQ_F_NEED_CLEANUP;
 +      if (ret < 0)
 +              req_set_fail_links(req);
 +      io_req_complete(req, ret);
 +      return 0;
 +}
 +
 +static int io_shutdown_prep(struct io_kiocb *req,
 +                          const struct io_uring_sqe *sqe)
 +{
 +#if defined(CONFIG_NET)
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 +              return -EINVAL;
 +      if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
 +          sqe->buf_index)
 +              return -EINVAL;
 +
 +      req->shutdown.how = READ_ONCE(sqe->len);
 +      return 0;
 +#else
 +      return -EOPNOTSUPP;
 +#endif
 +}
 +
 +static int io_shutdown(struct io_kiocb *req, bool force_nonblock)
 +{
 +#if defined(CONFIG_NET)
 +      struct socket *sock;
 +      int ret;
 +
 +      if (force_nonblock)
 +              return -EAGAIN;
 +
 +      sock = sock_from_file(req->file);
 +      if (unlikely(!sock))
 +              return -ENOTSOCK;
 +
 +      ret = __sys_shutdown_sock(sock, req->shutdown.how);
 +      io_req_complete(req, ret);
 +      return 0;
 +#else
 +      return -EOPNOTSUPP;
 +#endif
 +}
 +
 +static int __io_splice_prep(struct io_kiocb *req,
 +                          const struct io_uring_sqe *sqe)
 +{
 +      struct io_splice* sp = &req->splice;
 +      unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
 +
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 +              return -EINVAL;
 +
 +      sp->file_in = NULL;
 +      sp->len = READ_ONCE(sqe->len);
 +      sp->flags = READ_ONCE(sqe->splice_flags);
 +
 +      if (unlikely(sp->flags & ~valid_flags))
 +              return -EINVAL;
 +
 +      sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
 +                                (sp->flags & SPLICE_F_FD_IN_FIXED));
 +      if (!sp->file_in)
 +              return -EBADF;
 +      req->flags |= REQ_F_NEED_CLEANUP;
 +
 +      if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
 +              /*
 +               * Splice operations will be punted async, and io_wq_work.flags
 +               * needs to be modified here, so initialize io_wq_work first.
 +               */
 +              io_req_init_async(req);
 +              req->work.flags |= IO_WQ_WORK_UNBOUND;
 +      }
 +
 +      return 0;
 +}
 +
 +static int io_tee_prep(struct io_kiocb *req,
 +                     const struct io_uring_sqe *sqe)
 +{
 +      if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
 +              return -EINVAL;
 +      return __io_splice_prep(req, sqe);
 +}
 +
 +static int io_tee(struct io_kiocb *req, bool force_nonblock)
 +{
 +      struct io_splice *sp = &req->splice;
 +      struct file *in = sp->file_in;
 +      struct file *out = sp->file_out;
 +      unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 +      long ret = 0;
 +
 +      if (force_nonblock)
 +              return -EAGAIN;
 +      if (sp->len)
 +              ret = do_tee(in, out, sp->len, flags);
 +
 +      io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
 +      req->flags &= ~REQ_F_NEED_CLEANUP;
 +
 +      if (ret != sp->len)
 +              req_set_fail_links(req);
 +      io_req_complete(req, ret);
 +      return 0;
 +}
 +
 +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 +{
 +      struct io_splice* sp = &req->splice;
 +
 +      sp->off_in = READ_ONCE(sqe->splice_off_in);
 +      sp->off_out = READ_ONCE(sqe->off);
 +      return __io_splice_prep(req, sqe);
 +}
 +
 +static int io_splice(struct io_kiocb *req, bool force_nonblock)
  {
        struct io_splice *sp = &req->splice;
        struct file *in = sp->file_in;
@@@ -3998,7 -3793,6 +3996,7 @@@ static int __io_openat_prep(struct io_k
                return ret;
        }
        req->open.nofile = rlimit(RLIMIT_NOFILE);
 +      req->open.ignore_nonblock = false;
        req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
  }
@@@ -4007,7 -3801,7 +4005,7 @@@ static int io_openat_prep(struct io_kio
  {
        u64 flags, mode;
  
 -      if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        mode = READ_ONCE(sqe->len);
        flags = READ_ONCE(sqe->open_flags);
@@@ -4021,7 -3815,7 +4019,7 @@@ static int io_openat2_prep(struct io_ki
        size_t len;
        int ret;
  
 -      if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
        len = READ_ONCE(sqe->len);
@@@ -4042,7 -3836,7 +4040,7 @@@ static int io_openat2(struct io_kiocb *
        struct file *file;
        int ret;
  
 -      if (force_nonblock)
 +      if (force_nonblock && !req->open.ignore_nonblock)
                return -EAGAIN;
  
        ret = build_open_flags(&req->open.how, &op);
        if (IS_ERR(file)) {
                put_unused_fd(ret);
                ret = PTR_ERR(file);
 +              /*
 +               * A work-around to ensure that /proc/self works the way
 +               * that it should - if we get -EOPNOTSUPP back, then assume
 +               * that proc_self_get_link() failed us because we're in async
 +               * context. We should be safe to retry this from the task
 +               * itself with force_nonblock == false set, as it should not
 +               * block on lookup. Would be nice to know this upfront and
 +               * avoid the async dance, but doesn't seem feasible.
 +               */
 +              if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) {
 +                      req->open.ignore_nonblock = true;
 +                      refcount_inc(&req->refs);
 +                      io_req_task_queue(req);
 +                      return 0;
 +              }
        } else {
                fsnotify_open(file);
                fd_install(ret, file);
@@@ -4151,17 -3930,11 +4149,17 @@@ static int io_remove_buffers(struct io_
        head = idr_find(&ctx->io_buffer_idr, p->bgid);
        if (head)
                ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
 -
 -      io_ring_submit_lock(ctx, !force_nonblock);
        if (ret < 0)
                req_set_fail_links(req);
 -      __io_req_complete(req, ret, 0, cs);
 +
 +      /* need to hold the lock to complete IOPOLL requests */
 +      if (ctx->flags & IORING_SETUP_IOPOLL) {
 +              __io_req_complete(req, ret, 0, cs);
 +              io_ring_submit_unlock(ctx, !force_nonblock);
 +      } else {
 +              io_ring_submit_unlock(ctx, !force_nonblock);
 +              __io_req_complete(req, ret, 0, cs);
 +      }
        return 0;
  }
  
@@@ -4246,17 -4019,10 +4244,17 @@@ static int io_provide_buffers(struct io
                }
        }
  out:
 -      io_ring_submit_unlock(ctx, !force_nonblock);
        if (ret < 0)
                req_set_fail_links(req);
 -      __io_req_complete(req, ret, 0, cs);
 +
 +      /* need to hold the lock to complete IOPOLL requests */
 +      if (ctx->flags & IORING_SETUP_IOPOLL) {
 +              __io_req_complete(req, ret, 0, cs);
 +              io_ring_submit_unlock(ctx, !force_nonblock);
 +      } else {
 +              io_ring_submit_unlock(ctx, !force_nonblock);
 +              __io_req_complete(req, ret, 0, cs);
 +      }
        return 0;
  }
  
@@@ -4428,7 -4194,7 +4426,7 @@@ static int io_close_prep(struct io_kioc
        io_req_init_async(req);
        req->work.flags |= IO_WQ_WORK_NO_CANCEL;
  
 -      if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
            sqe->rw_flags || sqe->buf_index)
@@@ -4452,7 -4218,7 +4450,7 @@@ static int io_close(struct io_kiocb *re
  
        /* might be already done during nonblock submission */
        if (!close->put_file) {
 -              ret = __close_fd_get_file(close->fd, &close->put_file);
 +              ret = close_fd_get_file(close->fd, &close->put_file);
                if (ret < 0)
                        return (ret == -ENOENT) ? -EBADF : ret;
        }
@@@ -4572,9 -4338,9 +4570,9 @@@ static int io_sendmsg(struct io_kiocb *
        unsigned flags;
        int ret;
  
 -      sock = sock_from_file(req->file, &ret);
 +      sock = sock_from_file(req->file);
        if (unlikely(!sock))
 -              return ret;
 +              return -ENOTSOCK;
  
        if (req->async_data) {
                kmsg = req->async_data;
@@@ -4621,9 -4387,9 +4619,9 @@@ static int io_send(struct io_kiocb *req
        unsigned flags;
        int ret;
  
 -      sock = sock_from_file(req->file, &ret);
 +      sock = sock_from_file(req->file);
        if (unlikely(!sock))
 -              return ret;
 +              return -ENOTSOCK;
  
        ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
        if (unlikely(ret))
@@@ -4715,8 -4481,7 +4713,8 @@@ static int __io_compat_recvmsg_copy_hdr
                        return -EFAULT;
                if (clen < 0)
                        return -EINVAL;
 -              sr->len = iomsg->iov[0].iov_len;
 +              sr->len = clen;
 +              iomsg->iov[0].iov_len = clen;
                iomsg->iov = NULL;
        } else {
                ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
@@@ -4801,9 -4566,9 +4799,9 @@@ static int io_recvmsg(struct io_kiocb *
        unsigned flags;
        int ret, cflags = 0;
  
 -      sock = sock_from_file(req->file, &ret);
 +      sock = sock_from_file(req->file);
        if (unlikely(!sock))
 -              return ret;
 +              return -ENOTSOCK;
  
        if (req->async_data) {
                kmsg = req->async_data;
@@@ -4864,9 -4629,9 +4862,9 @@@ static int io_recv(struct io_kiocb *req
        unsigned flags;
        int ret, cflags = 0;
  
 -      sock = sock_from_file(req->file, &ret);
 +      sock = sock_from_file(req->file);
        if (unlikely(!sock))
 -              return ret;
 +              return -ENOTSOCK;
  
        if (req->flags & REQ_F_BUFFER_SELECT) {
                kbuf = io_recv_buffer_select(req, !force_nonblock);
@@@ -4910,7 -4675,7 +4908,7 @@@ static int io_accept_prep(struct io_kio
  {
        struct io_accept *accept = &req->accept;
  
 -      if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->len || sqe->buf_index)
                return -EINVAL;
@@@ -4951,7 -4716,7 +4949,7 @@@ static int io_connect_prep(struct io_ki
        struct io_connect *conn = &req->connect;
        struct io_async_connect *io = req->async_data;
  
 -      if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
                return -EINVAL;
@@@ -5075,6 -4840,7 +5073,6 @@@ struct io_poll_table 
  static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
                           __poll_t mask, task_work_func_t func)
  {
 -      bool twa_signal_ok;
        int ret;
  
        /* for instances that support it check for an event match first: */
        percpu_ref_get(&req->ctx->refs);
  
        /*
 -       * If we using the signalfd wait_queue_head for this wakeup, then
 -       * it's not safe to use TWA_SIGNAL as we could be recursing on the
 -       * tsk->sighand->siglock on doing the wakeup. Should not be needed
 -       * either, as the normal wakeup will suffice.
 -       */
 -      twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);
 -
 -      /*
         * If this fails, then the task is exiting. When a task exits, the
         * work gets canceled, so just cancel this request as well instead
         * of executing it. We can't safely execute it anyway, as we may not
         * have the needed state needed for it anyway.
         */
 -      ret = io_req_task_work_add(req, twa_signal_ok);
 +      ret = io_req_task_work_add(req);
        if (unlikely(ret)) {
                struct task_struct *tsk;
  
@@@ -5486,8 -5260,7 +5484,8 @@@ static bool io_poll_remove_one(struct i
  /*
   * Returns true if we found and killed one or more poll requests
   */
 -static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
 +static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
 +                             struct files_struct *files)
  {
        struct hlist_node *tmp;
        struct io_kiocb *req;
  
                list = &ctx->cancel_hash[i];
                hlist_for_each_entry_safe(req, tmp, list, hash_node) {
 -                      if (io_task_match(req, tsk))
 +                      if (io_match_task(req, tsk, files))
                                posted += io_poll_remove_one(req);
                }
        }
@@@ -5537,7 -5310,7 +5535,7 @@@ static int io_poll_remove_prep(struct i
            sqe->poll_events)
                return -EINVAL;
  
 -      req->poll.addr = READ_ONCE(sqe->addr);
 +      req->poll_remove.addr = READ_ONCE(sqe->addr);
        return 0;
  }
  
  static int io_poll_remove(struct io_kiocb *req)
  {
        struct io_ring_ctx *ctx = req->ctx;
 -      u64 addr;
        int ret;
  
 -      addr = req->poll.addr;
        spin_lock_irq(&ctx->completion_lock);
 -      ret = io_poll_cancel(ctx, addr);
 +      ret = io_poll_cancel(ctx, req->poll_remove.addr);
        spin_unlock_irq(&ctx->completion_lock);
  
        if (ret < 0)
@@@ -5644,37 -5419,15 +5642,37 @@@ static enum hrtimer_restart io_timeout_
        return HRTIMER_NORESTART;
  }
  
 -static int __io_timeout_cancel(struct io_kiocb *req)
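 +/*
 + * Find a pending timeout by user_data, cancel its hrtimer and unlink it.
 + * Returns the request, or an ERR_PTR if it wasn't found or the timer has
 + * already fired.
 + */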
 +static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
 +                                         __u64 user_data)
  {
 -      struct io_timeout_data *io = req->async_data;
 -      int ret;
 +      struct io_timeout_data *io;
 +      struct io_kiocb *req;
 +      int ret = -ENOENT;
 +
 +      list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
 +              if (user_data == req->user_data) {
 +                      ret = 0;
 +                      break;
 +              }
 +      }
  
 +      if (ret == -ENOENT)
 +              return ERR_PTR(ret);
 +
 +      io = req->async_data;
        ret = hrtimer_try_to_cancel(&io->timer);
        if (ret == -1)
 -              return -EALREADY;
 +              return ERR_PTR(-EALREADY);
        list_del_init(&req->timeout.list);
 +      return req;
 +}
 +
 +static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 +{
 +      struct io_kiocb *req = io_timeout_extract(ctx, user_data);
 +
 +      if (IS_ERR(req))
 +              return PTR_ERR(req);
  
        req_set_fail_links(req);
        io_cqring_fill_event(req, -ECANCELED);
        return 0;
  }
  
 -static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
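 +/*
 + * Re-arm an existing timeout with a new expiration: pull it off the list,
 + * clear its sequence offset and restart the hrtimer in the given mode.
 + */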
 +static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 +                           struct timespec64 *ts, enum hrtimer_mode mode)
  {
 -      struct io_kiocb *req;
 -      int ret = -ENOENT;
 -
 -      list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
 -              if (user_data == req->user_data) {
 -                      ret = 0;
 -                      break;
 -              }
 -      }
 +      struct io_kiocb *req = io_timeout_extract(ctx, user_data);
 +      struct io_timeout_data *data;
  
 -      if (ret == -ENOENT)
 -              return ret;
 +      if (IS_ERR(req))
 +              return PTR_ERR(req);
  
 -      return __io_timeout_cancel(req);
 +      req->timeout.off = 0; /* noseq */
 +      data = req->async_data;
 +      list_add_tail(&req->timeout.list, &ctx->timeout_list);
 +      hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
 +      data->timer.function = io_timeout_fn;
 +      hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
 +      return 0;
  }
  
  static int io_timeout_remove_prep(struct io_kiocb *req,
                                  const struct io_uring_sqe *sqe)
  {
 +      struct io_timeout_rem *tr = &req->timeout_rem;
 +
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
                return -EINVAL;
 -      if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
 +      if (sqe->ioprio || sqe->buf_index || sqe->len)
                return -EINVAL;
  
 -      req->timeout_rem.addr = READ_ONCE(sqe->addr);
 +      tr->addr = READ_ONCE(sqe->addr);
 +      tr->flags = READ_ONCE(sqe->timeout_flags);
 +      if (tr->flags & IORING_TIMEOUT_UPDATE) {
 +              if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
 +                      return -EINVAL;
 +              if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
 +                      return -EFAULT;
 +      } else if (tr->flags) {
 +              /* timeout removal doesn't support flags */
 +              return -EINVAL;
 +      }
 +
        return 0;
  }
  
   */
  static int io_timeout_remove(struct io_kiocb *req)
  {
 +      struct io_timeout_rem *tr = &req->timeout_rem;
        struct io_ring_ctx *ctx = req->ctx;
        int ret;
  
        spin_lock_irq(&ctx->completion_lock);
 -      ret = io_timeout_cancel(ctx, req->timeout_rem.addr);
 +      if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) {
 +              enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS)
 +                                      ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
 +
 +              ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
 +      } else {
 +              ret = io_timeout_cancel(ctx, tr->addr);
 +      }
  
        io_cqring_fill_event(req, ret);
        io_commit_cqring(ctx);
@@@ -6024,12 -5756,6 +6022,12 @@@ static int io_req_prep(struct io_kiocb 
                return io_remove_buffers_prep(req, sqe);
        case IORING_OP_TEE:
                return io_tee_prep(req, sqe);
 +      case IORING_OP_SHUTDOWN:
 +              return io_shutdown_prep(req, sqe);
 +      case IORING_OP_RENAMEAT:
 +              return io_renameat_prep(req, sqe);
 +      case IORING_OP_UNLINKAT:
 +              return io_unlinkat_prep(req, sqe);
        }
  
        printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@@ -6051,10 -5777,11 +6049,10 @@@ static u32 io_get_sequence(struct io_ki
  {
        struct io_kiocb *pos;
        struct io_ring_ctx *ctx = req->ctx;
 -      u32 total_submitted, nr_reqs = 1;
 +      u32 total_submitted, nr_reqs = 0;
  
 -      if (req->flags & REQ_F_LINK_HEAD)
 -              list_for_each_entry(pos, &req->link_list, link_list)
 -                      nr_reqs++;
 +      io_for_each_link(pos, req)
 +              nr_reqs++;
  
        total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
        return total_submitted - nr_reqs;
@@@ -6106,13 -5833,12 +6104,13 @@@ static int io_req_defer(struct io_kioc
  static void io_req_drop_files(struct io_kiocb *req)
  {
        struct io_ring_ctx *ctx = req->ctx;
 +      struct io_uring_task *tctx = req->task->io_uring;
        unsigned long flags;
  
        spin_lock_irqsave(&ctx->inflight_lock, flags);
        list_del(&req->inflight_entry);
 -      if (waitqueue_active(&ctx->inflight_wait))
 -              wake_up(&ctx->inflight_wait);
 +      if (atomic_read(&tctx->in_idle))
 +              wake_up(&tctx->wait);
        spin_unlock_irqrestore(&ctx->inflight_lock, flags);
        req->flags &= ~REQ_F_INFLIGHT;
        put_files_struct(req->work.identity->files);
@@@ -6167,13 -5893,6 +6165,13 @@@ static void __io_clean_op(struct io_kio
                        if (req->open.filename)
                                putname(req->open.filename);
                        break;
 +              case IORING_OP_RENAMEAT:
 +                      putname(req->rename.oldpath);
 +                      putname(req->rename.newpath);
 +                      break;
 +              case IORING_OP_UNLINKAT:
 +                      putname(req->unlink.filename);
 +                      break;
                }
                req->flags &= ~REQ_F_NEED_CLEANUP;
        }
@@@ -6280,15 -5999,6 +6278,15 @@@ static int io_issue_sqe(struct io_kioc
        case IORING_OP_TEE:
                ret = io_tee(req, force_nonblock);
                break;
 +      case IORING_OP_SHUTDOWN:
 +              ret = io_shutdown(req, force_nonblock);
 +              break;
 +      case IORING_OP_RENAMEAT:
 +              ret = io_renameat(req, force_nonblock);
 +              break;
 +      case IORING_OP_UNLINKAT:
 +              ret = io_unlinkat(req, force_nonblock);
 +              break;
        default:
                ret = -EINVAL;
                break;
                if (in_async)
                        mutex_lock(&ctx->uring_lock);
  
 -              io_iopoll_req_issued(req);
 +              io_iopoll_req_issued(req, in_async);
  
                if (in_async)
                        mutex_unlock(&ctx->uring_lock);
@@@ -6345,19 -6055,8 +6343,19 @@@ static struct io_wq_work *io_wq_submit_
        }
  
        if (ret) {
 -              req_set_fail_links(req);
 -              io_req_complete(req, ret);
 +              /*
 +               * io_iopoll_complete() does not hold completion_lock to complete
 +               * polled io, so for polled io just mark it done here and let
 +               * io_iopoll_complete() finish it.
 +               */
 +              if (req->ctx->flags & IORING_SETUP_IOPOLL) {
 +                      struct kiocb *kiocb = &req->rw.kiocb;
 +
 +                      kiocb_done(kiocb, ret, NULL);
 +              } else {
 +                      req_set_fail_links(req);
 +                      io_req_complete(req, ret);
 +              }
        }
  
        return io_steal_work(req);
@@@ -6383,7 -6082,10 +6381,7 @@@ static struct file *io_file_get(struct 
                        return NULL;
                fd = array_index_nospec(fd, ctx->nr_user_files);
                file = io_file_from_index(ctx, fd);
 -              if (file) {
 -                      req->fixed_file_refs = &ctx->file_data->node->refs;
 -                      percpu_ref_get(req->fixed_file_refs);
 -              }
 +              io_set_resource_node(req);
        } else {
                trace_io_uring_file_get(ctx, fd);
                file = __io_file_get(state, fd);
        return file;
  }
  
 -static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
 -                         int fd)
 -{
 -      bool fixed;
 -
 -      fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
 -      if (unlikely(!fixed && io_async_submit(req->ctx)))
 -              return -EBADF;
 -
 -      req->file = io_file_get(state, req, fd, fixed);
 -      if (req->file || io_op_defs[req->opcode].needs_file_no_error)
 -              return 0;
 -      return -EBADF;
 -}
 -
  static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
  {
        struct io_timeout_data *data = container_of(timer,
                                                struct io_timeout_data, timer);
 -      struct io_kiocb *req = data->req;
 +      struct io_kiocb *prev, *req = data->req;
        struct io_ring_ctx *ctx = req->ctx;
 -      struct io_kiocb *prev = NULL;
        unsigned long flags;
  
        spin_lock_irqsave(&ctx->completion_lock, flags);
 +      prev = req->timeout.head;
 +      req->timeout.head = NULL;
  
        /*
         * We don't expect the list to be empty, that will only happen if we
         * race with the completion of the linked work.
         */
 -      if (!list_empty(&req->link_list)) {
 -              prev = list_entry(req->link_list.prev, struct io_kiocb,
 -                                link_list);
 -              if (refcount_inc_not_zero(&prev->refs))
 -                      list_del_init(&req->link_list);
 -              else
 -                      prev = NULL;
 -      }
 -
 +      if (prev && refcount_inc_not_zero(&prev->refs))
 +              io_remove_next_linked(prev);
 +      else
 +              prev = NULL;
        spin_unlock_irqrestore(&ctx->completion_lock, flags);
  
        if (prev) {
  static void __io_queue_linked_timeout(struct io_kiocb *req)
  {
        /*
 -       * If the list is now empty, then our linked request finished before
 -       * we got a chance to setup the timer
 +       * If the back reference is NULL, then our linked request finished
 +       * before we got a chance to set up the timer
         */
 -      if (!list_empty(&req->link_list)) {
 +      if (req->timeout.head) {
                struct io_timeout_data *data = req->async_data;
  
                data->timer.function = io_link_timeout_fn;
@@@ -6453,13 -6174,18 +6451,13 @@@ static void io_queue_linked_timeout(str
  
  static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
  {
 -      struct io_kiocb *nxt;
 +      struct io_kiocb *nxt = req->link;
  
 -      if (!(req->flags & REQ_F_LINK_HEAD))
 -              return NULL;
 -      if (req->flags & REQ_F_LINK_TIMEOUT)
 -              return NULL;
 -
 -      nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
 -                                      link_list);
 -      if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
 +      if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
 +          nxt->opcode != IORING_OP_LINK_TIMEOUT)
                return NULL;
  
 +      nxt->timeout.head = req;
        nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
        req->flags |= REQ_F_LINK_TIMEOUT;
        return nxt;
@@@ -6565,13 -6291,8 +6563,13 @@@ static inline void io_queue_link_head(s
                io_queue_sqe(req, NULL, cs);
  }
  
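 +/*
 + * Tracks an SQE link chain being built during submission: head is the first
 + * request in the chain, last is where the next linked request is appended.
 + */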
 +struct io_submit_link {
 +      struct io_kiocb *head;
 +      struct io_kiocb *last;
 +};
 +
  static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 -                       struct io_kiocb **link, struct io_comp_state *cs)
 +                       struct io_submit_link *link, struct io_comp_state *cs)
  {
        struct io_ring_ctx *ctx = req->ctx;
        int ret;
         * submitted sync once the chain is complete. If none of those
         * conditions are true (normal request), then just queue it.
         */
 -      if (*link) {
 -              struct io_kiocb *head = *link;
 +      if (link->head) {
 +              struct io_kiocb *head = link->head;
  
                /*
                 * Taking sequential execution of a link, draining both sides
                        return ret;
                }
                trace_io_uring_link(ctx, req, head);
 -              list_add_tail(&req->link_list, &head->link_list);
 +              link->last->link = req;
 +              link->last = req;
  
                /* last request of a link, enqueue the link */
                if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
                        io_queue_link_head(head, cs);
 -                      *link = NULL;
 +                      link->head = NULL;
                }
        } else {
                if (unlikely(ctx->drain_next)) {
                        ctx->drain_next = 0;
                }
                if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
 -                      req->flags |= REQ_F_LINK_HEAD;
 -                      INIT_LIST_HEAD(&req->link_list);
 -
                        ret = io_req_defer_prep(req, sqe);
                        if (unlikely(ret))
                                req->flags |= REQ_F_FAIL_LINK;
 -                      *link = req;
 +                      link->head = req;
 +                      link->last = req;
                } else {
                        io_queue_sqe(req, sqe, cs);
                }
@@@ -6638,8 -6360,7 +6636,8 @@@ static void io_submit_state_end(struct 
  {
        if (!list_empty(&state->comp.list))
                io_submit_flush_completions(&state->comp);
 -      blk_finish_plug(&state->plug);
 +      if (state->plug_started)
 +              blk_finish_plug(&state->plug);
        io_state_file_put(state);
        if (state->free_reqs)
                kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
  static void io_submit_state_start(struct io_submit_state *state,
                                  struct io_ring_ctx *ctx, unsigned int max_ios)
  {
 -      blk_start_plug(&state->plug);
 +      state->plug_started = false;
        state->comp.nr = 0;
        INIT_LIST_HEAD(&state->comp.list);
        state->comp.ctx = ctx;
        state->free_reqs = 0;
 -      state->file = NULL;
 +      state->file_refs = 0;
        state->ios_left = max_ios;
  }
  
@@@ -6751,8 -6472,6 +6749,8 @@@ static int io_init_req(struct io_ring_c
        req->file = NULL;
        req->ctx = ctx;
        req->flags = 0;
 +      req->link = NULL;
 +      req->fixed_file_refs = NULL;
        /* one is dropped after submission, the other at completion */
        refcount_set(&req->refs, 2);
        req->task = current;
        if (unlikely(req->opcode >= IORING_OP_LAST))
                return -EINVAL;
  
 -      if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
 +      if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
                return -EFAULT;
  
        sqe_flags = READ_ONCE(sqe->flags);
        /* same numerical values with corresponding REQ_F_*, safe to copy */
        req->flags |= sqe_flags;
  
 -      if (!io_op_defs[req->opcode].needs_file)
 -              return 0;
 +      /*
 +       * Plug now if we have more than 1 IO left after this, and the target
 +       * is potentially a read/write to block based storage.
 +       */
 +      if (!state->plug_started && state->ios_left > 1 &&
 +          io_op_defs[req->opcode].plug) {
 +              blk_start_plug(&state->plug);
 +              state->plug_started = true;
 +      }
 +
 +      ret = 0;
 +      if (io_op_defs[req->opcode].needs_file) {
 +              bool fixed = req->flags & REQ_F_FIXED_FILE;
 +
 +              req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
 +              if (unlikely(!req->file &&
 +                  !io_op_defs[req->opcode].needs_file_no_error))
 +                      ret = -EBADF;
 +      }
  
 -      ret = io_req_set_file(state, req, READ_ONCE(sqe->fd));
        state->ios_left--;
        return ret;
  }
  static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
  {
        struct io_submit_state state;
 -      struct io_kiocb *link = NULL;
 +      struct io_submit_link link;
        int i, submitted = 0;
  
        /* if we have a backlog and couldn't flush it all, return BUSY */
        refcount_add(nr, &current->usage);
  
        io_submit_state_start(&state, ctx, nr);
 +      link.head = NULL;
  
        for (i = 0; i < nr; i++) {
                const struct io_uring_sqe *sqe;
@@@ -6887,8 -6589,8 +6885,8 @@@ fail_req
                percpu_counter_sub(&tctx->inflight, unused);
                put_task_struct_many(current, unused);
        }
 -      if (link)
 -              io_queue_link_head(link, &state.comp);
 +      if (link.head)
 +              io_queue_link_head(link.head, &state.comp);
        io_submit_state_end(&state);
  
         /* Commit SQ ring head once we've consumed and submitted all SQEs */
@@@ -6912,45 -6614,111 +6910,45 @@@ static inline void io_ring_clear_wakeup
        spin_unlock_irq(&ctx->completion_lock);
  }
  
 -static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode,
 -                             int sync, void *key)
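 +/*
 + * Run one submission pass for a ring: reap any pending iopoll completions
 + * and submit new SQEs, returning the number submitted (or an error).
 + */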
 +static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
  {
 -      struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry);
 -      int ret;
 -
 -      ret = autoremove_wake_function(wqe, mode, sync, key);
 -      if (ret) {
 -              unsigned long flags;
 -
 -              spin_lock_irqsave(&ctx->completion_lock, flags);
 -              ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
 -              spin_unlock_irqrestore(&ctx->completion_lock, flags);
 -      }
 -      return ret;
 -}
 -
 -enum sq_ret {
 -      SQT_IDLE        = 1,
 -      SQT_SPIN        = 2,
 -      SQT_DID_WORK    = 4,
 -};
 -
 -static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx,
 -                                unsigned long start_jiffies, bool cap_entries)
 -{
 -      unsigned long timeout = start_jiffies + ctx->sq_thread_idle;
 -      struct io_sq_data *sqd = ctx->sq_data;
        unsigned int to_submit;
        int ret = 0;
  
 -again:
 -      if (!list_empty(&ctx->iopoll_list)) {
 +      to_submit = io_sqring_entries(ctx);
 +      /* if we're handling multiple rings, cap submit size for fairness */
 +      if (cap_entries && to_submit > 8)
 +              to_submit = 8;
 +
 +      if (!list_empty(&ctx->iopoll_list) || to_submit) {
                unsigned nr_events = 0;
  
                mutex_lock(&ctx->uring_lock);
 -              if (!list_empty(&ctx->iopoll_list) && !need_resched())
 +              if (!list_empty(&ctx->iopoll_list))
                        io_do_iopoll(ctx, &nr_events, 0);
 +
 +              if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)))
 +                      ret = io_submit_sqes(ctx, to_submit);
                mutex_unlock(&ctx->uring_lock);
        }
  
 -      to_submit = io_sqring_entries(ctx);
 -
 -      /*
 -       * If submit got -EBUSY, flag us as needing the application
 -       * to enter the kernel to reap and flush events.
 -       */
 -      if (!to_submit || ret == -EBUSY || need_resched()) {
 -              /*
 -               * Drop cur_mm before scheduling, we can't hold it for
 -               * long periods (or over schedule()). Do this before
 -               * adding ourselves to the waitqueue, as the unuse/drop
 -               * may sleep.
 -               */
 -              io_sq_thread_drop_mm();
 -
 -              /*
 -               * We're polling. If we're within the defined idle
 -               * period, then let us spin without work before going
 -               * to sleep. The exception is if we got EBUSY doing
 -               * more IO, we should wait for the application to
 -               * reap events and wake us up.
 -               */
 -              if (!list_empty(&ctx->iopoll_list) || need_resched() ||
 -                  (!time_after(jiffies, timeout) && ret != -EBUSY &&
 -                  !percpu_ref_is_dying(&ctx->refs)))
 -                      return SQT_SPIN;
 +      if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
 +              wake_up(&ctx->sqo_sq_wait);
  
 -              prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry,
 -                                      TASK_INTERRUPTIBLE);
 +      return ret;
 +}
  
 -              /*
 -               * While doing polled IO, before going to sleep, we need
 -               * to check if there are new reqs added to iopoll_list,
 -               * it is because reqs may have been punted to io worker
 -               * and will be added to iopoll_list later, hence check
 -               * the iopoll_list again.
 -               */
 -              if ((ctx->flags & IORING_SETUP_IOPOLL) &&
 -                  !list_empty_careful(&ctx->iopoll_list)) {
 -                      finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
 -                      goto again;
 -              }
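 +/*
 + * The sq thread may serve several rings; use the largest of their
 + * requested idle periods as the shared idle timeout.
 + */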
 +static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
 +{
 +      struct io_ring_ctx *ctx;
 +      unsigned sq_thread_idle = 0;
  
 -              to_submit = io_sqring_entries(ctx);
 -              if (!to_submit || ret == -EBUSY)
 -                      return SQT_IDLE;
 +      list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
 +              if (sq_thread_idle < ctx->sq_thread_idle)
 +                      sq_thread_idle = ctx->sq_thread_idle;
        }
  
 -      finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
 -      io_ring_clear_wakeup_flag(ctx);
 -
 -      /* if we're handling multiple rings, cap submit size for fairness */
 -      if (cap_entries && to_submit > 8)
 -              to_submit = 8;
 -
 -      mutex_lock(&ctx->uring_lock);
 -      if (likely(!percpu_ref_is_dying(&ctx->refs)))
 -              ret = io_submit_sqes(ctx, to_submit);
 -      mutex_unlock(&ctx->uring_lock);
 -
 -      if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
 -              wake_up(&ctx->sqo_sq_wait);
 -
 -      return SQT_DID_WORK;
 +      sqd->sq_thread_idle = sq_thread_idle;
  }
  
  static void io_sqd_init_new(struct io_sq_data *sqd)
  
        while (!list_empty(&sqd->ctx_new_list)) {
                ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
 -              init_wait(&ctx->sqo_wait_entry);
 -              ctx->sqo_wait_entry.func = io_sq_wake_function;
                list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
                complete(&ctx->sq_thread_comp);
        }
 +
 +      io_sqd_update_thread_idle(sqd);
  }
  
  static int io_sq_thread(void *data)
  {
        struct cgroup_subsys_state *cur_css = NULL;
 +      struct files_struct *old_files = current->files;
 +      struct nsproxy *old_nsproxy = current->nsproxy;
        const struct cred *old_cred = NULL;
        struct io_sq_data *sqd = data;
        struct io_ring_ctx *ctx;
 -      unsigned long start_jiffies;
 +      unsigned long timeout = 0;
 +      DEFINE_WAIT(wait);
 +
 +      task_lock(current);
 +      current->files = NULL;
 +      current->nsproxy = NULL;
 +      task_unlock(current);
  
 -      start_jiffies = jiffies;
        while (!kthread_should_stop()) {
 -              enum sq_ret ret = 0;
 -              bool cap_entries;
 +              int ret;
 +              bool cap_entries, sqt_spin, needs_sched;
  
                /*
                 * Any changes to the sqd lists are synchronized through the
                 * kthread parking. This synchronizes the thread vs users,
                 * the users are synchronized on the sqd->ctx_lock.
                 */
 -              if (kthread_should_park())
 +              if (kthread_should_park()) {
                        kthread_parkme();
 +                      /*
 +                       * When the sq thread is unparked, the previous park operation
 +                       * may have come from io_put_sq_data(), which means the sq thread
 +                       * is about to be stopped, so check for that here.
 +                       */
 +                      if (kthread_should_stop())
 +                              break;
 +              }
  
 -              if (unlikely(!list_empty(&sqd->ctx_new_list)))
 +              if (unlikely(!list_empty(&sqd->ctx_new_list))) {
                        io_sqd_init_new(sqd);
 +                      timeout = jiffies + sqd->sq_thread_idle;
 +              }
  
 +              sqt_spin = false;
                cap_entries = !list_is_singular(&sqd->ctx_list);
 -
                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
                        if (current->cred != ctx->creds) {
                                if (old_cred)
                        current->sessionid = ctx->sessionid;
  #endif
  
 -                      ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);
 +                      ret = __io_sq_thread(ctx, cap_entries);
 +                      if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
 +                              sqt_spin = true;
  
 -                      io_sq_thread_drop_mm();
 +                      io_sq_thread_drop_mm_files();
                }
  
 -              if (ret & SQT_SPIN) {
 +              if (sqt_spin || !time_after(jiffies, timeout)) {
                        io_run_task_work();
                        cond_resched();
 -              } else if (ret == SQT_IDLE) {
 -                      if (kthread_should_park())
 -                              continue;
 +                      if (sqt_spin)
 +                              timeout = jiffies + sqd->sq_thread_idle;
 +                      continue;
 +              }
 +
 +              if (kthread_should_park())
 +                      continue;
 +
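 +              /* only sleep if no attached ring has iopoll work or pending SQEs */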
 +              needs_sched = true;
 +              prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
 +              list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
 +                      if ((ctx->flags & IORING_SETUP_IOPOLL) &&
 +                          !list_empty_careful(&ctx->iopoll_list)) {
 +                              needs_sched = false;
 +                              break;
 +                      }
 +                      if (io_sqring_entries(ctx)) {
 +                              needs_sched = false;
 +                              break;
 +                      }
 +              }
 +
 +              if (needs_sched) {
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_ring_set_wakeup_flag(ctx);
 +
                        schedule();
 -                      start_jiffies = jiffies;
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_ring_clear_wakeup_flag(ctx);
                }
 +
 +              finish_wait(&sqd->wait, &wait);
 +              timeout = jiffies + sqd->sq_thread_idle;
        }
  
        io_run_task_work();
        if (old_cred)
                revert_creds(old_cred);
  
 +      task_lock(current);
 +      current->files = old_files;
 +      current->nsproxy = old_nsproxy;
 +      task_unlock(current);
 +
        kthread_parkme();
  
        return 0;
@@@ -7122,8 -6843,13 +7120,8 @@@ static int io_run_task_work_sig(void
                return 1;
        if (!signal_pending(current))
                return 0;
 -      if (current->jobctl & JOBCTL_TASK_WORK) {
 -              spin_lock_irq(&current->sighand->siglock);
 -              current->jobctl &= ~JOBCTL_TASK_WORK;
 -              recalc_sigpending();
 -              spin_unlock_irq(&current->sighand->siglock);
 -              return 1;
 -      }
 +      if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
 +              return -ERESTARTSYS;
        return -EINTR;
  }
  
   * application must reap them itself, as they reside on the shared cq ring.
   */
  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 -                        const sigset_t __user *sig, size_t sigsz)
 +                        const sigset_t __user *sig, size_t sigsz,
 +                        struct __kernel_timespec __user *uts)
  {
        struct io_wait_queue iowq = {
                .wq = {
                .to_wait        = min_events,
        };
        struct io_rings *rings = ctx->rings;
 +      struct timespec64 ts;
 +      signed long timeout = 0;
        int ret = 0;
  
        do {
                        return ret;
        }
  
 +      if (uts) {
 +              if (get_timespec64(&ts, uts))
 +                      return -EFAULT;
 +              timeout = timespec64_to_jiffies(&ts);
 +      }
 +
        iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
        trace_io_uring_cqring_wait(ctx, min_events);
        do {
                        break;
                if (io_should_wake(&iowq, false))
                        break;
 -              schedule();
 +              if (uts) {
 +                      timeout = schedule_timeout(timeout);
 +                      if (timeout == 0) {
 +                              ret = -ETIME;
 +                              break;
 +                      }
 +              } else {
 +                      schedule();
 +              }
        } while (1);
        finish_wait(&ctx->wait, &iowq.wq);
  
@@@ -7245,9 -6954,11 +7243,9 @@@ static int io_sqe_files_unregister(stru
        if (!data)
                return -ENXIO;
  
 -      spin_lock(&data->lock);
 -      if (!list_empty(&data->ref_list))
 -              ref_node = list_first_entry(&data->ref_list,
 -                              struct fixed_file_ref_node, node);
 -      spin_unlock(&data->lock);
 +      spin_lock_bh(&data->lock);
 +      ref_node = data->node;
 +      spin_unlock_bh(&data->lock);
        if (ref_node)
                percpu_ref_kill(&ref_node->refs);
  
@@@ -7370,11 -7081,12 +7368,11 @@@ static void io_sq_thread_stop(struct io
  
                mutex_lock(&sqd->ctx_lock);
                list_del(&ctx->sqd_list);
 +              io_sqd_update_thread_idle(sqd);
                mutex_unlock(&sqd->ctx_lock);
  
 -              if (sqd->thread) {
 -                      finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
 +              if (sqd->thread)
                        io_sq_thread_unpark(sqd);
 -              }
  
                io_put_sq_data(sqd);
                ctx->sq_data = NULL;
@@@ -7594,6 -7306,10 +7592,6 @@@ static void __io_file_put_work(struct f
                kfree(pfile);
        }
  
 -      spin_lock(&file_data->lock);
 -      list_del(&ref_node->node);
 -      spin_unlock(&file_data->lock);
 -
        percpu_ref_exit(&ref_node->refs);
        kfree(ref_node);
        percpu_ref_put(&file_data->refs);
@@@ -7620,32 -7336,17 +7618,32 @@@ static void io_file_put_work(struct wor
  static void io_file_data_ref_zero(struct percpu_ref *ref)
  {
        struct fixed_file_ref_node *ref_node;
 +      struct fixed_file_data *data;
        struct io_ring_ctx *ctx;
 -      bool first_add;
 +      bool first_add = false;
        int delay = HZ;
  
        ref_node = container_of(ref, struct fixed_file_ref_node, refs);
 -      ctx = ref_node->file_data->ctx;
 +      data = ref_node->file_data;
 +      ctx = data->ctx;
 +
 +      spin_lock_bh(&data->lock);
 +      ref_node->done = true;
 +
 +      while (!list_empty(&data->ref_list)) {
 +              ref_node = list_first_entry(&data->ref_list,
 +                                      struct fixed_file_ref_node, node);
 +              /* recycle ref nodes in order */
 +              if (!ref_node->done)
 +                      break;
 +              list_del(&ref_node->node);
 +              first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist);
 +      }
 +      spin_unlock_bh(&data->lock);
  
 -      if (percpu_ref_is_dying(&ctx->file_data->refs))
 +      if (percpu_ref_is_dying(&data->refs))
                delay = 0;
  
 -      first_add = llist_add(&ref_node->llist, &ctx->file_put_llist);
        if (!delay)
                mod_delayed_work(system_wq, &ctx->file_put_work, 0);
        else if (first_add)
@@@ -7669,7 -7370,6 +7667,7 @@@ static struct fixed_file_ref_node *allo
        INIT_LIST_HEAD(&ref_node->node);
        INIT_LIST_HEAD(&ref_node->file_list);
        ref_node->file_data = ctx->file_data;
 +      ref_node->done = false;
        return ref_node;
  }
  
@@@ -7764,9 -7464,9 +7762,9 @@@ static int io_sqe_files_register(struc
        }
  
        file_data->node = ref_node;
 -      spin_lock(&file_data->lock);
 -      list_add(&ref_node->node, &file_data->ref_list);
 -      spin_unlock(&file_data->lock);
 +      spin_lock_bh(&file_data->lock);
 +      list_add_tail(&ref_node->node, &file_data->ref_list);
 +      spin_unlock_bh(&file_data->lock);
        percpu_ref_get(&file_data->refs);
        return ret;
  out_fput:
@@@ -7923,10 -7623,10 +7921,10 @@@ static int __io_sqe_files_update(struc
  
        if (needs_switch) {
                percpu_ref_kill(&data->node->refs);
 -              spin_lock(&data->lock);
 -              list_add(&ref_node->node, &data->ref_list);
 +              spin_lock_bh(&data->lock);
 +              list_add_tail(&ref_node->node, &data->ref_list);
                data->node = ref_node;
 -              spin_unlock(&data->lock);
 +              spin_unlock_bh(&data->lock);
                percpu_ref_get(&ctx->file_data->refs);
        } else
                destroy_fixed_file_ref_node(ref_node);
@@@ -8054,7 -7754,7 +8052,7 @@@ static int io_sq_offload_create(struct 
                struct io_sq_data *sqd;
  
                ret = -EPERM;
 -              if (!capable(CAP_SYS_ADMIN))
 +              if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
                        goto err;
  
                sqd = io_get_sq_data(p);
@@@ -8640,6 -8340,8 +8638,6 @@@ static void io_ring_exit_work(struct wo
         * as nobody else will be looking for them.
         */
        do {
 -              if (ctx->rings)
 -                      io_cqring_overflow_flush(ctx, true, NULL, NULL);
                io_iopoll_try_reap_events(ctx);
        } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
        io_ring_ctx_free(ctx);
@@@ -8649,17 -8351,17 +8647,17 @@@ static void io_ring_ctx_wait_and_kill(s
  {
        mutex_lock(&ctx->uring_lock);
        percpu_ref_kill(&ctx->refs);
 +      if (ctx->rings)
 +              io_cqring_overflow_flush(ctx, true, NULL, NULL);
        mutex_unlock(&ctx->uring_lock);
  
 -      io_kill_timeouts(ctx, NULL);
 -      io_poll_remove_all(ctx, NULL);
 +      io_kill_timeouts(ctx, NULL, NULL);
 +      io_poll_remove_all(ctx, NULL, NULL);
  
        if (ctx->io_wq)
                io_wq_cancel_all(ctx->io_wq);
  
        /* if we failed setting up the ctx, we might not have any rings */
 -      if (ctx->rings)
 -              io_cqring_overflow_flush(ctx, true, NULL, NULL);
        io_iopoll_try_reap_events(ctx);
        idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
  
@@@ -8690,31 -8392,120 +8688,31 @@@ static int io_uring_release(struct inod
        return 0;
  }
  
 -static bool io_wq_files_match(struct io_wq_work *work, void *data)
 -{
 -      struct files_struct *files = data;
 -
 -      return !files || ((work->flags & IO_WQ_WORK_FILES) &&
 -                              work->identity->files == files);
 -}
 -
 -/*
 - * Returns true if 'preq' is the link parent of 'req'
 - */
 -static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
 -{
 -      struct io_kiocb *link;
 -
 -      if (!(preq->flags & REQ_F_LINK_HEAD))
 -              return false;
 -
 -      list_for_each_entry(link, &preq->link_list, link_list) {
 -              if (link == req)
 -                      return true;
 -      }
 -
 -      return false;
 -}
 -
 -/*
 - * We're looking to cancel 'req' because it's holding on to our files, but
 - * 'req' could be a link to another request. See if it is, and cancel that
 - * parent request if so.
 - */
 -static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req)
 -{
 -      struct hlist_node *tmp;
 -      struct io_kiocb *preq;
 -      bool found = false;
 -      int i;
 -
 -      spin_lock_irq(&ctx->completion_lock);
 -      for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
 -              struct hlist_head *list;
 -
 -              list = &ctx->cancel_hash[i];
 -              hlist_for_each_entry_safe(preq, tmp, list, hash_node) {
 -                      found = io_match_link(preq, req);
 -                      if (found) {
 -                              io_poll_remove_one(preq);
 -                              break;
 -                      }
 -              }
 -      }
 -      spin_unlock_irq(&ctx->completion_lock);
 -      return found;
 -}
 -
 -static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
 -                                 struct io_kiocb *req)
 -{
 -      struct io_kiocb *preq;
 -      bool found = false;
 -
 -      spin_lock_irq(&ctx->completion_lock);
 -      list_for_each_entry(preq, &ctx->timeout_list, timeout.list) {
 -              found = io_match_link(preq, req);
 -              if (found) {
 -                      __io_timeout_cancel(preq);
 -                      break;
 -              }
 -      }
 -      spin_unlock_irq(&ctx->completion_lock);
 -      return found;
 -}
 +struct io_task_cancel {
 +      struct task_struct *task;
 +      struct files_struct *files;
 +};
  
 -static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
 +static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
  {
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 +      struct io_task_cancel *cancel = data;
        bool ret;
  
 -      if (req->flags & REQ_F_LINK_TIMEOUT) {
 +      if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
                unsigned long flags;
                struct io_ring_ctx *ctx = req->ctx;
  
                /* protect against races with linked timeouts */
                spin_lock_irqsave(&ctx->completion_lock, flags);
 -              ret = io_match_link(req, data);
 +              ret = io_match_task(req, cancel->task, cancel->files);
                spin_unlock_irqrestore(&ctx->completion_lock, flags);
        } else {
 -              ret = io_match_link(req, data);
 +              ret = io_match_task(req, cancel->task, cancel->files);
        }
        return ret;
  }
  
 -static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
 -{
 -      enum io_wq_cancel cret;
 -
 -      /* cancel this particular work, if it's running */
 -      cret = io_wq_cancel_work(ctx->io_wq, &req->work);
 -      if (cret != IO_WQ_CANCEL_NOTFOUND)
 -              return;
 -
 -      /* find links that hold this pending, cancel those */
 -      cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
 -      if (cret != IO_WQ_CANCEL_NOTFOUND)
 -              return;
 -
 -      /* if we have a poll link holding this pending, cancel that */
 -      if (io_poll_remove_link(ctx, req))
 -              return;
 -
 -      /* final option, timeout link is holding this req pending */
 -      io_timeout_remove_link(ctx, req);
 -}
 -
  static void io_cancel_defer_files(struct io_ring_ctx *ctx,
                                  struct task_struct *task,
                                  struct files_struct *files)
  
        spin_lock_irq(&ctx->completion_lock);
        list_for_each_entry_reverse(de, &ctx->defer_list, list) {
 -              if (io_task_match(de->req, task) &&
 -                  io_match_files(de->req, files)) {
 +              if (io_match_task(de->req, task, files)) {
                        list_cut_position(&list, &ctx->defer_list, &de->list);
                        break;
                }
        }
  }
  
 -/*
 - * Returns true if we found and killed one or more files pinning requests
 - */
 -static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
 +static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 +                                struct task_struct *task,
                                  struct files_struct *files)
  {
 -      if (list_empty_careful(&ctx->inflight_list))
 -              return false;
 -
 -      /* cancel all at once, should be faster than doing it one by one*/
 -      io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
 -
        while (!list_empty_careful(&ctx->inflight_list)) {
 -              struct io_kiocb *cancel_req = NULL, *req;
 +              struct io_task_cancel cancel = { .task = task, .files = files };
 +              struct io_kiocb *req;
                DEFINE_WAIT(wait);
 +              bool found = false;
  
                spin_lock_irq(&ctx->inflight_lock);
                list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
 -                      if (files && (req->work.flags & IO_WQ_WORK_FILES) &&
 +                      if (req->task != task ||
                            req->work.identity->files != files)
                                continue;
 -                      /* req is being completed, ignore */
 -                      if (!refcount_inc_not_zero(&req->refs))
 -                              continue;
 -                      cancel_req = req;
 +                      found = true;
                        break;
                }
 -              if (cancel_req)
 -                      prepare_to_wait(&ctx->inflight_wait, &wait,
 -                                              TASK_UNINTERRUPTIBLE);
 +              if (found)
 +                      prepare_to_wait(&task->io_uring->wait, &wait,
 +                                      TASK_UNINTERRUPTIBLE);
                spin_unlock_irq(&ctx->inflight_lock);
  
                /* We need to keep going until we don't find a matching req */
 -              if (!cancel_req)
 +              if (!found)
                        break;
 -              /* cancel this request, or head link requests */
 -              io_attempt_cancel(ctx, cancel_req);
 -              io_put_req(cancel_req);
 +
 +              io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
 +              io_poll_remove_all(ctx, task, files);
 +              io_kill_timeouts(ctx, task, files);
                /* cancellations _may_ trigger task work */
                io_run_task_work();
                schedule();
 -              finish_wait(&ctx->inflight_wait, &wait);
 +              finish_wait(&task->io_uring->wait, &wait);
        }
 -
 -      return true;
  }
  
 -static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
 +static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 +                                          struct task_struct *task)
  {
 -      struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 -      struct task_struct *task = data;
 -
 -      return io_task_match(req, task);
 -}
 -
 -static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 -                                          struct task_struct *task,
 -                                          struct files_struct *files)
 -{
 -      bool ret;
 -
 -      ret = io_uring_cancel_files(ctx, files);
 -      if (!files) {
 +      while (1) {
 +              struct io_task_cancel cancel = { .task = task, .files = NULL, };
                enum io_wq_cancel cret;
 +              bool ret = false;
  
 -              cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
 +              cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
                if (cret != IO_WQ_CANCEL_NOTFOUND)
                        ret = true;
  
                        }
                }
  
 -              ret |= io_poll_remove_all(ctx, task);
 -              ret |= io_kill_timeouts(ctx, task);
 +              ret |= io_poll_remove_all(ctx, task, NULL);
 +              ret |= io_kill_timeouts(ctx, task, NULL);
 +              if (!ret)
 +                      break;
 +              io_run_task_work();
 +              cond_resched();
        }
 -
 -      return ret;
  }
  
  /*
@@@ -8823,15 -8633,17 +8821,15 @@@ static void io_uring_cancel_task_reques
                io_sq_thread_park(ctx->sq_data);
        }
  
 -      if (files)
 -              io_cancel_defer_files(ctx, NULL, files);
 -      else
 -              io_cancel_defer_files(ctx, task, NULL);
 -
 +      io_cancel_defer_files(ctx, task, files);
 +      io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
        io_cqring_overflow_flush(ctx, true, task, files);
 +      io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
  
 -      while (__io_uring_cancel_task_requests(ctx, task, files)) {
 -              io_run_task_work();
 -              cond_resched();
 -      }
 +      if (!files)
 +              __io_uring_cancel_task_requests(ctx, task);
 +      else
 +              io_uring_cancel_files(ctx, task, files);
  
        if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
                atomic_dec(&task->io_uring->in_idle);
@@@ -9089,39 -8901,9 +9087,39 @@@ static void io_sqpoll_wait_sq(struct io
        finish_wait(&ctx->sqo_sq_wait, &wait);
  }
  
 +static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
 +                        struct __kernel_timespec __user **ts,
 +                        const sigset_t __user **sig)
 +{
 +      struct io_uring_getevents_arg arg;
 +
 +      /*
 +       * If EXT_ARG isn't set, then we have no timespec and the argp pointer
 +       * is just a pointer to the sigset_t.
 +       */
 +      if (!(flags & IORING_ENTER_EXT_ARG)) {
 +              *sig = (const sigset_t __user *) argp;
 +              *ts = NULL;
 +              return 0;
 +      }
 +
 +      /*
 +       * EXT_ARG is set - ensure we agree on the size of it and copy in our
 +       * timespec and sigset_t pointers if good.
 +       */
 +      if (*argsz != sizeof(arg))
 +              return -EINVAL;
 +      if (copy_from_user(&arg, argp, sizeof(arg)))
 +              return -EFAULT;
 +      *sig = u64_to_user_ptr(arg.sigmask);
 +      *argsz = arg.sigmask_sz;
 +      *ts = u64_to_user_ptr(arg.ts);
 +      return 0;
 +}
 +
  SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 -              u32, min_complete, u32, flags, const sigset_t __user *, sig,
 -              size_t, sigsz)
 +              u32, min_complete, u32, flags, const void __user *, argp,
 +              size_t, argsz)
  {
        struct io_ring_ctx *ctx;
        long ret = -EBADF;
        io_run_task_work();
  
        if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
 -                      IORING_ENTER_SQ_WAIT))
 +                      IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
                return -EINVAL;
  
        f = fdget(fd);
         */
        ret = 0;
        if (ctx->flags & IORING_SETUP_SQPOLL) {
 +              io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
                if (!list_empty_careful(&ctx->cq_overflow_list))
                        io_cqring_overflow_flush(ctx, false, NULL, NULL);
 +              io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
                if (flags & IORING_ENTER_SQ_WAKEUP)
                        wake_up(&ctx->sq_data->wait);
                if (flags & IORING_ENTER_SQ_WAIT)
                        goto out;
        }
        if (flags & IORING_ENTER_GETEVENTS) {
 +              const sigset_t __user *sig;
 +              struct __kernel_timespec __user *ts;
 +
 +              ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
 +              if (unlikely(ret))
 +                      goto out;
 +
                min_complete = min(min_complete, ctx->cq_entries);
  
                /*
                    !(ctx->flags & IORING_SETUP_SQPOLL)) {
                        ret = io_iopoll_check(ctx, min_complete);
                } else {
 -                      ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
 +                      ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
                }
        }
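As a rough illustration of the IORING_ENTER_EXT_ARG path wired up in the hunk above, a userspace caller now passes a struct io_uring_getevents_arg instead of a bare sigset_t pointer. This is a minimal sketch, assuming the uapi definitions from <linux/io_uring.h> and <linux/time_types.h>; the wrapper name wait_cqes_with_timeout() and the raw syscall(2) usage are illustrative only (liburing wraps the same mechanism).

#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>

/* Illustrative wrapper: wait for min_complete CQEs with a timeout. */
static int wait_cqes_with_timeout(int ring_fd, unsigned int min_complete,
                                  struct __kernel_timespec *ts)
{
        struct io_uring_getevents_arg arg;

        memset(&arg, 0, sizeof(arg));
        arg.sigmask = 0;                  /* no signal mask change */
        arg.sigmask_sz = 0;
        arg.ts = (uint64_t)(uintptr_t)ts; /* consumed by io_get_ext_arg() */

        /* argsz must equal sizeof(arg); io_get_ext_arg() rejects anything else. */
        return syscall(__NR_io_uring_enter, ring_fd, 0, min_complete,
                       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
                       &arg, sizeof(arg));
}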
  
@@@ -9381,7 -9154,6 +9379,7 @@@ static int io_uring_get_fd(struct io_ri
  {
        struct file *file;
        int ret;
 +      int fd;
  
  #if defined(CONFIG_UNIX)
        ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
        ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
        if (ret < 0)
                goto err;
 +      fd = ret;
  
        file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
                                        O_RDWR | O_CLOEXEC);
        if (IS_ERR(file)) {
 -err_fd:
 -              put_unused_fd(ret);
 +              put_unused_fd(fd);
                ret = PTR_ERR(file);
                goto err;
        }
  #if defined(CONFIG_UNIX)
        ctx->ring_sock->file = file;
  #endif
 -      if (unlikely(io_uring_add_task_file(ctx, file))) {
 -              file = ERR_PTR(-ENOMEM);
 -              goto err_fd;
 +      ret = io_uring_add_task_file(ctx, file);
 +      if (ret) {
 +              fput(file);
 +              put_unused_fd(fd);
 +              goto err;
        }
 -      fd_install(ret, file);
 -      return ret;
 +      fd_install(fd, file);
 +      return fd;
  err:
  #if defined(CONFIG_UNIX)
        sock_release(ctx->ring_sock);
@@@ -9453,16 -9223,14 +9451,16 @@@ static int io_uring_create(unsigned ent
                 * to a power-of-two, if it isn't already. We do NOT impose
                 * any cq vs sq ring sizing.
                 */
 -              p->cq_entries = roundup_pow_of_two(p->cq_entries);
 -              if (p->cq_entries < p->sq_entries)
 +              if (!p->cq_entries)
                        return -EINVAL;
                if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
                        if (!(p->flags & IORING_SETUP_CLAMP))
                                return -EINVAL;
                        p->cq_entries = IORING_MAX_CQ_ENTRIES;
                }
 +              p->cq_entries = roundup_pow_of_two(p->cq_entries);
 +              if (p->cq_entries < p->sq_entries)
 +                      return -EINVAL;
        } else {
                p->cq_entries = 2 * p->sq_entries;
        }
        p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
                        IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
                        IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
 -                      IORING_FEAT_POLL_32BITS;
 +                      IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
 +                      IORING_FEAT_EXT_ARG;
  
        if (copy_to_user(params, p, sizeof(*p))) {
                ret = -EFAULT;
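The features mask above now advertises IORING_FEAT_SQPOLL_NONFIXED and IORING_FEAT_EXT_ARG. A hedged sketch of probing for the new bits from userspace follows; the probe function name and the throwaway one-entry ring are illustrative choices, and liburing exposes the same information through its setup helpers.

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

/* Illustrative probe: create a tiny ring just to read back p.features. */
static int ring_has_ext_arg(void)
{
        struct io_uring_params p;
        int fd;

        memset(&p, 0, sizeof(p));
        fd = syscall(__NR_io_uring_setup, 1, &p);
        if (fd < 0)
                return 0;
        close(fd);
        return (p.features & IORING_FEAT_EXT_ARG) != 0;
}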
diff --combined fs/pstore/blk.c
@@@ -90,6 -90,7 +90,6 @@@ MODULE_PARM_DESC(blkdev, "block device 
  static DEFINE_MUTEX(pstore_blk_lock);
  static struct block_device *psblk_bdev;
  static struct pstore_zone_info *pstore_zone_info;
 -static pstore_blk_panic_write_op blkdev_panic_write;
  
  struct bdev_info {
        dev_t devt;
@@@ -244,7 -245,7 +244,7 @@@ static struct block_device *psblk_get_b
                        return bdev;
        }
  
-       nr_sects = part_nr_sects_read(bdev->bd_part);
+       nr_sects = bdev_nr_sectors(bdev);
        if (!nr_sects) {
                pr_err("not enough space for '%s'\n", blkdev);
                blkdev_put(bdev, mode);
@@@ -340,11 -341,24 +340,11 @@@ static ssize_t psblk_generic_blk_write(
        return ret;
  }
  
 -static ssize_t psblk_blk_panic_write(const char *buf, size_t size,
 -              loff_t off)
 -{
 -      int ret;
 -
 -      if (!blkdev_panic_write)
 -              return -EOPNOTSUPP;
 -
 -      /* size and off must align to SECTOR_SIZE for block device */
 -      ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT,
 -                      size >> SECTOR_SHIFT);
 -      /* try next zone */
 -      if (ret == -ENOMSG)
 -              return ret;
 -      return ret ? -EIO : size;
 -}
 -
 -static int __register_pstore_blk(struct pstore_blk_info *info)
 +/*
 + * This takes its configuration only from the module parameters now.
 + * See psblk_get_bdev() and blkdev.
 + */
 +static int __register_pstore_blk(void)
  {
        char bdev_name[BDEVNAME_SIZE];
        struct block_device *bdev;
        }
  
        /* only allow driver matching the @blkdev */
 -      if (!binfo.devt || (!best_effort &&
 -                          MAJOR(binfo.devt) != info->major)) {
 -              pr_debug("invalid major %u (expect %u)\n",
 -                              info->major, MAJOR(binfo.devt));
 +      if (!binfo.devt) {
 +              pr_debug("no major\n");
                ret = -ENODEV;
                goto err_put_bdev;
        }
  
        /* psblk_bdev must be assigned before register to pstore/blk */
        psblk_bdev = bdev;
 -      blkdev_panic_write = info->panic_write;
 -
 -      /* Copy back block device details. */
 -      info->devt = binfo.devt;
 -      info->nr_sects = binfo.nr_sects;
 -      info->start_sect = binfo.start_sect;
  
        memset(&dev, 0, sizeof(dev));
 -      dev.total_size = info->nr_sects << SECTOR_SHIFT;
 -      dev.flags = info->flags;
 +      dev.total_size = binfo.nr_sects << SECTOR_SHIFT;
        dev.read = psblk_generic_blk_read;
        dev.write = psblk_generic_blk_write;
 -      dev.erase = NULL;
 -      dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL;
  
        ret = __register_pstore_device(&dev);
        if (ret)
                goto err_put_bdev;
  
        bdevname(bdev, bdev_name);
 -      pr_info("attached %s%s\n", bdev_name,
 -              info->panic_write ? "" : " (no dedicated panic_write!)");
 +      pr_info("attached %s (no dedicated panic_write!)\n", bdev_name);
        return 0;
  
  err_put_bdev:
        psblk_bdev = NULL;
 -      blkdev_panic_write = NULL;
        psblk_put_bdev(bdev, holder);
        return ret;
  }
  
 -/**
 - * register_pstore_blk() - register block device to pstore/blk
 - *
 - * @info: details on the desired block device interface
 - *
 - * Return:
 - * * 0                - OK
 - * * Others   - something error.
 - */
 -int register_pstore_blk(struct pstore_blk_info *info)
 -{
 -      int ret;
 -
 -      mutex_lock(&pstore_blk_lock);
 -      ret = __register_pstore_blk(info);
 -      mutex_unlock(&pstore_blk_lock);
 -
 -      return ret;
 -}
 -EXPORT_SYMBOL_GPL(register_pstore_blk);
 -
  static void __unregister_pstore_blk(unsigned int major)
  {
        struct pstore_device_info dev = { .read = psblk_generic_blk_read };
        if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) {
                __unregister_pstore_device(&dev);
                psblk_put_bdev(psblk_bdev, holder);
 -              blkdev_panic_write = NULL;
                psblk_bdev = NULL;
        }
  }
  
 -/**
 - * unregister_pstore_blk() - unregister block device from pstore/blk
 - *
 - * @major: the major device number of device
 - */
 -void unregister_pstore_blk(unsigned int major)
 -{
 -      mutex_lock(&pstore_blk_lock);
 -      __unregister_pstore_blk(major);
 -      mutex_unlock(&pstore_blk_lock);
 -}
 -EXPORT_SYMBOL_GPL(unregister_pstore_blk);
 -
  /* get information of pstore/blk */
  int pstore_blk_get_config(struct pstore_blk_config *info)
  {
@@@ -421,11 -483,12 +421,11 @@@ EXPORT_SYMBOL_GPL(pstore_blk_get_config
  
  static int __init pstore_blk_init(void)
  {
 -      struct pstore_blk_info info = { };
        int ret = 0;
  
        mutex_lock(&pstore_blk_lock);
        if (!pstore_zone_info && best_effort && blkdev[0])
 -              ret = __register_pstore_blk(&info);
 +              ret = __register_pstore_blk();
        mutex_unlock(&pstore_blk_lock);
  
        return ret;
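The pstore/blk conversion above drops the hd_struct-based size lookup in favour of bdev_nr_sectors(). A minimal sketch of the size calculation now performed in __register_pstore_blk(); the helper name is made up for illustration:

#include <linux/blkdev.h>

/* Total usable size of the backing device in bytes: sectors from
 * bdev_nr_sectors(), scaled by SECTOR_SHIFT, as the hunk above does. */
static u64 psblk_example_total_size(struct block_device *bdev)
{
        return (u64)bdev_nr_sectors(bdev) << SECTOR_SHIFT;
}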
diff --combined include/linux/blkdev.h
@@@ -191,7 -191,7 +191,7 @@@ struct request 
        };
  
        struct gendisk *rq_disk;
-       struct hd_struct *part;
+       struct block_device *part;
  #ifdef CONFIG_BLK_RQ_ALLOC_TIME
        /* Time that the first bio started allocating this request. */
        u64 alloc_time_ns;
@@@ -1073,15 -1073,12 +1073,15 @@@ static inline unsigned int blk_queue_ge
   * file system requests.
   */
  static inline unsigned int blk_max_size_offset(struct request_queue *q,
 -                                             sector_t offset)
 +                                             sector_t offset,
 +                                             unsigned int chunk_sectors)
  {
 -      unsigned int chunk_sectors = q->limits.chunk_sectors;
 -
 -      if (!chunk_sectors)
 -              return q->limits.max_sectors;
 +      if (!chunk_sectors) {
 +              if (q->limits.chunk_sectors)
 +                      chunk_sectors = q->limits.chunk_sectors;
 +              else
 +                      return q->limits.max_sectors;
 +      }
  
        if (likely(is_power_of_2(chunk_sectors)))
                chunk_sectors -= offset & (chunk_sectors - 1);
@@@ -1104,7 -1101,7 +1104,7 @@@ static inline unsigned int blk_rq_get_m
            req_op(rq) == REQ_OP_SECURE_ERASE)
                return blk_queue_get_max_sectors(q, req_op(rq));
  
 -      return min(blk_max_size_offset(q, offset),
 +      return min(blk_max_size_offset(q, offset, 0),
                        blk_queue_get_max_sectors(q, req_op(rq)));
  }
  
@@@ -1491,7 -1488,7 +1491,7 @@@ static inline int bdev_alignment_offset
                return -1;
        if (bdev_is_partition(bdev))
                return queue_limit_alignment_offset(&q->limits,
-                               bdev->bd_part->start_sect);
+                               bdev->bd_start_sect);
        return q->limits.alignment_offset;
  }
  
@@@ -1532,7 -1529,7 +1532,7 @@@ static inline int bdev_discard_alignmen
  
        if (bdev_is_partition(bdev))
                return queue_limit_discard_alignment(&q->limits,
-                               bdev->bd_part->start_sect);
+                               bdev->bd_start_sect);
        return q->limits.discard_alignment;
  }
  
@@@ -1853,6 -1850,7 +1853,7 @@@ struct block_device_operations 
        void (*unlock_native_capacity) (struct gendisk *);
        int (*revalidate_disk) (struct gendisk *);
        int (*getgeo)(struct block_device *, struct hd_geometry *);
+       int (*set_read_only)(struct block_device *bdev, bool ro);
        /* this callback is with swap_lock and sometimes page table lock held */
        void (*swap_slot_free_notify) (struct block_device *, unsigned long);
        int (*report_zones)(struct gendisk *, sector_t sector,
@@@ -1869,8 -1867,6 +1870,6 @@@ extern int blkdev_compat_ptr_ioctl(stru
  #define blkdev_compat_ptr_ioctl NULL
  #endif
  
- extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
-                                unsigned long);
  extern int bdev_read_page(struct block_device *, sector_t, struct page *);
  extern int bdev_write_page(struct block_device *, sector_t, struct page *,
                                                struct writeback_control *);
@@@ -1947,9 -1943,9 +1946,9 @@@ unsigned long disk_start_io_acct(struc
  void disk_end_io_acct(struct gendisk *disk, unsigned int op,
                unsigned long start_time);
  
- unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
-                                struct bio *bio);
- void part_end_io_acct(struct hd_struct *part, struct bio *bio,
+ unsigned long part_start_io_acct(struct gendisk *disk,
+               struct block_device **part, struct bio *bio);
+ void part_end_io_acct(struct block_device *part, struct bio *bio,
                      unsigned long start_time);
  
  /**
@@@ -1977,7 -1973,7 +1976,7 @@@ int bdev_read_only(struct block_device 
  int set_blocksize(struct block_device *bdev, int size);
  
  const char *bdevname(struct block_device *bdev, char *buffer);
- struct block_device *lookup_bdev(const char *);
+ int lookup_bdev(const char *pathname, dev_t *dev);
  
  void blkdev_show(struct seq_file *seqf, off_t offset);
  
  struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
                void *holder);
  struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder);
- int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
-               void *holder);
- void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
-               void *holder);
+ int bd_prepare_to_claim(struct block_device *bdev, void *holder);
+ void bd_abort_claiming(struct block_device *bdev, void *holder);
  void blkdev_put(struct block_device *bdev, fmode_t mode);
  
+ /* just for blk-cgroup, don't use elsewhere */
+ struct block_device *blkdev_get_no_open(dev_t dev);
+ void blkdev_put_no_open(struct block_device *bdev);
+ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno);
+ void bdev_add(struct block_device *bdev, dev_t dev);
  struct block_device *I_BDEV(struct inode *inode);
- struct block_device *bdget_part(struct hd_struct *part);
  struct block_device *bdgrab(struct block_device *bdev);
  void bdput(struct block_device *);
  
@@@ -2024,7 -2023,7 +2026,7 @@@ static inline int sync_blockdev(struct 
  #endif
  int fsync_bdev(struct block_device *bdev);
  
- struct super_block *freeze_bdev(struct block_device *bdev);
- int thaw_bdev(struct block_device *bdev, struct super_block *sb);
+ int freeze_bdev(struct block_device *bdev);
+ int thaw_bdev(struct block_device *bdev);
  
  #endif /* _LINUX_BLKDEV_H */
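The final hunk changes the freeze/thaw contract: freeze_bdev() now returns an errno instead of a super_block, and thaw_bdev() no longer takes the sb back. A minimal caller sketch under the new convention; the function name is illustrative:

#include <linux/blkdev.h>

/* Illustrative: quiesce the device, do some work, then release it. */
static int example_with_frozen_bdev(struct block_device *bdev)
{
        int error;

        error = freeze_bdev(bdev);
        if (error)
                return error;

        /* ... operate on the frozen device here ... */

        return thaw_bdev(bdev);
}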
diff --combined include/linux/fs.h
@@@ -696,7 -696,6 +696,6 @@@ struct inode 
        struct list_head        i_devices;
        union {
                struct pipe_inode_info  *i_pipe;
-               struct block_device     *i_bdev;
                struct cdev             *i_cdev;
                char                    *i_link;
                unsigned                i_dir_seq;
@@@ -923,7 -922,7 +922,7 @@@ struct file 
        const struct file_operations    *f_op;
  
        /*
 -       * Protects f_ep_links, f_flags.
 +       * Protects f_ep, f_flags.
         * Must not be taken from IRQ context.
         */
        spinlock_t              f_lock;
  
  #ifdef CONFIG_EPOLL
        /* Used by fs/eventpoll.c to link all the hooks to this file */
 -      struct list_head        f_ep_links;
 -      struct list_head        f_tfile_llink;
 +      struct hlist_head       *f_ep;
  #endif /* #ifdef CONFIG_EPOLL */
        struct address_space    *f_mapping;
        errseq_t                f_wb_err;
@@@ -1408,7 -1408,7 +1407,7 @@@ enum 
  
  struct sb_writers {
        int                             frozen;         /* Is sb frozen? */
-       wait_queue_head_t               wait_unfrozen;  /* for get_super_thawed() */
+       wait_queue_head_t               wait_unfrozen;  /* wait for thaw */
        struct percpu_rw_semaphore      rw_sem[SB_FREEZE_LEVELS];
  };
  
@@@ -3131,8 -3131,6 +3130,6 @@@ extern struct file_system_type *get_fil
  extern void put_filesystem(struct file_system_type *fs);
  extern struct file_system_type *get_fs_type(const char *name);
  extern struct super_block *get_super(struct block_device *);
- extern struct super_block *get_super_thawed(struct block_device *);
- extern struct super_block *get_super_exclusive_thawed(struct block_device *bdev);
  extern struct super_block *get_active_super(struct block_device *bdev);
  extern void drop_super(struct super_block *sb);
  extern void drop_super_exclusive(struct super_block *sb);
@@@ -3229,7 -3227,7 +3226,7 @@@ static inline bool vma_is_fsdax(struct 
  {
        struct inode *inode;
  
 -      if (!vma->vm_file)
 +      if (!IS_ENABLED(CONFIG_FS_DAX) || !vma->vm_file)
                return false;
        if (!vma_is_dax(vma))
                return false;
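One consequence of removing i_bdev from the inode union above: device special inodes no longer cache a block_device pointer, so lookups go by dev_t (e.g. blkdev_get_by_dev() on inode->i_rdev), while code that already holds an open struct file can reach the bdev through the bdev filesystem inode. A hedged sketch of the latter; the helper name is made up, and it is only meaningful for a file opened on a block device node:

#include <linux/blkdev.h>
#include <linux/fs.h>

/* Illustrative: for a bdev file, f_mapping->host is the bdevfs inode. */
static struct block_device *example_file_bdev(struct file *file)
{
        return I_BDEV(file->f_mapping->host);
}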
diff --combined kernel/trace/blktrace.c
@@@ -449,7 -449,7 +449,7 @@@ static struct dentry *blk_create_buf_fi
                                        &relay_file_operations);
  }
  
 -static struct rchan_callbacks blk_relay_callbacks = {
 +static const struct rchan_callbacks blk_relay_callbacks = {
        .subbuf_start           = blk_subbuf_start_callback,
        .create_buf_file        = blk_create_buf_file_callback,
        .remove_buf_file        = blk_remove_buf_file_callback,
  static void blk_trace_setup_lba(struct blk_trace *bt,
                                struct block_device *bdev)
  {
-       struct hd_struct *part = NULL;
-       if (bdev)
-               part = bdev->bd_part;
-       if (part) {
-               bt->start_lba = part->start_sect;
-               bt->end_lba = part->start_sect + part->nr_sects;
+       if (bdev) {
+               bt->start_lba = bdev->bd_start_sect;
+               bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev);
        } else {
                bt->start_lba = 0;
                bt->end_lba = -1ULL;
@@@ -800,12 -795,12 +795,12 @@@ static u64 blk_trace_bio_get_cgid(struc
  #endif
  
  static u64
- blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
+ blk_trace_request_get_cgid(struct request *rq)
  {
        if (!rq->bio)
                return 0;
        /* Use the first bio */
-       return blk_trace_bio_get_cgid(q, rq->bio);
+       return blk_trace_bio_get_cgid(rq->q, rq->bio);
  }
  
  /*
@@@ -846,40 -841,35 +841,35 @@@ static void blk_add_trace_rq(struct req
        rcu_read_unlock();
  }
  
- static void blk_add_trace_rq_insert(void *ignore,
-                                   struct request_queue *q, struct request *rq)
+ static void blk_add_trace_rq_insert(void *ignore, struct request *rq)
  {
        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
-                        blk_trace_request_get_cgid(q, rq));
+                        blk_trace_request_get_cgid(rq));
  }
  
- static void blk_add_trace_rq_issue(void *ignore,
-                                  struct request_queue *q, struct request *rq)
+ static void blk_add_trace_rq_issue(void *ignore, struct request *rq)
  {
        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
-                        blk_trace_request_get_cgid(q, rq));
+                        blk_trace_request_get_cgid(rq));
  }
  
- static void blk_add_trace_rq_merge(void *ignore,
-                                  struct request_queue *q, struct request *rq)
+ static void blk_add_trace_rq_merge(void *ignore, struct request *rq)
  {
        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE,
-                        blk_trace_request_get_cgid(q, rq));
+                        blk_trace_request_get_cgid(rq));
  }
  
- static void blk_add_trace_rq_requeue(void *ignore,
-                                    struct request_queue *q,
-                                    struct request *rq)
+ static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
  {
        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
-                        blk_trace_request_get_cgid(q, rq));
+                        blk_trace_request_get_cgid(rq));
  }
  
  static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
                        int error, unsigned int nr_bytes)
  {
        blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
-                        blk_trace_request_get_cgid(rq->q, rq));
+                        blk_trace_request_get_cgid(rq));
  }
  
  /**
@@@ -911,10 -901,9 +901,9 @@@ static void blk_add_trace_bio(struct re
        rcu_read_unlock();
  }
  
- static void blk_add_trace_bio_bounce(void *ignore,
-                                    struct request_queue *q, struct bio *bio)
+ static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
  {
-       blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+       blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0);
  }
  
  static void blk_add_trace_bio_complete(void *ignore,
                          blk_status_to_errno(bio->bi_status));
  }
  
- static void blk_add_trace_bio_backmerge(void *ignore,
-                                       struct request_queue *q,
-                                       struct request *rq,
-                                       struct bio *bio)
+ static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio)
  {
-       blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+       blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0);
  }
  
- static void blk_add_trace_bio_frontmerge(void *ignore,
-                                        struct request_queue *q,
-                                        struct request *rq,
-                                        struct bio *bio)
+ static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio)
  {
-       blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+       blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0);
  }
  
- static void blk_add_trace_bio_queue(void *ignore,
-                                   struct request_queue *q, struct bio *bio)
+ static void blk_add_trace_bio_queue(void *ignore, struct bio *bio)
  {
-       blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
+       blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0);
  }
  
- static void blk_add_trace_getrq(void *ignore,
-                               struct request_queue *q,
-                               struct bio *bio, int rw)
+ static void blk_add_trace_getrq(void *ignore, struct bio *bio)
  {
-       if (bio)
-               blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
-       else {
-               struct blk_trace *bt;
-               rcu_read_lock();
-               bt = rcu_dereference(q->blk_trace);
-               if (bt)
-                       __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
-                                       NULL, 0);
-               rcu_read_unlock();
-       }
- }
- static void blk_add_trace_sleeprq(void *ignore,
-                                 struct request_queue *q,
-                                 struct bio *bio, int rw)
- {
-       if (bio)
-               blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
-       else {
-               struct blk_trace *bt;
-               rcu_read_lock();
-               bt = rcu_dereference(q->blk_trace);
-               if (bt)
-                       __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
-                                       0, 0, NULL, 0);
-               rcu_read_unlock();
-       }
+       blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0);
  }
  
  static void blk_add_trace_plug(void *ignore, struct request_queue *q)
@@@ -1015,10 -965,9 +965,9 @@@ static void blk_add_trace_unplug(void *
        rcu_read_unlock();
  }
  
- static void blk_add_trace_split(void *ignore,
-                               struct request_queue *q, struct bio *bio,
-                               unsigned int pdu)
+ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
  {
+       struct request_queue *q = bio->bi_disk->queue;
        struct blk_trace *bt;
  
        rcu_read_lock();
  /**
   * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
   * @ignore:   trace callback data parameter (not used)
-  * @q:                queue the io is for
   * @bio:      the source bio
-  * @dev:      target device
+  * @dev:      source device
   * @from:     source sector
   *
-  * Description:
-  *     Device mapper or raid target sometimes need to split a bio because
-  *     it spans a stripe (or similar). Add a trace for that action.
-  *
+  * Called after a bio is remapped to a different device and/or sector.
   **/
- static void blk_add_trace_bio_remap(void *ignore,
-                                   struct request_queue *q, struct bio *bio,
-                                   dev_t dev, sector_t from)
+ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
+                                   sector_t from)
  {
+       struct request_queue *q = bio->bi_disk->queue;
        struct blk_trace *bt;
        struct blk_io_trace_remap r;
  
  /**
   * blk_add_trace_rq_remap - Add a trace for a request-remap operation
   * @ignore:   trace callback data parameter (not used)
-  * @q:                queue the io is for
   * @rq:               the source request
   * @dev:      target device
   * @from:     source sector
   *     Add a trace for that action.
   *
   **/
- static void blk_add_trace_rq_remap(void *ignore,
-                                  struct request_queue *q,
-                                  struct request *rq, dev_t dev,
+ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
                                   sector_t from)
  {
        struct blk_trace *bt;
        struct blk_io_trace_remap r;
  
        rcu_read_lock();
-       bt = rcu_dereference(q->blk_trace);
+       bt = rcu_dereference(rq->q->blk_trace);
        if (likely(!bt)) {
                rcu_read_unlock();
                return;
  
        __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
                        rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
-                       sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
+                       sizeof(r), &r, blk_trace_request_get_cgid(rq));
        rcu_read_unlock();
  }
  
  /**
   * blk_add_driver_data - Add binary message with driver-specific data
-  * @q:                queue the io is for
   * @rq:               io request
   * @data:     driver-specific data
   * @len:      length of driver-specific data
   *     Some drivers might want to write driver-specific data per request.
   *
   **/
- void blk_add_driver_data(struct request_queue *q,
-                        struct request *rq,
-                        void *data, size_t len)
+ void blk_add_driver_data(struct request *rq, void *data, size_t len)
  {
        struct blk_trace *bt;
  
        rcu_read_lock();
-       bt = rcu_dereference(q->blk_trace);
+       bt = rcu_dereference(rq->q->blk_trace);
        if (likely(!bt)) {
                rcu_read_unlock();
                return;
  
        __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
                                BLK_TA_DRV_DATA, 0, len, data,
-                               blk_trace_request_get_cgid(q, rq));
+                               blk_trace_request_get_cgid(rq));
        rcu_read_unlock();
  }
  EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@@ -1169,8 -1108,6 +1108,6 @@@ static void blk_register_tracepoints(vo
        WARN_ON(ret);
        ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
-       WARN_ON(ret);
        ret = register_trace_block_plug(blk_add_trace_plug, NULL);
        WARN_ON(ret);
        ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
@@@ -1190,7 -1127,6 +1127,6 @@@ static void blk_unregister_tracepoints(
        unregister_trace_block_split(blk_add_trace_split, NULL);
        unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
        unregister_trace_block_plug(blk_add_trace_plug, NULL);
-       unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
        unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
        unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
        unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
@@@ -1815,30 -1751,15 +1751,15 @@@ static ssize_t blk_trace_mask2str(char 
        return p - buf;
  }
  
- static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
- {
-       if (bdev->bd_disk == NULL)
-               return NULL;
-       return bdev_get_queue(bdev);
- }
  static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
                                         struct device_attribute *attr,
                                         char *buf)
  {
-       struct block_device *bdev = bdget_part(dev_to_part(dev));
-       struct request_queue *q;
+       struct block_device *bdev = dev_to_bdev(dev);
+       struct request_queue *q = bdev_get_queue(bdev);
        struct blk_trace *bt;
        ssize_t ret = -ENXIO;
  
-       if (bdev == NULL)
-               goto out;
-       q = blk_trace_get_queue(bdev);
-       if (q == NULL)
-               goto out_bdput;
        mutex_lock(&q->debugfs_mutex);
  
        bt = rcu_dereference_protected(q->blk_trace,
  
  out_unlock_bdev:
        mutex_unlock(&q->debugfs_mutex);
- out_bdput:
-       bdput(bdev);
- out:
        return ret;
  }
  
@@@ -1871,8 -1789,8 +1789,8 @@@ static ssize_t sysfs_blk_trace_attr_sto
                                          struct device_attribute *attr,
                                          const char *buf, size_t count)
  {
-       struct block_device *bdev;
-       struct request_queue *q;
+       struct block_device *bdev = dev_to_bdev(dev);
+       struct request_queue *q = bdev_get_queue(bdev);
        struct blk_trace *bt;
        u64 value;
        ssize_t ret = -EINVAL;
                                goto out;
                        value = ret;
                }
-       } else if (kstrtoull(buf, 0, &value))
-               goto out;
-       ret = -ENXIO;
-       bdev = bdget_part(dev_to_part(dev));
-       if (bdev == NULL)
-               goto out;
-       q = blk_trace_get_queue(bdev);
-       if (q == NULL)
-               goto out_bdput;
+       } else {
+               if (kstrtoull(buf, 0, &value))
+                       goto out;
+       }
  
        mutex_lock(&q->debugfs_mutex);
  
  
  out_unlock_bdev:
        mutex_unlock(&q->debugfs_mutex);
- out_bdput:
-       bdput(bdev);
  out:
        return ret ? ret : count;
  }
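The blktrace rework above drops the request_queue argument from the request-based tracepoints and from blk_add_driver_data(), which now takes the queue from rq->q. A hedged sketch of the updated driver-side call; the payload is purely illustrative:

#include <linux/blkdev.h>
#include <linux/blktrace_api.h>

/* Illustrative: attach a small driver-specific blob to a request trace. */
static void example_trace_driver_data(struct request *rq, u32 status)
{
        blk_add_driver_data(rq, &status, sizeof(status));
}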
diff --combined mm/filemap.c
   *    ->swap_lock             (try_to_unmap_one)
   *    ->private_lock          (try_to_unmap_one)
   *    ->i_pages lock          (try_to_unmap_one)
 - *    ->pgdat->lru_lock               (follow_page->mark_page_accessed)
 - *    ->pgdat->lru_lock               (check_pte_range->isolate_lru_page)
 + *    ->lruvec->lru_lock      (follow_page->mark_page_accessed)
 + *    ->lruvec->lru_lock      (check_pte_range->isolate_lru_page)
   *    ->private_lock          (page_remove_rmap->set_page_dirty)
   *    ->i_pages lock          (page_remove_rmap->set_page_dirty)
   *    bdi.wb->list_lock               (page_remove_rmap->set_page_dirty)
@@@ -204,9 -204,9 +204,9 @@@ static void unaccount_page_cache_page(s
        if (PageSwapBacked(page)) {
                __mod_lruvec_page_state(page, NR_SHMEM, -nr);
                if (PageTransHuge(page))
 -                      __dec_node_page_state(page, NR_SHMEM_THPS);
 +                      __dec_lruvec_page_state(page, NR_SHMEM_THPS);
        } else if (PageTransHuge(page)) {
 -              __dec_node_page_state(page, NR_FILE_THPS);
 +              __dec_lruvec_page_state(page, NR_FILE_THPS);
                filemap_nr_thps_dec(mapping);
        }
  
@@@ -1359,7 -1359,7 +1359,7 @@@ static int __wait_on_page_locked_async(
        else
                ret = PageLocked(page);
        /*
 -       * If we were succesful now, we know we're still on the
 +       * If we were successful now, we know we're still on the
         * waitqueue as we're still under the lock. This means it's
         * safe to remove and return success, we know the callback
         * isn't going to trigger.
@@@ -1484,19 -1484,11 +1484,19 @@@ void end_page_writeback(struct page *pa
                rotate_reclaimable_page(page);
        }
  
 +      /*
 +       * Writeback does not hold a page reference of its own, relying
 +       * on truncation to wait for the clearing of PG_writeback.
 +       * But here we must make sure that the page is not freed and
 +       * reused before the wake_up_page().
 +       */
 +      get_page(page);
        if (!test_clear_page_writeback(page))
                BUG();
  
        smp_mb__after_atomic();
        wake_up_page(page, PG_writeback);
 +      put_page(page);
  }
  EXPORT_SYMBOL(end_page_writeback);
  
@@@ -1583,20 -1575,19 +1583,20 @@@ int __lock_page_or_retry(struct page *p
                else
                        wait_on_page_locked(page);
                return 0;
 -      } else {
 -              if (flags & FAULT_FLAG_KILLABLE) {
 -                      int ret;
 +      }
 +      if (flags & FAULT_FLAG_KILLABLE) {
 +              int ret;
  
 -                      ret = __lock_page_killable(page);
 -                      if (ret) {
 -                              mmap_read_unlock(mm);
 -                              return 0;
 -                      }
 -              } else
 -                      __lock_page(page);
 -              return 1;
 +              ret = __lock_page_killable(page);
 +              if (ret) {
 +                      mmap_read_unlock(mm);
 +                      return 0;
 +              }
 +      } else {
 +              __lock_page(page);
        }
 +      return 1;
 +
  }
  
  /**
@@@ -2167,259 -2158,6 +2167,259 @@@ static void shrink_readahead_size_eio(s
        ra->ra_pages /= 4;
  }
  
 +static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
 +{
 +      if (iocb->ki_flags & IOCB_WAITQ)
 +              return lock_page_async(page, iocb->ki_waitq);
 +      else if (iocb->ki_flags & IOCB_NOWAIT)
 +              return trylock_page(page) ? 0 : -EAGAIN;
 +      else
 +              return lock_page_killable(page);
 +}
 +
 +static struct page *
 +generic_file_buffered_read_readpage(struct kiocb *iocb,
 +                                  struct file *filp,
 +                                  struct address_space *mapping,
 +                                  struct page *page)
 +{
 +      struct file_ra_state *ra = &filp->f_ra;
 +      int error;
 +
 +      if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
 +              unlock_page(page);
 +              put_page(page);
 +              return ERR_PTR(-EAGAIN);
 +      }
 +
 +      /*
 +       * A previous I/O error may have been due to temporary
 +       * failures, eg. multipath errors.
 +       * PG_error will be set again if readpage fails.
 +       */
 +      ClearPageError(page);
 +      /* Start the actual read. The read will unlock the page. */
 +      error = mapping->a_ops->readpage(filp, page);
 +
 +      if (unlikely(error)) {
 +              put_page(page);
 +              return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
 +      }
 +
 +      if (!PageUptodate(page)) {
 +              error = lock_page_for_iocb(iocb, page);
 +              if (unlikely(error)) {
 +                      put_page(page);
 +                      return ERR_PTR(error);
 +              }
 +              if (!PageUptodate(page)) {
 +                      if (page->mapping == NULL) {
 +                              /*
 +                               * invalidate_mapping_pages got it
 +                               */
 +                              unlock_page(page);
 +                              put_page(page);
 +                              return NULL;
 +                      }
 +                      unlock_page(page);
 +                      shrink_readahead_size_eio(ra);
 +                      put_page(page);
 +                      return ERR_PTR(-EIO);
 +              }
 +              unlock_page(page);
 +      }
 +
 +      return page;
 +}
 +
 +static struct page *
 +generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb,
 +                                         struct file *filp,
 +                                         struct iov_iter *iter,
 +                                         struct page *page,
 +                                         loff_t pos, loff_t count)
 +{
 +      struct address_space *mapping = filp->f_mapping;
 +      struct inode *inode = mapping->host;
 +      int error;
 +
 +      /*
 +       * See comment in do_read_cache_page on why
 +       * wait_on_page_locked is used to avoid unnecessarily
 +       * serialisations and why it's safe.
 +       */
 +      if (iocb->ki_flags & IOCB_WAITQ) {
 +              error = wait_on_page_locked_async(page,
 +                                              iocb->ki_waitq);
 +      } else {
 +              error = wait_on_page_locked_killable(page);
 +      }
 +      if (unlikely(error)) {
 +              put_page(page);
 +              return ERR_PTR(error);
 +      }
 +      if (PageUptodate(page))
 +              return page;
 +
 +      if (inode->i_blkbits == PAGE_SHIFT ||
 +                      !mapping->a_ops->is_partially_uptodate)
 +              goto page_not_up_to_date;
 +      /* pipes can't handle partially uptodate pages */
 +      if (unlikely(iov_iter_is_pipe(iter)))
 +              goto page_not_up_to_date;
 +      if (!trylock_page(page))
 +              goto page_not_up_to_date;
 +      /* Did it get truncated before we got the lock? */
 +      if (!page->mapping)
 +              goto page_not_up_to_date_locked;
 +      if (!mapping->a_ops->is_partially_uptodate(page,
 +                              pos & ~PAGE_MASK, count))
 +              goto page_not_up_to_date_locked;
 +      unlock_page(page);
 +      return page;
 +
 +page_not_up_to_date:
 +      /* Get exclusive access to the page ... */
 +      error = lock_page_for_iocb(iocb, page);
 +      if (unlikely(error)) {
 +              put_page(page);
 +              return ERR_PTR(error);
 +      }
 +
 +page_not_up_to_date_locked:
 +      /* Did it get truncated before we got the lock? */
 +      if (!page->mapping) {
 +              unlock_page(page);
 +              put_page(page);
 +              return NULL;
 +      }
 +
 +      /* Did somebody else fill it already? */
 +      if (PageUptodate(page)) {
 +              unlock_page(page);
 +              return page;
 +      }
 +
 +      return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
 +}
 +
 +static struct page *
 +generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
 +                                        struct iov_iter *iter)
 +{
 +      struct file *filp = iocb->ki_filp;
 +      struct address_space *mapping = filp->f_mapping;
 +      pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
 +      struct page *page;
 +      int error;
 +
 +      if (iocb->ki_flags & IOCB_NOIO)
 +              return ERR_PTR(-EAGAIN);
 +
 +      /*
 +       * Ok, it wasn't cached, so we need to create a new
 +       * page..
 +       */
 +      page = page_cache_alloc(mapping);
 +      if (!page)
 +              return ERR_PTR(-ENOMEM);
 +
 +      error = add_to_page_cache_lru(page, mapping, index,
 +                                    mapping_gfp_constraint(mapping, GFP_KERNEL));
 +      if (error) {
 +              put_page(page);
 +              return error != -EEXIST ? ERR_PTR(error) : NULL;
 +      }
 +
 +      return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
 +}
 +
 +static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
 +                                              struct iov_iter *iter,
 +                                              struct page **pages,
 +                                              unsigned int nr)
 +{
 +      struct file *filp = iocb->ki_filp;
 +      struct address_space *mapping = filp->f_mapping;
 +      struct file_ra_state *ra = &filp->f_ra;
 +      pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
 +      pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
 +      int i, j, nr_got, err = 0;
 +
 +      nr = min_t(unsigned long, last_index - index, nr);
 +find_page:
 +      if (fatal_signal_pending(current))
 +              return -EINTR;
 +
 +      nr_got = find_get_pages_contig(mapping, index, nr, pages);
 +      if (nr_got)
 +              goto got_pages;
 +
 +      if (iocb->ki_flags & IOCB_NOIO)
 +              return -EAGAIN;
 +
 +      page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
 +
 +      nr_got = find_get_pages_contig(mapping, index, nr, pages);
 +      if (nr_got)
 +              goto got_pages;
 +
 +      pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
 +      err = PTR_ERR_OR_ZERO(pages[0]);
 +      if (!IS_ERR_OR_NULL(pages[0]))
 +              nr_got = 1;
 +got_pages:
 +      for (i = 0; i < nr_got; i++) {
 +              struct page *page = pages[i];
 +              pgoff_t pg_index = index + i;
 +              loff_t pg_pos = max(iocb->ki_pos,
 +                                  (loff_t) pg_index << PAGE_SHIFT);
 +              loff_t pg_count = iocb->ki_pos + iter->count - pg_pos;
 +
 +              if (PageReadahead(page)) {
 +                      if (iocb->ki_flags & IOCB_NOIO) {
 +                              for (j = i; j < nr_got; j++)
 +                                      put_page(pages[j]);
 +                              nr_got = i;
 +                              err = -EAGAIN;
 +                              break;
 +                      }
 +                      page_cache_async_readahead(mapping, ra, filp, page,
 +                                      pg_index, last_index - pg_index);
 +              }
 +
 +              if (!PageUptodate(page)) {
 +                      if ((iocb->ki_flags & IOCB_NOWAIT) ||
 +                          ((iocb->ki_flags & IOCB_WAITQ) && i)) {
 +                              for (j = i; j < nr_got; j++)
 +                                      put_page(pages[j]);
 +                              nr_got = i;
 +                              err = -EAGAIN;
 +                              break;
 +                      }
 +
 +                      page = generic_file_buffered_read_pagenotuptodate(iocb,
 +                                      filp, iter, page, pg_pos, pg_count);
 +                      if (IS_ERR_OR_NULL(page)) {
 +                              for (j = i + 1; j < nr_got; j++)
 +                                      put_page(pages[j]);
 +                              nr_got = i;
 +                              err = PTR_ERR_OR_ZERO(page);
 +                              break;
 +                      }
 +              }
 +      }
 +
 +      if (likely(nr_got))
 +              return nr_got;
 +      if (err)
 +              return err;
 +      /*
 +       * No pages and no error means we raced and should retry:
 +       */
 +      goto find_page;
 +}
 +
  /**
   * generic_file_buffered_read - generic file read routine
   * @iocb:     the iocb to read
@@@ -2440,117 -2178,284 +2440,117 @@@ ssize_t generic_file_buffered_read(stru
                struct iov_iter *iter, ssize_t written)
  {
        struct file *filp = iocb->ki_filp;
 +      struct file_ra_state *ra = &filp->f_ra;
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;
 -      struct file_ra_state *ra = &filp->f_ra;
 -      loff_t *ppos = &iocb->ki_pos;
 -      pgoff_t index;
 -      pgoff_t last_index;
 -      pgoff_t prev_index;
 -      unsigned long offset;      /* offset into pagecache page */
 -      unsigned int prev_offset;
 -      int error = 0;
 -
 -      if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
 +      struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
 +      unsigned int nr_pages = min_t(unsigned int, 512,
 +                      ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
 +                      (iocb->ki_pos >> PAGE_SHIFT));
 +      int i, pg_nr, error = 0;
 +      bool writably_mapped;
 +      loff_t isize, end_offset;
 +
 +      if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
                return 0;
        iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
  
 -      index = *ppos >> PAGE_SHIFT;
 -      prev_index = ra->prev_pos >> PAGE_SHIFT;
 -      prev_offset = ra->prev_pos & (PAGE_SIZE-1);
 -      last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
 -      offset = *ppos & ~PAGE_MASK;
 +      if (nr_pages > ARRAY_SIZE(pages_onstack))
 +              pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
  
 -      /*
 -       * If we've already successfully copied some data, then we
 -       * can no longer safely return -EIOCBQUEUED. Hence mark
 -       * an async read NOWAIT at that point.
 -       */
 -      if (written && (iocb->ki_flags & IOCB_WAITQ))
 -              iocb->ki_flags |= IOCB_NOWAIT;
 -
 -      for (;;) {
 -              struct page *page;
 -              pgoff_t end_index;
 -              loff_t isize;
 -              unsigned long nr, ret;
 +      if (!pages) {
 +              pages = pages_onstack;
 +              nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
 +      }
  
 +      do {
                cond_resched();
 -find_page:
 -              if (fatal_signal_pending(current)) {
 -                      error = -EINTR;
 -                      goto out;
 -              }
  
 -              page = find_get_page(mapping, index);
 -              if (!page) {
 -                      if (iocb->ki_flags & IOCB_NOIO)
 -                              goto would_block;
 -                      page_cache_sync_readahead(mapping,
 -                                      ra, filp,
 -                                      index, last_index - index);
 -                      page = find_get_page(mapping, index);
 -                      if (unlikely(page == NULL))
 -                              goto no_cached_page;
 -              }
 -              if (PageReadahead(page)) {
 -                      if (iocb->ki_flags & IOCB_NOIO) {
 -                              put_page(page);
 -                              goto out;
 -                      }
 -                      page_cache_async_readahead(mapping,
 -                                      ra, filp, page,
 -                                      index, last_index - index);
 -              }
 -              if (!PageUptodate(page)) {
 -                      /*
 -                       * See comment in do_read_cache_page on why
 -                       * wait_on_page_locked is used to avoid unnecessarily
 -                       * serialisations and why it's safe.
 -                       */
 -                      if (iocb->ki_flags & IOCB_WAITQ) {
 -                              if (written) {
 -                                      put_page(page);
 -                                      goto out;
 -                              }
 -                              error = wait_on_page_locked_async(page,
 -                                                              iocb->ki_waitq);
 -                      } else {
 -                              if (iocb->ki_flags & IOCB_NOWAIT) {
 -                                      put_page(page);
 -                                      goto would_block;
 -                              }
 -                              error = wait_on_page_locked_killable(page);
 -                      }
 -                      if (unlikely(error))
 -                              goto readpage_error;
 -                      if (PageUptodate(page))
 -                              goto page_ok;
 -
 -                      if (inode->i_blkbits == PAGE_SHIFT ||
 -                                      !mapping->a_ops->is_partially_uptodate)
 -                              goto page_not_up_to_date;
 -                      /* pipes can't handle partially uptodate pages */
 -                      if (unlikely(iov_iter_is_pipe(iter)))
 -                              goto page_not_up_to_date;
 -                      if (!trylock_page(page))
 -                              goto page_not_up_to_date;
 -                      /* Did it get truncated before we got the lock? */
 -                      if (!page->mapping)
 -                              goto page_not_up_to_date_locked;
 -                      if (!mapping->a_ops->is_partially_uptodate(page,
 -                                                      offset, iter->count))
 -                              goto page_not_up_to_date_locked;
 -                      unlock_page(page);
 +              /*
 +               * If we've already successfully copied some data, then we
 +               * can no longer safely return -EIOCBQUEUED. Hence mark
 +               * an async read NOWAIT at that point.
 +               */
 +              if ((iocb->ki_flags & IOCB_WAITQ) && written)
 +                      iocb->ki_flags |= IOCB_NOWAIT;
 +
 +              i = 0;
 +              pg_nr = generic_file_buffered_read_get_pages(iocb, iter,
 +                                                           pages, nr_pages);
 +              if (pg_nr < 0) {
 +                      error = pg_nr;
 +                      break;
                }
 -page_ok:
 +
                /*
 -               * i_size must be checked after we know the page is Uptodate.
 +               * i_size must be checked after we know the pages are Uptodate.
                 *
                 * Checking i_size after the check allows us to calculate
                 * the correct number of bytes to copy from each page, so the zero-filled
                 * part of the page is not copied back to userspace (unless
                 * another truncate extends the file - this is desired though).
                 */
 -
                isize = i_size_read(inode);
 -              end_index = (isize - 1) >> PAGE_SHIFT;
 -              if (unlikely(!isize || index > end_index)) {
 -                      put_page(page);
 -                      goto out;
 -              }
 +              if (unlikely(iocb->ki_pos >= isize))
 +                      goto put_pages;
  
 -              /* nr is the maximum number of bytes to copy from this page */
 -              nr = PAGE_SIZE;
 -              if (index == end_index) {
 -                      nr = ((isize - 1) & ~PAGE_MASK) + 1;
 -                      if (nr <= offset) {
 -                              put_page(page);
 -                              goto out;
 -                      }
 -              }
 -              nr = nr - offset;
 +              end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
  
 -              /* If users can be writing to this page using arbitrary
 -               * virtual addresses, take care about potential aliasing
 -               * before reading the page on the kernel side.
 -               */
 -              if (mapping_writably_mapped(mapping))
 -                      flush_dcache_page(page);
 +              while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr >
 +                     (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT)
 +                      put_page(pages[--pg_nr]);
  
                /*
 -               * When a sequential read accesses a page several times,
 -               * only mark it as accessed the first time.
 +               * Once we start copying data, we don't want to be touching any
 +               * cachelines that might be contended:
                 */
 -              if (prev_index != index || offset != prev_offset)
 -                      mark_page_accessed(page);
 -              prev_index = index;
 +              writably_mapped = mapping_writably_mapped(mapping);
  
                /*
 -               * Ok, we have the page, and it's up-to-date, so
 -               * now we can copy it to user space...
 +               * When a sequential read accesses a page several times, only
 +               * mark it as accessed the first time.
                 */
 +              if (iocb->ki_pos >> PAGE_SHIFT !=
 +                  ra->prev_pos >> PAGE_SHIFT)
 +                      mark_page_accessed(pages[0]);
 +              for (i = 1; i < pg_nr; i++)
 +                      mark_page_accessed(pages[i]);
 +
 +              for (i = 0; i < pg_nr; i++) {
 +                      unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
 +                      unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos,
 +                                                 PAGE_SIZE - offset);
 +                      unsigned int copied;
  
 -              ret = copy_page_to_iter(page, offset, nr, iter);
 -              offset += ret;
 -              index += offset >> PAGE_SHIFT;
 -              offset &= ~PAGE_MASK;
 -              prev_offset = offset;
 -
 -              put_page(page);
 -              written += ret;
 -              if (!iov_iter_count(iter))
 -                      goto out;
 -              if (ret < nr) {
 -                      error = -EFAULT;
 -                      goto out;
 -              }
 -              continue;
 -
 -page_not_up_to_date:
 -              /* Get exclusive access to the page ... */
 -              if (iocb->ki_flags & IOCB_WAITQ)
 -                      error = lock_page_async(page, iocb->ki_waitq);
 -              else
 -                      error = lock_page_killable(page);
 -              if (unlikely(error))
 -                      goto readpage_error;
 -
 -page_not_up_to_date_locked:
 -              /* Did it get truncated before we got the lock? */
 -              if (!page->mapping) {
 -                      unlock_page(page);
 -                      put_page(page);
 -                      continue;
 -              }
 -
 -              /* Did somebody else fill it already? */
 -              if (PageUptodate(page)) {
 -                      unlock_page(page);
 -                      goto page_ok;
 -              }
 +                      /*
 +                       * If users can be writing to this page using arbitrary
 +                       * virtual addresses, take care about potential aliasing
 +                       * before reading the page on the kernel side.
 +                       */
 +                      if (writably_mapped)
 +                              flush_dcache_page(pages[i]);
  
 -readpage:
 -              if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
 -                      unlock_page(page);
 -                      put_page(page);
 -                      goto would_block;
 -              }
 -              /*
 -               * A previous I/O error may have been due to temporary
 -               * failures, eg. multipath errors.
 -               * PG_error will be set again if readpage fails.
 -               */
 -              ClearPageError(page);
 -              /* Start the actual read. The read will unlock the page. */
 -              error = mapping->a_ops->readpage(filp, page);
 +                      copied = copy_page_to_iter(pages[i], offset, bytes, iter);
  
 -              if (unlikely(error)) {
 -                      if (error == AOP_TRUNCATED_PAGE) {
 -                              put_page(page);
 -                              error = 0;
 -                              goto find_page;
 -                      }
 -                      goto readpage_error;
 -              }
 +                      written += copied;
 +                      iocb->ki_pos += copied;
 +                      ra->prev_pos = iocb->ki_pos;
  
 -              if (!PageUptodate(page)) {
 -                      if (iocb->ki_flags & IOCB_WAITQ)
 -                              error = lock_page_async(page, iocb->ki_waitq);
 -                      else
 -                              error = lock_page_killable(page);
 -
 -                      if (unlikely(error))
 -                              goto readpage_error;
 -                      if (!PageUptodate(page)) {
 -                              if (page->mapping == NULL) {
 -                                      /*
 -                                       * invalidate_mapping_pages got it
 -                                       */
 -                                      unlock_page(page);
 -                                      put_page(page);
 -                                      goto find_page;
 -                              }
 -                              unlock_page(page);
 -                              shrink_readahead_size_eio(ra);
 -                              error = -EIO;
 -                              goto readpage_error;
 +                      if (copied < bytes) {
 +                              error = -EFAULT;
 +                              break;
                        }
 -                      unlock_page(page);
                }
 +put_pages:
 +              for (i = 0; i < pg_nr; i++)
 +                      put_page(pages[i]);
 +      } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
  
 -              goto page_ok;
 -
 -readpage_error:
 -              /* UHHUH! A synchronous read error occurred. Report it */
 -              put_page(page);
 -              goto out;
 -
 -no_cached_page:
 -              /*
 -               * Ok, it wasn't cached, so we need to create a new
 -               * page..
 -               */
 -              page = page_cache_alloc(mapping);
 -              if (!page) {
 -                      error = -ENOMEM;
 -                      goto out;
 -              }
 -              error = add_to_page_cache_lru(page, mapping, index,
 -                              mapping_gfp_constraint(mapping, GFP_KERNEL));
 -              if (error) {
 -                      put_page(page);
 -                      if (error == -EEXIST) {
 -                              error = 0;
 -                              goto find_page;
 -                      }
 -                      goto out;
 -              }
 -              goto readpage;
 -      }
 +      file_accessed(filp);
  
 -would_block:
 -      error = -EAGAIN;
 -out:
 -      ra->prev_pos = prev_index;
 -      ra->prev_pos <<= PAGE_SHIFT;
 -      ra->prev_pos |= prev_offset;
 +      if (pages != pages_onstack)
 +              kfree(pages);
  
 -      *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
 -      file_accessed(filp);
        return written ? written : error;
  }
  EXPORT_SYMBOL_GPL(generic_file_buffered_read);
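
The rewritten copy loop above walks the page batch with plain position
arithmetic: the in-page offset is ki_pos modulo the page size, and the byte
count is clamped both to the end of that page and to end_offset (the smaller
of i_size and ki_pos + count), with ki_pos and ra->prev_pos advanced by
however much was copied. A short userspace sketch of that arithmetic; the
4 KiB page size and the sample values are assumptions for illustration only:

/*
 * Userspace sketch of the per-page copy arithmetic: each step copies at most
 * PAGE_SIZE - offset bytes and never runs past end_offset.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1ULL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long long pos = 4000;		/* iocb->ki_pos */
	unsigned long long count = 9000;	/* iov_iter_count(iter) */
	unsigned long long isize = 12000;	/* i_size_read(inode) */
	unsigned long long end_offset = isize < pos + count ? isize : pos + count;

	while (pos < end_offset) {
		unsigned long long offset = pos & ~PAGE_MASK;	/* offset into page */
		unsigned long long space = PAGE_SIZE - offset;
		unsigned long long left = end_offset - pos;
		unsigned long long bytes = left < space ? left : space;

		printf("page %llu: offset %llu, copy %llu bytes\n",
		       pos >> PAGE_SHIFT, offset, bytes);
		pos += bytes;	/* ki_pos advances by the amount copied */
	}
	return 0;
}

Clamping to end_offset up front is what lets the loop ignore pages that lie
entirely beyond EOF, which is why the code above trims the tail of the batch
with put_page() as soon as end_offset is known.
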
@@@ -2981,14 -2886,14 +2981,14 @@@ EXPORT_SYMBOL(filemap_map_pages)
  
  vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
  {
+       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        struct page *page = vmf->page;
-       struct inode *inode = file_inode(vmf->vma->vm_file);
        vm_fault_t ret = VM_FAULT_LOCKED;
  
-       sb_start_pagefault(inode->i_sb);
+       sb_start_pagefault(mapping->host->i_sb);
        file_update_time(vmf->vma->vm_file);
        lock_page(page);
-       if (page->mapping != inode->i_mapping) {
+       if (page->mapping != mapping) {
                unlock_page(page);
                ret = VM_FAULT_NOPAGE;
                goto out;
        set_page_dirty(page);
        wait_for_stable_page(page);
  out:
-       sb_end_pagefault(inode->i_sb);
+       sb_end_pagefault(mapping->host->i_sb);
        return ret;
  }
  
@@@ -3244,10 -3149,9 +3244,9 @@@ void dio_warn_stale_pagecache(struct fi
  {
        static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
        char pathname[128];
-       struct inode *inode = file_inode(filp);
        char *path;
  
-       errseq_set(&inode->i_mapping->wb_err, -EIO);
+       errseq_set(&filp->f_mapping->wb_err, -EIO);
        if (__ratelimit(&_rs)) {
                path = file_path(filp, pathname, sizeof(pathname));
                if (IS_ERR(path))
@@@ -3274,7 -3178,7 +3273,7 @@@ generic_file_direct_write(struct kiocb 
  
        if (iocb->ki_flags & IOCB_NOWAIT) {
                /* If there are pages to writeback, return */
-               if (filemap_range_has_page(inode->i_mapping, pos,
+               if (filemap_range_has_page(file->f_mapping, pos,
                                           pos + write_len - 1))
                        return -EAGAIN;
        } else {