Merge tag 'for-5.11/block-2020-12-14' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 16 Dec 2020 20:57:51 +0000 (12:57 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 16 Dec 2020 20:57:51 +0000 (12:57 -0800)
Pull block updates from Jens Axboe:
 "Another series of killing more code than what is being added, again
  thanks to Christoph's relentless cleanups and tech debt tackling.

  This contains:

   - blk-iocost improvements (Baolin Wang)

   - part0 iostat fix (Jeffle Xu)

   - Disable iopoll for split bios (Jeffle Xu)

   - block tracepoint cleanups (Christoph Hellwig)

   - Merging of struct block_device and hd_struct (Christoph Hellwig)

   - Rework/cleanup of how block device sizes are updated (Christoph
     Hellwig)

   - Simplification of gendisk lookup and removal of block device
     aliasing (Christoph Hellwig)

   - Block device ioctl cleanups (Christoph Hellwig)

   - Removal of bdget()/blkdev_get() as exported API (Christoph Hellwig)

   - Disk change rework, avoid ->revalidate_disk() (Christoph Hellwig)

   - sbitmap improvements (Pavel Begunkov)

   - Hybrid polling fix (Pavel Begunkov)

   - bvec iteration improvements (Pavel Begunkov)

   - Zone revalidation fixes (Damien Le Moal)

   - blk-throttle limit fix (Yu Kuai)

   - Various little fixes"

* tag 'for-5.11/block-2020-12-14' of git://git.kernel.dk/linux-block: (126 commits)
  blk-mq: fix msec comment from micro to milli seconds
  blk-mq: update arg in comment of blk_mq_map_queue
  blk-mq: add helper allocating tagset->tags
  Revert "block: Fix a lockdep complaint triggered by request queue flushing"
  nvme-loop: use blk_mq_hctx_set_fq_lock_class to set loop's lock class
  blk-mq: add new API of blk_mq_hctx_set_fq_lock_class
  block: disable iopoll for split bio
  block: Improve blk_revalidate_disk_zones() checks
  sbitmap: simplify wrap check
  sbitmap: replace CAS with atomic and
  sbitmap: remove swap_lock
  sbitmap: optimise sbitmap_deferred_clear()
  blk-mq: skip hybrid polling if iopoll doesn't spin
  blk-iocost: Factor out the base vrate change into a separate function
  blk-iocost: Factor out the active iocgs' state check into a separate function
  blk-iocost: Move the usage ratio calculation to the correct place
  blk-iocost: Remove unnecessary advance declaration
  blk-iocost: Fix some typos in comments
  blktrace: fix up a kerneldoc comment
  block: remove the request_queue to argument request based tracepoints
  ...

29 files changed:
block/blk-flush.c
block/blk-merge.c
block/blk-mq.c
drivers/block/xen-blkback/common.h
drivers/block/zram/zram_drv.c
drivers/block/zram/zram_drv.h
drivers/ide/ide-probe.c
drivers/md/dm-raid.c
drivers/md/dm-table.c
drivers/md/dm.c
drivers/md/md.c
drivers/md/raid0.c
drivers/md/raid10.c
drivers/nvme/host/core.c
drivers/s390/block/dasd.c
drivers/s390/block/dasd_int.h
fs/btrfs/sysfs.c
fs/btrfs/volumes.c
fs/btrfs/zoned.c
fs/buffer.c
fs/ext4/super.c
fs/f2fs/f2fs.h
fs/internal.h
fs/io_uring.c
fs/pstore/blk.c
include/linux/blkdev.h
include/linux/fs.h
kernel/trace/blktrace.c
mm/filemap.c

diff --combined block/blk-flush.c
@@@ -69,7 -69,6 +69,6 @@@
  #include <linux/blkdev.h>
  #include <linux/gfp.h>
  #include <linux/blk-mq.h>
- #include <linux/lockdep.h>
  
  #include "blk.h"
  #include "blk-mq.h"
@@@ -139,7 -138,7 +138,7 @@@ static void blk_flush_queue_rq(struct r
  
  static void blk_account_io_flush(struct request *rq)
  {
-       struct hd_struct *part = &rq->rq_disk->part0;
+       struct block_device *part = rq->rq_disk->part0;
  
        part_stat_lock();
        part_stat_inc(part, ios[STAT_FLUSH]);
@@@ -225,18 -224,13 +224,18 @@@ static void flush_end_io(struct reques
        /* release the tag's ownership to the req cloned from */
        spin_lock_irqsave(&fq->mq_flush_lock, flags);
  
 -      WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
        if (!refcount_dec_and_test(&flush_rq->ref)) {
                fq->rq_status = error;
                spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
                return;
        }
  
 +      /*
 +       * The flush request has to be marked as IDLE when it is really
 +       * ended, because its .end_io() is called from the timeout code
 +       * path too; this ordering avoids a use-after-free.
 +       */
 +      WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
        if (fq->rq_status != BLK_STS_OK)
                error = fq->rq_status;
  
@@@ -474,9 -468,6 +473,6 @@@ struct blk_flush_queue *blk_alloc_flush
        INIT_LIST_HEAD(&fq->flush_queue[1]);
        INIT_LIST_HEAD(&fq->flush_data_in_flight);
  
-       lockdep_register_key(&fq->key);
-       lockdep_set_class(&fq->mq_flush_lock, &fq->key);
        return fq;
  
   fail_rq:
@@@ -491,7 -482,31 +487,31 @@@ void blk_free_flush_queue(struct blk_fl
        if (!fq)
                return;
  
-       lockdep_unregister_key(&fq->key);
        kfree(fq->flush_rq);
        kfree(fq);
  }
+ /*
+  * Allow a driver to set its own lock class for fq->mq_flush_lock to
+  * avoid a lockdep complaint.
+  *
+  * flush_end_io() may be called recursively from some drivers, such as
+  * nvme-loop, so lockdep may complain about 'possible recursive locking'
+  * because all 'struct blk_flush_queue' instances share the same
+  * mq_flush_lock lock class key. We need to assign a different lock
+  * class to these drivers' fq->mq_flush_lock to avoid the lockdep
+  * warning.
+  *
+  * Using a dynamically allocated lock class key for each 'blk_flush_queue'
+  * instance would be overkill, and worse, it introduces a horrible boot
+  * delay because synchronize_rcu() is implied in lockdep_unregister_key(),
+  * which is called for each hctx release. SCSI probing may synchronously
+  * create and destroy lots of MQ request_queues for non-existent devices,
+  * and some robot test kernels always enable the lockdep option. More than
+  * half an hour has been observed during SCSI MQ probe with a per-fq lock
+  * class.
+  */
+ void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
+               struct lock_class_key *key)
+ {
+       lockdep_set_class(&hctx->fq->mq_flush_lock, key);
+ }
+ EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class);
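
As context for the new export (not part of the diff above): a minimal sketch of how a driver with a recursive flush path, such as nvme-loop per the shortlog, might wire this up. Only blk_mq_hctx_set_fq_lock_class() and struct lock_class_key come from this series; the loop_* names and the ops wiring are illustrative.

static struct lock_class_key loop_hctx_fq_lock_key;

static int loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
		unsigned int hctx_idx)
{
	/* give this driver's flush queues their own lockdep class */
	blk_mq_hctx_set_fq_lock_class(hctx, &loop_hctx_fq_lock_key);
	return 0;
}

static const struct blk_mq_ops loop_mq_ops = {
	/* .queue_rq and friends elided */
	.init_hctx	= loop_init_hctx,
};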
diff --combined block/blk-merge.c
@@@ -144,7 -144,7 +144,7 @@@ static struct bio *blk_bio_write_same_s
  static inline unsigned get_max_io_size(struct request_queue *q,
                                       struct bio *bio)
  {
 -      unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector);
 +      unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0);
        unsigned max_sectors = sectors;
        unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
        unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
@@@ -279,6 -279,14 +279,14 @@@ static struct bio *blk_bio_segment_spli
        return NULL;
  split:
        *segs = nsegs;
+       /*
+        * Bio splitting may cause subtle trouble, such as a hang when doing
+        * sync iopoll in the direct IO routine. Given that the performance
+        * gain of iopoll for big IO can be trivial, disable iopoll when a
+        * split is needed.
+        */
+       bio->bi_opf &= ~REQ_HIPRI;
        return bio_split(bio, sectors, GFP_NOIO, bs);
  }
  
@@@ -338,7 -346,7 +346,7 @@@ void __blk_queue_split(struct bio **bio
                split->bi_opf |= REQ_NOMERGE;
  
                bio_chain(split, *bio);
-               trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
+               trace_block_split(split, (*bio)->bi_iter.bi_sector);
                submit_bio_noacct(*bio);
                *bio = split;
        }
@@@ -683,8 -691,6 +691,6 @@@ static void blk_account_io_merge_reques
                part_stat_lock();
                part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
                part_stat_unlock();
-               hd_struct_put(req->part);
        }
  }
  
@@@ -801,7 -807,7 +807,7 @@@ static struct request *attempt_merge(st
         */
        blk_account_io_merge_request(next);
  
-       trace_block_rq_merge(q, next);
+       trace_block_rq_merge(next);
  
        /*
         * ownership of bio passed from next to req, return 'next' for
@@@ -924,7 -930,7 +930,7 @@@ static enum bio_merge_status bio_attemp
        if (!ll_back_merge_fn(req, bio, nr_segs))
                return BIO_MERGE_FAILED;
  
-       trace_block_bio_backmerge(req->q, req, bio);
+       trace_block_bio_backmerge(bio);
        rq_qos_merge(req->q, req, bio);
  
        if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
@@@ -948,7 -954,7 +954,7 @@@ static enum bio_merge_status bio_attemp
        if (!ll_front_merge_fn(req, bio, nr_segs))
                return BIO_MERGE_FAILED;
  
-       trace_block_bio_frontmerge(req->q, req, bio);
+       trace_block_bio_frontmerge(bio);
        rq_qos_merge(req->q, req, bio);
  
        if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
diff --combined block/blk-mq.c
@@@ -95,7 -95,7 +95,7 @@@ static void blk_mq_hctx_clear_pending(s
  }
  
  struct mq_inflight {
-       struct hd_struct *part;
+       struct block_device *part;
        unsigned int inflight[2];
  };
  
@@@ -105,13 -105,15 +105,15 @@@ static bool blk_mq_check_inflight(struc
  {
        struct mq_inflight *mi = priv;
  
-       if (rq->part == mi->part && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
+       if ((!mi->part->bd_partno || rq->part == mi->part) &&
+           blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
                mi->inflight[rq_data_dir(rq)]++;
  
        return true;
  }
  
- unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
+ unsigned int blk_mq_in_flight(struct request_queue *q,
+               struct block_device *part)
  {
        struct mq_inflight mi = { .part = part };
  
        return mi.inflight[0] + mi.inflight[1];
  }
  
- void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
-                        unsigned int inflight[2])
+ void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
+               unsigned int inflight[2])
  {
        struct mq_inflight mi = { .part = part };
  
@@@ -671,7 -673,9 +673,7 @@@ bool blk_mq_complete_request_remote(str
                return false;
  
        if (blk_mq_complete_need_ipi(rq)) {
 -              rq->csd.func = __blk_mq_complete_request_remote;
 -              rq->csd.info = rq;
 -              rq->csd.flags = 0;
 +              INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
                smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
        } else {
                if (rq->q->nr_hw_queues > 1)
@@@ -729,7 -733,7 +731,7 @@@ void blk_mq_start_request(struct reques
  {
        struct request_queue *q = rq->q;
  
-       trace_block_rq_issue(q, rq);
+       trace_block_rq_issue(rq);
  
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
                rq->io_start_time_ns = ktime_get_ns();
@@@ -756,7 -760,7 +758,7 @@@ static void __blk_mq_requeue_request(st
  
        blk_mq_put_driver_tag(rq);
  
-       trace_block_rq_requeue(q, rq);
+       trace_block_rq_requeue(rq);
        rq_qos_requeue(q, rq);
  
        if (blk_mq_request_started(rq)) {
@@@ -1590,7 -1594,7 +1592,7 @@@ select_cpu
   * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
   * @hctx: Pointer to the hardware queue to run.
   * @async: If we want to run the queue asynchronously.
-  * @msecs: Microseconds of delay to wait before running the queue.
+  * @msecs: Milliseconds of delay to wait before running the queue.
   *
   * If !@async, try to run the queue now. Else, run the queue asynchronously and
   * with a delay of @msecs.
@@@ -1619,7 -1623,7 +1621,7 @@@ static void __blk_mq_delay_run_hw_queue
  /**
   * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
   * @hctx: Pointer to the hardware queue to run.
-  * @msecs: Microseconds of delay to wait before running the queue.
+  * @msecs: Milliseconds of delay to wait before running the queue.
   *
   * Run a hardware queue asynchronously with a delay of @msecs.
   */
@@@ -1683,7 -1687,7 +1685,7 @@@ EXPORT_SYMBOL(blk_mq_run_hw_queues)
  /**
   * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
   * @q: Pointer to the request queue to run.
-  * @msecs: Microseconds of delay to wait before running the queues.
+  * @msecs: Milliseconds of delay to wait before running the queues.
   */
  void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
  {
@@@ -1817,7 -1821,7 +1819,7 @@@ static inline void __blk_mq_insert_req_
  
        lockdep_assert_held(&ctx->lock);
  
-       trace_block_rq_insert(hctx->queue, rq);
+       trace_block_rq_insert(rq);
  
        if (at_head)
                list_add(&rq->queuelist, &ctx->rq_lists[type]);
@@@ -1874,7 -1878,7 +1876,7 @@@ void blk_mq_insert_requests(struct blk_
         */
        list_for_each_entry(rq, list, queuelist) {
                BUG_ON(rq->mq_ctx != ctx);
-               trace_block_rq_insert(hctx->queue, rq);
+               trace_block_rq_insert(rq);
        }
  
        spin_lock(&ctx->lock);
@@@ -2155,6 -2159,7 +2157,7 @@@ blk_qc_t blk_mq_submit_bio(struct bio *
        unsigned int nr_segs;
        blk_qc_t cookie;
        blk_status_t ret;
+       bool hipri;
  
        blk_queue_bounce(q, &bio);
        __blk_queue_split(&bio, &nr_segs);
  
        rq_qos_throttle(q, bio);
  
+       hipri = bio->bi_opf & REQ_HIPRI;
        data.cmd_flags = bio->bi_opf;
        rq = __blk_mq_alloc_request(&data);
        if (unlikely(!rq)) {
                goto queue_exit;
        }
  
-       trace_block_getrq(q, bio, bio->bi_opf);
+       trace_block_getrq(bio);
  
        rq_qos_track(q, rq, bio);
  
                blk_mq_sched_insert_request(rq, false, true, true);
        }
  
+       if (!hipri)
+               return BLK_QC_T_NONE;
        return cookie;
  queue_exit:
        blk_queue_exit(q);
@@@ -3373,6 -3382,12 +3380,12 @@@ static int blk_mq_realloc_tag_set_tags(
        return 0;
  }
  
+ static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
+                               int new_nr_hw_queues)
+ {
+       return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
+ }
  /*
   * Alloc a tag set to be associated with one or more request queues.
   * May fail with EINVAL for various error conditions. May adjust the
@@@ -3426,7 -3441,7 +3439,7 @@@ int blk_mq_alloc_tag_set(struct blk_mq_
        if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
                set->nr_hw_queues = nr_cpu_ids;
  
-       if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
+       if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
                return -ENOMEM;
  
        ret = -ENOMEM;
@@@ -3861,9 -3876,10 +3874,10 @@@ int blk_poll(struct request_queue *q, b
         * the state. Like for the other success return cases, the
         * caller is responsible for checking if the IO completed. If
         * the IO isn't complete, we'll get called again and will go
-        * straight to the busy poll loop.
+        * straight to the busy poll loop. If specified not to spin,
+        * we also should not sleep.
         */
-       if (blk_mq_poll_hybrid(q, hctx, cookie))
+       if (spin && blk_mq_poll_hybrid(q, hctx, cookie))
                return 1;
  
        hctx->poll_considered++;
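
The REQ_HIPRI/cookie handling in blk_mq_submit_bio() above pairs with the submitter's polling loop. A rough sketch of that caller side, modeled on the direct-IO completion wait (bio, bdev and the done flag are illustrative, not taken from this merge): when the bio was split, REQ_HIPRI has already been cleared in blk-merge.c, the returned cookie is BLK_QC_T_NONE, and the loop falls back to sleeping instead of polling.

	blk_qc_t qc = submit_bio(bio);

	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(done))
			break;
		/* split or non-polled bio: no valid cookie, just sleep */
		if (qc == BLK_QC_T_NONE ||
		    blk_poll(bdev_get_queue(bdev), qc, true) <= 0)
			blk_io_schedule();
	}
	__set_current_state(TASK_RUNNING);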
diff --combined drivers/block/xen-blkback/common.h
@@@ -288,7 -288,9 +288,7 @@@ struct xen_blkif_ring 
        struct work_struct      persistent_purge_work;
  
        /* Buffer of free pages to map grant refs. */
 -      spinlock_t              free_pages_lock;
 -      int                     free_pages_num;
 -      struct list_head        free_pages;
 +      struct gnttab_page_cache free_pages;
  
        struct work_struct      free_work;
        /* Thread shutdown wait queue. */
@@@ -356,9 -358,7 +356,7 @@@ struct pending_req 
  };
  
  
- #define vbd_sz(_v)    ((_v)->bdev->bd_part ? \
-                        (_v)->bdev->bd_part->nr_sects : \
-                         get_capacity((_v)->bdev->bd_disk))
+ #define vbd_sz(_v)    bdev_nr_sectors((_v)->bdev)
  
  #define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt))
  #define xen_blkif_put(_b)                             \
diff --combined drivers/block/zram/zram_drv.c
@@@ -42,7 -42,7 +42,7 @@@ static DEFINE_IDR(zram_index_idr)
  static DEFINE_MUTEX(zram_index_mutex);
  
  static int zram_major;
 -static const char *default_compressor = "lzo-rle";
 +static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
  
  /* Module params (documentation at end) */
  static unsigned int num_devices = 1;
@@@ -403,13 -403,10 +403,10 @@@ static void reset_bdev(struct zram *zra
                return;
  
        bdev = zram->bdev;
-       if (zram->old_block_size)
-               set_blocksize(bdev, zram->old_block_size);
        blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
        /* hope filp_close flush all of IO */
        filp_close(zram->backing_dev, NULL);
        zram->backing_dev = NULL;
-       zram->old_block_size = 0;
        zram->bdev = NULL;
        zram->disk->fops = &zram_devops;
        kvfree(zram->bitmap);
@@@ -454,7 -451,7 +451,7 @@@ static ssize_t backing_dev_store(struc
        struct file *backing_dev = NULL;
        struct inode *inode;
        struct address_space *mapping;
-       unsigned int bitmap_sz, old_block_size = 0;
+       unsigned int bitmap_sz;
        unsigned long nr_pages, *bitmap = NULL;
        struct block_device *bdev = NULL;
        int err;
                goto out;
        }
  
-       old_block_size = block_size(bdev);
-       err = set_blocksize(bdev, PAGE_SIZE);
-       if (err)
-               goto out;
        reset_bdev(zram);
  
-       zram->old_block_size = old_block_size;
        zram->bdev = bdev;
        zram->backing_dev = backing_dev;
        zram->bitmap = bitmap;
@@@ -620,19 -611,15 +611,19 @@@ static int read_from_bdev_async(struct 
        return 1;
  }
  
 +#define PAGE_WB_SIG "page_index="
 +
 +#define PAGE_WRITEBACK 0
  #define HUGE_WRITEBACK 1
  #define IDLE_WRITEBACK 2
  
 +
  static ssize_t writeback_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t len)
  {
        struct zram *zram = dev_to_zram(dev);
        unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
 -      unsigned long index;
 +      unsigned long index = 0;
        struct bio bio;
        struct bio_vec bio_vec;
        struct page *page;
                mode = IDLE_WRITEBACK;
        else if (sysfs_streq(buf, "huge"))
                mode = HUGE_WRITEBACK;
 -      else
 -              return -EINVAL;
 +      else {
 +              if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1))
 +                      return -EINVAL;
 +
 +              ret = kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index);
 +              if (ret || index >= nr_pages)
 +                      return -EINVAL;
 +
 +              nr_pages = 1;
 +              mode = PAGE_WRITEBACK;
 +      }
  
        down_read(&zram->init_lock);
        if (!init_done(zram)) {
                goto release_init_lock;
        }
  
 -      for (index = 0; index < nr_pages; index++) {
 +      while (nr_pages--) {
                struct bio_vec bvec;
  
                bvec.bv_page = page;
@@@ -1084,7 -1062,7 +1075,7 @@@ static ssize_t mm_stat_show(struct devi
        max_used = atomic_long_read(&zram->stats.max_used_pages);
  
        ret = scnprintf(buf, PAGE_SIZE,
 -                      "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu\n",
 +                      "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
                        orig_size << PAGE_SHIFT,
                        (u64)atomic64_read(&zram->stats.compr_data_size),
                        mem_used << PAGE_SHIFT,
                        max_used << PAGE_SHIFT,
                        (u64)atomic64_read(&zram->stats.same_pages),
                        pool_stats.pages_compacted,
 -                      (u64)atomic64_read(&zram->stats.huge_pages));
 +                      (u64)atomic64_read(&zram->stats.huge_pages),
 +                      (u64)atomic64_read(&zram->stats.huge_pages_since));
        up_read(&zram->init_lock);
  
        return ret;
@@@ -1425,7 -1402,6 +1416,7 @@@ out
        if (comp_len == PAGE_SIZE) {
                zram_set_flag(zram, index, ZRAM_HUGE);
                atomic64_inc(&zram->stats.huge_pages);
 +              atomic64_inc(&zram->stats.huge_pages_since);
        }
  
        if (flags) {
@@@ -1710,8 -1686,8 +1701,8 @@@ static void zram_reset_device(struct zr
        disksize = zram->disksize;
        zram->disksize = 0;
  
-       set_capacity(zram->disk, 0);
-       part_stat_set_all(&zram->disk->part0, 0);
+       set_capacity_and_notify(zram->disk, 0);
+       part_stat_set_all(zram->disk->part0, 0);
  
        up_write(&zram->init_lock);
        /* I/O operation under all of CPU are done so let's free */
@@@ -1756,9 -1732,7 +1747,7 @@@ static ssize_t disksize_store(struct de
  
        zram->comp = comp;
        zram->disksize = disksize;
-       set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
-       revalidate_disk_size(zram->disk, true);
+       set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT);
        up_write(&zram->init_lock);
  
        return len;
@@@ -1786,15 -1760,12 +1775,12 @@@ static ssize_t reset_store(struct devic
                return -EINVAL;
  
        zram = dev_to_zram(dev);
-       bdev = bdget_disk(zram->disk, 0);
-       if (!bdev)
-               return -ENOMEM;
+       bdev = zram->disk->part0;
  
        mutex_lock(&bdev->bd_mutex);
        /* Do not reset an active device or claimed device */
        if (bdev->bd_openers || zram->claim) {
                mutex_unlock(&bdev->bd_mutex);
-               bdput(bdev);
                return -EBUSY;
        }
  
        /* Make sure all the pending I/O are finished */
        fsync_bdev(bdev);
        zram_reset_device(zram);
-       revalidate_disk_size(zram->disk, true);
-       bdput(bdev);
  
        mutex_lock(&bdev->bd_mutex);
        zram->claim = false;
@@@ -1992,16 -1961,11 +1976,11 @@@ out_free_dev
  
  static int zram_remove(struct zram *zram)
  {
-       struct block_device *bdev;
-       bdev = bdget_disk(zram->disk, 0);
-       if (!bdev)
-               return -ENOMEM;
+       struct block_device *bdev = zram->disk->part0;
  
        mutex_lock(&bdev->bd_mutex);
        if (bdev->bd_openers || zram->claim) {
                mutex_unlock(&bdev->bd_mutex);
-               bdput(bdev);
                return -EBUSY;
        }
  
        /* Make sure all the pending I/O are finished */
        fsync_bdev(bdev);
        zram_reset_device(zram);
-       bdput(bdev);
  
        pr_info("Removed device: %s\n", zram->disk->disk_name);
  
diff --combined drivers/block/zram/zram_drv.h
@@@ -78,7 -78,6 +78,7 @@@ struct zram_stats 
        atomic64_t notify_free; /* no. of swap slot free notifications */
        atomic64_t same_pages;          /* no. of same element filled pages */
        atomic64_t huge_pages;          /* no. of huge pages */
 +      atomic64_t huge_pages_since;    /* no. of huge pages since zram set up */
        atomic64_t pages_stored;        /* no. of pages currently stored */
        atomic_long_t max_used_pages;   /* no. of maximum pages stored */
        atomic64_t writestall;          /* no. of write slow paths */
@@@ -119,7 -118,6 +119,6 @@@ struct zram 
        bool wb_limit_enable;
        u64 bd_wb_limit;
        struct block_device *bdev;
-       unsigned int old_block_size;
        unsigned long *bitmap;
        unsigned long nr_pages;
  #endif
diff --combined drivers/ide/ide-probe.c
@@@ -902,65 -902,14 +902,14 @@@ out_up
        return 1;
  }
  
- static int ata_lock(dev_t dev, void *data)
+ static void ata_probe(dev_t dev)
  {
-       /* FIXME: we want to pin hwif down */
-       return 0;
+       request_module("ide-disk");
+       request_module("ide-cd");
+       request_module("ide-tape");
+       request_module("ide-floppy");
  }
  
- static struct kobject *ata_probe(dev_t dev, int *part, void *data)
- {
-       ide_hwif_t *hwif = data;
-       int unit = *part >> PARTN_BITS;
-       ide_drive_t *drive = hwif->devices[unit];
-       if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0)
-               return NULL;
-       if (drive->media == ide_disk)
-               request_module("ide-disk");
-       if (drive->media == ide_cdrom || drive->media == ide_optical)
-               request_module("ide-cd");
-       if (drive->media == ide_tape)
-               request_module("ide-tape");
-       if (drive->media == ide_floppy)
-               request_module("ide-floppy");
-       return NULL;
- }
- static struct kobject *exact_match(dev_t dev, int *part, void *data)
- {
-       struct gendisk *p = data;
-       *part &= (1 << PARTN_BITS) - 1;
-       return &disk_to_dev(p)->kobj;
- }
- static int exact_lock(dev_t dev, void *data)
- {
-       struct gendisk *p = data;
-       if (!get_disk_and_module(p))
-               return -1;
-       return 0;
- }
- void ide_register_region(struct gendisk *disk)
- {
-       blk_register_region(MKDEV(disk->major, disk->first_minor),
-                           disk->minors, NULL, exact_match, exact_lock, disk);
- }
- EXPORT_SYMBOL_GPL(ide_register_region);
- void ide_unregister_region(struct gendisk *disk)
- {
-       blk_unregister_region(MKDEV(disk->major, disk->first_minor),
-                             disk->minors);
- }
- EXPORT_SYMBOL_GPL(ide_unregister_region);
  void ide_init_disk(struct gendisk *disk, ide_drive_t *drive)
  {
        ide_hwif_t *hwif = drive->hwif;
@@@ -999,7 -948,7 +948,7 @@@ static int hwif_init(ide_hwif_t *hwif
                return 0;
        }
  
-       if (register_blkdev(hwif->major, hwif->name))
+       if (__register_blkdev(hwif->major, hwif->name, ata_probe))
                return 0;
  
        if (!hwif->sg_max_nents)
                goto out;
        }
  
-       blk_register_region(MKDEV(hwif->major, 0), MAX_DRIVES << PARTN_BITS,
-                           THIS_MODULE, ata_probe, ata_lock, hwif);
        return 1;
  
  out:
@@@ -1592,6 -1539,9 +1539,6 @@@ EXPORT_SYMBOL_GPL(ide_port_unregister_d
  
  static void ide_unregister(ide_hwif_t *hwif)
  {
 -      BUG_ON(in_interrupt());
 -      BUG_ON(irqs_disabled());
 -
        mutex_lock(&ide_cfg_mtx);
  
        if (hwif->present) {
        /*
         * Remove us from the kernel's knowledge
         */
-       blk_unregister_region(MKDEV(hwif->major, 0), MAX_DRIVES<<PARTN_BITS);
        kfree(hwif->sg_table);
        unregister_blkdev(hwif->major, hwif->name);
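
The ide conversion above illustrates the probe-based major registration that replaces the blk_register_region()/blk_unregister_region() machinery. A generic sketch with made-up names (MYDRV_MAJOR, "mydrv-core" and the mydrv_* functions are assumptions, not from this merge); the probe callback is expected to run when an unclaimed dev_t under this major is first looked up.

static void mydrv_probe(dev_t devt)
{
	/* load whatever module can provide a gendisk for this dev_t */
	request_module("mydrv-core");
}

static int __init mydrv_init(void)
{
	int ret = __register_blkdev(MYDRV_MAJOR, "mydrv", mydrv_probe);

	if (ret < 0)
		return ret;
	return 0;
}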
  
diff --combined drivers/md/dm-raid.c
@@@ -700,8 -700,7 +700,7 @@@ static void rs_set_capacity(struct raid
  {
        struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
  
-       set_capacity(gendisk, rs->md.array_sectors);
-       revalidate_disk_size(gendisk, true);
+       set_capacity_and_notify(gendisk, rs->md.array_sectors);
  }
  
  /*
@@@ -3728,15 -3727,6 +3727,15 @@@ static void raid_io_hints(struct dm_tar
  
        blk_limits_io_min(limits, chunk_size_bytes);
        blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
 +
 +      /*
 +       * RAID1 and RAID10 personalities require bio splitting,
 +       * RAID0/4/5/6 don't and process large discard bios properly.
 +       */
 +      if (rs_is_raid1(rs) || rs_is_raid10(rs)) {
 +              limits->discard_granularity = chunk_size_bytes;
 +              limits->max_discard_sectors = rs->md.chunk_sectors;
 +      }
  }
  
  static void raid_postsuspend(struct dm_target *ti)
diff --combined drivers/md/dm-table.c
@@@ -18,6 -18,7 +18,6 @@@
  #include <linux/mutex.h>
  #include <linux/delay.h>
  #include <linux/atomic.h>
 -#include <linux/lcm.h>
  #include <linux/blk-mq.h>
  #include <linux/mount.h>
  #include <linux/dax.h>
@@@ -347,16 -348,9 +347,9 @@@ static int upgrade_mode(struct dm_dev_i
  dev_t dm_get_dev_t(const char *path)
  {
        dev_t dev;
-       struct block_device *bdev;
  
-       bdev = lookup_bdev(path);
-       if (IS_ERR(bdev))
+       if (lookup_bdev(path, &dev))
                dev = name_to_dev_t(path);
-       else {
-               dev = bdev->bd_dev;
-               bdput(bdev);
-       }
        return dev;
  }
  EXPORT_SYMBOL_GPL(dm_get_dev_t);
@@@ -1246,6 -1240,12 +1239,6 @@@ void dm_table_event_callback(struct dm_
  
  void dm_table_event(struct dm_table *t)
  {
 -      /*
 -       * You can no longer call dm_table_event() from interrupt
 -       * context, use a bottom half instead.
 -       */
 -      BUG_ON(in_interrupt());
 -
        mutex_lock(&_event_lock);
        if (t->event_fn)
                t->event_fn(t->event_context);
@@@ -1448,6 -1448,10 +1441,6 @@@ int dm_calculate_queue_limits(struct dm
                        zone_sectors = ti_limits.chunk_sectors;
                }
  
 -              /* Stack chunk_sectors if target-specific splitting is required */
 -              if (ti->max_io_len)
 -                      ti_limits.chunk_sectors = lcm_not_zero(ti->max_io_len,
 -                                                             ti_limits.chunk_sectors);
                /* Set I/O hints portion of queue limits */
                if (ti->type->io_hints)
                        ti->type->io_hints(ti, &ti_limits);
diff --combined drivers/md/dm.c
@@@ -476,10 -476,8 +476,10 @@@ static int dm_blk_report_zones(struct g
                return -EAGAIN;
  
        map = dm_get_live_table(md, &srcu_idx);
 -      if (!map)
 -              return -EIO;
 +      if (!map) {
 +              ret = -EIO;
 +              goto out;
 +      }
  
        do {
                struct dm_target *tgt;
@@@ -509,6 -507,7 +509,6 @@@ out
  
  static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
                            struct block_device **bdev)
 -      __acquires(md->io_barrier)
  {
        struct dm_target *tgt;
        struct dm_table *map;
@@@ -542,6 -541,7 +542,6 @@@ retry
  }
  
  static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
 -      __releases(md->io_barrier)
  {
        dm_put_live_table(md, srcu_idx);
  }
@@@ -570,7 -570,10 +570,10 @@@ static int dm_blk_ioctl(struct block_de
                }
        }
  
-       r =  __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+       if (!bdev->bd_disk->fops->ioctl)
+               r = -ENOTTY;
+       else
+               r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
  out:
        dm_unprepare_ioctl(md, srcu_idx);
        return r;
@@@ -1037,18 -1040,15 +1040,18 @@@ static sector_t max_io_len(struct dm_ta
        sector_t max_len;
  
        /*
 -       * Does the target need to split even further?
 -       * - q->limits.chunk_sectors reflects ti->max_io_len so
 -       *   blk_max_size_offset() provides required splitting.
 -       * - blk_max_size_offset() also respects q->limits.max_sectors
 +       * Does the target need to split IO even further?
 +       * - varied (per target) IO splitting is a tenet of DM; this
 +       *   explains why stacked chunk_sectors based splitting via
 +       *   blk_max_size_offset() isn't possible here. So pass in
 +       *   ti->max_io_len to override stacked chunk_sectors.
         */
 -      max_len = blk_max_size_offset(ti->table->md->queue,
 -                                    target_offset);
 -      if (len > max_len)
 -              len = max_len;
 +      if (ti->max_io_len) {
 +              max_len = blk_max_size_offset(ti->table->md->queue,
 +                                            target_offset, ti->max_io_len);
 +              if (len > max_len)
 +                      len = max_len;
 +      }
  
        return len;
  }
@@@ -1199,9 -1199,11 +1202,9 @@@ static int dm_dax_zero_page_range(struc
                 * ->zero_page_range() is mandatory dax operation. If we are
                 *  here, something is wrong.
                 */
 -              dm_put_live_table(md, srcu_idx);
                goto out;
        }
        ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
 -
   out:
        dm_put_live_table(md, srcu_idx);
  
@@@ -1274,8 -1276,7 +1277,7 @@@ static blk_qc_t __map_bio(struct dm_tar
                break;
        case DM_MAPIO_REMAPPED:
                /* the bio has been remapped so dispatch it */
-               trace_block_bio_remap(clone->bi_disk->queue, clone,
-                                     bio_dev(io->orig_bio), sector);
+               trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector);
                ret = submit_bio_noacct(clone);
                break;
        case DM_MAPIO_KILL:
@@@ -1420,18 -1421,12 +1422,12 @@@ static int __send_empty_flush(struct cl
         */
        bio_init(&flush_bio, NULL, 0);
        flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+       flush_bio.bi_disk = ci->io->md->disk;
+       bio_associate_blkg(&flush_bio);
        ci->bio = &flush_bio;
        ci->sector_count = 0;
  
-       /*
-        * Empty flush uses a statically initialized bio, as the base for
-        * cloning.  However, blkg association requires that a bdev is
-        * associated with a gendisk, which doesn't happen until the bdev is
-        * opened.  So, blkg association is done at issue time of the flush
-        * rather than when the device is created in alloc_dev().
-        */
-       bio_set_dev(ci->bio, ci->io->md->bdev);
        BUG_ON(bio_has_data(ci->bio));
        while ((ti = dm_table_get_target(ci->map, target_nr++)))
                __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
@@@ -1611,12 -1606,12 +1607,12 @@@ static blk_qc_t __split_and_process_bio
                                 * (by eliminating DM's splitting and just using bio_split)
                                 */
                                part_stat_lock();
-                               __dm_part_stat_sub(&dm_disk(md)->part0,
+                               __dm_part_stat_sub(dm_disk(md)->part0,
                                                   sectors[op_stat_group(bio_op(bio))], ci.sector_count);
                                part_stat_unlock();
  
                                bio_chain(b, bio);
-                               trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
+                               trace_block_split(b, bio->bi_iter.bi_sector);
                                ret = submit_bio_noacct(bio);
                                break;
                        }
@@@ -1748,11 -1743,6 +1744,6 @@@ static void cleanup_mapped_device(struc
  
        cleanup_srcu_struct(&md->io_barrier);
  
-       if (md->bdev) {
-               bdput(md->bdev);
-               md->bdev = NULL;
-       }
        mutex_destroy(&md->suspend_lock);
        mutex_destroy(&md->type_lock);
        mutex_destroy(&md->table_devices_lock);
@@@ -1844,10 -1834,6 +1835,6 @@@ static struct mapped_device *alloc_dev(
        if (!md->wq)
                goto bad;
  
-       md->bdev = bdget_disk(md->disk, 0);
-       if (!md->bdev)
-               goto bad;
        dm_stats_init(&md->stats);
  
        /* Populate the mapping, nobody knows we exist yet */
@@@ -1972,8 -1958,7 +1959,7 @@@ static struct dm_table *__bind(struct m
        if (size != dm_get_size(md))
                memset(&md->geometry, 0, sizeof(md->geometry));
  
-       set_capacity(md->disk, size);
-       bd_set_nr_sectors(md->bdev, size);
+       set_capacity_and_notify(md->disk, size);
  
        dm_table_event_callback(t, event_callback, md);
  
@@@ -2256,7 -2241,7 +2242,7 @@@ EXPORT_SYMBOL_GPL(dm_put)
  static bool md_in_flight_bios(struct mapped_device *md)
  {
        int cpu;
-       struct hd_struct *part = &dm_disk(md)->part0;
+       struct block_device *part = dm_disk(md)->part0;
        long sum = 0;
  
        for_each_possible_cpu(cpu) {
@@@ -2391,27 -2376,19 +2377,19 @@@ static int lock_fs(struct mapped_devic
  {
        int r;
  
-       WARN_ON(md->frozen_sb);
-       md->frozen_sb = freeze_bdev(md->bdev);
-       if (IS_ERR(md->frozen_sb)) {
-               r = PTR_ERR(md->frozen_sb);
-               md->frozen_sb = NULL;
-               return r;
-       }
-       set_bit(DMF_FROZEN, &md->flags);
+       WARN_ON(test_bit(DMF_FROZEN, &md->flags));
  
-       return 0;
+       r = freeze_bdev(md->disk->part0);
+       if (!r)
+               set_bit(DMF_FROZEN, &md->flags);
+       return r;
  }
  
  static void unlock_fs(struct mapped_device *md)
  {
        if (!test_bit(DMF_FROZEN, &md->flags))
                return;
-       thaw_bdev(md->bdev, md->frozen_sb);
-       md->frozen_sb = NULL;
+       thaw_bdev(md->disk->part0);
        clear_bit(DMF_FROZEN, &md->flags);
  }
  
diff --combined drivers/md/md.c
@@@ -464,7 -464,7 +464,7 @@@ struct md_io 
        bio_end_io_t *orig_bi_end_io;
        void *orig_bi_private;
        unsigned long start_time;
-       struct hd_struct *part;
+       struct block_device *part;
  };
  
  static void md_end_io(struct bio *bio)
@@@ -2414,7 -2414,6 +2414,6 @@@ EXPORT_SYMBOL(md_integrity_add_rdev)
  static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
  {
        char b[BDEVNAME_SIZE];
-       struct kobject *ko;
        int err;
  
        /* prevent duplicates */
        if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
                goto fail;
  
-       ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
        /* failure here is OK */
-       err = sysfs_create_link(&rdev->kobj, ko, "block");
+       err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block");
        rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
        rdev->sysfs_unack_badblocks =
                sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
@@@ -5355,10 -5353,9 +5353,9 @@@ array_size_store(struct mddev *mddev, c
  
        if (!err) {
                mddev->array_sectors = sectors;
-               if (mddev->pers) {
-                       set_capacity(mddev->gendisk, mddev->array_sectors);
-                       revalidate_disk_size(mddev->gendisk, true);
-               }
+               if (mddev->pers)
+                       set_capacity_and_notify(mddev->gendisk,
+                                               mddev->array_sectors);
        }
        mddev_unlock(mddev);
        return err ?: len;
@@@ -5765,11 -5762,12 +5762,12 @@@ static int md_alloc(dev_t dev, char *na
        return error;
  }
  
- static struct kobject *md_probe(dev_t dev, int *part, void *data)
+ static void md_probe(dev_t dev)
  {
+       if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
+               return;
        if (create_on_open)
                md_alloc(dev, NULL);
-       return NULL;
  }
  
  static int add_named_array(const char *val, const struct kernel_param *kp)
@@@ -6107,8 -6105,7 +6105,7 @@@ int do_md_run(struct mddev *mddev
        md_wakeup_thread(mddev->thread);
        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
  
-       set_capacity(mddev->gendisk, mddev->array_sectors);
-       revalidate_disk_size(mddev->gendisk, true);
+       set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
        clear_bit(MD_NOT_READY, &mddev->flags);
        mddev->changed = 1;
        kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
@@@ -6423,10 -6420,9 +6420,9 @@@ static int do_md_stop(struct mddev *mdd
                        if (rdev->raid_disk >= 0)
                                sysfs_unlink_rdev(mddev, rdev);
  
-               set_capacity(disk, 0);
+               set_capacity_and_notify(disk, 0);
                mutex_unlock(&mddev->open_mutex);
                mddev->changed = 1;
-               revalidate_disk_size(disk, true);
  
                if (mddev->ro)
                        mddev->ro = 0;
@@@ -6535,7 -6531,7 +6531,7 @@@ static void autorun_devices(int part
                        break;
                }
  
-               md_probe(dev, NULL, NULL);
+               md_probe(dev);
                mddev = mddev_find(dev);
                if (!mddev || !mddev->gendisk) {
                        if (mddev)
@@@ -7257,8 -7253,8 +7253,8 @@@ static int update_size(struct mddev *md
                if (mddev_is_clustered(mddev))
                        md_cluster_ops->update_size(mddev, old_dev_sectors);
                else if (mddev->queue) {
-                       set_capacity(mddev->gendisk, mddev->array_sectors);
-                       revalidate_disk_size(mddev->gendisk, true);
+                       set_capacity_and_notify(mddev->gendisk,
+                                               mddev->array_sectors);
                }
        }
        return rv;
@@@ -7480,7 -7476,6 +7476,6 @@@ static inline bool md_ioctl_valid(unsig
  {
        switch (cmd) {
        case ADD_NEW_DISK:
-       case BLKROSET:
        case GET_ARRAY_INFO:
        case GET_BITMAP_FILE:
        case GET_DISK_INFO:
@@@ -7507,7 -7502,6 +7502,6 @@@ static int md_ioctl(struct block_devic
        int err = 0;
        void __user *argp = (void __user *)arg;
        struct mddev *mddev = NULL;
-       int ro;
        bool did_set_md_closing = false;
  
        if (!md_ioctl_valid(cmd))
                        goto unlock;
                }
                break;
-       case BLKROSET:
-               if (get_user(ro, (int __user *)(arg))) {
-                       err = -EFAULT;
-                       goto unlock;
-               }
-               err = -EINVAL;
-               /* if the bdev is going readonly the value of mddev->ro
-                * does not matter, no writes are coming
-                */
-               if (ro)
-                       goto unlock;
-               /* are we are already prepared for writes? */
-               if (mddev->ro != 1)
-                       goto unlock;
-               /* transitioning to readauto need only happen for
-                * arrays that call md_write_start
-                */
-               if (mddev->pers) {
-                       err = restart_array(mddev);
-                       if (err == 0) {
-                               mddev->ro = 2;
-                               set_disk_ro(mddev->gendisk, 0);
-                       }
-               }
-               goto unlock;
        }
  
        /*
@@@ -7809,6 -7774,36 +7774,36 @@@ static int md_compat_ioctl(struct block
  }
  #endif /* CONFIG_COMPAT */
  
+ static int md_set_read_only(struct block_device *bdev, bool ro)
+ {
+       struct mddev *mddev = bdev->bd_disk->private_data;
+       int err;
+       err = mddev_lock(mddev);
+       if (err)
+               return err;
+       if (!mddev->raid_disks && !mddev->external) {
+               err = -ENODEV;
+               goto out_unlock;
+       }
+       /*
+        * Transitioning to read-auto need only happen for arrays that call
+        * md_write_start and which are not ready for writes yet.
+        */
+       if (!ro && mddev->ro == 1 && mddev->pers) {
+               err = restart_array(mddev);
+               if (err)
+                       goto out_unlock;
+               mddev->ro = 2;
+       }
+ out_unlock:
+       mddev_unlock(mddev);
+       return err;
+ }
  static int md_open(struct block_device *bdev, fmode_t mode)
  {
        /*
@@@ -7886,6 -7881,7 +7881,7 @@@ const struct block_device_operations md
  #endif
        .getgeo         = md_getgeo,
        .check_events   = md_check_events,
+       .set_read_only  = md_set_read_only,
  };
  
  static int md_thread(void *arg)
@@@ -8445,7 -8441,7 +8441,7 @@@ static int is_mddev_idle(struct mddev *
        rcu_read_lock();
        rdev_for_each_rcu(rdev, mddev) {
                struct gendisk *disk = rdev->bdev->bd_disk;
-               curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
+               curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
                              atomic_read(&disk->sync_io);
                /* sync IO will cause sync_io to increase before the disk_stats
                 * as sync_io is counted when a request starts, and
@@@ -8582,6 -8578,25 +8578,6 @@@ void md_write_end(struct mddev *mddev
  
  EXPORT_SYMBOL(md_write_end);
  
 -/* This is used by raid0 and raid10 */
 -void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 -                      struct bio *bio, sector_t start, sector_t size)
 -{
 -      struct bio *discard_bio = NULL;
 -
 -      if (__blkdev_issue_discard(rdev->bdev, start, size,
 -              GFP_NOIO, 0, &discard_bio) || !discard_bio)
 -              return;
 -
 -      bio_chain(discard_bio, bio);
 -      bio_clone_blkg_association(discard_bio, bio);
 -      if (mddev->gendisk)
 -              trace_block_bio_remap(discard_bio, disk_devt(mddev->gendisk),
 -                                    bio->bi_iter.bi_sector);
 -      submit_bio_noacct(discard_bio);
 -}
 -EXPORT_SYMBOL(md_submit_discard_bio);
 -
  /* md_allow_write(mddev)
   * Calling this ensures that the array is marked 'active' so that writes
   * may proceed without blocking.  It is important to call this before
@@@ -9015,10 -9030,9 +9011,9 @@@ void md_do_sync(struct md_thread *threa
                mddev_lock_nointr(mddev);
                md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
                mddev_unlock(mddev);
-               if (!mddev_is_clustered(mddev)) {
-                       set_capacity(mddev->gendisk, mddev->array_sectors);
-                       revalidate_disk_size(mddev->gendisk, true);
-               }
+               if (!mddev_is_clustered(mddev))
+                       set_capacity_and_notify(mddev->gendisk,
+                                               mddev->array_sectors);
        }
  
        spin_lock(&mddev->lock);
@@@ -9547,18 -9561,15 +9542,15 @@@ static int __init md_init(void
        if (!md_rdev_misc_wq)
                goto err_rdev_misc_wq;
  
-       if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
+       ret = __register_blkdev(MD_MAJOR, "md", md_probe);
+       if (ret < 0)
                goto err_md;
  
-       if ((ret = register_blkdev(0, "mdp")) < 0)
+       ret = __register_blkdev(0, "mdp", md_probe);
+       if (ret < 0)
                goto err_mdp;
        mdp_major = ret;
  
-       blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
-                           md_probe, NULL, NULL);
-       blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
-                           md_probe, NULL, NULL);
        register_reboot_notifier(&md_notifier);
        raid_table_header = register_sysctl_table(raid_root_table);
  
@@@ -9825,9 -9836,6 +9817,6 @@@ static __exit void md_exit(void
        struct list_head *tmp;
        int delay = 1;
  
-       blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
-       blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
        unregister_blkdev(MD_MAJOR,"md");
        unregister_blkdev(mdp_major, "mdp");
        unregister_reboot_notifier(&md_notifier);
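
With md's BLKROSET branch removed, read-only transitions reach the driver through the new ->set_read_only() method added to md_fops above. Roughly, the generic BLKROSET handler is then expected to look like the sketch below; this is reconstructed for illustration from the method's signature, the real code lives in block/ioctl.c and may differ in detail.

static int blkdev_roset(struct block_device *bdev, fmode_t mode,
		unsigned cmd, unsigned long arg)
{
	int ret, n;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (get_user(n, (int __user *)arg))
		return -EFAULT;

	/* let the driver (e.g. md_set_read_only) accept or reject */
	if (bdev->bd_disk->fops->set_read_only) {
		ret = bdev->bd_disk->fops->set_read_only(bdev, n);
		if (ret)
			return ret;
	}
	bdev->bd_read_only = n;
	return 0;
}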
diff --combined drivers/md/raid0.c
@@@ -477,7 -477,6 +477,7 @@@ static void raid0_handle_discard(struc
  
        for (disk = 0; disk < zone->nb_dev; disk++) {
                sector_t dev_start, dev_end;
 +              struct bio *discard_bio = NULL;
                struct md_rdev *rdev;
  
                if (disk < start_disk_index)
  
                rdev = conf->devlist[(zone - conf->strip_zone) *
                        conf->strip_zone[0].nb_dev + disk];
 -              md_submit_discard_bio(mddev, rdev, bio,
 +              if (__blkdev_issue_discard(rdev->bdev,
                        dev_start + zone->dev_start + rdev->data_offset,
 -                      dev_end - dev_start);
 +                      dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
 +                  !discard_bio)
 +                      continue;
 +              bio_chain(discard_bio, bio);
 +              bio_clone_blkg_association(discard_bio, bio);
 +              if (mddev->gendisk)
-                       trace_block_bio_remap(bdev_get_queue(rdev->bdev),
-                               discard_bio, disk_devt(mddev->gendisk),
++                      trace_block_bio_remap(discard_bio,
++                              disk_devt(mddev->gendisk),
 +                              bio->bi_iter.bi_sector);
 +              submit_bio_noacct(discard_bio);
        }
        bio_endio(bio);
  }
@@@ -581,8 -571,8 +581,8 @@@ static bool raid0_make_request(struct m
                tmp_dev->data_offset;
  
        if (mddev->gendisk)
-               trace_block_bio_remap(bio->bi_disk->queue, bio,
-                               disk_devt(mddev->gendisk), bio_sector);
+               trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
+                                     bio_sector);
        mddev_check_writesame(mddev, bio);
        mddev_check_write_zeroes(mddev, bio);
        submit_bio_noacct(bio);
diff --combined drivers/md/raid10.c
@@@ -91,7 -91,7 +91,7 @@@ static inline struct r10bio *get_resync
  static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
  {
        struct r10conf *conf = data;
 -      int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);
 +      int size = offsetof(struct r10bio, devs[conf->copies]);
  
        /* allocate a r10bio with room for raid_disks entries in the
         * bios array */
@@@ -238,7 -238,7 +238,7 @@@ static void put_all_bios(struct r10con
  {
        int i;
  
 -      for (i = 0; i < conf->geo.raid_disks; i++) {
 +      for (i = 0; i < conf->copies; i++) {
                struct bio **bio = & r10_bio->devs[i].bio;
                if (!BIO_SPECIAL(*bio))
                        bio_put(*bio);
@@@ -327,7 -327,7 +327,7 @@@ static int find_bio_disk(struct r10con
        int slot;
        int repl = 0;
  
 -      for (slot = 0; slot < conf->geo.raid_disks; slot++) {
 +      for (slot = 0; slot < conf->copies; slot++) {
                if (r10_bio->devs[slot].bio == bio)
                        break;
                if (r10_bio->devs[slot].repl_bio == bio) {
                }
        }
  
 +      BUG_ON(slot == conf->copies);
        update_head_pos(slot, r10_bio);
  
        if (slotp)
@@@ -1201,8 -1200,7 +1201,7 @@@ static void raid10_read_request(struct 
        read_bio->bi_private = r10_bio;
  
        if (mddev->gendisk)
-               trace_block_bio_remap(read_bio->bi_disk->queue,
-                                     read_bio, disk_devt(mddev->gendisk),
+               trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
                                      r10_bio->sector);
        submit_bio_noacct(read_bio);
        return;
@@@ -1251,8 -1249,7 +1250,7 @@@ static void raid10_write_one_disk(struc
        mbio->bi_private = r10_bio;
  
        if (conf->mddev->gendisk)
-               trace_block_bio_remap(mbio->bi_disk->queue,
-                                     mbio, disk_devt(conf->mddev->gendisk),
+               trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
                                      r10_bio->sector);
        /* flush_pending_writes() needs access to the rdev so...*/
        mbio->bi_disk = (void *)rdev;
        }
  }
  
 -static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
 -{
 -      int i;
 -      struct r10conf *conf = mddev->private;
 -      struct md_rdev *blocked_rdev;
 -
 -retry_wait:
 -      blocked_rdev = NULL;
 -      rcu_read_lock();
 -      for (i = 0; i < conf->copies; i++) {
 -              struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
 -              struct md_rdev *rrdev = rcu_dereference(
 -                      conf->mirrors[i].replacement);
 -              if (rdev == rrdev)
 -                      rrdev = NULL;
 -              if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 -                      atomic_inc(&rdev->nr_pending);
 -                      blocked_rdev = rdev;
 -                      break;
 -              }
 -              if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
 -                      atomic_inc(&rrdev->nr_pending);
 -                      blocked_rdev = rrdev;
 -                      break;
 -              }
 -
 -              if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
 -                      sector_t first_bad;
 -                      sector_t dev_sector = r10_bio->devs[i].addr;
 -                      int bad_sectors;
 -                      int is_bad;
 -
 -                      /* Discard request doesn't care the write result
 -                       * so it doesn't need to wait blocked disk here.
 -                       */
 -                      if (!r10_bio->sectors)
 -                              continue;
 -
 -                      is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
 -                                           &first_bad, &bad_sectors);
 -                      if (is_bad < 0) {
 -                              /* Mustn't write here until the bad block
 -                               * is acknowledged
 -                               */
 -                              atomic_inc(&rdev->nr_pending);
 -                              set_bit(BlockedBadBlocks, &rdev->flags);
 -                              blocked_rdev = rdev;
 -                              break;
 -                      }
 -              }
 -      }
 -      rcu_read_unlock();
 -
 -      if (unlikely(blocked_rdev)) {
 -              /* Have to wait for this device to get unblocked, then retry */
 -              allow_barrier(conf);
 -              raid10_log(conf->mddev, "%s wait rdev %d blocked",
 -                              __func__, blocked_rdev->raid_disk);
 -              md_wait_for_blocked_rdev(blocked_rdev, mddev);
 -              wait_barrier(conf);
 -              goto retry_wait;
 -      }
 -}
 -
  static void raid10_write_request(struct mddev *mddev, struct bio *bio,
                                 struct r10bio *r10_bio)
  {
        struct r10conf *conf = mddev->private;
        int i;
 +      struct md_rdev *blocked_rdev;
        sector_t sectors;
        int max_sectors;
  
  
        r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
        raid10_find_phys(conf, r10_bio);
 -
 -      wait_blocked_dev(mddev, r10_bio);
 -
 +retry_write:
 +      blocked_rdev = NULL;
        rcu_read_lock();
        max_sectors = r10_bio->sectors;
  
                        conf->mirrors[d].replacement);
                if (rdev == rrdev)
                        rrdev = NULL;
 +              if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 +                      atomic_inc(&rdev->nr_pending);
 +                      blocked_rdev = rdev;
 +                      break;
 +              }
 +              if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
 +                      atomic_inc(&rrdev->nr_pending);
 +                      blocked_rdev = rrdev;
 +                      break;
 +              }
                if (rdev && (test_bit(Faulty, &rdev->flags)))
                        rdev = NULL;
                if (rrdev && (test_bit(Faulty, &rrdev->flags)))
  
                        is_bad = is_badblock(rdev, dev_sector, max_sectors,
                                             &first_bad, &bad_sectors);
 +                      if (is_bad < 0) {
 +                              /* Mustn't write here until the bad block
 +                               * is acknowledged
 +                               */
 +                              atomic_inc(&rdev->nr_pending);
 +                              set_bit(BlockedBadBlocks, &rdev->flags);
 +                              blocked_rdev = rdev;
 +                              break;
 +                      }
                        if (is_bad && first_bad <= dev_sector) {
                                /* Cannot write here at all */
                                bad_sectors -= (dev_sector - first_bad);
        }
        rcu_read_unlock();
  
 +      if (unlikely(blocked_rdev)) {
 +              /* Have to wait for this device to get unblocked, then retry */
 +              int j;
 +              int d;
 +
 +              for (j = 0; j < i; j++) {
 +                      if (r10_bio->devs[j].bio) {
 +                              d = r10_bio->devs[j].devnum;
 +                              rdev_dec_pending(conf->mirrors[d].rdev, mddev);
 +                      }
 +                      if (r10_bio->devs[j].repl_bio) {
 +                              struct md_rdev *rdev;
 +                              d = r10_bio->devs[j].devnum;
 +                              rdev = conf->mirrors[d].replacement;
 +                              if (!rdev) {
 +                                      /* Race with remove_disk */
 +                                      smp_mb();
 +                                      rdev = conf->mirrors[d].rdev;
 +                              }
 +                              rdev_dec_pending(rdev, mddev);
 +                      }
 +              }
 +              allow_barrier(conf);
 +              raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
 +              md_wait_for_blocked_rdev(blocked_rdev, mddev);
 +              wait_barrier(conf);
 +              goto retry_write;
 +      }
 +
        if (max_sectors < r10_bio->sectors)
                r10_bio->sectors = max_sectors;
  
@@@ -1493,7 -1506,7 +1491,7 @@@ static void __make_request(struct mdde
        r10_bio->mddev = mddev;
        r10_bio->sector = bio->bi_iter.bi_sector;
        r10_bio->state = 0;
 -      memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks);
 +      memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
  
        if (bio_data_dir(bio) == READ)
                raid10_read_request(mddev, bio, r10_bio);
                raid10_write_request(mddev, bio, r10_bio);
  }
  
 -static struct bio *raid10_split_bio(struct r10conf *conf,
 -                      struct bio *bio, sector_t sectors, bool want_first)
 -{
 -      struct bio *split;
 -
 -      split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split);
 -      bio_chain(split, bio);
 -      allow_barrier(conf);
 -      if (want_first) {
 -              submit_bio_noacct(bio);
 -              bio = split;
 -      } else
 -              submit_bio_noacct(split);
 -      wait_barrier(conf);
 -
 -      return bio;
 -}
 -
 -static void raid_end_discard_bio(struct r10bio *r10bio)
 -{
 -      struct r10conf *conf = r10bio->mddev->private;
 -      struct r10bio *first_r10bio;
 -
 -      while (atomic_dec_and_test(&r10bio->remaining)) {
 -
 -              allow_barrier(conf);
 -
 -              if (!test_bit(R10BIO_Discard, &r10bio->state)) {
 -                      first_r10bio = (struct r10bio *)r10bio->master_bio;
 -                      free_r10bio(r10bio);
 -                      r10bio = first_r10bio;
 -              } else {
 -                      md_write_end(r10bio->mddev);
 -                      bio_endio(r10bio->master_bio);
 -                      free_r10bio(r10bio);
 -                      break;
 -              }
 -      }
 -}
 -
 -static void raid10_end_discard_request(struct bio *bio)
 -{
 -      struct r10bio *r10_bio = bio->bi_private;
 -      struct r10conf *conf = r10_bio->mddev->private;
 -      struct md_rdev *rdev = NULL;
 -      int dev;
 -      int slot, repl;
 -
 -      /*
 -       * We don't care the return value of discard bio
 -       */
 -      if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
 -              set_bit(R10BIO_Uptodate, &r10_bio->state);
 -
 -      dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 -      if (repl)
 -              rdev = conf->mirrors[dev].replacement;
 -      if (!rdev) {
 -              /* raid10_remove_disk uses smp_mb to make sure rdev is set to
 -               * replacement before setting replacement to NULL. It can read
 -               * rdev first without barrier protect even replacment is NULL
 -               */
 -              smp_rmb();
 -              rdev = conf->mirrors[dev].rdev;
 -      }
 -
 -      raid_end_discard_bio(r10_bio);
 -      rdev_dec_pending(rdev, conf->mddev);
 -}
 -
 -/* There are some limitations to handle discard bio
 - * 1st, the discard size is bigger than stripe_size*2.
 - * 2st, if the discard bio spans reshape progress, we use the old way to
 - * handle discard bio
 - */
 -static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 -{
 -      struct r10conf *conf = mddev->private;
 -      struct geom *geo = &conf->geo;
 -      struct r10bio *r10_bio, *first_r10bio;
 -      int far_copies = geo->far_copies;
 -      bool first_copy = true;
 -
 -      int disk;
 -      sector_t chunk;
 -      unsigned int stripe_size;
 -      sector_t split_size;
 -
 -      sector_t bio_start, bio_end;
 -      sector_t first_stripe_index, last_stripe_index;
 -      sector_t start_disk_offset;
 -      unsigned int start_disk_index;
 -      sector_t end_disk_offset;
 -      unsigned int end_disk_index;
 -      unsigned int remainder;
 -
 -      if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 -              return -EAGAIN;
 -
 -      wait_barrier(conf);
 -
 -      /* Check reshape again to avoid reshape happens after checking
 -       * MD_RECOVERY_RESHAPE and before wait_barrier
 -       */
 -      if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 -              goto out;
 -
 -      stripe_size = geo->raid_disks << geo->chunk_shift;
 -      bio_start = bio->bi_iter.bi_sector;
 -      bio_end = bio_end_sector(bio);
 -
 -      /* Maybe one discard bio is smaller than strip size or across one stripe
 -       * and discard region is larger than one stripe size. For far offset layout,
 -       * if the discard region is not aligned with stripe size, there is hole
 -       * when we submit discard bio to member disk. For simplicity, we only
 -       * handle discard bio which discard region is bigger than stripe_size*2
 -       */
 -      if (bio_sectors(bio) < stripe_size*2)
 -              goto out;
 -
 -      /* For far and far offset layout, if bio is not aligned with stripe size,
 -       * it splits the part that is not aligned with strip size.
 -       */
 -      div_u64_rem(bio_start, stripe_size, &remainder);
 -      if ((far_copies > 1) && remainder) {
 -              split_size = stripe_size - remainder;
 -              bio = raid10_split_bio(conf, bio, split_size, false);
 -      }
 -      div_u64_rem(bio_end, stripe_size, &remainder);
 -      if ((far_copies > 1) && remainder) {
 -              split_size = bio_sectors(bio) - remainder;
 -              bio = raid10_split_bio(conf, bio, split_size, true);
 -      }
 -
 -      bio_start = bio->bi_iter.bi_sector;
 -      bio_end = bio_end_sector(bio);
 -
 -      /* raid10 uses chunk as the unit to store data. It's similar like raid0.
 -       * One stripe contains the chunks from all member disk (one chunk from
 -       * one disk at the same HBA address). For layout detail, see 'man md 4'
 -       */
 -      chunk = bio_start >> geo->chunk_shift;
 -      chunk *= geo->near_copies;
 -      first_stripe_index = chunk;
 -      start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
 -      if (geo->far_offset)
 -              first_stripe_index *= geo->far_copies;
 -      start_disk_offset = (bio_start & geo->chunk_mask) +
 -                              (first_stripe_index << geo->chunk_shift);
 -
 -      chunk = bio_end >> geo->chunk_shift;
 -      chunk *= geo->near_copies;
 -      last_stripe_index = chunk;
 -      end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
 -      if (geo->far_offset)
 -              last_stripe_index *= geo->far_copies;
 -      end_disk_offset = (bio_end & geo->chunk_mask) +
 -                              (last_stripe_index << geo->chunk_shift);
 -
 -retry_discard:
 -      r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
 -      r10_bio->mddev = mddev;
 -      r10_bio->state = 0;
 -      r10_bio->sectors = 0;
 -      memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
 -      wait_blocked_dev(mddev, r10_bio);
 -
 -      /* For far layout it needs more than one r10bio to cover all regions.
 -       * Inspired by raid10_sync_request, we can use the first r10bio->master_bio
 -       * to record the discard bio. Other r10bio->master_bio record the first
 -       * r10bio. The first r10bio only release after all other r10bios finish.
 -       * The discard bio returns only first r10bio finishes
 -       */
 -      if (first_copy) {
 -              r10_bio->master_bio = bio;
 -              set_bit(R10BIO_Discard, &r10_bio->state);
 -              first_copy = false;
 -              first_r10bio = r10_bio;
 -      } else
 -              r10_bio->master_bio = (struct bio *)first_r10bio;
 -
 -      rcu_read_lock();
 -      for (disk = 0; disk < geo->raid_disks; disk++) {
 -              struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
 -              struct md_rdev *rrdev = rcu_dereference(
 -                      conf->mirrors[disk].replacement);
 -
 -              r10_bio->devs[disk].bio = NULL;
 -              r10_bio->devs[disk].repl_bio = NULL;
 -
 -              if (rdev && (test_bit(Faulty, &rdev->flags)))
 -                      rdev = NULL;
 -              if (rrdev && (test_bit(Faulty, &rrdev->flags)))
 -                      rrdev = NULL;
 -              if (!rdev && !rrdev)
 -                      continue;
 -
 -              if (rdev) {
 -                      r10_bio->devs[disk].bio = bio;
 -                      atomic_inc(&rdev->nr_pending);
 -              }
 -              if (rrdev) {
 -                      r10_bio->devs[disk].repl_bio = bio;
 -                      atomic_inc(&rrdev->nr_pending);
 -              }
 -      }
 -      rcu_read_unlock();
 -
 -      atomic_set(&r10_bio->remaining, 1);
 -      for (disk = 0; disk < geo->raid_disks; disk++) {
 -              sector_t dev_start, dev_end;
 -              struct bio *mbio, *rbio = NULL;
 -              struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
 -              struct md_rdev *rrdev = rcu_dereference(
 -                      conf->mirrors[disk].replacement);
 -
 -              /*
 -               * Now start to calculate the start and end address for each disk.
 -               * The space between dev_start and dev_end is the discard region.
 -               *
 -               * For dev_start, it needs to consider three conditions:
 -               * 1st, the disk is before start_disk, you can imagine the disk in
 -               * the next stripe. So the dev_start is the start address of next
 -               * stripe.
 -               * 2st, the disk is after start_disk, it means the disk is at the
 -               * same stripe of first disk
 -               * 3st, the first disk itself, we can use start_disk_offset directly
 -               */
 -              if (disk < start_disk_index)
 -                      dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
 -              else if (disk > start_disk_index)
 -                      dev_start = first_stripe_index * mddev->chunk_sectors;
 -              else
 -                      dev_start = start_disk_offset;
 -
 -              if (disk < end_disk_index)
 -                      dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
 -              else if (disk > end_disk_index)
 -                      dev_end = last_stripe_index * mddev->chunk_sectors;
 -              else
 -                      dev_end = end_disk_offset;
 -
 -              /* It only handles discard bio which size is >= stripe size, so
 -               * dev_end > dev_start all the time
 -               */
 -              if (r10_bio->devs[disk].bio) {
 -                      mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
 -                      mbio->bi_end_io = raid10_end_discard_request;
 -                      mbio->bi_private = r10_bio;
 -                      r10_bio->devs[disk].bio = mbio;
 -                      r10_bio->devs[disk].devnum = disk;
 -                      atomic_inc(&r10_bio->remaining);
 -                      md_submit_discard_bio(mddev, rdev, mbio,
 -                                      dev_start + choose_data_offset(r10_bio, rdev),
 -                                      dev_end - dev_start);
 -                      bio_endio(mbio);
 -              }
 -              if (r10_bio->devs[disk].repl_bio) {
 -                      rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
 -                      rbio->bi_end_io = raid10_end_discard_request;
 -                      rbio->bi_private = r10_bio;
 -                      r10_bio->devs[disk].repl_bio = rbio;
 -                      r10_bio->devs[disk].devnum = disk;
 -                      atomic_inc(&r10_bio->remaining);
 -                      md_submit_discard_bio(mddev, rrdev, rbio,
 -                                      dev_start + choose_data_offset(r10_bio, rrdev),
 -                                      dev_end - dev_start);
 -                      bio_endio(rbio);
 -              }
 -      }
 -
 -      if (!geo->far_offset && --far_copies) {
 -              first_stripe_index += geo->stride >> geo->chunk_shift;
 -              start_disk_offset += geo->stride;
 -              last_stripe_index += geo->stride >> geo->chunk_shift;
 -              end_disk_offset += geo->stride;
 -              atomic_inc(&first_r10bio->remaining);
 -              raid_end_discard_bio(r10_bio);
 -              wait_barrier(conf);
 -              goto retry_discard;
 -      }
 -
 -      raid_end_discard_bio(r10_bio);
 -
 -      return 0;
 -out:
 -      allow_barrier(conf);
 -      return -EAGAIN;
 -}
 -
  static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
  {
        struct r10conf *conf = mddev->private;
        if (!md_write_start(mddev, bio))
                return false;
  
 -      if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
 -              if (!raid10_handle_discard(mddev, bio))
 -                      return true;
 -
        /*
         * If this request crosses a chunk boundary, we need to split
         * it.
@@@ -3754,7 -4061,7 +3752,7 @@@ static int raid10_run(struct mddev *mdd
  
        if (mddev->queue) {
                blk_queue_max_discard_sectors(mddev->queue,
 -                                            UINT_MAX);
 +                                            mddev->chunk_sectors);
                blk_queue_max_write_same_sectors(mddev->queue, 0);
                blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
                blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
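
A note on the raid10 hunks above: with the discard rework reverted, the queue's maximum discard size is again capped at mddev->chunk_sectors, and raid10_make_request splits any request that crosses a chunk boundary (see the "crosses a chunk boundary" comment above). The standalone sketch below only illustrates that split arithmetic for a power-of-two chunk size; first_split_sectors() and the sample numbers are illustrative, not kernel code.

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch of the chunk-boundary split: a bio starting at 'sector' with
 * 'nr_sectors' sectors may only run up to the next chunk boundary, so the
 * first piece is at most chunk_sectors - (sector % chunk_sectors).
 * Assumes chunk_sectors is a power of two.
 */
static uint64_t first_split_sectors(uint64_t sector, uint64_t nr_sectors,
                                    uint64_t chunk_sectors)
{
	uint64_t to_boundary = chunk_sectors - (sector & (chunk_sectors - 1));

	return nr_sectors < to_boundary ? nr_sectors : to_boundary;
}

int main(void)
{
	/* 512 KiB chunks = 1024 sectors; a 3000-sector discard at sector 500 */
	uint64_t chunk = 1024, sector = 500, left = 3000;

	while (left) {
		uint64_t n = first_split_sectors(sector, left, chunk);

		printf("bio: sector %llu, %llu sectors\n",
		       (unsigned long long)sector, (unsigned long long)n);
		sector += n;
		left -= n;
	}
	return 0;
}
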
diff --combined drivers/nvme/host/core.c
@@@ -93,16 -93,6 +93,6 @@@ static void nvme_put_subsystem(struct n
  static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
                                           unsigned nsid);
  
- static void nvme_update_bdev_size(struct gendisk *disk)
- {
-       struct block_device *bdev = bdget_disk(disk, 0);
-       if (bdev) {
-               bd_set_nr_sectors(bdev, get_capacity(disk));
-               bdput(bdev);
-       }
- }
  /*
   * Prepare a queue for teardown.
   *
@@@ -119,8 -109,7 +109,7 @@@ static void nvme_set_queue_dying(struc
        blk_set_queue_dying(ns->queue);
        blk_mq_unquiesce_queue(ns->queue);
  
-       set_capacity(ns->disk, 0);
-       nvme_update_bdev_size(ns->disk);
+       set_capacity_and_notify(ns->disk, 0);
  }
  
  static void nvme_queue_scan(struct nvme_ctrl *ctrl)
@@@ -2053,7 -2042,7 +2042,7 @@@ static void nvme_update_disk_info(struc
                        capacity = 0;
        }
  
-       set_capacity_revalidate_and_notify(disk, capacity, false);
+       set_capacity_and_notify(disk, capacity);
  
        nvme_config_discard(disk, ns);
        nvme_config_write_zeroes(disk, ns);
@@@ -2134,7 -2123,6 +2123,6 @@@ static int nvme_update_ns_info(struct n
                blk_stack_limits(&ns->head->disk->queue->limits,
                                 &ns->queue->limits, 0);
                blk_queue_update_readahead(ns->head->disk->queue);
-               nvme_update_bdev_size(ns->head->disk);
                blk_mq_unfreeze_queue(ns->head->disk->queue);
        }
  #endif
@@@ -2929,7 -2917,7 +2917,7 @@@ int nvme_get_log(struct nvme_ctrl *ctrl
  static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
                                struct nvme_effects_log **log)
  {
 -      struct nvme_cel *cel = xa_load(&ctrl->cels, csi);
 +      struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
        int ret;
  
        if (cel)
                return -ENOMEM;
  
        ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
 -                      &cel->log, sizeof(cel->log), 0);
 +                      cel, sizeof(*cel), 0);
        if (ret) {
                kfree(cel);
                return ret;
        }
  
 -      cel->csi = csi;
 -      xa_store(&ctrl->cels, cel->csi, cel, GFP_KERNEL);
 +      xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
  out:
 -      *log = &cel->log;
 +      *log = cel;
        return 0;
  }
  
@@@ -3962,8 -3951,6 +3950,6 @@@ out
         */
        if (ret && ret != -ENOMEM && !(ret > 0 && !(ret & NVME_SC_DNR)))
                nvme_ns_remove(ns);
-       else
-               revalidate_disk_size(ns->disk, true);
  }
  
  static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
@@@ -4373,19 -4360,6 +4359,19 @@@ void nvme_uninit_ctrl(struct nvme_ctrl 
  }
  EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
  
 +static void nvme_free_cels(struct nvme_ctrl *ctrl)
 +{
 +      struct nvme_effects_log *cel;
 +      unsigned long i;
 +
 +      xa_for_each (&ctrl->cels, i, cel) {
 +              xa_erase(&ctrl->cels, i);
 +              kfree(cel);
 +      }
 +
 +      xa_destroy(&ctrl->cels);
 +}
 +
  static void nvme_free_ctrl(struct device *dev)
  {
        struct nvme_ctrl *ctrl =
        if (!subsys || ctrl->instance != subsys->instance)
                ida_simple_remove(&nvme_instance_ida, ctrl->instance);
  
 -      xa_destroy(&ctrl->cels);
 -
 +      nvme_free_cels(ctrl);
        nvme_mpath_uninit(ctrl);
        __free_page(ctrl->discard_page);
  
diff --combined drivers/s390/block/dasd.c
@@@ -75,6 -75,7 +75,6 @@@ static int dasd_flush_block_queue(struc
  static void dasd_device_tasklet(unsigned long);
  static void dasd_block_tasklet(unsigned long);
  static void do_kick_device(struct work_struct *);
 -static void do_restore_device(struct work_struct *);
  static void do_reload_device(struct work_struct *);
  static void do_requeue_requests(struct work_struct *);
  static void dasd_return_cqr_cb(struct dasd_ccw_req *, void *);
@@@ -137,6 -138,7 +137,6 @@@ struct dasd_device *dasd_alloc_device(v
        INIT_LIST_HEAD(&device->ccw_queue);
        timer_setup(&device->timer, dasd_device_timeout, 0);
        INIT_WORK(&device->kick_work, do_kick_device);
 -      INIT_WORK(&device->restore_device, do_restore_device);
        INIT_WORK(&device->reload_device, do_reload_device);
        INIT_WORK(&device->requeue_requests, do_requeue_requests);
        device->state = DASD_STATE_NEW;
@@@ -430,7 -432,7 +430,7 @@@ dasd_state_ready_to_online(struct dasd_
  {
        struct gendisk *disk;
        struct disk_part_iter piter;
-       struct hd_struct *part;
+       struct block_device *part;
  
        device->state = DASD_STATE_ONLINE;
        if (device->block) {
                disk = device->block->bdev->bd_disk;
                disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
                while ((part = disk_part_iter_next(&piter)))
-                       kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE);
+                       kobject_uevent(bdev_kobj(part), KOBJ_CHANGE);
                disk_part_iter_exit(&piter);
        }
        return 0;
@@@ -457,7 -459,7 +457,7 @@@ static int dasd_state_online_to_ready(s
        int rc;
        struct gendisk *disk;
        struct disk_part_iter piter;
-       struct hd_struct *part;
+       struct block_device *part;
  
        if (device->discipline->online_to_ready) {
                rc = device->discipline->online_to_ready(device);
                disk = device->block->bdev->bd_disk;
                disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
                while ((part = disk_part_iter_next(&piter)))
-                       kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE);
+                       kobject_uevent(bdev_kobj(part), KOBJ_CHANGE);
                disk_part_iter_exit(&piter);
        }
        return 0;
@@@ -619,6 -621,26 +619,6 @@@ void dasd_reload_device(struct dasd_dev
  EXPORT_SYMBOL(dasd_reload_device);
  
  /*
 - * dasd_restore_device will schedule a call do do_restore_device to the kernel
 - * event daemon.
 - */
 -static void do_restore_device(struct work_struct *work)
 -{
 -      struct dasd_device *device = container_of(work, struct dasd_device,
 -                                                restore_device);
 -      device->cdev->drv->restore(device->cdev);
 -      dasd_put_device(device);
 -}
 -
 -void dasd_restore_device(struct dasd_device *device)
 -{
 -      dasd_get_device(device);
 -      /* queue call to dasd_restore_device to the kernel event daemon. */
 -      if (!schedule_work(&device->restore_device))
 -              dasd_put_device(device);
 -}
 -
 -/*
   * Set the target state for a device and starts the state change.
   */
  void dasd_set_target_state(struct dasd_device *device, int target)
@@@ -1492,6 -1514,7 +1492,6 @@@ int dasd_start_IO(struct dasd_ccw_req *
                              "start_IO: -EIO device gone, retry");
                break;
        case -EINVAL:
 -              /* most likely caused in power management context */
                DBF_DEV_EVENT(DBF_WARNING, device, "%s",
                              "start_IO: -EINVAL device currently "
                              "not accessible");
@@@ -2025,7 -2048,7 +2025,7 @@@ static void __dasd_device_check_expire(
  static int __dasd_device_is_unusable(struct dasd_device *device,
                                     struct dasd_ccw_req *cqr)
  {
 -      int mask = ~(DASD_STOPPED_DC_WAIT | DASD_UNRESUMED_PM | DASD_STOPPED_NOSPC);
 +      int mask = ~(DASD_STOPPED_DC_WAIT | DASD_STOPPED_NOSPC);
  
        if (test_bit(DASD_FLAG_OFFLINE, &device->flags) &&
            !test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) {
@@@ -2089,7 -2112,8 +2089,7 @@@ static void __dasd_device_check_path_ev
        if (!dasd_path_get_tbvpm(device))
                return;
  
 -      if (device->stopped &
 -          ~(DASD_STOPPED_DC_WAIT | DASD_UNRESUMED_PM))
 +      if (device->stopped & ~(DASD_STOPPED_DC_WAIT))
                return;
        rc = device->discipline->verify_path(device,
                                             dasd_path_get_tbvpm(device));
@@@ -2956,12 -2980,6 +2956,12 @@@ static int _dasd_requeue_request(struc
  
        if (!block)
                return -EINVAL;
 +      /*
 +       * If the request is an ERP request there is nothing to requeue.
 +       * This will be done with the remaining original request.
 +       */
 +      if (cqr->refers)
 +              return 0;
        spin_lock_irq(&cqr->dq->lock);
        req = (struct request *) cqr->callback_data;
        blk_mq_requeue_request(req, false);
@@@ -3376,6 -3394,7 +3376,7 @@@ dasd_device_operations = 
        .ioctl          = dasd_ioctl,
        .compat_ioctl   = dasd_ioctl,
        .getgeo         = dasd_getgeo,
+       .set_read_only  = dasd_set_read_only,
  };
  
  /*******************************************************************************
@@@ -3770,6 -3789,11 +3771,6 @@@ int dasd_generic_path_operational(struc
                 "operational\n");
        DBF_DEV_EVENT(DBF_WARNING, device, "%s", "path operational");
        dasd_device_remove_stop_bits(device, DASD_STOPPED_DC_WAIT);
 -      if (device->stopped & DASD_UNRESUMED_PM) {
 -              dasd_device_remove_stop_bits(device, DASD_UNRESUMED_PM);
 -              dasd_restore_device(device);
 -              return 1;
 -      }
        dasd_schedule_device_bh(device);
        if (device->block) {
                dasd_schedule_block_bh(device->block);
@@@ -4029,6 -4053,66 +4030,6 @@@ void dasd_schedule_requeue(struct dasd_
  }
  EXPORT_SYMBOL(dasd_schedule_requeue);
  
 -int dasd_generic_pm_freeze(struct ccw_device *cdev)
 -{
 -      struct dasd_device *device = dasd_device_from_cdev(cdev);
 -
 -      if (IS_ERR(device))
 -              return PTR_ERR(device);
 -
 -      /* mark device as suspended */
 -      set_bit(DASD_FLAG_SUSPENDED, &device->flags);
 -
 -      if (device->discipline->freeze)
 -              device->discipline->freeze(device);
 -
 -      /* disallow new I/O  */
 -      dasd_device_set_stop_bits(device, DASD_STOPPED_PM);
 -
 -      return dasd_generic_requeue_all_requests(device);
 -}
 -EXPORT_SYMBOL_GPL(dasd_generic_pm_freeze);
 -
 -int dasd_generic_restore_device(struct ccw_device *cdev)
 -{
 -      struct dasd_device *device = dasd_device_from_cdev(cdev);
 -      int rc = 0;
 -
 -      if (IS_ERR(device))
 -              return PTR_ERR(device);
 -
 -      /* allow new IO again */
 -      dasd_device_remove_stop_bits(device,
 -                                   (DASD_STOPPED_PM | DASD_UNRESUMED_PM));
 -
 -      dasd_schedule_device_bh(device);
 -
 -      /*
 -       * call discipline restore function
 -       * if device is stopped do nothing e.g. for disconnected devices
 -       */
 -      if (device->discipline->restore && !(device->stopped))
 -              rc = device->discipline->restore(device);
 -      if (rc || device->stopped)
 -              /*
 -               * if the resume failed for the DASD we put it in
 -               * an UNRESUMED stop state
 -               */
 -              device->stopped |= DASD_UNRESUMED_PM;
 -
 -      if (device->block) {
 -              dasd_schedule_block_bh(device->block);
 -              if (device->block->request_queue)
 -                      blk_mq_run_hw_queues(device->block->request_queue,
 -                                           true);
 -      }
 -
 -      clear_bit(DASD_FLAG_SUSPENDED, &device->flags);
 -      dasd_put_device(device);
 -      return 0;
 -}
 -EXPORT_SYMBOL_GPL(dasd_generic_restore_device);
 -
  static struct dasd_ccw_req *dasd_generic_build_rdc(struct dasd_device *device,
                                                   int rdc_buffer_size,
                                                   int magic)
diff --combined drivers/s390/block/dasd_int.h
@@@ -355,6 -355,10 +355,6 @@@ struct dasd_discipline 
        int (*fill_info) (struct dasd_device *, struct dasd_information2_t *);
        int (*ioctl) (struct dasd_block *, unsigned int, void __user *);
  
 -      /* suspend/resume functions */
 -      int (*freeze) (struct dasd_device *);
 -      int (*restore) (struct dasd_device *);
 -
        /* reload device after state change */
        int (*reload) (struct dasd_device *);
  
@@@ -516,6 -520,7 +516,6 @@@ struct dasd_device 
        atomic_t tasklet_scheduled;
          struct tasklet_struct tasklet;
        struct work_struct kick_work;
 -      struct work_struct restore_device;
        struct work_struct reload_device;
        struct work_struct kick_validate;
        struct work_struct suc_work;
@@@ -587,6 -592,8 +587,6 @@@ struct dasd_queue 
  #define DASD_STOPPED_PENDING 4         /* long busy */
  #define DASD_STOPPED_DC_WAIT 8         /* disconnected, wait */
  #define DASD_STOPPED_SU      16        /* summary unit check handling */
 -#define DASD_STOPPED_PM      32        /* pm state transition */
 -#define DASD_UNRESUMED_PM    64        /* pm resume failed state */
  #define DASD_STOPPED_NOSPC   128       /* no space left */
  
  /* per device flags */
@@@ -746,6 -753,7 +746,6 @@@ enum blk_eh_timer_return dasd_times_out
  void dasd_enable_device(struct dasd_device *);
  void dasd_set_target_state(struct dasd_device *, int);
  void dasd_kick_device(struct dasd_device *);
 -void dasd_restore_device(struct dasd_device *);
  void dasd_reload_device(struct dasd_device *);
  void dasd_schedule_requeue(struct dasd_device *);
  
@@@ -777,6 -785,8 +777,6 @@@ int dasd_generic_path_operational(struc
  void dasd_generic_shutdown(struct ccw_device *);
  
  void dasd_generic_handle_state_change(struct dasd_device *);
 -int dasd_generic_pm_freeze(struct ccw_device *);
 -int dasd_generic_restore_device(struct ccw_device *);
  enum uc_todo dasd_generic_uc_handler(struct ccw_device *, struct irb *);
  void dasd_generic_path_event(struct ccw_device *, int *);
  int dasd_generic_verify_path(struct dasd_device *, __u8);
@@@ -834,7 -844,8 +834,8 @@@ int dasd_scan_partitions(struct dasd_bl
  void dasd_destroy_partitions(struct dasd_block *);
  
  /* externals in dasd_ioctl.c */
- int  dasd_ioctl(struct block_device *, fmode_t, unsigned int, unsigned long);
+ int dasd_ioctl(struct block_device *, fmode_t, unsigned int, unsigned long);
+ int dasd_set_read_only(struct block_device *bdev, bool ro);
  
  /* externals in dasd_proc.c */
  int dasd_proc_init(void);
diff --combined fs/btrfs/sysfs.c
@@@ -263,10 -263,6 +263,10 @@@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_H
  BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
  BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
  BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
 +/* Remove once support for zoned allocation is feature complete */
 +#ifdef CONFIG_BTRFS_DEBUG
 +BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
 +#endif
  
  static struct attribute *btrfs_supported_feature_attrs[] = {
        BTRFS_FEAT_ATTR_PTR(mixed_backref),
        BTRFS_FEAT_ATTR_PTR(metadata_uuid),
        BTRFS_FEAT_ATTR_PTR(free_space_tree),
        BTRFS_FEAT_ATTR_PTR(raid1c34),
 +#ifdef CONFIG_BTRFS_DEBUG
 +      BTRFS_FEAT_ATTR_PTR(zoned),
 +#endif
        NULL
  };
  
@@@ -336,35 -329,10 +336,35 @@@ static ssize_t send_stream_version_show
  }
  BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
  
 +static const char *rescue_opts[] = {
 +      "usebackuproot",
 +      "nologreplay",
 +      "ignorebadroots",
 +      "ignoredatacsums",
 +      "all",
 +};
 +
 +static ssize_t supported_rescue_options_show(struct kobject *kobj,
 +                                           struct kobj_attribute *a,
 +                                           char *buf)
 +{
 +      ssize_t ret = 0;
 +      int i;
 +
 +      for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
 +              ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
 +                               (i ? " " : ""), rescue_opts[i]);
 +      ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
 +      return ret;
 +}
 +BTRFS_ATTR(static_feature, supported_rescue_options,
 +         supported_rescue_options_show);
 +
  static struct attribute *btrfs_supported_static_feature_attrs[] = {
        BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
        BTRFS_ATTR_PTR(static_feature, supported_checksums),
        BTRFS_ATTR_PTR(static_feature, send_stream_version),
 +      BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
        NULL
  };
  
@@@ -465,8 -433,7 +465,8 @@@ static ssize_t btrfs_discard_iops_limit
                return -EINVAL;
  
        WRITE_ONCE(discard_ctl->iops_limit, iops_limit);
 -
 +      btrfs_discard_calc_delay(discard_ctl);
 +      btrfs_discard_schedule_work(discard_ctl, true);
        return len;
  }
  BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show,
@@@ -496,7 -463,7 +496,7 @@@ static ssize_t btrfs_discard_kbps_limit
                return -EINVAL;
  
        WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit);
 -
 +      btrfs_discard_schedule_work(discard_ctl, true);
        return len;
  }
  BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show,
@@@ -887,82 -854,6 +887,82 @@@ static ssize_t btrfs_exclusive_operatio
  }
  BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
  
 +static ssize_t btrfs_generation_show(struct kobject *kobj,
 +                                   struct kobj_attribute *a, char *buf)
 +{
 +      struct btrfs_fs_info *fs_info = to_fs_info(kobj);
 +
 +      return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation);
 +}
 +BTRFS_ATTR(, generation, btrfs_generation_show);
 +
 +/*
 + * Look for an exact string @string in @buffer with possible leading or
 + * trailing whitespace
 + */
 +static bool strmatch(const char *buffer, const char *string)
 +{
 +      const size_t len = strlen(string);
 +
 +      /* Skip leading whitespace */
 +      buffer = skip_spaces(buffer);
 +
 +      /* Match entire string, check if the rest is whitespace or empty */
 +      if (strncmp(string, buffer, len) == 0 &&
 +          strlen(skip_spaces(buffer + len)) == 0)
 +              return true;
 +
 +      return false;
 +}
 +
 +static const char * const btrfs_read_policy_name[] = { "pid" };
 +
 +static ssize_t btrfs_read_policy_show(struct kobject *kobj,
 +                                    struct kobj_attribute *a, char *buf)
 +{
 +      struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
 +      ssize_t ret = 0;
 +      int i;
 +
 +      for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
 +              if (fs_devices->read_policy == i)
 +                      ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]",
 +                                       (ret == 0 ? "" : " "),
 +                                       btrfs_read_policy_name[i]);
 +              else
 +                      ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
 +                                       (ret == 0 ? "" : " "),
 +                                       btrfs_read_policy_name[i]);
 +      }
 +
 +      ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
 +
 +      return ret;
 +}
 +
 +static ssize_t btrfs_read_policy_store(struct kobject *kobj,
 +                                     struct kobj_attribute *a,
 +                                     const char *buf, size_t len)
 +{
 +      struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
 +      int i;
 +
 +      for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
 +              if (strmatch(buf, btrfs_read_policy_name[i])) {
 +                      if (i != fs_devices->read_policy) {
 +                              fs_devices->read_policy = i;
 +                              btrfs_info(fs_devices->fs_info,
 +                                         "read policy set to '%s'",
 +                                         btrfs_read_policy_name[i]);
 +                      }
 +                      return len;
 +              }
 +      }
 +
 +      return -EINVAL;
 +}
 +BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
 +
  static const struct attribute *btrfs_attrs[] = {
        BTRFS_ATTR_PTR(, label),
        BTRFS_ATTR_PTR(, nodesize),
        BTRFS_ATTR_PTR(, metadata_uuid),
        BTRFS_ATTR_PTR(, checksum),
        BTRFS_ATTR_PTR(, exclusive_operation),
 +      BTRFS_ATTR_PTR(, generation),
 +      BTRFS_ATTR_PTR(, read_policy),
        NULL,
  };
  
@@@ -1318,7 -1207,7 +1318,7 @@@ static const char *alloc_name(u64 flags
        default:
                WARN_ON(1);
                return "invalid-combination";
 -      };
 +      }
  }
  
  /*
@@@ -1343,8 -1232,6 +1343,6 @@@ int btrfs_sysfs_add_space_info_type(str
  
  void btrfs_sysfs_remove_device(struct btrfs_device *device)
  {
-       struct hd_struct *disk;
-       struct kobject *disk_kobj;
        struct kobject *devices_kobj;
  
        /*
        devices_kobj = device->fs_info->fs_devices->devices_kobj;
        ASSERT(devices_kobj);
  
-       if (device->bdev) {
-               disk = device->bdev->bd_part;
-               disk_kobj = &part_to_dev(disk)->kobj;
-               sysfs_remove_link(devices_kobj, disk_kobj->name);
-       }
+       if (device->bdev)
+               sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name);
  
        if (device->devid_kobj.state_initialized) {
                kobject_del(&device->devid_kobj);
@@@ -1464,11 -1348,7 +1459,7 @@@ int btrfs_sysfs_add_device(struct btrfs
        nofs_flag = memalloc_nofs_save();
  
        if (device->bdev) {
-               struct hd_struct *disk;
-               struct kobject *disk_kobj;
-               disk = device->bdev->bd_part;
-               disk_kobj = &part_to_dev(disk)->kobj;
+               struct kobject *disk_kobj = bdev_kobj(device->bdev);
  
                ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
                if (ret) {
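
The strmatch() helper added to fs/btrfs/sysfs.c above accepts a sysfs write only when it contains exactly the expected token, allowing surrounding whitespace such as a trailing newline. A rough userspace equivalent is sketched below; it uses isspace() in place of the kernel's skip_spaces(), so it is illustrative only.

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Match 'string' exactly, allowing leading/trailing whitespace in 'buffer'. */
static bool strmatch(const char *buffer, const char *string)
{
	size_t len = strlen(string);

	while (isspace((unsigned char)*buffer))	/* skip leading whitespace */
		buffer++;
	if (strncmp(buffer, string, len) != 0)
		return false;
	buffer += len;
	while (isspace((unsigned char)*buffer))	/* rest must be whitespace */
		buffer++;
	return *buffer == '\0';
}

int main(void)
{
	printf("%d %d %d\n",
	       strmatch(" pid\n", "pid"),	/* 1: surrounding whitespace ok */
	       strmatch("pidx", "pid"),		/* 0: token must end here */
	       strmatch("pi d", "pid"));	/* 0: no partial matches */
	return 0;
}
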
diff --combined fs/btrfs/volumes.c
@@@ -31,7 -31,6 +31,7 @@@
  #include "space-info.h"
  #include "block-group.h"
  #include "discard.h"
 +#include "zoned.h"
  
  const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
@@@ -375,7 -374,6 +375,7 @@@ void btrfs_free_device(struct btrfs_dev
        rcu_string_free(device->name);
        extent_io_tree_release(&device->alloc_state);
        bio_put(device->flush_bio);
 +      btrfs_destroy_dev_zone_info(device);
        kfree(device);
  }
  
@@@ -669,10 -667,6 +669,10 @@@ static int btrfs_open_one_device(struc
        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        device->mode = flags;
  
 +      ret = btrfs_get_dev_zone_info(device);
 +      if (ret != 0)
 +              goto error_free_page;
 +
        fs_devices->open_devices++;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@@ -828,7 -822,7 +828,7 @@@ static noinline struct btrfs_device *de
        } else {
                mutex_lock(&fs_devices->device_list_mutex);
                device = btrfs_find_device(fs_devices, devid,
 -                              disk_super->dev_item.uuid, NULL, false);
 +                              disk_super->dev_item.uuid, NULL);
  
                /*
                 * If this disk has been pulled into an fs devices created by
                 * make sure it's the same device if the device is mounted
                 */
                if (device->bdev) {
-                       struct block_device *path_bdev;
+                       int error;
+                       dev_t path_dev;
  
-                       path_bdev = lookup_bdev(path);
-                       if (IS_ERR(path_bdev)) {
+                       error = lookup_bdev(path, &path_dev);
+                       if (error) {
                                mutex_unlock(&fs_devices->device_list_mutex);
-                               return ERR_CAST(path_bdev);
+                               return ERR_PTR(error);
                        }
  
-                       if (device->bdev != path_bdev) {
-                               bdput(path_bdev);
+                       if (device->bdev->bd_dev != path_dev) {
                                mutex_unlock(&fs_devices->device_list_mutex);
 -                              btrfs_warn_in_rcu(device->fs_info,
 +                              /*
 +                               * device->fs_info may not be reliable here, so
 +                               * pass in a NULL instead. This avoids a
 +                               * possible use-after-free when the fs_info and
 +                               * fs_info->sb are already torn down.
 +                               */
 +                              btrfs_warn_in_rcu(NULL,
        "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
                                                  path, devid, found_transid,
                                                  current->comm,
                                                  task_pid_nr(current));
                                return ERR_PTR(-EEXIST);
                        }
-                       bdput(path_bdev);
                        btrfs_info_in_rcu(device->fs_info,
        "devid %llu device path %s changed to %s scanned by %s (%d)",
                                          devid, rcu_str_deref(device->name),
@@@ -1050,7 -1037,7 +1049,7 @@@ error
  }
  
  static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
 -                                    int step, struct btrfs_device **latest_dev)
 +                                    struct btrfs_device **latest_dev)
  {
        struct btrfs_device *device, *next;
  
   * After we have read the system tree and know devids belonging to this
   * filesystem, remove the device which does not belong there.
   */
 -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
 +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
  {
        struct btrfs_device *latest_dev = NULL;
        struct btrfs_fs_devices *seed_dev;
  
        mutex_lock(&uuid_mutex);
 -      __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
 +      __btrfs_free_extra_devids(fs_devices, &latest_dev);
  
        list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
 -              __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
 +              __btrfs_free_extra_devids(seed_dev, &latest_dev);
  
        fs_devices->latest_bdev = latest_dev->bdev;
  
@@@ -1143,7 -1130,6 +1142,7 @@@ static void btrfs_close_one_device(stru
                device->bdev = NULL;
        }
        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 +      btrfs_destroy_dev_zone_info(device);
  
        device->fs_info = NULL;
        atomic_set(&device->dev_stats_ccnt, 0);
@@@ -1224,7 -1210,6 +1223,7 @@@ static int open_fs_devices(struct btrfs
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
        fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
 +      fs_devices->read_policy = BTRFS_READ_POLICY_PID;
  
        return 0;
  }
@@@ -1276,7 -1261,7 +1275,7 @@@ void btrfs_release_disk_super(struct bt
  }
  
  static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
 -                                                     u64 bytenr)
 +                                                     u64 bytenr, u64 bytenr_orig)
  {
        struct btrfs_super_block *disk_super;
        struct page *page;
        /* align our pointer to the offset of the super block */
        disk_super = p + offset_in_page(bytenr);
  
 -      if (btrfs_super_bytenr(disk_super) != bytenr ||
 +      if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
            btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
                btrfs_release_disk_super(p);
                return ERR_PTR(-EINVAL);
@@@ -1342,8 -1327,7 +1341,8 @@@ struct btrfs_device *btrfs_scan_one_dev
        bool new_device_added = false;
        struct btrfs_device *device = NULL;
        struct block_device *bdev;
 -      u64 bytenr;
 +      u64 bytenr, bytenr_orig;
 +      int ret;
  
        lockdep_assert_held(&uuid_mutex);
  
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
 -      bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;
  
        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev))
                return ERR_CAST(bdev);
  
 -      disk_super = btrfs_read_disk_super(bdev, bytenr);
 +      bytenr_orig = btrfs_sb_offset(0);
 +      ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
 +      if (ret)
 +              return ERR_PTR(ret);
 +
 +      disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
        if (IS_ERR(disk_super)) {
                device = ERR_CAST(disk_super);
                goto error_bdev_put;
@@@ -2028,11 -2008,6 +2027,11 @@@ void btrfs_scratch_superblocks(struct b
                if (IS_ERR(disk_super))
                        continue;
  
 +              if (bdev_is_zoned(bdev)) {
 +                      btrfs_reset_sb_log_zones(bdev, copy_num);
 +                      continue;
 +              }
 +
                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
  
                page = virt_to_page(disk_super);
@@@ -2311,10 -2286,10 +2310,10 @@@ static struct btrfs_device *btrfs_find_
        dev_uuid = disk_super->dev_item.uuid;
        if (btrfs_fs_incompat(fs_info, METADATA_UUID))
                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
 -                                         disk_super->metadata_uuid, true);
 +                                         disk_super->metadata_uuid);
        else
                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
 -                                         disk_super->fsid, true);
 +                                         disk_super->fsid);
  
        btrfs_release_disk_super(disk_super);
        if (!device)
@@@ -2334,7 -2309,7 +2333,7 @@@ struct btrfs_device *btrfs_find_device_
  
        if (devid) {
                device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
 -                                         NULL, true);
 +                                         NULL);
                if (!device)
                        return ERR_PTR(-ENOENT);
                return device;
@@@ -2483,7 -2458,7 +2482,7 @@@ next_slot
                read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
                                   BTRFS_FSID_SIZE);
                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
 -                                         fs_uuid, true);
 +                                         fs_uuid);
                BUG_ON(!device); /* Logic error */
  
                if (device->fs_devices->seeding) {
@@@ -2525,11 -2500,6 +2524,11 @@@ int btrfs_init_new_device(struct btrfs_
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
  
 +      if (!btrfs_check_device_zone_type(fs_info, bdev)) {
 +              ret = -EINVAL;
 +              goto error;
 +      }
 +
        if (fs_devices->seeding) {
                seeding_dev = 1;
                down_write(&sb->s_umount);
        }
        rcu_assign_pointer(device->name, name);
  
 +      device->fs_info = fs_info;
 +      device->bdev = bdev;
 +
 +      ret = btrfs_get_dev_zone_info(device);
 +      if (ret)
 +              goto error_free_device;
 +
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
 -              goto error_free_device;
 +              goto error_free_zone;
        }
  
        q = bdev_get_queue(bdev);
                                         fs_info->sectorsize);
        device->disk_total_bytes = device->total_bytes;
        device->commit_total_bytes = device->total_bytes;
 -      device->fs_info = fs_info;
 -      device->bdev = bdev;
        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
        device->mode = FMODE_EXCL;
@@@ -2732,8 -2697,6 +2731,8 @@@ error_trans
                sb->s_flags |= SB_RDONLY;
        if (trans)
                btrfs_end_transaction(trans);
 +error_free_zone:
 +      btrfs_destroy_dev_zone_info(device);
  error_free_device:
        btrfs_free_device(device);
  error:
@@@ -5509,18 -5472,7 +5508,18 @@@ static int find_live_mirror(struct btrf
        else
                num_stripes = map->num_stripes;
  
 -      preferred_mirror = first + current->pid % num_stripes;
 +      switch (fs_info->fs_devices->read_policy) {
 +      default:
 +              /* Shouldn't happen, just warn and use pid instead of failing */
 +              btrfs_warn_rl(fs_info,
 +                            "unknown read_policy type %u, reset to pid",
 +                            fs_info->fs_devices->read_policy);
 +              fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
 +              fallthrough;
 +      case BTRFS_READ_POLICY_PID:
 +              preferred_mirror = first + (current->pid % num_stripes);
 +              break;
 +      }
  
        if (dev_replace_is_ongoing &&
            fs_info->dev_replace.cont_reading_from_srcdev_mode ==
@@@ -6376,7 -6328,7 +6375,7 @@@ static void submit_stripe_bio(struct bt
        bio->bi_iter.bi_sector = physical >> 9;
        btrfs_debug_in_rcu(fs_info,
        "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
 -              bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
 +              bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
                (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
                dev->devid, bio->bi_iter.bi_size);
        bio_set_dev(bio, dev->bdev);
@@@ -6408,7 -6360,7 +6407,7 @@@ blk_status_t btrfs_map_bio(struct btrfs
  {
        struct btrfs_device *dev;
        struct bio *first_bio = bio;
 -      u64 logical = (u64)bio->bi_iter.bi_sector << 9;
 +      u64 logical = bio->bi_iter.bi_sector << 9;
        u64 length = 0;
        u64 map_length;
        int ret;
   * If @seed is true, traverse through the seed devices.
   */
  struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
 -                                     u64 devid, u8 *uuid, u8 *fsid,
 -                                     bool seed)
 +                                     u64 devid, u8 *uuid, u8 *fsid)
  {
        struct btrfs_device *device;
        struct btrfs_fs_devices *seed_devs;
@@@ -6695,7 -6648,7 +6694,7 @@@ static int read_one_chunk(struct btrfs_
                                   btrfs_stripe_dev_uuid_nr(chunk, i),
                                   BTRFS_UUID_SIZE);
                map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
 -                                                      devid, uuid, NULL, true);
 +                                                      devid, uuid, NULL);
                if (!map->stripes[i].dev &&
                    !btrfs_test_opt(fs_info, DEGRADED)) {
                        free_extent_map(em);
@@@ -6834,7 -6787,7 +6833,7 @@@ static int read_one_dev(struct extent_b
        }
  
        device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
 -                                 fs_uuid, true);
 +                                 fs_uuid);
        if (!device) {
                if (!btrfs_test_opt(fs_info, DEGRADED)) {
                        btrfs_report_missing_device(fs_info, devid,
        }
  
        fill_device_from_item(leaf, dev_item, device);
 +      if (device->bdev) {
 +              u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
 +
 +              if (device->total_bytes > max_total_bytes) {
 +                      btrfs_err(fs_info,
 +                      "device total_bytes should be at most %llu but found %llu",
 +                                max_total_bytes, device->total_bytes);
 +                      return -EINVAL;
 +              }
 +      }
        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
           !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
@@@ -6941,11 -6884,11 +6940,11 @@@ int btrfs_read_sys_array(struct btrfs_f
         * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
         * overallocate but we can keep it as-is, only the first page is used.
         */
 -      sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
 +      sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
 +                                        root->root_key.objectid, 0);
        if (IS_ERR(sb))
                return PTR_ERR(sb);
        set_extent_buffer_uptodate(sb);
 -      btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
        /*
         * The sb extent buffer is artificial and just used to read the system array.
         * set_extent_buffer_uptodate() call does not properly mark all it's
@@@ -7109,8 -7052,12 +7108,8 @@@ static void readahead_tree_node_childre
        int i;
        const int nr_items = btrfs_header_nritems(node);
  
 -      for (i = 0; i < nr_items; i++) {
 -              u64 start;
 -
 -              start = btrfs_node_blockptr(node, i);
 -              readahead_tree_block(node->fs_info, start);
 -      }
 +      for (i = 0; i < nr_items; i++)
 +              btrfs_readahead_node_child(node, i);
  }
  
  int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
@@@ -7497,7 -7444,8 +7496,7 @@@ int btrfs_get_dev_stats(struct btrfs_fs
        int i;
  
        mutex_lock(&fs_devices->device_list_mutex);
 -      dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
 -                              true);
 +      dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
        mutex_unlock(&fs_devices->device_list_mutex);
  
        if (!dev) {
@@@ -7628,13 -7576,28 +7627,13 @@@ static int verify_one_dev_extent(struc
        }
  
        /* Make sure no dev extent is beyond device boundary */
 -      dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
 +      dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
        if (!dev) {
                btrfs_err(fs_info, "failed to find devid %llu", devid);
                ret = -EUCLEAN;
                goto out;
        }
  
 -      /* It's possible this device is a dummy for seed device */
 -      if (dev->disk_total_bytes == 0) {
 -              struct btrfs_fs_devices *devs;
 -
 -              devs = list_first_entry(&fs_info->fs_devices->seed_list,
 -                                      struct btrfs_fs_devices, seed_list);
 -              dev = btrfs_find_device(devs, devid, NULL, NULL, false);
 -              if (!dev) {
 -                      btrfs_err(fs_info, "failed to find seed devid %llu",
 -                                devid);
 -                      ret = -EUCLEAN;
 -                      goto out;
 -              }
 -      }
 -
        if (physical_offset + physical_len > dev->disk_total_bytes) {
                btrfs_err(fs_info,
  "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
@@@ -7689,19 -7652,6 +7688,19 @@@ int btrfs_verify_dev_extents(struct btr
        u64 prev_dev_ext_end = 0;
        int ret = 0;
  
 +      /*
 +       * We don't have a dev_root because we mounted with ignorebadroots and
 +       * failed to load the root, so we want to skip the verification in this
 +       * case for sure.
 +       *
 +       * However, if the dev root is fine but the tree itself is corrupted,
 +       * we'd still fail to mount.  This verification is only to make sure
 +       * writes can happen safely, so instead just bypass this check
 +       * completely in the case of IGNOREBADROOTS.
 +       */
 +      if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
 +              return 0;
 +
        key.objectid = 1;
        key.type = BTRFS_DEV_EXTENT_KEY;
        key.offset = 0;
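
find_live_mirror() above now routes mirror selection through fs_devices->read_policy; the only policy wired up in this series, BTRFS_READ_POLICY_PID, keeps the existing behaviour of spreading readers across stripes by pid. A minimal sketch of that selection follows, with made-up pid and stripe counts; pick_mirror() is illustrative, not a kernel function.

#include <stdio.h>

/* BTRFS_READ_POLICY_PID: preferred mirror = first + (pid % num_stripes) */
static int pick_mirror(int first, int num_stripes, int pid)
{
	return first + (pid % num_stripes);
}

int main(void)
{
	int pids[] = { 1000, 1001, 1002, 1003 };
	int i;

	/* e.g. RAID1 with two copies, stripes starting at index 0 */
	for (i = 0; i < 4; i++)
		printf("pid %d -> mirror %d\n", pids[i],
		       pick_mirror(0, 2, pids[i]));
	return 0;
}
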
diff --combined fs/btrfs/zoned.c
index 1555451,0000000..c388466
mode 100644,000000..100644
--- /dev/null
@@@ -1,616 -1,0 +1,616 @@@
 +// SPDX-License-Identifier: GPL-2.0
 +
 +#include <linux/slab.h>
 +#include <linux/blkdev.h>
 +#include "ctree.h"
 +#include "volumes.h"
 +#include "zoned.h"
 +#include "rcu-string.h"
 +
 +/* Maximum number of zones to report per blkdev_report_zones() call */
 +#define BTRFS_REPORT_NR_ZONES   4096
 +
 +/* Number of superblock log zones */
 +#define BTRFS_NR_SB_LOG_ZONES 2
 +
 +static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
 +{
 +      struct blk_zone *zones = data;
 +
 +      memcpy(&zones[idx], zone, sizeof(*zone));
 +
 +      return 0;
 +}
 +
 +static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
 +                          u64 *wp_ret)
 +{
 +      bool empty[BTRFS_NR_SB_LOG_ZONES];
 +      bool full[BTRFS_NR_SB_LOG_ZONES];
 +      sector_t sector;
 +
 +      ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
 +             zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
 +
 +      empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
 +      empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
 +      full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
 +      full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
 +
 +      /*
 +       * Possible states of log buffer zones
 +       *
 +       *           Empty[0]  In use[0]  Full[0]
 +       * Empty[1]         *          x        0
 +       * In use[1]        0          x        0
 +       * Full[1]          1          1        C
 +       *
 +       * Log position:
 +       *   *: Special case, no superblock is written
 +       *   0: Use write pointer of zones[0]
 +       *   1: Use write pointer of zones[1]
 +       *   C: Compare super blocks from zones[0] and zones[1], use the latest
 +       *      one determined by generation
 +       *   x: Invalid state
 +       */
 +
 +      if (empty[0] && empty[1]) {
 +              /* Special case to distinguish no superblock to read */
 +              *wp_ret = zones[0].start << SECTOR_SHIFT;
 +              return -ENOENT;
 +      } else if (full[0] && full[1]) {
 +              /* Compare two super blocks */
 +              struct address_space *mapping = bdev->bd_inode->i_mapping;
 +              struct page *page[BTRFS_NR_SB_LOG_ZONES];
 +              struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
 +              int i;
 +
 +              for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
 +                      u64 bytenr;
 +
 +                      bytenr = ((zones[i].start + zones[i].len)
 +                                 << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;
 +
 +                      page[i] = read_cache_page_gfp(mapping,
 +                                      bytenr >> PAGE_SHIFT, GFP_NOFS);
 +                      if (IS_ERR(page[i])) {
 +                              if (i == 1)
 +                                      btrfs_release_disk_super(super[0]);
 +                              return PTR_ERR(page[i]);
 +                      }
 +                      super[i] = page_address(page[i]);
 +              }
 +
 +              if (super[0]->generation > super[1]->generation)
 +                      sector = zones[1].start;
 +              else
 +                      sector = zones[0].start;
 +
 +              for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
 +                      btrfs_release_disk_super(super[i]);
 +      } else if (!full[0] && (empty[1] || full[1])) {
 +              sector = zones[0].wp;
 +      } else if (full[0]) {
 +              sector = zones[1].wp;
 +      } else {
 +              return -EUCLEAN;
 +      }
 +      *wp_ret = sector << SECTOR_SHIFT;
 +      return 0;
 +}
 +
 +/*
 + * The following zones are reserved as the circular buffer on ZONED btrfs.
 + *  - The primary superblock: zones 0 and 1
 + *  - The first copy: zones 16 and 17
 + *  - The second copy: zone 1024 or the zone at 256GB, whichever comes
 + *                     first, and the following one
 + */
 +static inline u32 sb_zone_number(int shift, int mirror)
 +{
 +      ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
 +
 +      switch (mirror) {
 +      case 0: return 0;
 +      case 1: return 16;
 +      case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024);
 +      }
 +
 +      return 0;
 +}
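 +
 +/*
 + * For example, with 256MB zones (shift == 28) mirror 2 resolves to
 + * min_t(u64, 256GB >> 28, 1024) = zone 1024, while with 1GB zones
 + * (shift == 30) it falls on the zone at 256GB, i.e. zone 256.
 + */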
 +
 +static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 +                             struct blk_zone *zones, unsigned int *nr_zones)
 +{
 +      int ret;
 +
 +      if (!*nr_zones)
 +              return 0;
 +
 +      ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
 +                                copy_zone_info_cb, zones);
 +      if (ret < 0) {
 +              btrfs_err_in_rcu(device->fs_info,
 +                               "zoned: failed to read zone %llu on %s (devid %llu)",
 +                               pos, rcu_str_deref(device->name),
 +                               device->devid);
 +              return ret;
 +      }
 +      *nr_zones = ret;
 +      if (!ret)
 +              return -EIO;
 +
 +      return 0;
 +}
 +
 +int btrfs_get_dev_zone_info(struct btrfs_device *device)
 +{
 +      struct btrfs_zoned_device_info *zone_info = NULL;
 +      struct block_device *bdev = device->bdev;
 +      struct request_queue *queue = bdev_get_queue(bdev);
 +      sector_t nr_sectors;
 +      sector_t sector = 0;
 +      struct blk_zone *zones = NULL;
 +      unsigned int i, nreported = 0, nr_zones;
 +      unsigned int zone_sectors;
 +      int ret;
 +
 +      if (!bdev_is_zoned(bdev))
 +              return 0;
 +
 +      if (device->zone_info)
 +              return 0;
 +
 +      zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
 +      if (!zone_info)
 +              return -ENOMEM;
 +
-       nr_sectors = bdev->bd_part->nr_sects;
++      nr_sectors = bdev_nr_sectors(bdev);
 +      zone_sectors = bdev_zone_sectors(bdev);
 +      /* Check if it's a power of 2 (see is_power_of_2) */
 +      ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
 +      zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
 +      zone_info->zone_size_shift = ilog2(zone_info->zone_size);
 +      zone_info->max_zone_append_size =
 +              (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
 +      zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
 +      if (!IS_ALIGNED(nr_sectors, zone_sectors))
 +              zone_info->nr_zones++;
 +
 +      zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
 +      if (!zone_info->seq_zones) {
 +              ret = -ENOMEM;
 +              goto out;
 +      }
 +
 +      zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
 +      if (!zone_info->empty_zones) {
 +              ret = -ENOMEM;
 +              goto out;
 +      }
 +
 +      zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
 +      if (!zones) {
 +              ret = -ENOMEM;
 +              goto out;
 +      }
 +
 +      /* Get zone types */
 +      while (sector < nr_sectors) {
 +              nr_zones = BTRFS_REPORT_NR_ZONES;
 +              ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
 +                                        &nr_zones);
 +              if (ret)
 +                      goto out;
 +
 +              for (i = 0; i < nr_zones; i++) {
 +                      if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
 +                              __set_bit(nreported, zone_info->seq_zones);
 +                      if (zones[i].cond == BLK_ZONE_COND_EMPTY)
 +                              __set_bit(nreported, zone_info->empty_zones);
 +                      nreported++;
 +              }
 +              sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
 +      }
 +
 +      if (nreported != zone_info->nr_zones) {
 +              btrfs_err_in_rcu(device->fs_info,
 +                               "inconsistent number of zones on %s (%u/%u)",
 +                               rcu_str_deref(device->name), nreported,
 +                               zone_info->nr_zones);
 +              ret = -EIO;
 +              goto out;
 +      }
 +
 +      /* Validate superblock log */
 +      nr_zones = BTRFS_NR_SB_LOG_ZONES;
 +      for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 +              u32 sb_zone;
 +              u64 sb_wp;
 +              int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
 +
 +              sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
 +              if (sb_zone + 1 >= zone_info->nr_zones)
 +                      continue;
 +
 +              sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
 +              ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
 +                                        &zone_info->sb_zones[sb_pos],
 +                                        &nr_zones);
 +              if (ret)
 +                      goto out;
 +
 +              if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
 +                      btrfs_err_in_rcu(device->fs_info,
 +      "zoned: failed to read super block log zone info at devid %llu zone %u",
 +                                       device->devid, sb_zone);
 +                      ret = -EUCLEAN;
 +                      goto out;
 +              }
 +
 +              /*
 +               * If zones[0] is conventional, always use the beginning of the
 +               * zone to record superblock. No need to validate in that case.
 +               */
 +              if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
 +                  BLK_ZONE_TYPE_CONVENTIONAL)
 +                      continue;
 +
 +              ret = sb_write_pointer(device->bdev,
 +                                     &zone_info->sb_zones[sb_pos], &sb_wp);
 +              if (ret != -ENOENT && ret) {
 +                      btrfs_err_in_rcu(device->fs_info,
 +                      "zoned: super block log zone corrupted devid %llu zone %u",
 +                                       device->devid, sb_zone);
 +                      ret = -EUCLEAN;
 +                      goto out;
 +              }
 +      }
 +
 +      kfree(zones);
 +
 +      device->zone_info = zone_info;
 +
 +      /* device->fs_info is not safe to use for printing messages */
 +      btrfs_info_in_rcu(NULL,
 +                      "host-%s zoned block device %s, %u zones of %llu bytes",
 +                      bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware",
 +                      rcu_str_deref(device->name), zone_info->nr_zones,
 +                      zone_info->zone_size);
 +
 +      return 0;
 +
 +out:
 +      kfree(zones);
 +      bitmap_free(zone_info->empty_zones);
 +      bitmap_free(zone_info->seq_zones);
 +      kfree(zone_info);
 +
 +      return ret;
 +}
 +
 +void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
 +{
 +      struct btrfs_zoned_device_info *zone_info = device->zone_info;
 +
 +      if (!zone_info)
 +              return;
 +
 +      bitmap_free(zone_info->seq_zones);
 +      bitmap_free(zone_info->empty_zones);
 +      kfree(zone_info);
 +      device->zone_info = NULL;
 +}
 +
 +int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 +                     struct blk_zone *zone)
 +{
 +      unsigned int nr_zones = 1;
 +      int ret;
 +
 +      ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
 +      if (ret != 0 || !nr_zones)
 +              return ret ? ret : -EIO;
 +
 +      return 0;
 +}
 +
 +int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 +{
 +      struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 +      struct btrfs_device *device;
 +      u64 zoned_devices = 0;
 +      u64 nr_devices = 0;
 +      u64 zone_size = 0;
 +      u64 max_zone_append_size = 0;
 +      const bool incompat_zoned = btrfs_is_zoned(fs_info);
 +      int ret = 0;
 +
 +      /* Count zoned devices */
 +      list_for_each_entry(device, &fs_devices->devices, dev_list) {
 +              enum blk_zoned_model model;
 +
 +              if (!device->bdev)
 +                      continue;
 +
 +              model = bdev_zoned_model(device->bdev);
 +              if (model == BLK_ZONED_HM ||
 +                  (model == BLK_ZONED_HA && incompat_zoned)) {
 +                      struct btrfs_zoned_device_info *zone_info;
 +
 +                      zone_info = device->zone_info;
 +                      zoned_devices++;
 +                      if (!zone_size) {
 +                              zone_size = zone_info->zone_size;
 +                      } else if (zone_info->zone_size != zone_size) {
 +                              btrfs_err(fs_info,
 +              "zoned: unequal block device zone sizes: have %llu found %llu",
 +                                        device->zone_info->zone_size,
 +                                        zone_size);
 +                              ret = -EINVAL;
 +                              goto out;
 +                      }
 +                      if (!max_zone_append_size ||
 +                          (zone_info->max_zone_append_size &&
 +                           zone_info->max_zone_append_size < max_zone_append_size))
 +                              max_zone_append_size =
 +                                      zone_info->max_zone_append_size;
 +              }
 +              nr_devices++;
 +      }
 +
 +      if (!zoned_devices && !incompat_zoned)
 +              goto out;
 +
 +      if (!zoned_devices && incompat_zoned) {
 +              /* No zoned block device found on ZONED filesystem */
 +              btrfs_err(fs_info,
 +                        "zoned: no zoned devices found on a zoned filesystem");
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
 +      if (zoned_devices && !incompat_zoned) {
 +              btrfs_err(fs_info,
 +                        "zoned: mode not enabled but zoned device found");
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
 +      if (zoned_devices != nr_devices) {
 +              btrfs_err(fs_info,
 +                        "zoned: cannot mix zoned and regular devices");
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
 +      /*
 +       * stripe_size is always aligned to BTRFS_STRIPE_LEN in
 +       * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
 +       * check the alignment here.
 +       */
 +      if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
 +              btrfs_err(fs_info,
 +                        "zoned: zone size %llu not aligned to stripe %u",
 +                        zone_size, BTRFS_STRIPE_LEN);
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
 +      if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 +              btrfs_err(fs_info, "zoned: mixed block groups not supported");
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
 +      fs_info->zone_size = zone_size;
 +      fs_info->max_zone_append_size = max_zone_append_size;
 +
 +      btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
 +out:
 +      return ret;
 +}
 +
 +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
 +{
 +      if (!btrfs_is_zoned(info))
 +              return 0;
 +
 +      /*
 +       * Space cache writing is not COWed. Disable that to avoid write errors
 +       * in sequential zones.
 +       */
 +      if (btrfs_test_opt(info, SPACE_CACHE)) {
 +              btrfs_err(info, "zoned: space cache v1 is not supported");
 +              return -EINVAL;
 +      }
 +
 +      if (btrfs_test_opt(info, NODATACOW)) {
 +              btrfs_err(info, "zoned: NODATACOW not supported");
 +              return -EINVAL;
 +      }
 +
 +      return 0;
 +}
 +
 +static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
 +                         int rw, u64 *bytenr_ret)
 +{
 +      u64 wp;
 +      int ret;
 +
 +      if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
 +              *bytenr_ret = zones[0].start << SECTOR_SHIFT;
 +              return 0;
 +      }
 +
 +      ret = sb_write_pointer(bdev, zones, &wp);
 +      if (ret != -ENOENT && ret < 0)
 +              return ret;
 +
 +      if (rw == WRITE) {
 +              struct blk_zone *reset = NULL;
 +
 +              if (wp == zones[0].start << SECTOR_SHIFT)
 +                      reset = &zones[0];
 +              else if (wp == zones[1].start << SECTOR_SHIFT)
 +                      reset = &zones[1];
 +
 +              if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
 +                      ASSERT(reset->cond == BLK_ZONE_COND_FULL);
 +
 +                      ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
 +                                             reset->start, reset->len,
 +                                             GFP_NOFS);
 +                      if (ret)
 +                              return ret;
 +
 +                      reset->cond = BLK_ZONE_COND_EMPTY;
 +                      reset->wp = reset->start;
 +              }
 +      } else if (ret != -ENOENT) {
 +              /* For READ, we want the previous one */
 +              if (wp == zones[0].start << SECTOR_SHIFT)
 +                      wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
 +              wp -= BTRFS_SUPER_INFO_SIZE;
 +      }
 +
 +      *bytenr_ret = wp;
 +      return 0;
 +}
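 +
 +/*
 + * The two superblock log zones are reported back to back, so when the
 + * write pointer computed above sits at zones[1].start, the most recent
 + * superblock occupies the last BTRFS_SUPER_INFO_SIZE bytes of zones[0],
 + * which is exactly what the READ branch returns.
 + */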
 +
 +int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
 +                             u64 *bytenr_ret)
 +{
 +      struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
 +      unsigned int zone_sectors;
 +      u32 sb_zone;
 +      int ret;
 +      u64 zone_size;
 +      u8 zone_sectors_shift;
 +      sector_t nr_sectors;
 +      u32 nr_zones;
 +
 +      if (!bdev_is_zoned(bdev)) {
 +              *bytenr_ret = btrfs_sb_offset(mirror);
 +              return 0;
 +      }
 +
 +      ASSERT(rw == READ || rw == WRITE);
 +
 +      zone_sectors = bdev_zone_sectors(bdev);
 +      if (!is_power_of_2(zone_sectors))
 +              return -EINVAL;
 +      zone_size = zone_sectors << SECTOR_SHIFT;
 +      zone_sectors_shift = ilog2(zone_sectors);
-       nr_sectors = bdev->bd_part->nr_sects;
++      nr_sectors = bdev_nr_sectors(bdev);
 +      nr_zones = nr_sectors >> zone_sectors_shift;
 +
 +      sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
 +      if (sb_zone + 1 >= nr_zones)
 +              return -ENOENT;
 +
 +      ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift,
 +                                BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
 +                                zones);
 +      if (ret < 0)
 +              return ret;
 +      if (ret != BTRFS_NR_SB_LOG_ZONES)
 +              return -EIO;
 +
 +      return sb_log_location(bdev, zones, rw, bytenr_ret);
 +}
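 +
 +/*
 + * On a regular (non-zoned) block device this falls back to the fixed
 + * superblock offsets, e.g. 64K for mirror 0, so callers need not
 + * distinguish the two layouts.
 + */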
 +
 +int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
 +                        u64 *bytenr_ret)
 +{
 +      struct btrfs_zoned_device_info *zinfo = device->zone_info;
 +      u32 zone_num;
 +
 +      if (!zinfo) {
 +              *bytenr_ret = btrfs_sb_offset(mirror);
 +              return 0;
 +      }
 +
 +      zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
 +      if (zone_num + 1 >= zinfo->nr_zones)
 +              return -ENOENT;
 +
 +      return sb_log_location(device->bdev,
 +                             &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
 +                             rw, bytenr_ret);
 +}
 +
 +static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
 +                                int mirror)
 +{
 +      u32 zone_num;
 +
 +      if (!zinfo)
 +              return false;
 +
 +      zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
 +      if (zone_num + 1 >= zinfo->nr_zones)
 +              return false;
 +
 +      if (!test_bit(zone_num, zinfo->seq_zones))
 +              return false;
 +
 +      return true;
 +}
 +
 +void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
 +{
 +      struct btrfs_zoned_device_info *zinfo = device->zone_info;
 +      struct blk_zone *zone;
 +
 +      if (!is_sb_log_zone(zinfo, mirror))
 +              return;
 +
 +      zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
 +      if (zone->cond != BLK_ZONE_COND_FULL) {
 +              if (zone->cond == BLK_ZONE_COND_EMPTY)
 +                      zone->cond = BLK_ZONE_COND_IMP_OPEN;
 +
 +              zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
 +
 +              if (zone->wp == zone->start + zone->len)
 +                      zone->cond = BLK_ZONE_COND_FULL;
 +
 +              return;
 +      }
 +
 +      zone++;
 +      ASSERT(zone->cond != BLK_ZONE_COND_FULL);
 +      if (zone->cond == BLK_ZONE_COND_EMPTY)
 +              zone->cond = BLK_ZONE_COND_IMP_OPEN;
 +
 +      zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
 +
 +      if (zone->wp == zone->start + zone->len)
 +              zone->cond = BLK_ZONE_COND_FULL;
 +}
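 +
 +/*
 + * Each superblock write advances the cached write pointer by
 + * BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT (8) sectors, so for example a
 + * 256MB zone pair absorbs 131072 updates before both zones become full.
 + */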
 +
 +int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
 +{
 +      sector_t zone_sectors;
 +      sector_t nr_sectors;
 +      u8 zone_sectors_shift;
 +      u32 sb_zone;
 +      u32 nr_zones;
 +
 +      zone_sectors = bdev_zone_sectors(bdev);
 +      zone_sectors_shift = ilog2(zone_sectors);
++      nr_sectors = bdev_nr_sectors(bdev);
 +      nr_zones = nr_sectors >> zone_sectors_shift;
 +
 +      sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
 +      if (sb_zone + 1 >= nr_zones)
 +              return -ENOENT;
 +
 +      return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
 +                              sb_zone << zone_sectors_shift,
 +                              zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
 +}
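 +
 +/*
 + * For example, with 256MB zones (524288 sectors) resetting mirror 1 clears
 + * the two zones starting at zone 16, i.e. 1048576 sectors beginning at
 + * sector 8388608.
 + */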
diff --combined fs/buffer.c
@@@ -523,7 -523,7 +523,7 @@@ repeat
  
  void emergency_thaw_bdev(struct super_block *sb)
  {
-       while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+       while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
                printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
  }
  
@@@ -657,7 -657,7 +657,7 @@@ int __set_page_dirty_buffers(struct pag
                } while (bh != head);
        }
        /*
 -       * Lock out page->mem_cgroup migration to keep PageDirty
 +       * Lock out page's memcg migration to keep PageDirty
         * synchronized with per-memcg dirty page counters.
         */
        lock_page_memcg(page);
diff --combined fs/ext4/super.c
@@@ -2638,6 -2638,10 +2638,6 @@@ static int _ext4_show_options(struct se
        } else if (test_opt2(sb, DAX_INODE)) {
                SEQ_OPTS_PUTS("dax=inode");
        }
 -
 -      if (test_opt2(sb, JOURNAL_FAST_COMMIT))
 -              SEQ_OPTS_PUTS("fast_commit");
 -
        ext4_show_quota_options(seq, sb);
        return 0;
  }
@@@ -4044,9 -4048,8 +4044,8 @@@ static int ext4_fill_super(struct super
        sbi->s_sb = sb;
        sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
        sbi->s_sb_block = sb_block;
-       if (sb->s_bdev->bd_part)
-               sbi->s_sectors_written_start =
-                       part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
+       sbi->s_sectors_written_start =
+               part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
  
        /* Cleanup superblock name */
        strreplace(sb->s_id, '/', '!');
@@@ -5505,15 -5508,10 +5504,10 @@@ static int ext4_commit_super(struct sup
         */
        if (!(sb->s_flags & SB_RDONLY))
                ext4_update_tstamp(es, s_wtime);
-       if (sb->s_bdev->bd_part)
-               es->s_kbytes_written =
-                       cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
-                           ((part_stat_read(sb->s_bdev->bd_part,
-                                            sectors[STAT_WRITE]) -
-                             EXT4_SB(sb)->s_sectors_written_start) >> 1));
-       else
-               es->s_kbytes_written =
-                       cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
+       es->s_kbytes_written =
+               cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+                   ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
+                     EXT4_SB(sb)->s_sectors_written_start) >> 1));
        if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
                ext4_free_blocks_count_set(es,
                        EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
diff --combined fs/f2fs/f2fs.h
@@@ -1675,7 -1675,7 +1675,7 @@@ static inline bool f2fs_is_multi_device
   * and the return value is in kbytes. s is of struct f2fs_sb_info.
   */
  #define BD_PART_WRITTEN(s)                                             \
- (((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) -   \
+       (((u64)part_stat_read((s)->sb->s_bdev, sectors[STAT_WRITE]) -   \
                (s)->sectors_written_start) >> 1)
  
  static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
@@@ -3251,8 -3251,6 +3251,8 @@@ bool f2fs_empty_dir(struct inode *dir)
  
  static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
  {
 +      if (fscrypt_is_nokey_name(dentry))
 +              return -ENOKEY;
        return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name,
                                inode, inode->i_ino, inode->i_mode);
  }
diff --combined fs/internal.h
@@@ -25,7 -25,6 +25,6 @@@ extern void __init bdev_cache_init(void
  extern int __sync_blockdev(struct block_device *bdev, int wait);
  void iterate_bdevs(void (*)(struct block_device *, void *), void *);
  void emergency_thaw_bdev(struct super_block *sb);
- void bd_forget(struct inode *inode);
  #else
  static inline void bdev_cache_init(void)
  {
@@@ -43,9 -42,6 +42,6 @@@ static inline int emergency_thaw_bdev(s
  {
        return 0;
  }
- static inline void bd_forget(struct inode *inode)
- {
- }
  #endif /* CONFIG_BLOCK */
  
  /*
@@@ -78,8 -74,6 +74,8 @@@ extern int vfs_path_lookup(struct dentr
  long do_rmdir(int dfd, struct filename *name);
  long do_unlinkat(int dfd, struct filename *name);
  int may_linkat(struct path *link);
 +int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
 +               struct filename *newname, unsigned int flags);
  
  /*
   * namespace.c
@@@ -116,7 -110,8 +112,8 @@@ extern struct file *alloc_empty_file_no
   */
  extern int reconfigure_super(struct fs_context *);
  extern bool trylock_super(struct super_block *sb);
- extern struct super_block *user_get_super(dev_t);
+ struct super_block *user_get_super(dev_t, bool excl);
+ void put_super(struct super_block *sb);
  extern bool mount_capable(struct fs_context *);
  
  /*
diff --combined fs/io_uring.c
@@@ -205,7 -205,6 +205,7 @@@ struct fixed_file_ref_node 
        struct list_head                file_list;
        struct fixed_file_data          *file_data;
        struct llist_node               llist;
 +      bool                            done;
  };
  
  struct fixed_file_data {
@@@ -245,8 -244,6 +245,8 @@@ struct io_sq_data 
  
        struct task_struct      *thread;
        struct wait_queue_head  wait;
 +
 +      unsigned                sq_thread_idle;
  };
  
  struct io_ring_ctx {
                struct list_head        timeout_list;
                struct list_head        cq_overflow_list;
  
 -              wait_queue_head_t       inflight_wait;
                struct io_uring_sqe     *sq_sqes;
        } ____cacheline_aligned_in_smp;
  
        struct io_sq_data       *sq_data;       /* if using sq thread polling */
  
        struct wait_queue_head  sqo_sq_wait;
 -      struct wait_queue_entry sqo_wait_entry;
        struct list_head        sqd_list;
  
        /*
   */
  struct io_poll_iocb {
        struct file                     *file;
 -      union {
 -              struct wait_queue_head  *head;
 -              u64                     addr;
 -      };
 +      struct wait_queue_head          *head;
        __poll_t                        events;
        bool                            done;
        bool                            canceled;
        struct wait_queue_entry         wait;
  };
  
 +struct io_poll_remove {
 +      struct file                     *file;
 +      u64                             addr;
 +};
 +
  struct io_close {
        struct file                     *file;
        struct file                     *put_file;
@@@ -446,17 -443,11 +446,17 @@@ struct io_timeout 
        u32                             off;
        u32                             target_seq;
        struct list_head                list;
 +      /* head of the link, used by linked timeouts only */
 +      struct io_kiocb                 *head;
  };
  
  struct io_timeout_rem {
        struct file                     *file;
        u64                             addr;
 +
 +      /* timeout update */
 +      struct timespec64               ts;
 +      u32                             flags;
  };
  
  struct io_rw {
@@@ -487,7 -478,6 +487,7 @@@ struct io_sr_msg 
  struct io_open {
        struct file                     *file;
        int                             dfd;
 +      bool                            ignore_nonblock;
        struct filename                 *filename;
        struct open_how                 how;
        unsigned long                   nofile;
@@@ -549,27 -539,6 +549,27 @@@ struct io_statx 
        struct statx __user             *buffer;
  };
  
 +struct io_shutdown {
 +      struct file                     *file;
 +      int                             how;
 +};
 +
 +struct io_rename {
 +      struct file                     *file;
 +      int                             old_dfd;
 +      int                             new_dfd;
 +      struct filename                 *oldpath;
 +      struct filename                 *newpath;
 +      int                             flags;
 +};
 +
 +struct io_unlink {
 +      struct file                     *file;
 +      int                             dfd;
 +      int                             flags;
 +      struct filename                 *filename;
 +};
 +
  struct io_completion {
        struct file                     *file;
        struct list_head                list;
@@@ -604,6 -573,7 +604,6 @@@ enum 
        REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
        REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
  
 -      REQ_F_LINK_HEAD_BIT,
        REQ_F_FAIL_LINK_BIT,
        REQ_F_INFLIGHT_BIT,
        REQ_F_CUR_POS_BIT,
@@@ -635,6 -605,8 +635,6 @@@ enum 
        /* IOSQE_BUFFER_SELECT */
        REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
  
 -      /* head of a link */
 -      REQ_F_LINK_HEAD         = BIT(REQ_F_LINK_HEAD_BIT),
        /* fail rest of links */
        REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
        /* on inflight list */
@@@ -677,7 -649,6 +677,7 @@@ struct io_kiocb 
                struct file             *file;
                struct io_rw            rw;
                struct io_poll_iocb     poll;
 +              struct io_poll_remove   poll_remove;
                struct io_accept        accept;
                struct io_sync          sync;
                struct io_cancel        cancel;
                struct io_splice        splice;
                struct io_provide_buf   pbuf;
                struct io_statx         statx;
 +              struct io_shutdown      shutdown;
 +              struct io_rename        rename;
 +              struct io_unlink        unlink;
                /* use only after cleaning per-op data, see io_clean_op() */
                struct io_completion    compl;
        };
        struct task_struct              *task;
        u64                             user_data;
  
 -      struct list_head                link_list;
 +      struct io_kiocb                 *link;
 +      struct percpu_ref               *fixed_file_refs;
  
        /*
         * 1. used with ctx->iopoll_list with reads/writes
         * 2. to track reqs with ->files (see io_op_def::file_table)
         */
        struct list_head                inflight_entry;
 -
 -      struct percpu_ref               *fixed_file_refs;
        struct callback_head            task_work;
        /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
        struct hlist_node               hash_node;
@@@ -754,8 -723,6 +754,8 @@@ struct io_submit_state 
        void                    *reqs[IO_IOPOLL_BATCH];
        unsigned int            free_reqs;
  
 +      bool                    plug_started;
 +
        /*
         * Batch completion logic
         */
         */
        struct file             *file;
        unsigned int            fd;
 -      unsigned int            has_refs;
 +      unsigned int            file_refs;
        unsigned int            ios_left;
  };
  
@@@ -788,8 -755,6 +788,8 @@@ struct io_op_def 
        unsigned                buffer_select : 1;
        /* must always have async data allocated */
        unsigned                needs_async_data : 1;
 +      /* should block plug */
 +      unsigned                plug : 1;
        /* size of async data needed, if any */
        unsigned short          async_size;
        unsigned                work_flags;
@@@ -803,7 -768,6 +803,7 @@@ static const struct io_op_def io_op_def
                .pollin                 = 1,
                .buffer_select          = 1,
                .needs_async_data       = 1,
 +              .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
                .needs_async_data       = 1,
 +              .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
                                                IO_WQ_WORK_FSIZE,
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
 +              .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
        },
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
 +              .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
                                                IO_WQ_WORK_MM,
                .pollout                = 1,
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_async_msghdr),
 -              .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
 -                                              IO_WQ_WORK_FS,
 +              .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_RECVMSG] = {
                .needs_file             = 1,
                .buffer_select          = 1,
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_async_msghdr),
 -              .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
 -                                              IO_WQ_WORK_FS,
 +              .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_TIMEOUT] = {
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_timeout_data),
                .work_flags             = IO_WQ_WORK_MM,
        },
 -      [IORING_OP_TIMEOUT_REMOVE] = {},
 +      [IORING_OP_TIMEOUT_REMOVE] = {
 +              /* used by timeout updates' prep() */
 +              .work_flags             = IO_WQ_WORK_MM,
 +      },
        [IORING_OP_ACCEPT] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
        },
        [IORING_OP_OPENAT] = {
                .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
 -                                              IO_WQ_WORK_FS,
 +                                              IO_WQ_WORK_FS | IO_WQ_WORK_MM,
        },
        [IORING_OP_CLOSE] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
                .buffer_select          = 1,
 +              .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
 +              .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
                                                IO_WQ_WORK_FSIZE,
        },
        [IORING_OP_OPENAT2] = {
                .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
 -                                              IO_WQ_WORK_BLKCG,
 +                                              IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
        },
        [IORING_OP_EPOLL_CTL] = {
                .unbound_nonreg_file    = 1,
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
        },
 +      [IORING_OP_SHUTDOWN] = {
 +              .needs_file             = 1,
 +      },
 +      [IORING_OP_RENAMEAT] = {
 +              .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
 +                                              IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
 +      },
 +      [IORING_OP_UNLINKAT] = {
 +              .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
 +                                              IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
 +      },
  };
  
  enum io_mem_account {
@@@ -1034,9 -981,6 +1034,9 @@@ struct sock *io_uring_get_socket(struc
  }
  EXPORT_SYMBOL(io_uring_get_socket);
  
 +#define io_for_each_link(pos, head) \
 +      for (pos = (head); pos; pos = pos->link)
 +
  static inline void io_clean_op(struct io_kiocb *req)
  {
        if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
                __io_clean_op(req);
  }
  
 -static void io_sq_thread_drop_mm(void)
 +static inline void io_set_resource_node(struct io_kiocb *req)
 +{
 +      struct io_ring_ctx *ctx = req->ctx;
 +
 +      if (!req->fixed_file_refs) {
 +              req->fixed_file_refs = &ctx->file_data->node->refs;
 +              percpu_ref_get(req->fixed_file_refs);
 +      }
 +}
 +
 +static bool io_match_task(struct io_kiocb *head,
 +                        struct task_struct *task,
 +                        struct files_struct *files)
 +{
 +      struct io_kiocb *req;
 +
 +      if (task && head->task != task)
 +              return false;
 +      if (!files)
 +              return true;
 +
 +      io_for_each_link(req, head) {
 +              if ((req->flags & REQ_F_WORK_INITIALIZED) &&
 +                  (req->work.flags & IO_WQ_WORK_FILES) &&
 +                  req->work.identity->files == files)
 +                      return true;
 +      }
 +      return false;
 +}
 +
 +static void io_sq_thread_drop_mm_files(void)
  {
 +      struct files_struct *files = current->files;
        struct mm_struct *mm = current->mm;
  
        if (mm) {
                mmput(mm);
                current->mm = NULL;
        }
 +      if (files) {
 +              struct nsproxy *nsproxy = current->nsproxy;
 +
 +              task_lock(current);
 +              current->files = NULL;
 +              current->nsproxy = NULL;
 +              task_unlock(current);
 +              put_files_struct(files);
 +              put_nsproxy(nsproxy);
 +      }
 +}
 +
 +static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
 +{
 +      if (!current->files) {
 +              struct files_struct *files;
 +              struct nsproxy *nsproxy;
 +
 +              task_lock(ctx->sqo_task);
 +              files = ctx->sqo_task->files;
 +              if (!files) {
 +                      task_unlock(ctx->sqo_task);
 +                      return -EOWNERDEAD;
 +              }
 +              atomic_inc(&files->count);
 +              get_nsproxy(ctx->sqo_task->nsproxy);
 +              nsproxy = ctx->sqo_task->nsproxy;
 +              task_unlock(ctx->sqo_task);
 +
 +              task_lock(current);
 +              current->files = files;
 +              current->nsproxy = nsproxy;
 +              task_unlock(current);
 +      }
 +      return 0;
  }
  
  static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
        return -EFAULT;
  }
  
 -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
 -                                 struct io_kiocb *req)
 +static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
 +                                       struct io_kiocb *req)
  {
 -      if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
 -              return 0;
 -      return __io_sq_thread_acquire_mm(ctx);
 +      const struct io_op_def *def = &io_op_defs[req->opcode];
 +      int ret;
 +
 +      if (def->work_flags & IO_WQ_WORK_MM) {
 +              ret = __io_sq_thread_acquire_mm(ctx);
 +              if (unlikely(ret))
 +                      return ret;
 +      }
 +
 +      if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
 +              ret = __io_sq_thread_acquire_files(ctx);
 +              if (unlikely(ret))
 +                      return ret;
 +      }
 +
 +      return 0;
  }
  
  static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
@@@ -1307,6 -1172,7 +1307,6 @@@ static struct io_ring_ctx *io_ring_ctx_
        INIT_LIST_HEAD(&ctx->iopoll_list);
        INIT_LIST_HEAD(&ctx->defer_list);
        INIT_LIST_HEAD(&ctx->timeout_list);
 -      init_waitqueue_head(&ctx->inflight_wait);
        spin_lock_init(&ctx->inflight_lock);
        INIT_LIST_HEAD(&ctx->inflight_list);
        INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
@@@ -1416,7 -1282,7 +1416,7 @@@ static bool io_identity_cow(struct io_k
         */
        io_init_identity(id);
        if (creds)
 -              req->work.identity->creds = creds;
 +              id->creds = creds;
  
        /* add one for this request */
        refcount_inc(&id->count);
@@@ -1445,6 -1311,22 +1445,6 @@@ static bool io_grab_identity(struct io_
                        return false;
                req->work.flags |= IO_WQ_WORK_FSIZE;
        }
 -
 -      if (!(req->work.flags & IO_WQ_WORK_FILES) &&
 -          (def->work_flags & IO_WQ_WORK_FILES) &&
 -          !(req->flags & REQ_F_NO_FILE_TABLE)) {
 -              if (id->files != current->files ||
 -                  id->nsproxy != current->nsproxy)
 -                      return false;
 -              atomic_inc(&id->files->count);
 -              get_nsproxy(id->nsproxy);
 -              req->flags |= REQ_F_INFLIGHT;
 -
 -              spin_lock_irq(&ctx->inflight_lock);
 -              list_add(&req->inflight_entry, &ctx->inflight_list);
 -              spin_unlock_irq(&ctx->inflight_lock);
 -              req->work.flags |= IO_WQ_WORK_FILES;
 -      }
  #ifdef CONFIG_BLK_CGROUP
        if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
            (def->work_flags & IO_WQ_WORK_BLKCG)) {
                }
                spin_unlock(&current->fs->lock);
        }
 +      if (!(req->work.flags & IO_WQ_WORK_FILES) &&
 +          (def->work_flags & IO_WQ_WORK_FILES) &&
 +          !(req->flags & REQ_F_NO_FILE_TABLE)) {
 +              if (id->files != current->files ||
 +                  id->nsproxy != current->nsproxy)
 +                      return false;
 +              atomic_inc(&id->files->count);
 +              get_nsproxy(id->nsproxy);
 +              req->flags |= REQ_F_INFLIGHT;
 +
 +              spin_lock_irq(&ctx->inflight_lock);
 +              list_add(&req->inflight_entry, &ctx->inflight_list);
 +              spin_unlock_irq(&ctx->inflight_lock);
 +              req->work.flags |= IO_WQ_WORK_FILES;
 +      }
  
        return true;
  }
@@@ -1548,8 -1415,10 +1548,8 @@@ static void io_prep_async_link(struct i
  {
        struct io_kiocb *cur;
  
 -      io_prep_async_work(req);
 -      if (req->flags & REQ_F_LINK_HEAD)
 -              list_for_each_entry(cur, &req->link_list, link_list)
 -                      io_prep_async_work(cur);
 +      io_for_each_link(cur, req)
 +              io_prep_async_work(cur);
  }
  
  static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
@@@ -1590,18 -1459,30 +1590,18 @@@ static void io_kill_timeout(struct io_k
        }
  }
  
 -static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
 -{
 -      struct io_ring_ctx *ctx = req->ctx;
 -
 -      if (!tsk || req->task == tsk)
 -              return true;
 -      if (ctx->flags & IORING_SETUP_SQPOLL) {
 -              if (ctx->sq_data && req->task == ctx->sq_data->thread)
 -                      return true;
 -      }
 -      return false;
 -}
 -
  /*
   * Returns true if we found and killed one or more timeouts
   */
 -static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
 +static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
 +                           struct files_struct *files)
  {
        struct io_kiocb *req, *tmp;
        int canceled = 0;
  
        spin_lock_irq(&ctx->completion_lock);
        list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
 -              if (io_task_match(req, tsk)) {
 +              if (io_match_task(req, tsk, files)) {
                        io_kill_timeout(req);
                        canceled++;
                }
@@@ -1712,6 -1593,32 +1712,6 @@@ static void io_cqring_mark_overflow(str
        }
  }
  
 -static inline bool __io_match_files(struct io_kiocb *req,
 -                                  struct files_struct *files)
 -{
 -      return ((req->flags & REQ_F_WORK_INITIALIZED) &&
 -              (req->work.flags & IO_WQ_WORK_FILES)) &&
 -              req->work.identity->files == files;
 -}
 -
 -static bool io_match_files(struct io_kiocb *req,
 -                         struct files_struct *files)
 -{
 -      struct io_kiocb *link;
 -
 -      if (!files)
 -              return true;
 -      if (__io_match_files(req, files))
 -              return true;
 -      if (req->flags & REQ_F_LINK_HEAD) {
 -              list_for_each_entry(link, &req->link_list, link_list) {
 -                      if (__io_match_files(link, files))
 -                              return true;
 -              }
 -      }
 -      return false;
 -}
 -
  /* Returns true if there are no backlogged entries after the flush */
  static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
                                     struct task_struct *tsk,
  
        cqe = NULL;
        list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
 -              if (tsk && req->task != tsk)
 -                      continue;
 -              if (!io_match_files(req, files))
 +              if (!io_match_task(req, tsk, files))
                        continue;
  
                cqe = io_get_cqring(ctx);
@@@ -1935,7 -1844,9 +1935,7 @@@ fallback
  static inline void io_put_file(struct io_kiocb *req, struct file *file,
                          bool fixed)
  {
 -      if (fixed)
 -              percpu_ref_put(req->fixed_file_refs);
 -      else
 +      if (!fixed)
                fput(file);
  }
  
@@@ -1947,8 -1858,7 +1947,8 @@@ static void io_dismantle_req(struct io_
                kfree(req->async_data);
        if (req->file)
                io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 -
 +      if (req->fixed_file_refs)
 +              percpu_ref_put(req->fixed_file_refs);
        io_req_clean_work(req);
  }
  
@@@ -1971,14 -1881,6 +1971,14 @@@ static void __io_free_req(struct io_kio
        percpu_ref_put(&ctx->refs);
  }
  
 +static inline void io_remove_next_linked(struct io_kiocb *req)
 +{
 +      struct io_kiocb *nxt = req->link;
 +
 +      req->link = nxt->link;
 +      nxt->link = NULL;
 +}
 +
  static void io_kill_linked_timeout(struct io_kiocb *req)
  {
        struct io_ring_ctx *ctx = req->ctx;
        unsigned long flags;
  
        spin_lock_irqsave(&ctx->completion_lock, flags);
 -      link = list_first_entry_or_null(&req->link_list, struct io_kiocb,
 -                                      link_list);
 +      link = req->link;
 +
        /*
         * Can happen if a linked timeout fired and link had been like
         * req -> link t-out -> link t-out [-> ...]
                struct io_timeout_data *io = link->async_data;
                int ret;
  
 -              list_del_init(&link->link_list);
 +              io_remove_next_linked(req);
 +              link->timeout.head = NULL;
                ret = hrtimer_try_to_cancel(&io->timer);
                if (ret != -1) {
                        io_cqring_fill_event(link, -ECANCELED);
        }
  }
  
 -static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
 -{
 -      struct io_kiocb *nxt;
 -
 -      /*
 -       * The list should never be empty when we are called here. But could
 -       * potentially happen if the chain is messed up, check to be on the
 -       * safe side.
 -       */
 -      if (unlikely(list_empty(&req->link_list)))
 -              return NULL;
 -
 -      nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
 -      list_del_init(&req->link_list);
 -      if (!list_empty(&nxt->link_list))
 -              nxt->flags |= REQ_F_LINK_HEAD;
 -      return nxt;
 -}
  
 -/*
 - * Called if REQ_F_LINK_HEAD is set, and we fail the head request
 - */
  static void io_fail_links(struct io_kiocb *req)
  {
 +      struct io_kiocb *link, *nxt;
        struct io_ring_ctx *ctx = req->ctx;
        unsigned long flags;
  
        spin_lock_irqsave(&ctx->completion_lock, flags);
 -      while (!list_empty(&req->link_list)) {
 -              struct io_kiocb *link = list_first_entry(&req->link_list,
 -                                              struct io_kiocb, link_list);
 +      link = req->link;
 +      req->link = NULL;
  
 -              list_del_init(&link->link_list);
 -              trace_io_uring_fail_link(req, link);
 +      while (link) {
 +              nxt = link->link;
 +              link->link = NULL;
  
 +              trace_io_uring_fail_link(req, link);
                io_cqring_fill_event(link, -ECANCELED);
  
                /*
                        io_put_req_deferred(link, 2);
                else
                        io_double_put_req(link);
 +              link = nxt;
        }
 -
        io_commit_cqring(ctx);
        spin_unlock_irqrestore(&ctx->completion_lock, flags);
  
  
  static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
  {
 -      req->flags &= ~REQ_F_LINK_HEAD;
        if (req->flags & REQ_F_LINK_TIMEOUT)
                io_kill_linked_timeout(req);
  
         * dependencies to the next request. In case of failure, fail the rest
         * of the chain.
         */
 -      if (likely(!(req->flags & REQ_F_FAIL_LINK)))
 -              return io_req_link_next(req);
 +      if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
 +              struct io_kiocb *nxt = req->link;
 +
 +              req->link = NULL;
 +              return nxt;
 +      }
        io_fail_links(req);
        return NULL;
  }
  
 -static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 +static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
  {
 -      if (likely(!(req->flags & REQ_F_LINK_HEAD)))
 +      if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT)))
                return NULL;
        return __io_req_find_next(req);
  }
  
 -static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
 +static int io_req_task_work_add(struct io_kiocb *req)
  {
        struct task_struct *tsk = req->task;
        struct io_ring_ctx *ctx = req->ctx;
         * will do the job.
         */
        notify = TWA_NONE;
 -      if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
 +      if (!(ctx->flags & IORING_SETUP_SQPOLL))
                notify = TWA_SIGNAL;
  
        ret = task_work_add(tsk, &req->task_work, notify);
@@@ -2132,8 -2049,7 +2132,8 @@@ static void __io_req_task_submit(struc
  {
        struct io_ring_ctx *ctx = req->ctx;
  
 -      if (!__io_sq_thread_acquire_mm(ctx)) {
 +      if (!__io_sq_thread_acquire_mm(ctx) &&
 +          !__io_sq_thread_acquire_files(ctx)) {
                mutex_lock(&ctx->uring_lock);
                __io_queue_sqe(req, NULL);
                mutex_unlock(&ctx->uring_lock);
@@@ -2158,7 -2074,7 +2158,7 @@@ static void io_req_task_queue(struct io
        init_task_work(&req->task_work, io_req_task_submit);
        percpu_ref_get(&req->ctx->refs);
  
 -      ret = io_req_task_work_add(req, true);
 +      ret = io_req_task_work_add(req);
        if (unlikely(ret)) {
                struct task_struct *tsk;
  
        }
  }
  
 -static void io_queue_next(struct io_kiocb *req)
 +static inline void io_queue_next(struct io_kiocb *req)
  {
        struct io_kiocb *nxt = io_req_find_next(req);
  
@@@ -2226,7 -2142,8 +2226,7 @@@ static void io_req_free_batch(struct re
                io_free_req(req);
                return;
        }
 -      if (req->flags & REQ_F_LINK_HEAD)
 -              io_queue_next(req);
 +      io_queue_next(req);
  
        if (req->task != rb->task) {
                if (rb->task) {
@@@ -2279,7 -2196,7 +2279,7 @@@ static void io_free_req_deferred(struc
        int ret;
  
        init_task_work(&req->task_work, io_put_req_deferred_cb);
 -      ret = io_req_task_work_add(req, true);
 +      ret = io_req_task_work_add(req);
        if (unlikely(ret)) {
                struct task_struct *tsk;
  
@@@ -2328,7 -2245,7 +2328,7 @@@ static unsigned io_cqring_events(struc
                 * we wake up the task, and the next invocation will flush the
                 * entries. We cannot safely do it from here.
                 */
 -              if (noflush && !list_empty(&ctx->cq_overflow_list))
 +              if (noflush)
                        return -1U;
  
                io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@@ -2660,6 -2577,7 +2660,6 @@@ static bool io_resubmit_prep(struct io_
        }
  end_req:
        req_set_fail_links(req);
 -      io_req_complete(req, ret);
        return false;
  }
  #endif
@@@ -2675,7 -2593,7 +2675,7 @@@ static bool io_rw_reissue(struct io_kio
        if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
                return false;
  
 -      ret = io_sq_thread_acquire_mm(req->ctx, req);
 +      ret = io_sq_thread_acquire_mm_files(req->ctx, req);
  
        if (io_resubmit_prep(req, ret)) {
                refcount_inc(&req->refs);
@@@ -2723,7 -2641,7 +2723,7 @@@ static void io_complete_rw_iopoll(struc
   * find it from a io_iopoll_getevents() thread before the issuer is done
   * accessing the kiocb cookie.
   */
 -static void io_iopoll_req_issued(struct io_kiocb *req)
 +static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
  {
        struct io_ring_ctx *ctx = req->ctx;
  
        else
                list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
  
 -      if ((ctx->flags & IORING_SETUP_SQPOLL) &&
 +      /*
 +       * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
 +       * task context or in io worker task context. If current task context is
 +       * sq thread, we don't need to check whether we should wake up the sq thread.
 +       */
 +      if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
            wq_has_sleeper(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
  }
  
 -static void __io_state_file_put(struct io_submit_state *state)
 +static inline void __io_state_file_put(struct io_submit_state *state)
  {
 -      if (state->has_refs)
 -              fput_many(state->file, state->has_refs);
 -      state->file = NULL;
 +      fput_many(state->file, state->file_refs);
 +      state->file_refs = 0;
  }
  
  static inline void io_state_file_put(struct io_submit_state *state)
  {
 -      if (state->file)
 +      if (state->file_refs)
                __io_state_file_put(state);
  }
  
@@@ -2784,29 -2698,25 +2784,25 @@@ static struct file *__io_file_get(struc
        if (!state)
                return fget(fd);
  
 -      if (state->file) {
 +      if (state->file_refs) {
                if (state->fd == fd) {
 -                      state->has_refs--;
 +                      state->file_refs--;
                        return state->file;
                }
                __io_state_file_put(state);
        }
        state->file = fget_many(fd, state->ios_left);
 -      if (!state->file)
 +      if (unlikely(!state->file))
                return NULL;
  
        state->fd = fd;
 -      state->has_refs = state->ios_left - 1;
 +      state->file_refs = state->ios_left - 1;
        return state->file;
  }
  
  static bool io_bdev_nowait(struct block_device *bdev)
  {
- #ifdef CONFIG_BLOCK
        return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
- #else
-       return true;
- #endif
  }
  
  /*
@@@ -2819,14 -2729,16 +2815,16 @@@ static bool io_file_supports_async(stru
        umode_t mode = file_inode(file)->i_mode;
  
        if (S_ISBLK(mode)) {
-               if (io_bdev_nowait(file->f_inode->i_bdev))
+               if (IS_ENABLED(CONFIG_BLOCK) &&
+                   io_bdev_nowait(I_BDEV(file->f_mapping->host)))
                        return true;
                return false;
        }
        if (S_ISCHR(mode) || S_ISSOCK(mode))
                return true;
        if (S_ISREG(mode)) {
-               if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
+               if (IS_ENABLED(CONFIG_BLOCK) &&
+                   io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
                    file->f_op != &io_uring_fops)
                        return true;
                return false;
@@@ -3151,7 -3063,7 +3149,7 @@@ static ssize_t io_iov_buffer_select(str
        return __io_iov_buffer_select(req, iov, needs_lock);
  }
  
 -static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
 +static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
                                 struct iovec **iovec, struct iov_iter *iter,
                                 bool needs_lock)
  {
  
                ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
                *iovec = NULL;
 -              return ret < 0 ? ret : sqe_len;
 +              return ret;
        }
  
        if (req->flags & REQ_F_BUFFER_SELECT) {
                              req->ctx->compat);
  }
  
 -static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 -                             struct iovec **iovec, struct iov_iter *iter,
 -                             bool needs_lock)
 -{
 -      struct io_async_rw *iorw = req->async_data;
 -
 -      if (!iorw)
 -              return __io_import_iovec(rw, req, iovec, iter, needs_lock);
 -      *iovec = NULL;
 -      return iov_iter_count(&iorw->iter);
 -}
 -
  static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
  {
        return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
@@@ -3266,7 -3190,7 +3264,7 @@@ static void io_req_map_rw(struct io_kio
        rw->free_iovec = iovec;
        rw->bytes_done = 0;
        /* can only be fixed buffers, no need to do anything */
 -      if (iter->type == ITER_BVEC)
 +      if (iov_iter_is_bvec(iter))
                return;
        if (!iovec) {
                unsigned iov_off = 0;
@@@ -3320,7 -3244,7 +3318,7 @@@ static inline int io_rw_prep_async(stru
        struct iovec *iov = iorw->fast_iov;
        ssize_t ret;
  
 -      ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false);
 +      ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
        if (unlikely(ret < 0))
                return ret;
  
@@@ -3379,7 -3303,7 +3377,7 @@@ static int io_async_buf_func(struct wai
  
        /* submit ref gets dropped, acquire a new one */
        refcount_inc(&req->refs);
 -      ret = io_req_task_work_add(req, true);
 +      ret = io_req_task_work_add(req);
        if (unlikely(ret)) {
                struct task_struct *tsk;
  
@@@ -3453,17 -3377,17 +3451,17 @@@ static int io_read(struct io_kiocb *req
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
        ssize_t io_size, ret, ret2;
 -      size_t iov_count;
        bool no_async;
  
 -      if (rw)
 +      if (rw) {
                iter = &rw->iter;
 -
 -      ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
 -      if (ret < 0)
 -              return ret;
 -      iov_count = iov_iter_count(iter);
 -      io_size = ret;
 +              iovec = NULL;
 +      } else {
 +              ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
 +              if (ret < 0)
 +                      return ret;
 +      }
 +      io_size = iov_iter_count(iter);
        req->result = io_size;
        ret = 0;
  
        if (no_async)
                goto copy_iov;
  
 -      ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
 +      ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
        if (unlikely(ret))
                goto out_free;
  
                if (req->file->f_flags & O_NONBLOCK)
                        goto done;
                /* some cases will consume bytes even on error returns */
 -              iov_iter_revert(iter, iov_count - iov_iter_count(iter));
 +              iov_iter_revert(iter, io_size - iov_iter_count(iter));
                ret = 0;
                goto copy_iov;
        } else if (ret < 0) {
@@@ -3581,17 -3505,17 +3579,17 @@@ static int io_write(struct io_kiocb *re
        struct kiocb *kiocb = &req->rw.kiocb;
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
 -      size_t iov_count;
        ssize_t ret, ret2, io_size;
  
 -      if (rw)
 +      if (rw) {
                iter = &rw->iter;
 -
 -      ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
 -      if (ret < 0)
 -              return ret;
 -      iov_count = iov_iter_count(iter);
 -      io_size = ret;
 +              iovec = NULL;
 +      } else {
 +              ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
 +              if (ret < 0)
 +                      return ret;
 +      }
 +      io_size = iov_iter_count(iter);
        req->result = io_size;
  
        /* Ensure we clear previously set non-block flag */
            (req->flags & REQ_F_ISREG))
                goto copy_iov;
  
 -      ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
 +      ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
        if (unlikely(ret))
                goto out_free;
  
@@@ -3652,7 -3576,7 +3650,7 @@@ done
        } else {
  copy_iov:
                /* some cases will consume bytes even on error returns */
 -              iov_iter_revert(iter, iov_count - iov_iter_count(iter));
 +              iov_iter_revert(iter, io_size - iov_iter_count(iter));
                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
                if (!ret)
                        return -EAGAIN;
@@@ -3664,209 -3588,80 +3662,209 @@@ out_free
        return ret;
  }
  
 -static int __io_splice_prep(struct io_kiocb *req,
 +static int io_renameat_prep(struct io_kiocb *req,
                            const struct io_uring_sqe *sqe)
  {
 -      struct io_splice* sp = &req->splice;
 -      unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
 -
 -      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 -              return -EINVAL;
 +      struct io_rename *ren = &req->rename;
 +      const char __user *oldf, *newf;
  
 -      sp->file_in = NULL;
 -      sp->len = READ_ONCE(sqe->len);
 -      sp->flags = READ_ONCE(sqe->splice_flags);
 +      if (unlikely(req->flags & REQ_F_FIXED_FILE))
 +              return -EBADF;
  
 -      if (unlikely(sp->flags & ~valid_flags))
 -              return -EINVAL;
 +      ren->old_dfd = READ_ONCE(sqe->fd);
 +      oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 +      newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 +      ren->new_dfd = READ_ONCE(sqe->len);
 +      ren->flags = READ_ONCE(sqe->rename_flags);
  
 -      sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
 -                                (sp->flags & SPLICE_F_FD_IN_FIXED));
 -      if (!sp->file_in)
 -              return -EBADF;
 -      req->flags |= REQ_F_NEED_CLEANUP;
 +      ren->oldpath = getname(oldf);
 +      if (IS_ERR(ren->oldpath))
 +              return PTR_ERR(ren->oldpath);
  
 -      if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
 -              /*
 -               * Splice operation will be punted aync, and here need to
 -               * modify io_wq_work.flags, so initialize io_wq_work firstly.
 -               */
 -              io_req_init_async(req);
 -              req->work.flags |= IO_WQ_WORK_UNBOUND;
 +      ren->newpath = getname(newf);
 +      if (IS_ERR(ren->newpath)) {
 +              putname(ren->oldpath);
 +              return PTR_ERR(ren->newpath);
        }
  
 +      req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
  }
  
 -static int io_tee_prep(struct io_kiocb *req,
 -                     const struct io_uring_sqe *sqe)
 -{
 -      if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
 -              return -EINVAL;
 -      return __io_splice_prep(req, sqe);
 -}
 -
 -static int io_tee(struct io_kiocb *req, bool force_nonblock)
 +static int io_renameat(struct io_kiocb *req, bool force_nonblock)
  {
 -      struct io_splice *sp = &req->splice;
 -      struct file *in = sp->file_in;
 -      struct file *out = sp->file_out;
 -      unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 -      long ret = 0;
 +      struct io_rename *ren = &req->rename;
 +      int ret;
  
        if (force_nonblock)
                return -EAGAIN;
 -      if (sp->len)
 -              ret = do_tee(in, out, sp->len, flags);
  
 -      io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
 -      req->flags &= ~REQ_F_NEED_CLEANUP;
 +      ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
 +                              ren->newpath, ren->flags);
  
 -      if (ret != sp->len)
 +      req->flags &= ~REQ_F_NEED_CLEANUP;
 +      if (ret < 0)
                req_set_fail_links(req);
        io_req_complete(req, ret);
        return 0;
  }
  
 -static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 +static int io_unlinkat_prep(struct io_kiocb *req,
 +                          const struct io_uring_sqe *sqe)
  {
 -      struct io_splice* sp = &req->splice;
 +      struct io_unlink *un = &req->unlink;
 +      const char __user *fname;
  
 -      sp->off_in = READ_ONCE(sqe->splice_off_in);
 -      sp->off_out = READ_ONCE(sqe->off);
 -      return __io_splice_prep(req, sqe);
 +      if (unlikely(req->flags & REQ_F_FIXED_FILE))
 +              return -EBADF;
 +
 +      un->dfd = READ_ONCE(sqe->fd);
 +
 +      un->flags = READ_ONCE(sqe->unlink_flags);
 +      if (un->flags & ~AT_REMOVEDIR)
 +              return -EINVAL;
 +
 +      fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
 +      un->filename = getname(fname);
 +      if (IS_ERR(un->filename))
 +              return PTR_ERR(un->filename);
 +
 +      req->flags |= REQ_F_NEED_CLEANUP;
 +      return 0;
  }
  
 -static int io_splice(struct io_kiocb *req, bool force_nonblock)
 +static int io_unlinkat(struct io_kiocb *req, bool force_nonblock)
 +{
 +      struct io_unlink *un = &req->unlink;
 +      int ret;
 +
 +      if (force_nonblock)
 +              return -EAGAIN;
 +
 +      if (un->flags & AT_REMOVEDIR)
 +              ret = do_rmdir(un->dfd, un->filename);
 +      else
 +              ret = do_unlinkat(un->dfd, un->filename);
 +
 +      req->flags &= ~REQ_F_NEED_CLEANUP;
 +      if (ret < 0)
 +              req_set_fail_links(req);
 +      io_req_complete(req, ret);
 +      return 0;
 +}
 +
 +static int io_shutdown_prep(struct io_kiocb *req,
 +                          const struct io_uring_sqe *sqe)
 +{
 +#if defined(CONFIG_NET)
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 +              return -EINVAL;
 +      if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
 +          sqe->buf_index)
 +              return -EINVAL;
 +
 +      req->shutdown.how = READ_ONCE(sqe->len);
 +      return 0;
 +#else
 +      return -EOPNOTSUPP;
 +#endif
 +}
 +
 +static int io_shutdown(struct io_kiocb *req, bool force_nonblock)
 +{
 +#if defined(CONFIG_NET)
 +      struct socket *sock;
 +      int ret;
 +
 +      if (force_nonblock)
 +              return -EAGAIN;
 +
 +      sock = sock_from_file(req->file);
 +      if (unlikely(!sock))
 +              return -ENOTSOCK;
 +
 +      ret = __sys_shutdown_sock(sock, req->shutdown.how);
 +      io_req_complete(req, ret);
 +      return 0;
 +#else
 +      return -EOPNOTSUPP;
 +#endif
 +}
 +
 +static int __io_splice_prep(struct io_kiocb *req,
 +                          const struct io_uring_sqe *sqe)
 +{
 +      struct io_splice* sp = &req->splice;
 +      unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
 +
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 +              return -EINVAL;
 +
 +      sp->file_in = NULL;
 +      sp->len = READ_ONCE(sqe->len);
 +      sp->flags = READ_ONCE(sqe->splice_flags);
 +
 +      if (unlikely(sp->flags & ~valid_flags))
 +              return -EINVAL;
 +
 +      sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
 +                                (sp->flags & SPLICE_F_FD_IN_FIXED));
 +      if (!sp->file_in)
 +              return -EBADF;
 +      req->flags |= REQ_F_NEED_CLEANUP;
 +
 +      if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
 +              /*
 +               * Splice operations will be punted async, and io_wq_work.flags
 +               * needs to be modified here, so initialize io_wq_work first.
 +               */
 +              io_req_init_async(req);
 +              req->work.flags |= IO_WQ_WORK_UNBOUND;
 +      }
 +
 +      return 0;
 +}
 +
 +static int io_tee_prep(struct io_kiocb *req,
 +                     const struct io_uring_sqe *sqe)
 +{
 +      if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
 +              return -EINVAL;
 +      return __io_splice_prep(req, sqe);
 +}
 +
 +static int io_tee(struct io_kiocb *req, bool force_nonblock)
 +{
 +      struct io_splice *sp = &req->splice;
 +      struct file *in = sp->file_in;
 +      struct file *out = sp->file_out;
 +      unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 +      long ret = 0;
 +
 +      if (force_nonblock)
 +              return -EAGAIN;
 +      if (sp->len)
 +              ret = do_tee(in, out, sp->len, flags);
 +
 +      io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
 +      req->flags &= ~REQ_F_NEED_CLEANUP;
 +
 +      if (ret != sp->len)
 +              req_set_fail_links(req);
 +      io_req_complete(req, ret);
 +      return 0;
 +}
 +
 +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 +{
 +      struct io_splice* sp = &req->splice;
 +
 +      sp->off_in = READ_ONCE(sqe->splice_off_in);
 +      sp->off_out = READ_ONCE(sqe->off);
 +      return __io_splice_prep(req, sqe);
 +}
 +
 +static int io_splice(struct io_kiocb *req, bool force_nonblock)
  {
        struct io_splice *sp = &req->splice;
        struct file *in = sp->file_in;
@@@ -3998,7 -3793,6 +3996,7 @@@ static int __io_openat_prep(struct io_k
                return ret;
        }
        req->open.nofile = rlimit(RLIMIT_NOFILE);
 +      req->open.ignore_nonblock = false;
        req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
  }
@@@ -4007,7 -3801,7 +4005,7 @@@ static int io_openat_prep(struct io_kio
  {
        u64 flags, mode;
  
 -      if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        mode = READ_ONCE(sqe->len);
        flags = READ_ONCE(sqe->open_flags);
@@@ -4021,7 -3815,7 +4019,7 @@@ static int io_openat2_prep(struct io_ki
        size_t len;
        int ret;
  
 -      if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
        len = READ_ONCE(sqe->len);
@@@ -4042,7 -3836,7 +4040,7 @@@ static int io_openat2(struct io_kiocb *
        struct file *file;
        int ret;
  
 -      if (force_nonblock)
 +      if (force_nonblock && !req->open.ignore_nonblock)
                return -EAGAIN;
  
        ret = build_open_flags(&req->open.how, &op);
        if (IS_ERR(file)) {
                put_unused_fd(ret);
                ret = PTR_ERR(file);
 +              /*
 +               * A work-around to ensure that /proc/self works the way
 +               * that it should - if we get -EOPNOTSUPP back, then assume
 +               * that proc_self_get_link() failed us because we're in async
 +               * context. We should be safe to retry this from the task
 +               * itself with force_nonblock == false set, as it should not
 +               * block on lookup. Would be nice to know this upfront and
 +               * avoid the async dance, but doesn't seem feasible.
 +               */
 +              if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) {
 +                      req->open.ignore_nonblock = true;
 +                      refcount_inc(&req->refs);
 +                      io_req_task_queue(req);
 +                      return 0;
 +              }
        } else {
                fsnotify_open(file);
                fd_install(ret, file);
@@@ -4151,17 -3930,11 +4149,17 @@@ static int io_remove_buffers(struct io_
        head = idr_find(&ctx->io_buffer_idr, p->bgid);
        if (head)
                ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
 -
 -      io_ring_submit_lock(ctx, !force_nonblock);
        if (ret < 0)
                req_set_fail_links(req);
 -      __io_req_complete(req, ret, 0, cs);
 +
 +      /* need to hold the lock to complete IOPOLL requests */
 +      if (ctx->flags & IORING_SETUP_IOPOLL) {
 +              __io_req_complete(req, ret, 0, cs);
 +              io_ring_submit_unlock(ctx, !force_nonblock);
 +      } else {
 +              io_ring_submit_unlock(ctx, !force_nonblock);
 +              __io_req_complete(req, ret, 0, cs);
 +      }
        return 0;
  }
  
@@@ -4246,17 -4019,10 +4244,17 @@@ static int io_provide_buffers(struct io
                }
        }
  out:
 -      io_ring_submit_unlock(ctx, !force_nonblock);
        if (ret < 0)
                req_set_fail_links(req);
 -      __io_req_complete(req, ret, 0, cs);
 +
 +      /* need to hold the lock to complete IOPOLL requests */
 +      if (ctx->flags & IORING_SETUP_IOPOLL) {
 +              __io_req_complete(req, ret, 0, cs);
 +              io_ring_submit_unlock(ctx, !force_nonblock);
 +      } else {
 +              io_ring_submit_unlock(ctx, !force_nonblock);
 +              __io_req_complete(req, ret, 0, cs);
 +      }
        return 0;
  }
  
@@@ -4428,7 -4194,7 +4426,7 @@@ static int io_close_prep(struct io_kioc
        io_req_init_async(req);
        req->work.flags |= IO_WQ_WORK_NO_CANCEL;
  
 -      if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
            sqe->rw_flags || sqe->buf_index)
@@@ -4452,7 -4218,7 +4450,7 @@@ static int io_close(struct io_kiocb *re
  
        /* might be already done during nonblock submission */
        if (!close->put_file) {
 -              ret = __close_fd_get_file(close->fd, &close->put_file);
 +              ret = close_fd_get_file(close->fd, &close->put_file);
                if (ret < 0)
                        return (ret == -ENOENT) ? -EBADF : ret;
        }
@@@ -4572,9 -4338,9 +4570,9 @@@ static int io_sendmsg(struct io_kiocb *
        unsigned flags;
        int ret;
  
 -      sock = sock_from_file(req->file, &ret);
 +      sock = sock_from_file(req->file);
        if (unlikely(!sock))
 -              return ret;
 +              return -ENOTSOCK;
  
        if (req->async_data) {
                kmsg = req->async_data;
@@@ -4621,9 -4387,9 +4619,9 @@@ static int io_send(struct io_kiocb *req
        unsigned flags;
        int ret;
  
 -      sock = sock_from_file(req->file, &ret);
 +      sock = sock_from_file(req->file);
        if (unlikely(!sock))
 -              return ret;
 +              return -ENOTSOCK;
  
        ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
        if (unlikely(ret))
@@@ -4715,8 -4481,7 +4713,8 @@@ static int __io_compat_recvmsg_copy_hdr
                        return -EFAULT;
                if (clen < 0)
                        return -EINVAL;
 -              sr->len = iomsg->iov[0].iov_len;
 +              sr->len = clen;
 +              iomsg->iov[0].iov_len = clen;
                iomsg->iov = NULL;
        } else {
                ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
@@@ -4801,9 -4566,9 +4799,9 @@@ static int io_recvmsg(struct io_kiocb *
        unsigned flags;
        int ret, cflags = 0;
  
 -      sock = sock_from_file(req->file, &ret);
 +      sock = sock_from_file(req->file);
        if (unlikely(!sock))
 -              return ret;
 +              return -ENOTSOCK;
  
        if (req->async_data) {
                kmsg = req->async_data;
@@@ -4864,9 -4629,9 +4862,9 @@@ static int io_recv(struct io_kiocb *req
        unsigned flags;
        int ret, cflags = 0;
  
 -      sock = sock_from_file(req->file, &ret);
 +      sock = sock_from_file(req->file);
        if (unlikely(!sock))
 -              return ret;
 +              return -ENOTSOCK;
  
        if (req->flags & REQ_F_BUFFER_SELECT) {
                kbuf = io_recv_buffer_select(req, !force_nonblock);
@@@ -4910,7 -4675,7 +4908,7 @@@ static int io_accept_prep(struct io_kio
  {
        struct io_accept *accept = &req->accept;
  
 -      if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->len || sqe->buf_index)
                return -EINVAL;
@@@ -4951,7 -4716,7 +4949,7 @@@ static int io_connect_prep(struct io_ki
        struct io_connect *conn = &req->connect;
        struct io_async_connect *io = req->async_data;
  
 -      if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
 +      if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
                return -EINVAL;
@@@ -5075,6 -4840,7 +5073,6 @@@ struct io_poll_table 
  static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
                           __poll_t mask, task_work_func_t func)
  {
 -      bool twa_signal_ok;
        int ret;
  
        /* for instances that support it check for an event match first: */
        percpu_ref_get(&req->ctx->refs);
  
        /*
 -       * If we using the signalfd wait_queue_head for this wakeup, then
 -       * it's not safe to use TWA_SIGNAL as we could be recursing on the
 -       * tsk->sighand->siglock on doing the wakeup. Should not be needed
 -       * either, as the normal wakeup will suffice.
 -       */
 -      twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);
 -
 -      /*
         * If this fails, then the task is exiting. When a task exits, the
         * work gets canceled, so just cancel this request as well instead
         * of executing it. We can't safely execute it anyway, as we may not
         * have the needed state needed for it anyway.
         */
 -      ret = io_req_task_work_add(req, twa_signal_ok);
 +      ret = io_req_task_work_add(req);
        if (unlikely(ret)) {
                struct task_struct *tsk;
  
@@@ -5486,8 -5260,7 +5484,8 @@@ static bool io_poll_remove_one(struct i
  /*
   * Returns true if we found and killed one or more poll requests
   */
 -static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
 +static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
 +                             struct files_struct *files)
  {
        struct hlist_node *tmp;
        struct io_kiocb *req;
  
                list = &ctx->cancel_hash[i];
                hlist_for_each_entry_safe(req, tmp, list, hash_node) {
 -                      if (io_task_match(req, tsk))
 +                      if (io_match_task(req, tsk, files))
                                posted += io_poll_remove_one(req);
                }
        }
@@@ -5537,7 -5310,7 +5535,7 @@@ static int io_poll_remove_prep(struct i
            sqe->poll_events)
                return -EINVAL;
  
 -      req->poll.addr = READ_ONCE(sqe->addr);
 +      req->poll_remove.addr = READ_ONCE(sqe->addr);
        return 0;
  }
  
  static int io_poll_remove(struct io_kiocb *req)
  {
        struct io_ring_ctx *ctx = req->ctx;
 -      u64 addr;
        int ret;
  
 -      addr = req->poll.addr;
        spin_lock_irq(&ctx->completion_lock);
 -      ret = io_poll_cancel(ctx, addr);
 +      ret = io_poll_cancel(ctx, req->poll_remove.addr);
        spin_unlock_irq(&ctx->completion_lock);
  
        if (ret < 0)
@@@ -5644,37 -5419,15 +5642,37 @@@ static enum hrtimer_restart io_timeout_
        return HRTIMER_NORESTART;
  }
  
 -static int __io_timeout_cancel(struct io_kiocb *req)
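 +/*
 + * Find a pending timeout by user_data, cancel its hrtimer and unlink it.
 + * Returns the request, or an ERR_PTR if it wasn't found or the timer has
 + * already fired.
 + */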
 +static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
 +                                         __u64 user_data)
  {
 -      struct io_timeout_data *io = req->async_data;
 -      int ret;
 +      struct io_timeout_data *io;
 +      struct io_kiocb *req;
 +      int ret = -ENOENT;
 +
 +      list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
 +              if (user_data == req->user_data) {
 +                      ret = 0;
 +                      break;
 +              }
 +      }
  
 +      if (ret == -ENOENT)
 +              return ERR_PTR(ret);
 +
 +      io = req->async_data;
        ret = hrtimer_try_to_cancel(&io->timer);
        if (ret == -1)
 -              return -EALREADY;
 +              return ERR_PTR(-EALREADY);
        list_del_init(&req->timeout.list);
 +      return req;
 +}
 +
 +static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 +{
 +      struct io_kiocb *req = io_timeout_extract(ctx, user_data);
 +
 +      if (IS_ERR(req))
 +              return PTR_ERR(req);
  
        req_set_fail_links(req);
        io_cqring_fill_event(req, -ECANCELED);
        return 0;
  }
  
 -static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
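 +/*
 + * Re-arm an existing timeout with a new expiration: pull it off the list,
 + * clear its sequence offset and restart the hrtimer in the given mode.
 + */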
 +static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 +                           struct timespec64 *ts, enum hrtimer_mode mode)
  {
 -      struct io_kiocb *req;
 -      int ret = -ENOENT;
 -
 -      list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
 -              if (user_data == req->user_data) {
 -                      ret = 0;
 -                      break;
 -              }
 -      }
 +      struct io_kiocb *req = io_timeout_extract(ctx, user_data);
 +      struct io_timeout_data *data;
  
 -      if (ret == -ENOENT)
 -              return ret;
 +      if (IS_ERR(req))
 +              return PTR_ERR(req);
  
 -      return __io_timeout_cancel(req);
 +      req->timeout.off = 0; /* noseq */
 +      data = req->async_data;
 +      list_add_tail(&req->timeout.list, &ctx->timeout_list);
 +      hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
 +      data->timer.function = io_timeout_fn;
 +      hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
 +      return 0;
  }
  
  static int io_timeout_remove_prep(struct io_kiocb *req,
                                  const struct io_uring_sqe *sqe)
  {
 +      struct io_timeout_rem *tr = &req->timeout_rem;
 +
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
                return -EINVAL;
 -      if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
 +      if (sqe->ioprio || sqe->buf_index || sqe->len)
                return -EINVAL;
  
 -      req->timeout_rem.addr = READ_ONCE(sqe->addr);
 +      tr->addr = READ_ONCE(sqe->addr);
 +      tr->flags = READ_ONCE(sqe->timeout_flags);
 +      if (tr->flags & IORING_TIMEOUT_UPDATE) {
 +              if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
 +                      return -EINVAL;
 +              if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
 +                      return -EFAULT;
 +      } else if (tr->flags) {
 +              /* timeout removal doesn't support flags */
 +              return -EINVAL;
 +      }
 +
        return 0;
  }
  
   */
  static int io_timeout_remove(struct io_kiocb *req)
  {
 +      struct io_timeout_rem *tr = &req->timeout_rem;
        struct io_ring_ctx *ctx = req->ctx;
        int ret;
  
        spin_lock_irq(&ctx->completion_lock);
 -      ret = io_timeout_cancel(ctx, req->timeout_rem.addr);
 +      if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) {
 +              enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS)
 +                                      ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
 +
 +              ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
 +      } else {
 +              ret = io_timeout_cancel(ctx, tr->addr);
 +      }
  
        io_cqring_fill_event(req, ret);
        io_commit_cqring(ctx);
@@@ -6024,12 -5756,6 +6022,12 @@@ static int io_req_prep(struct io_kiocb 
                return io_remove_buffers_prep(req, sqe);
        case IORING_OP_TEE:
                return io_tee_prep(req, sqe);
 +      case IORING_OP_SHUTDOWN:
 +              return io_shutdown_prep(req, sqe);
 +      case IORING_OP_RENAMEAT:
 +              return io_renameat_prep(req, sqe);
 +      case IORING_OP_UNLINKAT:
 +              return io_unlinkat_prep(req, sqe);
        }
  
        printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@@ -6051,10 -5777,11 +6049,10 @@@ static u32 io_get_sequence(struct io_ki
  {
        struct io_kiocb *pos;
        struct io_ring_ctx *ctx = req->ctx;
 -      u32 total_submitted, nr_reqs = 1;
 +      u32 total_submitted, nr_reqs = 0;
  
 -      if (req->flags & REQ_F_LINK_HEAD)
 -              list_for_each_entry(pos, &req->link_list, link_list)
 -                      nr_reqs++;
 +      io_for_each_link(pos, req)
 +              nr_reqs++;
  
        total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
        return total_submitted - nr_reqs;
@@@ -6106,13 -5833,12 +6104,13 @@@ static int io_req_defer(struct io_kioc
  static void io_req_drop_files(struct io_kiocb *req)
  {
        struct io_ring_ctx *ctx = req->ctx;
 +      struct io_uring_task *tctx = req->task->io_uring;
        unsigned long flags;
  
        spin_lock_irqsave(&ctx->inflight_lock, flags);
        list_del(&req->inflight_entry);
 -      if (waitqueue_active(&ctx->inflight_wait))
 -              wake_up(&ctx->inflight_wait);
 +      if (atomic_read(&tctx->in_idle))
 +              wake_up(&tctx->wait);
        spin_unlock_irqrestore(&ctx->inflight_lock, flags);
        req->flags &= ~REQ_F_INFLIGHT;
        put_files_struct(req->work.identity->files);
@@@ -6167,13 -5893,6 +6165,13 @@@ static void __io_clean_op(struct io_kio
                        if (req->open.filename)
                                putname(req->open.filename);
                        break;
 +              case IORING_OP_RENAMEAT:
 +                      putname(req->rename.oldpath);
 +                      putname(req->rename.newpath);
 +                      break;
 +              case IORING_OP_UNLINKAT:
 +                      putname(req->unlink.filename);
 +                      break;
                }
                req->flags &= ~REQ_F_NEED_CLEANUP;
        }
@@@ -6280,15 -5999,6 +6278,15 @@@ static int io_issue_sqe(struct io_kioc
        case IORING_OP_TEE:
                ret = io_tee(req, force_nonblock);
                break;
 +      case IORING_OP_SHUTDOWN:
 +              ret = io_shutdown(req, force_nonblock);
 +              break;
 +      case IORING_OP_RENAMEAT:
 +              ret = io_renameat(req, force_nonblock);
 +              break;
 +      case IORING_OP_UNLINKAT:
 +              ret = io_unlinkat(req, force_nonblock);
 +              break;
        default:
                ret = -EINVAL;
                break;
                if (in_async)
                        mutex_lock(&ctx->uring_lock);
  
 -              io_iopoll_req_issued(req);
 +              io_iopoll_req_issued(req, in_async);
  
                if (in_async)
                        mutex_unlock(&ctx->uring_lock);
@@@ -6345,19 -6055,8 +6343,19 @@@ static struct io_wq_work *io_wq_submit_
        }
  
        if (ret) {
 -              req_set_fail_links(req);
 -              io_req_complete(req, ret);
 +              /*
 +               * io_iopoll_complete() does not hold completion_lock to complete
 +               * polled io, so for polled io just mark it done here and let
 +               * io_iopoll_complete() finish it.
 +               */
 +              if (req->ctx->flags & IORING_SETUP_IOPOLL) {
 +                      struct kiocb *kiocb = &req->rw.kiocb;
 +
 +                      kiocb_done(kiocb, ret, NULL);
 +              } else {
 +                      req_set_fail_links(req);
 +                      io_req_complete(req, ret);
 +              }
        }
  
        return io_steal_work(req);
@@@ -6383,7 -6082,10 +6381,7 @@@ static struct file *io_file_get(struct 
                        return NULL;
                fd = array_index_nospec(fd, ctx->nr_user_files);
                file = io_file_from_index(ctx, fd);
 -              if (file) {
 -                      req->fixed_file_refs = &ctx->file_data->node->refs;
 -                      percpu_ref_get(req->fixed_file_refs);
 -              }
 +              io_set_resource_node(req);
        } else {
                trace_io_uring_file_get(ctx, fd);
                file = __io_file_get(state, fd);
        return file;
  }
  
 -static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
 -                         int fd)
 -{
 -      bool fixed;
 -
 -      fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
 -      if (unlikely(!fixed && io_async_submit(req->ctx)))
 -              return -EBADF;
 -
 -      req->file = io_file_get(state, req, fd, fixed);
 -      if (req->file || io_op_defs[req->opcode].needs_file_no_error)
 -              return 0;
 -      return -EBADF;
 -}
 -
  static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
  {
        struct io_timeout_data *data = container_of(timer,
                                                struct io_timeout_data, timer);
 -      struct io_kiocb *req = data->req;
 +      struct io_kiocb *prev, *req = data->req;
        struct io_ring_ctx *ctx = req->ctx;
 -      struct io_kiocb *prev = NULL;
        unsigned long flags;
  
        spin_lock_irqsave(&ctx->completion_lock, flags);
 +      prev = req->timeout.head;
 +      req->timeout.head = NULL;
  
        /*
         * We don't expect the list to be empty, that will only happen if we
         * race with the completion of the linked work.
         */
 -      if (!list_empty(&req->link_list)) {
 -              prev = list_entry(req->link_list.prev, struct io_kiocb,
 -                                link_list);
 -              if (refcount_inc_not_zero(&prev->refs))
 -                      list_del_init(&req->link_list);
 -              else
 -                      prev = NULL;
 -      }
 -
 +      if (prev && refcount_inc_not_zero(&prev->refs))
 +              io_remove_next_linked(prev);
 +      else
 +              prev = NULL;
        spin_unlock_irqrestore(&ctx->completion_lock, flags);
  
        if (prev) {
  static void __io_queue_linked_timeout(struct io_kiocb *req)
  {
        /*
 -       * If the list is now empty, then our linked request finished before
 -       * we got a chance to setup the timer
 +       * If the back reference is NULL, then our linked request finished
 +       * before we got a chance to set up the timer
         */
 -      if (!list_empty(&req->link_list)) {
 +      if (req->timeout.head) {
                struct io_timeout_data *data = req->async_data;
  
                data->timer.function = io_link_timeout_fn;
@@@ -6453,13 -6174,18 +6451,13 @@@ static void io_queue_linked_timeout(str
  
  static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
  {
 -      struct io_kiocb *nxt;
 +      struct io_kiocb *nxt = req->link;
  
 -      if (!(req->flags & REQ_F_LINK_HEAD))
 -              return NULL;
 -      if (req->flags & REQ_F_LINK_TIMEOUT)
 -              return NULL;
 -
 -      nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
 -                                      link_list);
 -      if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
 +      if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
 +          nxt->opcode != IORING_OP_LINK_TIMEOUT)
                return NULL;
  
 +      nxt->timeout.head = req;
        nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
        req->flags |= REQ_F_LINK_TIMEOUT;
        return nxt;
@@@ -6565,13 -6291,8 +6563,13 @@@ static inline void io_queue_link_head(s
                io_queue_sqe(req, NULL, cs);
  }
  
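 +/*
 + * Tracks an SQE link chain being built during submission: head is the first
 + * request in the chain, last is where the next linked request is appended.
 + */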
 +struct io_submit_link {
 +      struct io_kiocb *head;
 +      struct io_kiocb *last;
 +};
 +
  static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 -                       struct io_kiocb **link, struct io_comp_state *cs)
 +                       struct io_submit_link *link, struct io_comp_state *cs)
  {
        struct io_ring_ctx *ctx = req->ctx;
        int ret;
         * submitted sync once the chain is complete. If none of those
         * conditions are true (normal request), then just queue it.
         */
 -      if (*link) {
 -              struct io_kiocb *head = *link;
 +      if (link->head) {
 +              struct io_kiocb *head = link->head;
  
                /*
                 * Taking sequential execution of a link, draining both sides
                        return ret;
                }
                trace_io_uring_link(ctx, req, head);
 -              list_add_tail(&req->link_list, &head->link_list);
 +              link->last->link = req;
 +              link->last = req;
  
                /* last request of a link, enqueue the link */
                if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
                        io_queue_link_head(head, cs);
 -                      *link = NULL;
 +                      link->head = NULL;
                }
        } else {
                if (unlikely(ctx->drain_next)) {
                        ctx->drain_next = 0;
                }
                if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
 -                      req->flags |= REQ_F_LINK_HEAD;
 -                      INIT_LIST_HEAD(&req->link_list);
 -
                        ret = io_req_defer_prep(req, sqe);
                        if (unlikely(ret))
                                req->flags |= REQ_F_FAIL_LINK;
 -                      *link = req;
 +                      link->head = req;
 +                      link->last = req;
                } else {
                        io_queue_sqe(req, sqe, cs);
                }
@@@ -6638,8 -6360,7 +6636,8 @@@ static void io_submit_state_end(struct 
  {
        if (!list_empty(&state->comp.list))
                io_submit_flush_completions(&state->comp);
 -      blk_finish_plug(&state->plug);
 +      if (state->plug_started)
 +              blk_finish_plug(&state->plug);
        io_state_file_put(state);
        if (state->free_reqs)
                kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
  static void io_submit_state_start(struct io_submit_state *state,
                                  struct io_ring_ctx *ctx, unsigned int max_ios)
  {
 -      blk_start_plug(&state->plug);
 +      state->plug_started = false;
        state->comp.nr = 0;
        INIT_LIST_HEAD(&state->comp.list);
        state->comp.ctx = ctx;
        state->free_reqs = 0;
 -      state->file = NULL;
 +      state->file_refs = 0;
        state->ios_left = max_ios;
  }
  
@@@ -6751,8 -6472,6 +6749,8 @@@ static int io_init_req(struct io_ring_c
        req->file = NULL;
        req->ctx = ctx;
        req->flags = 0;
 +      req->link = NULL;
 +      req->fixed_file_refs = NULL;
        /* one is dropped after submission, the other at completion */
        refcount_set(&req->refs, 2);
        req->task = current;
        if (unlikely(req->opcode >= IORING_OP_LAST))
                return -EINVAL;
  
 -      if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
 +      if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
                return -EFAULT;
  
        sqe_flags = READ_ONCE(sqe->flags);
        /* same numerical values with corresponding REQ_F_*, safe to copy */
        req->flags |= sqe_flags;
  
 -      if (!io_op_defs[req->opcode].needs_file)
 -              return 0;
 +      /*
 +       * Plug now if we have more than 1 IO left after this, and the target
 +       * is potentially a read/write to block based storage.
 +       */
 +      if (!state->plug_started && state->ios_left > 1 &&
 +          io_op_defs[req->opcode].plug) {
 +              blk_start_plug(&state->plug);
 +              state->plug_started = true;
 +      }
 +
 +      ret = 0;
 +      if (io_op_defs[req->opcode].needs_file) {
 +              bool fixed = req->flags & REQ_F_FIXED_FILE;
 +
 +              req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
 +              if (unlikely(!req->file &&
 +                  !io_op_defs[req->opcode].needs_file_no_error))
 +                      ret = -EBADF;
 +      }
  
 -      ret = io_req_set_file(state, req, READ_ONCE(sqe->fd));
        state->ios_left--;
        return ret;
  }
  static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
  {
        struct io_submit_state state;
 -      struct io_kiocb *link = NULL;
 +      struct io_submit_link link;
        int i, submitted = 0;
  
        /* if we have a backlog and couldn't flush it all, return BUSY */
        refcount_add(nr, &current->usage);
  
        io_submit_state_start(&state, ctx, nr);
 +      link.head = NULL;
  
        for (i = 0; i < nr; i++) {
                const struct io_uring_sqe *sqe;
@@@ -6887,8 -6589,8 +6885,8 @@@ fail_req
                percpu_counter_sub(&tctx->inflight, unused);
                put_task_struct_many(current, unused);
        }
 -      if (link)
 -              io_queue_link_head(link, &state.comp);
 +      if (link.head)
 +              io_queue_link_head(link.head, &state.comp);
        io_submit_state_end(&state);
  
         /* Commit SQ ring head once we've consumed and submitted all SQEs */
@@@ -6912,45 -6614,111 +6910,45 @@@ static inline void io_ring_clear_wakeup
        spin_unlock_irq(&ctx->completion_lock);
  }
  
 -static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode,
 -                             int sync, void *key)
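 +/*
 + * Run one submission pass for a ring: reap any pending iopoll completions
 + * and submit new SQEs, returning the number submitted (or an error).
 + */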
 +static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
  {
 -      struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry);
 -      int ret;
 -
 -      ret = autoremove_wake_function(wqe, mode, sync, key);
 -      if (ret) {
 -              unsigned long flags;
 -
 -              spin_lock_irqsave(&ctx->completion_lock, flags);
 -              ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
 -              spin_unlock_irqrestore(&ctx->completion_lock, flags);
 -      }
 -      return ret;
 -}
 -
 -enum sq_ret {
 -      SQT_IDLE        = 1,
 -      SQT_SPIN        = 2,
 -      SQT_DID_WORK    = 4,
 -};
 -
 -static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx,
 -                                unsigned long start_jiffies, bool cap_entries)
 -{
 -      unsigned long timeout = start_jiffies + ctx->sq_thread_idle;
 -      struct io_sq_data *sqd = ctx->sq_data;
        unsigned int to_submit;
        int ret = 0;
  
 -again:
 -      if (!list_empty(&ctx->iopoll_list)) {
 +      to_submit = io_sqring_entries(ctx);
 +      /* if we're handling multiple rings, cap submit size for fairness */
 +      if (cap_entries && to_submit > 8)
 +              to_submit = 8;
 +
 +      if (!list_empty(&ctx->iopoll_list) || to_submit) {
                unsigned nr_events = 0;
  
                mutex_lock(&ctx->uring_lock);
 -              if (!list_empty(&ctx->iopoll_list) && !need_resched())
 +              if (!list_empty(&ctx->iopoll_list))
                        io_do_iopoll(ctx, &nr_events, 0);
 +
 +              if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)))
 +                      ret = io_submit_sqes(ctx, to_submit);
                mutex_unlock(&ctx->uring_lock);
        }
  
 -      to_submit = io_sqring_entries(ctx);
 -
 -      /*
 -       * If submit got -EBUSY, flag us as needing the application
 -       * to enter the kernel to reap and flush events.
 -       */
 -      if (!to_submit || ret == -EBUSY || need_resched()) {
 -              /*
 -               * Drop cur_mm before scheduling, we can't hold it for
 -               * long periods (or over schedule()). Do this before
 -               * adding ourselves to the waitqueue, as the unuse/drop
 -               * may sleep.
 -               */
 -              io_sq_thread_drop_mm();
 -
 -              /*
 -               * We're polling. If we're within the defined idle
 -               * period, then let us spin without work before going
 -               * to sleep. The exception is if we got EBUSY doing
 -               * more IO, we should wait for the application to
 -               * reap events and wake us up.
 -               */
 -              if (!list_empty(&ctx->iopoll_list) || need_resched() ||
 -                  (!time_after(jiffies, timeout) && ret != -EBUSY &&
 -                  !percpu_ref_is_dying(&ctx->refs)))
 -                      return SQT_SPIN;
 +      if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
 +              wake_up(&ctx->sqo_sq_wait);
  
 -              prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry,
 -                                      TASK_INTERRUPTIBLE);
 +      return ret;
 +}
  
 -              /*
 -               * While doing polled IO, before going to sleep, we need
 -               * to check if there are new reqs added to iopoll_list,
 -               * it is because reqs may have been punted to io worker
 -               * and will be added to iopoll_list later, hence check
 -               * the iopoll_list again.
 -               */
 -              if ((ctx->flags & IORING_SETUP_IOPOLL) &&
 -                  !list_empty_careful(&ctx->iopoll_list)) {
 -                      finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
 -                      goto again;
 -              }
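 +/*
 + * The sq thread may serve several rings; use the largest of their
 + * requested idle periods as the shared idle timeout.
 + */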
 +static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
 +{
 +      struct io_ring_ctx *ctx;
 +      unsigned sq_thread_idle = 0;
  
 -              to_submit = io_sqring_entries(ctx);
 -              if (!to_submit || ret == -EBUSY)
 -                      return SQT_IDLE;
 +      list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
 +              if (sq_thread_idle < ctx->sq_thread_idle)
 +                      sq_thread_idle = ctx->sq_thread_idle;
        }
  
 -      finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
 -      io_ring_clear_wakeup_flag(ctx);
 -
 -      /* if we're handling multiple rings, cap submit size for fairness */
 -      if (cap_entries && to_submit > 8)
 -              to_submit = 8;
 -
 -      mutex_lock(&ctx->uring_lock);
 -      if (likely(!percpu_ref_is_dying(&ctx->refs)))
 -              ret = io_submit_sqes(ctx, to_submit);
 -      mutex_unlock(&ctx->uring_lock);
 -
 -      if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
 -              wake_up(&ctx->sqo_sq_wait);
 -
 -      return SQT_DID_WORK;
 +      sqd->sq_thread_idle = sq_thread_idle;
  }
  
  static void io_sqd_init_new(struct io_sq_data *sqd)
  
        while (!list_empty(&sqd->ctx_new_list)) {
                ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
 -              init_wait(&ctx->sqo_wait_entry);
 -              ctx->sqo_wait_entry.func = io_sq_wake_function;
                list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
                complete(&ctx->sq_thread_comp);
        }
 +
 +      io_sqd_update_thread_idle(sqd);
  }
  
  static int io_sq_thread(void *data)
  {
        struct cgroup_subsys_state *cur_css = NULL;
 +      struct files_struct *old_files = current->files;
 +      struct nsproxy *old_nsproxy = current->nsproxy;
        const struct cred *old_cred = NULL;
        struct io_sq_data *sqd = data;
        struct io_ring_ctx *ctx;
 -      unsigned long start_jiffies;
 +      unsigned long timeout = 0;
 +      DEFINE_WAIT(wait);
 +
 +      task_lock(current);
 +      current->files = NULL;
 +      current->nsproxy = NULL;
 +      task_unlock(current);
  
 -      start_jiffies = jiffies;
        while (!kthread_should_stop()) {
 -              enum sq_ret ret = 0;
 -              bool cap_entries;
 +              int ret;
 +              bool cap_entries, sqt_spin, needs_sched;
  
                /*
                 * Any changes to the sqd lists are synchronized through the
                 * kthread parking. This synchronizes the thread vs users,
                 * the users are synchronized on the sqd->ctx_lock.
                 */
 -              if (kthread_should_park())
 +              if (kthread_should_park()) {
                        kthread_parkme();
 +                      /*
 +                       * When the sq thread is unparked, the previous park operation
 +                       * may have come from io_put_sq_data(), which means the sq thread
 +                       * is about to be stopped, so check for that here.
 +                       */
 +                      if (kthread_should_stop())
 +                              break;
 +              }
  
 -              if (unlikely(!list_empty(&sqd->ctx_new_list)))
 +              if (unlikely(!list_empty(&sqd->ctx_new_list))) {
                        io_sqd_init_new(sqd);
 +                      timeout = jiffies + sqd->sq_thread_idle;
 +              }
  
 +              sqt_spin = false;
                cap_entries = !list_is_singular(&sqd->ctx_list);
 -
                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
                        if (current->cred != ctx->creds) {
                                if (old_cred)
                        current->sessionid = ctx->sessionid;
  #endif
  
 -                      ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);
 +                      ret = __io_sq_thread(ctx, cap_entries);
 +                      if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
 +                              sqt_spin = true;
  
 -                      io_sq_thread_drop_mm();
 +                      io_sq_thread_drop_mm_files();
                }
  
 -              if (ret & SQT_SPIN) {
 +              if (sqt_spin || !time_after(jiffies, timeout)) {
                        io_run_task_work();
                        cond_resched();
 -              } else if (ret == SQT_IDLE) {
 -                      if (kthread_should_park())
 -                              continue;
 +                      if (sqt_spin)
 +                              timeout = jiffies + sqd->sq_thread_idle;
 +                      continue;
 +              }
 +
 +              if (kthread_should_park())
 +                      continue;
 +
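 +              /* only sleep if no attached ring has iopoll work or pending SQEs */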
 +              needs_sched = true;
 +              prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
 +              list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
 +                      if ((ctx->flags & IORING_SETUP_IOPOLL) &&
 +                          !list_empty_careful(&ctx->iopoll_list)) {
 +                              needs_sched = false;
 +                              break;
 +                      }
 +                      if (io_sqring_entries(ctx)) {
 +                              needs_sched = false;
 +                              break;
 +                      }
 +              }
 +
 +              if (needs_sched) {
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_ring_set_wakeup_flag(ctx);
 +
                        schedule();
 -                      start_jiffies = jiffies;
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_ring_clear_wakeup_flag(ctx);
                }
 +
 +              finish_wait(&sqd->wait, &wait);
 +              timeout = jiffies + sqd->sq_thread_idle;
        }
  
        io_run_task_work();
        if (old_cred)
                revert_creds(old_cred);
  
 +      task_lock(current);
 +      current->files = old_files;
 +      current->nsproxy = old_nsproxy;
 +      task_unlock(current);
 +
        kthread_parkme();
  
        return 0;
@@@ -7122,8 -6843,13 +7120,8 @@@ static int io_run_task_work_sig(void
                return 1;
        if (!signal_pending(current))
                return 0;
 -      if (current->jobctl & JOBCTL_TASK_WORK) {
 -              spin_lock_irq(&current->sighand->siglock);
 -              current->jobctl &= ~JOBCTL_TASK_WORK;
 -              recalc_sigpending();
 -              spin_unlock_irq(&current->sighand->siglock);
 -              return 1;
 -      }
 +      if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
 +              return -ERESTARTSYS;
        return -EINTR;
  }
  
   * application must reap them itself, as they reside on the shared cq ring.
   */
  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 -                        const sigset_t __user *sig, size_t sigsz)
 +                        const sigset_t __user *sig, size_t sigsz,
 +                        struct __kernel_timespec __user *uts)
  {
        struct io_wait_queue iowq = {
                .wq = {
                .to_wait        = min_events,
        };
        struct io_rings *rings = ctx->rings;
 +      struct timespec64 ts;
 +      signed long timeout = 0;
        int ret = 0;
  
        do {
                        return ret;
        }
  
 +      if (uts) {
 +              if (get_timespec64(&ts, uts))
 +                      return -EFAULT;
 +              timeout = timespec64_to_jiffies(&ts);
 +      }
 +
        iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
        trace_io_uring_cqring_wait(ctx, min_events);
        do {
                        break;
                if (io_should_wake(&iowq, false))
                        break;
 -              schedule();
 +              if (uts) {
 +                      timeout = schedule_timeout(timeout);
 +                      if (timeout == 0) {
 +                              ret = -ETIME;
 +                              break;
 +                      }
 +              } else {
 +                      schedule();
 +              }
        } while (1);
        finish_wait(&ctx->wait, &iowq.wq);
  
@@@ -7245,9 -6954,11 +7243,9 @@@ static int io_sqe_files_unregister(stru
        if (!data)
                return -ENXIO;
  
 -      spin_lock(&data->lock);
 -      if (!list_empty(&data->ref_list))
 -              ref_node = list_first_entry(&data->ref_list,
 -                              struct fixed_file_ref_node, node);
 -      spin_unlock(&data->lock);
 +      spin_lock_bh(&data->lock);
 +      ref_node = data->node;
 +      spin_unlock_bh(&data->lock);
        if (ref_node)
                percpu_ref_kill(&ref_node->refs);
  
@@@ -7370,11 -7081,12 +7368,11 @@@ static void io_sq_thread_stop(struct io
  
                mutex_lock(&sqd->ctx_lock);
                list_del(&ctx->sqd_list);
 +              io_sqd_update_thread_idle(sqd);
                mutex_unlock(&sqd->ctx_lock);
  
 -              if (sqd->thread) {
 -                      finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
 +              if (sqd->thread)
                        io_sq_thread_unpark(sqd);
 -              }
  
                io_put_sq_data(sqd);
                ctx->sq_data = NULL;
@@@ -7594,6 -7306,10 +7592,6 @@@ static void __io_file_put_work(struct f
                kfree(pfile);
        }
  
 -      spin_lock(&file_data->lock);
 -      list_del(&ref_node->node);
 -      spin_unlock(&file_data->lock);
 -
        percpu_ref_exit(&ref_node->refs);
        kfree(ref_node);
        percpu_ref_put(&file_data->refs);
@@@ -7620,32 -7336,17 +7618,32 @@@ static void io_file_put_work(struct wor
  static void io_file_data_ref_zero(struct percpu_ref *ref)
  {
        struct fixed_file_ref_node *ref_node;
 +      struct fixed_file_data *data;
        struct io_ring_ctx *ctx;
 -      bool first_add;
 +      bool first_add = false;
        int delay = HZ;
  
        ref_node = container_of(ref, struct fixed_file_ref_node, refs);
 -      ctx = ref_node->file_data->ctx;
 +      data = ref_node->file_data;
 +      ctx = data->ctx;
 +
 +      spin_lock_bh(&data->lock);
 +      ref_node->done = true;
 +
 +      while (!list_empty(&data->ref_list)) {
 +              ref_node = list_first_entry(&data->ref_list,
 +                                      struct fixed_file_ref_node, node);
 +              /* recycle ref nodes in order */
 +              if (!ref_node->done)
 +                      break;
 +              list_del(&ref_node->node);
 +              first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist);
 +      }
 +      spin_unlock_bh(&data->lock);
  
 -      if (percpu_ref_is_dying(&ctx->file_data->refs))
 +      if (percpu_ref_is_dying(&data->refs))
                delay = 0;
  
 -      first_add = llist_add(&ref_node->llist, &ctx->file_put_llist);
        if (!delay)
                mod_delayed_work(system_wq, &ctx->file_put_work, 0);
        else if (first_add)
@@@ -7669,7 -7370,6 +7667,7 @@@ static struct fixed_file_ref_node *allo
        INIT_LIST_HEAD(&ref_node->node);
        INIT_LIST_HEAD(&ref_node->file_list);
        ref_node->file_data = ctx->file_data;
 +      ref_node->done = false;
        return ref_node;
  }
  
@@@ -7764,9 -7464,9 +7762,9 @@@ static int io_sqe_files_register(struc
        }
  
        file_data->node = ref_node;
 -      spin_lock(&file_data->lock);
 -      list_add(&ref_node->node, &file_data->ref_list);
 -      spin_unlock(&file_data->lock);
 +      spin_lock_bh(&file_data->lock);
 +      list_add_tail(&ref_node->node, &file_data->ref_list);
 +      spin_unlock_bh(&file_data->lock);
        percpu_ref_get(&file_data->refs);
        return ret;
  out_fput:
@@@ -7923,10 -7623,10 +7921,10 @@@ static int __io_sqe_files_update(struc
  
        if (needs_switch) {
                percpu_ref_kill(&data->node->refs);
 -              spin_lock(&data->lock);
 -              list_add(&ref_node->node, &data->ref_list);
 +              spin_lock_bh(&data->lock);
 +              list_add_tail(&ref_node->node, &data->ref_list);
                data->node = ref_node;
 -              spin_unlock(&data->lock);
 +              spin_unlock_bh(&data->lock);
                percpu_ref_get(&ctx->file_data->refs);
        } else
                destroy_fixed_file_ref_node(ref_node);
@@@ -8054,7 -7754,7 +8052,7 @@@ static int io_sq_offload_create(struct 
                struct io_sq_data *sqd;
  
                ret = -EPERM;
 -              if (!capable(CAP_SYS_ADMIN))
 +              if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
                        goto err;
  
                sqd = io_get_sq_data(p);
@@@ -8640,6 -8340,8 +8638,6 @@@ static void io_ring_exit_work(struct wo
         * as nobody else will be looking for them.
         */
        do {
 -              if (ctx->rings)
 -                      io_cqring_overflow_flush(ctx, true, NULL, NULL);
                io_iopoll_try_reap_events(ctx);
        } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
        io_ring_ctx_free(ctx);
@@@ -8649,17 -8351,17 +8647,17 @@@ static void io_ring_ctx_wait_and_kill(s
  {
        mutex_lock(&ctx->uring_lock);
        percpu_ref_kill(&ctx->refs);
 +      if (ctx->rings)
 +              io_cqring_overflow_flush(ctx, true, NULL, NULL);
        mutex_unlock(&ctx->uring_lock);
  
 -      io_kill_timeouts(ctx, NULL);
 -      io_poll_remove_all(ctx, NULL);
 +      io_kill_timeouts(ctx, NULL, NULL);
 +      io_poll_remove_all(ctx, NULL, NULL);
  
        if (ctx->io_wq)
                io_wq_cancel_all(ctx->io_wq);
  
        /* if we failed setting up the ctx, we might not have any rings */
 -      if (ctx->rings)
 -              io_cqring_overflow_flush(ctx, true, NULL, NULL);
        io_iopoll_try_reap_events(ctx);
        idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
  
@@@ -8690,31 -8392,120 +8688,31 @@@ static int io_uring_release(struct inod
        return 0;
  }
  
 -static bool io_wq_files_match(struct io_wq_work *work, void *data)
 -{
 -      struct files_struct *files = data;
 -
 -      return !files || ((work->flags & IO_WQ_WORK_FILES) &&
 -                              work->identity->files == files);
 -}
 -
 -/*
 - * Returns true if 'preq' is the link parent of 'req'
 - */
 -static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
 -{
 -      struct io_kiocb *link;
 -
 -      if (!(preq->flags & REQ_F_LINK_HEAD))
 -              return false;
 -
 -      list_for_each_entry(link, &preq->link_list, link_list) {
 -              if (link == req)
 -                      return true;
 -      }
 -
 -      return false;
 -}
 -
 -/*
 - * We're looking to cancel 'req' because it's holding on to our files, but
 - * 'req' could be a link to another request. See if it is, and cancel that
 - * parent request if so.
 - */
 -static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req)
 -{
 -      struct hlist_node *tmp;
 -      struct io_kiocb *preq;
 -      bool found = false;
 -      int i;
 -
 -      spin_lock_irq(&ctx->completion_lock);
 -      for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
 -              struct hlist_head *list;
 -
 -              list = &ctx->cancel_hash[i];
 -              hlist_for_each_entry_safe(preq, tmp, list, hash_node) {
 -                      found = io_match_link(preq, req);
 -                      if (found) {
 -                              io_poll_remove_one(preq);
 -                              break;
 -                      }
 -              }
 -      }
 -      spin_unlock_irq(&ctx->completion_lock);
 -      return found;
 -}
 -
 -static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
 -                                 struct io_kiocb *req)
 -{
 -      struct io_kiocb *preq;
 -      bool found = false;
 -
 -      spin_lock_irq(&ctx->completion_lock);
 -      list_for_each_entry(preq, &ctx->timeout_list, timeout.list) {
 -              found = io_match_link(preq, req);
 -              if (found) {
 -                      __io_timeout_cancel(preq);
 -                      break;
 -              }
 -      }
 -      spin_unlock_irq(&ctx->completion_lock);
 -      return found;
 -}
 +struct io_task_cancel {
 +      struct task_struct *task;
 +      struct files_struct *files;
 +};
  
 -static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
 +static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
  {
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 +      struct io_task_cancel *cancel = data;
        bool ret;
  
 -      if (req->flags & REQ_F_LINK_TIMEOUT) {
 +      if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
                unsigned long flags;
                struct io_ring_ctx *ctx = req->ctx;
  
                /* protect against races with linked timeouts */
                spin_lock_irqsave(&ctx->completion_lock, flags);
 -              ret = io_match_link(req, data);
 +              ret = io_match_task(req, cancel->task, cancel->files);
                spin_unlock_irqrestore(&ctx->completion_lock, flags);
        } else {
 -              ret = io_match_link(req, data);
 +              ret = io_match_task(req, cancel->task, cancel->files);
        }
        return ret;
  }
  
 -static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
 -{
 -      enum io_wq_cancel cret;
 -
 -      /* cancel this particular work, if it's running */
 -      cret = io_wq_cancel_work(ctx->io_wq, &req->work);
 -      if (cret != IO_WQ_CANCEL_NOTFOUND)
 -              return;
 -
 -      /* find links that hold this pending, cancel those */
 -      cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
 -      if (cret != IO_WQ_CANCEL_NOTFOUND)
 -              return;
 -
 -      /* if we have a poll link holding this pending, cancel that */
 -      if (io_poll_remove_link(ctx, req))
 -              return;
 -
 -      /* final option, timeout link is holding this req pending */
 -      io_timeout_remove_link(ctx, req);
 -}
 -
  static void io_cancel_defer_files(struct io_ring_ctx *ctx,
                                  struct task_struct *task,
                                  struct files_struct *files)
  
        spin_lock_irq(&ctx->completion_lock);
        list_for_each_entry_reverse(de, &ctx->defer_list, list) {
 -              if (io_task_match(de->req, task) &&
 -                  io_match_files(de->req, files)) {
 +              if (io_match_task(de->req, task, files)) {
                        list_cut_position(&list, &ctx->defer_list, &de->list);
                        break;
                }
        }
  }
  
 -/*
 - * Returns true if we found and killed one or more files pinning requests
 - */
 -static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
 +static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 +                                struct task_struct *task,
                                  struct files_struct *files)
  {
 -      if (list_empty_careful(&ctx->inflight_list))
 -              return false;
 -
 -      /* cancel all at once, should be faster than doing it one by one*/
 -      io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
 -
        while (!list_empty_careful(&ctx->inflight_list)) {
 -              struct io_kiocb *cancel_req = NULL, *req;
 +              struct io_task_cancel cancel = { .task = task, .files = files };
 +              struct io_kiocb *req;
                DEFINE_WAIT(wait);
 +              bool found = false;
  
                spin_lock_irq(&ctx->inflight_lock);
                list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
 -                      if (files && (req->work.flags & IO_WQ_WORK_FILES) &&
 +                      if (req->task != task ||
                            req->work.identity->files != files)
                                continue;
 -                      /* req is being completed, ignore */
 -                      if (!refcount_inc_not_zero(&req->refs))
 -                              continue;
 -                      cancel_req = req;
 +                      found = true;
                        break;
                }
 -              if (cancel_req)
 -                      prepare_to_wait(&ctx->inflight_wait, &wait,
 -                                              TASK_UNINTERRUPTIBLE);
 +              if (found)
 +                      prepare_to_wait(&task->io_uring->wait, &wait,
 +                                      TASK_UNINTERRUPTIBLE);
                spin_unlock_irq(&ctx->inflight_lock);
  
                /* We need to keep going until we don't find a matching req */
 -              if (!cancel_req)
 +              if (!found)
                        break;
 -              /* cancel this request, or head link requests */
 -              io_attempt_cancel(ctx, cancel_req);
 -              io_put_req(cancel_req);
 +
 +              io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
 +              io_poll_remove_all(ctx, task, files);
 +              io_kill_timeouts(ctx, task, files);
                /* cancellations _may_ trigger task work */
                io_run_task_work();
                schedule();
 -              finish_wait(&ctx->inflight_wait, &wait);
 +              finish_wait(&task->io_uring->wait, &wait);
        }
 -
 -      return true;
  }
  
 -static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
 +static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 +                                          struct task_struct *task)
  {
 -      struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 -      struct task_struct *task = data;
 -
 -      return io_task_match(req, task);
 -}
 -
 -static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 -                                          struct task_struct *task,
 -                                          struct files_struct *files)
 -{
 -      bool ret;
 -
 -      ret = io_uring_cancel_files(ctx, files);
 -      if (!files) {
 +      while (1) {
 +              struct io_task_cancel cancel = { .task = task, .files = NULL, };
                enum io_wq_cancel cret;
 +              bool ret = false;
  
 -              cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
 +              cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
                if (cret != IO_WQ_CANCEL_NOTFOUND)
                        ret = true;
  
                        }
                }
  
 -              ret |= io_poll_remove_all(ctx, task);
 -              ret |= io_kill_timeouts(ctx, task);
 +              ret |= io_poll_remove_all(ctx, task, NULL);
 +              ret |= io_kill_timeouts(ctx, task, NULL);
 +              if (!ret)
 +                      break;
 +              io_run_task_work();
 +              cond_resched();
        }
 -
 -      return ret;
  }
  
  /*
@@@ -8823,15 -8633,17 +8821,15 @@@ static void io_uring_cancel_task_reques
                io_sq_thread_park(ctx->sq_data);
        }
  
 -      if (files)
 -              io_cancel_defer_files(ctx, NULL, files);
 -      else
 -              io_cancel_defer_files(ctx, task, NULL);
 -
 +      io_cancel_defer_files(ctx, task, files);
 +      io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
        io_cqring_overflow_flush(ctx, true, task, files);
 +      io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
  
 -      while (__io_uring_cancel_task_requests(ctx, task, files)) {
 -              io_run_task_work();
 -              cond_resched();
 -      }
 +      if (!files)
 +              __io_uring_cancel_task_requests(ctx, task);
 +      else
 +              io_uring_cancel_files(ctx, task, files);
  
        if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
                atomic_dec(&task->io_uring->in_idle);
@@@ -9089,39 -8901,9 +9087,39 @@@ static void io_sqpoll_wait_sq(struct io
        finish_wait(&ctx->sqo_sq_wait, &wait);
  }
  
 +static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
 +                        struct __kernel_timespec __user **ts,
 +                        const sigset_t __user **sig)
 +{
 +      struct io_uring_getevents_arg arg;
 +
 +      /*
 +       * If EXT_ARG isn't set, then we have no timespec and the argp pointer
 +       * is just a pointer to the sigset_t.
 +       */
 +      if (!(flags & IORING_ENTER_EXT_ARG)) {
 +              *sig = (const sigset_t __user *) argp;
 +              *ts = NULL;
 +              return 0;
 +      }
 +
 +      /*
 +       * EXT_ARG is set - ensure we agree on the size of it and copy in our
 +       * timespec and sigset_t pointers if good.
 +       */
 +      if (*argsz != sizeof(arg))
 +              return -EINVAL;
 +      if (copy_from_user(&arg, argp, sizeof(arg)))
 +              return -EFAULT;
 +      *sig = u64_to_user_ptr(arg.sigmask);
 +      *argsz = arg.sigmask_sz;
 +      *ts = u64_to_user_ptr(arg.ts);
 +      return 0;
 +}
 +
  SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 -              u32, min_complete, u32, flags, const sigset_t __user *, sig,
 -              size_t, sigsz)
 +              u32, min_complete, u32, flags, const void __user *, argp,
 +              size_t, argsz)
  {
        struct io_ring_ctx *ctx;
        long ret = -EBADF;
        io_run_task_work();
  
        if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
 -                      IORING_ENTER_SQ_WAIT))
 +                      IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
                return -EINVAL;
  
        f = fdget(fd);
         */
        ret = 0;
        if (ctx->flags & IORING_SETUP_SQPOLL) {
 +              io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
                if (!list_empty_careful(&ctx->cq_overflow_list))
                        io_cqring_overflow_flush(ctx, false, NULL, NULL);
 +              io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
                if (flags & IORING_ENTER_SQ_WAKEUP)
                        wake_up(&ctx->sq_data->wait);
                if (flags & IORING_ENTER_SQ_WAIT)
                        goto out;
        }
        if (flags & IORING_ENTER_GETEVENTS) {
 +              const sigset_t __user *sig;
 +              struct __kernel_timespec __user *ts;
 +
 +              ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
 +              if (unlikely(ret))
 +                      goto out;
 +
                min_complete = min(min_complete, ctx->cq_entries);
  
                /*
                    !(ctx->flags & IORING_SETUP_SQPOLL)) {
                        ret = io_iopoll_check(ctx, min_complete);
                } else {
 -                      ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
 +                      ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
                }
        }
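As a rough illustration of the IORING_ENTER_EXT_ARG path wired up in the hunk above, a userspace caller now passes a struct io_uring_getevents_arg instead of a bare sigset_t pointer. This is a minimal sketch, assuming the uapi definitions from <linux/io_uring.h> and <linux/time_types.h>; the wrapper name wait_cqes_with_timeout() and the raw syscall(2) usage are illustrative only (liburing wraps the same mechanism).

#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>

/* Illustrative wrapper: wait for min_complete CQEs with a timeout. */
static int wait_cqes_with_timeout(int ring_fd, unsigned int min_complete,
                                  struct __kernel_timespec *ts)
{
        struct io_uring_getevents_arg arg;

        memset(&arg, 0, sizeof(arg));
        arg.sigmask = 0;                  /* no signal mask change */
        arg.sigmask_sz = 0;
        arg.ts = (uint64_t)(uintptr_t)ts; /* consumed by io_get_ext_arg() */

        /* argsz must equal sizeof(arg); io_get_ext_arg() rejects anything else. */
        return syscall(__NR_io_uring_enter, ring_fd, 0, min_complete,
                       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
                       &arg, sizeof(arg));
}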
  
@@@ -9381,7 -9154,6 +9379,7 @@@ static int io_uring_get_fd(struct io_ri
  {
        struct file *file;
        int ret;
 +      int fd;
  
  #if defined(CONFIG_UNIX)
        ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
        ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
        if (ret < 0)
                goto err;
 +      fd = ret;
  
        file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
                                        O_RDWR | O_CLOEXEC);
        if (IS_ERR(file)) {
 -err_fd:
 -              put_unused_fd(ret);
 +              put_unused_fd(fd);
                ret = PTR_ERR(file);
                goto err;
        }
  #if defined(CONFIG_UNIX)
        ctx->ring_sock->file = file;
  #endif
 -      if (unlikely(io_uring_add_task_file(ctx, file))) {
 -              file = ERR_PTR(-ENOMEM);
 -              goto err_fd;
 +      ret = io_uring_add_task_file(ctx, file);
 +      if (ret) {
 +              fput(file);
 +              put_unused_fd(fd);
 +              goto err;
        }
 -      fd_install(ret, file);
 -      return ret;
 +      fd_install(fd, file);
 +      return fd;
  err:
  #if defined(CONFIG_UNIX)
        sock_release(ctx->ring_sock);
@@@ -9453,16 -9223,14 +9451,16 @@@ static int io_uring_create(unsigned ent
                 * to a power-of-two, if it isn't already. We do NOT impose
                 * any cq vs sq ring sizing.
                 */
 -              p->cq_entries = roundup_pow_of_two(p->cq_entries);
 -              if (p->cq_entries < p->sq_entries)
 +              if (!p->cq_entries)
                        return -EINVAL;
                if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
                        if (!(p->flags & IORING_SETUP_CLAMP))
                                return -EINVAL;
                        p->cq_entries = IORING_MAX_CQ_ENTRIES;
                }
 +              p->cq_entries = roundup_pow_of_two(p->cq_entries);
 +              if (p->cq_entries < p->sq_entries)
 +                      return -EINVAL;
        } else {
                p->cq_entries = 2 * p->sq_entries;
        }
        p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
                        IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
                        IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
 -                      IORING_FEAT_POLL_32BITS;
 +                      IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
 +                      IORING_FEAT_EXT_ARG;
  
        if (copy_to_user(params, p, sizeof(*p))) {
                ret = -EFAULT;
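The features mask above now advertises IORING_FEAT_SQPOLL_NONFIXED and IORING_FEAT_EXT_ARG. A hedged sketch of probing for the new bits from userspace follows; the probe function name and the throwaway one-entry ring are illustrative choices, and liburing exposes the same information through its setup helpers.

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

/* Illustrative probe: create a tiny ring just to read back p.features. */
static int ring_has_ext_arg(void)
{
        struct io_uring_params p;
        int fd;

        memset(&p, 0, sizeof(p));
        fd = syscall(__NR_io_uring_setup, 1, &p);
        if (fd < 0)
                return 0;
        close(fd);
        return (p.features & IORING_FEAT_EXT_ARG) != 0;
}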
diff --combined fs/pstore/blk.c
@@@ -90,6 -90,7 +90,6 @@@ MODULE_PARM_DESC(blkdev, "block device 
  static DEFINE_MUTEX(pstore_blk_lock);
  static struct block_device *psblk_bdev;
  static struct pstore_zone_info *pstore_zone_info;
 -static pstore_blk_panic_write_op blkdev_panic_write;
  
  struct bdev_info {
        dev_t devt;
@@@ -244,7 -245,7 +244,7 @@@ static struct block_device *psblk_get_b
                        return bdev;
        }
  
-       nr_sects = part_nr_sects_read(bdev->bd_part);
+       nr_sects = bdev_nr_sectors(bdev);
        if (!nr_sects) {
                pr_err("not enough space for '%s'\n", blkdev);
                blkdev_put(bdev, mode);
@@@ -340,11 -341,24 +340,11 @@@ static ssize_t psblk_generic_blk_write(
        return ret;
  }
  
 -static ssize_t psblk_blk_panic_write(const char *buf, size_t size,
 -              loff_t off)
 -{
 -      int ret;
 -
 -      if (!blkdev_panic_write)
 -              return -EOPNOTSUPP;
 -
 -      /* size and off must align to SECTOR_SIZE for block device */
 -      ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT,
 -                      size >> SECTOR_SHIFT);
 -      /* try next zone */
 -      if (ret == -ENOMSG)
 -              return ret;
 -      return ret ? -EIO : size;
 -}
 -
 -static int __register_pstore_blk(struct pstore_blk_info *info)
 +/*
 + * This takes its configuration only from the module parameters now.
 + * See psblk_get_bdev() and blkdev.
 + */
 +static int __register_pstore_blk(void)
  {
        char bdev_name[BDEVNAME_SIZE];
        struct block_device *bdev;
        }
  
        /* only allow driver matching the @blkdev */
 -      if (!binfo.devt || (!best_effort &&
 -                          MAJOR(binfo.devt) != info->major)) {
 -              pr_debug("invalid major %u (expect %u)\n",
 -                              info->major, MAJOR(binfo.devt));
 +      if (!binfo.devt) {
 +              pr_debug("no major\n");
                ret = -ENODEV;
                goto err_put_bdev;
        }
  
        /* psblk_bdev must be assigned before register to pstore/blk */
        psblk_bdev = bdev;
 -      blkdev_panic_write = info->panic_write;
 -
 -      /* Copy back block device details. */
 -      info->devt = binfo.devt;
 -      info->nr_sects = binfo.nr_sects;
 -      info->start_sect = binfo.start_sect;
  
        memset(&dev, 0, sizeof(dev));
 -      dev.total_size = info->nr_sects << SECTOR_SHIFT;
 -      dev.flags = info->flags;
 +      dev.total_size = binfo.nr_sects << SECTOR_SHIFT;
        dev.read = psblk_generic_blk_read;
        dev.write = psblk_generic_blk_write;
 -      dev.erase = NULL;
 -      dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL;
  
        ret = __register_pstore_device(&dev);
        if (ret)
                goto err_put_bdev;
  
        bdevname(bdev, bdev_name);
 -      pr_info("attached %s%s\n", bdev_name,
 -              info->panic_write ? "" : " (no dedicated panic_write!)");
 +      pr_info("attached %s (no dedicated panic_write!)\n", bdev_name);
        return 0;
  
  err_put_bdev:
        psblk_bdev = NULL;
 -      blkdev_panic_write = NULL;
        psblk_put_bdev(bdev, holder);
        return ret;
  }
  
 -/**
 - * register_pstore_blk() - register block device to pstore/blk
 - *
 - * @info: details on the desired block device interface
 - *
 - * Return:
 - * * 0                - OK
 - * * Others   - something error.
 - */
 -int register_pstore_blk(struct pstore_blk_info *info)
 -{
 -      int ret;
 -
 -      mutex_lock(&pstore_blk_lock);
 -      ret = __register_pstore_blk(info);
 -      mutex_unlock(&pstore_blk_lock);
 -
 -      return ret;
 -}
 -EXPORT_SYMBOL_GPL(register_pstore_blk);
 -
  static void __unregister_pstore_blk(unsigned int major)
  {
        struct pstore_device_info dev = { .read = psblk_generic_blk_read };
        if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) {
                __unregister_pstore_device(&dev);
                psblk_put_bdev(psblk_bdev, holder);
 -              blkdev_panic_write = NULL;
                psblk_bdev = NULL;
        }
  }
  
 -/**
 - * unregister_pstore_blk() - unregister block device from pstore/blk
 - *
 - * @major: the major device number of device
 - */
 -void unregister_pstore_blk(unsigned int major)
 -{
 -      mutex_lock(&pstore_blk_lock);
 -      __unregister_pstore_blk(major);
 -      mutex_unlock(&pstore_blk_lock);
 -}
 -EXPORT_SYMBOL_GPL(unregister_pstore_blk);
 -
  /* get information of pstore/blk */
  int pstore_blk_get_config(struct pstore_blk_config *info)
  {
@@@ -421,11 -483,12 +421,11 @@@ EXPORT_SYMBOL_GPL(pstore_blk_get_config
  
  static int __init pstore_blk_init(void)
  {
 -      struct pstore_blk_info info = { };
        int ret = 0;
  
        mutex_lock(&pstore_blk_lock);
        if (!pstore_zone_info && best_effort && blkdev[0])
 -              ret = __register_pstore_blk(&info);
 +              ret = __register_pstore_blk();
        mutex_unlock(&pstore_blk_lock);
  
        return ret;
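The pstore/blk conversion above drops the hd_struct-based size lookup in favour of bdev_nr_sectors(). A minimal sketch of the size calculation now performed in __register_pstore_blk(); the helper name is made up for illustration:

#include <linux/blkdev.h>

/* Total usable size of the backing device in bytes: sectors from
 * bdev_nr_sectors(), scaled by SECTOR_SHIFT, as the hunk above does. */
static u64 psblk_example_total_size(struct block_device *bdev)
{
        return (u64)bdev_nr_sectors(bdev) << SECTOR_SHIFT;
}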
diff --combined include/linux/blkdev.h
@@@ -191,7 -191,7 +191,7 @@@ struct request 
        };
  
        struct gendisk *rq_disk;
-       struct hd_struct *part;
+       struct block_device *part;
  #ifdef CONFIG_BLK_RQ_ALLOC_TIME
        /* Time that the first bio started allocating this request. */
        u64 alloc_time_ns;
@@@ -1073,15 -1073,12 +1073,15 @@@ static inline unsigned int blk_queue_ge
   * file system requests.
   */
  static inline unsigned int blk_max_size_offset(struct request_queue *q,
 -                                             sector_t offset)
 +                                             sector_t offset,
 +                                             unsigned int chunk_sectors)
  {
 -      unsigned int chunk_sectors = q->limits.chunk_sectors;
 -
 -      if (!chunk_sectors)
 -              return q->limits.max_sectors;
 +      if (!chunk_sectors) {
 +              if (q->limits.chunk_sectors)
 +                      chunk_sectors = q->limits.chunk_sectors;
 +              else
 +                      return q->limits.max_sectors;
 +      }
  
        if (likely(is_power_of_2(chunk_sectors)))
                chunk_sectors -= offset & (chunk_sectors - 1);
@@@ -1104,7 -1101,7 +1104,7 @@@ static inline unsigned int blk_rq_get_m
            req_op(rq) == REQ_OP_SECURE_ERASE)
                return blk_queue_get_max_sectors(q, req_op(rq));
  
 -      return min(blk_max_size_offset(q, offset),
 +      return min(blk_max_size_offset(q, offset, 0),
                        blk_queue_get_max_sectors(q, req_op(rq)));
  }
  
@@@ -1491,7 -1488,7 +1491,7 @@@ static inline int bdev_alignment_offset
                return -1;
        if (bdev_is_partition(bdev))
                return queue_limit_alignment_offset(&q->limits,
-                               bdev->bd_part->start_sect);
+                               bdev->bd_start_sect);
        return q->limits.alignment_offset;
  }
  
@@@ -1532,7 -1529,7 +1532,7 @@@ static inline int bdev_discard_alignmen
  
        if (bdev_is_partition(bdev))
                return queue_limit_discard_alignment(&q->limits,
-                               bdev->bd_part->start_sect);
+                               bdev->bd_start_sect);
        return q->limits.discard_alignment;
  }
  
@@@ -1853,6 -1850,7 +1853,7 @@@ struct block_device_operations 
        void (*unlock_native_capacity) (struct gendisk *);
        int (*revalidate_disk) (struct gendisk *);
        int (*getgeo)(struct block_device *, struct hd_geometry *);
+       int (*set_read_only)(struct block_device *bdev, bool ro);
        /* this callback is with swap_lock and sometimes page table lock held */
        void (*swap_slot_free_notify) (struct block_device *, unsigned long);
        int (*report_zones)(struct gendisk *, sector_t sector,
@@@ -1869,8 -1867,6 +1870,6 @@@ extern int blkdev_compat_ptr_ioctl(stru
  #define blkdev_compat_ptr_ioctl NULL
  #endif
  
- extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
-                                unsigned long);
  extern int bdev_read_page(struct block_device *, sector_t, struct page *);
  extern int bdev_write_page(struct block_device *, sector_t, struct page *,
                                                struct writeback_control *);
@@@ -1947,9 -1943,9 +1946,9 @@@ unsigned long disk_start_io_acct(struc
  void disk_end_io_acct(struct gendisk *disk, unsigned int op,
                unsigned long start_time);
  
- unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
-                                struct bio *bio);
- void part_end_io_acct(struct hd_struct *part, struct bio *bio,
+ unsigned long part_start_io_acct(struct gendisk *disk,
+               struct block_device **part, struct bio *bio);
+ void part_end_io_acct(struct block_device *part, struct bio *bio,
                      unsigned long start_time);
  
  /**
@@@ -1977,7 -1973,7 +1976,7 @@@ int bdev_read_only(struct block_device 
  int set_blocksize(struct block_device *bdev, int size);
  
  const char *bdevname(struct block_device *bdev, char *buffer);
- struct block_device *lookup_bdev(const char *);
+ int lookup_bdev(const char *pathname, dev_t *dev);
  
  void blkdev_show(struct seq_file *seqf, off_t offset);
  
  struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
                void *holder);
  struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder);
- int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
-               void *holder);
- void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
-               void *holder);
+ int bd_prepare_to_claim(struct block_device *bdev, void *holder);
+ void bd_abort_claiming(struct block_device *bdev, void *holder);
  void blkdev_put(struct block_device *bdev, fmode_t mode);
  
+ /* just for blk-cgroup, don't use elsewhere */
+ struct block_device *blkdev_get_no_open(dev_t dev);
+ void blkdev_put_no_open(struct block_device *bdev);
+ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno);
+ void bdev_add(struct block_device *bdev, dev_t dev);
  struct block_device *I_BDEV(struct inode *inode);
- struct block_device *bdget_part(struct hd_struct *part);
  struct block_device *bdgrab(struct block_device *bdev);
  void bdput(struct block_device *);
  
@@@ -2024,7 -2023,7 +2026,7 @@@ static inline int sync_blockdev(struct 
  #endif
  int fsync_bdev(struct block_device *bdev);
  
- struct super_block *freeze_bdev(struct block_device *bdev);
- int thaw_bdev(struct block_device *bdev, struct super_block *sb);
+ int freeze_bdev(struct block_device *bdev);
+ int thaw_bdev(struct block_device *bdev);
  
  #endif /* _LINUX_BLKDEV_H */
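The final hunk changes the freeze/thaw contract: freeze_bdev() now returns an errno instead of a super_block, and thaw_bdev() no longer takes the sb back. A minimal caller sketch under the new convention; the function name is illustrative:

#include <linux/blkdev.h>

/* Illustrative: quiesce the device, do some work, then release it. */
static int example_with_frozen_bdev(struct block_device *bdev)
{
        int error;

        error = freeze_bdev(bdev);
        if (error)
                return error;

        /* ... operate on the frozen device here ... */

        return thaw_bdev(bdev);
}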
diff --combined include/linux/fs.h
@@@ -696,7 -696,6 +696,6 @@@ struct inode 
        struct list_head        i_devices;
        union {
                struct pipe_inode_info  *i_pipe;
-               struct block_device     *i_bdev;
                struct cdev             *i_cdev;
                char                    *i_link;
                unsigned                i_dir_seq;
@@@ -923,7 -922,7 +922,7 @@@ struct file 
        const struct file_operations    *f_op;
  
        /*
 -       * Protects f_ep_links, f_flags.
 +       * Protects f_ep, f_flags.
         * Must not be taken from IRQ context.
         */
        spinlock_t              f_lock;
  
  #ifdef CONFIG_EPOLL
        /* Used by fs/eventpoll.c to link all the hooks to this file */
 -      struct list_head        f_ep_links;
 -      struct list_head        f_tfile_llink;
 +      struct hlist_head       *f_ep;
  #endif /* #ifdef CONFIG_EPOLL */
        struct address_space    *f_mapping;
        errseq_t                f_wb_err;
@@@ -1408,7 -1408,7 +1407,7 @@@ enum 
  
  struct sb_writers {
        int                             frozen;         /* Is sb frozen? */
-       wait_queue_head_t               wait_unfrozen;  /* for get_super_thawed() */
+       wait_queue_head_t               wait_unfrozen;  /* wait for thaw */
        struct percpu_rw_semaphore      rw_sem[SB_FREEZE_LEVELS];
  };
  
@@@ -3131,8 -3131,6 +3130,6 @@@ extern struct file_system_type *get_fil
  extern void put_filesystem(struct file_system_type *fs);
  extern struct file_system_type *get_fs_type(const char *name);
  extern struct super_block *get_super(struct block_device *);
- extern struct super_block *get_super_thawed(struct block_device *);
- extern struct super_block *get_super_exclusive_thawed(struct block_device *bdev);
  extern struct super_block *get_active_super(struct block_device *bdev);
  extern void drop_super(struct super_block *sb);
  extern void drop_super_exclusive(struct super_block *sb);
@@@ -3229,7 -3227,7 +3226,7 @@@ static inline bool vma_is_fsdax(struct 
  {
        struct inode *inode;
  
 -      if (!vma->vm_file)
 +      if (!IS_ENABLED(CONFIG_FS_DAX) || !vma->vm_file)
                return false;
        if (!vma_is_dax(vma))
                return false;
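One consequence of removing i_bdev from the inode union above: device special inodes no longer cache a block_device pointer, so lookups go by dev_t (e.g. blkdev_get_by_dev() on inode->i_rdev), while code that already holds an open struct file can reach the bdev through the bdev filesystem inode. A hedged sketch of the latter; the helper name is made up, and it is only meaningful for a file opened on a block device node:

#include <linux/blkdev.h>
#include <linux/fs.h>

/* Illustrative: for a bdev file, f_mapping->host is the bdevfs inode. */
static struct block_device *example_file_bdev(struct file *file)
{
        return I_BDEV(file->f_mapping->host);
}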
diff --combined kernel/trace/blktrace.c
@@@ -449,7 -449,7 +449,7 @@@ static struct dentry *blk_create_buf_fi
                                        &relay_file_operations);
  }
  
 -static struct rchan_callbacks blk_relay_callbacks = {
 +static const struct rchan_callbacks blk_relay_callbacks = {
        .subbuf_start           = blk_subbuf_start_callback,
        .create_buf_file        = blk_create_buf_file_callback,
        .remove_buf_file        = blk_remove_buf_file_callback,
  static void blk_trace_setup_lba(struct blk_trace *bt,
                                struct block_device *bdev)
  {
-       struct hd_struct *part = NULL;
-       if (bdev)
-               part = bdev->bd_part;
-       if (part) {
-               bt->start_lba = part->start_sect;
-               bt->end_lba = part->start_sect + part->nr_sects;
+       if (bdev) {
+               bt->start_lba = bdev->bd_start_sect;
+               bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev);
        } else {
                bt->start_lba = 0;
                bt->end_lba = -1ULL;
@@@ -800,12 -795,12 +795,12 @@@ static u64 blk_trace_bio_get_cgid(struc
  #endif
  
  static u64
- blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
+ blk_trace_request_get_cgid(struct request *rq)
  {
        if (!rq->bio)
                return 0;
        /* Use the first bio */
-       return blk_trace_bio_get_cgid(q, rq->bio);
+       return blk_trace_bio_get_cgid(rq->q, rq->bio);
  }
  
  /*
@@@ -846,40 -841,35 +841,35 @@@ static void blk_add_trace_rq(struct req
        rcu_read_unlock();
  }
  
- static void blk_add_trace_rq_insert(void *ignore,
-                                   struct request_queue *q, struct request *rq)
+ static void blk_add_trace_rq_insert(void *ignore, struct request *rq)
  {
        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
-                        blk_trace_request_get_cgid(q, rq));
+                        blk_trace_request_get_cgid(rq));
  }
  
- static void blk_add_trace_rq_issue(void *ignore,
-                                  struct request_queue *q, struct request *rq)
+ static void blk_add_trace_rq_issue(void *ignore, struct request *rq)
  {
        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
-                        blk_trace_request_get_cgid(q, rq));
+                        blk_trace_request_get_cgid(rq));
  }
  
- static void blk_add_trace_rq_merge(void *ignore,
-                                  struct request_queue *q, struct request *rq)
+ static void blk_add_trace_rq_merge(void *ignore, struct request *rq)
  {
        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE,
-                        blk_trace_request_get_cgid(q, rq));
+                        blk_trace_request_get_cgid(rq));
  }
  
- static void blk_add_trace_rq_requeue(void *ignore,
-                                    struct request_queue *q,
-                                    struct request *rq)
+ static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
  {
        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
-                        blk_trace_request_get_cgid(q, rq));
+                        blk_trace_request_get_cgid(rq));
  }
  
  static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
                        int error, unsigned int nr_bytes)
  {
        blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
-                        blk_trace_request_get_cgid(rq->q, rq));
+                        blk_trace_request_get_cgid(rq));
  }
  
  /**
@@@ -911,10 -901,9 +901,9 @@@ static void blk_add_trace_bio(struct re
        rcu_read_unlock();
  }
  
- static void blk_add_trace_bio_bounce(void *ignore,
-                                    struct request_queue *q, struct bio *bio)
+ static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
  {
-       blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+       blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0);
  }
  
  static void blk_add_trace_bio_complete(void *ignore,
                          blk_status_to_errno(bio->bi_status));
  }
  
- static void blk_add_trace_bio_backmerge(void *ignore,
-                                       struct request_queue *q,
-                                       struct request *rq,
-                                       struct bio *bio)
+ static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio)
  {
-       blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+       blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0);
  }
  
- static void blk_add_trace_bio_frontmerge(void *ignore,
-                                        struct request_queue *q,
-                                        struct request *rq,
-                                        struct bio *bio)
+ static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio)
  {
-       blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+       blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0);
  }
  
- static void blk_add_trace_bio_queue(void *ignore,
-                                   struct request_queue *q, struct bio *bio)
+ static void blk_add_trace_bio_queue(void *ignore, struct bio *bio)
  {
-       blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
+       blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0);
  }
  
- static void blk_add_trace_getrq(void *ignore,
-                               struct request_queue *q,
-                               struct bio *bio, int rw)
+ static void blk_add_trace_getrq(void *ignore, struct bio *bio)
  {
-       if (bio)
-               blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
-       else {
-               struct blk_trace *bt;
-               rcu_read_lock();
-               bt = rcu_dereference(q->blk_trace);
-               if (bt)
-                       __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
-                                       NULL, 0);
-               rcu_read_unlock();
-       }
- }
- static void blk_add_trace_sleeprq(void *ignore,
-                                 struct request_queue *q,
-                                 struct bio *bio, int rw)
- {
-       if (bio)
-               blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
-       else {
-               struct blk_trace *bt;
-               rcu_read_lock();
-               bt = rcu_dereference(q->blk_trace);
-               if (bt)
-                       __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
-                                       0, 0, NULL, 0);
-               rcu_read_unlock();
-       }
+       blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0);
  }
  
  static void blk_add_trace_plug(void *ignore, struct request_queue *q)
@@@ -1015,10 -965,9 +965,9 @@@ static void blk_add_trace_unplug(void *
        rcu_read_unlock();
  }
  
- static void blk_add_trace_split(void *ignore,
-                               struct request_queue *q, struct bio *bio,
-                               unsigned int pdu)
+ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
  {
+       struct request_queue *q = bio->bi_disk->queue;
        struct blk_trace *bt;
  
        rcu_read_lock();
  /**
   * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
   * @ignore:   trace callback data parameter (not used)
-  * @q:                queue the io is for
   * @bio:      the source bio
-  * @dev:      target device
+  * @dev:      source device
   * @from:     source sector
   *
-  * Description:
-  *     Device mapper or raid target sometimes need to split a bio because
-  *     it spans a stripe (or similar). Add a trace for that action.
-  *
+  * Called after a bio is remapped to a different device and/or sector.
   **/
- static void blk_add_trace_bio_remap(void *ignore,
-                                   struct request_queue *q, struct bio *bio,
-                                   dev_t dev, sector_t from)
+ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
+                                   sector_t from)
  {
+       struct request_queue *q = bio->bi_disk->queue;
        struct blk_trace *bt;
        struct blk_io_trace_remap r;
  
  /**
   * blk_add_trace_rq_remap - Add a trace for a request-remap operation
   * @ignore:   trace callback data parameter (not used)
-  * @q:                queue the io is for
   * @rq:               the source request
   * @dev:      target device
   * @from:     source sector
   *     Add a trace for that action.
   *
   **/
- static void blk_add_trace_rq_remap(void *ignore,
-                                  struct request_queue *q,
-                                  struct request *rq, dev_t dev,
+ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
                                   sector_t from)
  {
        struct blk_trace *bt;
        struct blk_io_trace_remap r;
  
        rcu_read_lock();
-       bt = rcu_dereference(q->blk_trace);
+       bt = rcu_dereference(rq->q->blk_trace);
        if (likely(!bt)) {
                rcu_read_unlock();
                return;
  
        __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
                        rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
-                       sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
+                       sizeof(r), &r, blk_trace_request_get_cgid(rq));
        rcu_read_unlock();
  }
  
  /**
   * blk_add_driver_data - Add binary message with driver-specific data
-  * @q:                queue the io is for
   * @rq:               io request
   * @data:     driver-specific data
   * @len:      length of driver-specific data
   *     Some drivers might want to write driver-specific data per request.
   *
   **/
- void blk_add_driver_data(struct request_queue *q,
-                        struct request *rq,
-                        void *data, size_t len)
+ void blk_add_driver_data(struct request *rq, void *data, size_t len)
  {
        struct blk_trace *bt;
  
        rcu_read_lock();
-       bt = rcu_dereference(q->blk_trace);
+       bt = rcu_dereference(rq->q->blk_trace);
        if (likely(!bt)) {
                rcu_read_unlock();
                return;
  
        __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
                                BLK_TA_DRV_DATA, 0, len, data,
-                               blk_trace_request_get_cgid(q, rq));
+                               blk_trace_request_get_cgid(rq));
        rcu_read_unlock();
  }
  EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@@ -1169,8 -1108,6 +1108,6 @@@ static void blk_register_tracepoints(vo
        WARN_ON(ret);
        ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
-       WARN_ON(ret);
        ret = register_trace_block_plug(blk_add_trace_plug, NULL);
        WARN_ON(ret);
        ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
@@@ -1190,7 -1127,6 +1127,6 @@@ static void blk_unregister_tracepoints(
        unregister_trace_block_split(blk_add_trace_split, NULL);
        unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
        unregister_trace_block_plug(blk_add_trace_plug, NULL);
-       unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
        unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
        unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
        unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
@@@ -1815,30 -1751,15 +1751,15 @@@ static ssize_t blk_trace_mask2str(char 
        return p - buf;
  }
  
- static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
- {
-       if (bdev->bd_disk == NULL)
-               return NULL;
-       return bdev_get_queue(bdev);
- }
  static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
                                         struct device_attribute *attr,
                                         char *buf)
  {
-       struct block_device *bdev = bdget_part(dev_to_part(dev));
-       struct request_queue *q;
+       struct block_device *bdev = dev_to_bdev(dev);
+       struct request_queue *q = bdev_get_queue(bdev);
        struct blk_trace *bt;
        ssize_t ret = -ENXIO;
  
-       if (bdev == NULL)
-               goto out;
-       q = blk_trace_get_queue(bdev);
-       if (q == NULL)
-               goto out_bdput;
        mutex_lock(&q->debugfs_mutex);
  
        bt = rcu_dereference_protected(q->blk_trace,
  
  out_unlock_bdev:
        mutex_unlock(&q->debugfs_mutex);
- out_bdput:
-       bdput(bdev);
- out:
        return ret;
  }
  
@@@ -1871,8 -1789,8 +1789,8 @@@ static ssize_t sysfs_blk_trace_attr_sto
                                          struct device_attribute *attr,
                                          const char *buf, size_t count)
  {
-       struct block_device *bdev;
-       struct request_queue *q;
+       struct block_device *bdev = dev_to_bdev(dev);
+       struct request_queue *q = bdev_get_queue(bdev);
        struct blk_trace *bt;
        u64 value;
        ssize_t ret = -EINVAL;
                                goto out;
                        value = ret;
                }
-       } else if (kstrtoull(buf, 0, &value))
-               goto out;
-       ret = -ENXIO;
-       bdev = bdget_part(dev_to_part(dev));
-       if (bdev == NULL)
-               goto out;
-       q = blk_trace_get_queue(bdev);
-       if (q == NULL)
-               goto out_bdput;
+       } else {
+               if (kstrtoull(buf, 0, &value))
+                       goto out;
+       }
  
        mutex_lock(&q->debugfs_mutex);
  
  
  out_unlock_bdev:
        mutex_unlock(&q->debugfs_mutex);
- out_bdput:
-       bdput(bdev);
  out:
        return ret ? ret : count;
  }
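The blktrace rework above drops the request_queue argument from the request-based tracepoints and from blk_add_driver_data(), which now takes the queue from rq->q. A hedged sketch of the updated driver-side call; the payload is purely illustrative:

#include <linux/blkdev.h>
#include <linux/blktrace_api.h>

/* Illustrative: attach a small driver-specific blob to a request trace. */
static void example_trace_driver_data(struct request *rq, u32 status)
{
        blk_add_driver_data(rq, &status, sizeof(status));
}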
diff --combined mm/filemap.c
   *    ->swap_lock             (try_to_unmap_one)
   *    ->private_lock          (try_to_unmap_one)
   *    ->i_pages lock          (try_to_unmap_one)
 - *    ->pgdat->lru_lock               (follow_page->mark_page_accessed)
 - *    ->pgdat->lru_lock               (check_pte_range->isolate_lru_page)
 + *    ->lruvec->lru_lock      (follow_page->mark_page_accessed)
 + *    ->lruvec->lru_lock      (check_pte_range->isolate_lru_page)
   *    ->private_lock          (page_remove_rmap->set_page_dirty)
   *    ->i_pages lock          (page_remove_rmap->set_page_dirty)
   *    bdi.wb->list_lock               (page_remove_rmap->set_page_dirty)
@@@ -204,9 -204,9 +204,9 @@@ static void unaccount_page_cache_page(s
        if (PageSwapBacked(page)) {
                __mod_lruvec_page_state(page, NR_SHMEM, -nr);
                if (PageTransHuge(page))
 -                      __dec_node_page_state(page, NR_SHMEM_THPS);
 +                      __dec_lruvec_page_state(page, NR_SHMEM_THPS);
        } else if (PageTransHuge(page)) {
 -              __dec_node_page_state(page, NR_FILE_THPS);
 +              __dec_lruvec_page_state(page, NR_FILE_THPS);
                filemap_nr_thps_dec(mapping);
        }
  
@@@ -1359,7 -1359,7 +1359,7 @@@ static int __wait_on_page_locked_async(
        else
                ret = PageLocked(page);
        /*
 -       * If we were succesful now, we know we're still on the
 +       * If we were successful now, we know we're still on the
         * waitqueue as we're still under the lock. This means it's
         * safe to remove and return success, we know the callback
         * isn't going to trigger.
@@@ -1484,19 -1484,11 +1484,19 @@@ void end_page_writeback(struct page *pa
                rotate_reclaimable_page(page);
        }
  
 +      /*
 +       * Writeback does not hold a page reference of its own, relying
 +       * on truncation to wait for the clearing of PG_writeback.
 +       * But here we must make sure that the page is not freed and
 +       * reused before the wake_up_page().
 +       */
 +      get_page(page);
        if (!test_clear_page_writeback(page))
                BUG();
  
        smp_mb__after_atomic();
        wake_up_page(page, PG_writeback);
 +      put_page(page);
  }
  EXPORT_SYMBOL(end_page_writeback);
  
@@@ -1583,20 -1575,19 +1583,20 @@@ int __lock_page_or_retry(struct page *p
                else
                        wait_on_page_locked(page);
                return 0;
 -      } else {
 -              if (flags & FAULT_FLAG_KILLABLE) {
 -                      int ret;
 +      }
 +      if (flags & FAULT_FLAG_KILLABLE) {
 +              int ret;
  
 -                      ret = __lock_page_killable(page);
 -                      if (ret) {
 -                              mmap_read_unlock(mm);
 -                              return 0;
 -                      }
 -              } else
 -                      __lock_page(page);
 -              return 1;
 +              ret = __lock_page_killable(page);
 +              if (ret) {
 +                      mmap_read_unlock(mm);
 +                      return 0;
 +              }
 +      } else {
 +              __lock_page(page);
        }
 +      return 1;
 +
  }
  
  /**
@@@ -2167,259 -2158,6 +2167,259 @@@ static void shrink_readahead_size_eio(s
        ra->ra_pages /= 4;
  }
  
 +static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
 +{
 +      if (iocb->ki_flags & IOCB_WAITQ)
 +              return lock_page_async(page, iocb->ki_waitq);
 +      else if (iocb->ki_flags & IOCB_NOWAIT)
 +              return trylock_page(page) ? 0 : -EAGAIN;
 +      else
 +              return lock_page_killable(page);
 +}
 +
 +static struct page *
 +generic_file_buffered_read_readpage(struct kiocb *iocb,
 +                                  struct file *filp,
 +                                  struct address_space *mapping,
 +                                  struct page *page)
 +{
 +      struct file_ra_state *ra = &filp->f_ra;
 +      int error;
 +
 +      if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
 +              unlock_page(page);
 +              put_page(page);
 +              return ERR_PTR(-EAGAIN);
 +      }
 +
 +      /*
 +       * A previous I/O error may have been due to temporary
 +       * failures, eg. multipath errors.
 +       * PG_error will be set again if readpage fails.
 +       */
 +      ClearPageError(page);
 +      /* Start the actual read. The read will unlock the page. */
 +      error = mapping->a_ops->readpage(filp, page);
 +
 +      if (unlikely(error)) {
 +              put_page(page);
 +              return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
 +      }
 +
 +      if (!PageUptodate(page)) {
 +              error = lock_page_for_iocb(iocb, page);
 +              if (unlikely(error)) {
 +                      put_page(page);
 +                      return ERR_PTR(error);
 +              }
 +              if (!PageUptodate(page)) {
 +                      if (page->mapping == NULL) {
 +                              /*
 +                               * invalidate_mapping_pages got it
 +                               */
 +                              unlock_page(page);
 +                              put_page(page);
 +                              return NULL;
 +                      }
 +                      unlock_page(page);
 +                      shrink_readahead_size_eio(ra);
 +                      put_page(page);
 +                      return ERR_PTR(-EIO);
 +              }
 +              unlock_page(page);
 +      }
 +
 +      return page;
 +}
 +
 +static struct page *
 +generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb,
 +                                         struct file *filp,
 +                                         struct iov_iter *iter,
 +                                         struct page *page,
 +                                         loff_t pos, loff_t count)
 +{
 +      struct address_space *mapping = filp->f_mapping;
 +      struct inode *inode = mapping->host;
 +      int error;
 +
 +      /*
 +       * See comment in do_read_cache_page on why
 +       * wait_on_page_locked is used to avoid unnecessarily
 +       * serialisations and why it's safe.
 +       */
 +      if (iocb->ki_flags & IOCB_WAITQ) {
 +              error = wait_on_page_locked_async(page,
 +                                              iocb->ki_waitq);
 +      } else {
 +              error = wait_on_page_locked_killable(page);
 +      }
 +      if (unlikely(error)) {
 +              put_page(page);
 +              return ERR_PTR(error);
 +      }
 +      if (PageUptodate(page))
 +              return page;
 +
 +      if (inode->i_blkbits == PAGE_SHIFT ||
 +                      !mapping->a_ops->is_partially_uptodate)
 +              goto page_not_up_to_date;
 +      /* pipes can't handle partially uptodate pages */
 +      if (unlikely(iov_iter_is_pipe(iter)))
 +              goto page_not_up_to_date;
 +      if (!trylock_page(page))
 +              goto page_not_up_to_date;
 +      /* Did it get truncated before we got the lock? */
 +      if (!page->mapping)
 +              goto page_not_up_to_date_locked;
 +      if (!mapping->a_ops->is_partially_uptodate(page,
 +                              pos & ~PAGE_MASK, count))
 +              goto page_not_up_to_date_locked;
 +      unlock_page(page);
 +      return page;
 +
 +page_not_up_to_date:
 +      /* Get exclusive access to the page ... */
 +      error = lock_page_for_iocb(iocb, page);
 +      if (unlikely(error)) {
 +              put_page(page);
 +              return ERR_PTR(error);
 +      }
 +
 +page_not_up_to_date_locked:
 +      /* Did it get truncated before we got the lock? */
 +      if (!page->mapping) {
 +              unlock_page(page);
 +              put_page(page);
 +              return NULL;
 +      }
 +
 +      /* Did somebody else fill it already? */
 +      if (PageUptodate(page)) {
 +              unlock_page(page);
 +              return page;
 +      }
 +
 +      return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
 +}
 +
 +static struct page *
 +generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
 +                                        struct iov_iter *iter)
 +{
 +      struct file *filp = iocb->ki_filp;
 +      struct address_space *mapping = filp->f_mapping;
 +      pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
 +      struct page *page;
 +      int error;
 +
 +      if (iocb->ki_flags & IOCB_NOIO)
 +              return ERR_PTR(-EAGAIN);
 +
 +      /*
 +       * Ok, it wasn't cached, so we need to create a new
 +       * page..
 +       */
 +      page = page_cache_alloc(mapping);
 +      if (!page)
 +              return ERR_PTR(-ENOMEM);
 +
 +      error = add_to_page_cache_lru(page, mapping, index,
 +                                    mapping_gfp_constraint(mapping, GFP_KERNEL));
 +      if (error) {
 +              put_page(page);
 +              return error != -EEXIST ? ERR_PTR(error) : NULL;
 +      }
 +
 +      return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
 +}
 +
 +static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
 +                                              struct iov_iter *iter,
 +                                              struct page **pages,
 +                                              unsigned int nr)
 +{
 +      struct file *filp = iocb->ki_filp;
 +      struct address_space *mapping = filp->f_mapping;
 +      struct file_ra_state *ra = &filp->f_ra;
 +      pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
 +      pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
 +      int i, j, nr_got, err = 0;
 +
 +      nr = min_t(unsigned long, last_index - index, nr);
 +find_page:
 +      if (fatal_signal_pending(current))
 +              return -EINTR;
 +
 +      nr_got = find_get_pages_contig(mapping, index, nr, pages);
 +      if (nr_got)
 +              goto got_pages;
 +
 +      if (iocb->ki_flags & IOCB_NOIO)
 +              return -EAGAIN;
 +
 +      page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
 +
 +      nr_got = find_get_pages_contig(mapping, index, nr, pages);
 +      if (nr_got)
 +              goto got_pages;
 +
 +      pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
 +      err = PTR_ERR_OR_ZERO(pages[0]);
 +      if (!IS_ERR_OR_NULL(pages[0]))
 +              nr_got = 1;
 +got_pages:
 +      for (i = 0; i < nr_got; i++) {
 +              struct page *page = pages[i];
 +              pgoff_t pg_index = index + i;
 +              loff_t pg_pos = max(iocb->ki_pos,
 +                                  (loff_t) pg_index << PAGE_SHIFT);
 +              loff_t pg_count = iocb->ki_pos + iter->count - pg_pos;
 +
 +              if (PageReadahead(page)) {
 +                      if (iocb->ki_flags & IOCB_NOIO) {
 +                              for (j = i; j < nr_got; j++)
 +                                      put_page(pages[j]);
 +                              nr_got = i;
 +                              err = -EAGAIN;
 +                              break;
 +                      }
 +                      page_cache_async_readahead(mapping, ra, filp, page,
 +                                      pg_index, last_index - pg_index);
 +              }
 +
 +              if (!PageUptodate(page)) {
 +                      if ((iocb->ki_flags & IOCB_NOWAIT) ||
 +                          ((iocb->ki_flags & IOCB_WAITQ) && i)) {
 +                              for (j = i; j < nr_got; j++)
 +                                      put_page(pages[j]);
 +                              nr_got = i;
 +                              err = -EAGAIN;
 +                              break;
 +                      }
 +
 +                      page = generic_file_buffered_read_pagenotuptodate(iocb,
 +                                      filp, iter, page, pg_pos, pg_count);
 +                      if (IS_ERR_OR_NULL(page)) {
 +                              for (j = i + 1; j < nr_got; j++)
 +                                      put_page(pages[j]);
 +                              nr_got = i;
 +                              err = PTR_ERR_OR_ZERO(page);
 +                              break;
 +                      }
 +              }
 +      }
 +
 +      if (likely(nr_got))
 +              return nr_got;
 +      if (err)
 +              return err;
 +      /*
 +       * No pages and no error means we raced and should retry:
 +       */
 +      goto find_page;
 +}
 +
  /**
   * generic_file_buffered_read - generic file read routine
   * @iocb:     the iocb to read
@@@ -2440,117 -2178,284 +2440,117 @@@ ssize_t generic_file_buffered_read(stru
                struct iov_iter *iter, ssize_t written)
  {
        struct file *filp = iocb->ki_filp;
 +      struct file_ra_state *ra = &filp->f_ra;
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;
 -      struct file_ra_state *ra = &filp->f_ra;
 -      loff_t *ppos = &iocb->ki_pos;
 -      pgoff_t index;
 -      pgoff_t last_index;
 -      pgoff_t prev_index;
 -      unsigned long offset;      /* offset into pagecache page */
 -      unsigned int prev_offset;
 -      int error = 0;
 -
 -      if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
 +      struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
 +      unsigned int nr_pages = min_t(unsigned int, 512,
 +                      ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
 +                      (iocb->ki_pos >> PAGE_SHIFT));
 +      int i, pg_nr, error = 0;
 +      bool writably_mapped;
 +      loff_t isize, end_offset;
 +
 +      if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
                return 0;
        iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
  
 -      index = *ppos >> PAGE_SHIFT;
 -      prev_index = ra->prev_pos >> PAGE_SHIFT;
 -      prev_offset = ra->prev_pos & (PAGE_SIZE-1);
 -      last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
 -      offset = *ppos & ~PAGE_MASK;
 +      if (nr_pages > ARRAY_SIZE(pages_onstack))
 +              pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
  
 -      /*
 -       * If we've already successfully copied some data, then we
 -       * can no longer safely return -EIOCBQUEUED. Hence mark
 -       * an async read NOWAIT at that point.
 -       */
 -      if (written && (iocb->ki_flags & IOCB_WAITQ))
 -              iocb->ki_flags |= IOCB_NOWAIT;
 -
 -      for (;;) {
 -              struct page *page;
 -              pgoff_t end_index;
 -              loff_t isize;
 -              unsigned long nr, ret;
 +      if (!pages) {
 +              pages = pages_onstack;
 +              nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
 +      }
  
 +      do {
                cond_resched();
 -find_page:
 -              if (fatal_signal_pending(current)) {
 -                      error = -EINTR;
 -                      goto out;
 -              }
  
 -              page = find_get_page(mapping, index);
 -              if (!page) {
 -                      if (iocb->ki_flags & IOCB_NOIO)
 -                              goto would_block;
 -                      page_cache_sync_readahead(mapping,
 -                                      ra, filp,
 -                                      index, last_index - index);
 -                      page = find_get_page(mapping, index);
 -                      if (unlikely(page == NULL))
 -                              goto no_cached_page;
 -              }
 -              if (PageReadahead(page)) {
 -                      if (iocb->ki_flags & IOCB_NOIO) {
 -                              put_page(page);
 -                              goto out;
 -                      }
 -                      page_cache_async_readahead(mapping,
 -                                      ra, filp, page,
 -                                      index, last_index - index);
 -              }
 -              if (!PageUptodate(page)) {
 -                      /*
 -                       * See comment in do_read_cache_page on why
 -                       * wait_on_page_locked is used to avoid unnecessarily
 -                       * serialisations and why it's safe.
 -                       */
 -                      if (iocb->ki_flags & IOCB_WAITQ) {
 -                              if (written) {
 -                                      put_page(page);
 -                                      goto out;
 -                              }
 -                              error = wait_on_page_locked_async(page,
 -                                                              iocb->ki_waitq);
 -                      } else {
 -                              if (iocb->ki_flags & IOCB_NOWAIT) {
 -                                      put_page(page);
 -                                      goto would_block;
 -                              }
 -                              error = wait_on_page_locked_killable(page);
 -                      }
 -                      if (unlikely(error))
 -                              goto readpage_error;
 -                      if (PageUptodate(page))
 -                              goto page_ok;
 -
 -                      if (inode->i_blkbits == PAGE_SHIFT ||
 -                                      !mapping->a_ops->is_partially_uptodate)
 -                              goto page_not_up_to_date;
 -                      /* pipes can't handle partially uptodate pages */
 -                      if (unlikely(iov_iter_is_pipe(iter)))
 -                              goto page_not_up_to_date;
 -                      if (!trylock_page(page))
 -                              goto page_not_up_to_date;
 -                      /* Did it get truncated before we got the lock? */
 -                      if (!page->mapping)
 -                              goto page_not_up_to_date_locked;
 -                      if (!mapping->a_ops->is_partially_uptodate(page,
 -                                                      offset, iter->count))
 -                              goto page_not_up_to_date_locked;
 -                      unlock_page(page);
 +              /*
 +               * If we've already successfully copied some data, then we
 +               * can no longer safely return -EIOCBQUEUED. Hence mark
 +               * an async read NOWAIT at that point.
 +               */
 +              if ((iocb->ki_flags & IOCB_WAITQ) && written)
 +                      iocb->ki_flags |= IOCB_NOWAIT;
 +
 +              i = 0;
 +              pg_nr = generic_file_buffered_read_get_pages(iocb, iter,
 +                                                           pages, nr_pages);
 +              if (pg_nr < 0) {
 +                      error = pg_nr;
 +                      break;
                }
 -page_ok:
 +
                /*
 -               * i_size must be checked after we know the page is Uptodate.
 +               * i_size must be checked after we know the pages are Uptodate.
                 *
                 * Checking i_size after the check allows us to calculate
                 * the correct number of bytes to copy from each page, so the zero-filled
                 * part of the page is not copied back to userspace (unless
                 * another truncate extends the file - this is desired though).
                 */
 -
                isize = i_size_read(inode);
 -              end_index = (isize - 1) >> PAGE_SHIFT;
 -              if (unlikely(!isize || index > end_index)) {
 -                      put_page(page);
 -                      goto out;
 -              }
 +              if (unlikely(iocb->ki_pos >= isize))
 +                      goto put_pages;
  
 -              /* nr is the maximum number of bytes to copy from this page */
 -              nr = PAGE_SIZE;
 -              if (index == end_index) {
 -                      nr = ((isize - 1) & ~PAGE_MASK) + 1;
 -                      if (nr <= offset) {
 -                              put_page(page);
 -                              goto out;
 -                      }
 -              }
 -              nr = nr - offset;
 +              end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
  
 -              /* If users can be writing to this page using arbitrary
 -               * virtual addresses, take care about potential aliasing
 -               * before reading the page on the kernel side.
 -               */
 -              if (mapping_writably_mapped(mapping))
 -                      flush_dcache_page(page);
 +              while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr >
 +                     (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT)
 +                      put_page(pages[--pg_nr]);
  
                /*
 -               * When a sequential read accesses a page several times,
 -               * only mark it as accessed the first time.
 +               * Once we start copying data, we don't want to be touching any
 +               * cachelines that might be contended:
                 */
 -              if (prev_index != index || offset != prev_offset)
 -                      mark_page_accessed(page);
 -              prev_index = index;
 +              writably_mapped = mapping_writably_mapped(mapping);
  
                /*
 -               * Ok, we have the page, and it's up-to-date, so
 -               * now we can copy it to user space...
 +               * When a sequential read accesses a page several times, only
 +               * mark it as accessed the first time.
                 */
 +              if (iocb->ki_pos >> PAGE_SHIFT !=
 +                  ra->prev_pos >> PAGE_SHIFT)
 +                      mark_page_accessed(pages[0]);
 +              for (i = 1; i < pg_nr; i++)
 +                      mark_page_accessed(pages[i]);
 +
 +              for (i = 0; i < pg_nr; i++) {
 +                      unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
 +                      unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos,
 +                                                 PAGE_SIZE - offset);
 +                      unsigned int copied;
  
 -              ret = copy_page_to_iter(page, offset, nr, iter);
 -              offset += ret;
 -              index += offset >> PAGE_SHIFT;
 -              offset &= ~PAGE_MASK;
 -              prev_offset = offset;
 -
 -              put_page(page);
 -              written += ret;
 -              if (!iov_iter_count(iter))
 -                      goto out;
 -              if (ret < nr) {
 -                      error = -EFAULT;
 -                      goto out;
 -              }
 -              continue;
 -
 -page_not_up_to_date:
 -              /* Get exclusive access to the page ... */
 -              if (iocb->ki_flags & IOCB_WAITQ)
 -                      error = lock_page_async(page, iocb->ki_waitq);
 -              else
 -                      error = lock_page_killable(page);
 -              if (unlikely(error))
 -                      goto readpage_error;
 -
 -page_not_up_to_date_locked:
 -              /* Did it get truncated before we got the lock? */
 -              if (!page->mapping) {
 -                      unlock_page(page);
 -                      put_page(page);
 -                      continue;
 -              }
 -
 -              /* Did somebody else fill it already? */
 -              if (PageUptodate(page)) {
 -                      unlock_page(page);
 -                      goto page_ok;
 -              }
 +                      /*
 +                       * If users can be writing to this page using arbitrary
 +                       * virtual addresses, take care about potential aliasing
 +                       * before reading the page on the kernel side.
 +                       */
 +                      if (writably_mapped)
 +                              flush_dcache_page(pages[i]);
  
 -readpage:
 -              if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
 -                      unlock_page(page);
 -                      put_page(page);
 -                      goto would_block;
 -              }
 -              /*
 -               * A previous I/O error may have been due to temporary
 -               * failures, eg. multipath errors.
 -               * PG_error will be set again if readpage fails.
 -               */
 -              ClearPageError(page);
 -              /* Start the actual read. The read will unlock the page. */
 -              error = mapping->a_ops->readpage(filp, page);
 +                      copied = copy_page_to_iter(pages[i], offset, bytes, iter);
  
 -              if (unlikely(error)) {
 -                      if (error == AOP_TRUNCATED_PAGE) {
 -                              put_page(page);
 -                              error = 0;
 -                              goto find_page;
 -                      }
 -                      goto readpage_error;
 -              }
 +                      written += copied;
 +                      iocb->ki_pos += copied;
 +                      ra->prev_pos = iocb->ki_pos;
  
 -              if (!PageUptodate(page)) {
 -                      if (iocb->ki_flags & IOCB_WAITQ)
 -                              error = lock_page_async(page, iocb->ki_waitq);
 -                      else
 -                              error = lock_page_killable(page);
 -
 -                      if (unlikely(error))
 -                              goto readpage_error;
 -                      if (!PageUptodate(page)) {
 -                              if (page->mapping == NULL) {
 -                                      /*
 -                                       * invalidate_mapping_pages got it
 -                                       */
 -                                      unlock_page(page);
 -                                      put_page(page);
 -                                      goto find_page;
 -                              }
 -                              unlock_page(page);
 -                              shrink_readahead_size_eio(ra);
 -                              error = -EIO;
 -                              goto readpage_error;
 +                      if (copied < bytes) {
 +                              error = -EFAULT;
 +                              break;
                        }
 -                      unlock_page(page);
                }
 +put_pages:
 +              for (i = 0; i < pg_nr; i++)
 +                      put_page(pages[i]);
 +      } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
  
 -              goto page_ok;
 -
 -readpage_error:
 -              /* UHHUH! A synchronous read error occurred. Report it */
 -              put_page(page);
 -              goto out;
 -
 -no_cached_page:
 -              /*
 -               * Ok, it wasn't cached, so we need to create a new
 -               * page..
 -               */
 -              page = page_cache_alloc(mapping);
 -              if (!page) {
 -                      error = -ENOMEM;
 -                      goto out;
 -              }
 -              error = add_to_page_cache_lru(page, mapping, index,
 -                              mapping_gfp_constraint(mapping, GFP_KERNEL));
 -              if (error) {
 -                      put_page(page);
 -                      if (error == -EEXIST) {
 -                              error = 0;
 -                              goto find_page;
 -                      }
 -                      goto out;
 -              }
 -              goto readpage;
 -      }
 +      file_accessed(filp);
  
 -would_block:
 -      error = -EAGAIN;
 -out:
 -      ra->prev_pos = prev_index;
 -      ra->prev_pos <<= PAGE_SHIFT;
 -      ra->prev_pos |= prev_offset;
 +      if (pages != pages_onstack)
 +              kfree(pages);
  
 -      *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
 -      file_accessed(filp);
        return written ? written : error;
  }
  EXPORT_SYMBOL_GPL(generic_file_buffered_read);
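
The rewritten copy loop above walks the page batch with plain position
arithmetic: the in-page offset is ki_pos modulo the page size, and the byte
count is clamped both to the end of that page and to end_offset (the smaller
of i_size and ki_pos + count), with ki_pos and ra->prev_pos advanced by
however much was copied. A short userspace sketch of that arithmetic; the
4 KiB page size and the sample values are assumptions for illustration only:

/*
 * Userspace sketch of the per-page copy arithmetic: each step copies at most
 * PAGE_SIZE - offset bytes and never runs past end_offset.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1ULL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long long pos = 4000;		/* iocb->ki_pos */
	unsigned long long count = 9000;	/* iov_iter_count(iter) */
	unsigned long long isize = 12000;	/* i_size_read(inode) */
	unsigned long long end_offset = isize < pos + count ? isize : pos + count;

	while (pos < end_offset) {
		unsigned long long offset = pos & ~PAGE_MASK;	/* offset into page */
		unsigned long long space = PAGE_SIZE - offset;
		unsigned long long left = end_offset - pos;
		unsigned long long bytes = left < space ? left : space;

		printf("page %llu: offset %llu, copy %llu bytes\n",
		       pos >> PAGE_SHIFT, offset, bytes);
		pos += bytes;	/* ki_pos advances by the amount copied */
	}
	return 0;
}

Clamping to end_offset up front is what lets the loop ignore pages that lie
entirely beyond EOF, which is why the code above trims the tail of the batch
with put_page() as soon as end_offset is known.
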
@@@ -2981,14 -2886,14 +2981,14 @@@ EXPORT_SYMBOL(filemap_map_pages)
  
  vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
  {
+       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        struct page *page = vmf->page;
-       struct inode *inode = file_inode(vmf->vma->vm_file);
        vm_fault_t ret = VM_FAULT_LOCKED;
  
-       sb_start_pagefault(inode->i_sb);
+       sb_start_pagefault(mapping->host->i_sb);
        file_update_time(vmf->vma->vm_file);
        lock_page(page);
-       if (page->mapping != inode->i_mapping) {
+       if (page->mapping != mapping) {
                unlock_page(page);
                ret = VM_FAULT_NOPAGE;
                goto out;
        set_page_dirty(page);
        wait_for_stable_page(page);
  out:
-       sb_end_pagefault(inode->i_sb);
+       sb_end_pagefault(mapping->host->i_sb);
        return ret;
  }
  
@@@ -3244,10 -3149,9 +3244,9 @@@ void dio_warn_stale_pagecache(struct fi
  {
        static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
        char pathname[128];
-       struct inode *inode = file_inode(filp);
        char *path;
  
-       errseq_set(&inode->i_mapping->wb_err, -EIO);
+       errseq_set(&filp->f_mapping->wb_err, -EIO);
        if (__ratelimit(&_rs)) {
                path = file_path(filp, pathname, sizeof(pathname));
                if (IS_ERR(path))
@@@ -3274,7 -3178,7 +3273,7 @@@ generic_file_direct_write(struct kiocb 
  
        if (iocb->ki_flags & IOCB_NOWAIT) {
                /* If there are pages to writeback, return */
-               if (filemap_range_has_page(inode->i_mapping, pos,
+               if (filemap_range_has_page(file->f_mapping, pos,
                                           pos + write_len - 1))
                        return -EAGAIN;
        } else {