Merge tag 'for-6.5/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
author    Linus Torvalds <torvalds@linux-foundation.org>
Fri, 30 Jun 2023 19:16:00 +0000 (12:16 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 30 Jun 2023 19:16:00 +0000 (12:16 -0700)
Pull device mapper updates from Mike Snitzer:

 - Update DM crypt to allocate compound pages if possible

 - Fix DM crypt target's crypt_ctr_cipher_new return value on invalid
   AEAD cipher

 - Fix DM flakey testing target's write bio corruption feature to
   corrupt the data of a cloned bio instead of the original

 - Add random_read_corrupt and random_write_corrupt features to DM
   flakey target

 - Fix ABBA deadlock in DM thin metadata by resetting associated bufio
   client rather than destroying and recreating it

 - A couple other small DM thinp cleanups

 - Update DM core to support disabling block core IO stats accounting
   and optimize away code that isn't needed if stats are disabled

 - Other small DM core cleanups

 - Improve DM integrity target to not require so much memory on 32-bit
   systems. Also only allocate the recalculate buffer as needed (and
   progressively halve its size if an allocation fails)

 - Update DM integrity to use %*ph for printing hexdump of a small
   buffer. Also update DM integrity documentation

 - Various DM core ioctl interface hardening. Now more careful about
   alignment of structures and processing of input passed to the kernel
   from userspace.

   Also disallow the creation of DM devices named "control", "." or ".."

 - Eliminate GFP_NOIO workarounds for __vmalloc and kvmalloc in DM
   core's ioctl and bufio code
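
   The last item refers to dropping the memalloc_noio_save()/
   memalloc_noio_restore() scoping that previously wrapped GFP_KERNEL
   allocations in the ioctl and bufio paths, in favour of passing GFP_NOIO
   straight to the allocator (the drivers/md/dm-ioctl.c hunk further down
   shows the real change). A minimal before/after sketch of that pattern,
   with made-up helper names, for illustration only:

  #include <linux/mm.h>
  #include <linux/sched/mm.h>
  #include <linux/slab.h>

  /* Old workaround: force NOIO behaviour around a GFP_KERNEL allocation. */
  static void *alloc_param_old(size_t size)
  {
  	unsigned int noio_flag;
  	void *p;

  	noio_flag = memalloc_noio_save();
  	p = kvmalloc(size, GFP_KERNEL | __GFP_HIGH);
  	memalloc_noio_restore(noio_flag);
  	return p;
  }

  /* New form: pass GFP_NOIO directly to kvmalloc(). */
  static void *alloc_param_new(size_t size)
  {
  	return kvmalloc(size, GFP_NOIO | __GFP_HIGH);
  }

   The __vmalloc() and kvmalloc() calls added to integrity_recalc() (in the
   dm-integrity hunks below) take the same direct GFP_NOIO form.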

* tag 'for-6.5/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (28 commits)
  dm: get rid of GFP_NOIO workarounds for __vmalloc and kvmalloc
  dm integrity: scale down the recalculate buffer if memory allocation fails
  dm integrity: only allocate recalculate buffer when needed
  dm integrity: reduce vmalloc space footprint on 32-bit architectures
  dm ioctl: Refuse to create device named "." or ".."
  dm ioctl: Refuse to create device named "control"
  dm ioctl: Avoid double-fetch of version
  dm ioctl: structs and parameter strings must not overlap
  dm ioctl: Avoid pointer arithmetic overflow
  dm ioctl: Check dm_target_spec is sufficiently aligned
  Documentation: dm-integrity: Document an example of how the tunables relate.
  Documentation: dm-integrity: Document default values.
  Documentation: dm-integrity: Document the meaning of "buffer".
  Documentation: dm-integrity: Fix minor grammatical error.
  dm integrity: Use %*ph for printing hexdump of a small buffer
  dm thin: disable discards for thin-pool if no_discard_passdown
  dm: remove stale/redundant dm_internal_{suspend,resume} prototypes in dm.h
  dm: skip dm-stats work in alloc_io() unless needed
  dm: avoid needless dm_io access if all IO accounting is disabled
  dm: support turning off block-core's io stats accounting
  ...

drivers/md/dm-crypt.c
drivers/md/dm-integrity.c
drivers/md/dm-ioctl.c
drivers/md/dm-thin-metadata.c
drivers/md/dm-thin.c
drivers/md/dm.c

diff --combined drivers/md/dm-crypt.c
@@@ -1661,6 -1661,9 +1661,9 @@@ static void crypt_free_buffer_pages(str
   * In order to not degrade performance with excessive locking, we try
   * non-blocking allocations without a mutex first but on failure we fallback
   * to blocking allocations with a mutex.
+  *
+  * In order to reduce allocation overhead, we try to allocate compound pages in
+  * the first pass. If they are not available, we fall back to the mempool.
   */
  static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size)
  {
        struct bio *clone;
        unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
-       unsigned int i, len, remaining_size;
-       struct page *page;
+       unsigned int remaining_size;
+       unsigned int order = MAX_ORDER - 1;
  
  retry:
        if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
  
        remaining_size = size;
  
-       for (i = 0; i < nr_iovecs; i++) {
-               page = mempool_alloc(&cc->page_pool, gfp_mask);
-               if (!page) {
+       while (remaining_size) {
+               struct page *pages;
+               unsigned size_to_add;
+               unsigned remaining_order = __fls((remaining_size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+               order = min(order, remaining_order);
+               while (order > 0) {
+                       pages = alloc_pages(gfp_mask
+                               | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | __GFP_COMP,
+                               order);
+                       if (likely(pages != NULL))
+                               goto have_pages;
+                       order--;
+               }
+               pages = mempool_alloc(&cc->page_pool, gfp_mask);
+               if (!pages) {
                        crypt_free_buffer_pages(cc, clone);
                        bio_put(clone);
                        gfp_mask |= __GFP_DIRECT_RECLAIM;
+                       order = 0;
                        goto retry;
                }
  
-               len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size;
-               __bio_add_page(clone, page, len, 0);
-               remaining_size -= len;
+ have_pages:
+               size_to_add = min((unsigned)PAGE_SIZE << order, remaining_size);
+               __bio_add_page(clone, pages, size_to_add, 0);
+               remaining_size -= size_to_add;
        }
  
        /* Allocate space for integrity tags */
  
  static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
  {
-       struct bio_vec *bv;
-       struct bvec_iter_all iter_all;
+       struct folio_iter fi;
  
-       bio_for_each_segment_all(bv, clone, iter_all) {
-               BUG_ON(!bv->bv_page);
-               mempool_free(bv->bv_page, &cc->page_pool);
+       if (clone->bi_vcnt > 0) { /* bio_for_each_folio_all crashes with an empty bio */
+               bio_for_each_folio_all(fi, clone) {
+                       if (folio_test_large(fi.folio))
+                               folio_put(fi.folio);
+                       else
+                               mempool_free(&fi.folio->page, &cc->page_pool);
+               }
        }
  }
  
@@@ -2887,7 -2908,7 +2908,7 @@@ static int crypt_ctr_cipher_new(struct 
                ret = crypt_ctr_auth_cipher(cc, cipher_api);
                if (ret < 0) {
                        ti->error = "Invalid AEAD cipher spec";
-                       return -ENOMEM;
+                       return ret;
                }
        }
  
@@@ -3255,7 -3276,7 +3276,7 @@@ static int crypt_ctr(struct dm_target *
  
        cc->per_bio_data_size = ti->per_io_data_size =
                ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
 -                    ARCH_KMALLOC_MINALIGN);
 +                    ARCH_DMA_MINALIGN);
  
        ret = mempool_init(&cc->page_pool, BIO_MAX_VECS, crypt_page_alloc, crypt_page_free, cc);
        if (ret) {
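
A note on the compound-page change in crypt_alloc_buffer() above: the first pass
rounds the remaining byte count up to whole pages and uses the most significant
bit of that page count to cap the allocation order. A small worked sketch of just
that computation (illustrative only; the helper name is made up and 4 KiB pages
are assumed):

  #include <linux/bitops.h>
  #include <linux/minmax.h>
  #include <linux/mm.h>

  /* Illustrative only: mirrors the order selection in crypt_alloc_buffer(). */
  static unsigned int example_first_pass_order(unsigned int remaining_size)
  {
  	unsigned int order = MAX_ORDER - 1;
  	unsigned int remaining_order =
  		__fls((remaining_size + PAGE_SIZE - 1) >> PAGE_SHIFT);

  	/* e.g. remaining_size = 1 MiB -> 256 pages -> __fls(256) == 8 */
  	return min(order, remaining_order);
  }

If a given order cannot be satisfied, the inner loop above steps the order down;
at order 0 it falls back to the single-page mempool, and only a mempool failure
switches to blocking (__GFP_DIRECT_RECLAIM) retries.
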
diff --combined drivers/md/dm-integrity.c
  #define DEFAULT_BUFFER_SECTORS                128
  #define DEFAULT_JOURNAL_WATERMARK     50
  #define DEFAULT_SYNC_MSEC             10000
- #define DEFAULT_MAX_JOURNAL_SECTORS   131072
+ #define DEFAULT_MAX_JOURNAL_SECTORS   (IS_ENABLED(CONFIG_64BIT) ? 131072 : 8192)
  #define MIN_LOG2_INTERLEAVE_SECTORS   3
  #define MAX_LOG2_INTERLEAVE_SECTORS   31
  #define METADATA_WORKQUEUE_MAX_ACTIVE 16
- #define RECALC_SECTORS                        32768
+ #define RECALC_SECTORS                        (IS_ENABLED(CONFIG_64BIT) ? 32768 : 2048)
  #define RECALC_WRITE_SUPER            16
  #define BITMAP_BLOCK_SIZE             4096    /* don't change it */
  #define BITMAP_FLUSH_INTERVAL         (10 * HZ)
@@@ -251,8 -251,6 +251,6 @@@ struct dm_integrity_c 
  
        struct workqueue_struct *recalc_wq;
        struct work_struct recalc_work;
-       u8 *recalc_buffer;
-       u8 *recalc_tags;
  
        struct bio_list flush_bio_list;
  
@@@ -342,24 -340,9 +340,9 @@@ static struct kmem_cache *journal_io_ca
  #define JOURNAL_IO_MEMPOOL    32
  
  #ifdef DEBUG_PRINT
- #define DEBUG_print(x, ...)   printk(KERN_DEBUG x, ##__VA_ARGS__)
- static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...)
- {
-       va_list args;
-       va_start(args, msg);
-       vprintk(msg, args);
-       va_end(args);
-       if (len)
-               pr_cont(":");
-       while (len) {
-               pr_cont(" %02x", *bytes);
-               bytes++;
-               len--;
-       }
-       pr_cont("\n");
- }
- #define DEBUG_bytes(bytes, len, msg, ...)     __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__)
+ #define DEBUG_print(x, ...)                   printk(KERN_DEBUG x, ##__VA_ARGS__)
+ #define DEBUG_bytes(bytes, len, msg, ...)     printk(KERN_DEBUG msg "%s%*ph\n", ##__VA_ARGS__, \
+                                                      len ? ": " : "", len, bytes)
  #else
  #define DEBUG_print(x, ...)                   do { } while (0)
  #define DEBUG_bytes(bytes, len, msg, ...)     do { } while (0)
@@@ -2661,6 -2644,9 +2644,9 @@@ static void recalc_write_super(struct d
  static void integrity_recalc(struct work_struct *w)
  {
        struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work);
+       size_t recalc_tags_size;
+       u8 *recalc_buffer = NULL;
+       u8 *recalc_tags = NULL;
        struct dm_integrity_range range;
        struct dm_io_request io_req;
        struct dm_io_region io_loc;
        unsigned int i;
        int r;
        unsigned int super_counter = 0;
+       unsigned recalc_sectors = RECALC_SECTORS;
+ retry:
+       recalc_buffer = __vmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO);
+       if (!recalc_buffer) {
+ oom:
+               recalc_sectors >>= 1;
+               if (recalc_sectors >= 1U << ic->sb->log2_sectors_per_block)
+                       goto retry;
+               DMCRIT("out of memory for recalculate buffer - recalculation disabled");
+               goto free_ret;
+       }
+       recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
+       if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size)
+               recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size;
+       recalc_tags = kvmalloc(recalc_tags_size, GFP_NOIO);
+       if (!recalc_tags) {
+               vfree(recalc_buffer);
+               goto oom;
+       }
  
        DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector));
  
@@@ -2693,7 -2699,7 +2699,7 @@@ next_chunk
        }
  
        get_area_and_offset(ic, range.logical_sector, &area, &offset);
-       range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector);
+       range.n_sectors = min((sector_t)recalc_sectors, ic->provided_data_sectors - range.logical_sector);
        if (!ic->meta_dev)
                range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned int)offset);
  
  
        io_req.bi_opf = REQ_OP_READ;
        io_req.mem.type = DM_IO_VMA;
-       io_req.mem.ptr.addr = ic->recalc_buffer;
+       io_req.mem.ptr.addr = recalc_buffer;
        io_req.notify.fn = NULL;
        io_req.client = ic->io;
        io_loc.bdev = ic->dev->bdev;
                goto err;
        }
  
-       t = ic->recalc_tags;
+       t = recalc_tags;
        for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
-               integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
+               integrity_sector_checksum(ic, logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t);
                t += ic->tag_size;
        }
  
        metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
  
-       r = dm_integrity_rw_tag(ic, ic->recalc_tags, &metadata_block, &metadata_offset, t - ic->recalc_tags, TAG_WRITE);
+       r = dm_integrity_rw_tag(ic, recalc_tags, &metadata_block, &metadata_offset, t - recalc_tags, TAG_WRITE);
        if (unlikely(r)) {
                dm_integrity_io_error(ic, "writing tags", r);
                goto err;
@@@ -2784,12 -2790,16 +2790,16 @@@ advance_and_next
  
  err:
        remove_range(ic, &range);
-       return;
+       goto free_ret;
  
  unlock_ret:
        spin_unlock_irq(&ic->endio_wait.lock);
  
        recalc_write_super(ic);
+ free_ret:
+       vfree(recalc_buffer);
+       kvfree(recalc_tags);
  }
  
  static void bitmap_block_work(struct work_struct *w)
@@@ -4268,10 -4278,10 +4278,10 @@@ static int dm_integrity_ctr(struct dm_t
        }
  
        /*
 -       * If this workqueue were percpu, it would cause bio reordering
 +       * If this workqueue weren't ordered, it would cause bio reordering
         * and reduced performance.
         */
 -      ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
 +      ic->wait_wq = alloc_ordered_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM);
        if (!ic->wait_wq) {
                ti->error = "Cannot allocate workqueue";
                r = -ENOMEM;
@@@ -4454,8 -4464,6 +4464,6 @@@ try_smaller_buffer
        }
  
        if (ic->internal_hash) {
-               size_t recalc_tags_size;
                ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1);
                if (!ic->recalc_wq) {
                        ti->error = "Cannot allocate workqueue";
                        goto bad;
                }
                INIT_WORK(&ic->recalc_work, integrity_recalc);
-               ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT);
-               if (!ic->recalc_buffer) {
-                       ti->error = "Cannot allocate buffer for recalculating";
-                       r = -ENOMEM;
-                       goto bad;
-               }
-               recalc_tags_size = (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size;
-               if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size)
-                       recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size;
-               ic->recalc_tags = kvmalloc(recalc_tags_size, GFP_KERNEL);
-               if (!ic->recalc_tags) {
-                       ti->error = "Cannot allocate tags for recalculating";
-                       r = -ENOMEM;
-                       goto bad;
-               }
        } else {
                if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
                        ti->error = "Recalculate can only be specified with internal_hash";
@@@ -4621,8 -4614,6 +4614,6 @@@ static void dm_integrity_dtr(struct dm_
                destroy_workqueue(ic->writer_wq);
        if (ic->recalc_wq)
                destroy_workqueue(ic->recalc_wq);
-       vfree(ic->recalc_buffer);
-       kvfree(ic->recalc_tags);
        kvfree(ic->bbs);
        if (ic->bufio)
                dm_bufio_client_destroy(ic->bufio);
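
The DEBUG_bytes() rework above leans on printk's %*ph extension, which renders a
buffer of up to 64 bytes as space-separated hex and takes the length as an int
field-width argument. A tiny illustration in the same spirit, with a made-up
buffer and message:

  #include <linux/printk.h>
  #include <linux/types.h>

  /* Illustrative only: %*ph usage as in the new DEBUG_bytes() macro. */
  static void example_hexdump(void)
  {
  	u8 csum[4] = { 0xde, 0xad, 0xbe, 0xef };

  	/* Emits: "stored csum: de ad be ef" at KERN_DEBUG level. */
  	printk(KERN_DEBUG "stored csum: %*ph\n", (int)sizeof(csum), csum);
  }

For buffers larger than 64 bytes, print_hex_dump() remains the usual tool.
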
diff --combined drivers/md/dm-ioctl.c
@@@ -767,7 -767,14 +767,14 @@@ static int get_target_version(struct fi
  static int check_name(const char *name)
  {
        if (strchr(name, '/')) {
-               DMERR("invalid device name");
+               DMERR("device name cannot contain '/'");
+               return -EINVAL;
+       }
+       if (strcmp(name, DM_CONTROL_NODE) == 0 ||
+           strcmp(name, ".") == 0 ||
+           strcmp(name, "..") == 0) {
+               DMERR("device name cannot be \"%s\", \".\", or \"..\"", DM_CONTROL_NODE);
                return -EINVAL;
        }
  
@@@ -1168,10 -1175,13 +1175,10 @@@ static int do_resume(struct dm_ioctl *p
        /* Do we need to load a new map ? */
        if (new_map) {
                sector_t old_size, new_size;
 -              int srcu_idx;
  
                /* Suspend if it isn't already suspended */
 -              old_map = dm_get_live_table(md, &srcu_idx);
 -              if ((param->flags & DM_SKIP_LOCKFS_FLAG) || !old_map)
 +              if (param->flags & DM_SKIP_LOCKFS_FLAG)
                        suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
 -              dm_put_live_table(md, srcu_idx);
                if (param->flags & DM_NOFLUSH_FLAG)
                        suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
                if (!dm_suspended_md(md))
@@@ -1388,16 -1398,38 +1395,38 @@@ static inline blk_mode_t get_mode(struc
        return mode;
  }
  
- static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
+ static int next_target(struct dm_target_spec *last, uint32_t next, const char *end,
                       struct dm_target_spec **spec, char **target_params)
  {
-       *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
-       *target_params = (char *) (*spec + 1);
+       static_assert(__alignof__(struct dm_target_spec) <= 8,
+               "struct dm_target_spec must not require more than 8-byte alignment");
+       /*
+        * Number of bytes remaining, starting with last. This is always
+        * sizeof(struct dm_target_spec) or more, as otherwise *last was
+        * out of bounds already.
+        */
+       size_t remaining = end - (char *)last;
+       /*
+        * There must be room for both the next target spec and the
+        * NUL-terminator of the target itself.
+        */
+       if (remaining - sizeof(struct dm_target_spec) <= next) {
+               DMERR("Target spec extends beyond end of parameters");
+               return -EINVAL;
+       }
  
-       if (*spec < (last + 1))
+       if (next % __alignof__(struct dm_target_spec)) {
+               DMERR("Next dm_target_spec (offset %u) is not %zu-byte aligned",
+                     next, __alignof__(struct dm_target_spec));
                return -EINVAL;
+       }
+       *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
+       *target_params = (char *) (*spec + 1);
  
-       return invalid_str(*target_params, end);
+       return 0;
  }
  
  static int populate_table(struct dm_table *table,
        unsigned int i = 0;
        struct dm_target_spec *spec = (struct dm_target_spec *) param;
        uint32_t next = param->data_start;
-       void *end = (void *) param + param_size;
+       const char *const end = (const char *) param + param_size;
        char *target_params;
+       size_t min_size = sizeof(struct dm_ioctl);
  
        if (!param->target_count) {
                DMERR("%s: no targets specified", __func__);
        }
  
        for (i = 0; i < param->target_count; i++) {
+               const char *nul_terminator;
+               if (next < min_size) {
+                       DMERR("%s: next target spec (offset %u) overlaps %s",
+                             __func__, next, i ? "previous target" : "'struct dm_ioctl'");
+                       return -EINVAL;
+               }
  
                r = next_target(spec, next, end, &spec, &target_params);
                if (r) {
                        return r;
                }
  
+               nul_terminator = memchr(target_params, 0, (size_t)(end - target_params));
+               if (nul_terminator == NULL) {
+                       DMERR("%s: target parameters not NUL-terminated", __func__);
+                       return -EINVAL;
+               }
+               /* Add 1 for NUL terminator */
+               min_size = (size_t)(nul_terminator - (const char *)spec) + 1;
                r = dm_table_add_target(table, spec->target_type,
                                        (sector_t) spec->sector_start,
                                        (sector_t) spec->length,
@@@ -1830,30 -1879,36 +1876,36 @@@ static ioctl_fn lookup_ioctl(unsigned i
   * As well as checking the version compatibility this always
   * copies the kernel interface version out.
   */
- static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
+ static int check_version(unsigned int cmd, struct dm_ioctl __user *user,
+                        struct dm_ioctl *kernel_params)
  {
-       uint32_t version[3];
        int r = 0;
  
-       if (copy_from_user(version, user->version, sizeof(version)))
+       /* Make certain version is first member of dm_ioctl struct */
+       BUILD_BUG_ON(offsetof(struct dm_ioctl, version) != 0);
+       if (copy_from_user(kernel_params->version, user->version, sizeof(kernel_params->version)))
                return -EFAULT;
  
-       if ((version[0] != DM_VERSION_MAJOR) ||
-           (version[1] > DM_VERSION_MINOR)) {
+       if ((kernel_params->version[0] != DM_VERSION_MAJOR) ||
+           (kernel_params->version[1] > DM_VERSION_MINOR)) {
                DMERR("ioctl interface mismatch: kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
                      DM_VERSION_MAJOR, DM_VERSION_MINOR,
                      DM_VERSION_PATCHLEVEL,
-                     version[0], version[1], version[2], cmd);
+                     kernel_params->version[0],
+                     kernel_params->version[1],
+                     kernel_params->version[2],
+                     cmd);
                r = -EINVAL;
        }
  
        /*
         * Fill in the kernel version.
         */
-       version[0] = DM_VERSION_MAJOR;
-       version[1] = DM_VERSION_MINOR;
-       version[2] = DM_VERSION_PATCHLEVEL;
-       if (copy_to_user(user->version, version, sizeof(version)))
+       kernel_params->version[0] = DM_VERSION_MAJOR;
+       kernel_params->version[1] = DM_VERSION_MINOR;
+       kernel_params->version[2] = DM_VERSION_PATCHLEVEL;
+       if (copy_to_user(user->version, kernel_params->version, sizeof(kernel_params->version)))
                return -EFAULT;
  
        return r;
@@@ -1877,9 -1932,11 +1929,11 @@@ static int copy_params(struct dm_ioctl 
        struct dm_ioctl *dmi;
        int secure_data;
        const size_t minimum_data_size = offsetof(struct dm_ioctl, data);
-       unsigned int noio_flag;
  
-       if (copy_from_user(param_kernel, user, minimum_data_size))
+       /* check_version() already copied version from userspace, avoid TOCTOU */
+       if (copy_from_user((char *)param_kernel + sizeof(param_kernel->version),
+                          (char __user *)user + sizeof(param_kernel->version),
+                          minimum_data_size - sizeof(param_kernel->version)))
                return -EFAULT;
  
        if (param_kernel->data_size < minimum_data_size) {
         * Use kmalloc() rather than vmalloc() when we can.
         */
        dmi = NULL;
-       noio_flag = memalloc_noio_save();
-       dmi = kvmalloc(param_kernel->data_size, GFP_KERNEL | __GFP_HIGH);
-       memalloc_noio_restore(noio_flag);
+       dmi = kvmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH);
  
        if (!dmi) {
                if (secure_data && clear_user(user, param_kernel->data_size))
@@@ -1991,7 -2046,7 +2043,7 @@@ static int ctl_ioctl(struct file *file
         * Check the interface version passed in.  This also
         * writes out the kernel's interface version.
         */
-       r = check_version(cmd, user);
+       r = check_version(cmd, user, &param_kernel);
        if (r)
                return r;
  
diff --combined drivers/md/dm-thin-metadata.c
@@@ -603,6 -603,8 +603,8 @@@ static int __format_metadata(struct dm_
        r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
                                 &pmd->tm, &pmd->metadata_sm);
        if (r < 0) {
+               pmd->tm = NULL;
+               pmd->metadata_sm = NULL;
                DMERR("tm_create_with_sm failed");
                return r;
        }
        if (IS_ERR(pmd->data_sm)) {
                DMERR("sm_disk_create failed");
                r = PTR_ERR(pmd->data_sm);
+               pmd->data_sm = NULL;
                goto bad_cleanup_tm;
        }
  
  
  bad_cleanup_nb_tm:
        dm_tm_destroy(pmd->nb_tm);
+       pmd->nb_tm = NULL;
  bad_cleanup_data_sm:
        dm_sm_destroy(pmd->data_sm);
+       pmd->data_sm = NULL;
  bad_cleanup_tm:
        dm_tm_destroy(pmd->tm);
+       pmd->tm = NULL;
        dm_sm_destroy(pmd->metadata_sm);
+       pmd->metadata_sm = NULL;
  
        return r;
  }
@@@ -711,6 -718,8 +718,8 @@@ static int __open_metadata(struct dm_po
                               sizeof(disk_super->metadata_space_map_root),
                               &pmd->tm, &pmd->metadata_sm);
        if (r < 0) {
+               pmd->tm = NULL;
+               pmd->metadata_sm = NULL;
                DMERR("tm_open_with_sm failed");
                goto bad_unlock_sblock;
        }
        if (IS_ERR(pmd->data_sm)) {
                DMERR("sm_disk_open failed");
                r = PTR_ERR(pmd->data_sm);
+               pmd->data_sm = NULL;
                goto bad_cleanup_tm;
        }
  
  
  bad_cleanup_data_sm:
        dm_sm_destroy(pmd->data_sm);
+       pmd->data_sm = NULL;
  bad_cleanup_tm:
        dm_tm_destroy(pmd->tm);
+       pmd->tm = NULL;
        dm_sm_destroy(pmd->metadata_sm);
+       pmd->metadata_sm = NULL;
  bad_unlock_sblock:
        dm_bm_unlock(sblock);
  
@@@ -795,9 -808,13 +808,13 @@@ static void __destroy_persistent_data_o
                                              bool destroy_bm)
  {
        dm_sm_destroy(pmd->data_sm);
+       pmd->data_sm = NULL;
        dm_sm_destroy(pmd->metadata_sm);
+       pmd->metadata_sm = NULL;
        dm_tm_destroy(pmd->nb_tm);
+       pmd->nb_tm = NULL;
        dm_tm_destroy(pmd->tm);
+       pmd->tm = NULL;
        if (destroy_bm)
                dm_block_manager_destroy(pmd->bm);
  }
@@@ -1005,8 -1022,7 +1022,7 @@@ int dm_pool_metadata_close(struct dm_po
                               __func__, r);
        }
        pmd_write_unlock(pmd);
-       if (!pmd->fail_io)
-               __destroy_persistent_data_objects(pmd, true);
+       __destroy_persistent_data_objects(pmd, true);
  
        kfree(pmd);
        return 0;
@@@ -1756,15 -1772,13 +1772,15 @@@ int dm_thin_remove_range(struct dm_thin
  
  int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
  {
 -      int r;
 +      int r = -EINVAL;
        uint32_t ref_count;
  
        down_read(&pmd->root_lock);
 -      r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
 -      if (!r)
 -              *result = (ref_count > 1);
 +      if (!pmd->fail_io) {
 +              r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
 +              if (!r)
 +                      *result = (ref_count > 1);
 +      }
        up_read(&pmd->root_lock);
  
        return r;
  
  int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
  {
 -      int r = 0;
 +      int r = -EINVAL;
  
        pmd_write_lock(pmd);
 -      r = dm_sm_inc_blocks(pmd->data_sm, b, e);
 +      if (!pmd->fail_io)
 +              r = dm_sm_inc_blocks(pmd->data_sm, b, e);
        pmd_write_unlock(pmd);
  
        return r;
  
  int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
  {
 -      int r = 0;
 +      int r = -EINVAL;
  
        pmd_write_lock(pmd);
 -      r = dm_sm_dec_blocks(pmd->data_sm, b, e);
 +      if (!pmd->fail_io)
 +              r = dm_sm_dec_blocks(pmd->data_sm, b, e);
        pmd_write_unlock(pmd);
  
        return r;
@@@ -1881,53 -1893,29 +1897,29 @@@ static void __set_abort_with_changes_fl
  int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
  {
        int r = -EINVAL;
-       struct dm_block_manager *old_bm = NULL, *new_bm = NULL;
  
        /* fail_io is double-checked with pmd->root_lock held below */
        if (unlikely(pmd->fail_io))
                return r;
  
-       /*
-        * Replacement block manager (new_bm) is created and old_bm destroyed outside of
-        * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
-        * shrinker associated with the block manager's bufio client vs pmd root_lock).
-        * - must take shrinker_rwsem without holding pmd->root_lock
-        */
-       new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
-                                        THIN_MAX_CONCURRENT_LOCKS);
        pmd_write_lock(pmd);
        if (pmd->fail_io) {
                pmd_write_unlock(pmd);
-               goto out;
+               return r;
        }
        __set_abort_with_changes_flags(pmd);
+       /* destroy data_sm/metadata_sm/nb_tm/tm */
        __destroy_persistent_data_objects(pmd, false);
-       old_bm = pmd->bm;
-       if (IS_ERR(new_bm)) {
-               DMERR("could not create block manager during abort");
-               pmd->bm = NULL;
-               r = PTR_ERR(new_bm);
-               goto out_unlock;
-       }
  
-       pmd->bm = new_bm;
+       /* reset bm */
+       dm_block_manager_reset(pmd->bm);
+       /* rebuild data_sm/metadata_sm/nb_tm/tm */
        r = __open_or_format_metadata(pmd, false);
-       if (r) {
-               pmd->bm = NULL;
-               goto out_unlock;
-       }
-       new_bm = NULL;
- out_unlock:
        if (r)
                pmd->fail_io = true;
        pmd_write_unlock(pmd);
-       dm_block_manager_destroy(old_bm);
- out:
-       if (new_bm && !IS_ERR(new_bm))
-               dm_block_manager_destroy(new_bm);
        return r;
  }
  
diff --combined drivers/md/dm-thin.c
@@@ -401,7 -401,8 +401,7 @@@ static int issue_discard(struct discard
        sector_t s = block_to_sectors(tc->pool, data_b);
        sector_t len = block_to_sectors(tc->pool, data_e - data_b);
  
 -      return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOWAIT,
 -                                    &op->bio);
 +      return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOIO, &op->bio);
  }
  
  static void end_discard(struct discard_op *op, int r)
@@@ -2527,16 -2528,11 +2527,11 @@@ static void noflush_work(struct thin_c 
  
  /*----------------------------------------------------------------*/
  
- static bool passdown_enabled(struct pool_c *pt)
- {
-       return pt->adjusted_pf.discard_passdown;
- }
  static void set_discard_callbacks(struct pool *pool)
  {
        struct pool_c *pt = pool->ti->private;
  
-       if (passdown_enabled(pt)) {
+       if (pt->adjusted_pf.discard_passdown) {
                pool->process_discard_cell = process_discard_cell_passdown;
                pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
                pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
@@@ -2845,7 -2841,7 +2840,7 @@@ static bool is_factor(sector_t block_si
   * If discard_passdown was enabled verify that the data device
   * supports discards.  Disable discard_passdown if not.
   */
- static void disable_passdown_if_not_supported(struct pool_c *pt)
+ static void disable_discard_passdown_if_not_supported(struct pool_c *pt)
  {
        struct pool *pool = pt->pool;
        struct block_device *data_bdev = pt->data_dev->bdev;
@@@ -3446,7 -3442,6 +3441,6 @@@ out_unlock
  
  static int pool_map(struct dm_target *ti, struct bio *bio)
  {
-       int r;
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;
  
         */
        spin_lock_irq(&pool->lock);
        bio_set_dev(bio, pt->data_dev->bdev);
-       r = DM_MAPIO_REMAPPED;
        spin_unlock_irq(&pool->lock);
  
-       return r;
+       return DM_MAPIO_REMAPPED;
  }
  
  static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
@@@ -4099,21 -4093,22 +4092,22 @@@ static void pool_io_hints(struct dm_tar
         * They get transferred to the live pool in bind_control_target()
         * called from pool_preresume().
         */
-       if (!pt->adjusted_pf.discard_enabled) {
+       if (pt->adjusted_pf.discard_enabled) {
+               disable_discard_passdown_if_not_supported(pt);
+               if (!pt->adjusted_pf.discard_passdown)
+                       limits->max_discard_sectors = 0;
+               /*
+                * The pool uses the same discard limits as the underlying data
+                * device.  DM core has already set this up.
+                */
+       } else {
                /*
                 * Must explicitly disallow stacking discard limits otherwise the
                 * block layer will stack them if pool's data device has support.
                 */
                limits->discard_granularity = 0;
-               return;
        }
-       disable_passdown_if_not_supported(pt);
-       /*
-        * The pool uses the same discard limits as the underlying data
-        * device.  DM core has already set this up.
-        */
  }
  
  static struct target_type pool_target = {
@@@ -4497,11 -4492,10 +4491,10 @@@ static void thin_io_hints(struct dm_tar
        struct thin_c *tc = ti->private;
        struct pool *pool = tc->pool;
  
-       if (!pool->pf.discard_enabled)
-               return;
-       limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-       limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
+       if (pool->pf.discard_enabled) {
+               limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+               limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
+       }
  }
  
  static struct target_type thin_target = {
diff --combined drivers/md/dm.c
@@@ -207,7 -207,7 +207,7 @@@ static int __init local_init(void
        if (r)
                return r;
  
 -      deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 +      deferred_remove_workqueue = alloc_ordered_workqueue("kdmremove", 0);
        if (!deferred_remove_workqueue) {
                r = -ENOMEM;
                goto out_uevent_exit;
@@@ -487,48 -487,50 +487,50 @@@ u64 dm_start_time_ns_from_clone(struct 
  }
  EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
  
- static bool bio_is_flush_with_data(struct bio *bio)
+ static inline bool bio_is_flush_with_data(struct bio *bio)
  {
        return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
  }
  
- static void dm_io_acct(struct dm_io *io, bool end)
+ static inline unsigned int dm_io_sectors(struct dm_io *io, struct bio *bio)
  {
-       struct dm_stats_aux *stats_aux = &io->stats_aux;
-       unsigned long start_time = io->start_time;
-       struct mapped_device *md = io->md;
-       struct bio *bio = io->orig_bio;
-       unsigned int sectors;
        /*
         * If REQ_PREFLUSH set, don't account payload, it will be
         * submitted (and accounted) after this flush completes.
         */
        if (bio_is_flush_with_data(bio))
-               sectors = 0;
-       else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT))))
-               sectors = bio_sectors(bio);
-       else
-               sectors = io->sectors;
+               return 0;
+       if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT)))
+               return io->sectors;
+       return bio_sectors(bio);
+ }
  
-       if (!end)
-               bdev_start_io_acct(bio->bi_bdev, bio_op(bio), start_time);
-       else
-               bdev_end_io_acct(bio->bi_bdev, bio_op(bio), sectors,
-                                start_time);
+ static void dm_io_acct(struct dm_io *io, bool end)
+ {
+       struct bio *bio = io->orig_bio;
+       if (dm_io_flagged(io, DM_IO_BLK_STAT)) {
+               if (!end)
+                       bdev_start_io_acct(bio->bi_bdev, bio_op(bio),
+                                          io->start_time);
+               else
+                       bdev_end_io_acct(bio->bi_bdev, bio_op(bio),
+                                        dm_io_sectors(io, bio),
+                                        io->start_time);
+       }
  
        if (static_branch_unlikely(&stats_enabled) &&
-           unlikely(dm_stats_used(&md->stats))) {
+           unlikely(dm_stats_used(&io->md->stats))) {
                sector_t sector;
  
-               if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT)))
-                       sector = bio->bi_iter.bi_sector;
-               else
+               if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT)))
                        sector = bio_end_sector(bio) - io->sector_offset;
+               else
+                       sector = bio->bi_iter.bi_sector;
  
-               dm_stats_account_io(&md->stats, bio_data_dir(bio),
-                                   sector, sectors,
-                                   end, start_time, stats_aux);
+               dm_stats_account_io(&io->md->stats, bio_data_dir(bio),
+                                   sector, dm_io_sectors(io, bio),
+                                   end, io->start_time, &io->stats_aux);
        }
  }
  
@@@ -592,8 -594,11 +594,11 @@@ static struct dm_io *alloc_io(struct ma
        spin_lock_init(&io->lock);
        io->start_time = jiffies;
        io->flags = 0;
+       if (blk_queue_io_stat(md->queue))
+               dm_io_set_flag(io, DM_IO_BLK_STAT);
  
-       if (static_branch_unlikely(&stats_enabled))
+       if (static_branch_unlikely(&stats_enabled) &&
+           unlikely(dm_stats_used(&md->stats)))
                dm_stats_record_start(&md->stats, &io->stats_aux);
  
        return io;
@@@ -1172,8 -1177,7 +1177,8 @@@ static inline sector_t max_io_len_targe
  }
  
  static sector_t __max_io_len(struct dm_target *ti, sector_t sector,
 -                           unsigned int max_granularity)
 +                           unsigned int max_granularity,
 +                           unsigned int max_sectors)
  {
        sector_t target_offset = dm_target_offset(ti, sector);
        sector_t len = max_io_len_target_boundary(ti, target_offset);
        if (!max_granularity)
                return len;
        return min_t(sector_t, len,
 -              min(queue_max_sectors(ti->table->md->queue),
 +              min(max_sectors ? : queue_max_sectors(ti->table->md->queue),
                    blk_chunk_sectors_left(target_offset, max_granularity)));
  }
  
  static inline sector_t max_io_len(struct dm_target *ti, sector_t sector)
  {
 -      return __max_io_len(ti, sector, ti->max_io_len);
 +      return __max_io_len(ti, sector, ti->max_io_len, 0);
  }
  
  int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
@@@ -1582,13 -1586,12 +1587,13 @@@ static void __send_empty_flush(struct c
  
  static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
                                        unsigned int num_bios,
 -                                      unsigned int max_granularity)
 +                                      unsigned int max_granularity,
 +                                      unsigned int max_sectors)
  {
        unsigned int len, bios;
  
        len = min_t(sector_t, ci->sector_count,
 -                  __max_io_len(ti, ci->sector, max_granularity));
 +                  __max_io_len(ti, ci->sector, max_granularity, max_sectors));
  
        atomic_add(num_bios, &ci->io->io_count);
        bios = __send_duplicate_bios(ci, ti, num_bios, &len);
@@@ -1625,27 -1628,23 +1630,27 @@@ static blk_status_t __process_abnormal_
  {
        unsigned int num_bios = 0;
        unsigned int max_granularity = 0;
 +      unsigned int max_sectors = 0;
        struct queue_limits *limits = dm_get_queue_limits(ti->table->md);
  
        switch (bio_op(ci->bio)) {
        case REQ_OP_DISCARD:
                num_bios = ti->num_discard_bios;
 +              max_sectors = limits->max_discard_sectors;
                if (ti->max_discard_granularity)
 -                      max_granularity = limits->max_discard_sectors;
 +                      max_granularity = max_sectors;
                break;
        case REQ_OP_SECURE_ERASE:
                num_bios = ti->num_secure_erase_bios;
 +              max_sectors = limits->max_secure_erase_sectors;
                if (ti->max_secure_erase_granularity)
 -                      max_granularity = limits->max_secure_erase_sectors;
 +                      max_granularity = max_sectors;
                break;
        case REQ_OP_WRITE_ZEROES:
                num_bios = ti->num_write_zeroes_bios;
 +              max_sectors = limits->max_write_zeroes_sectors;
                if (ti->max_write_zeroes_granularity)
 -                      max_granularity = limits->max_write_zeroes_sectors;
 +                      max_granularity = max_sectors;
                break;
        default:
                break;
        if (unlikely(!num_bios))
                return BLK_STS_NOTSUPP;
  
 -      __send_changing_extent_only(ci, ti, num_bios, max_granularity);
 +      __send_changing_extent_only(ci, ti, num_bios,
 +                                  max_granularity, max_sectors);
        return BLK_STS_OK;
  }
  
@@@ -2348,6 -2346,7 +2353,7 @@@ int dm_setup_md_queue(struct mapped_dev
                break;
        case DM_TYPE_BIO_BASED:
        case DM_TYPE_DAX_BIO_BASED:
+               blk_queue_flag_set(QUEUE_FLAG_IO_STAT, md->queue);
                break;
        case DM_TYPE_NONE:
                WARN_ON_ONCE(true);
@@@ -2815,10 -2814,6 +2821,10 @@@ retry
        }
  
        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
 +      if (!map) {
 +              /* avoid deadlock with fs/namespace.c:do_mount() */
 +              suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
 +      }
  
        r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
        if (r)
@@@ -3143,8 -3138,6 +3149,8 @@@ struct dm_pr 
        bool    fail_early;
        int     ret;
        enum pr_type type;
 +      struct pr_keys *read_keys;
 +      struct pr_held_reservation *rsv;
  };
  
  static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
        return r;
  }
  
 +static int __dm_pr_read_keys(struct dm_target *ti, struct dm_dev *dev,
 +                           sector_t start, sector_t len, void *data)
 +{
 +      struct dm_pr *pr = data;
 +      const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
 +
 +      if (!ops || !ops->pr_read_keys) {
 +              pr->ret = -EOPNOTSUPP;
 +              return -1;
 +      }
 +
 +      pr->ret = ops->pr_read_keys(dev->bdev, pr->read_keys);
 +      if (!pr->ret)
 +              return -1;
 +
 +      return 0;
 +}
 +
 +static int dm_pr_read_keys(struct block_device *bdev, struct pr_keys *keys)
 +{
 +      struct dm_pr pr = {
 +              .read_keys = keys,
 +      };
 +      int ret;
 +
 +      ret = dm_call_pr(bdev, __dm_pr_read_keys, &pr);
 +      if (ret)
 +              return ret;
 +
 +      return pr.ret;
 +}
 +
 +static int __dm_pr_read_reservation(struct dm_target *ti, struct dm_dev *dev,
 +                                  sector_t start, sector_t len, void *data)
 +{
 +      struct dm_pr *pr = data;
 +      const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
 +
 +      if (!ops || !ops->pr_read_reservation) {
 +              pr->ret = -EOPNOTSUPP;
 +              return -1;
 +      }
 +
 +      pr->ret = ops->pr_read_reservation(dev->bdev, pr->rsv);
 +      if (!pr->ret)
 +              return -1;
 +
 +      return 0;
 +}
 +
 +static int dm_pr_read_reservation(struct block_device *bdev,
 +                                struct pr_held_reservation *rsv)
 +{
 +      struct dm_pr pr = {
 +              .rsv = rsv,
 +      };
 +      int ret;
 +
 +      ret = dm_call_pr(bdev, __dm_pr_read_reservation, &pr);
 +      if (ret)
 +              return ret;
 +
 +      return pr.ret;
 +}
 +
  static const struct pr_ops dm_pr_ops = {
        .pr_register    = dm_pr_register,
        .pr_reserve     = dm_pr_reserve,
        .pr_release     = dm_pr_release,
        .pr_preempt     = dm_pr_preempt,
        .pr_clear       = dm_pr_clear,
 +      .pr_read_keys   = dm_pr_read_keys,
 +      .pr_read_reservation = dm_pr_read_reservation,
  };
  
  static const struct block_device_operations dm_blk_dops = {