Merge tag 'for-6.5/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
author    Linus Torvalds <torvalds@linux-foundation.org>
Fri, 30 Jun 2023 19:16:00 +0000 (12:16 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 30 Jun 2023 19:16:00 +0000 (12:16 -0700)
Pull device mapper updates from Mike Snitzer:

 - Update DM crypt to allocate compound pages if possible

 - Fix DM crypt target's crypt_ctr_cipher_new return value on invalid
   AEAD cipher

 - Fix DM flakey testing target's write bio corruption feature to
   corrupt the data of a cloned bio instead of the original

 - Add random_read_corrupt and random_write_corrupt features to DM
   flakey target

 - Fix ABBA deadlock in DM thin metadata by resetting associated bufio
   client rather than destroying and recreating it

 - A couple other small DM thinp cleanups

 - Update DM core to support disabling block core IO stats accounting
   and optimize away code that isn't needed if stats are disabled

 - Other small DM core cleanups

 - Improve DM integrity target to not require so much memory on 32-bit
   systems. Also only allocate the recalculate buffer as needed (and
   progressively halve its size if an allocation fails)

 - Update DM integrity to use %*ph for printing hexdump of a small
   buffer. Also update DM integrity documentation

 - Various DM core ioctl interface hardening. Now more careful about
   alignment of structures and processing of input passed to the kernel
   from userspace.

   Also disallow the creation of DM devices named "control", "." or ".."

 - Eliminate GFP_NOIO workarounds for __vmalloc and kvmalloc in DM
   core's ioctl and bufio code
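
   The last item refers to dropping the memalloc_noio_save()/
   memalloc_noio_restore() scoping that previously wrapped GFP_KERNEL
   allocations in the ioctl and bufio paths, in favour of passing GFP_NOIO
   straight to the allocator (the drivers/md/dm-ioctl.c hunk further down
   shows the real change). A minimal before/after sketch of that pattern,
   with made-up helper names, for illustration only:

  #include <linux/mm.h>
  #include <linux/sched/mm.h>
  #include <linux/slab.h>

  /* Old workaround: force NOIO behaviour around a GFP_KERNEL allocation. */
  static void *alloc_param_old(size_t size)
  {
  	unsigned int noio_flag;
  	void *p;

  	noio_flag = memalloc_noio_save();
  	p = kvmalloc(size, GFP_KERNEL | __GFP_HIGH);
  	memalloc_noio_restore(noio_flag);
  	return p;
  }

  /* New form: pass GFP_NOIO directly to kvmalloc(). */
  static void *alloc_param_new(size_t size)
  {
  	return kvmalloc(size, GFP_NOIO | __GFP_HIGH);
  }

   The __vmalloc() and kvmalloc() calls added to integrity_recalc() (in the
   dm-integrity hunks below) take the same direct GFP_NOIO form.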

* tag 'for-6.5/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (28 commits)
  dm: get rid of GFP_NOIO workarounds for __vmalloc and kvmalloc
  dm integrity: scale down the recalculate buffer if memory allocation fails
  dm integrity: only allocate recalculate buffer when needed
  dm integrity: reduce vmalloc space footprint on 32-bit architectures
  dm ioctl: Refuse to create device named "." or ".."
  dm ioctl: Refuse to create device named "control"
  dm ioctl: Avoid double-fetch of version
  dm ioctl: structs and parameter strings must not overlap
  dm ioctl: Avoid pointer arithmetic overflow
  dm ioctl: Check dm_target_spec is sufficiently aligned
  Documentation: dm-integrity: Document an example of how the tunables relate.
  Documentation: dm-integrity: Document default values.
  Documentation: dm-integrity: Document the meaning of "buffer".
  Documentation: dm-integrity: Fix minor grammatical error.
  dm integrity: Use %*ph for printing hexdump of a small buffer
  dm thin: disable discards for thin-pool if no_discard_passdown
  dm: remove stale/redundant dm_internal_{suspend,resume} prototypes in dm.h
  dm: skip dm-stats work in alloc_io() unless needed
  dm: avoid needless dm_io access if all IO accounting is disabled
  dm: support turning off block-core's io stats accounting
  ...

drivers/md/dm-crypt.c
drivers/md/dm-integrity.c
drivers/md/dm-ioctl.c
drivers/md/dm-thin-metadata.c
drivers/md/dm-thin.c
drivers/md/dm.c

diff --combined drivers/md/dm-crypt.c
@@@ -1661,6 -1661,9 +1661,9 @@@ static void crypt_free_buffer_pages(str
   * In order to not degrade performance with excessive locking, we try
   * non-blocking allocations without a mutex first but on failure we fallback
   * to blocking allocations with a mutex.
+  *
+  * In order to reduce allocation overhead, we try to allocate compound pages in
+  * the first pass. If they are not available, we fall back to the mempool.
   */
  static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size)
  {
        struct bio *clone;
        unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
-       unsigned int i, len, remaining_size;
-       struct page *page;
+       unsigned int remaining_size;
+       unsigned int order = MAX_ORDER - 1;
  
  retry:
        if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
  
        remaining_size = size;
  
-       for (i = 0; i < nr_iovecs; i++) {
-               page = mempool_alloc(&cc->page_pool, gfp_mask);
-               if (!page) {
+       while (remaining_size) {
+               struct page *pages;
+               unsigned size_to_add;
+               unsigned remaining_order = __fls((remaining_size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+               order = min(order, remaining_order);
+               while (order > 0) {
+                       pages = alloc_pages(gfp_mask
+                               | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | __GFP_COMP,
+                               order);
+                       if (likely(pages != NULL))
+                               goto have_pages;
+                       order--;
+               }
+               pages = mempool_alloc(&cc->page_pool, gfp_mask);
+               if (!pages) {
                        crypt_free_buffer_pages(cc, clone);
                        bio_put(clone);
                        gfp_mask |= __GFP_DIRECT_RECLAIM;
+                       order = 0;
                        goto retry;
                }
  
-               len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size;
-               __bio_add_page(clone, page, len, 0);
-               remaining_size -= len;
+ have_pages:
+               size_to_add = min((unsigned)PAGE_SIZE << order, remaining_size);
+               __bio_add_page(clone, pages, size_to_add, 0);
+               remaining_size -= size_to_add;
        }
  
        /* Allocate space for integrity tags */
  
  static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
  {
-       struct bio_vec *bv;
-       struct bvec_iter_all iter_all;
+       struct folio_iter fi;
  
-       bio_for_each_segment_all(bv, clone, iter_all) {
-               BUG_ON(!bv->bv_page);
-               mempool_free(bv->bv_page, &cc->page_pool);
+       if (clone->bi_vcnt > 0) { /* bio_for_each_folio_all crashes with an empty bio */
+               bio_for_each_folio_all(fi, clone) {
+                       if (folio_test_large(fi.folio))
+                               folio_put(fi.folio);
+                       else
+                               mempool_free(&fi.folio->page, &cc->page_pool);
+               }
        }
  }
  
@@@ -2887,7 -2908,7 +2908,7 @@@ static int crypt_ctr_cipher_new(struct 
                ret = crypt_ctr_auth_cipher(cc, cipher_api);
                if (ret < 0) {
                        ti->error = "Invalid AEAD cipher spec";
-                       return -ENOMEM;
+                       return ret;
                }
        }
  
@@@ -3255,7 -3276,7 +3276,7 @@@ static int crypt_ctr(struct dm_target *
  
        cc->per_bio_data_size = ti->per_io_data_size =
                ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
 -                    ARCH_KMALLOC_MINALIGN);
 +                    ARCH_DMA_MINALIGN);
  
        ret = mempool_init(&cc->page_pool, BIO_MAX_VECS, crypt_page_alloc, crypt_page_free, cc);
        if (ret) {
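
A note on the compound-page change in crypt_alloc_buffer() above: the first pass
rounds the remaining byte count up to whole pages and uses the most significant
bit of that page count to cap the allocation order. A small worked sketch of just
that computation (illustrative only; the helper name is made up and 4 KiB pages
are assumed):

  #include <linux/bitops.h>
  #include <linux/minmax.h>
  #include <linux/mm.h>

  /* Illustrative only: mirrors the order selection in crypt_alloc_buffer(). */
  static unsigned int example_first_pass_order(unsigned int remaining_size)
  {
  	unsigned int order = MAX_ORDER - 1;
  	unsigned int remaining_order =
  		__fls((remaining_size + PAGE_SIZE - 1) >> PAGE_SHIFT);

  	/* e.g. remaining_size = 1 MiB -> 256 pages -> __fls(256) == 8 */
  	return min(order, remaining_order);
  }

If a given order cannot be satisfied, the inner loop above steps the order down;
at order 0 it falls back to the single-page mempool, and only a mempool failure
switches to blocking (__GFP_DIRECT_RECLAIM) retries.
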
diff --combined drivers/md/dm-integrity.c
  #define DEFAULT_BUFFER_SECTORS                128
  #define DEFAULT_JOURNAL_WATERMARK     50
  #define DEFAULT_SYNC_MSEC             10000
- #define DEFAULT_MAX_JOURNAL_SECTORS   131072
+ #define DEFAULT_MAX_JOURNAL_SECTORS   (IS_ENABLED(CONFIG_64BIT) ? 131072 : 8192)
  #define MIN_LOG2_INTERLEAVE_SECTORS   3
  #define MAX_LOG2_INTERLEAVE_SECTORS   31
  #define METADATA_WORKQUEUE_MAX_ACTIVE 16
- #define RECALC_SECTORS                        32768
+ #define RECALC_SECTORS                        (IS_ENABLED(CONFIG_64BIT) ? 32768 : 2048)
  #define RECALC_WRITE_SUPER            16
  #define BITMAP_BLOCK_SIZE             4096    /* don't change it */
  #define BITMAP_FLUSH_INTERVAL         (10 * HZ)
@@@ -251,8 -251,6 +251,6 @@@ struct dm_integrity_c 
  
        struct workqueue_struct *recalc_wq;
        struct work_struct recalc_work;
-       u8 *recalc_buffer;
-       u8 *recalc_tags;
  
        struct bio_list flush_bio_list;
  
@@@ -342,24 -340,9 +340,9 @@@ static struct kmem_cache *journal_io_ca
  #define JOURNAL_IO_MEMPOOL    32
  
  #ifdef DEBUG_PRINT
- #define DEBUG_print(x, ...)   printk(KERN_DEBUG x, ##__VA_ARGS__)
- static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...)
- {
-       va_list args;
-       va_start(args, msg);
-       vprintk(msg, args);
-       va_end(args);
-       if (len)
-               pr_cont(":");
-       while (len) {
-               pr_cont(" %02x", *bytes);
-               bytes++;
-               len--;
-       }
-       pr_cont("\n");
- }
- #define DEBUG_bytes(bytes, len, msg, ...)     __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__)
+ #define DEBUG_print(x, ...)                   printk(KERN_DEBUG x, ##__VA_ARGS__)
+ #define DEBUG_bytes(bytes, len, msg, ...)     printk(KERN_DEBUG msg "%s%*ph\n", ##__VA_ARGS__, \
+                                                      len ? ": " : "", len, bytes)
  #else
  #define DEBUG_print(x, ...)                   do { } while (0)
  #define DEBUG_bytes(bytes, len, msg, ...)     do { } while (0)
@@@ -2661,6 -2644,9 +2644,9 @@@ static void recalc_write_super(struct d
  static void integrity_recalc(struct work_struct *w)
  {
        struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work);
+       size_t recalc_tags_size;
+       u8 *recalc_buffer = NULL;
+       u8 *recalc_tags = NULL;
        struct dm_integrity_range range;
        struct dm_io_request io_req;
        struct dm_io_region io_loc;
        unsigned int i;
        int r;
        unsigned int super_counter = 0;
+       unsigned recalc_sectors = RECALC_SECTORS;
+ retry:
+       recalc_buffer = __vmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO);
+       if (!recalc_buffer) {
+ oom:
+               recalc_sectors >>= 1;
+               if (recalc_sectors >= 1U << ic->sb->log2_sectors_per_block)
+                       goto retry;
+               DMCRIT("out of memory for recalculate buffer - recalculation disabled");
+               goto free_ret;
+       }
+       recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
+       if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size)
+               recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size;
+       recalc_tags = kvmalloc(recalc_tags_size, GFP_NOIO);
+       if (!recalc_tags) {
+               vfree(recalc_buffer);
+               goto oom;
+       }
  
        DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector));
  
@@@ -2693,7 -2699,7 +2699,7 @@@ next_chunk
        }
  
        get_area_and_offset(ic, range.logical_sector, &area, &offset);
-       range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector);
+       range.n_sectors = min((sector_t)recalc_sectors, ic->provided_data_sectors - range.logical_sector);
        if (!ic->meta_dev)
                range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned int)offset);
  
  
        io_req.bi_opf = REQ_OP_READ;
        io_req.mem.type = DM_IO_VMA;
-       io_req.mem.ptr.addr = ic->recalc_buffer;
+       io_req.mem.ptr.addr = recalc_buffer;
        io_req.notify.fn = NULL;
        io_req.client = ic->io;
        io_loc.bdev = ic->dev->bdev;
                goto err;
        }
  
-       t = ic->recalc_tags;
+       t = recalc_tags;
        for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
-               integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
+               integrity_sector_checksum(ic, logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t);
                t += ic->tag_size;
        }
  
        metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
  
-       r = dm_integrity_rw_tag(ic, ic->recalc_tags, &metadata_block, &metadata_offset, t - ic->recalc_tags, TAG_WRITE);
+       r = dm_integrity_rw_tag(ic, recalc_tags, &metadata_block, &metadata_offset, t - recalc_tags, TAG_WRITE);
        if (unlikely(r)) {
                dm_integrity_io_error(ic, "writing tags", r);
                goto err;
@@@ -2784,12 -2790,16 +2790,16 @@@ advance_and_next
  
  err:
        remove_range(ic, &range);
-       return;
+       goto free_ret;
  
  unlock_ret:
        spin_unlock_irq(&ic->endio_wait.lock);
  
        recalc_write_super(ic);
+ free_ret:
+       vfree(recalc_buffer);
+       kvfree(recalc_tags);
  }
  
  static void bitmap_block_work(struct work_struct *w)
@@@ -4268,10 -4278,10 +4278,10 @@@ static int dm_integrity_ctr(struct dm_t
        }
  
        /*
 -       * If this workqueue were percpu, it would cause bio reordering
 +       * If this workqueue weren't ordered, it would cause bio reordering
         * and reduced performance.
         */
 -      ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
 +      ic->wait_wq = alloc_ordered_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM);
        if (!ic->wait_wq) {
                ti->error = "Cannot allocate workqueue";
                r = -ENOMEM;
@@@ -4454,8 -4464,6 +4464,6 @@@ try_smaller_buffer
        }
  
        if (ic->internal_hash) {
-               size_t recalc_tags_size;
                ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1);
                if (!ic->recalc_wq) {
                        ti->error = "Cannot allocate workqueue";
                        goto bad;
                }
                INIT_WORK(&ic->recalc_work, integrity_recalc);
-               ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT);
-               if (!ic->recalc_buffer) {
-                       ti->error = "Cannot allocate buffer for recalculating";
-                       r = -ENOMEM;
-                       goto bad;
-               }
-               recalc_tags_size = (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size;
-               if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size)
-                       recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size;
-               ic->recalc_tags = kvmalloc(recalc_tags_size, GFP_KERNEL);
-               if (!ic->recalc_tags) {
-                       ti->error = "Cannot allocate tags for recalculating";
-                       r = -ENOMEM;
-                       goto bad;
-               }
        } else {
                if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
                        ti->error = "Recalculate can only be specified with internal_hash";
@@@ -4621,8 -4614,6 +4614,6 @@@ static void dm_integrity_dtr(struct dm_
                destroy_workqueue(ic->writer_wq);
        if (ic->recalc_wq)
                destroy_workqueue(ic->recalc_wq);
-       vfree(ic->recalc_buffer);
-       kvfree(ic->recalc_tags);
        kvfree(ic->bbs);
        if (ic->bufio)
                dm_bufio_client_destroy(ic->bufio);
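
The DEBUG_bytes() rework above leans on printk's %*ph extension, which renders a
buffer of up to 64 bytes as space-separated hex and takes the length as an int
field-width argument. A tiny illustration in the same spirit, with a made-up
buffer and message:

  #include <linux/printk.h>
  #include <linux/types.h>

  /* Illustrative only: %*ph usage as in the new DEBUG_bytes() macro. */
  static void example_hexdump(void)
  {
  	u8 csum[4] = { 0xde, 0xad, 0xbe, 0xef };

  	/* Emits: "stored csum: de ad be ef" at KERN_DEBUG level. */
  	printk(KERN_DEBUG "stored csum: %*ph\n", (int)sizeof(csum), csum);
  }

For buffers larger than 64 bytes, print_hex_dump() remains the usual tool.
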
diff --combined drivers/md/dm-ioctl.c
@@@ -767,7 -767,14 +767,14 @@@ static int get_target_version(struct fi
  static int check_name(const char *name)
  {
        if (strchr(name, '/')) {
-               DMERR("invalid device name");
+               DMERR("device name cannot contain '/'");
+               return -EINVAL;
+       }
+       if (strcmp(name, DM_CONTROL_NODE) == 0 ||
+           strcmp(name, ".") == 0 ||
+           strcmp(name, "..") == 0) {
+               DMERR("device name cannot be \"%s\", \".\", or \"..\"", DM_CONTROL_NODE);
                return -EINVAL;
        }
  
@@@ -1168,10 -1175,13 +1175,10 @@@ static int do_resume(struct dm_ioctl *p
        /* Do we need to load a new map ? */
        if (new_map) {
                sector_t old_size, new_size;
 -              int srcu_idx;
  
                /* Suspend if it isn't already suspended */
 -              old_map = dm_get_live_table(md, &srcu_idx);
 -              if ((param->flags & DM_SKIP_LOCKFS_FLAG) || !old_map)
 +              if (param->flags & DM_SKIP_LOCKFS_FLAG)
                        suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
 -              dm_put_live_table(md, srcu_idx);
                if (param->flags & DM_NOFLUSH_FLAG)
                        suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
                if (!dm_suspended_md(md))
@@@ -1388,16 -1398,38 +1395,38 @@@ static inline blk_mode_t get_mode(struc
        return mode;
  }
  
- static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
+ static int next_target(struct dm_target_spec *last, uint32_t next, const char *end,
                       struct dm_target_spec **spec, char **target_params)
  {
-       *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
-       *target_params = (char *) (*spec + 1);
+       static_assert(__alignof__(struct dm_target_spec) <= 8,
+               "struct dm_target_spec must not require more than 8-byte alignment");
+       /*
+        * Number of bytes remaining, starting with last. This is always
+        * sizeof(struct dm_target_spec) or more, as otherwise *last was
+        * out of bounds already.
+        */
+       size_t remaining = end - (char *)last;
+       /*
+        * There must be room for both the next target spec and the
+        * NUL-terminator of the target itself.
+        */
+       if (remaining - sizeof(struct dm_target_spec) <= next) {
+               DMERR("Target spec extends beyond end of parameters");
+               return -EINVAL;
+       }
  
-       if (*spec < (last + 1))
+       if (next % __alignof__(struct dm_target_spec)) {
+               DMERR("Next dm_target_spec (offset %u) is not %zu-byte aligned",
+                     next, __alignof__(struct dm_target_spec));
                return -EINVAL;
+       }
+       *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
+       *target_params = (char *) (*spec + 1);
  
-       return invalid_str(*target_params, end);
+       return 0;
  }
  
  static int populate_table(struct dm_table *table,
        unsigned int i = 0;
        struct dm_target_spec *spec = (struct dm_target_spec *) param;
        uint32_t next = param->data_start;
-       void *end = (void *) param + param_size;
+       const char *const end = (const char *) param + param_size;
        char *target_params;
+       size_t min_size = sizeof(struct dm_ioctl);
  
        if (!param->target_count) {
                DMERR("%s: no targets specified", __func__);
        }
  
        for (i = 0; i < param->target_count; i++) {
+               const char *nul_terminator;
+               if (next < min_size) {
+                       DMERR("%s: next target spec (offset %u) overlaps %s",
+                             __func__, next, i ? "previous target" : "'struct dm_ioctl'");
+                       return -EINVAL;
+               }
  
                r = next_target(spec, next, end, &spec, &target_params);
                if (r) {
                        return r;
                }
  
+               nul_terminator = memchr(target_params, 0, (size_t)(end - target_params));
+               if (nul_terminator == NULL) {
+                       DMERR("%s: target parameters not NUL-terminated", __func__);
+                       return -EINVAL;
+               }
+               /* Add 1 for NUL terminator */
+               min_size = (size_t)(nul_terminator - (const char *)spec) + 1;
                r = dm_table_add_target(table, spec->target_type,
                                        (sector_t) spec->sector_start,
                                        (sector_t) spec->length,
@@@ -1830,30 -1879,36 +1876,36 @@@ static ioctl_fn lookup_ioctl(unsigned i
   * As well as checking the version compatibility this always
   * copies the kernel interface version out.
   */
- static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
+ static int check_version(unsigned int cmd, struct dm_ioctl __user *user,
+                        struct dm_ioctl *kernel_params)
  {
-       uint32_t version[3];
        int r = 0;
  
-       if (copy_from_user(version, user->version, sizeof(version)))
+       /* Make certain version is first member of dm_ioctl struct */
+       BUILD_BUG_ON(offsetof(struct dm_ioctl, version) != 0);
+       if (copy_from_user(kernel_params->version, user->version, sizeof(kernel_params->version)))
                return -EFAULT;
  
-       if ((version[0] != DM_VERSION_MAJOR) ||
-           (version[1] > DM_VERSION_MINOR)) {
+       if ((kernel_params->version[0] != DM_VERSION_MAJOR) ||
+           (kernel_params->version[1] > DM_VERSION_MINOR)) {
                DMERR("ioctl interface mismatch: kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
                      DM_VERSION_MAJOR, DM_VERSION_MINOR,
                      DM_VERSION_PATCHLEVEL,
-                     version[0], version[1], version[2], cmd);
+                     kernel_params->version[0],
+                     kernel_params->version[1],
+                     kernel_params->version[2],
+                     cmd);
                r = -EINVAL;
        }
  
        /*
         * Fill in the kernel version.
         */
-       version[0] = DM_VERSION_MAJOR;
-       version[1] = DM_VERSION_MINOR;
-       version[2] = DM_VERSION_PATCHLEVEL;
-       if (copy_to_user(user->version, version, sizeof(version)))
+       kernel_params->version[0] = DM_VERSION_MAJOR;
+       kernel_params->version[1] = DM_VERSION_MINOR;
+       kernel_params->version[2] = DM_VERSION_PATCHLEVEL;
+       if (copy_to_user(user->version, kernel_params->version, sizeof(kernel_params->version)))
                return -EFAULT;
  
        return r;
@@@ -1877,9 -1932,11 +1929,11 @@@ static int copy_params(struct dm_ioctl 
        struct dm_ioctl *dmi;
        int secure_data;
        const size_t minimum_data_size = offsetof(struct dm_ioctl, data);
-       unsigned int noio_flag;
  
-       if (copy_from_user(param_kernel, user, minimum_data_size))
+       /* check_version() already copied version from userspace, avoid TOCTOU */
+       if (copy_from_user((char *)param_kernel + sizeof(param_kernel->version),
+                          (char __user *)user + sizeof(param_kernel->version),
+                          minimum_data_size - sizeof(param_kernel->version)))
                return -EFAULT;
  
        if (param_kernel->data_size < minimum_data_size) {
         * Use kmalloc() rather than vmalloc() when we can.
         */
        dmi = NULL;
-       noio_flag = memalloc_noio_save();
-       dmi = kvmalloc(param_kernel->data_size, GFP_KERNEL | __GFP_HIGH);
-       memalloc_noio_restore(noio_flag);
+       dmi = kvmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH);
  
        if (!dmi) {
                if (secure_data && clear_user(user, param_kernel->data_size))
@@@ -1991,7 -2046,7 +2043,7 @@@ static int ctl_ioctl(struct file *file
         * Check the interface version passed in.  This also
         * writes out the kernel's interface version.
         */
-       r = check_version(cmd, user);
+       r = check_version(cmd, user, &param_kernel);
        if (r)
                return r;
  
diff --combined drivers/md/dm-thin-metadata.c
@@@ -603,6 -603,8 +603,8 @@@ static int __format_metadata(struct dm_
        r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
                                 &pmd->tm, &pmd->metadata_sm);
        if (r < 0) {
+               pmd->tm = NULL;
+               pmd->metadata_sm = NULL;
                DMERR("tm_create_with_sm failed");
                return r;
        }
        if (IS_ERR(pmd->data_sm)) {
                DMERR("sm_disk_create failed");
                r = PTR_ERR(pmd->data_sm);
+               pmd->data_sm = NULL;
                goto bad_cleanup_tm;
        }
  
  
  bad_cleanup_nb_tm:
        dm_tm_destroy(pmd->nb_tm);
+       pmd->nb_tm = NULL;
  bad_cleanup_data_sm:
        dm_sm_destroy(pmd->data_sm);
+       pmd->data_sm = NULL;
  bad_cleanup_tm:
        dm_tm_destroy(pmd->tm);
+       pmd->tm = NULL;
        dm_sm_destroy(pmd->metadata_sm);
+       pmd->metadata_sm = NULL;
  
        return r;
  }
@@@ -711,6 -718,8 +718,8 @@@ static int __open_metadata(struct dm_po
                               sizeof(disk_super->metadata_space_map_root),
                               &pmd->tm, &pmd->metadata_sm);
        if (r < 0) {
+               pmd->tm = NULL;
+               pmd->metadata_sm = NULL;
                DMERR("tm_open_with_sm failed");
                goto bad_unlock_sblock;
        }
        if (IS_ERR(pmd->data_sm)) {
                DMERR("sm_disk_open failed");
                r = PTR_ERR(pmd->data_sm);
+               pmd->data_sm = NULL;
                goto bad_cleanup_tm;
        }
  
  
  bad_cleanup_data_sm:
        dm_sm_destroy(pmd->data_sm);
+       pmd->data_sm = NULL;
  bad_cleanup_tm:
        dm_tm_destroy(pmd->tm);
+       pmd->tm = NULL;
        dm_sm_destroy(pmd->metadata_sm);
+       pmd->metadata_sm = NULL;
  bad_unlock_sblock:
        dm_bm_unlock(sblock);
  
@@@ -795,9 -808,13 +808,13 @@@ static void __destroy_persistent_data_o
                                              bool destroy_bm)
  {
        dm_sm_destroy(pmd->data_sm);
+       pmd->data_sm = NULL;
        dm_sm_destroy(pmd->metadata_sm);
+       pmd->metadata_sm = NULL;
        dm_tm_destroy(pmd->nb_tm);
+       pmd->nb_tm = NULL;
        dm_tm_destroy(pmd->tm);
+       pmd->tm = NULL;
        if (destroy_bm)
                dm_block_manager_destroy(pmd->bm);
  }
@@@ -1005,8 -1022,7 +1022,7 @@@ int dm_pool_metadata_close(struct dm_po
                               __func__, r);
        }
        pmd_write_unlock(pmd);
-       if (!pmd->fail_io)
-               __destroy_persistent_data_objects(pmd, true);
+       __destroy_persistent_data_objects(pmd, true);
  
        kfree(pmd);
        return 0;
@@@ -1756,15 -1772,13 +1772,15 @@@ int dm_thin_remove_range(struct dm_thin
  
  int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
  {
 -      int r;
 +      int r = -EINVAL;
        uint32_t ref_count;
  
        down_read(&pmd->root_lock);
 -      r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
 -      if (!r)
 -              *result = (ref_count > 1);
 +      if (!pmd->fail_io) {
 +              r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
 +              if (!r)
 +                      *result = (ref_count > 1);
 +      }
        up_read(&pmd->root_lock);
  
        return r;
  
  int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
  {
 -      int r = 0;
 +      int r = -EINVAL;
  
        pmd_write_lock(pmd);
 -      r = dm_sm_inc_blocks(pmd->data_sm, b, e);
 +      if (!pmd->fail_io)
 +              r = dm_sm_inc_blocks(pmd->data_sm, b, e);
        pmd_write_unlock(pmd);
  
        return r;
  
  int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
  {
 -      int r = 0;
 +      int r = -EINVAL;
  
        pmd_write_lock(pmd);
 -      r = dm_sm_dec_blocks(pmd->data_sm, b, e);
 +      if (!pmd->fail_io)
 +              r = dm_sm_dec_blocks(pmd->data_sm, b, e);
        pmd_write_unlock(pmd);
  
        return r;
@@@ -1881,53 -1893,29 +1897,29 @@@ static void __set_abort_with_changes_fl
  int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
  {
        int r = -EINVAL;
-       struct dm_block_manager *old_bm = NULL, *new_bm = NULL;
  
        /* fail_io is double-checked with pmd->root_lock held below */
        if (unlikely(pmd->fail_io))
                return r;
  
-       /*
-        * Replacement block manager (new_bm) is created and old_bm destroyed outside of
-        * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
-        * shrinker associated with the block manager's bufio client vs pmd root_lock).
-        * - must take shrinker_rwsem without holding pmd->root_lock
-        */
-       new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
-                                        THIN_MAX_CONCURRENT_LOCKS);
        pmd_write_lock(pmd);
        if (pmd->fail_io) {
                pmd_write_unlock(pmd);
-               goto out;
+               return r;
        }
        __set_abort_with_changes_flags(pmd);
+       /* destroy data_sm/metadata_sm/nb_tm/tm */
        __destroy_persistent_data_objects(pmd, false);
-       old_bm = pmd->bm;
-       if (IS_ERR(new_bm)) {
-               DMERR("could not create block manager during abort");
-               pmd->bm = NULL;
-               r = PTR_ERR(new_bm);
-               goto out_unlock;
-       }
  
-       pmd->bm = new_bm;
+       /* reset bm */
+       dm_block_manager_reset(pmd->bm);
+       /* rebuild data_sm/metadata_sm/nb_tm/tm */
        r = __open_or_format_metadata(pmd, false);
-       if (r) {
-               pmd->bm = NULL;
-               goto out_unlock;
-       }
-       new_bm = NULL;
- out_unlock:
        if (r)
                pmd->fail_io = true;
        pmd_write_unlock(pmd);
-       dm_block_manager_destroy(old_bm);
- out:
-       if (new_bm && !IS_ERR(new_bm))
-               dm_block_manager_destroy(new_bm);
        return r;
  }
  
diff --combined drivers/md/dm-thin.c
@@@ -401,7 -401,8 +401,7 @@@ static int issue_discard(struct discard
        sector_t s = block_to_sectors(tc->pool, data_b);
        sector_t len = block_to_sectors(tc->pool, data_e - data_b);
  
 -      return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOWAIT,
 -                                    &op->bio);
 +      return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOIO, &op->bio);
  }
  
  static void end_discard(struct discard_op *op, int r)
@@@ -2527,16 -2528,11 +2527,11 @@@ static void noflush_work(struct thin_c 
  
  /*----------------------------------------------------------------*/
  
- static bool passdown_enabled(struct pool_c *pt)
- {
-       return pt->adjusted_pf.discard_passdown;
- }
  static void set_discard_callbacks(struct pool *pool)
  {
        struct pool_c *pt = pool->ti->private;
  
-       if (passdown_enabled(pt)) {
+       if (pt->adjusted_pf.discard_passdown) {
                pool->process_discard_cell = process_discard_cell_passdown;
                pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
                pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
@@@ -2845,7 -2841,7 +2840,7 @@@ static bool is_factor(sector_t block_si
   * If discard_passdown was enabled verify that the data device
   * supports discards.  Disable discard_passdown if not.
   */
- static void disable_passdown_if_not_supported(struct pool_c *pt)
+ static void disable_discard_passdown_if_not_supported(struct pool_c *pt)
  {
        struct pool *pool = pt->pool;
        struct block_device *data_bdev = pt->data_dev->bdev;
@@@ -3446,7 -3442,6 +3441,6 @@@ out_unlock
  
  static int pool_map(struct dm_target *ti, struct bio *bio)
  {
-       int r;
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;
  
         */
        spin_lock_irq(&pool->lock);
        bio_set_dev(bio, pt->data_dev->bdev);
-       r = DM_MAPIO_REMAPPED;
        spin_unlock_irq(&pool->lock);
  
-       return r;
+       return DM_MAPIO_REMAPPED;
  }
  
  static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
@@@ -4099,21 -4093,22 +4092,22 @@@ static void pool_io_hints(struct dm_tar
         * They get transferred to the live pool in bind_control_target()
         * called from pool_preresume().
         */
-       if (!pt->adjusted_pf.discard_enabled) {
+       if (pt->adjusted_pf.discard_enabled) {
+               disable_discard_passdown_if_not_supported(pt);
+               if (!pt->adjusted_pf.discard_passdown)
+                       limits->max_discard_sectors = 0;
+               /*
+                * The pool uses the same discard limits as the underlying data
+                * device.  DM core has already set this up.
+                */
+       } else {
                /*
                 * Must explicitly disallow stacking discard limits otherwise the
                 * block layer will stack them if pool's data device has support.
                 */
                limits->discard_granularity = 0;
-               return;
        }
-       disable_passdown_if_not_supported(pt);
-       /*
-        * The pool uses the same discard limits as the underlying data
-        * device.  DM core has already set this up.
-        */
  }
  
  static struct target_type pool_target = {
@@@ -4497,11 -4492,10 +4491,10 @@@ static void thin_io_hints(struct dm_tar
        struct thin_c *tc = ti->private;
        struct pool *pool = tc->pool;
  
-       if (!pool->pf.discard_enabled)
-               return;
-       limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-       limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
+       if (pool->pf.discard_enabled) {
+               limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+               limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
+       }
  }
  
  static struct target_type thin_target = {
diff --combined drivers/md/dm.c
@@@ -207,7 -207,7 +207,7 @@@ static int __init local_init(void
        if (r)
                return r;
  
 -      deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 +      deferred_remove_workqueue = alloc_ordered_workqueue("kdmremove", 0);
        if (!deferred_remove_workqueue) {
                r = -ENOMEM;
                goto out_uevent_exit;
@@@ -487,48 -487,50 +487,50 @@@ u64 dm_start_time_ns_from_clone(struct 
  }
  EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
  
- static bool bio_is_flush_with_data(struct bio *bio)
+ static inline bool bio_is_flush_with_data(struct bio *bio)
  {
        return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
  }
  
- static void dm_io_acct(struct dm_io *io, bool end)
+ static inline unsigned int dm_io_sectors(struct dm_io *io, struct bio *bio)
  {
-       struct dm_stats_aux *stats_aux = &io->stats_aux;
-       unsigned long start_time = io->start_time;
-       struct mapped_device *md = io->md;
-       struct bio *bio = io->orig_bio;
-       unsigned int sectors;
        /*
         * If REQ_PREFLUSH set, don't account payload, it will be
         * submitted (and accounted) after this flush completes.
         */
        if (bio_is_flush_with_data(bio))
-               sectors = 0;
-       else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT))))
-               sectors = bio_sectors(bio);
-       else
-               sectors = io->sectors;
+               return 0;
+       if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT)))
+               return io->sectors;
+       return bio_sectors(bio);
+ }
  
-       if (!end)
-               bdev_start_io_acct(bio->bi_bdev, bio_op(bio), start_time);
-       else
-               bdev_end_io_acct(bio->bi_bdev, bio_op(bio), sectors,
-                                start_time);
+ static void dm_io_acct(struct dm_io *io, bool end)
+ {
+       struct bio *bio = io->orig_bio;
+       if (dm_io_flagged(io, DM_IO_BLK_STAT)) {
+               if (!end)
+                       bdev_start_io_acct(bio->bi_bdev, bio_op(bio),
+                                          io->start_time);
+               else
+                       bdev_end_io_acct(bio->bi_bdev, bio_op(bio),
+                                        dm_io_sectors(io, bio),
+                                        io->start_time);
+       }
  
        if (static_branch_unlikely(&stats_enabled) &&
-           unlikely(dm_stats_used(&md->stats))) {
+           unlikely(dm_stats_used(&io->md->stats))) {
                sector_t sector;
  
-               if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT)))
-                       sector = bio->bi_iter.bi_sector;
-               else
+               if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT)))
                        sector = bio_end_sector(bio) - io->sector_offset;
+               else
+                       sector = bio->bi_iter.bi_sector;
  
-               dm_stats_account_io(&md->stats, bio_data_dir(bio),
-                                   sector, sectors,
-                                   end, start_time, stats_aux);
+               dm_stats_account_io(&io->md->stats, bio_data_dir(bio),
+                                   sector, dm_io_sectors(io, bio),
+                                   end, io->start_time, &io->stats_aux);
        }
  }
  
@@@ -592,8 -594,11 +594,11 @@@ static struct dm_io *alloc_io(struct ma
        spin_lock_init(&io->lock);
        io->start_time = jiffies;
        io->flags = 0;
+       if (blk_queue_io_stat(md->queue))
+               dm_io_set_flag(io, DM_IO_BLK_STAT);
  
-       if (static_branch_unlikely(&stats_enabled))
+       if (static_branch_unlikely(&stats_enabled) &&
+           unlikely(dm_stats_used(&md->stats)))
                dm_stats_record_start(&md->stats, &io->stats_aux);
  
        return io;
@@@ -1172,8 -1177,7 +1177,8 @@@ static inline sector_t max_io_len_targe
  }
  
  static sector_t __max_io_len(struct dm_target *ti, sector_t sector,
 -                           unsigned int max_granularity)
 +                           unsigned int max_granularity,
 +                           unsigned int max_sectors)
  {
        sector_t target_offset = dm_target_offset(ti, sector);
        sector_t len = max_io_len_target_boundary(ti, target_offset);
        if (!max_granularity)
                return len;
        return min_t(sector_t, len,
 -              min(queue_max_sectors(ti->table->md->queue),
 +              min(max_sectors ? : queue_max_sectors(ti->table->md->queue),
                    blk_chunk_sectors_left(target_offset, max_granularity)));
  }
  
  static inline sector_t max_io_len(struct dm_target *ti, sector_t sector)
  {
 -      return __max_io_len(ti, sector, ti->max_io_len);
 +      return __max_io_len(ti, sector, ti->max_io_len, 0);
  }
  
  int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
@@@ -1582,13 -1586,12 +1587,13 @@@ static void __send_empty_flush(struct c
  
  static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
                                        unsigned int num_bios,
 -                                      unsigned int max_granularity)
 +                                      unsigned int max_granularity,
 +                                      unsigned int max_sectors)
  {
        unsigned int len, bios;
  
        len = min_t(sector_t, ci->sector_count,
 -                  __max_io_len(ti, ci->sector, max_granularity));
 +                  __max_io_len(ti, ci->sector, max_granularity, max_sectors));
  
        atomic_add(num_bios, &ci->io->io_count);
        bios = __send_duplicate_bios(ci, ti, num_bios, &len);
@@@ -1625,27 -1628,23 +1630,27 @@@ static blk_status_t __process_abnormal_
  {
        unsigned int num_bios = 0;
        unsigned int max_granularity = 0;
 +      unsigned int max_sectors = 0;
        struct queue_limits *limits = dm_get_queue_limits(ti->table->md);
  
        switch (bio_op(ci->bio)) {
        case REQ_OP_DISCARD:
                num_bios = ti->num_discard_bios;
 +              max_sectors = limits->max_discard_sectors;
                if (ti->max_discard_granularity)
 -                      max_granularity = limits->max_discard_sectors;
 +                      max_granularity = max_sectors;
                break;
        case REQ_OP_SECURE_ERASE:
                num_bios = ti->num_secure_erase_bios;
 +              max_sectors = limits->max_secure_erase_sectors;
                if (ti->max_secure_erase_granularity)
 -                      max_granularity = limits->max_secure_erase_sectors;
 +                      max_granularity = max_sectors;
                break;
        case REQ_OP_WRITE_ZEROES:
                num_bios = ti->num_write_zeroes_bios;
 +              max_sectors = limits->max_write_zeroes_sectors;
                if (ti->max_write_zeroes_granularity)
 -                      max_granularity = limits->max_write_zeroes_sectors;
 +                      max_granularity = max_sectors;
                break;
        default:
                break;
        if (unlikely(!num_bios))
                return BLK_STS_NOTSUPP;
  
 -      __send_changing_extent_only(ci, ti, num_bios, max_granularity);
 +      __send_changing_extent_only(ci, ti, num_bios,
 +                                  max_granularity, max_sectors);
        return BLK_STS_OK;
  }
  
@@@ -2348,6 -2346,7 +2353,7 @@@ int dm_setup_md_queue(struct mapped_dev
                break;
        case DM_TYPE_BIO_BASED:
        case DM_TYPE_DAX_BIO_BASED:
+               blk_queue_flag_set(QUEUE_FLAG_IO_STAT, md->queue);
                break;
        case DM_TYPE_NONE:
                WARN_ON_ONCE(true);
@@@ -2815,10 -2814,6 +2821,10 @@@ retry
        }
  
        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
 +      if (!map) {
 +              /* avoid deadlock with fs/namespace.c:do_mount() */
 +              suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
 +      }
  
        r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
        if (r)
@@@ -3143,8 -3138,6 +3149,8 @@@ struct dm_pr 
        bool    fail_early;
        int     ret;
        enum pr_type type;
 +      struct pr_keys *read_keys;
 +      struct pr_held_reservation *rsv;
  };
  
  static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
        return r;
  }
  
 +static int __dm_pr_read_keys(struct dm_target *ti, struct dm_dev *dev,
 +                           sector_t start, sector_t len, void *data)
 +{
 +      struct dm_pr *pr = data;
 +      const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
 +
 +      if (!ops || !ops->pr_read_keys) {
 +              pr->ret = -EOPNOTSUPP;
 +              return -1;
 +      }
 +
 +      pr->ret = ops->pr_read_keys(dev->bdev, pr->read_keys);
 +      if (!pr->ret)
 +              return -1;
 +
 +      return 0;
 +}
 +
 +static int dm_pr_read_keys(struct block_device *bdev, struct pr_keys *keys)
 +{
 +      struct dm_pr pr = {
 +              .read_keys = keys,
 +      };
 +      int ret;
 +
 +      ret = dm_call_pr(bdev, __dm_pr_read_keys, &pr);
 +      if (ret)
 +              return ret;
 +
 +      return pr.ret;
 +}
 +
 +static int __dm_pr_read_reservation(struct dm_target *ti, struct dm_dev *dev,
 +                                  sector_t start, sector_t len, void *data)
 +{
 +      struct dm_pr *pr = data;
 +      const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
 +
 +      if (!ops || !ops->pr_read_reservation) {
 +              pr->ret = -EOPNOTSUPP;
 +              return -1;
 +      }
 +
 +      pr->ret = ops->pr_read_reservation(dev->bdev, pr->rsv);
 +      if (!pr->ret)
 +              return -1;
 +
 +      return 0;
 +}
 +
 +static int dm_pr_read_reservation(struct block_device *bdev,
 +                                struct pr_held_reservation *rsv)
 +{
 +      struct dm_pr pr = {
 +              .rsv = rsv,
 +      };
 +      int ret;
 +
 +      ret = dm_call_pr(bdev, __dm_pr_read_reservation, &pr);
 +      if (ret)
 +              return ret;
 +
 +      return pr.ret;
 +}
 +
  static const struct pr_ops dm_pr_ops = {
        .pr_register    = dm_pr_register,
        .pr_reserve     = dm_pr_reserve,
        .pr_release     = dm_pr_release,
        .pr_preempt     = dm_pr_preempt,
        .pr_clear       = dm_pr_clear,
 +      .pr_read_keys   = dm_pr_read_keys,
 +      .pr_read_reservation = dm_pr_read_reservation,
  };
  
  static const struct block_device_operations dm_blk_dops = {