* In order to not degrade performance with excessive locking, we try
* non-blocking allocations without a mutex first but on failure we fallback
* to blocking allocations with a mutex.
+ *
+ * In order to reduce allocation overhead, we try to allocate compound pages in
+ * the first pass. If they are not available, we fall back to the mempool.
*/
static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size)
{
struct bio *clone;
unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
- unsigned int i, len, remaining_size;
- struct page *page;
+ unsigned int remaining_size;
+ unsigned int order = MAX_ORDER - 1;
retry:
if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
remaining_size = size;
- for (i = 0; i < nr_iovecs; i++) {
- page = mempool_alloc(&cc->page_pool, gfp_mask);
- if (!page) {
+ while (remaining_size) {
+ struct page *pages;
+ unsigned size_to_add;
+ unsigned remaining_order = __fls((remaining_size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+ order = min(order, remaining_order);
+
+ while (order > 0) {
+ pages = alloc_pages(gfp_mask
+ | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | __GFP_COMP,
+ order);
+ if (likely(pages != NULL))
+ goto have_pages;
+ order--;
+ }
+
+ pages = mempool_alloc(&cc->page_pool, gfp_mask);
+ if (!pages) {
crypt_free_buffer_pages(cc, clone);
bio_put(clone);
gfp_mask |= __GFP_DIRECT_RECLAIM;
+ order = 0;
goto retry;
}
- len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size;
-
- __bio_add_page(clone, page, len, 0);
- remaining_size -= len;
+ have_pages:
+ size_to_add = min((unsigned)PAGE_SIZE << order, remaining_size);
+ __bio_add_page(clone, pages, size_to_add, 0);
+ remaining_size -= size_to_add;
}
/* Allocate space for integrity tags */
static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
{
- struct bio_vec *bv;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
- bio_for_each_segment_all(bv, clone, iter_all) {
- BUG_ON(!bv->bv_page);
- mempool_free(bv->bv_page, &cc->page_pool);
+ if (clone->bi_vcnt > 0) { /* bio_for_each_folio_all crashes with an empty bio */
+ bio_for_each_folio_all(fi, clone) {
+ if (folio_test_large(fi.folio))
+ folio_put(fi.folio);
+ else
+ mempool_free(&fi.folio->page, &cc->page_pool);
+ }
}
}
ret = crypt_ctr_auth_cipher(cc, cipher_api);
if (ret < 0) {
ti->error = "Invalid AEAD cipher spec";
- return -ENOMEM;
+ return ret;
}
}
cc->per_bio_data_size = ti->per_io_data_size =
ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
- ARCH_KMALLOC_MINALIGN);
+ ARCH_DMA_MINALIGN);
ret = mempool_init(&cc->page_pool, BIO_MAX_VECS, crypt_page_alloc, crypt_page_free, cc);
if (ret) {
#define DEFAULT_BUFFER_SECTORS 128
#define DEFAULT_JOURNAL_WATERMARK 50
#define DEFAULT_SYNC_MSEC 10000
- #define DEFAULT_MAX_JOURNAL_SECTORS 131072
+ #define DEFAULT_MAX_JOURNAL_SECTORS (IS_ENABLED(CONFIG_64BIT) ? 131072 : 8192)
#define MIN_LOG2_INTERLEAVE_SECTORS 3
#define MAX_LOG2_INTERLEAVE_SECTORS 31
#define METADATA_WORKQUEUE_MAX_ACTIVE 16
- #define RECALC_SECTORS 32768
+ #define RECALC_SECTORS (IS_ENABLED(CONFIG_64BIT) ? 32768 : 2048)
#define RECALC_WRITE_SUPER 16
#define BITMAP_BLOCK_SIZE 4096 /* don't change it */
#define BITMAP_FLUSH_INTERVAL (10 * HZ)
struct workqueue_struct *recalc_wq;
struct work_struct recalc_work;
- u8 *recalc_buffer;
- u8 *recalc_tags;
struct bio_list flush_bio_list;
#define JOURNAL_IO_MEMPOOL 32
#ifdef DEBUG_PRINT
- #define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__)
- static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...)
- {
- va_list args;
-
- va_start(args, msg);
- vprintk(msg, args);
- va_end(args);
- if (len)
- pr_cont(":");
- while (len) {
- pr_cont(" %02x", *bytes);
- bytes++;
- len--;
- }
- pr_cont("\n");
- }
- #define DEBUG_bytes(bytes, len, msg, ...) __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__)
+ #define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__)
+ #define DEBUG_bytes(bytes, len, msg, ...) printk(KERN_DEBUG msg "%s%*ph\n", ##__VA_ARGS__, \
+ len ? ": " : "", len, bytes)
#else
#define DEBUG_print(x, ...) do { } while (0)
#define DEBUG_bytes(bytes, len, msg, ...) do { } while (0)
static void integrity_recalc(struct work_struct *w)
{
struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work);
+ size_t recalc_tags_size;
+ u8 *recalc_buffer = NULL;
+ u8 *recalc_tags = NULL;
struct dm_integrity_range range;
struct dm_io_request io_req;
struct dm_io_region io_loc;
unsigned int i;
int r;
unsigned int super_counter = 0;
+ unsigned recalc_sectors = RECALC_SECTORS;
+
+ retry:
+ recalc_buffer = __vmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO);
+ if (!recalc_buffer) {
+ oom:
+ recalc_sectors >>= 1;
+ if (recalc_sectors >= 1U << ic->sb->log2_sectors_per_block)
+ goto retry;
+ DMCRIT("out of memory for recalculate buffer - recalculation disabled");
+ goto free_ret;
+ }
+ recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
+ if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size)
+ recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size;
+ recalc_tags = kvmalloc(recalc_tags_size, GFP_NOIO);
+ if (!recalc_tags) {
+ vfree(recalc_buffer);
+ goto oom;
+ }
DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector));
}
get_area_and_offset(ic, range.logical_sector, &area, &offset);
- range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector);
+ range.n_sectors = min((sector_t)recalc_sectors, ic->provided_data_sectors - range.logical_sector);
if (!ic->meta_dev)
range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned int)offset);
io_req.bi_opf = REQ_OP_READ;
io_req.mem.type = DM_IO_VMA;
- io_req.mem.ptr.addr = ic->recalc_buffer;
+ io_req.mem.ptr.addr = recalc_buffer;
io_req.notify.fn = NULL;
io_req.client = ic->io;
io_loc.bdev = ic->dev->bdev;
goto err;
}
- t = ic->recalc_tags;
+ t = recalc_tags;
for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
- integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
+ integrity_sector_checksum(ic, logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t);
t += ic->tag_size;
}
metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
- r = dm_integrity_rw_tag(ic, ic->recalc_tags, &metadata_block, &metadata_offset, t - ic->recalc_tags, TAG_WRITE);
+ r = dm_integrity_rw_tag(ic, recalc_tags, &metadata_block, &metadata_offset, t - recalc_tags, TAG_WRITE);
if (unlikely(r)) {
dm_integrity_io_error(ic, "writing tags", r);
goto err;
err:
remove_range(ic, &range);
- return;
+ goto free_ret;
unlock_ret:
spin_unlock_irq(&ic->endio_wait.lock);
recalc_write_super(ic);
+
+ free_ret:
+ vfree(recalc_buffer);
+ kvfree(recalc_tags);
}
static void bitmap_block_work(struct work_struct *w)
}
/*
- * If this workqueue were percpu, it would cause bio reordering
+ * If this workqueue weren't ordered, it would cause bio reordering
* and reduced performance.
*/
- ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ ic->wait_wq = alloc_ordered_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM);
if (!ic->wait_wq) {
ti->error = "Cannot allocate workqueue";
r = -ENOMEM;
}
if (ic->internal_hash) {
- size_t recalc_tags_size;
-
ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1);
if (!ic->recalc_wq) {
ti->error = "Cannot allocate workqueue";
goto bad;
}
INIT_WORK(&ic->recalc_work, integrity_recalc);
- ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT);
- if (!ic->recalc_buffer) {
- ti->error = "Cannot allocate buffer for recalculating";
- r = -ENOMEM;
- goto bad;
- }
- recalc_tags_size = (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size;
- if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size)
- recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size;
- ic->recalc_tags = kvmalloc(recalc_tags_size, GFP_KERNEL);
- if (!ic->recalc_tags) {
- ti->error = "Cannot allocate tags for recalculating";
- r = -ENOMEM;
- goto bad;
- }
} else {
if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
ti->error = "Recalculate can only be specified with internal_hash";
destroy_workqueue(ic->writer_wq);
if (ic->recalc_wq)
destroy_workqueue(ic->recalc_wq);
- vfree(ic->recalc_buffer);
- kvfree(ic->recalc_tags);
kvfree(ic->bbs);
if (ic->bufio)
dm_bufio_client_destroy(ic->bufio);
static int check_name(const char *name)
{
if (strchr(name, '/')) {
- DMERR("invalid device name");
+ DMERR("device name cannot contain '/'");
+ return -EINVAL;
+ }
+
+ if (strcmp(name, DM_CONTROL_NODE) == 0 ||
+ strcmp(name, ".") == 0 ||
+ strcmp(name, "..") == 0) {
+ DMERR("device name cannot be \"%s\", \".\", or \"..\"", DM_CONTROL_NODE);
return -EINVAL;
}
/* Do we need to load a new map ? */
if (new_map) {
sector_t old_size, new_size;
- int srcu_idx;
/* Suspend if it isn't already suspended */
- old_map = dm_get_live_table(md, &srcu_idx);
- if ((param->flags & DM_SKIP_LOCKFS_FLAG) || !old_map)
+ if (param->flags & DM_SKIP_LOCKFS_FLAG)
suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
- dm_put_live_table(md, srcu_idx);
if (param->flags & DM_NOFLUSH_FLAG)
suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
if (!dm_suspended_md(md))
return mode;
}
- static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
+ static int next_target(struct dm_target_spec *last, uint32_t next, const char *end,
struct dm_target_spec **spec, char **target_params)
{
- *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
- *target_params = (char *) (*spec + 1);
+ static_assert(__alignof__(struct dm_target_spec) <= 8,
+ "struct dm_target_spec must not require more than 8-byte alignment");
+
+ /*
+ * Number of bytes remaining, starting with last. This is always
+ * sizeof(struct dm_target_spec) or more, as otherwise *last was
+ * out of bounds already.
+ */
+ size_t remaining = end - (char *)last;
+
+ /*
+ * There must be room for both the next target spec and the
+ * NUL-terminator of the target itself.
+ */
+ if (remaining - sizeof(struct dm_target_spec) <= next) {
+ DMERR("Target spec extends beyond end of parameters");
+ return -EINVAL;
+ }
- if (*spec < (last + 1))
+ if (next % __alignof__(struct dm_target_spec)) {
+ DMERR("Next dm_target_spec (offset %u) is not %zu-byte aligned",
+ next, __alignof__(struct dm_target_spec));
return -EINVAL;
+ }
+
+ *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
+ *target_params = (char *) (*spec + 1);
- return invalid_str(*target_params, end);
+ return 0;
}
static int populate_table(struct dm_table *table,
unsigned int i = 0;
struct dm_target_spec *spec = (struct dm_target_spec *) param;
uint32_t next = param->data_start;
- void *end = (void *) param + param_size;
+ const char *const end = (const char *) param + param_size;
char *target_params;
+ size_t min_size = sizeof(struct dm_ioctl);
if (!param->target_count) {
DMERR("%s: no targets specified", __func__);
}
for (i = 0; i < param->target_count; i++) {
+ const char *nul_terminator;
+
+ if (next < min_size) {
+ DMERR("%s: next target spec (offset %u) overlaps %s",
+ __func__, next, i ? "previous target" : "'struct dm_ioctl'");
+ return -EINVAL;
+ }
r = next_target(spec, next, end, &spec, &target_params);
if (r) {
return r;
}
+ nul_terminator = memchr(target_params, 0, (size_t)(end - target_params));
+ if (nul_terminator == NULL) {
+ DMERR("%s: target parameters not NUL-terminated", __func__);
+ return -EINVAL;
+ }
+
+ /* Add 1 for NUL terminator */
+ min_size = (size_t)(nul_terminator - (const char *)spec) + 1;
+
r = dm_table_add_target(table, spec->target_type,
(sector_t) spec->sector_start,
(sector_t) spec->length,
* As well as checking the version compatibility this always
* copies the kernel interface version out.
*/
- static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
+ static int check_version(unsigned int cmd, struct dm_ioctl __user *user,
+ struct dm_ioctl *kernel_params)
{
- uint32_t version[3];
int r = 0;
- if (copy_from_user(version, user->version, sizeof(version)))
+ /* Make certain version is first member of dm_ioctl struct */
+ BUILD_BUG_ON(offsetof(struct dm_ioctl, version) != 0);
+
+ if (copy_from_user(kernel_params->version, user->version, sizeof(kernel_params->version)))
return -EFAULT;
- if ((version[0] != DM_VERSION_MAJOR) ||
- (version[1] > DM_VERSION_MINOR)) {
+ if ((kernel_params->version[0] != DM_VERSION_MAJOR) ||
+ (kernel_params->version[1] > DM_VERSION_MINOR)) {
DMERR("ioctl interface mismatch: kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
DM_VERSION_MAJOR, DM_VERSION_MINOR,
DM_VERSION_PATCHLEVEL,
- version[0], version[1], version[2], cmd);
+ kernel_params->version[0],
+ kernel_params->version[1],
+ kernel_params->version[2],
+ cmd);
r = -EINVAL;
}
/*
* Fill in the kernel version.
*/
- version[0] = DM_VERSION_MAJOR;
- version[1] = DM_VERSION_MINOR;
- version[2] = DM_VERSION_PATCHLEVEL;
- if (copy_to_user(user->version, version, sizeof(version)))
+ kernel_params->version[0] = DM_VERSION_MAJOR;
+ kernel_params->version[1] = DM_VERSION_MINOR;
+ kernel_params->version[2] = DM_VERSION_PATCHLEVEL;
+ if (copy_to_user(user->version, kernel_params->version, sizeof(kernel_params->version)))
return -EFAULT;
return r;
struct dm_ioctl *dmi;
int secure_data;
const size_t minimum_data_size = offsetof(struct dm_ioctl, data);
- unsigned int noio_flag;
- if (copy_from_user(param_kernel, user, minimum_data_size))
+ /* check_version() already copied version from userspace, avoid TOCTOU */
+ if (copy_from_user((char *)param_kernel + sizeof(param_kernel->version),
+ (char __user *)user + sizeof(param_kernel->version),
+ minimum_data_size - sizeof(param_kernel->version)))
return -EFAULT;
if (param_kernel->data_size < minimum_data_size) {
* Use kmalloc() rather than vmalloc() when we can.
*/
dmi = NULL;
- noio_flag = memalloc_noio_save();
- dmi = kvmalloc(param_kernel->data_size, GFP_KERNEL | __GFP_HIGH);
- memalloc_noio_restore(noio_flag);
+ dmi = kvmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH);
if (!dmi) {
if (secure_data && clear_user(user, param_kernel->data_size))
* Check the interface version passed in. This also
* writes out the kernel's interface version.
*/
- r = check_version(cmd, user);
+ r = check_version(cmd, user, ¶m_kernel);
if (r)
return r;
r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
&pmd->tm, &pmd->metadata_sm);
if (r < 0) {
+ pmd->tm = NULL;
+ pmd->metadata_sm = NULL;
DMERR("tm_create_with_sm failed");
return r;
}
if (IS_ERR(pmd->data_sm)) {
DMERR("sm_disk_create failed");
r = PTR_ERR(pmd->data_sm);
+ pmd->data_sm = NULL;
goto bad_cleanup_tm;
}
bad_cleanup_nb_tm:
dm_tm_destroy(pmd->nb_tm);
+ pmd->nb_tm = NULL;
bad_cleanup_data_sm:
dm_sm_destroy(pmd->data_sm);
+ pmd->data_sm = NULL;
bad_cleanup_tm:
dm_tm_destroy(pmd->tm);
+ pmd->tm = NULL;
dm_sm_destroy(pmd->metadata_sm);
+ pmd->metadata_sm = NULL;
return r;
}
sizeof(disk_super->metadata_space_map_root),
&pmd->tm, &pmd->metadata_sm);
if (r < 0) {
+ pmd->tm = NULL;
+ pmd->metadata_sm = NULL;
DMERR("tm_open_with_sm failed");
goto bad_unlock_sblock;
}
if (IS_ERR(pmd->data_sm)) {
DMERR("sm_disk_open failed");
r = PTR_ERR(pmd->data_sm);
+ pmd->data_sm = NULL;
goto bad_cleanup_tm;
}
bad_cleanup_data_sm:
dm_sm_destroy(pmd->data_sm);
+ pmd->data_sm = NULL;
bad_cleanup_tm:
dm_tm_destroy(pmd->tm);
+ pmd->tm = NULL;
dm_sm_destroy(pmd->metadata_sm);
+ pmd->metadata_sm = NULL;
bad_unlock_sblock:
dm_bm_unlock(sblock);
bool destroy_bm)
{
dm_sm_destroy(pmd->data_sm);
+ pmd->data_sm = NULL;
dm_sm_destroy(pmd->metadata_sm);
+ pmd->metadata_sm = NULL;
dm_tm_destroy(pmd->nb_tm);
+ pmd->nb_tm = NULL;
dm_tm_destroy(pmd->tm);
+ pmd->tm = NULL;
if (destroy_bm)
dm_block_manager_destroy(pmd->bm);
}
__func__, r);
}
pmd_write_unlock(pmd);
- if (!pmd->fail_io)
- __destroy_persistent_data_objects(pmd, true);
+ __destroy_persistent_data_objects(pmd, true);
kfree(pmd);
return 0;
int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
{
- int r;
+ int r = -EINVAL;
uint32_t ref_count;
down_read(&pmd->root_lock);
- r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
- if (!r)
- *result = (ref_count > 1);
+ if (!pmd->fail_io) {
+ r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
+ if (!r)
+ *result = (ref_count > 1);
+ }
up_read(&pmd->root_lock);
return r;
int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
- int r = 0;
+ int r = -EINVAL;
pmd_write_lock(pmd);
- r = dm_sm_inc_blocks(pmd->data_sm, b, e);
+ if (!pmd->fail_io)
+ r = dm_sm_inc_blocks(pmd->data_sm, b, e);
pmd_write_unlock(pmd);
return r;
int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
- int r = 0;
+ int r = -EINVAL;
pmd_write_lock(pmd);
- r = dm_sm_dec_blocks(pmd->data_sm, b, e);
+ if (!pmd->fail_io)
+ r = dm_sm_dec_blocks(pmd->data_sm, b, e);
pmd_write_unlock(pmd);
return r;
int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
{
int r = -EINVAL;
- struct dm_block_manager *old_bm = NULL, *new_bm = NULL;
/* fail_io is double-checked with pmd->root_lock held below */
if (unlikely(pmd->fail_io))
return r;
- /*
- * Replacement block manager (new_bm) is created and old_bm destroyed outside of
- * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
- * shrinker associated with the block manager's bufio client vs pmd root_lock).
- * - must take shrinker_rwsem without holding pmd->root_lock
- */
- new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
- THIN_MAX_CONCURRENT_LOCKS);
-
pmd_write_lock(pmd);
if (pmd->fail_io) {
pmd_write_unlock(pmd);
- goto out;
+ return r;
}
-
__set_abort_with_changes_flags(pmd);
+
+ /* destroy data_sm/metadata_sm/nb_tm/tm */
__destroy_persistent_data_objects(pmd, false);
- old_bm = pmd->bm;
- if (IS_ERR(new_bm)) {
- DMERR("could not create block manager during abort");
- pmd->bm = NULL;
- r = PTR_ERR(new_bm);
- goto out_unlock;
- }
- pmd->bm = new_bm;
+ /* reset bm */
+ dm_block_manager_reset(pmd->bm);
+
+ /* rebuild data_sm/metadata_sm/nb_tm/tm */
r = __open_or_format_metadata(pmd, false);
- if (r) {
- pmd->bm = NULL;
- goto out_unlock;
- }
- new_bm = NULL;
- out_unlock:
if (r)
pmd->fail_io = true;
pmd_write_unlock(pmd);
- dm_block_manager_destroy(old_bm);
- out:
- if (new_bm && !IS_ERR(new_bm))
- dm_block_manager_destroy(new_bm);
-
return r;
}
sector_t s = block_to_sectors(tc->pool, data_b);
sector_t len = block_to_sectors(tc->pool, data_e - data_b);
- return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOWAIT,
- &op->bio);
+ return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOIO, &op->bio);
}
static void end_discard(struct discard_op *op, int r)
/*----------------------------------------------------------------*/
- static bool passdown_enabled(struct pool_c *pt)
- {
- return pt->adjusted_pf.discard_passdown;
- }
-
static void set_discard_callbacks(struct pool *pool)
{
struct pool_c *pt = pool->ti->private;
- if (passdown_enabled(pt)) {
+ if (pt->adjusted_pf.discard_passdown) {
pool->process_discard_cell = process_discard_cell_passdown;
pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
* If discard_passdown was enabled verify that the data device
* supports discards. Disable discard_passdown if not.
*/
- static void disable_passdown_if_not_supported(struct pool_c *pt)
+ static void disable_discard_passdown_if_not_supported(struct pool_c *pt)
{
struct pool *pool = pt->pool;
struct block_device *data_bdev = pt->data_dev->bdev;
static int pool_map(struct dm_target *ti, struct bio *bio)
{
- int r;
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
*/
spin_lock_irq(&pool->lock);
bio_set_dev(bio, pt->data_dev->bdev);
- r = DM_MAPIO_REMAPPED;
spin_unlock_irq(&pool->lock);
- return r;
+ return DM_MAPIO_REMAPPED;
}
static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
* They get transferred to the live pool in bind_control_target()
* called from pool_preresume().
*/
- if (!pt->adjusted_pf.discard_enabled) {
+
+ if (pt->adjusted_pf.discard_enabled) {
+ disable_discard_passdown_if_not_supported(pt);
+ if (!pt->adjusted_pf.discard_passdown)
+ limits->max_discard_sectors = 0;
+ /*
+ * The pool uses the same discard limits as the underlying data
+ * device. DM core has already set this up.
+ */
+ } else {
/*
* Must explicitly disallow stacking discard limits otherwise the
* block layer will stack them if pool's data device has support.
*/
limits->discard_granularity = 0;
- return;
}
-
- disable_passdown_if_not_supported(pt);
-
- /*
- * The pool uses the same discard limits as the underlying data
- * device. DM core has already set this up.
- */
}
static struct target_type pool_target = {
struct thin_c *tc = ti->private;
struct pool *pool = tc->pool;
- if (!pool->pf.discard_enabled)
- return;
-
- limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
- limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
+ if (pool->pf.discard_enabled) {
+ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+ limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
+ }
}
static struct target_type thin_target = {
if (r)
return r;
- deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
+ deferred_remove_workqueue = alloc_ordered_workqueue("kdmremove", 0);
if (!deferred_remove_workqueue) {
r = -ENOMEM;
goto out_uevent_exit;
}
EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
- static bool bio_is_flush_with_data(struct bio *bio)
+ static inline bool bio_is_flush_with_data(struct bio *bio)
{
return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
}
- static void dm_io_acct(struct dm_io *io, bool end)
+ static inline unsigned int dm_io_sectors(struct dm_io *io, struct bio *bio)
{
- struct dm_stats_aux *stats_aux = &io->stats_aux;
- unsigned long start_time = io->start_time;
- struct mapped_device *md = io->md;
- struct bio *bio = io->orig_bio;
- unsigned int sectors;
-
/*
* If REQ_PREFLUSH set, don't account payload, it will be
* submitted (and accounted) after this flush completes.
*/
if (bio_is_flush_with_data(bio))
- sectors = 0;
- else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT))))
- sectors = bio_sectors(bio);
- else
- sectors = io->sectors;
+ return 0;
+ if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT)))
+ return io->sectors;
+ return bio_sectors(bio);
+ }
- if (!end)
- bdev_start_io_acct(bio->bi_bdev, bio_op(bio), start_time);
- else
- bdev_end_io_acct(bio->bi_bdev, bio_op(bio), sectors,
- start_time);
+ static void dm_io_acct(struct dm_io *io, bool end)
+ {
+ struct bio *bio = io->orig_bio;
+
+ if (dm_io_flagged(io, DM_IO_BLK_STAT)) {
+ if (!end)
+ bdev_start_io_acct(bio->bi_bdev, bio_op(bio),
+ io->start_time);
+ else
+ bdev_end_io_acct(bio->bi_bdev, bio_op(bio),
+ dm_io_sectors(io, bio),
+ io->start_time);
+ }
if (static_branch_unlikely(&stats_enabled) &&
- unlikely(dm_stats_used(&md->stats))) {
+ unlikely(dm_stats_used(&io->md->stats))) {
sector_t sector;
- if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT)))
- sector = bio->bi_iter.bi_sector;
- else
+ if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT)))
sector = bio_end_sector(bio) - io->sector_offset;
+ else
+ sector = bio->bi_iter.bi_sector;
- dm_stats_account_io(&md->stats, bio_data_dir(bio),
- sector, sectors,
- end, start_time, stats_aux);
+ dm_stats_account_io(&io->md->stats, bio_data_dir(bio),
+ sector, dm_io_sectors(io, bio),
+ end, io->start_time, &io->stats_aux);
}
}
spin_lock_init(&io->lock);
io->start_time = jiffies;
io->flags = 0;
+ if (blk_queue_io_stat(md->queue))
+ dm_io_set_flag(io, DM_IO_BLK_STAT);
- if (static_branch_unlikely(&stats_enabled))
+ if (static_branch_unlikely(&stats_enabled) &&
+ unlikely(dm_stats_used(&md->stats)))
dm_stats_record_start(&md->stats, &io->stats_aux);
return io;
}
static sector_t __max_io_len(struct dm_target *ti, sector_t sector,
- unsigned int max_granularity)
+ unsigned int max_granularity,
+ unsigned int max_sectors)
{
sector_t target_offset = dm_target_offset(ti, sector);
sector_t len = max_io_len_target_boundary(ti, target_offset);
if (!max_granularity)
return len;
return min_t(sector_t, len,
- min(queue_max_sectors(ti->table->md->queue),
+ min(max_sectors ? : queue_max_sectors(ti->table->md->queue),
blk_chunk_sectors_left(target_offset, max_granularity)));
}
static inline sector_t max_io_len(struct dm_target *ti, sector_t sector)
{
- return __max_io_len(ti, sector, ti->max_io_len);
+ return __max_io_len(ti, sector, ti->max_io_len, 0);
}
int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
unsigned int num_bios,
- unsigned int max_granularity)
+ unsigned int max_granularity,
+ unsigned int max_sectors)
{
unsigned int len, bios;
len = min_t(sector_t, ci->sector_count,
- __max_io_len(ti, ci->sector, max_granularity));
+ __max_io_len(ti, ci->sector, max_granularity, max_sectors));
atomic_add(num_bios, &ci->io->io_count);
bios = __send_duplicate_bios(ci, ti, num_bios, &len);
{
unsigned int num_bios = 0;
unsigned int max_granularity = 0;
+ unsigned int max_sectors = 0;
struct queue_limits *limits = dm_get_queue_limits(ti->table->md);
switch (bio_op(ci->bio)) {
case REQ_OP_DISCARD:
num_bios = ti->num_discard_bios;
+ max_sectors = limits->max_discard_sectors;
if (ti->max_discard_granularity)
- max_granularity = limits->max_discard_sectors;
+ max_granularity = max_sectors;
break;
case REQ_OP_SECURE_ERASE:
num_bios = ti->num_secure_erase_bios;
+ max_sectors = limits->max_secure_erase_sectors;
if (ti->max_secure_erase_granularity)
- max_granularity = limits->max_secure_erase_sectors;
+ max_granularity = max_sectors;
break;
case REQ_OP_WRITE_ZEROES:
num_bios = ti->num_write_zeroes_bios;
+ max_sectors = limits->max_write_zeroes_sectors;
if (ti->max_write_zeroes_granularity)
- max_granularity = limits->max_write_zeroes_sectors;
+ max_granularity = max_sectors;
break;
default:
break;
if (unlikely(!num_bios))
return BLK_STS_NOTSUPP;
- __send_changing_extent_only(ci, ti, num_bios, max_granularity);
+ __send_changing_extent_only(ci, ti, num_bios,
+ max_granularity, max_sectors);
return BLK_STS_OK;
}
break;
case DM_TYPE_BIO_BASED:
case DM_TYPE_DAX_BIO_BASED:
+ blk_queue_flag_set(QUEUE_FLAG_IO_STAT, md->queue);
break;
case DM_TYPE_NONE:
WARN_ON_ONCE(true);
}
map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+ if (!map) {
+ /* avoid deadlock with fs/namespace.c:do_mount() */
+ suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
+ }
r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
if (r)
bool fail_early;
int ret;
enum pr_type type;
+ struct pr_keys *read_keys;
+ struct pr_held_reservation *rsv;
};
static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
return r;
}
+static int __dm_pr_read_keys(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct dm_pr *pr = data;
+ const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
+
+ if (!ops || !ops->pr_read_keys) {
+ pr->ret = -EOPNOTSUPP;
+ return -1;
+ }
+
+ pr->ret = ops->pr_read_keys(dev->bdev, pr->read_keys);
+ if (!pr->ret)
+ return -1;
+
+ return 0;
+}
+
+static int dm_pr_read_keys(struct block_device *bdev, struct pr_keys *keys)
+{
+ struct dm_pr pr = {
+ .read_keys = keys,
+ };
+ int ret;
+
+ ret = dm_call_pr(bdev, __dm_pr_read_keys, &pr);
+ if (ret)
+ return ret;
+
+ return pr.ret;
+}
+
+static int __dm_pr_read_reservation(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct dm_pr *pr = data;
+ const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
+
+ if (!ops || !ops->pr_read_reservation) {
+ pr->ret = -EOPNOTSUPP;
+ return -1;
+ }
+
+ pr->ret = ops->pr_read_reservation(dev->bdev, pr->rsv);
+ if (!pr->ret)
+ return -1;
+
+ return 0;
+}
+
+static int dm_pr_read_reservation(struct block_device *bdev,
+ struct pr_held_reservation *rsv)
+{
+ struct dm_pr pr = {
+ .rsv = rsv,
+ };
+ int ret;
+
+ ret = dm_call_pr(bdev, __dm_pr_read_reservation, &pr);
+ if (ret)
+ return ret;
+
+ return pr.ret;
+}
+
static const struct pr_ops dm_pr_ops = {
.pr_register = dm_pr_register,
.pr_reserve = dm_pr_reserve,
.pr_release = dm_pr_release,
.pr_preempt = dm_pr_preempt,
.pr_clear = dm_pr_clear,
+ .pr_read_keys = dm_pr_read_keys,
+ .pr_read_reservation = dm_pr_read_reservation,
};
static const struct block_device_operations dm_blk_dops = {