Merge branch 'for-linus-3.6-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
[platform/adaptation/renesas_rcar/renesas_kernel.git] / drivers / md / dm-thin.c
index 68694da..af1fc3b 100644 (file)
@@ -1,10 +1,11 @@
 /*
- * Copyright (C) 2011 Red Hat UK.
+ * Copyright (C) 2011-2012 Red Hat UK.
  *
  * This file is released under the GPL.
  */
 
 #include "dm-thin-metadata.h"
+#include "dm.h"
 
 #include <linux/device-mapper.h>
 #include <linux/dm-io.h>
@@ -19,7 +20,7 @@
 /*
  * Tunable constants
  */
-#define ENDIO_HOOK_POOL_SIZE 10240
+#define ENDIO_HOOK_POOL_SIZE 1024
 #define DEFERRED_SET_SIZE 64
 #define MAPPING_POOL_SIZE 1024
 #define PRISON_CELLS 1024
@@ -496,12 +497,27 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
  */
 struct dm_thin_new_mapping;
 
+/*
+ * The pool runs in 3 modes, listed from least to most degraded for comparison.
+ */
+enum pool_mode {
+       PM_WRITE,               /* metadata may be changed */
+       PM_READ_ONLY,           /* metadata may not be changed */
+       PM_FAIL,                /* all I/O fails */
+};
+
 struct pool_features {
+       enum pool_mode mode;
+
        unsigned zero_new_blocks:1;
        unsigned discard_enabled:1;
        unsigned discard_passdown:1;
 };
 
+struct thin_c;
+typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
+typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
+
 struct pool {
        struct list_head list;
        struct dm_target *ti;   /* Only set if a pool target is bound */
@@ -510,10 +526,9 @@ struct pool {
        struct block_device *md_dev;
        struct dm_pool_metadata *pmd;
 
-       uint32_t sectors_per_block;
-       unsigned block_shift;
-       dm_block_t offset_mask;
        dm_block_t low_water_blocks;
+       uint32_t sectors_per_block;
+       int sectors_per_block_shift;
 
        struct pool_features pf;
        unsigned low_water_triggered:1; /* A dm event has been sent */
@@ -526,8 +541,8 @@ struct pool {
        struct work_struct worker;
        struct delayed_work waker;
 
-       unsigned ref_count;
        unsigned long last_commit_jiffies;
+       unsigned ref_count;
 
        spinlock_t lock;
        struct bio_list deferred_bios;
@@ -543,8 +558,17 @@ struct pool {
        struct dm_thin_new_mapping *next_mapping;
        mempool_t *mapping_pool;
        mempool_t *endio_hook_pool;
+
+       process_bio_fn process_bio;
+       process_bio_fn process_discard;
+
+       process_mapping_fn process_prepared_mapping;
+       process_mapping_fn process_prepared_discard;
 };
 
+static enum pool_mode get_pool_mode(struct pool *pool);
+static void set_pool_mode(struct pool *pool, enum pool_mode mode);
+
 /*
  * Target context for a pool.
  */
@@ -679,16 +703,28 @@ static void requeue_io(struct thin_c *tc)
 
 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 {
-       return bio->bi_sector >> tc->pool->block_shift;
+       sector_t block_nr = bio->bi_sector;
+
+       if (tc->pool->sectors_per_block_shift < 0)
+               (void) sector_div(block_nr, tc->pool->sectors_per_block);
+       else
+               block_nr >>= tc->pool->sectors_per_block_shift;
+
+       return block_nr;
 }
 
 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 {
        struct pool *pool = tc->pool;
+       sector_t bi_sector = bio->bi_sector;
 
        bio->bi_bdev = tc->pool_dev->bdev;
-       bio->bi_sector = (block << pool->block_shift) +
-               (bio->bi_sector & pool->offset_mask);
+       if (tc->pool->sectors_per_block_shift < 0)
+               bio->bi_sector = (block * pool->sectors_per_block) +
+                                sector_div(bi_sector, pool->sectors_per_block);
+       else
+               bio->bi_sector = (block << pool->sectors_per_block_shift) |
+                               (bi_sector & (pool->sectors_per_block - 1));
 }
 
 static void remap_to_origin(struct thin_c *tc, struct bio *bio)
@@ -696,21 +732,39 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio)
        bio->bi_bdev = tc->origin_dev->bdev;
 }
 
+static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
+{
+       return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
+               dm_thin_changed_this_transaction(tc->td);
+}
+
 static void issue(struct thin_c *tc, struct bio *bio)
 {
        struct pool *pool = tc->pool;
        unsigned long flags;
 
+       if (!bio_triggers_commit(tc, bio)) {
+               generic_make_request(bio);
+               return;
+       }
+
        /*
-        * Batch together any FUA/FLUSH bios we find and then issue
-        * a single commit for them in process_deferred_bios().
+        * Complete bio with an error if earlier I/O caused changes to
+        * the metadata that can't be committed, e.g. due to I/O errors
+        * on the metadata device.
         */
-       if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
-               spin_lock_irqsave(&pool->lock, flags);
-               bio_list_add(&pool->deferred_flush_bios, bio);
-               spin_unlock_irqrestore(&pool->lock, flags);
-       } else
-               generic_make_request(bio);
+       if (dm_thin_aborted_changes(tc->td)) {
+               bio_io_error(bio);
+               return;
+       }
+
+       /*
+        * Batch together any bios that trigger commits and then issue a
+        * single commit for them in process_deferred_bios().
+        */
+       spin_lock_irqsave(&pool->lock, flags);
+       bio_list_add(&pool->deferred_flush_bios, bio);
+       spin_unlock_irqrestore(&pool->lock, flags);
 }
 
 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
@@ -847,6 +901,14 @@ static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell
        wake_worker(pool);
 }
 
+static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
+{
+       if (m->bio)
+               m->bio->bi_end_io = m->saved_bi_end_io;
+       cell_error(m->cell);
+       list_del(&m->list);
+       mempool_free(m, m->tc->pool->mapping_pool);
+}
 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 {
        struct thin_c *tc = m->tc;
@@ -859,7 +921,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 
        if (m->err) {
                cell_error(m->cell);
-               return;
+               goto out;
        }
 
        /*
@@ -871,7 +933,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
        if (r) {
                DMERR("dm_thin_insert_block() failed");
                cell_error(m->cell);
-               return;
+               goto out;
        }
 
        /*
@@ -886,22 +948,25 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
        } else
                cell_defer(tc, m->cell, m->data_block);
 
+out:
        list_del(&m->list);
        mempool_free(m, tc->pool->mapping_pool);
 }
 
-static void process_prepared_discard(struct dm_thin_new_mapping *m)
+static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
 {
-       int r;
        struct thin_c *tc = m->tc;
 
-       r = dm_thin_remove_block(tc->td, m->virt_block);
-       if (r)
-               DMERR("dm_thin_remove_block() failed");
+       bio_io_error(m->bio);
+       cell_defer_except(tc, m->cell);
+       cell_defer_except(tc, m->cell2);
+       mempool_free(m, tc->pool->mapping_pool);
+}
+
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+{
+       struct thin_c *tc = m->tc;
 
-       /*
-        * Pass the discard down to the underlying device?
-        */
        if (m->pass_discard)
                remap_and_issue(tc, m->bio, m->data_block);
        else
@@ -912,8 +977,20 @@ static void process_prepared_discard(struct dm_thin_new_mapping *m)
        mempool_free(m, tc->pool->mapping_pool);
 }
 
+static void process_prepared_discard(struct dm_thin_new_mapping *m)
+{
+       int r;
+       struct thin_c *tc = m->tc;
+
+       r = dm_thin_remove_block(tc->td, m->virt_block);
+       if (r)
+               DMERR("dm_thin_remove_block() failed");
+
+       process_prepared_discard_passdown(m);
+}
+
 static void process_prepared(struct pool *pool, struct list_head *head,
-                            void (*fn)(struct dm_thin_new_mapping *))
+                            process_mapping_fn *fn)
 {
        unsigned long flags;
        struct list_head maps;
@@ -925,7 +1002,7 @@ static void process_prepared(struct pool *pool, struct list_head *head,
        spin_unlock_irqrestore(&pool->lock, flags);
 
        list_for_each_entry_safe(m, tmp, &maps, list)
-               fn(m);
+               (*fn)(m);
 }
 
 /*
@@ -933,9 +1010,7 @@ static void process_prepared(struct pool *pool, struct list_head *head,
  */
 static int io_overlaps_block(struct pool *pool, struct bio *bio)
 {
-       return !(bio->bi_sector & pool->offset_mask) &&
-               (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
-
+       return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
 }
 
 static int io_overwrites_block(struct pool *pool, struct bio *bio)
@@ -1093,6 +1168,35 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
        }
 }
 
+static int commit(struct pool *pool)
+{
+       int r;
+
+       r = dm_pool_commit_metadata(pool->pmd);
+       if (r)
+               DMERR("commit failed, error = %d", r);
+
+       return r;
+}
+
+/*
+ * A non-zero return indicates read_only or fail_io mode.
+ * Many callers don't care about the return value.
+ */
+static int commit_or_fallback(struct pool *pool)
+{
+       int r;
+
+       if (get_pool_mode(pool) != PM_WRITE)
+               return -EINVAL;
+
+       r = commit(pool);
+       if (r)
+               set_pool_mode(pool, PM_READ_ONLY);
+
+       return r;
+}
+
 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 {
        int r;
@@ -1121,12 +1225,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
                         * Try to commit to see if that will free up some
                         * more space.
                         */
-                       r = dm_pool_commit_metadata(pool->pmd);
-                       if (r) {
-                               DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-                                     __func__, r);
-                               return r;
-                       }
+                       (void) commit_or_fallback(pool);
 
                        r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
                        if (r)
@@ -1218,7 +1317,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
                         */
                        m = get_next_mapping(pool);
                        m->tc = tc;
-                       m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown;
+                       m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
                        m->virt_block = block;
                        m->data_block = lookup_result.block;
                        m->cell = cell;
@@ -1234,15 +1333,10 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
                        }
                } else {
                        /*
-                        * This path is hit if people are ignoring
-                        * limits->discard_granularity.  It ignores any
-                        * part of the discard that is in a subsequent
-                        * block.
+                        * The DM core makes sure that the discard doesn't span
+                        * a block boundary.  So we submit the discard of a
+                        * partial block appropriately.
                         */
-                       sector_t offset = bio->bi_sector - (block << pool->block_shift);
-                       unsigned remaining = (pool->sectors_per_block - offset) << 9;
-                       bio->bi_size = min(bio->bi_size, remaining);
-
                        cell_release_singleton(cell, bio);
                        cell_release_singleton(cell2, bio);
                        if ((!lookup_result.shared) && pool->pf.discard_passdown)
@@ -1310,7 +1404,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
        if (bio_detain(pool->prison, &key, bio, &cell))
                return;
 
-       if (bio_data_dir(bio) == WRITE)
+       if (bio_data_dir(bio) == WRITE && bio->bi_size)
                break_sharing(tc, bio, block, &key, lookup_result, cell);
        else {
                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
@@ -1362,6 +1456,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 
        default:
                DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
+               set_pool_mode(tc->pool, PM_READ_ONLY);
                cell_error(cell);
                break;
        }
@@ -1419,6 +1514,49 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
        }
 }
 
+static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
+{
+       int r;
+       int rw = bio_data_dir(bio);
+       dm_block_t block = get_bio_block(tc, bio);
+       struct dm_thin_lookup_result lookup_result;
+
+       r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+       switch (r) {
+       case 0:
+               if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
+                       bio_io_error(bio);
+               else
+                       remap_and_issue(tc, bio, lookup_result.block);
+               break;
+
+       case -ENODATA:
+               if (rw != READ) {
+                       bio_io_error(bio);
+                       break;
+               }
+
+               if (tc->origin_dev) {
+                       remap_to_origin_and_issue(tc, bio);
+                       break;
+               }
+
+               zero_fill_bio(bio);
+               bio_endio(bio, 0);
+               break;
+
+       default:
+               DMERR("dm_thin_find_block() failed, error = %d", r);
+               bio_io_error(bio);
+               break;
+       }
+}
+
+static void process_bio_fail(struct thin_c *tc, struct bio *bio)
+{
+       bio_io_error(bio);
+}
+
 static int need_commit_due_to_time(struct pool *pool)
 {
        return jiffies < pool->last_commit_jiffies ||
@@ -1430,7 +1568,6 @@ static void process_deferred_bios(struct pool *pool)
        unsigned long flags;
        struct bio *bio;
        struct bio_list bios;
-       int r;
 
        bio_list_init(&bios);
 
@@ -1457,9 +1594,9 @@ static void process_deferred_bios(struct pool *pool)
                }
 
                if (bio->bi_rw & REQ_DISCARD)
-                       process_discard(tc, bio);
+                       pool->process_discard(tc, bio);
                else
-                       process_bio(tc, bio);
+                       pool->process_bio(tc, bio);
        }
 
        /*
@@ -1475,10 +1612,7 @@ static void process_deferred_bios(struct pool *pool)
        if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
                return;
 
-       r = dm_pool_commit_metadata(pool->pmd);
-       if (r) {
-               DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-                     __func__, r);
+       if (commit_or_fallback(pool)) {
                while ((bio = bio_list_pop(&bios)))
                        bio_io_error(bio);
                return;
@@ -1493,8 +1627,8 @@ static void do_worker(struct work_struct *ws)
 {
        struct pool *pool = container_of(ws, struct pool, worker);
 
-       process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
-       process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
+       process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
+       process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
        process_deferred_bios(pool);
 }
 
@@ -1511,6 +1645,52 @@ static void do_waker(struct work_struct *ws)
 
 /*----------------------------------------------------------------*/
 
+static enum pool_mode get_pool_mode(struct pool *pool)
+{
+       return pool->pf.mode;
+}
+
+static void set_pool_mode(struct pool *pool, enum pool_mode mode)
+{
+       int r;
+
+       pool->pf.mode = mode;
+
+       switch (mode) {
+       case PM_FAIL:
+               DMERR("switching pool to failure mode");
+               pool->process_bio = process_bio_fail;
+               pool->process_discard = process_bio_fail;
+               pool->process_prepared_mapping = process_prepared_mapping_fail;
+               pool->process_prepared_discard = process_prepared_discard_fail;
+               break;
+
+       case PM_READ_ONLY:
+               DMERR("switching pool to read-only mode");
+               r = dm_pool_abort_metadata(pool->pmd);
+               if (r) {
+                       DMERR("aborting transaction failed");
+                       set_pool_mode(pool, PM_FAIL);
+               } else {
+                       dm_pool_metadata_read_only(pool->pmd);
+                       pool->process_bio = process_bio_read_only;
+                       pool->process_discard = process_discard;
+                       pool->process_prepared_mapping = process_prepared_mapping_fail;
+                       pool->process_prepared_discard = process_prepared_discard_passdown;
+               }
+               break;
+
+       case PM_WRITE:
+               pool->process_bio = process_bio;
+               pool->process_discard = process_discard;
+               pool->process_prepared_mapping = process_prepared_mapping;
+               pool->process_prepared_discard = process_prepared_discard;
+               break;
+       }
+}
+
+/*----------------------------------------------------------------*/
+
 /*
  * Mapping functions.
  */
@@ -1556,6 +1736,12 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
        struct dm_thin_lookup_result result;
 
        map_context->ptr = thin_hook_bio(tc, bio);
+
+       if (get_pool_mode(tc->pool) == PM_FAIL) {
+               bio_io_error(bio);
+               return DM_MAPIO_SUBMITTED;
+       }
+
        if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
                thin_defer_bio(tc, bio);
                return DM_MAPIO_SUBMITTED;
@@ -1592,14 +1778,35 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
                break;
 
        case -ENODATA:
+               if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
+                       /*
+                        * This block isn't provisioned, and we have no way
+                        * of doing so.  Just error it.
+                        */
+                       bio_io_error(bio);
+                       r = DM_MAPIO_SUBMITTED;
+                       break;
+               }
+               /* fall through */
+
+       case -EWOULDBLOCK:
                /*
                 * In future, the failed dm_thin_find_block above could
                 * provide the hint to load the metadata into cache.
                 */
-       case -EWOULDBLOCK:
                thin_defer_bio(tc, bio);
                r = DM_MAPIO_SUBMITTED;
                break;
+
+       default:
+               /*
+                * Must always call bio_io_error on failure.
+                * dm_thin_find_block can fail with -EINVAL if the
+                * pool is switched to fail-io mode.
+                */
+               bio_io_error(bio);
+               r = DM_MAPIO_SUBMITTED;
+               break;
        }
 
        return r;
@@ -1636,15 +1843,26 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 {
        struct pool_c *pt = ti->private;
 
+       /*
+        * We want to make sure that degraded pools are never upgraded.
+        */
+       enum pool_mode old_mode = pool->pf.mode;
+       enum pool_mode new_mode = pt->pf.mode;
+
+       if (old_mode > new_mode)
+               new_mode = old_mode;
+
        pool->ti = ti;
        pool->low_water_blocks = pt->low_water_blocks;
        pool->pf = pt->pf;
+       set_pool_mode(pool, new_mode);
 
        /*
         * If discard_passdown was enabled verify that the data device
         * supports discards.  Disable discard_passdown if not; otherwise
         * -EOPNOTSUPP will be returned.
         */
+       /* FIXME: pull this out into a separate function. */
        if (pt->pf.discard_passdown) {
                struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
                if (!q || !blk_queue_discard(q)) {
@@ -1670,6 +1888,7 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
 /* Initialize pool features. */
 static void pool_features_init(struct pool_features *pf)
 {
+       pf->mode = PM_WRITE;
        pf->zero_new_blocks = 1;
        pf->discard_enabled = 1;
        pf->discard_passdown = 1;
@@ -1700,14 +1919,16 @@ static struct kmem_cache *_endio_hook_cache;
 
 static struct pool *pool_create(struct mapped_device *pool_md,
                                struct block_device *metadata_dev,
-                               unsigned long block_size, char **error)
+                               unsigned long block_size,
+                               int read_only, char **error)
 {
        int r;
        void *err_p;
        struct pool *pool;
        struct dm_pool_metadata *pmd;
+       bool format_device = read_only ? false : true;
 
-       pmd = dm_pool_metadata_open(metadata_dev, block_size);
+       pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
        if (IS_ERR(pmd)) {
                *error = "Error creating metadata object";
                return (struct pool *)pmd;
@@ -1722,8 +1943,10 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 
        pool->pmd = pmd;
        pool->sectors_per_block = block_size;
-       pool->block_shift = ffs(block_size) - 1;
-       pool->offset_mask = block_size - 1;
+       if (block_size & (block_size - 1))
+               pool->sectors_per_block_shift = -1;
+       else
+               pool->sectors_per_block_shift = __ffs(block_size);
        pool->low_water_blocks = 0;
        pool_features_init(&pool->pf);
        pool->prison = prison_create(PRISON_CELLS);
@@ -1822,25 +2045,29 @@ static void __pool_dec(struct pool *pool)
 
 static struct pool *__pool_find(struct mapped_device *pool_md,
                                struct block_device *metadata_dev,
-                               unsigned long block_size, char **error,
-                               int *created)
+                               unsigned long block_size, int read_only,
+                               char **error, int *created)
 {
        struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
 
        if (pool) {
-               if (pool->pool_md != pool_md)
+               if (pool->pool_md != pool_md) {
+                       *error = "metadata device already in use by a pool";
                        return ERR_PTR(-EBUSY);
+               }
                __pool_inc(pool);
 
        } else {
                pool = __pool_table_lookup(pool_md);
                if (pool) {
-                       if (pool->md_dev != metadata_dev)
+                       if (pool->md_dev != metadata_dev) {
+                               *error = "different pool cannot replace a pool";
                                return ERR_PTR(-EINVAL);
+                       }
                        __pool_inc(pool);
 
                } else {
-                       pool = pool_create(pool_md, metadata_dev, block_size, error);
+                       pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
                        *created = 1;
                }
        }
@@ -1891,19 +2118,23 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
                arg_name = dm_shift_arg(as);
                argc--;
 
-               if (!strcasecmp(arg_name, "skip_block_zeroing")) {
+               if (!strcasecmp(arg_name, "skip_block_zeroing"))
                        pf->zero_new_blocks = 0;
-                       continue;
-               } else if (!strcasecmp(arg_name, "ignore_discard")) {
+
+               else if (!strcasecmp(arg_name, "ignore_discard"))
                        pf->discard_enabled = 0;
-                       continue;
-               } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
+
+               else if (!strcasecmp(arg_name, "no_discard_passdown"))
                        pf->discard_passdown = 0;
-                       continue;
-               }
 
-               ti->error = "Unrecognised pool feature requested";
-               r = -EINVAL;
+               else if (!strcasecmp(arg_name, "read_only"))
+                       pf->mode = PM_READ_ONLY;
+
+               else {
+                       ti->error = "Unrecognised pool feature requested";
+                       r = -EINVAL;
+                       break;
+               }
        }
 
        return r;
@@ -1967,7 +2198,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
        if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
            block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
            block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
-           !is_power_of_2(block_size)) {
+           block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
                ti->error = "Invalid block size";
                r = -EINVAL;
                goto out;
@@ -1996,7 +2227,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
        }
 
        pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
-                          block_size, &ti->error, &pool_created);
+                          block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
        if (IS_ERR(pool)) {
                r = PTR_ERR(pool);
                goto out_free_pt;
@@ -2014,6 +2245,15 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
                goto out_flags_changed;
        }
 
+       /*
+        * The block layer requires discard_granularity to be a power of 2.
+        */
+       if (pf.discard_enabled && !is_power_of_2(block_size)) {
+               ti->error = "Discard support must be disabled when the block size is not a power of 2";
+               r = -EINVAL;
+               goto out_flags_changed;
+       }
+
        pt->pool = pool;
        pt->ti = ti;
        pt->metadata_dev = metadata_dev;
@@ -2033,7 +2273,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
                 * stacking of discard limits (this keeps the pool and
                 * thin devices' discard limits consistent).
                 */
-               ti->discards_supported = 1;
+               ti->discards_supported = true;
        }
        ti->private = pt;
 
@@ -2093,7 +2333,8 @@ static int pool_preresume(struct dm_target *ti)
        int r;
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;
-       dm_block_t data_size, sb_data_size;
+       sector_t data_size = ti->len;
+       dm_block_t sb_data_size;
 
        /*
         * Take control of the pool object.
@@ -2102,7 +2343,8 @@ static int pool_preresume(struct dm_target *ti)
        if (r)
                return r;
 
-       data_size = ti->len >> pool->block_shift;
+       (void) sector_div(data_size, pool->sectors_per_block);
+
        r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
        if (r) {
                DMERR("failed to retrieve data device size");
@@ -2111,22 +2353,19 @@ static int pool_preresume(struct dm_target *ti)
 
        if (data_size < sb_data_size) {
                DMERR("pool target too small, is %llu blocks (expected %llu)",
-                     data_size, sb_data_size);
+                     (unsigned long long)data_size, sb_data_size);
                return -EINVAL;
 
        } else if (data_size > sb_data_size) {
                r = dm_pool_resize_data_dev(pool->pmd, data_size);
                if (r) {
                        DMERR("failed to resize data device");
+                       /* FIXME: stricter than necessary; roll back the transaction here instead. */
+                       set_pool_mode(pool, PM_READ_ONLY);
                        return r;
                }
 
-               r = dm_pool_commit_metadata(pool->pmd);
-               if (r) {
-                       DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-                             __func__, r);
-                       return r;
-               }
+               (void) commit_or_fallback(pool);
        }
 
        return 0;
@@ -2149,19 +2388,12 @@ static void pool_resume(struct dm_target *ti)
 
 static void pool_postsuspend(struct dm_target *ti)
 {
-       int r;
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;
 
        cancel_delayed_work(&pool->waker);
        flush_workqueue(pool->wq);
-
-       r = dm_pool_commit_metadata(pool->pmd);
-       if (r < 0) {
-               DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-                     __func__, r);
-               /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
-       }
+       (void) commit_or_fallback(pool);
 }
 
 static int check_arg_count(unsigned argc, unsigned args_required)
@@ -2295,12 +2527,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct
        if (r)
                return r;
 
-       r = dm_pool_commit_metadata(pool->pmd);
-       if (r) {
-               DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-                     __func__, r);
-               return r;
-       }
+       (void) commit_or_fallback(pool);
 
        r = dm_pool_reserve_metadata_snap(pool->pmd);
        if (r)
@@ -2361,25 +2588,41 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
        else
                DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
 
-       if (!r) {
-               r = dm_pool_commit_metadata(pool->pmd);
-               if (r)
-                       DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
-                             argv[0], r);
-       }
+       if (!r)
+               (void) commit_or_fallback(pool);
 
        return r;
 }
 
+static void emit_flags(struct pool_features *pf, char *result,
+                      unsigned sz, unsigned maxlen)
+{
+       unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
+               !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
+       DMEMIT("%u ", count);
+
+       if (!pf->zero_new_blocks)
+               DMEMIT("skip_block_zeroing ");
+
+       if (!pf->discard_enabled)
+               DMEMIT("ignore_discard ");
+
+       if (!pf->discard_passdown)
+               DMEMIT("no_discard_passdown ");
+
+       if (pf->mode == PM_READ_ONLY)
+               DMEMIT("read_only ");
+}
+
 /*
  * Status line is:
  *    <transaction id> <used metadata sectors>/<total metadata sectors>
  *    <used data sectors>/<total data sectors> <held metadata root>
  */
 static int pool_status(struct dm_target *ti, status_type_t type,
-                      char *result, unsigned maxlen)
+                      unsigned status_flags, char *result, unsigned maxlen)
 {
-       int r, count;
+       int r;
        unsigned sz = 0;
        uint64_t transaction_id;
        dm_block_t nr_free_blocks_data;
@@ -2394,6 +2637,15 @@ static int pool_status(struct dm_target *ti, status_type_t type,
 
        switch (type) {
        case STATUSTYPE_INFO:
+               if (get_pool_mode(pool) == PM_FAIL) {
+                       DMEMIT("Fail");
+                       break;
+               }
+
+               /* Commit to ensure statistics aren't out-of-date */
+               if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
+                       (void) commit_or_fallback(pool);
+
                r = dm_pool_get_metadata_transaction_id(pool->pmd,
                                                        &transaction_id);
                if (r)
@@ -2429,9 +2681,19 @@ static int pool_status(struct dm_target *ti, status_type_t type,
                       (unsigned long long)nr_blocks_data);
 
                if (held_root)
-                       DMEMIT("%llu", held_root);
+                       DMEMIT("%llu ", held_root);
+               else
+                       DMEMIT("- ");
+
+               if (pool->pf.mode == PM_READ_ONLY)
+                       DMEMIT("ro ");
+               else
+                       DMEMIT("rw ");
+
+               if (pool->pf.discard_enabled && pool->pf.discard_passdown)
+                       DMEMIT("discard_passdown");
                else
-                       DMEMIT("-");
+                       DMEMIT("no_discard_passdown");
 
                break;
 
@@ -2441,20 +2703,7 @@ static int pool_status(struct dm_target *ti, status_type_t type,
                       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
                       (unsigned long)pool->sectors_per_block,
                       (unsigned long long)pt->low_water_blocks);
-
-               count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
-                       !pt->pf.discard_passdown;
-               DMEMIT("%u ", count);
-
-               if (!pool->pf.zero_new_blocks)
-                       DMEMIT("skip_block_zeroing ");
-
-               if (!pool->pf.discard_enabled)
-                       DMEMIT("ignore_discard ");
-
-               if (!pt->pf.discard_passdown)
-                       DMEMIT("no_discard_passdown ");
-
+               emit_flags(&pt->pf, result, sz, maxlen);
                break;
        }
 
@@ -2492,7 +2741,8 @@ static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
 
        /*
         * This is just a hint, and not enforced.  We have to cope with
-        * bios that overlap 2 blocks.
+        * bios that cover a block partially.  A discard that spans a block
+        * boundary is not sent to this target.
         */
        limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
        limits->discard_zeroes_data = pool->pf.zero_new_blocks;
@@ -2513,7 +2763,7 @@ static struct target_type pool_target = {
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
-       .version = {1, 2, 0},
+       .version = {1, 3, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
@@ -2618,20 +2868,31 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
        }
        __pool_inc(tc->pool);
 
+       if (get_pool_mode(tc->pool) == PM_FAIL) {
+               ti->error = "Couldn't open thin device, Pool is in fail mode";
+               goto bad_thin_open;
+       }
+
        r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
        if (r) {
                ti->error = "Couldn't open thin internal device";
                goto bad_thin_open;
        }
 
-       ti->split_io = tc->pool->sectors_per_block;
+       r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
+       if (r)
+               goto bad_thin_open;
+
        ti->num_flush_requests = 1;
+       ti->flush_supported = true;
 
        /* In case the pool supports discards, pass them on. */
        if (tc->pool->pf.discard_enabled) {
-               ti->discards_supported = 1;
+               ti->discards_supported = true;
                ti->num_discard_requests = 1;
-               ti->discard_zeroes_data_unsupported = 1;
+               ti->discard_zeroes_data_unsupported = true;
+               /* Discard requests must be split on a block boundary */
+               ti->split_discard_requests = true;
        }
 
        dm_put(pool_md);
@@ -2712,7 +2973,7 @@ static void thin_postsuspend(struct dm_target *ti)
  * <nr mapped sectors> <highest mapped sector>
  */
 static int thin_status(struct dm_target *ti, status_type_t type,
-                      char *result, unsigned maxlen)
+                      unsigned status_flags, char *result, unsigned maxlen)
 {
        int r;
        ssize_t sz = 0;
@@ -2720,6 +2981,11 @@ static int thin_status(struct dm_target *ti, status_type_t type,
        char buf[BDEVNAME_SIZE];
        struct thin_c *tc = ti->private;
 
+       if (get_pool_mode(tc->pool) == PM_FAIL) {
+               DMEMIT("Fail");
+               return 0;
+       }
+
        if (!tc->td)
                DMEMIT("-");
        else {
@@ -2757,19 +3023,21 @@ static int thin_status(struct dm_target *ti, status_type_t type,
 static int thin_iterate_devices(struct dm_target *ti,
                                 iterate_devices_callout_fn fn, void *data)
 {
-       dm_block_t blocks;
+       sector_t blocks;        /* loaded with pool->ti->len, then divided down below */
        struct thin_c *tc = ti->private;
+       struct pool *pool = tc->pool;
 
        /*
         * We can't call dm_pool_get_data_dev_size() since that blocks.  So
         * we follow a more convoluted path through to the pool's target.
         */
-       if (!tc->pool->ti)
+       if (!pool->ti)
                return 0;       /* nothing is bound */
 
-       blocks = tc->pool->ti->len >> tc->pool->block_shift;
+       blocks = pool->ti->len;
+       (void) sector_div(blocks, pool->sectors_per_block); /* NOTE(review): assumed to divide 'blocks' in place; return value ignored - confirm */
        if (blocks)
-               return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);
+               return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
 
        return 0;
 }
@@ -2786,7 +3054,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type thin_target = {
        .name = "thin",
-       .version = {1, 1, 0},
+       .version = {1, 3, 0},
        .module = THIS_MODULE,
        .ctr = thin_ctr,
        .dtr = thin_dtr,