dm thin: ensure user takes action to validate data and metadata consistency

author Mike Snitzer <snitzer@redhat.com>

Fri, 14 Feb 2014 16:58:41 +0000 (11:58 -0500)

committer Mike Snitzer <snitzer@redhat.com>

Wed, 5 Mar 2014 20:25:35 +0000 (15:25 -0500)
author Mike Snitzer <snitzer@redhat.com>
Fri, 14 Feb 2014 16:58:41 +0000 (11:58 -0500)
committer Mike Snitzer <snitzer@redhat.com>
Wed, 5 Mar 2014 20:25:35 +0000 (15:25 -0500)
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt

index e6b72d3..68c0f51 100644 (file)
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -124,12 +124,11 @@ the default being 204800 sectors (or 100MB).
  Updating on-disk metadata
  -------------------------
  
-On-disk metadata is committed every time a REQ_SYNC or REQ_FUA bio is
-written.  If no such requests are made then commits will occur every
-second.  This means the cache behaves like a physical disk that has a
-write cache (the same is true of the thin-provisioning target).  If
-power is lost you may lose some recent writes.  The metadata should
-always be consistent in spite of any crash.
+On-disk metadata is committed every time a FLUSH or FUA bio is written.
+If no such requests are made then commits will occur every second.  This
+means the cache behaves like a physical disk that has a volatile write
+cache.  If power is lost you may lose some recent writes.  The metadata
+should always be consistent in spite of any crash.
  
  The 'dirty' state for a cache block changes far too frequently for us
  to keep updating it on the fly.  So we treat it as a hint.  In normal
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt

index 8a7a3d4..3b34b4f 100644 (file)
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -116,6 +116,35 @@ Resuming a device with a new table itself triggers an event so the
  userspace daemon can use this to detect a situation where a new table
  already exceeds the threshold.
  
+A low water mark for the metadata device is maintained in the kernel and
+will trigger a dm event if free space on the metadata device drops below
+it.
+
+Updating on-disk metadata
+-------------------------
+
+On-disk metadata is committed every time a FLUSH or FUA bio is written.
+If no such requests are made then commits will occur every second.  This
+means the thin-provisioning target behaves like a physical disk that has
+a volatile write cache.  If power is lost you may lose some recent
+writes.  The metadata should always be consistent in spite of any crash.
+
+If data space is exhausted the pool will either error or queue IO
+according to the configuration (see: error_if_no_space).  If metadata
+space is exhausted or a metadata operation fails: the pool will error IO
+until the pool is taken offline and repair is performed to 1) fix any
+potential inconsistencies and 2) clear the flag that imposes repair.
+Once the pool's metadata device is repaired it may be resized, which
+will allow the pool to return to normal operation.  Note that if a pool
+is flagged as needing repair, the pool's data and metadata devices
+cannot be resized until repair is performed.  It should also be noted
+that when the pool's metadata space is exhausted the current metadata
+transaction is aborted.  Given that the pool will cache IO whose
+completion may have already been acknowledged to upper IO layers
+(e.g. filesystem) it is strongly suggested that consistency checks
+(e.g. fsck) be performed on those layers when repair of the pool is
+required.
+
  Thin provisioning
  -----------------
  
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c

index baa87ff..fb9efc8 100644 (file)
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -76,7 +76,7 @@
  
  #define THIN_SUPERBLOCK_MAGIC 27022010
  #define THIN_SUPERBLOCK_LOCATION 0
-#define THIN_VERSION 1
+#define THIN_VERSION 2
  #define THIN_METADATA_CACHE_SIZE 64
  #define SECTOR_TO_BLOCK_SHIFT 3
  
@@ -1755,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
  
         return r;
  }
+
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
+{
+       int r;
+       struct dm_block *sblock;
+       struct thin_disk_superblock *disk_super;
+
+       down_write(&pmd->root_lock);
+       pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
+
+       r = superblock_lock(pmd, &sblock);
+       if (r) {
+               DMERR("couldn't read superblock");
+               goto out;
+       }
+
+       disk_super = dm_block_data(sblock);
+       disk_super->flags = cpu_to_le32(pmd->flags);
+
+       dm_bm_unlock(sblock);
+out:
+       up_write(&pmd->root_lock);
+       return r;
+}
+
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
+{
+       bool needs_check;
+
+       down_read(&pmd->root_lock);
+       needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
+       up_read(&pmd->root_lock);
+
+       return needs_check;
+}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h

index 82ea384..e3c857d 100644 (file)
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -25,6 +25,11 @@
  
  /*----------------------------------------------------------------*/
  
+/*
+ * Thin metadata superblock flags.
+ */
+#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0)
+
  struct dm_pool_metadata;
  struct dm_thin_device;
  
@@ -202,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
                                         dm_sm_threshold_fn fn,
                                         void *context);
  
+/*
+ * Updates the superblock immediately.
+ */
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
+
  /*----------------------------------------------------------------*/
  
  #endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c

index dfa48ec..a04eba9 100644 (file)
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1403,7 +1403,28 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
  {
         int r;
         struct pool_c *pt = pool->ti->private;
-       enum pool_mode old_mode = pool->pf.mode;
+       bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
+       enum pool_mode old_mode = get_pool_mode(pool);
+
+       /*
+        * Never allow the pool to transition to PM_WRITE mode if user
+        * intervention is required to verify metadata and data consistency.
+        */
+       if (new_mode == PM_WRITE && needs_check) {
+               DMERR("%s: unable to switch pool to write mode until repaired.",
+                     dm_device_name(pool->pool_md));
+               if (old_mode != new_mode)
+                       new_mode = old_mode;
+               else
+                       new_mode = PM_READ_ONLY;
+       }
+       /*
+        * If we were in PM_FAIL mode, rollback of metadata failed.  We're
+        * not going to recover without a thin_repair.  So we never let the
+        * pool move out of the old mode.
+        */
+       if (old_mode == PM_FAIL)
+               new_mode = old_mode;
  
         switch (new_mode) {
         case PM_FAIL:
@@ -1467,19 +1488,28 @@ static void out_of_data_space(struct pool *pool)
         set_pool_mode(pool, PM_READ_ONLY);
  }
  
-static void metadata_operation_failed(struct pool *pool, const char *op, int r)
+static void abort_transaction(struct pool *pool)
  {
-       dm_block_t free_blocks;
+       const char *dev_name = dm_device_name(pool->pool_md);
+
+       DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+       if (dm_pool_abort_metadata(pool->pmd)) {
+               DMERR("%s: failed to abort metadata transaction", dev_name);
+               set_pool_mode(pool, PM_FAIL);
+       }
+
+       if (dm_pool_metadata_set_needs_check(pool->pmd)) {
+               DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+               set_pool_mode(pool, PM_FAIL);
+       }
+}
  
+static void metadata_operation_failed(struct pool *pool, const char *op, int r)
+{
         DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
                     dm_device_name(pool->pool_md), op, r);
  
-       if (r == -ENOSPC &&
-           !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
-           !free_blocks)
-               DMERR_LIMIT("%s: no free metadata space available.",
-                           dm_device_name(pool->pool_md));
-
+       abort_transaction(pool);
         set_pool_mode(pool, PM_READ_ONLY);
  }
  
@@ -1693,7 +1723,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
         /*
          * We want to make sure that a pool in PM_FAIL mode is never upgraded.
          */
-       enum pool_mode old_mode = pool->pf.mode;
+       enum pool_mode old_mode = get_pool_mode(pool);
         enum pool_mode new_mode = pt->adjusted_pf.mode;
  
         /*
@@ -1707,16 +1737,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
         pool->pf = pt->adjusted_pf;
         pool->low_water_blocks = pt->low_water_blocks;
  
-       /*
-        * If we were in PM_FAIL mode, rollback of metadata failed.  We're
-        * not going to recover without a thin_repair.  So we never let the
-        * pool move out of the old mode.  On the other hand a PM_READ_ONLY
-        * may have been due to a lack of metadata or data space, and may
-        * now work (ie. if the underlying devices have been resized).
-        */
-       if (old_mode == PM_FAIL)
-               new_mode = old_mode;
-
         set_pool_mode(pool, new_mode);
  
         return 0;
@@ -2259,6 +2279,12 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
                 return -EINVAL;
  
         } else if (data_size > sb_data_size) {
+               if (dm_pool_metadata_needs_check(pool->pmd)) {
+                       DMERR("%s: unable to grow the data device until repaired.",
+                             dm_device_name(pool->pool_md));
+                       return 0;
+               }
+
                 if (sb_data_size)
                         DMINFO("%s: growing the data device from %llu to %llu blocks",
                                dm_device_name(pool->pool_md),
@@ -2300,6 +2326,12 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
                 return -EINVAL;
  
         } else if (metadata_dev_size > sb_metadata_dev_size) {
+               if (dm_pool_metadata_needs_check(pool->pmd)) {
+                       DMERR("%s: unable to grow the metadata device until repaired.",
+                             dm_device_name(pool->pool_md));
+                       return 0;
+               }
+
                 warn_if_metadata_device_too_big(pool->md_dev);
                 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
                        dm_device_name(pool->pool_md),
@@ -2801,7 +2833,7 @@ static struct target_type pool_target = {
         .name = "thin-pool",
         .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                     DM_TARGET_IMMUTABLE,
-       .version = {1, 10, 0},
+       .version = {1, 11, 0},
         .module = THIS_MODULE,
         .ctr = pool_ctr,
         .dtr = pool_dtr,
@@ -3091,7 +3123,7 @@ static int thin_iterate_devices(struct dm_target *ti,
  
  static struct target_type thin_target = {
         .name = "thin",
-       .version = {1, 10, 0},
+       .version = {1, 11, 0},
         .module = THIS_MODULE,
         .ctr = thin_ctr,
         .dtr = thin_dtr,
author	Mike Snitzer <snitzer@redhat.com>
	Fri, 14 Feb 2014 16:58:41 +0000 (11:58 -0500)
committer	Mike Snitzer <snitzer@redhat.com>
	Wed, 5 Mar 2014 20:25:35 +0000 (15:25 -0500)
Documentation/device-mapper/cache.txt		patch \| blob \| history
Documentation/device-mapper/thin-provisioning.txt		patch \| blob \| history
drivers/md/dm-thin-metadata.c		patch \| blob \| history
drivers/md/dm-thin-metadata.h		patch \| blob \| history
drivers/md/dm-thin.c		patch \| blob \| history