Updating on-disk metadata
-------------------------
-On-disk metadata is committed every time a REQ_SYNC or REQ_FUA bio is
-written. If no such requests are made then commits will occur every
-second. This means the cache behaves like a physical disk that has a
-write cache (the same is true of the thin-provisioning target). If
-power is lost you may lose some recent writes. The metadata should
-always be consistent in spite of any crash.
+On-disk metadata is committed every time a FLUSH or FUA bio is written.
+If no such requests are made then commits will occur every second. This
+means the cache behaves like a physical disk that has a volatile write
+cache. If power is lost you may lose some recent writes. The metadata
+should always be consistent in spite of any crash.
The 'dirty' state for a cache block changes far too frequently for us
to keep updating it on the fly. So we treat it as a hint. In normal
userspace daemon can use this to detect a situation where a new table
already exceeds the threshold.
+A low water mark for the metadata device is maintained in the kernel and
+will trigger a dm event if free space on the metadata device drops below
+it.
+
+Updating on-disk metadata
+-------------------------
+
+On-disk metadata is committed every time a FLUSH or FUA bio is written.
+If no such requests are made then commits will occur every second. This
+means the thin-provisioning target behaves like a physical disk that has
+a volatile write cache. If power is lost you may lose some recent
+writes. The metadata should always be consistent in spite of any crash.
+
+If data space is exhausted the pool will either error or queue IO
+according to the configuration (see: error_if_no_space). If metadata
+space is exhausted or a metadata operation fails: the pool will error IO
+until the pool is taken offline and repair is performed to 1) fix any
+potential inconsistencies and 2) clear the flag that imposes repair.
+Once the pool's metadata device is repaired it may be resized, which
+will allow the pool to return to normal operation. Note that if a pool
+is flagged as needing repair, the pool's data and metadata devices
+cannot be resized until repair is performed. It should also be noted
+that when the pool's metadata space is exhausted the current metadata
+transaction is aborted. Given that the pool will cache IO whose
+completion may have already been acknowledged to upper IO layers
+(e.g. filesystem) it is strongly suggested that consistency checks
+(e.g. fsck) be performed on those layers when repair of the pool is
+required.
+
Thin provisioning
-----------------
#define THIN_SUPERBLOCK_MAGIC 27022010
#define THIN_SUPERBLOCK_LOCATION 0
-#define THIN_VERSION 1
+#define THIN_VERSION 2
#define THIN_METADATA_CACHE_SIZE 64
#define SECTOR_TO_BLOCK_SHIFT 3
return r;
}
+
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct thin_disk_superblock *disk_super;
+
+ down_write(&pmd->root_lock);
+ pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
+
+ r = superblock_lock(pmd, &sblock);
+ if (r) {
+ DMERR("couldn't read superblock");
+ goto out;
+ }
+
+ disk_super = dm_block_data(sblock);
+ disk_super->flags = cpu_to_le32(pmd->flags);
+
+ dm_bm_unlock(sblock);
+out:
+ up_write(&pmd->root_lock);
+ return r;
+}
+
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
+{
+ bool needs_check;
+
+ down_read(&pmd->root_lock);
+ needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
+ up_read(&pmd->root_lock);
+
+ return needs_check;
+}
{
int r;
struct pool_c *pt = pool->ti->private;
- enum pool_mode old_mode = pool->pf.mode;
+ bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
+ enum pool_mode old_mode = get_pool_mode(pool);
+
+ /*
+ * Never allow the pool to transition to PM_WRITE mode if user
+ * intervention is required to verify metadata and data consistency.
+ */
+ if (new_mode == PM_WRITE && needs_check) {
+ DMERR("%s: unable to switch pool to write mode until repaired.",
+ dm_device_name(pool->pool_md));
+ if (old_mode != new_mode)
+ new_mode = old_mode;
+ else
+ new_mode = PM_READ_ONLY;
+ }
+ /*
+ * If we were in PM_FAIL mode, rollback of metadata failed. We're
+ * not going to recover without a thin_repair. So we never let the
+ * pool move out of the old mode.
+ */
+ if (old_mode == PM_FAIL)
+ new_mode = old_mode;
switch (new_mode) {
case PM_FAIL:
set_pool_mode(pool, PM_READ_ONLY);
}
-static void metadata_operation_failed(struct pool *pool, const char *op, int r)
+static void abort_transaction(struct pool *pool)
{
- dm_block_t free_blocks;
+ const char *dev_name = dm_device_name(pool->pool_md);
+
+ DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+ if (dm_pool_abort_metadata(pool->pmd)) {
+ DMERR("%s: failed to abort metadata transaction", dev_name);
+ set_pool_mode(pool, PM_FAIL);
+ }
+
+ if (dm_pool_metadata_set_needs_check(pool->pmd)) {
+ DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+ set_pool_mode(pool, PM_FAIL);
+ }
+}
+static void metadata_operation_failed(struct pool *pool, const char *op, int r)
+{
DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
dm_device_name(pool->pool_md), op, r);
- if (r == -ENOSPC &&
- !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
- !free_blocks)
- DMERR_LIMIT("%s: no free metadata space available.",
- dm_device_name(pool->pool_md));
-
+ abort_transaction(pool);
set_pool_mode(pool, PM_READ_ONLY);
}
/*
* We want to make sure that a pool in PM_FAIL mode is never upgraded.
*/
- enum pool_mode old_mode = pool->pf.mode;
+ enum pool_mode old_mode = get_pool_mode(pool);
enum pool_mode new_mode = pt->adjusted_pf.mode;
/*
pool->pf = pt->adjusted_pf;
pool->low_water_blocks = pt->low_water_blocks;
- /*
- * If we were in PM_FAIL mode, rollback of metadata failed. We're
- * not going to recover without a thin_repair. So we never let the
- * pool move out of the old mode. On the other hand a PM_READ_ONLY
- * may have been due to a lack of metadata or data space, and may
- * now work (ie. if the underlying devices have been resized).
- */
- if (old_mode == PM_FAIL)
- new_mode = old_mode;
-
set_pool_mode(pool, new_mode);
return 0;
return -EINVAL;
} else if (data_size > sb_data_size) {
+ if (dm_pool_metadata_needs_check(pool->pmd)) {
+ DMERR("%s: unable to grow the data device until repaired.",
+ dm_device_name(pool->pool_md));
+ return 0;
+ }
+
if (sb_data_size)
DMINFO("%s: growing the data device from %llu to %llu blocks",
dm_device_name(pool->pool_md),
return -EINVAL;
} else if (metadata_dev_size > sb_metadata_dev_size) {
+ if (dm_pool_metadata_needs_check(pool->pmd)) {
+ DMERR("%s: unable to grow the metadata device until repaired.",
+ dm_device_name(pool->pool_md));
+ return 0;
+ }
+
warn_if_metadata_device_too_big(pool->md_dev);
DMINFO("%s: growing the metadata device from %llu to %llu blocks",
dm_device_name(pool->pool_md),
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 10, 0},
+ .version = {1, 11, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 10, 0},
+ .version = {1, 11, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,