Btrfs-progs: check, ability to detect and fix outdated snapshot root items
authorFilipe Manana <fdmanana@suse.com>
Fri, 17 Oct 2014 17:20:08 +0000 (18:20 +0100)
committerDavid Sterba <dsterba@suse.cz>
Fri, 17 Oct 2014 16:24:54 +0000 (18:24 +0200)
This change adds code to detect and fix the issue introduced in the kernel
release 3.17, where creation of read-only snapshots led to a corrupted
filesystem if they were created at a moment when the source subvolume/snapshot
had orphan items. The issue was that the on-disk root items became incorrect,
referring to the pre orphan cleanup root node instead of the post orphan
cleanup root node.

A test filesystem can be generated with the test case recently submitted for
xfstests/fstests, which is essentially the following (bash script):

    workout()
    {
ops=$1
procs=$2
num_snapshots=$3

_scratch_mkfs >> $seqres.full 2>&1
_scratch_mount

snapshot_cmd="$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT"
snapshot_cmd="$snapshot_cmd $SCRATCH_MNT/snap_\`date +'%H_%M_%S_%N'\`"
run_check $FSSTRESS_PROG -p $procs \
    -x "$snapshot_cmd" -X $num_snapshots -d $SCRATCH_MNT -n $ops
    }

    ops=10000
    procs=4
    snapshots=500
    workout $ops $procs $snapshots

Example of btrfsck's (btrfs check) behaviour against such filesystem:

  $ btrfsck /dev/loop0
  root item for root 311, current bytenr 44630016, current gen 60, current level 1, new bytenr 44957696, new gen 61, new level 1
  root item for root 1480, current bytenr 1003569152, current gen 1271, current level 1, new bytenr 1004175360, new gen 1272, new level 1
  root item for root 1509, current bytenr 1037434880, current gen 1300, current level 1, new bytenr 1038467072, new gen 1301, new level 1
  root item for root 1562, current bytenr 33636352, current gen 1354, current level 1, new bytenr 34455552, new gen 1355, new level 1
  root item for root 3094, current bytenr 1011712000, current gen 2935, current level 1, new bytenr 1008484352, new gen 2936, new level 1
  root item for root 3716, current bytenr 80805888, current gen 3578, current level 1, new bytenr 73515008, new gen 3579, new level 1
  root item for root 4085, current bytenr 714031104, current gen 3958, current level 1, new bytenr 716816384, new gen 3959, new level 1
  Found 7 roots with an outdated root item.
  Please run a filesystem check with the option --repair to fix them.

  $ echo $?
  1

  $ btrfsck --repair /dev/loop0
  enabling repair mode
  fixing root item for root 311, current bytenr 44630016, current gen 60, current level 1, new bytenr 44957696, new gen 61, new level 1
  fixing root item for root 1480, current bytenr 1003569152, current gen 1271, current level 1, new bytenr 1004175360, new gen 1272, new level 1
  fixing root item for root 1509, current bytenr 1037434880, current gen 1300, current level 1, new bytenr 1038467072, new gen 1301, new level 1
  fixing root item for root 1562, current bytenr 33636352, current gen 1354, current level 1, new bytenr 34455552, new gen 1355, new level 1
  fixing root item for root 3094, current bytenr 1011712000, current gen 2935, current level 1, new bytenr 1008484352, new gen 2936, new level 1
  fixing root item for root 3716, current bytenr 80805888, current gen 3578, current level 1, new bytenr 73515008, new gen 3579, new level 1
  fixing root item for root 4085, current bytenr 714031104, current gen 3958, current level 1, new bytenr 716816384, new gen 3959, new level 1
  Fixed 7 roots.
  Checking filesystem on /dev/loop0
  UUID: 2186e9b9-c977-4a35-9c7b-69c6609d4620
  checking extents
  checking free space cache
  cache and super generation don't match, space cache will be invalidated
  checking fs roots
  checking csums
  checking root refs
  found 618537000 bytes used err is 0
  total csum bytes: 130824
  total tree bytes: 601620480
  total fs tree bytes: 580288512
  total extent tree bytes: 18464768
  btree space waste bytes: 136939144
  file data blocks allocated: 34150318080
   referenced 27815415808
  Btrfs v3.17-rc3-2-gbbe1dd8

  $ echo $?
  0

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
cmds-check.c
disk-io.c
extent-tree.c
tests/fsck-tests.sh
tests/fsck-tests/006-bad_root_items_fs.tar.xz [new file with mode: 0644]
tests/fsck-tests/007-bad_root_items_fs_skinny.tar.xz [new file with mode: 0644]
utils.c
utils.h

index 310eb2a..2a5f823 100644 (file)
@@ -7198,6 +7198,345 @@ static int fill_csum_tree(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+/*
+ * Per-root summary collected from the extent tree by build_roots_info_cache().
+ * Entries are kept in roots_info_cache, keyed by root id: cache_extent.start
+ * holds the root id and cache_extent.size is always 1.
+ */
+struct root_item_info {
+       /* highest tree block level seen so far for the root, (u8)-1 if unset */
+       u8 level;
+       /* number of nodes at this level, must be 1 for a root */
+       int node_count;
+       /* bytenr (extent item key objectid) of the first block seen at level */
+       u64 bytenr;
+       /* generation read from that block's extent item */
+       u64 gen;
+       /* links this entry into roots_info_cache */
+       struct cache_extent cache_extent;
+};
+
+static struct cache_tree *roots_info_cache = NULL;
+
+/*
+ * Drop every root_item_info held in roots_info_cache, then free the cache
+ * tree itself and reset the global pointer. Does nothing if the cache was
+ * never allocated.
+ */
+static void free_roots_info_cache(void)
+{
+       if (roots_info_cache == NULL)
+               return;
+
+       while (!cache_tree_empty(roots_info_cache)) {
+               struct cache_extent *ce = first_cache_extent(roots_info_cache);
+
+               /* Unlink first, the entry embeds the tree node. */
+               remove_cache_extent(roots_info_cache, ce);
+               free(container_of(ce, struct root_item_info, cache_extent));
+       }
+
+       free(roots_info_cache);
+       roots_info_cache = NULL;
+}
+
+/*
+ * Scan the extent tree and, for every root id found in the first inline
+ * backref (of type BTRFS_TREE_BLOCK_REF_KEY) of a tree block extent item,
+ * remember the highest level seen, the bytenr and generation of the first
+ * block found at that level and how many blocks share that level. The
+ * results are stored in roots_info_cache, keyed by root id.
+ *
+ * Returns 0 on success or a negative errno. On failure the entries inserted
+ * so far stay in roots_info_cache; free_roots_info_cache() releases them.
+ */
+static int build_roots_info_cache(struct btrfs_fs_info *info)
+{
+       int ret = 0;
+       struct btrfs_key key;
+       struct extent_buffer *leaf;
+       struct btrfs_path *path;
+
+       if (!roots_info_cache) {
+               roots_info_cache = malloc(sizeof(*roots_info_cache));
+               if (!roots_info_cache)
+                       return -ENOMEM;
+               cache_tree_init(roots_info_cache);
+       }
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       /* Start from the very first item of the extent tree. */
+       key.objectid = 0;
+       key.type = BTRFS_EXTENT_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, info->extent_root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+       leaf = path->nodes[0];
+
+       while (1) {
+               struct btrfs_key found_key;
+               struct btrfs_extent_item *ei;
+               struct btrfs_extent_inline_ref *iref;
+               unsigned long item_end;
+               int slot = path->slots[0];
+               int type;
+               u64 flags;
+               u64 root_id;
+               u8 level;
+               struct cache_extent *entry;
+               struct root_item_info *rii;
+
+               if (slot >= btrfs_header_nritems(leaf)) {
+                       ret = btrfs_next_leaf(info->extent_root, path);
+                       if (ret < 0) {
+                               break;
+                       } else if (ret) {
+                               /* No more leaves, the scan is complete. */
+                               ret = 0;
+                               break;
+                       }
+                       leaf = path->nodes[0];
+                       slot = path->slots[0];
+               }
+
+               btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+               if (found_key.type != BTRFS_EXTENT_ITEM_KEY &&
+                   found_key.type != BTRFS_METADATA_ITEM_KEY)
+                       goto next;
+
+               ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+               flags = btrfs_extent_flags(leaf, ei);
+               item_end = (unsigned long)ei + btrfs_item_size_nr(leaf, slot);
+
+               /* Data extents are of no interest here. */
+               if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+                   !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+                       goto next;
+
+               if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+                       /* Skinny metadata: the level is the key offset. */
+                       iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+                       level = found_key.offset;
+               } else {
+                       struct btrfs_tree_block_info *bi;
+
+                       bi = (struct btrfs_tree_block_info *)(ei + 1);
+                       iref = (struct btrfs_extent_inline_ref *)(bi + 1);
+                       level = btrfs_tree_block_level(leaf, bi);
+               }
+
+               /*
+                * An extent item may carry no inline ref at all (for example
+                * when it is only referenced through separate backref items).
+                * Skip it instead of reading past the end of the item.
+                */
+               if ((unsigned long)iref + sizeof(*iref) > item_end)
+                       goto next;
+
+               /*
+                * For a root extent, it must be of the following type and the
+                * first (and only one) iref in the item.
+                */
+               type = btrfs_extent_inline_ref_type(leaf, iref);
+               if (type != BTRFS_TREE_BLOCK_REF_KEY)
+                       goto next;
+
+               root_id = btrfs_extent_inline_ref_offset(leaf, iref);
+               entry = lookup_cache_extent(roots_info_cache, root_id, 1);
+               if (!entry) {
+                       rii = malloc(sizeof(*rii));
+                       if (!rii) {
+                               ret = -ENOMEM;
+                               goto out;
+                       }
+                       rii->cache_extent.start = root_id;
+                       rii->cache_extent.size = 1;
+                       /* (u8)-1 marks "no level recorded yet". */
+                       rii->level = (u8)-1;
+                       entry = &rii->cache_extent;
+                       ret = insert_cache_extent(roots_info_cache, entry);
+                       ASSERT(ret == 0);
+               } else {
+                       rii = container_of(entry, struct root_item_info,
+                                          cache_extent);
+               }
+
+               ASSERT(rii->cache_extent.start == root_id);
+               ASSERT(rii->cache_extent.size == 1);
+
+               /*
+                * Track the highest level only: a higher block replaces what
+                * was recorded, a block at the same level just bumps the
+                * count so that callers can tell it cannot be the root node.
+                */
+               if (level > rii->level || rii->level == (u8)-1) {
+                       rii->level = level;
+                       rii->bytenr = found_key.objectid;
+                       rii->gen = btrfs_extent_generation(leaf, ei);
+                       rii->node_count = 1;
+               } else if (level == rii->level) {
+                       rii->node_count++;
+               }
+next:
+               path->slots[0]++;
+       }
+
+out:
+       btrfs_free_path(path);
+
+       return ret;
+}
+
+/*
+ * Compare the root item at path->nodes[0] / path->slots[0] with what the
+ * extent tree scan recorded in roots_info_cache for the same root id.
+ *
+ * Returns 0 if the root item already matches, 1 if it is outdated (it is
+ * rewritten in the leaf unless read_only_mode is set), -ENOENT if no single
+ * root node was recorded for the root and -EINVAL if the root item carries a
+ * newer generation than the recorded root node.
+ */
+static int maybe_repair_root_item(struct btrfs_fs_info *info,
+                                 struct btrfs_path *path,
+                                 const struct btrfs_key *root_key,
+                                 const int read_only_mode)
+{
+       const u64 root_id = root_key->objectid;
+       struct extent_buffer *eb = path->nodes[0];
+       struct cache_extent *entry;
+       struct root_item_info *rii;
+       struct btrfs_root_item ri;
+       unsigned long offset;
+       u64 cur_bytenr;
+       u64 cur_gen;
+       u8 cur_level;
+
+       entry = lookup_cache_extent(roots_info_cache, root_id, 1);
+       if (entry == NULL) {
+               fprintf(stderr,
+                       "Error: could not find extent items for root %llu\n",
+                       root_key->objectid);
+               return -ENOENT;
+       }
+
+       rii = container_of(entry, struct root_item_info, cache_extent);
+       ASSERT(rii->cache_extent.start == root_id);
+       ASSERT(rii->cache_extent.size == 1);
+
+       /* More than one block at the top level means no root node was found. */
+       if (rii->node_count != 1) {
+               fprintf(stderr,
+                       "Error: could not find btree root extent for root %llu\n",
+                       root_id);
+               return -ENOENT;
+       }
+
+       /* Work on a stack copy of the on-disk root item. */
+       offset = btrfs_item_ptr_offset(eb, path->slots[0]);
+       read_extent_buffer(eb, &ri, offset, sizeof(ri));
+
+       cur_bytenr = btrfs_root_bytenr(&ri);
+       cur_gen = btrfs_root_generation(&ri);
+       cur_level = btrfs_root_level(&ri);
+
+       /* Up to date, nothing to report or fix. */
+       if (cur_bytenr == rii->bytenr && cur_level == rii->level &&
+           cur_gen == rii->gen)
+               return 0;
+
+       /*
+        * In repair mode the caller first probes in read only mode and then
+        * calls again with a transaction open; stay quiet on the probe so the
+        * message is printed once per root.
+        */
+       if (!read_only_mode || !repair)
+               fprintf(stderr,
+                       "%sroot item for root %llu,"
+                       " current bytenr %llu, current gen %llu, current level %u,"
+                       " new bytenr %llu, new gen %llu, new level %u\n",
+                       (read_only_mode ? "" : "fixing "),
+                       root_id,
+                       cur_bytenr, cur_gen, cur_level,
+                       rii->bytenr, rii->gen, rii->level);
+
+       if (cur_gen > rii->gen) {
+               fprintf(stderr,
+                       "root %llu has a root item with a more recent gen (%llu) compared to the found root node (%llu)\n",
+                       root_id, cur_gen, rii->gen);
+               return -EINVAL;
+       }
+
+       if (read_only_mode)
+               return 1;
+
+       /* Point the root item at the recorded root node and write it back. */
+       btrfs_set_root_bytenr(&ri, rii->bytenr);
+       btrfs_set_root_level(&ri, rii->level);
+       btrfs_set_root_generation(&ri, rii->gen);
+       write_extent_buffer(eb, &ri, offset, sizeof(ri));
+
+       return 1;
+}
+
+/*
+ * A regression introduced in the 3.17 kernel (more specifically in 3.17-rc2),
+ * caused read-only snapshots to be corrupted if they were created at a moment
+ * when the source subvolume/snapshot had orphan items. The issue was that the
+ * on-disk root items became incorrect, referring to the pre orphan cleanup root
+ * node instead of the post orphan cleanup root node.
+ * So this function, and its callees, just detects and fixes those cases. Even
+ * though the regression was for read-only snapshots, this function applies to
+ * any snapshot/subvolume root.
+ * This must be run before any other repair code - otherwise the other repair
+ * code may delete or modify backrefs in the extent tree for example, which
+ * will result in an inconsistent fs after repairing the root items.
+ *
+ * Returns the number of outdated root items found (and fixed, when the global
+ * repair flag is set) or a negative errno on failure.
+ */
+static int repair_root_items(struct btrfs_fs_info *info)
+{
+       struct btrfs_path *path = NULL;
+       struct btrfs_key key;
+       struct extent_buffer *leaf;
+       struct btrfs_trans_handle *trans = NULL;
+       int ret = 0;
+       int bad_roots = 0;
+       int need_trans = 0;
+
+       ret = build_roots_info_cache(info);
+       if (ret)
+               goto out;
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+       key.type = BTRFS_ROOT_ITEM_KEY;
+       key.offset = 0;
+
+again:
+       /*
+        * Avoid opening and committing transactions if a leaf doesn't have
+        * any root items that need to be fixed, so that we avoid rotating
+        * backup roots unnecessarily.
+        */
+       if (need_trans) {
+               trans = btrfs_start_transaction(info->tree_root, 1);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       /* Not a usable handle, keep the exit path off it. */
+                       trans = NULL;
+                       goto out;
+               }
+       }
+
+       ret = btrfs_search_slot(trans, info->tree_root, &key, path,
+                               0, trans ? 1 : 0);
+       if (ret < 0)
+               goto out;
+       leaf = path->nodes[0];
+
+       while (1) {
+               struct btrfs_key found_key;
+
+               if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+                       /* Grab the next key before the path is released. */
+                       int no_more_keys = find_next_key(path, &key);
+
+                       btrfs_release_path(path);
+                       if (trans) {
+                               ret = btrfs_commit_transaction(trans,
+                                                              info->tree_root);
+                               trans = NULL;
+                               if (ret < 0)
+                                       goto out;
+                       }
+                       need_trans = 0;
+                       if (no_more_keys)
+                               break;
+                       goto again;
+               }
+
+               btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+               if (found_key.type != BTRFS_ROOT_ITEM_KEY)
+                       goto next;
+
+               ret = maybe_repair_root_item(info, path, &found_key,
+                                            trans ? 0 : 1);
+               if (ret < 0)
+                       goto out;
+               if (ret) {
+                       /*
+                        * First hit in this leaf while repairing: restart the
+                        * search at this key with a transaction open so the
+                        * leaf can be modified.
+                        */
+                       if (!trans && repair) {
+                               need_trans = 1;
+                               key = found_key;
+                               btrfs_release_path(path);
+                               goto again;
+                       }
+                       bad_roots++;
+               }
+next:
+               path->slots[0]++;
+       }
+       ret = 0;
+out:
+       free_roots_info_cache();
+       if (path)
+               btrfs_free_path(path);
+       /*
+        * A transaction is still open here only when we bailed out with an
+        * error in the middle of a leaf. Commit it so that the handle is not
+        * leaked; the error that brought us here is what gets returned.
+        */
+       if (trans)
+               btrfs_commit_transaction(trans, info->tree_root);
+       if (ret < 0)
+               return ret;
+
+       return bad_roots;
+}
+
 static struct option long_options[] = {
        { "super", 1, NULL, 's' },
        { "repair", 0, NULL, 0 },
@@ -7320,6 +7659,23 @@ int cmd_check(int argc, char **argv)
        }
 
        root = info->fs_root;
+
+       ret = repair_root_items(info);
+       if (ret < 0)
+               goto close_out;
+       if (repair) {
+               fprintf(stderr, "Fixed %d roots.\n", ret);
+               ret = 0;
+       } else if (ret > 0) {
+               fprintf(stderr,
+                      "Found %d roots with an outdated root item.\n",
+                      ret);
+               fprintf(stderr,
+                       "Please run a filesystem check with the option --repair to fix them.\n");
+               ret = 1;
+               goto close_out;
+       }
+
        /*
         * repair mode will force us to commit transaction which
         * will make us fail to load log tree when mounting.
index 02b6d42..77fc610 100644 (file)
--- a/disk-io.c
+++ b/disk-io.c
@@ -475,6 +475,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        if (root->commit_root == root->node)
                goto commit_tree;
+       if (root == root->fs_info->tree_root)
+               goto commit_tree;
 
        free_extent_buffer(root->commit_root);
        root->commit_root = NULL;
index 5443ec8..080f30d 100644 (file)
@@ -29,6 +29,7 @@
 #include "volumes.h"
 #include "free-space-cache.h"
 #include "math.h"
+#include "utils.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
@@ -972,27 +973,6 @@ static inline int extent_ref_type(u64 parent, u64 owner)
        return type;
 }
 
-static int find_next_key(struct btrfs_path *path, struct btrfs_key *key)
-
-{
-       int level;
-       for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
-               if (!path->nodes[level])
-                       break;
-               if (path->slots[level] + 1 >=
-                   btrfs_header_nritems(path->nodes[level]))
-                       continue;
-               if (level == 0)
-                       btrfs_item_key_to_cpu(path->nodes[level], key,
-                                             path->slots[level] + 1);
-               else
-                       btrfs_node_key_to_cpu(path->nodes[level], key,
-                                             path->slots[level] + 1);
-               return 0;
-       }
-       return 1;
-}
-
 static int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
index 867366b..3f04626 100644 (file)
@@ -27,12 +27,23 @@ rm -f $RESULT
 # test rely on corrupting blocks tool
 run_check make btrfs-corrupt-block
 
-for i in $(find $here/tests/fsck-tests -name '*.img')
+# Some broken filesystem images are kept as .img files, created by the tool
+# btrfs-image, and others are kept as .tar.xz files that contain raw filesystem
+# image (the backing file of a loop device, as a sparse file). The reason for
+# keeping some as tarballs of raw images is that for these cases btrfs-image
+# isn't able to preserve all the (bad) filesystem structure for some reason.
+for i in $(find $here/tests/fsck-tests -name '*.img' -o -name '*.tar.xz')
 do
        echo "     [TEST]    $(basename $i)"
        echo "testing image $i" >> $RESULT
 
-       run_check $here/btrfs-image -r $i test.img
+       extension=${i#*.}
+
+       if [ $extension == "img" ]; then
+               run_check $here/btrfs-image -r $i test.img
+       else
+               run_check tar xJf $i
+       fi
 
        $here/btrfsck test.img >> $RESULT 2>&1
        [ $? -eq 0 ] && _fail "btrfsck should have detected corruption"
diff --git a/tests/fsck-tests/006-bad_root_items_fs.tar.xz b/tests/fsck-tests/006-bad_root_items_fs.tar.xz
new file mode 100644 (file)
index 0000000..125d8e7
Binary files /dev/null and b/tests/fsck-tests/006-bad_root_items_fs.tar.xz differ
diff --git a/tests/fsck-tests/007-bad_root_items_fs_skinny.tar.xz b/tests/fsck-tests/007-bad_root_items_fs_skinny.tar.xz
new file mode 100644 (file)
index 0000000..ed99dc4
Binary files /dev/null and b/tests/fsck-tests/007-bad_root_items_fs_skinny.tar.xz differ
diff --git a/utils.c b/utils.c
index 5a72f19..f10c178 100644 (file)
--- a/utils.c
+++ b/utils.c
@@ -2410,3 +2410,24 @@ void units_set_base(unsigned *units, unsigned base)
 
        *units = base | mode;
 }
+
+/*
+ * Find the key that follows the current position of @path, walking up from
+ * the leaf until a level with a slot after the current one is found.
+ *
+ * Returns 0 and fills @key when such a key exists, 1 when every level of the
+ * path is already at its last slot.
+ */
+int find_next_key(struct btrfs_path *path, struct btrfs_key *key)
+{
+       int level = 0;
+
+       while (level < BTRFS_MAX_LEVEL && path->nodes[level]) {
+               struct extent_buffer *eb = path->nodes[level];
+               int next = path->slots[level] + 1;
+
+               if (next < btrfs_header_nritems(eb)) {
+                       /* Leaves hold item keys, upper levels node keys. */
+                       if (level == 0)
+                               btrfs_item_key_to_cpu(eb, key, next);
+                       else
+                               btrfs_node_key_to_cpu(eb, key, next);
+                       return 0;
+               }
+               level++;
+       }
+       return 1;
+}
diff --git a/utils.h b/utils.h
index aed03f2..7accbd2 100644 (file)
--- a/utils.h
+++ b/utils.h
@@ -158,4 +158,6 @@ static inline u64 btrfs_min_dev_size(u32 leafsize)
                    btrfs_min_global_blk_rsv_size(leafsize));
 }
 
+int find_next_key(struct btrfs_path *path, struct btrfs_key *key);
+
 #endif