btrfs-progs: remove wrong set_argv0 for restore
[platform/upstream/btrfs-progs.git] / cmds-check.c
index 8015288..d479361 100644 (file)
@@ -38,6 +38,7 @@
 #include "commands.h"
 #include "free-space-cache.h"
 #include "btrfsck.h"
+#include "qgroup-verify.h"
 
 static u64 bytes_used = 0;
 static u64 total_csum_bytes = 0;
@@ -49,6 +50,11 @@ static u64 data_bytes_allocated = 0;
 static u64 data_bytes_referenced = 0;
 static int found_old_backref = 0;
 static LIST_HEAD(duplicate_extents);
+static LIST_HEAD(delete_items);
+static int repair = 0;
+static int no_holes = 0;
+static int init_extent_tree = 0;
+static int check_data_csum = 0;
 
 struct extent_backref {
        struct list_head list;
@@ -56,6 +62,7 @@ struct extent_backref {
        unsigned int found_extent_tree:1;
        unsigned int full_backref:1;
        unsigned int found_ref:1;
+       unsigned int broken:1;
 };
 
 struct data_backref {
@@ -87,16 +94,17 @@ struct extent_record {
        struct list_head list;
        struct cache_extent cache;
        struct btrfs_disk_key parent_key;
-       unsigned int found_rec;
        u64 start;
        u64 max_size;
        u64 nr;
        u64 refs;
        u64 extent_item_refs;
        u64 generation;
+       u64 parent_generation;
        u64 info_objectid;
-       u64 num_duplicates;
+       u32 num_duplicates;
        u8 info_level;
+       unsigned int found_rec:1;
        unsigned int content_checked:1;
        unsigned int owner_ref_checked:1;
        unsigned int is_root:1;
@@ -117,6 +125,12 @@ struct inode_backref {
        char name[0];
 };
 
+struct dropping_root_item_record {
+       struct list_head list;
+       struct btrfs_root_item ri;
+       struct btrfs_key found_key;
+};
+
 #define REF_ERR_NO_DIR_ITEM            (1 << 0)
 #define REF_ERR_NO_DIR_INDEX           (1 << 1)
 #define REF_ERR_NO_INODE_REF           (1 << 2)
@@ -221,6 +235,14 @@ struct walk_control {
        int root_level;
 };
 
+struct bad_item {
+       struct btrfs_key key;
+       u64 root_id;
+       struct list_head list;
+};
+
+static void reset_cached_block_groups(struct btrfs_fs_info *fs_info);
+
 static u8 imode_to_type(u32 imode)
 {
 #define S_SHIFT 12
@@ -274,6 +296,70 @@ static struct inode_record *clone_inode_rec(struct inode_record *orig_rec)
        return rec;
 }
 
+static void print_inode_error(int errors)
+{
+       if (errors & I_ERR_NO_INODE_ITEM)
+               fprintf(stderr, ", no inode item");
+       if (errors & I_ERR_NO_ORPHAN_ITEM)
+               fprintf(stderr, ", no orphan item");
+       if (errors & I_ERR_DUP_INODE_ITEM)
+               fprintf(stderr, ", dup inode item");
+       if (errors & I_ERR_DUP_DIR_INDEX)
+               fprintf(stderr, ", dup dir index");
+       if (errors & I_ERR_ODD_DIR_ITEM)
+               fprintf(stderr, ", odd dir item");
+       if (errors & I_ERR_ODD_FILE_EXTENT)
+               fprintf(stderr, ", odd file extent");
+       if (errors & I_ERR_BAD_FILE_EXTENT)
+               fprintf(stderr, ", bad file extent");
+       if (errors & I_ERR_FILE_EXTENT_OVERLAP)
+               fprintf(stderr, ", file extent overlap");
+       if (errors & I_ERR_FILE_EXTENT_DISCOUNT)
+               fprintf(stderr, ", file extent discount");
+       if (errors & I_ERR_DIR_ISIZE_WRONG)
+               fprintf(stderr, ", dir isize wrong");
+       if (errors & I_ERR_FILE_NBYTES_WRONG)
+               fprintf(stderr, ", nbytes wrong");
+       if (errors & I_ERR_ODD_CSUM_ITEM)
+               fprintf(stderr, ", odd csum item");
+       if (errors & I_ERR_SOME_CSUM_MISSING)
+               fprintf(stderr, ", some csum missing");
+       if (errors & I_ERR_LINK_COUNT_WRONG)
+               fprintf(stderr, ", link count wrong");
+       fprintf(stderr, "\n");
+}
+
+static void print_ref_error(int errors)
+{
+       if (errors & REF_ERR_NO_DIR_ITEM)
+               fprintf(stderr, ", no dir item");
+       if (errors & REF_ERR_NO_DIR_INDEX)
+               fprintf(stderr, ", no dir index");
+       if (errors & REF_ERR_NO_INODE_REF)
+               fprintf(stderr, ", no inode ref");
+       if (errors & REF_ERR_DUP_DIR_ITEM)
+               fprintf(stderr, ", dup dir item");
+       if (errors & REF_ERR_DUP_DIR_INDEX)
+               fprintf(stderr, ", dup dir index");
+       if (errors & REF_ERR_DUP_INODE_REF)
+               fprintf(stderr, ", dup inode ref");
+       if (errors & REF_ERR_INDEX_UNMATCH)
+               fprintf(stderr, ", index unmatch");
+       if (errors & REF_ERR_FILETYPE_UNMATCH)
+               fprintf(stderr, ", filetype unmatch");
+       if (errors & REF_ERR_NAME_TOO_LONG)
+               fprintf(stderr, ", name too long");
+       if (errors & REF_ERR_NO_ROOT_REF)
+               fprintf(stderr, ", no root ref");
+       if (errors & REF_ERR_NO_ROOT_BACKREF)
+               fprintf(stderr, ", no root backref");
+       if (errors & REF_ERR_DUP_ROOT_REF)
+               fprintf(stderr, ", dup root ref");
+       if (errors & REF_ERR_DUP_ROOT_BACKREF)
+               fprintf(stderr, ", dup root backref");
+       fprintf(stderr, "\n");
+}
+
 static struct inode_record *get_inode_rec(struct cache_tree *inode_cache,
                                          u64 ino, int mod)
 {
@@ -375,8 +461,9 @@ static void maybe_free_inode_rec(struct cache_tree *inode_cache,
                        rec->errors |= I_ERR_FILE_NBYTES_WRONG;
                if (rec->extent_start == (u64)-1 || rec->extent_start > 0)
                        rec->first_extent_gap = 0;
-               if (rec->nlink > 0 && (rec->extent_end < rec->isize ||
-                   rec->first_extent_gap < rec->isize))
+               if (rec->nlink > 0 && !no_holes &&
+                   (rec->extent_end < rec->isize ||
+                    rec->first_extent_gap < rec->isize))
                        rec->errors |= I_ERR_FILE_EXTENT_DISCOUNT;
        }
 
@@ -410,7 +497,7 @@ static int check_orphan_item(struct btrfs_root *root, u64 ino)
 
        btrfs_init_path(&path);
        ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
-       btrfs_release_path(root, &path);
+       btrfs_release_path(&path);
        if (ret > 0)
                ret = -ENOENT;
        return ret;
@@ -794,7 +881,7 @@ static int is_child_root(struct btrfs_root *root, u64 parent_root_id,
        ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, &path,
                                0, 0);
        BUG_ON(ret < 0);
-       btrfs_release_path(root, &path);
+       btrfs_release_path(&path);
        if (!ret)
                return 1;
 
@@ -824,14 +911,14 @@ static int is_child_root(struct btrfs_root *root, u64 parent_root_id,
                has_parent = 1;
 
                if (key.offset == parent_root_id) {
-                       btrfs_release_path(root, &path);
+                       btrfs_release_path(&path);
                        return 1;
                }
 
                path.slots[0]++;
        }
 
-       btrfs_release_path(root, &path);
+       btrfs_release_path(&path);
        return has_parent? 0 : -1;
 }
 
@@ -1045,7 +1132,7 @@ static u64 count_csum_range(struct btrfs_root *root, u64 start, u64 len)
 
                path.slots[0]++;
        }
-       btrfs_release_path(root->fs_info->csum_root, &path);
+       btrfs_release_path(&path);
        return found;
 }
 
@@ -1081,7 +1168,7 @@ static int process_file_extent(struct btrfs_root *root,
        extent_type = btrfs_file_extent_type(eb, fi);
 
        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-               num_bytes = btrfs_file_extent_inline_len(eb, fi);
+               num_bytes = btrfs_file_extent_inline_len(eb, slot, fi);
                if (num_bytes == 0)
                        rec->errors |= I_ERR_BAD_FILE_EXTENT;
                rec->found_size += num_bytes;
@@ -1136,6 +1223,7 @@ static int process_one_leaf(struct btrfs_root *root, struct extent_buffer *eb,
        u32 nritems;
        int i;
        int ret = 0;
+       int error = 0;
        struct cache_tree *inode_cache;
        struct shared_node *active_node;
 
@@ -1151,6 +1239,8 @@ static int process_one_leaf(struct btrfs_root *root, struct extent_buffer *eb,
 
                if (key.objectid == BTRFS_FREE_SPACE_OBJECTID)
                        continue;
+               if (key.type == BTRFS_ORPHAN_ITEM_KEY)
+                       continue;
 
                if (active_node->current == NULL ||
                    active_node->current->ino < key.objectid) {
@@ -1183,8 +1273,10 @@ static int process_one_leaf(struct btrfs_root *root, struct extent_buffer *eb,
                default:
                        break;
                };
+               if (ret != 0)
+                       error = 1;
        }
-       return ret;
+       return error;
 }
 
 static void reada_walk_down(struct btrfs_root *root,
@@ -1195,7 +1287,6 @@ static void reada_walk_down(struct btrfs_root *root,
        u32 nritems;
        u32 blocksize;
        int i;
-       int ret;
        int level;
 
        level = btrfs_header_level(node);
@@ -1207,9 +1298,7 @@ static void reada_walk_down(struct btrfs_root *root,
        for (i = slot; i < nritems; i++) {
                bytenr = btrfs_node_blockptr(node, i);
                ptr_gen = btrfs_node_ptr_generation(node, i);
-               ret = readahead_tree_block(root, bytenr, blocksize, ptr_gen);
-               if (ret)
-                       break;
+               readahead_tree_block(root, bytenr, blocksize, ptr_gen);
        }
 }
 
@@ -1221,7 +1310,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
        struct extent_buffer *next;
        struct extent_buffer *cur;
        u32 blocksize;
-       int ret;
+       int ret, err = 0;
        u64 refs;
 
        WARN_ON(*level < 0);
@@ -1229,14 +1318,18 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
        ret = btrfs_lookup_extent_info(NULL, root,
                                       path->nodes[*level]->start,
                                       *level, 1, &refs, NULL);
-       if (ret < 0)
+       if (ret < 0) {
+               err = ret;
                goto out;
+       }
 
        if (refs > 1) {
                ret = enter_shared_node(root, path->nodes[*level]->start,
                                        refs, wc, *level);
-               if (ret > 0)
+               if (ret > 0) {
+                       err = ret;
                        goto out;
+               }
        }
 
        while (*level >= 0) {
@@ -1276,6 +1369,10 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
                        reada_walk_down(root, cur, path->slots[*level]);
                        next = read_tree_block(root, bytenr, blocksize,
                                               ptr_gen);
+                       if (!next) {
+                               err = -EIO;
+                               goto out;
+                       }
                }
 
                *level = *level - 1;
@@ -1285,7 +1382,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
        }
 out:
        path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
-       return 0;
+       return err;
 }
 
 static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
@@ -1336,6 +1433,95 @@ out:
        return ret;
 }
 
+static int repair_inode_isize(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, struct btrfs_path *path,
+                             struct inode_record *rec)
+{
+       struct btrfs_inode_item *ei;
+       struct btrfs_key key;
+       int ret;
+
+       key.objectid = rec->ino;
+       key.type = BTRFS_INODE_ITEM_KEY;
+       key.offset = (u64)-1;
+
+       ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+       if (ret < 0)
+               goto out;
+       if (ret) {
+               if (!path->slots[0]) {
+                       ret = -ENOENT;
+                       goto out;
+               }
+               path->slots[0]--;
+               ret = 0;
+       }
+       btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+       if (key.objectid != rec->ino) {
+               ret = -ENOENT;
+               goto out;
+       }
+
+       ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                           struct btrfs_inode_item);
+       btrfs_set_inode_size(path->nodes[0], ei, rec->found_size);
+       btrfs_mark_buffer_dirty(path->nodes[0]);
+       rec->errors &= ~I_ERR_DIR_ISIZE_WRONG;
+       printf("reset isize for dir %Lu root %Lu\n", rec->ino,
+              root->root_key.objectid);
+out:
+       btrfs_release_path(path);
+       return ret;
+}
+
+static int repair_inode_orphan_item(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root,
+                                   struct btrfs_path *path,
+                                   struct inode_record *rec)
+{
+       struct btrfs_key key;
+       int ret;
+
+       key.objectid = BTRFS_ORPHAN_OBJECTID;
+       key.type = BTRFS_ORPHAN_ITEM_KEY;
+       key.offset = rec->ino;
+
+       ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+       btrfs_release_path(path);
+       if (!ret)
+               rec->errors &= ~I_ERR_NO_ORPHAN_ITEM;
+       return ret;
+}
+
+static int try_repair_inode(struct btrfs_root *root, struct inode_record *rec)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_path *path;
+       int ret = 0;
+
+       /* So far we just fix dir isize wrong */
+       if (!(rec->errors & (I_ERR_DIR_ISIZE_WRONG | I_ERR_NO_ORPHAN_ITEM)))
+               return 1;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       trans = btrfs_start_transaction(root, 1);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               return PTR_ERR(trans);
+       }
+
+       if (rec->errors & I_ERR_DIR_ISIZE_WRONG)
+               ret = repair_inode_isize(trans, root, path, rec);
+       if (!ret && rec->errors & I_ERR_NO_ORPHAN_ITEM)
+               ret = repair_inode_orphan_item(trans, root, path, rec);
+       btrfs_commit_transaction(trans, root);
+       btrfs_free_path(path);
+       return ret;
+}
+
 static int check_inode_recs(struct btrfs_root *root,
                            struct cache_tree *inode_cache)
 {
@@ -1392,14 +1578,24 @@ static int check_inode_recs(struct btrfs_root *root,
                        }
                }
 
+               if (repair) {
+                       ret = try_repair_inode(root, rec);
+                       if (ret == 0 && can_free_inode_rec(rec)) {
+                               free_inode_rec(rec);
+                               continue;
+                       }
+                       ret = 0;
+               }
+
                error++;
                if (!rec->found_inode_item)
                        rec->errors |= I_ERR_NO_INODE_ITEM;
                if (rec->found_link != rec->nlink)
                        rec->errors |= I_ERR_LINK_COUNT_WRONG;
-               fprintf(stderr, "root %llu inode %llu errors %x\n",
+               fprintf(stderr, "root %llu inode %llu errors %x",
                        (unsigned long long) root->root_key.objectid,
                        (unsigned long long) rec->ino, rec->errors);
+               print_inode_error(rec->errors);
                list_for_each_entry(backref, &rec->backrefs, list) {
                        if (!backref->found_dir_item)
                                backref->errors |= REF_ERR_NO_DIR_ITEM;
@@ -1408,11 +1604,12 @@ static int check_inode_recs(struct btrfs_root *root,
                        if (!backref->found_inode_ref)
                                backref->errors |= REF_ERR_NO_INODE_REF;
                        fprintf(stderr, "\tunresolved ref dir %llu index %llu"
-                               " namelen %u name %s filetype %d error %x\n",
+                               " namelen %u name %s filetype %d errors %x",
                                (unsigned long long)backref->dir,
                                (unsigned long long)backref->index,
                                backref->namelen, backref->name,
                                backref->filetype, backref->errors);
+                       print_ref_error(backref->errors);
                }
                free_inode_rec(rec);
        }
@@ -1683,12 +1880,13 @@ static int check_root_refs(struct btrfs_root *root,
                        if (!backref->errors && rec->found_root_item)
                                continue;
                        fprintf(stderr, "\tunresolved ref root %llu dir %llu"
-                               " index %llu namelen %u name %s error %x\n",
+                               " index %llu namelen %u name %s errors %x\n",
                                (unsigned long long)backref->ref_root,
                                (unsigned long long)backref->dir,
                                (unsigned long long)backref->index,
                                backref->namelen, backref->name,
                                backref->errors);
+                       print_ref_error(backref->errors);
                }
        }
        return errors > 0 ? 1 : 0;
@@ -1793,7 +1991,7 @@ static int check_fs_root(struct btrfs_root *root,
                if (wret != 0)
                        break;
        }
-       btrfs_release_path(root, &path);
+       btrfs_release_path(&path);
 
        merge_root_recs(root, &root_node.root_cache, root_cache);
 
@@ -1809,13 +2007,10 @@ static int check_fs_root(struct btrfs_root *root,
 
 static int fs_root_objectid(u64 objectid)
 {
-       if (objectid == BTRFS_FS_TREE_OBJECTID ||
-           objectid == BTRFS_TREE_RELOC_OBJECTID ||
-           objectid == BTRFS_DATA_RELOC_TREE_OBJECTID ||
-           (objectid >= BTRFS_FIRST_FREE_OBJECTID &&
-            objectid <= BTRFS_LAST_FREE_OBJECTID))
+       if (objectid == BTRFS_TREE_RELOC_OBJECTID ||
+           objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
                return 1;
-       return 0;
+       return is_fstree(objectid);
 }
 
 static int check_fs_roots(struct btrfs_root *root,
@@ -1830,6 +2025,12 @@ static int check_fs_roots(struct btrfs_root *root,
        int ret;
        int err = 0;
 
+       /*
+        * Just in case we made any changes to the extent tree that weren't
+        * reflected into the free space cache yet.
+        */
+       if (repair)
+               reset_cached_block_groups(root->fs_info);
        memset(&wc, 0, sizeof(wc));
        cache_tree_init(&wc.shared);
        btrfs_init_path(&path);
@@ -1850,8 +2051,8 @@ static int check_fs_roots(struct btrfs_root *root,
                btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
                if (key.type == BTRFS_ROOT_ITEM_KEY &&
                    fs_root_objectid(key.objectid)) {
-                       tmp_root = btrfs_read_fs_root_no_cache(root->fs_info,
-                                                              &key);
+                       key.offset = (u64)-1;
+                       tmp_root = btrfs_read_fs_root(root->fs_info, &key);
                        if (IS_ERR(tmp_root)) {
                                err = 1;
                                goto next;
@@ -1859,7 +2060,6 @@ static int check_fs_roots(struct btrfs_root *root,
                        ret = check_fs_root(tmp_root, root_cache, &wc);
                        if (ret)
                                err = 1;
-                       btrfs_free_fs_root(tmp_root);
                } else if (key.type == BTRFS_ROOT_REF_KEY ||
                           key.type == BTRFS_ROOT_BACKREF_KEY) {
                        process_root_ref(leaf, path.slots[0], &key,
@@ -1868,7 +2068,7 @@ static int check_fs_roots(struct btrfs_root *root,
 next:
                path.slots[0]++;
        }
-       btrfs_release_path(tree_root, &path);
+       btrfs_release_path(&path);
 
        if (!cache_tree_empty(&wc.shared))
                fprintf(stderr, "warning line %d\n", __LINE__);
@@ -2088,7 +2288,7 @@ static int check_owner_ref(struct btrfs_root *root,
                                                        path.slots[level + 1]))
                found = 1;
 
-       btrfs_release_path(ref_root, &path);
+       btrfs_release_path(&path);
        return found ? 0 : 1;
 }
 
@@ -2134,14 +2334,151 @@ static int record_bad_block_io(struct btrfs_fs_info *info,
        return btrfs_add_corrupt_extent_record(info, &key, start, len, 0);
 }
 
-static int check_block(struct btrfs_root *root,
+static int swap_values(struct btrfs_root *root, struct btrfs_path *path,
+                      struct extent_buffer *buf, int slot)
+{
+       if (btrfs_header_level(buf)) {
+               struct btrfs_key_ptr ptr1, ptr2;
+
+               read_extent_buffer(buf, &ptr1, btrfs_node_key_ptr_offset(slot),
+                                  sizeof(struct btrfs_key_ptr));
+               read_extent_buffer(buf, &ptr2,
+                                  btrfs_node_key_ptr_offset(slot + 1),
+                                  sizeof(struct btrfs_key_ptr));
+               write_extent_buffer(buf, &ptr1,
+                                   btrfs_node_key_ptr_offset(slot + 1),
+                                   sizeof(struct btrfs_key_ptr));
+               write_extent_buffer(buf, &ptr2,
+                                   btrfs_node_key_ptr_offset(slot),
+                                   sizeof(struct btrfs_key_ptr));
+               if (slot == 0) {
+                       struct btrfs_disk_key key;
+                       btrfs_node_key(buf, &key, 0);
+                       btrfs_fixup_low_keys(root, path, &key,
+                                            btrfs_header_level(buf) + 1);
+               }
+       } else {
+               struct btrfs_item *item1, *item2;
+               struct btrfs_key k1, k2;
+               char *item1_data, *item2_data;
+               u32 item1_offset, item2_offset, item1_size, item2_size;
+
+               item1 = btrfs_item_nr(slot);
+               item2 = btrfs_item_nr(slot + 1);
+               btrfs_item_key_to_cpu(buf, &k1, slot);
+               btrfs_item_key_to_cpu(buf, &k2, slot + 1);
+               item1_offset = btrfs_item_offset(buf, item1);
+               item2_offset = btrfs_item_offset(buf, item2);
+               item1_size = btrfs_item_size(buf, item1);
+               item2_size = btrfs_item_size(buf, item2);
+
+               item1_data = malloc(item1_size);
+               if (!item1_data)
+                       return -ENOMEM;
+               item2_data = malloc(item2_size);
+               if (!item2_data) {
+                       free(item1_data);
+                       return -ENOMEM;
+               }
+
+               read_extent_buffer(buf, item1_data, item1_offset, item1_size);
+               read_extent_buffer(buf, item2_data, item2_offset, item2_size);
+
+               write_extent_buffer(buf, item1_data, item2_offset, item2_size);
+               write_extent_buffer(buf, item2_data, item1_offset, item1_size);
+               free(item1_data);
+               free(item2_data);
+
+               btrfs_set_item_offset(buf, item1, item2_offset);
+               btrfs_set_item_offset(buf, item2, item1_offset);
+               btrfs_set_item_size(buf, item1, item2_size);
+               btrfs_set_item_size(buf, item2, item1_size);
+
+               path->slots[0] = slot;
+               btrfs_set_item_key_unsafe(root, path, &k2);
+               path->slots[0] = slot + 1;
+               btrfs_set_item_key_unsafe(root, path, &k1);
+       }
+       return 0;
+}
+
+/*
+ * Attempt to fix basic block failures.  Currently we only handle bad key
+ * orders, we will cycle through the keys and swap them if necessary.
+ */
+static int try_to_fix_bad_block(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct extent_buffer *buf,
+                               struct btrfs_disk_key *parent_key,
+                               enum btrfs_tree_block_status status)
+{
+       struct btrfs_path *path;
+       struct btrfs_key k1, k2;
+       int i;
+       int level;
+       int ret;
+
+       if (status != BTRFS_TREE_BLOCK_BAD_KEY_ORDER)
+               return -EIO;
+
+       k1.objectid = btrfs_header_owner(buf);
+       k1.type = BTRFS_ROOT_ITEM_KEY;
+       k1.offset = (u64)-1;
+
+       root = btrfs_read_fs_root(root->fs_info, &k1);
+       if (IS_ERR(root))
+               return -EIO;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -EIO;
+
+       level = btrfs_header_level(buf);
+       path->lowest_level = level;
+       path->skip_check_block = 1;
+       if (level)
+               btrfs_node_key_to_cpu(buf, &k1, 0);
+       else
+               btrfs_item_key_to_cpu(buf, &k1, 0);
+
+       ret = btrfs_search_slot(trans, root, &k1, path, 0, 1);
+       if (ret) {
+               btrfs_free_path(path);
+               return -EIO;
+       }
+
+       buf = path->nodes[level];
+       for (i = 0; i < btrfs_header_nritems(buf) - 1; i++) {
+               if (level) {
+                       btrfs_node_key_to_cpu(buf, &k1, i);
+                       btrfs_node_key_to_cpu(buf, &k2, i + 1);
+               } else {
+                       btrfs_item_key_to_cpu(buf, &k1, i);
+                       btrfs_item_key_to_cpu(buf, &k2, i + 1);
+               }
+               if (btrfs_comp_cpu_keys(&k1, &k2) < 0)
+                       continue;
+               ret = swap_values(root, path, buf, i);
+               if (ret)
+                       break;
+               btrfs_mark_buffer_dirty(buf);
+               i = 0;
+       }
+
+       btrfs_free_path(path);
+       return ret;
+}
+
+static int check_block(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root,
                       struct cache_tree *extent_cache,
                       struct extent_buffer *buf, u64 flags)
 {
        struct extent_record *rec;
        struct cache_extent *cache;
        struct btrfs_key key;
-       int ret = 1;
+       enum btrfs_tree_block_status status;
+       int ret = 0;
        int level;
 
        cache = lookup_cache_extent(extent_cache, buf->start, buf->len);
@@ -2163,13 +2500,26 @@ static int check_block(struct btrfs_root *root,
        rec->info_level = level;
 
        if (btrfs_is_leaf(buf))
-               ret = btrfs_check_leaf(root, &rec->parent_key, buf);
+               status = btrfs_check_leaf(root, &rec->parent_key, buf);
        else
-               ret = btrfs_check_node(root, &rec->parent_key, buf);
-
-       if (ret) {
-               fprintf(stderr, "bad block %llu\n",
-                       (unsigned long long)buf->start);
+               status = btrfs_check_node(root, &rec->parent_key, buf);
+
+       if (status != BTRFS_TREE_BLOCK_CLEAN) {
+               if (repair)
+                       status = try_to_fix_bad_block(trans, root, buf,
+                                                     &rec->parent_key,
+                                                     status);
+               if (status != BTRFS_TREE_BLOCK_CLEAN) {
+                       ret = -EIO;
+                       fprintf(stderr, "bad block %llu\n",
+                               (unsigned long long)buf->start);
+               } else {
+                       /*
+                        * Signal to callers we need to start the scan over
+                        * again since we'll have cow'ed blocks.
+                        */
+                       ret = -EAGAIN;
+               }
        } else {
                rec->content_checked = 1;
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
@@ -2297,7 +2647,7 @@ static struct data_backref *alloc_data_backref(struct extent_record *rec,
 }
 
 static int add_extent_rec(struct cache_tree *extent_cache,
-                         struct btrfs_key *parent_key,
+                         struct btrfs_key *parent_key, u64 parent_gen,
                          u64 start, u64 nr, u64 extent_item_refs,
                          int is_root, int inc_ref, int set_checked,
                          int metadata, int extent_rec, u64 max_size)
@@ -2373,6 +2723,8 @@ static int add_extent_rec(struct cache_tree *extent_cache,
 
                if (parent_key)
                        btrfs_cpu_key_to_disk(&rec->parent_key, parent_key);
+               if (parent_gen)
+                       rec->parent_generation = parent_gen;
 
                if (rec->max_size < max_size)
                        rec->max_size = max_size;
@@ -2384,7 +2736,7 @@ static int add_extent_rec(struct cache_tree *extent_cache,
        rec->start = start;
        rec->max_size = max_size;
        rec->nr = max(nr, max_size);
-       rec->found_rec = extent_rec;
+       rec->found_rec = !!extent_rec;
        rec->content_checked = 0;
        rec->owner_ref_checked = 0;
        rec->num_duplicates = 0;
@@ -2413,6 +2765,11 @@ static int add_extent_rec(struct cache_tree *extent_cache,
        else
                memset(&rec->parent_key, 0, sizeof(*parent_key));
 
+       if (parent_gen)
+               rec->parent_generation = parent_gen;
+       else
+               rec->parent_generation = 0;
+
        rec->cache.start = start;
        rec->cache.size = nr;
        ret = insert_cache_extent(extent_cache, &rec->cache);
@@ -2434,7 +2791,7 @@ static int add_tree_backref(struct cache_tree *extent_cache, u64 bytenr,
 
        cache = lookup_cache_extent(extent_cache, bytenr, 1);
        if (!cache) {
-               add_extent_rec(extent_cache, NULL, bytenr,
+               add_extent_rec(extent_cache, NULL, 0, bytenr,
                               1, 0, 0, 0, 0, 1, 0, 0);
                cache = lookup_cache_extent(extent_cache, bytenr, 1);
                if (!cache)
@@ -2469,6 +2826,7 @@ static int add_tree_backref(struct cache_tree *extent_cache, u64 bytenr,
                }
                back->node.found_extent_tree = 1;
        }
+       maybe_free_extent_rec(extent_cache, rec);
        return 0;
 }
 
@@ -2482,7 +2840,7 @@ static int add_data_backref(struct cache_tree *extent_cache, u64 bytenr,
 
        cache = lookup_cache_extent(extent_cache, bytenr, 1);
        if (!cache) {
-               add_extent_rec(extent_cache, NULL, bytenr, 1, 0, 0, 0, 0,
+               add_extent_rec(extent_cache, NULL, 0, bytenr, 1, 0, 0, 0, 0,
                               0, 0, max_size);
                cache = lookup_cache_extent(extent_cache, bytenr, 1);
                if (!cache)
@@ -2523,7 +2881,7 @@ static int add_data_backref(struct cache_tree *extent_cache, u64 bytenr,
        } else {
                if (back->node.found_extent_tree) {
                        fprintf(stderr, "Extent back ref already exists "
-                               "for %llu parent %llu root %llu"
+                               "for %llu parent %llu root %llu "
                                "owner %llu offset %llu num_refs %lu\n",
                                (unsigned long long)bytenr,
                                (unsigned long long)parent,
@@ -2535,6 +2893,7 @@ static int add_data_backref(struct cache_tree *extent_cache, u64 bytenr,
                back->num_refs = num_refs;
                back->node.found_extent_tree = 1;
        }
+       maybe_free_extent_rec(extent_cache, rec);
        return 0;
 }
 
@@ -2562,7 +2921,7 @@ static int pick_next_pending(struct cache_tree *pending,
        cache = search_cache_extent(reada, 0);
        if (cache) {
                bits[0].start = cache->start;
-               bits[1].size = cache->size;
+               bits[0].size = cache->size;
                *reada_bits = 1;
                return 1;
        }
@@ -2969,7 +3328,7 @@ static int process_extent_item(struct btrfs_root *root,
 #else
                BUG();
 #endif
-               return add_extent_rec(extent_cache, NULL, key.objectid,
+               return add_extent_rec(extent_cache, NULL, 0, key.objectid,
                                      num_bytes, refs, 0, 0, 0, metadata, 1,
                                      num_bytes);
        }
@@ -2977,7 +3336,7 @@ static int process_extent_item(struct btrfs_root *root,
        ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
        refs = btrfs_extent_refs(eb, ei);
 
-       add_extent_rec(extent_cache, NULL, key.objectid, num_bytes,
+       add_extent_rec(extent_cache, NULL, 0, key.objectid, num_bytes,
                       refs, 0, 0, 0, metadata, 1, num_bytes);
 
        ptr = (unsigned long)(ei + 1);
@@ -3145,13 +3504,13 @@ static int verify_space_cache(struct btrfs_root *root,
 
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
-               return ret;
+               goto out;
        ret = 0;
        while (1) {
                if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
-                               return ret;
+                               goto out;
                        if (ret > 0) {
                                ret = 0;
                                break;
@@ -3191,6 +3550,8 @@ static int verify_space_cache(struct btrfs_root *root,
                ret = check_cache_range(root, cache, last,
                                        cache->key.objectid +
                                        cache->key.offset - last);
+
+out:
        btrfs_free_path(path);
 
        if (!ret &&
@@ -3210,7 +3571,8 @@ static int check_space_cache(struct btrfs_root *root)
        int ret;
        int error = 0;
 
-       if (btrfs_super_generation(root->fs_info->super_copy) !=
+       if (btrfs_super_cache_generation(root->fs_info->super_copy) != -1ULL &&
+           btrfs_super_generation(root->fs_info->super_copy) !=
            btrfs_super_cache_generation(root->fs_info->super_copy)) {
                printf("cache and super generation don't match, space cache "
                       "will be invalidated\n");
@@ -3248,6 +3610,109 @@ static int check_space_cache(struct btrfs_root *root)
        return error ? -EINVAL : 0;
 }
 
+static int read_extent_data(struct btrfs_root *root, char *data,
+                       u64 logical, u64 *len, int mirror)
+{
+       u64 offset = 0;
+       struct btrfs_multi_bio *multi = NULL;
+       struct btrfs_fs_info *info = root->fs_info;
+       struct btrfs_device *device;
+       int ret = 0;
+       u64 max_len = *len;
+
+       ret = btrfs_map_block(&info->mapping_tree, READ, logical, len,
+                             &multi, mirror, NULL);
+       if (ret) {
+               fprintf(stderr, "Couldn't map the block %llu\n",
+                               logical + offset);
+               goto err;
+       }
+       device = multi->stripes[0].dev;
+
+       if (device->fd == 0)
+               goto err;
+       if (*len > max_len)
+               *len = max_len;
+
+       ret = pread64(device->fd, data, *len, multi->stripes[0].physical);
+       if (ret != *len)
+               ret = -EIO;
+       else
+               ret = 0;
+err:
+       kfree(multi);
+       return ret;
+}
+
+static int check_extent_csums(struct btrfs_root *root, u64 bytenr,
+                       u64 num_bytes, unsigned long leaf_offset,
+                       struct extent_buffer *eb) {
+
+       u64 offset = 0;
+       u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+       char *data;
+       unsigned long csum_offset;
+       u32 csum;
+       u32 csum_expected;
+       u64 read_len;
+       u64 data_checked = 0;
+       u64 tmp;
+       int ret = 0;
+       int mirror;
+       int num_copies;
+
+       if (num_bytes % root->sectorsize)
+               return -EINVAL;
+
+       data = malloc(num_bytes);
+       if (!data)
+               return -ENOMEM;
+
+       while (offset < num_bytes) {
+               mirror = 0;
+again:
+               read_len = num_bytes - offset;
+               /* read as much space once a time */
+               ret = read_extent_data(root, data + offset,
+                               bytenr + offset, &read_len, mirror);
+               if (ret)
+                       goto out;
+               data_checked = 0;
+               /* verify every 4k data's checksum */
+               while (data_checked < read_len) {
+                       csum = ~(u32)0;
+                       tmp = offset + data_checked;
+
+                       csum = btrfs_csum_data(NULL, (char *)data + tmp,
+                                              csum, root->sectorsize);
+                       btrfs_csum_final(csum, (char *)&csum);
+
+                       csum_offset = leaf_offset +
+                                tmp / root->sectorsize * csum_size;
+                       read_extent_buffer(eb, (char *)&csum_expected,
+                                          csum_offset, csum_size);
+                       /* try another mirror */
+                       if (csum != csum_expected) {
+                               fprintf(stderr, "mirror %d bytenr %llu csum %u expected csum %u\n",
+                                               mirror, bytenr + tmp,
+                                               csum, csum_expected);
+                               num_copies = btrfs_num_copies(
+                                               &root->fs_info->mapping_tree,
+                                               bytenr, num_bytes);
+                               if (mirror < num_copies - 1) {
+                                       mirror += 1;
+                                       goto again;
+                               }
+                       }
+                       data_checked += root->sectorsize;
+               }
+               offset += read_len;
+       }
+out:
+       free(data);
+       return ret;
+}
+
 static int check_extent_exists(struct btrfs_root *root, u64 bytenr,
                               u64 num_bytes)
 {
@@ -3349,7 +3814,7 @@ again:
                                 * in real life, but no harm in coding it up
                                 * anyway just in case.
                                 */
-                               btrfs_release_path(root, path);
+                               btrfs_release_path(path);
                                ret = check_extent_exists(root, new_start,
                                                          new_bytes);
                                if (ret) {
@@ -3385,6 +3850,8 @@ static int check_csums(struct btrfs_root *root)
        u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
        int errors = 0;
        int ret;
+       u64 data_len;
+       unsigned long leaf_offset;
 
        root = root->fs_info->csum_root;
 
@@ -3426,6 +3893,16 @@ static int check_csums(struct btrfs_root *root)
                        continue;
                }
 
+               data_len = (btrfs_item_size_nr(leaf, path->slots[0]) /
+                             csum_size) * root->sectorsize;
+               if (!check_data_csum)
+                       goto skip_csum_check;
+               leaf_offset = btrfs_item_ptr_offset(leaf, path->slots[0]);
+               ret = check_extent_csums(root, key.offset, data_len,
+                                        leaf_offset, leaf);
+               if (ret)
+                       break;
+skip_csum_check:
                if (!num_bytes) {
                        offset = key.offset;
                } else if (key.offset != offset + num_bytes) {
@@ -3439,9 +3916,7 @@ static int check_csums(struct btrfs_root *root)
                        offset = key.offset;
                        num_bytes = 0;
                }
-
-               num_bytes += (btrfs_item_size_nr(leaf, path->slots[0]) /
-                             csum_size) * root->sectorsize;
+               num_bytes += data_len;
                path->slots[0]++;
        }
 
@@ -3449,7 +3924,23 @@ static int check_csums(struct btrfs_root *root)
        return errors;
 }
 
-static int run_next_block(struct btrfs_root *root,
+static int is_dropped_key(struct btrfs_key *key,
+                         struct btrfs_key *drop_key) {
+       if (key->objectid < drop_key->objectid)
+               return 1;
+       else if (key->objectid == drop_key->objectid) {
+               if (key->type < drop_key->type)
+                       return 1;
+               else if (key->type == drop_key->type) {
+                       if (key->offset < drop_key->offset)
+                               return 1;
+               }
+       }
+       return 0;
+}
+
+static int run_next_block(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
                          struct block_info *bits,
                          int bits_nr,
                          u64 *last,
@@ -3461,7 +3952,8 @@ static int run_next_block(struct btrfs_root *root,
                          struct cache_tree *chunk_cache,
                          struct rb_root *dev_cache,
                          struct block_group_tree *block_group_cache,
-                         struct device_extent_tree *dev_extent_cache)
+                         struct device_extent_tree *dev_extent_cache,
+                         struct btrfs_root_item *ri)
 {
        struct extent_buffer *buf;
        u64 bytenr;
@@ -3469,7 +3961,9 @@ static int run_next_block(struct btrfs_root *root,
        u64 parent;
        u64 owner;
        u64 flags;
-       int ret;
+       u64 ptr;
+       u64 gen = 0;
+       int ret = 0;
        int i;
        int nritems;
        struct btrfs_key key;
@@ -3512,9 +4006,16 @@ static int run_next_block(struct btrfs_root *root,
                remove_cache_extent(nodes, cache);
                free(cache);
        }
+       cache = lookup_cache_extent(extent_cache, bytenr, size);
+       if (cache) {
+               struct extent_record *rec;
+
+               rec = container_of(cache, struct extent_record, cache);
+               gen = rec->parent_generation;
+       }
 
        /* fixme, get the real parent transid */
-       buf = read_tree_block(root, bytenr, size, 0);
+       buf = read_tree_block(root, bytenr, size, gen);
        if (!extent_buffer_uptodate(buf)) {
                record_bad_block_io(root->fs_info,
                                    extent_cache, bytenr, size);
@@ -3523,11 +4024,19 @@ static int run_next_block(struct btrfs_root *root,
 
        nritems = btrfs_header_nritems(buf);
 
-       ret = btrfs_lookup_extent_info(NULL, root, bytenr,
+       /*
+        * FIXME, this only works only if we don't have any full
+        * backref mode.
+        */
+       if (!init_extent_tree) {
+               ret = btrfs_lookup_extent_info(NULL, root, bytenr,
                                       btrfs_header_level(buf), 1, NULL,
                                       &flags);
-       if (ret < 0)
-               flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+               if (ret < 0)
+                       flags = 0;
+       } else {
+               flags = 0;
+       }
 
        if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                parent = bytenr;
@@ -3537,7 +4046,7 @@ static int run_next_block(struct btrfs_root *root,
                owner = btrfs_header_owner(buf);
        }
 
-       ret = check_block(root, extent_cache, buf, flags);
+       ret = check_block(trans, root, extent_cache, buf, flags);
        if (ret)
                goto out;
 
@@ -3623,6 +4132,23 @@ static int run_next_block(struct btrfs_root *root,
                                        0, root->sectorsize);
                                continue;
                        }
+                       if (key.type == BTRFS_ORPHAN_ITEM_KEY) {
+                               struct bad_item *bad;
+
+                               if (key.objectid == BTRFS_ORPHAN_OBJECTID)
+                                       continue;
+                               if (!owner)
+                                       continue;
+                               bad = malloc(sizeof(struct bad_item));
+                               if (!bad)
+                                       continue;
+                               INIT_LIST_HEAD(&bad->list);
+                               memcpy(&bad->key, &key,
+                                      sizeof(struct btrfs_key));
+                               bad->root_id = owner;
+                               list_add_tail(&bad->list, &delete_items);
+                               continue;
+                       }
                        if (key.type != BTRFS_EXTENT_DATA_KEY)
                                continue;
                        fi = btrfs_item_ptr(buf, i,
@@ -3645,7 +4171,6 @@ static int run_next_block(struct btrfs_root *root,
                                parent, owner, key.objectid, key.offset -
                                btrfs_file_extent_offset(buf, fi), 1, 1,
                                btrfs_file_extent_disk_num_bytes(buf, fi));
-                       BUG_ON(ret);
                }
        } else {
                int level;
@@ -3657,10 +4182,20 @@ static int run_next_block(struct btrfs_root *root,
                        btrfs_item_key_to_cpu(buf, &first_key, 0);
                level = btrfs_header_level(buf);
                for (i = 0; i < nritems; i++) {
-                       u64 ptr = btrfs_node_blockptr(buf, i);
-                       u32 size = btrfs_level_size(root, level - 1);
+                       ptr = btrfs_node_blockptr(buf, i);
+                       size = btrfs_level_size(root, level - 1);
                        btrfs_node_key_to_cpu(buf, &key, i);
+                       if (ri != NULL) {
+                               struct btrfs_key drop_key;
+                               btrfs_disk_key_to_cpu(&drop_key,
+                                                     &ri->drop_progress);
+                               if ((level == ri->drop_level)
+                                   && is_dropped_key(&key, &drop_key)) {
+                                       continue;
+                               }
+                       }
                        ret = add_extent_rec(extent_cache, &key,
+                                            btrfs_node_ptr_generation(buf, i),
                                             ptr, size, 0, 0, 1, 0, 1, 0,
                                             size);
                        BUG_ON(ret);
@@ -3688,7 +4223,7 @@ static int run_next_block(struct btrfs_root *root,
                found_old_backref = 1;
 out:
        free_extent_buffer(buf);
-       return 0;
+       return ret;
 }
 
 static int add_root_to_pending(struct extent_buffer *buf,
@@ -3702,7 +4237,7 @@ static int add_root_to_pending(struct extent_buffer *buf,
                add_pending(nodes, seen, buf->start, buf->len);
        else
                add_pending(pending, seen, buf->start, buf->len);
-       add_extent_rec(extent_cache, NULL, buf->start, buf->len,
+       add_extent_rec(extent_cache, NULL, 0, buf->start, buf->len,
                       0, 1, 1, 0, 1, 0, buf->len);
 
        if (root_key->objectid == BTRFS_TREE_RELOC_OBJECTID ||
@@ -3830,7 +4365,7 @@ static int delete_extent_records(struct btrfs_trans_handle *trans,
                    found_key.type != BTRFS_EXTENT_REF_V0_KEY &&
                    found_key.type != BTRFS_SHARED_BLOCK_REF_KEY &&
                    found_key.type != BTRFS_SHARED_DATA_REF_KEY) {
-                       btrfs_release_path(NULL, path);
+                       btrfs_release_path(path);
                        if (found_key.type == 0) {
                                if (found_key.offset == 0)
                                        break;
@@ -3848,7 +4383,7 @@ static int delete_extent_records(struct btrfs_trans_handle *trans,
                ret = btrfs_del_item(trans, root->fs_info->extent_root, path);
                if (ret)
                        break;
-               btrfs_release_path(NULL, path);
+               btrfs_release_path(path);
 
                if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
                    found_key.type == BTRFS_METADATA_ITEM_KEY) {
@@ -3862,7 +4397,7 @@ static int delete_extent_records(struct btrfs_trans_handle *trans,
                }
        }
 
-       btrfs_release_path(NULL, path);
+       btrfs_release_path(path);
        return ret;
 }
 
@@ -3922,9 +4457,12 @@ static int record_extent(struct btrfs_trans_handle *trans,
                        bi = (struct btrfs_tree_block_info *)(ei + 1);
                        memset_extent_buffer(leaf, 0, (unsigned long)bi,
                                             sizeof(*bi));
-                       memset(&copy_key, 0, sizeof(copy_key));
 
-                       copy_key.objectid = le64_to_cpu(rec->info_objectid);
+                       btrfs_set_disk_key_objectid(&copy_key,
+                                                   rec->info_objectid);
+                       btrfs_set_disk_key_type(&copy_key, 0);
+                       btrfs_set_disk_key_offset(&copy_key, 0);
+
                        btrfs_set_tree_block_level(leaf, bi, rec->info_level);
                        btrfs_set_tree_block_key(leaf, bi, &copy_key);
 
@@ -3937,7 +4475,7 @@ static int record_extent(struct btrfs_trans_handle *trans,
                                               rec->max_size, 1, 0);
                if (ret)
                        goto fail;
-               btrfs_release_path(NULL, path);
+               btrfs_release_path(path);
        }
 
        if (back->is_data) {
@@ -3998,7 +4536,7 @@ static int record_extent(struct btrfs_trans_handle *trans,
        if (ret)
                goto fail;
 fail:
-       btrfs_release_path(NULL, path);
+       btrfs_release_path(path);
        return ret;
 }
 
@@ -4006,6 +4544,7 @@ struct extent_entry {
        u64 bytenr;
        u64 bytes;
        int count;
+       int broken;
        struct list_head list;
 };
 
@@ -4033,6 +4572,13 @@ static struct extent_entry *find_most_right_entry(struct list_head *entries)
                }
 
                /*
+                * If there are as many broken entries as entries then we know
+                * not to trust this particular entry.
+                */
+               if (entry->broken == entry->count)
+                       continue;
+
+               /*
                 * If our current entry == best then we can't be sure our best
                 * is really the best, so we need to keep searching.
                 */
@@ -4043,7 +4589,7 @@ static struct extent_entry *find_most_right_entry(struct list_head *entries)
                }
 
                /* Prev == entry, not good enough, have to keep searching */
-               if (prev->count == entry->count)
+               if (!prev->broken && prev->count == entry->count)
                        continue;
 
                if (!best)
@@ -4115,7 +4661,7 @@ static int repair_ref(struct btrfs_trans_handle *trans,
                path->slots[0]++;
        }
 
-       btrfs_release_path(root, path);
+       btrfs_release_path(path);
 
        /*
         * Have to make sure that this root gets updated when we commit the
@@ -4158,7 +4704,9 @@ static int repair_ref(struct btrfs_trans_handle *trans,
                return -EINVAL;
        }
 
-       if (dback->disk_bytenr > entry->bytenr) {
+       if (dback->node.broken && dback->disk_bytenr != entry->bytenr) {
+               btrfs_set_file_extent_disk_bytenr(leaf, fi, entry->bytenr);
+       } else if (dback->disk_bytenr > entry->bytenr) {
                u64 off_diff, offset;
 
                off_diff = dback->disk_bytenr - entry->bytenr;
@@ -4205,7 +4753,7 @@ static int repair_ref(struct btrfs_trans_handle *trans,
        else
                printf("ram bytes may be wrong?\n");
        btrfs_mark_buffer_dirty(leaf);
-       btrfs_release_path(root, path);
+       btrfs_release_path(path);
        return 0;
 }
 
@@ -4218,7 +4766,9 @@ static int verify_backrefs(struct btrfs_trans_handle *trans,
        struct extent_entry *entry, *best = NULL;
        LIST_HEAD(entries);
        int nr_entries = 0;
+       int broken_entries = 0;
        int ret = 0;
+       short mismatch = 0;
 
        /*
         * Metadata is easy and the backrefs should always agree on bytenr and
@@ -4260,11 +4810,25 @@ static int verify_backrefs(struct btrfs_trans_handle *trans,
                        list_add_tail(&entry->list, &entries);
                        nr_entries++;
                }
+
+               /*
+                * If we only have on entry we may think the entries agree when
+                * in reality they don't so we have to do some extra checking.
+                */
+               if (dback->disk_bytenr != rec->start ||
+                   dback->bytes != rec->nr || back->broken)
+                       mismatch = 1;
+
+               if (back->broken) {
+                       entry->broken++;
+                       broken_entries++;
+               }
+
                entry->count++;
        }
 
        /* Yay all the backrefs agree, carry on good sir */
-       if (nr_entries <= 1)
+       if (nr_entries <= 1 && !mismatch)
                goto out;
 
        fprintf(stderr, "attempting to repair backref discrepency for bytenr "
@@ -4283,21 +4847,36 @@ static int verify_backrefs(struct btrfs_trans_handle *trans,
         */
        if (!best) {
                entry = find_entry(&entries, rec->start, rec->nr);
-               if (!entry) {
-                       fprintf(stderr, "Backrefs don't agree with eachother "
+               if (!entry && (!broken_entries || !rec->found_rec)) {
+                       fprintf(stderr, "Backrefs don't agree with each other "
                                "and extent record doesn't agree with anybody,"
                                " so we can't fix bytenr %Lu bytes %Lu\n",
                                rec->start, rec->nr);
                        ret = -EINVAL;
                        goto out;
-               }
-               entry->count++;
-               best = find_most_right_entry(&entries);
-               if (!best) {
-                       fprintf(stderr, "Backrefs and extent record evenly "
-                               "split on who is right, this is going to "
-                               "require user input to fix bytenr %Lu bytes "
-                               "%Lu\n", rec->start, rec->nr);
+               } else if (!entry) {
+                       /*
+                        * Ok our backrefs were broken, we'll assume this is the
+                        * correct value and add an entry for this range.
+                        */
+                       entry = malloc(sizeof(struct extent_entry));
+                       if (!entry) {
+                               ret = -ENOMEM;
+                               goto out;
+                       }
+                       memset(entry, 0, sizeof(*entry));
+                       entry->bytenr = rec->start;
+                       entry->bytes = rec->nr;
+                       list_add_tail(&entry->list, &entries);
+                       nr_entries++;
+               }
+               entry->count++;
+               best = find_most_right_entry(&entries);
+               if (!best) {
+                       fprintf(stderr, "Backrefs and extent record evenly "
+                               "split on who is right, this is going to "
+                               "require user input to fix bytenr %Lu bytes "
+                               "%Lu\n", rec->start, rec->nr);
                        ret = -EINVAL;
                        goto out;
                }
@@ -4509,7 +5088,7 @@ static int delete_duplicate_records(struct btrfs_trans_handle *trans,
                ret = btrfs_del_item(trans, root, path);
                if (ret)
                        goto out;
-               btrfs_release_path(root, path);
+               btrfs_release_path(path);
                nr_del++;
        }
 
@@ -4536,6 +5115,92 @@ out:
        return ret ? ret : nr_del;
 }
 
+static int find_possible_backrefs(struct btrfs_trans_handle *trans,
+                                 struct btrfs_fs_info *info,
+                                 struct btrfs_path *path,
+                                 struct cache_tree *extent_cache,
+                                 struct extent_record *rec)
+{
+       struct btrfs_root *root;
+       struct extent_backref *back;
+       struct data_backref *dback;
+       struct cache_extent *cache;
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
+       u64 bytenr, bytes;
+       int ret;
+
+       list_for_each_entry(back, &rec->backrefs, list) {
+               dback = (struct data_backref *)back;
+
+               /* We found this one, we don't need to do a lookup */
+               if (dback->found_ref)
+                       continue;
+               /* Don't care about full backrefs (poor unloved backrefs) */
+               if (back->full_backref)
+                       continue;
+               key.objectid = dback->root;
+               key.type = BTRFS_ROOT_ITEM_KEY;
+               key.offset = (u64)-1;
+
+               root = btrfs_read_fs_root(info, &key);
+
+               /* No root, definitely a bad ref, skip */
+               if (IS_ERR(root) && PTR_ERR(root) == -ENOENT)
+                       continue;
+               /* Other err, exit */
+               if (IS_ERR(root))
+                       return PTR_ERR(root);
+
+               key.objectid = dback->owner;
+               key.type = BTRFS_EXTENT_DATA_KEY;
+               key.offset = dback->offset;
+               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+               if (ret) {
+                       btrfs_release_path(path);
+                       if (ret < 0)
+                               return ret;
+                       /* Didn't find it, we can carry on */
+                       ret = 0;
+                       continue;
+               }
+
+               fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                   struct btrfs_file_extent_item);
+               bytenr = btrfs_file_extent_disk_bytenr(path->nodes[0], fi);
+               bytes = btrfs_file_extent_disk_num_bytes(path->nodes[0], fi);
+               btrfs_release_path(path);
+               cache = lookup_cache_extent(extent_cache, bytenr, 1);
+               if (cache) {
+                       struct extent_record *tmp;
+                       tmp = container_of(cache, struct extent_record, cache);
+
+                       /*
+                        * If we found an extent record for the bytenr for this
+                        * particular backref then we can't add it to our
+                        * current extent record.  We only want to add backrefs
+                        * that don't have a corresponding extent item in the
+                        * extent tree since they likely belong to this record
+                        * and we need to fix it if it doesn't match bytenrs.
+                        */
+                       if  (tmp->found_rec)
+                               continue;
+               }
+
+               dback->found_ref += 1;
+               dback->disk_bytenr = bytenr;
+               dback->bytes = bytes;
+
+               /*
+                * Set this so the verify backref code knows not to trust the
+                * values in this backref.
+                */
+               back->broken = 1;
+       }
+
+       return 0;
+}
+
 /*
  * when an incorrect extent item is found, this will delete
  * all of the existing entries for it and recreate them
@@ -4543,6 +5208,7 @@ out:
  */
 static int fixup_extent_refs(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *info,
+                            struct cache_tree *extent_cache,
                             struct extent_record *rec)
 {
        int ret;
@@ -4553,14 +5219,38 @@ static int fixup_extent_refs(struct btrfs_trans_handle *trans,
        int allocated = 0;
        u64 flags = 0;
 
-       /* remember our flags for recreating the extent */
-       ret = btrfs_lookup_extent_info(NULL, info->extent_root, rec->start,
-                                      rec->max_size, rec->metadata, NULL,
-                                      &flags);
-       if (ret < 0)
-               flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+       /*
+        * remember our flags for recreating the extent.
+        * FIXME, if we have cleared extent tree, we can not
+        * lookup extent info in extent tree.
+        */
+       if (!init_extent_tree) {
+               ret = btrfs_lookup_extent_info(NULL, info->extent_root,
+                                       rec->start, rec->max_size,
+                                       rec->metadata, NULL, &flags);
+               if (ret < 0)
+                       flags = 0;
+       } else {
+               flags = 0;
+       }
 
        path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       if (rec->refs != rec->extent_item_refs && !rec->metadata) {
+               /*
+                * Sometimes the backrefs themselves are so broken they don't
+                * get attached to any meaningful rec, so first go back and
+                * check any of our backrefs that we couldn't find and throw
+                * them into the list if we find the backref so that
+                * verify_backrefs can figure out what to do.
+                */
+               ret = find_possible_backrefs(trans, info, path, extent_cache,
+                                            rec);
+               if (ret < 0)
+                       goto out;
+       }
 
        /* step one, make sure all of the backrefs agree */
        ret = verify_backrefs(trans, info, path, rec);
@@ -4661,7 +5351,7 @@ again:
                goto out;
        } else {
                level++;
-               btrfs_release_path(NULL, &path);
+               btrfs_release_path(&path);
                goto again;
        }
 
@@ -4670,7 +5360,7 @@ del_ptr:
        ret = btrfs_del_ptr(trans, info->extent_root, &path, level, slot);
 
 out:
-       btrfs_release_path(NULL, &path);
+       btrfs_release_path(&path);
        return ret;
 }
 
@@ -4701,55 +5391,6 @@ static void free_corrupt_block(struct cache_extent *cache)
 
 FREE_EXTENT_CACHE_BASED_TREE(corrupt_blocks, free_corrupt_block);
 
-static int check_block_group(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *info,
-                             struct map_lookup *map,
-                             int *reinit)
-{
-       struct btrfs_key key;
-       struct btrfs_path path;
-       int ret;
-
-       key.objectid = map->ce.start;
-       key.offset = map->ce.size;
-       key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
-
-       btrfs_init_path(&path);
-       ret = btrfs_search_slot(NULL, info->extent_root,
-                               &key, &path, 0, 0);
-       btrfs_release_path(NULL, &path);
-       if (ret <= 0)
-               goto out;
-
-       ret = btrfs_make_block_group(trans, info->extent_root, 0, map->type,
-                              BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-                              key.objectid, key.offset);
-       *reinit = 1;
-out:
-       return ret;
-}
-
-static int check_block_groups(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *info, int *reinit)
-{
-       struct cache_extent *ce;
-       struct map_lookup *map;
-       struct btrfs_mapping_tree *map_tree = &info->mapping_tree;
-
-       /* this isn't quite working */
-       return 0;
-
-       ce = search_cache_extent(&map_tree->cache_tree, 0);
-       while (1) {
-               if (!ce)
-                       break;
-               map = container_of(ce, struct map_lookup, ce);
-               check_block_group(trans, info, map, reinit);
-               ce = next_cache_extent(ce);
-       }
-       return 0;
-}
-
 static void reset_cached_block_groups(struct btrfs_fs_info *fs_info)
 {
        struct btrfs_block_group_cache *cache;
@@ -4778,14 +5419,13 @@ static void reset_cached_block_groups(struct btrfs_fs_info *fs_info)
 
 static int check_extent_refs(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
-                            struct cache_tree *extent_cache, int repair)
+                            struct cache_tree *extent_cache)
 {
        struct extent_record *rec;
        struct cache_extent *cache;
        int err = 0;
        int ret = 0;
        int fixed = 0;
-       int reinit = 0;
        int had_dups = 0;
 
        if (repair) {
@@ -4811,9 +5451,6 @@ static int check_extent_refs(struct btrfs_trans_handle *trans,
                        cache = next_cache_extent(cache);
                }
                prune_corrupt_blocks(trans, root->fs_info);
-               check_block_groups(trans, root->fs_info, &reinit);
-               if (reinit)
-                       btrfs_read_block_groups(root->fs_info->extent_root);
                reset_cached_block_groups(root->fs_info);
        }
 
@@ -4871,7 +5508,8 @@ static int check_extent_refs(struct btrfs_trans_handle *trans,
                                (unsigned long long)rec->extent_item_refs,
                                (unsigned long long)rec->refs);
                        if (!fixed && repair) {
-                               ret = fixup_extent_refs(trans, root->fs_info, rec);
+                               ret = fixup_extent_refs(trans, root->fs_info,
+                                                       extent_cache, rec);
                                if (ret)
                                        goto repair_abort;
                                fixed = 1;
@@ -4885,7 +5523,8 @@ static int check_extent_refs(struct btrfs_trans_handle *trans,
                                (unsigned long long)rec->nr);
 
                        if (!fixed && repair) {
-                               ret = fixup_extent_refs(trans, root->fs_info, rec);
+                               ret = fixup_extent_refs(trans, root->fs_info,
+                                                       extent_cache, rec);
                                if (ret)
                                        goto repair_abort;
                                fixed = 1;
@@ -4898,7 +5537,8 @@ static int check_extent_refs(struct btrfs_trans_handle *trans,
                                (unsigned long long)rec->start,
                                (unsigned long long)rec->nr);
                        if (!fixed && repair) {
-                               ret = fixup_extent_refs(trans, root->fs_info, rec);
+                               ret = fixup_extent_refs(trans, root->fs_info,
+                                                       extent_cache, rec);
                                if (ret)
                                        goto repair_abort;
                                fixed = 1;
@@ -5169,7 +5809,7 @@ static int check_devices(struct rb_root *dev_cache,
        return ret;
 }
 
-static int check_chunks_and_extents(struct btrfs_root *root, int repair)
+static int check_chunks_and_extents(struct btrfs_root *root)
 {
        struct rb_root dev_cache;
        struct cache_tree chunk_cache;
@@ -5192,6 +5832,7 @@ static int check_chunks_and_extents(struct btrfs_root *root, int repair)
        struct btrfs_trans_handle *trans = NULL;
        int slot;
        struct btrfs_root_item ri;
+       struct list_head dropping_trees;
 
        dev_cache = RB_ROOT;
        cache_tree_init(&chunk_cache);
@@ -5204,6 +5845,7 @@ static int check_chunks_and_extents(struct btrfs_root *root, int repair)
        cache_tree_init(&nodes);
        cache_tree_init(&reada);
        cache_tree_init(&corrupt_blocks);
+       INIT_LIST_HEAD(&dropping_trees);
 
        if (repair) {
                trans = btrfs_start_transaction(root, 1);
@@ -5256,27 +5898,84 @@ again:
 
                        offset = btrfs_item_ptr_offset(leaf, path.slots[0]);
                        read_extent_buffer(leaf, &ri, offset, sizeof(ri));
-                       buf = read_tree_block(root->fs_info->tree_root,
-                                             btrfs_root_bytenr(&ri),
-                                             btrfs_level_size(root,
-                                              btrfs_root_level(&ri)), 0);
-                       add_root_to_pending(buf, &extent_cache, &pending,
-                                           &seen, &nodes, &found_key);
-                       free_extent_buffer(buf);
+                       if (btrfs_disk_key_objectid(&ri.drop_progress) == 0) {
+                               buf = read_tree_block(root->fs_info->tree_root,
+                                                     btrfs_root_bytenr(&ri),
+                                                     btrfs_level_size(root,
+                                                     btrfs_root_level(&ri)),
+                                                     0);
+                               if (!buf) {
+                                       ret = -EIO;
+                                       goto out;
+                               }
+                               add_root_to_pending(buf, &extent_cache,
+                                                   &pending, &seen, &nodes,
+                                                   &found_key);
+                               free_extent_buffer(buf);
+                       } else {
+                               struct dropping_root_item_record *dri_rec;
+                               dri_rec = malloc(sizeof(*dri_rec));
+                               if (!dri_rec) {
+                                       perror("malloc");
+                                       exit(1);
+                               }
+                               memcpy(&dri_rec->ri, &ri, sizeof(ri));
+                               memcpy(&dri_rec->found_key, &found_key,
+                                      sizeof(found_key));
+                               list_add_tail(&dri_rec->list, &dropping_trees);
+                       }
                }
                path.slots[0]++;
        }
-       btrfs_release_path(root, &path);
-       while(1) {
-               ret = run_next_block(root, bits, bits_nr, &last, &pending,
-                                    &seen, &reada, &nodes, &extent_cache,
-                                    &chunk_cache, &dev_cache,
-                                    &block_group_cache, &dev_extent_cache);
+       btrfs_release_path(&path);
+       while (1) {
+               ret = run_next_block(trans, root, bits, bits_nr, &last,
+                                    &pending, &seen, &reada, &nodes,
+                                    &extent_cache, &chunk_cache, &dev_cache,
+                                    &block_group_cache, &dev_extent_cache,
+                                    NULL);
                if (ret != 0)
                        break;
        }
 
-       ret = check_extent_refs(trans, root, &extent_cache, repair);
+       while (!list_empty(&dropping_trees)) {
+               struct dropping_root_item_record *rec;
+               struct extent_buffer *buf;
+               rec = list_entry(dropping_trees.next,
+                                struct dropping_root_item_record, list);
+               last = 0;
+               if (!bits) {
+                       perror("realloc");
+                       exit(1);
+               }
+               buf = read_tree_block(root->fs_info->tree_root,
+                                     btrfs_root_bytenr(&rec->ri),
+                                     btrfs_level_size(root,
+                                     btrfs_root_level(&rec->ri)), 0);
+               if (!buf) {
+                       ret = -EIO;
+                       goto out;
+               }
+               add_root_to_pending(buf, &extent_cache, &pending,
+                                   &seen, &nodes, &rec->found_key);
+               while (1) {
+                       ret = run_next_block(trans, root, bits, bits_nr, &last,
+                                            &pending, &seen, &reada,
+                                            &nodes, &extent_cache,
+                                            &chunk_cache, &dev_cache,
+                                            &block_group_cache,
+                                            &dev_extent_cache,
+                                            &rec->ri);
+                       if (ret != 0)
+                               break;
+               }
+               free_extent_buffer(buf);
+               list_del(&rec->list);
+               free(rec);
+       }
+
+       if (ret >= 0)
+               ret = check_extent_refs(trans, root, &extent_cache);
        if (ret == -EAGAIN) {
                ret = btrfs_commit_transaction(trans, root);
                if (ret)
@@ -5307,8 +6006,6 @@ again:
                ret = err;
 
        if (trans) {
-               int err;
-
                err = btrfs_commit_transaction(trans, root);
                if (!ret)
                        ret = err;
@@ -5325,9 +6022,80 @@ out:
        free_device_cache_tree(&dev_cache);
        free_block_group_tree(&block_group_cache);
        free_device_extent_tree(&dev_extent_cache);
+       free_extent_cache_tree(&seen);
+       free_extent_cache_tree(&pending);
+       free_extent_cache_tree(&reada);
+       free_extent_cache_tree(&nodes);
        return ret;
 }
 
+static int btrfs_fsck_reinit_root(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, int overwrite)
+{
+       struct extent_buffer *c;
+       struct extent_buffer *old = root->node;
+       int level;
+       int ret;
+       struct btrfs_disk_key disk_key = {0,0,0};
+
+       level = 0;
+
+       if (overwrite) {
+               c = old;
+               extent_buffer_get(c);
+               goto init;
+       }
+       c = btrfs_alloc_free_block(trans, root,
+                                  btrfs_level_size(root, 0),
+                                  root->root_key.objectid,
+                                  &disk_key, level, 0, 0);
+       if (IS_ERR(c)) {
+               c = old;
+               extent_buffer_get(c);
+               overwrite = 1;
+       }
+init:
+       memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
+       btrfs_set_header_level(c, level);
+       btrfs_set_header_bytenr(c, c->start);
+       btrfs_set_header_generation(c, trans->transid);
+       btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
+       btrfs_set_header_owner(c, root->root_key.objectid);
+
+       write_extent_buffer(c, root->fs_info->fsid,
+                           btrfs_header_fsid(), BTRFS_FSID_SIZE);
+
+       write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
+                           btrfs_header_chunk_tree_uuid(c),
+                           BTRFS_UUID_SIZE);
+
+       btrfs_mark_buffer_dirty(c);
+       /*
+        * this case can happen in the following case:
+        *
+        * 1.overwrite previous root.
+        *
+        * 2.reinit reloc data root, this is because we skip pin
+        * down reloc data tree before which means we can allocate
+        * same block bytenr here.
+        */
+       if (old->start == c->start) {
+               btrfs_set_root_generation(&root->root_item,
+                                         trans->transid);
+               root->root_item.level = btrfs_header_level(root->node);
+               ret = btrfs_update_root(trans, root->fs_info->tree_root,
+                                       &root->root_key, &root->root_item);
+               if (ret) {
+                       free_extent_buffer(c);
+                       return ret;
+               }
+       }
+       free_extent_buffer(old);
+       root->node = c;
+       add_root_to_dirty_list(root);
+       return 0;
+}
+
 static int pin_down_tree_blocks(struct btrfs_fs_info *fs_info,
                                struct extent_buffer *eb, int tree_root)
 {
@@ -5412,11 +6180,13 @@ static int pin_metadata_blocks(struct btrfs_fs_info *fs_info)
 
 static int reset_block_groups(struct btrfs_fs_info *fs_info)
 {
+       struct btrfs_block_group_cache *cache;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_chunk *chunk;
        struct btrfs_key key;
        int ret;
+       u64 start;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -5467,8 +6237,19 @@ static int reset_block_groups(struct btrfs_fs_info *fs_info)
                                      btrfs_chunk_type(leaf, chunk),
                                      key.objectid, key.offset,
                                      btrfs_chunk_length(leaf, chunk));
+               set_extent_dirty(&fs_info->free_space_cache, key.offset,
+                                key.offset + btrfs_chunk_length(leaf, chunk),
+                                GFP_NOFS);
                path->slots[0]++;
        }
+       start = 0;
+       while (1) {
+               cache = btrfs_lookup_first_block_group(fs_info, start);
+               if (!cache)
+                       break;
+               cache->cached = 1;
+               start = cache->key.objectid + cache->key.offset;
+       }
 
        btrfs_free_path(path);
        return 0;
@@ -5497,13 +6278,16 @@ static int reset_balance(struct btrfs_trans_handle *trans,
        if (ret) {
                if (ret > 0)
                        ret = 0;
-               goto out;
+               if (!ret)
+                       goto reinit_data_reloc;
+               else
+                       goto out;
        }
 
        ret = btrfs_del_item(trans, root, path);
        if (ret)
                goto out;
-       btrfs_release_path(root, path);
+       btrfs_release_path(path);
 
        key.objectid = BTRFS_TREE_RELOC_OBJECTID;
        key.type = BTRFS_ROOT_ITEM_KEY;
@@ -5525,7 +6309,7 @@ static int reset_balance(struct btrfs_trans_handle *trans,
                                        goto out;
                        }
                        key.offset++;
-                       btrfs_release_path(root, path);
+                       btrfs_release_path(path);
 
                        found = 0;
                        ret = btrfs_search_slot(trans, root, &key, path,
@@ -5557,8 +6341,9 @@ static int reset_balance(struct btrfs_trans_handle *trans,
                if (ret)
                        goto out;
        }
-       btrfs_release_path(root, path);
+       btrfs_release_path(path);
 
+reinit_data_reloc:
        key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = (u64)-1;
@@ -5574,14 +6359,17 @@ static int reset_balance(struct btrfs_trans_handle *trans,
                extent_buffer_get(root->node);
        }
        ret = btrfs_fsck_reinit_root(trans, root, 0);
+       if (ret)
+               goto out;
+       ret = btrfs_make_root_dir(trans, root, BTRFS_FIRST_FREE_OBJECTID);
 out:
        btrfs_free_path(path);
        return ret;
 }
 
-static int reinit_extent_tree(struct btrfs_fs_info *fs_info)
+static int reinit_extent_tree(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info)
 {
-       struct btrfs_trans_handle *trans;
        u64 start = 0;
        int ret;
 
@@ -5600,12 +6388,6 @@ static int reinit_extent_tree(struct btrfs_fs_info *fs_info)
                return -EINVAL;
        }
 
-       trans = btrfs_start_transaction(fs_info->extent_root, 1);
-       if (IS_ERR(trans)) {
-               fprintf(stderr, "Error starting transaction\n");
-               return PTR_ERR(trans);
-       }
-
        /*
         * first we need to walk all of the trees except the extent tree and pin
         * down the bytes that are in use so we don't overwrite any existing
@@ -5629,7 +6411,7 @@ static int reinit_extent_tree(struct btrfs_fs_info *fs_info)
        }
 
        /* Ok we can allocate now, reinit the extent root */
-       ret = btrfs_fsck_reinit_root(trans, fs_info->extent_root, 1);
+       ret = btrfs_fsck_reinit_root(trans, fs_info->extent_root, 0);
        if (ret) {
                fprintf(stderr, "extent root initialization failed\n");
                /*
@@ -5640,12 +6422,6 @@ static int reinit_extent_tree(struct btrfs_fs_info *fs_info)
                return ret;
        }
 
-       ret = reset_balance(trans, fs_info);
-       if (ret) {
-               fprintf(stderr, "error reseting the pending balance\n");
-               return ret;
-       }
-
        /*
         * Now we have all the in-memory block groups setup so we can make
         * allocations properly, and the metadata we care about is safe since we
@@ -5668,11 +6444,95 @@ static int reinit_extent_tree(struct btrfs_fs_info *fs_info)
                btrfs_extent_post_op(trans, fs_info->extent_root);
        }
 
-       /*
-        * Ok now we commit and run the normal fsck, which will add extent
-        * entries for all of the items it finds.
-        */
-       return btrfs_commit_transaction(trans, fs_info->extent_root);
+       ret = reset_balance(trans, fs_info);
+       if (ret)
+               fprintf(stderr, "error reseting the pending balance\n");
+
+       return ret;
+}
+
+static int recow_extent_buffer(struct btrfs_root *root, struct extent_buffer *eb)
+{
+       struct btrfs_path *path;
+       struct btrfs_trans_handle *trans;
+       struct btrfs_key key;
+       int ret;
+
+       printf("Recowing metadata block %llu\n", eb->start);
+       key.objectid = btrfs_header_owner(eb);
+       key.type = BTRFS_ROOT_ITEM_KEY;
+       key.offset = (u64)-1;
+
+       root = btrfs_read_fs_root(root->fs_info, &key);
+       if (IS_ERR(root)) {
+               fprintf(stderr, "Couldn't find owner root %llu\n",
+                       key.objectid);
+               return PTR_ERR(root);
+       }
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       trans = btrfs_start_transaction(root, 1);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               return PTR_ERR(trans);
+       }
+
+       path->lowest_level = btrfs_header_level(eb);
+       if (path->lowest_level)
+               btrfs_node_key_to_cpu(eb, &key, 0);
+       else
+               btrfs_item_key_to_cpu(eb, &key, 0);
+
+       ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+       btrfs_commit_transaction(trans, root);
+       btrfs_free_path(path);
+       return ret;
+}
+
+static int delete_bad_item(struct btrfs_root *root, struct bad_item *bad)
+{
+       struct btrfs_path *path;
+       struct btrfs_trans_handle *trans;
+       struct btrfs_key key;
+       int ret;
+
+       printf("Deleting bad item [%llu,%u,%llu]\n", bad->key.objectid,
+              bad->key.type, bad->key.offset);
+       key.objectid = bad->root_id;
+       key.type = BTRFS_ROOT_ITEM_KEY;
+       key.offset = (u64)-1;
+
+       root = btrfs_read_fs_root(root->fs_info, &key);
+       if (IS_ERR(root)) {
+               fprintf(stderr, "Couldn't find owner root %llu\n",
+                       key.objectid);
+               return PTR_ERR(root);
+       }
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       trans = btrfs_start_transaction(root, 1);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               return PTR_ERR(trans);
+       }
+
+       ret = btrfs_search_slot(trans, root, &bad->key, path, -1, 1);
+       if (ret) {
+               if (ret > 0)
+                       ret = 0;
+               goto out;
+       }
+       ret = btrfs_del_item(trans, root, path);
+out:
+       btrfs_commit_transaction(trans, root);
+       btrfs_free_path(path);
+       return ret;
 }
 
 static struct option long_options[] = {
@@ -5680,7 +6540,11 @@ static struct option long_options[] = {
        { "repair", 0, NULL, 0 },
        { "init-csum-tree", 0, NULL, 0 },
        { "init-extent-tree", 0, NULL, 0 },
-       { 0, 0, 0, 0}
+       { "check-data-csum", 0, NULL, 0 },
+       { "backup", 0, NULL, 0 },
+       { "subvol-extents", no_argument, NULL, 'E' },
+       { "qgroup-report", 0, NULL, 'Q' },
+       { NULL, 0, NULL, 0}
 };
 
 const char * const cmd_check_usage[] = {
@@ -5688,9 +6552,13 @@ const char * const cmd_check_usage[] = {
        "Check an unmounted btrfs filesystem.",
        "",
        "-s|--super <superblock>     use this superblock copy",
+       "-b|--backup                 use the backup root copy",
        "--repair                    try to repair the filesystem",
        "--init-csum-tree            create a new CRC tree",
        "--init-extent-tree          create a new extent tree",
+       "--check-data-csum           verify checkums of data blocks",
+       "--qgroup-report             print a report on qgroup consistency",
+       "--subvol-extents            print subvolume extents and sharing state",
        NULL
 };
 
@@ -5700,29 +6568,45 @@ int cmd_check(int argc, char **argv)
        struct btrfs_root *root;
        struct btrfs_fs_info *info;
        u64 bytenr = 0;
-       char uuidbuf[37];
+       u64 subvolid = 0;
+       char uuidbuf[BTRFS_UUID_UNPARSED_SIZE];
        int ret;
-       int num;
-       int repair = 0;
+       u64 num;
        int option_index = 0;
        int init_csum_tree = 0;
-       int init_extent_tree = 0;
-       int rw = 0;
+       int qgroup_report = 0;
+       enum btrfs_open_ctree_flags ctree_flags =
+               OPEN_CTREE_PARTIAL | OPEN_CTREE_EXCLUSIVE;
 
        while(1) {
                int c;
-               c = getopt_long(argc, argv, "as:", long_options,
+               c = getopt_long(argc, argv, "as:b", long_options,
                                &option_index);
                if (c < 0)
                        break;
                switch(c) {
                        case 'a': /* ignored */ break;
+                       case 'b':
+                               ctree_flags |= OPEN_CTREE_BACKUP_ROOT;
+                               break;
                        case 's':
-                               num = atol(optarg);
-                               bytenr = btrfs_sb_offset(num);
-                               printf("using SB copy %d, bytenr %llu\n", num,
+                               num = arg_strtou64(optarg);
+                               if (num >= BTRFS_SUPER_MIRROR_MAX) {
+                                       fprintf(stderr,
+                                               "ERROR: super mirror should be less than: %d\n",
+                                               BTRFS_SUPER_MIRROR_MAX);
+                                       exit(1);
+                               }
+                               bytenr = btrfs_sb_offset(((int)num));
+                               printf("using SB copy %llu, bytenr %llu\n", num,
                                       (unsigned long long)bytenr);
                                break;
+                       case 'Q':
+                               qgroup_report = 1;
+                               break;
+                       case 'E':
+                               subvolid = arg_strtou64(optarg);
+                               break;
                        case '?':
                        case 'h':
                                usage(cmd_check_usage);
@@ -5730,21 +6614,24 @@ int cmd_check(int argc, char **argv)
                if (option_index == 1) {
                        printf("enabling repair mode\n");
                        repair = 1;
-                       rw = 1;
+                       ctree_flags |= OPEN_CTREE_WRITES;
                } else if (option_index == 2) {
                        printf("Creating a new CRC tree\n");
                        init_csum_tree = 1;
-                       rw = 1;
+                       repair = 1;
+                       ctree_flags |= OPEN_CTREE_WRITES;
                } else if (option_index == 3) {
                        init_extent_tree = 1;
-                       rw = 1;
+                       ctree_flags |= (OPEN_CTREE_WRITES |
+                                       OPEN_CTREE_NO_BLOCK_GROUPS);
                        repair = 1;
+               } else if (option_index == 4) {
+                       check_data_csum = 1;
                }
-
        }
        argc = argc - optind;
 
-       if (argc != 1)
+       if (check_argc_exact(argc, 1))
                usage(cmd_check_usage);
 
        radix_tree_init();
@@ -5752,60 +6639,88 @@ int cmd_check(int argc, char **argv)
 
        if((ret = check_mounted(argv[optind])) < 0) {
                fprintf(stderr, "Could not check mount status: %s\n", strerror(-ret));
-               return ret;
+               goto err_out;
        } else if(ret) {
                fprintf(stderr, "%s is currently mounted. Aborting.\n", argv[optind]);
-               return -EBUSY;
+               ret = -EBUSY;
+               goto err_out;
        }
 
-       info = open_ctree_fs_info(argv[optind], bytenr, 0, rw, 1);
+       info = open_ctree_fs_info(argv[optind], bytenr, 0, ctree_flags);
        if (!info) {
                fprintf(stderr, "Couldn't open file system\n");
-               return -EIO;
+               ret = -EIO;
+               goto err_out;
        }
 
+       root = info->fs_root;
        uuid_unparse(info->super_copy->fsid, uuidbuf);
+       if (qgroup_report) {
+               printf("Print quota groups for %s\nUUID: %s\n", argv[optind],
+                      uuidbuf);
+               ret = qgroup_verify_all(info);
+               if (ret == 0)
+                       print_qgroup_report(1);
+               goto close_out;
+       }
+       if (subvolid) {
+               printf("Print extent state for subvolume %llu on %s\nUUID: %s\n",
+                      subvolid, argv[optind], uuidbuf);
+               ret = print_extent_state(info, subvolid);
+               goto close_out;
+       }
        printf("Checking filesystem on %s\nUUID: %s\n", argv[optind], uuidbuf);
 
        if (!extent_buffer_uptodate(info->tree_root->node) ||
            !extent_buffer_uptodate(info->dev_root->node) ||
-           !extent_buffer_uptodate(info->extent_root->node) ||
            !extent_buffer_uptodate(info->chunk_root->node)) {
                fprintf(stderr, "Critical roots corrupted, unable to fsck the FS\n");
-               return -EIO;
+               ret = -EIO;
+               goto close_out;
        }
 
-       root = info->fs_root;
-
-       if (init_extent_tree) {
-               printf("Creating a new extent tree\n");
-               ret = reinit_extent_tree(info);
-               if (ret)
-                       return ret;
-       }
-       fprintf(stderr, "checking extents\n");
-       if (init_csum_tree) {
+       if (init_extent_tree || init_csum_tree) {
                struct btrfs_trans_handle *trans;
 
-               fprintf(stderr, "Reinit crc root\n");
-               trans = btrfs_start_transaction(info->csum_root, 1);
+               trans = btrfs_start_transaction(info->extent_root, 0);
                if (IS_ERR(trans)) {
                        fprintf(stderr, "Error starting transaction\n");
-                       return PTR_ERR(trans);
+                       ret = PTR_ERR(trans);
+                       goto close_out;
                }
 
-               ret = btrfs_fsck_reinit_root(trans, info->csum_root, 0);
-               if (ret) {
-                       fprintf(stderr, "crc root initialization failed\n");
-                       return -EIO;
+               if (init_extent_tree) {
+                       printf("Creating a new extent tree\n");
+                       ret = reinit_extent_tree(trans, info);
+                       if (ret)
+                               goto close_out;
                }
 
-               ret = btrfs_commit_transaction(trans, root);
+               if (init_csum_tree) {
+                       fprintf(stderr, "Reinit crc root\n");
+                       ret = btrfs_fsck_reinit_root(trans, info->csum_root, 0);
+                       if (ret) {
+                               fprintf(stderr, "crc root initialization failed\n");
+                               ret = -EIO;
+                               goto close_out;
+                       }
+               }
+               /*
+                * Ok now we commit and run the normal fsck, which will add
+                * extent entries for all of the items it finds.
+                */
+               ret = btrfs_commit_transaction(trans, info->extent_root);
                if (ret)
-                       exit(1);
-               goto out;
+                       goto close_out;
        }
-       ret = check_chunks_and_extents(root, repair);
+       if (!extent_buffer_uptodate(info->extent_root->node)) {
+               fprintf(stderr, "Critical roots corrupted, unable to fsck the FS\n");
+               ret = -EIO;
+               goto close_out;
+       }
+
+       fprintf(stderr, "checking extents\n");
+       ret = check_chunks_and_extents(root);
        if (ret)
                fprintf(stderr, "Errors found in extent allocation tree or chunk allocation\n");
 
@@ -5814,6 +6729,14 @@ int cmd_check(int argc, char **argv)
        if (ret)
                goto out;
 
+       /*
+        * We used to have to have these hole extents in between our real
+        * extents so if we don't have this flag set we need to make sure there
+        * are no gaps in the file extents for inodes, otherwise we can just
+        * ignore it when this happens.
+        */
+       no_holes = btrfs_fs_incompat(root->fs_info,
+                                    BTRFS_FEATURE_INCOMPAT_NO_HOLES);
        fprintf(stderr, "checking fs roots\n");
        ret = check_fs_roots(root, &root_cache);
        if (ret)
@@ -5826,10 +6749,43 @@ int cmd_check(int argc, char **argv)
 
        fprintf(stderr, "checking root refs\n");
        ret = check_root_refs(root, &root_cache);
-out:
-       free_root_recs_tree(&root_cache);
-       close_ctree(root);
+       if (ret)
+               goto out;
 
+       while (repair && !list_empty(&root->fs_info->recow_ebs)) {
+               struct extent_buffer *eb;
+
+               eb = list_first_entry(&root->fs_info->recow_ebs,
+                                     struct extent_buffer, recow);
+               ret = recow_extent_buffer(root, eb);
+               if (ret)
+                       break;
+       }
+
+       while (!list_empty(&delete_items)) {
+               struct bad_item *bad;
+
+               bad = list_first_entry(&delete_items, struct bad_item, list);
+               list_del_init(&bad->list);
+               if (repair)
+                       ret = delete_bad_item(root, bad);
+               free(bad);
+       }
+
+       if (info->quota_enabled) {
+               int err;
+               fprintf(stderr, "checking quota groups\n");
+               err = qgroup_verify_all(info);
+               if (err)
+                       goto out;
+       }
+
+       if (!list_empty(&root->fs_info->recow_ebs)) {
+               fprintf(stderr, "Transid errors in file system\n");
+               ret = 1;
+       }
+out:
+       print_qgroup_report(0);
        if (found_old_backref) { /*
                 * there was a disk format change when mixed
                 * backref was in testing tree. The old format
@@ -5856,5 +6812,10 @@ out:
                (unsigned long long)data_bytes_allocated,
                (unsigned long long)data_bytes_referenced);
        printf("%s\n", BTRFS_BUILD_VERSION);
+
+       free_root_recs_tree(&root_cache);
+close_out:
+       close_ctree(root);
+err_out:
        return ret;
 }