Add packaging
[platform/upstream/btrfs-progs.git] / chunk-recover.c
index 613d715..705bcf5 100644 (file)
@@ -15,8 +15,9 @@
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA 021110-1307, USA.
  */
-#define _XOPEN_SOURCE 500
-#define _GNU_SOURCE
+
+#include "kerncompat.h"
+#include "androidcompat.h"
 
 #include <stdio.h>
 #include <stdio_ext.h>
@@ -28,7 +29,6 @@
 #include <uuid/uuid.h>
 #include <pthread.h>
 
-#include "kerncompat.h"
 #include "list.h"
 #include "radix-tree.h"
 #include "ctree.h"
 #include "transaction.h"
 #include "crc32c.h"
 #include "utils.h"
-#include "version.h"
 #include "btrfsck.h"
 #include "commands.h"
 
-#define BTRFS_NUM_MIRRORS                      2
-
 struct recover_control {
        int verbose;
        int yes;
 
        u16 csum_size;
        u32 sectorsize;
-       u32 leafsize;
+       u32 nodesize;
        u64 generation;
        u64 chunk_root_generation;
 
@@ -63,6 +60,7 @@ struct recover_control {
 
        struct list_head good_chunks;
        struct list_head bad_chunks;
+       struct list_head rebuild_chunks;
        struct list_head unrepaired_chunks;
        pthread_mutex_t rc_lock;
 };
@@ -71,8 +69,8 @@ struct extent_record {
        struct cache_extent cache;
        u64 generation;
        u8 csum[BTRFS_CSUM_SIZE];
-       struct btrfs_device *devices[BTRFS_NUM_MIRRORS];
-       u64 offsets[BTRFS_NUM_MIRRORS];
+       struct btrfs_device *devices[BTRFS_MAX_MIRRORS];
+       u64 offsets[BTRFS_MAX_MIRRORS];
        int nmirrors;
 };
 
@@ -80,19 +78,19 @@ struct device_scan {
        struct recover_control *rc;
        struct btrfs_device *dev;
        int fd;
+       u64 bytenr;
 };
 
 static struct extent_record *btrfs_new_extent_record(struct extent_buffer *eb)
 {
        struct extent_record *rec;
 
-       rec = malloc(sizeof(*rec));
+       rec = calloc(1, sizeof(*rec));
        if (!rec) {
                fprintf(stderr, "Fail to allocate memory for extent record.\n");
                exit(1);
        }
 
-       memset(rec, 0, sizeof(*rec));
        rec->cache.start = btrfs_header_bytenr(eb);
        rec->cache.size = eb->len;
        rec->generation = btrfs_header_generation(eb);
@@ -128,7 +126,7 @@ again:
                            memcmp(exist->csum, rec->csum, BTRFS_CSUM_SIZE)) {
                                ret = -EEXIST;
                        } else {
-                               BUG_ON(exist->nmirrors >= BTRFS_NUM_MIRRORS);
+                               BUG_ON(exist->nmirrors >= BTRFS_MAX_MIRRORS);
                                exist->devices[exist->nmirrors] = device;
                                exist->offsets[exist->nmirrors] = offset;
                                exist->nmirrors++;
@@ -205,6 +203,7 @@ static void init_recover_control(struct recover_control *rc, int verbose,
 
        INIT_LIST_HEAD(&rc->good_chunks);
        INIT_LIST_HEAD(&rc->bad_chunks);
+       INIT_LIST_HEAD(&rc->rebuild_chunks);
        INIT_LIST_HEAD(&rc->unrepaired_chunks);
 
        rc->verbose = verbose;
@@ -248,7 +247,7 @@ again:
                                              generation);
                        /*
                         * According to the current kernel code, the following
-                        * case is impossble, or there is something wrong in
+                        * case is impossible, or there is something wrong in
                         * the kernel code.
                         */
                        if (memcmp(((void *)exist) + offset,
@@ -261,7 +260,7 @@ again:
                list_del_init(&exist->list);
                free(exist);
                /*
-                * We must do seach again to avoid the following cache.
+                * We must do search again to avoid the following cache.
                 * /--old bg 1--//--old bg 2--/
                 *        /--new bg--/
                 */
@@ -452,24 +451,6 @@ static void print_device_extent_tree(struct device_extent_tree *tree)
        printf("\n");
 }
 
-static void print_device_info(struct btrfs_device *device, char *prefix)
-{
-       if (prefix)
-               printf("%s", prefix);
-       printf("Device: id = %llu, name = %s\n",
-              device->devid, device->name);
-}
-
-static void print_all_devices(struct list_head *devices)
-{
-       struct btrfs_device *dev;
-
-       printf("All Devices:\n");
-       list_for_each_entry(dev, devices, dev_list)
-               print_device_info(dev, "\t");
-       printf("\n");
-}
-
 static void print_scan_result(struct recover_control *rc)
 {
        if (!rc->verbose)
@@ -478,7 +459,7 @@ static void print_scan_result(struct recover_control *rc)
        printf("DEVICE SCAN RESULT:\n");
        printf("Filesystem Information:\n");
        printf("\tsectorsize: %d\n", rc->sectorsize);
-       printf("\tleafsize: %d\n", rc->leafsize);
+       printf("\tnodesize: %d\n", rc->nodesize);
        printf("\ttree root generation: %llu\n", rc->generation);
        printf("\tchunk root generation: %llu\n", rc->chunk_root_generation);
        printf("\n");
@@ -531,22 +512,32 @@ static void print_check_result(struct recover_control *rc)
                return;
 
        printf("CHECK RESULT:\n");
-       printf("Healthy Chunks:\n");
+       printf("Recoverable Chunks:\n");
        list_for_each_entry(chunk, &rc->good_chunks, list) {
                print_chunk_info(chunk, "  ");
                good++;
                total++;
        }
-       printf("Bad Chunks:\n");
+       list_for_each_entry(chunk, &rc->rebuild_chunks, list) {
+               print_chunk_info(chunk, "  ");
+               good++;
+               total++;
+       }
+       list_for_each_entry(chunk, &rc->unrepaired_chunks, list) {
+               print_chunk_info(chunk, "  ");
+               good++;
+               total++;
+       }
+       printf("Unrecoverable Chunks:\n");
        list_for_each_entry(chunk, &rc->bad_chunks, list) {
                print_chunk_info(chunk, "  ");
                bad++;
                total++;
        }
        printf("\n");
-       printf("Total Chunks:\t%d\n", total);
-       printf("  Heathy:\t%d\n", good);
-       printf("  Bad:\t%d\n", bad);
+       printf("Total Chunks:\t\t%d\n", total);
+       printf("  Recoverable:\t\t%d\n", good);
+       printf("  Unrecoverable:\t%d\n", bad);
 
        printf("\n");
        printf("Orphan Block Groups:\n");
@@ -557,6 +548,7 @@ static void print_check_result(struct recover_control *rc)
        printf("Orphan Device Extents:\n");
        list_for_each_entry(devext, &rc->devext.no_chunk_orphans, chunk_list)
                print_device_extent_info(devext, "  ");
+       printf("\n");
 }
 
 static int check_chunk_by_metadata(struct recover_control *rc,
@@ -608,7 +600,7 @@ static int check_chunk_by_metadata(struct recover_control *rc,
                    btrfs_dev_extent_chunk_offset(l, dev_extent)) {
                        if (rc->verbose)
                                fprintf(stderr,
-                                       "Device tree unmatch with chunks dev_extent[%llu, %llu], chunk[%llu, %llu]\n",
+                                       "Device tree mismatch with chunks dev_extent[%llu, %llu], chunk[%llu, %llu]\n",
                                        btrfs_dev_extent_chunk_offset(l,
                                                                dev_extent),
                                        btrfs_dev_extent_length(l, dev_extent),
@@ -644,7 +636,7 @@ bg_check:
        if (chunk->type_flags != btrfs_disk_block_group_flags(l, bg_ptr)) {
                if (rc->verbose)
                        fprintf(stderr,
-                               "Chunk[%llu, %llu]'s type(%llu) is differemt with Block Group's type(%llu)\n",
+                               "Chunk[%llu, %llu]'s type(%llu) is different with Block Group's type(%llu)\n",
                                chunk->offset, chunk->length, chunk->type_flags,
                                btrfs_disk_block_group_flags(l, bg_ptr));
                btrfs_release_path(&path);
@@ -751,18 +743,20 @@ static int scan_one_device(void *dev_scan_struct)
        if (ret)
                return 1;
 
-       buf = malloc(sizeof(*buf) + rc->leafsize);
+       buf = malloc(sizeof(*buf) + rc->nodesize);
        if (!buf)
                return -ENOMEM;
-       buf->len = rc->leafsize;
+       buf->len = rc->nodesize;
 
        bytenr = 0;
        while (1) {
+               dev_scan->bytenr = bytenr;
+
                if (is_super_block_address(bytenr))
                        bytenr += rc->sectorsize;
 
-               if (pread64(fd, buf->data, rc->leafsize, bytenr) <
-                   rc->leafsize)
+               if (pread64(fd, buf->data, rc->nodesize, bytenr) <
+                   rc->nodesize)
                        break;
 
                if (memcmp_extent_buffer(buf, rc->fs_devices->fsid,
@@ -806,7 +800,7 @@ static int scan_one_device(void *dev_scan_struct)
                        break;
                }
 next_node:
-               bytenr += rc->leafsize;
+               bytenr += rc->nodesize;
        }
 out:
        close(fd);
@@ -821,12 +815,11 @@ static int scan_devices(struct recover_control *rc)
        struct btrfs_device *dev;
        struct device_scan *dev_scans;
        pthread_t *t_scans;
-       int *t_rets;
+       long *t_rets;
        int devnr = 0;
        int devidx = 0;
-       int cancel_from = 0;
-       int cancel_to = 0;
        int i;
+       int all_done;
 
        list_for_each_entry(dev, &rc->fs_devices->devices, dev_list)
                devnr++;
@@ -835,11 +828,16 @@ static int scan_devices(struct recover_control *rc)
        if (!dev_scans)
                return -ENOMEM;
        t_scans = (pthread_t *)malloc(sizeof(pthread_t) * devnr);
-       if (!t_scans)
+       if (!t_scans) {
+               free(dev_scans);
                return -ENOMEM;
-       t_rets = (int *)malloc(sizeof(int) * devnr);
-       if (!t_rets)
+       }
+       t_rets = (long *)malloc(sizeof(long) * devnr);
+       if (!t_rets) {
+               free(dev_scans);
+               free(t_scans);
                return -ENOMEM;
+       }
 
        list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
                fd = open(dev->name, O_RDONLY);
@@ -852,32 +850,64 @@ static int scan_devices(struct recover_control *rc)
                dev_scans[devidx].rc = rc;
                dev_scans[devidx].dev = dev;
                dev_scans[devidx].fd = fd;
-               ret = pthread_create(&t_scans[devidx], NULL,
-                                    (void *)scan_one_device,
-                                    (void *)&dev_scans[devidx]);
-               if (ret) {
-                       cancel_from = 0;
-                       cancel_to = devidx - 1;
-                       goto out1;
-               }
+               dev_scans[devidx].bytenr = -1;
                devidx++;
        }
 
-       i = 0;
-       while (i < devidx) {
-               ret = pthread_join(t_scans[i], (void **)&t_rets[i]);
-               if (ret || t_rets[i]) {
-                       ret = 1;
-                       cancel_from = i + 1;
-                       cancel_to = devnr - 1;
+       for (i = 0; i < devidx; i++) {
+               ret = pthread_create(&t_scans[i], NULL,
+                                    (void *)scan_one_device,
+                                    (void *)&dev_scans[i]);
+               if (ret)
                        goto out1;
+
+               dev_scans[i].bytenr = 0;
+       }
+
+       while (1) {
+               all_done = 1;
+               for (i = 0; i < devidx; i++) {
+                       if (dev_scans[i].bytenr == -1)
+                               continue;
+                       ret = pthread_tryjoin_np(t_scans[i],
+                                                (void **)&t_rets[i]);
+                       if (ret == EBUSY) {
+                               all_done = 0;
+                               continue;
+                       }
+                       if (ret || t_rets[i]) {
+                               ret = 1;
+                               goto out1;
+                       }
+                       dev_scans[i].bytenr = -1;
+               }
+
+               printf("\rScanning: ");
+               for (i = 0; i < devidx; i++) {
+                       if (dev_scans[i].bytenr == -1)
+                               printf("%sDONE in dev%d",
+                                      i ? ", " : "", i);
+                       else
+                               printf("%s%llu in dev%d",
+                                      i ? ", " : "", dev_scans[i].bytenr, i);
+               }
+               /* clear chars if exist in tail */
+               printf("                ");
+               printf("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b");
+               fflush(stdout);
+
+               if (all_done) {
+                       printf("\n");
+                       break;
                }
-               i++;
+
+               sleep(1);
        }
 out1:
-       while (ret && (cancel_from <= cancel_to)) {
-               pthread_cancel(t_scans[cancel_from]);
-               cancel_from++;
+       for (i = 0; i < devidx; i++) {
+               if (dev_scans[i].bytenr == -1)
+                       continue;
+               pthread_cancel(t_scans[i]);
        }
 out2:
        free(dev_scans);
@@ -894,11 +924,12 @@ static int build_device_map_by_chunk_record(struct btrfs_root *root,
        u64 devid;
        u8 uuid[BTRFS_UUID_SIZE];
        u16 num_stripes;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_mapping_tree *map_tree;
        struct map_lookup *map;
        struct stripe *stripe;
 
-       map_tree = &root->fs_info->mapping_tree;
+       map_tree = &fs_info->mapping_tree;
        num_stripes = chunk->num_stripes;
        map = malloc(btrfs_map_lookup_size(num_stripes));
        if (!map)
@@ -917,10 +948,10 @@ static int build_device_map_by_chunk_record(struct btrfs_root *root,
                devid = stripe->devid;
                memcpy(uuid, stripe->dev_uuid, BTRFS_UUID_SIZE);
                map->stripes[i].physical = stripe->offset;
-               map->stripes[i].dev = btrfs_find_device(root, devid,
+               map->stripes[i].dev = btrfs_find_device(fs_info, devid,
                                                        uuid, NULL);
                if (!map->stripes[i].dev) {
-                       kfree(map);
+                       free(map);
                        return -EIO;
                }
        }
@@ -940,6 +971,11 @@ static int build_device_maps_by_chunk_records(struct recover_control *rc,
                if (ret)
                        return ret;
        }
+       list_for_each_entry(chunk, &rc->rebuild_chunks, list) {
+               ret = build_device_map_by_chunk_record(root, chunk);
+               if (ret)
+                       return ret;
+       }
        return ret;
 }
 
@@ -1017,7 +1053,7 @@ again:
                    key.type == BTRFS_METADATA_ITEM_KEY) {
                        old_val = btrfs_super_bytes_used(fs_info->super_copy);
                        if (key.type == BTRFS_METADATA_ITEM_KEY)
-                               old_val += root->leafsize;
+                               old_val += fs_info->nodesize;
                        else
                                old_val += key.offset;
                        btrfs_set_super_bytes_used(fs_info->super_copy,
@@ -1033,7 +1069,7 @@ again:
 
        if (key.objectid < end) {
                if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
-                       key.objectid += root->sectorsize;
+                       key.objectid += fs_info->sectorsize;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
                        key.offset = 0;
                }
@@ -1045,8 +1081,7 @@ err:
        return ret;
 }
 
-static int block_group_free_all_extent(struct btrfs_trans_handle *trans,
-                                      struct btrfs_root *root,
+static int block_group_free_all_extent(struct btrfs_root *root,
                                       struct block_group_record *bg)
 {
        struct btrfs_block_group_cache *cache;
@@ -1063,8 +1098,8 @@ static int block_group_free_all_extent(struct btrfs_trans_handle *trans,
        end = start + cache->key.offset - 1;
 
        set_extent_bits(&info->block_group_cache, start, end,
-                       BLOCK_GROUP_DIRTY, GFP_NOFS);
-       set_extent_dirty(&info->free_space_cache, start, end, GFP_NOFS);
+                       BLOCK_GROUP_DIRTY);
+       set_extent_dirty(&info->free_space_cache, start, end);
 
        btrfs_set_block_group_used(&cache->item, 0);
 
@@ -1086,7 +1121,7 @@ static int remove_chunk_extent_item(struct btrfs_trans_handle *trans,
                if (ret)
                        return ret;
 
-               ret = block_group_free_all_extent(trans, root, chunk->bg_rec);
+               ret = block_group_free_all_extent(root, chunk->bg_rec);
                if (ret)
                        return ret;
        }
@@ -1107,11 +1142,11 @@ static int __rebuild_chunk_root(struct btrfs_trans_handle *trans,
                if (min_devid > dev->devid)
                        min_devid = dev->devid;
        }
-       disk_key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
-       disk_key.type = BTRFS_DEV_ITEM_KEY;
-       disk_key.offset = min_devid;
+       btrfs_set_disk_key_objectid(&disk_key, BTRFS_DEV_ITEMS_OBJECTID);
+       btrfs_set_disk_key_type(&disk_key, BTRFS_DEV_ITEM_KEY);
+       btrfs_set_disk_key_offset(&disk_key, min_devid);
 
-       cow = btrfs_alloc_free_block(trans, root, root->nodesize,
+       cow = btrfs_alloc_free_block(trans, root, root->fs_info->nodesize,
                                     BTRFS_CHUNK_TREE_OBJECTID,
                                     &disk_key, 0, 0, 0);
        btrfs_set_header_bytenr(cow, cow->start);
@@ -1139,13 +1174,10 @@ static int __rebuild_device_items(struct btrfs_trans_handle *trans,
 {
        struct btrfs_device *dev;
        struct btrfs_key key;
-       struct btrfs_dev_item *dev_item;
+       struct btrfs_dev_item dev_item_tmp;
+       struct btrfs_dev_item *dev_item = &dev_item_tmp;
        int ret = 0;
 
-       dev_item = malloc(sizeof(struct btrfs_dev_item));
-       if (!dev_item)
-               return -ENOMEM;
-
        list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
                key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
                key.type = BTRFS_DEV_ITEM_KEY;
@@ -1166,7 +1198,27 @@ static int __rebuild_device_items(struct btrfs_trans_handle *trans,
                                        dev_item, sizeof(*dev_item));
        }
 
-       free(dev_item);
+       return ret;
+}
+
+static int __insert_chunk_item(struct btrfs_trans_handle *trans,
+                               struct chunk_record *chunk_rec,
+                               struct btrfs_root *chunk_root)
+{
+       struct btrfs_key key;
+       struct btrfs_chunk *chunk = NULL;
+       int ret = 0;
+
+       chunk = create_chunk_item(chunk_rec);
+       if (!chunk)
+               return -ENOMEM;
+       key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+       key.type = BTRFS_CHUNK_ITEM_KEY;
+       key.offset = chunk_rec->offset;
+
+       ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
+                               btrfs_chunk_item_size(chunk_rec->num_stripes));
+       free(chunk);
        return ret;
 }
 
@@ -1174,8 +1226,6 @@ static int __rebuild_chunk_items(struct btrfs_trans_handle *trans,
                                 struct recover_control *rc,
                                 struct btrfs_root *root)
 {
-       struct btrfs_key key;
-       struct btrfs_chunk *chunk = NULL;
        struct btrfs_root *chunk_root;
        struct chunk_record *chunk_rec;
        int ret;
@@ -1183,17 +1233,12 @@ static int __rebuild_chunk_items(struct btrfs_trans_handle *trans,
        chunk_root = root->fs_info->chunk_root;
 
        list_for_each_entry(chunk_rec, &rc->good_chunks, list) {
-               chunk = create_chunk_item(chunk_rec);
-               if (!chunk)
-                       return -ENOMEM;
-
-               key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
-               key.type = BTRFS_CHUNK_ITEM_KEY;
-               key.offset = chunk_rec->offset;
-
-               ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
-                               btrfs_chunk_item_size(chunk->num_stripes));
-               free(chunk);
+               ret = __insert_chunk_item(trans, chunk_rec, chunk_root);
+               if (ret)
+                       return ret;
+       }
+       list_for_each_entry(chunk_rec, &rc->rebuild_chunks, list) {
+               ret = __insert_chunk_item(trans, chunk_rec, chunk_root);
                if (ret)
                        return ret;
        }
@@ -1224,13 +1269,14 @@ static int rebuild_chunk_tree(struct btrfs_trans_handle *trans,
 static int rebuild_sys_array(struct recover_control *rc,
                             struct btrfs_root *root)
 {
+       struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_chunk *chunk;
        struct btrfs_key key;
        struct chunk_record *chunk_rec;
        int ret = 0;
        u16 num_stripes;
 
-       btrfs_set_super_sys_array_size(root->fs_info->super_copy, 0);
+       btrfs_set_super_sys_array_size(fs_info->super_copy, 0);
 
        list_for_each_entry(chunk_rec, &rc->good_chunks, list) {
                if (!(chunk_rec->type_flags & BTRFS_BLOCK_GROUP_SYSTEM))
@@ -1247,7 +1293,7 @@ static int rebuild_sys_array(struct recover_control *rc,
                key.type = BTRFS_CHUNK_ITEM_KEY;
                key.offset = chunk_rec->offset;
 
-               ret = btrfs_add_system_chunk(NULL, root, &key, chunk,
+               ret = btrfs_add_system_chunk(fs_info, &key, chunk,
                                btrfs_chunk_item_size(num_stripes));
                free(chunk);
                if (ret)
@@ -1257,16 +1303,135 @@ static int rebuild_sys_array(struct recover_control *rc,
 
 }
 
+static int calculate_bg_used(struct btrfs_root *extent_root,
+                            struct chunk_record *chunk_rec,
+                            struct btrfs_path *path,
+                            u64 *used)
+{
+       struct extent_buffer *node;
+       struct btrfs_key found_key;
+       int slot;
+       int ret = 0;
+       u64 used_ret = 0;
+
+       while (1) {
+               node = path->nodes[0];
+               slot = path->slots[0];
+               btrfs_item_key_to_cpu(node, &found_key, slot);
+               if (found_key.objectid >= chunk_rec->offset + chunk_rec->length)
+                       break;
+               if (found_key.type != BTRFS_METADATA_ITEM_KEY &&
+                   found_key.type != BTRFS_EXTENT_DATA_KEY)
+                       goto next;
+               if (found_key.type == BTRFS_METADATA_ITEM_KEY)
+                       used_ret += extent_root->fs_info->nodesize;
+               else
+                       used_ret += found_key.offset;
+next:
+               if (slot + 1 < btrfs_header_nritems(node)) {
+                       slot++;
+               } else {
+                       ret = btrfs_next_leaf(extent_root, path);
+                       if (ret > 0) {
+                               ret = 0;
+                               break;
+                       }
+                       if (ret < 0)
+                               break;
+               }
+       }
+       if (!ret)
+               *used = used_ret;
+       return ret;
+}
+
+static int __insert_block_group(struct btrfs_trans_handle *trans,
+                               struct chunk_record *chunk_rec,
+                               struct btrfs_root *extent_root,
+                               u64 used)
+{
+       struct btrfs_block_group_item bg_item;
+       struct btrfs_key key;
+       int ret = 0;
+
+       btrfs_set_block_group_used(&bg_item, used);
+       btrfs_set_block_group_chunk_objectid(&bg_item, used);
+       btrfs_set_block_group_flags(&bg_item, chunk_rec->type_flags);
+       key.objectid = chunk_rec->offset;
+       key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+       key.offset = chunk_rec->length;
+
+       ret = btrfs_insert_item(trans, extent_root, &key, &bg_item,
+                               sizeof(bg_item));
+       return ret;
+}
+
+/*
+ * Search through the extent tree to rebuild the 'used' member of the block
+ * group.
+ * However, since block group and extent item shares the extent tree,
+ * the extent item may also missing.
+ * In that case, we fill the 'used' with the length of the block group to
+ * ensure no write into the block group.
+ * Btrfsck will hate it but we will inform user to call '--init-extent-tree'
+ * if possible, or just salvage as much data as possible from the fs.
+ */
+static int rebuild_block_group(struct btrfs_trans_handle *trans,
+                              struct recover_control *rc,
+                              struct btrfs_root *root)
+{
+       struct chunk_record *chunk_rec;
+       struct btrfs_key search_key;
+       struct btrfs_path path;
+       u64 used = 0;
+       int ret = 0;
+
+       if (list_empty(&rc->rebuild_chunks))
+               return 0;
+
+       btrfs_init_path(&path);
+       list_for_each_entry(chunk_rec, &rc->rebuild_chunks, list) {
+               search_key.objectid = chunk_rec->offset;
+               search_key.type = BTRFS_EXTENT_ITEM_KEY;
+               search_key.offset = 0;
+               ret = btrfs_search_slot(NULL, root->fs_info->extent_root,
+                                       &search_key, &path, 0, 0);
+               if (ret < 0)
+                       goto out;
+               ret = calculate_bg_used(root->fs_info->extent_root,
+                                       chunk_rec, &path, &used);
+               /*
+                * Extent tree is damaged, better to rebuild the whole extent
+                * tree. Currently, change the used to chunk's len to prevent
+                * write/block reserve happening in that block group.
+                */
+               if (ret < 0) {
+                       fprintf(stderr,
+                               "Fail to search extent tree for block group: [%llu,%llu]\n",
+                               chunk_rec->offset,
+                               chunk_rec->offset + chunk_rec->length);
+                       fprintf(stderr,
+                               "Mark the block group full to prevent block rsv problems\n");
+                       used = chunk_rec->length;
+               }
+               btrfs_release_path(&path);
+               ret = __insert_block_group(trans, chunk_rec,
+                                          root->fs_info->extent_root,
+                                          used);
+               if (ret < 0)
+                       goto out;
+       }
+out:
+       btrfs_release_path(&path);
+       return ret;
+}
+
 static struct btrfs_root *
 open_ctree_with_broken_chunk(struct recover_control *rc)
 {
        struct btrfs_fs_info *fs_info;
        struct btrfs_super_block *disk_super;
        struct extent_buffer *eb;
-       u32 sectorsize;
-       u32 nodesize;
-       u32 leafsize;
-       u32 stripesize;
        int ret;
 
        fs_info = btrfs_new_fs_info(1, BTRFS_SUPER_INFO_OFFSET);
@@ -1283,25 +1448,24 @@ open_ctree_with_broken_chunk(struct recover_control *rc)
 
        disk_super = fs_info->super_copy;
        ret = btrfs_read_dev_super(fs_info->fs_devices->latest_bdev,
-                                  disk_super, fs_info->super_bytenr);
+                                  disk_super, fs_info->super_bytenr,
+                                  SBREAD_RECOVER);
        if (ret) {
                fprintf(stderr, "No valid btrfs found\n");
                goto out_devices;
        }
 
        memcpy(fs_info->fsid, &disk_super->fsid, BTRFS_FSID_SIZE);
+       fs_info->sectorsize = btrfs_super_sectorsize(disk_super);
+       fs_info->nodesize = btrfs_super_nodesize(disk_super);
+       fs_info->stripesize = btrfs_super_stripesize(disk_super);
 
-       ret = btrfs_check_fs_compatibility(disk_super, 1);
+       ret = btrfs_check_fs_compatibility(disk_super, OPEN_CTREE_WRITES);
        if (ret)
                goto out_devices;
 
-       nodesize = btrfs_super_nodesize(disk_super);
-       leafsize = btrfs_super_leafsize(disk_super);
-       sectorsize = btrfs_super_sectorsize(disk_super);
-       stripesize = btrfs_super_stripesize(disk_super);
-
-       __setup_root(nodesize, leafsize, sectorsize, stripesize,
-                    fs_info->chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+       btrfs_setup_root(fs_info->chunk_root, fs_info,
+                        BTRFS_CHUNK_TREE_OBJECTID);
 
        ret = build_device_maps_by_chunk_records(rc, fs_info->chunk_root);
        if (ret)
@@ -1333,6 +1497,7 @@ static int recover_prepare(struct recover_control *rc, char *path)
        int ret;
        int fd;
        struct btrfs_super_block *sb;
+       char buf[BTRFS_SUPER_INFO_SIZE];
        struct btrfs_fs_devices *fs_devices;
 
        ret = 0;
@@ -1342,21 +1507,16 @@ static int recover_prepare(struct recover_control *rc, char *path)
                return -1;
        }
 
-       sb = malloc(sizeof(struct btrfs_super_block));
-       if (!sb) {
-               fprintf(stderr, "allocating memory for sb failed.\n");
-               ret = -ENOMEM;
-               goto fail_close_fd;
-       }
-
-       ret = btrfs_read_dev_super(fd, sb, BTRFS_SUPER_INFO_OFFSET);
+       sb = (struct btrfs_super_block*)buf;
+       ret = btrfs_read_dev_super(fd, sb, BTRFS_SUPER_INFO_OFFSET,
+                       SBREAD_RECOVER);
        if (ret) {
                fprintf(stderr, "read super block error\n");
-               goto fail_free_sb;
+               goto out_close_fd;
        }
 
        rc->sectorsize = btrfs_super_sectorsize(sb);
-       rc->leafsize = btrfs_super_leafsize(sb);
+       rc->nodesize = btrfs_super_nodesize(sb);
        rc->generation = btrfs_super_generation(sb);
        rc->chunk_root_generation = btrfs_super_chunk_root_generation(sb);
        rc->csum_size = btrfs_super_csum_size(sb);
@@ -1365,21 +1525,19 @@ static int recover_prepare(struct recover_control *rc, char *path)
        if (btrfs_super_flags(sb) & BTRFS_SUPER_FLAG_SEEDING) {
                fprintf(stderr, "this device is seed device\n");
                ret = -1;
-               goto fail_free_sb;
+               goto out_close_fd;
        }
 
-       ret = btrfs_scan_fs_devices(fd, path, &fs_devices, 0, 1);
+       ret = btrfs_scan_fs_devices(fd, path, &fs_devices, 0, SBREAD_RECOVER, 0);
        if (ret)
-               goto fail_free_sb;
+               goto out_close_fd;
 
        rc->fs_devices = fs_devices;
 
        if (rc->verbose)
                print_all_devices(&rc->fs_devices->devices);
 
-fail_free_sb:
-       free(sb);
-fail_close_fd:
+out_close_fd:
        close(fd);
        return ret;
 }
@@ -1427,16 +1585,19 @@ static int btrfs_verify_device_extents(struct block_group_record *bg,
                                       struct list_head *devexts, int ndevexts)
 {
        struct device_extent_record *devext;
-       u64 strpie_length;
+       u64 stripe_length;
        int expected_num_stripes;
 
        expected_num_stripes = calc_num_stripes(bg->flags);
        if (expected_num_stripes && expected_num_stripes != ndevexts)
                return 1;
 
-       strpie_length = calc_stripe_length(bg->flags, bg->offset, ndevexts);
+       if (check_num_stripes(bg->flags, ndevexts) < 0)
+               return 1;
+
+       stripe_length = calc_stripe_length(bg->flags, bg->offset, ndevexts);
        list_for_each_entry(devext, devexts, chunk_list) {
-               if (devext->length != strpie_length)
+               if (devext->length != stripe_length)
                        return 1;
        }
        return 0;
@@ -1493,7 +1654,7 @@ static int btrfs_calc_stripe_index(struct chunk_record *chunk, u64 logical)
                stripe_nr /= nr_data_stripes;
                index = (index + stripe_nr) % chunk->num_stripes;
        } else {
-               BUG_ON(1);
+               return -1;
        }
        return index;
 }
@@ -1556,6 +1717,7 @@ btrfs_rebuild_ordered_meta_chunk_stripes(struct recover_control *rc,
 again:
        er = container_of(cache, struct extent_record, cache);
        index = btrfs_calc_stripe_index(chunk, er->cache.start);
+       BUG_ON(index == -1);
        if (chunk->stripes[index].devid)
                goto next;
        list_for_each_entry_safe(devext, next, &devexts, chunk_list) {
@@ -1655,7 +1817,7 @@ static int next_csum(struct btrfs_root *root,
        int ret = 0;
        struct btrfs_root *csum_root = root->fs_info->csum_root;
        struct btrfs_csum_item *csum_item;
-       u32 blocksize = root->sectorsize;
+       u32 blocksize = root->fs_info->sectorsize;
        u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
        int csums_in_item = btrfs_item_size_nr(*leaf, *slot) / csum_size;
 
@@ -1727,8 +1889,8 @@ static int check_one_csum(int fd, u64 start, u32 len, u32 tree_csum)
                goto out;
        }
        ret = 0;
-       csum_result = btrfs_csum_data(NULL, data, csum_result, len);
-       btrfs_csum_final(csum_result, (char *)&csum_result);
+       csum_result = btrfs_csum_data(data, csum_result, len);
+       btrfs_csum_final(csum_result, (u8 *)&csum_result);
        if (csum_result != tree_csum)
                ret = 1;
 out:
@@ -1738,7 +1900,7 @@ out:
 
 static u64 item_end_offset(struct btrfs_root *root, struct btrfs_key *key,
                           struct extent_buffer *leaf, int slot) {
-       u32 blocksize = root->sectorsize;
+       u32 blocksize = root->fs_info->sectorsize;
        u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
        u64 offset = btrfs_item_size_nr(leaf, slot);
@@ -1761,9 +1923,12 @@ static int insert_stripe(struct list_head *devexts,
        dev = btrfs_find_device_by_devid(rc->fs_devices, devext->objectid,
                                        0);
        if (!dev)
-               return 1;
-       BUG_ON(btrfs_find_device_by_devid(rc->fs_devices, devext->objectid,
-                                       1));
+               return -ENOENT;
+       if (btrfs_find_device_by_devid(rc->fs_devices, devext->objectid, 1)) {
+               error("unexpected: found another device with id %llu",
+                               (unsigned long long)devext->objectid);
+               return -EINVAL;
+       }
 
        chunk->stripes[index].devid = devext->objectid;
        chunk->stripes[index].offset = devext->offset;
@@ -1774,6 +1939,34 @@ static int insert_stripe(struct list_head *devexts,
        return 0;
 }
 
+static inline int count_devext_records(struct list_head *record_list)
+{
+       int num_of_records = 0;
+       struct device_extent_record *devext;
+
+       list_for_each_entry(devext, record_list, chunk_list)
+               num_of_records++;
+
+       return num_of_records;
+}
+
+static int fill_chunk_up(struct chunk_record *chunk, struct list_head *devexts,
+                        struct recover_control *rc)
+{
+       int ret = 0;
+       int i;
+
+       for (i = 0; i < chunk->num_stripes; i++) {
+               if (!chunk->stripes[i].devid) {
+                       ret = insert_stripe(devexts, rc, chunk, i);
+                       if (ret)
+                               break;
+               }
+       }
+
+       return ret;
+}
+
 #define EQUAL_STRIPE (1 << 0)
 
 static int rebuild_raid_data_chunk_stripes(struct recover_control *rc,
@@ -1797,7 +1990,7 @@ static int rebuild_raid_data_chunk_stripes(struct recover_control *rc,
        u64 chunk_end = chunk->offset + chunk->length;
        u64 csum_offset = 0;
        u64 data_offset;
-       u32 blocksize = root->sectorsize;
+       u32 blocksize = root->fs_info->sectorsize;
        u32 tree_csum;
        int index = 0;
        int num_unordered = 0;
@@ -1877,8 +2070,6 @@ next_csum:
                fprintf(stderr, "Fetch csum failed\n");
                goto fail_out;
        } else if (ret == 1) {
-               list_for_each_entry(devext, &unordered, chunk_list)
-                       num_unordered++;
                if (!(*flags & EQUAL_STRIPE))
                        *flags |= EQUAL_STRIPE;
                goto out;
@@ -1906,22 +2097,21 @@ next_csum:
        }
 
        if (list_empty(&candidates)) {
-               list_for_each_entry(devext, &unordered, chunk_list)
-                       num_unordered++;
+               num_unordered = count_devext_records(&unordered);
                if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6
                                        && num_unordered == 2) {
-                       list_splice_init(&unordered, &chunk->dextents);
                        btrfs_release_path(&path);
-                       return 0;
-               } else
-                       ret = 1;
+                       ret = fill_chunk_up(chunk, &unordered, rc);
+                       return ret;
+               }
 
-               goto fail_out;
+               goto next_stripe;
        }
 
        if (list_is_last(candidates.next, &candidates)) {
                index = btrfs_calc_stripe_index(chunk,
                        key.offset + csum_offset * blocksize);
+               BUG_ON(index == -1);
                if (chunk->stripes[index].devid)
                        goto next_stripe;
                ret = insert_stripe(&candidates, rc, chunk, index);
@@ -1942,8 +2132,7 @@ next_stripe:
 out:
        ret = 0;
        list_splice_init(&candidates, &unordered);
-       list_for_each_entry(devext, &unordered, chunk_list)
-               num_unordered++;
+       num_unordered = count_devext_records(&unordered);
        if (num_unordered == 1) {
                for (i = 0; i < chunk->num_stripes; i++) {
                        if (!chunk->stripes[i].devid) {
@@ -1959,14 +2148,7 @@ out:
                        & BTRFS_BLOCK_GROUP_RAID5)
                 || (num_unordered == 3 && chunk->type_flags
                        & BTRFS_BLOCK_GROUP_RAID6)) {
-                       for (i = 0; i < chunk->num_stripes; i++) {
-                               if (!chunk->stripes[i].devid) {
-                                       ret = insert_stripe(&unordered, rc,
-                                                       chunk, i);
-                                       if (ret)
-                                               break;
-                               }
-                       }
+                       ret = fill_chunk_up(chunk, &unordered, rc);
                }
        }
 fail_out:
@@ -2023,10 +2205,9 @@ static int btrfs_recover_chunks(struct recover_control *rc)
                nstripes = btrfs_get_device_extents(bg->objectid,
                                                    &rc->devext.no_chunk_orphans,
                                                    &devexts);
-               chunk = malloc(btrfs_chunk_record_size(nstripes));
+               chunk = calloc(1, btrfs_chunk_record_size(nstripes));
                if (!chunk)
                        return -ENOMEM;
-               memset(chunk, 0, btrfs_chunk_record_size(nstripes));
                INIT_LIST_HEAD(&chunk->dextents);
                chunk->bg_rec = bg;
                chunk->cache.start = bg->objectid;
@@ -2045,8 +2226,16 @@ static int btrfs_recover_chunks(struct recover_control *rc)
                chunk->sub_stripes = calc_sub_nstripes(bg->flags);
 
                ret = insert_cache_extent(&rc->chunk, &chunk->cache);
+               if (ret == -EEXIST) {
+                       error("duplicate entry in cache start %llu size %llu",
+                                       (unsigned long long)chunk->cache.start,
+                                       (unsigned long long)chunk->cache.size);
+                       free(chunk);
+                       return ret;
+               }
                BUG_ON(ret);
 
+               list_del_init(&bg->list);
                if (!nstripes) {
                        list_add_tail(&chunk->list, &rc->bad_chunks);
                        continue;
@@ -2077,8 +2266,35 @@ static int btrfs_recover_chunks(struct recover_control *rc)
        return 0;
 }
 
+static inline int is_chunk_overlap(struct chunk_record *chunk1,
+                                  struct chunk_record *chunk2)
+{
+       if (chunk1->offset >= chunk2->offset + chunk2->length ||
+           chunk1->offset + chunk1->length <= chunk2->offset)
+               return 0;
+       return 1;
+}
+
+/* Move invalid(overlap with good chunks) rebuild chunks to bad chunk list */
+static void validate_rebuild_chunks(struct recover_control *rc)
+{
+       struct chunk_record *good;
+       struct chunk_record *rebuild;
+       struct chunk_record *tmp;
+
+       list_for_each_entry_safe(rebuild, tmp, &rc->rebuild_chunks, list) {
+               list_for_each_entry(good, &rc->good_chunks, list) {
+                       if (is_chunk_overlap(rebuild, good)) {
+                               list_move_tail(&rebuild->list,
+                                              &rc->bad_chunks);
+                               break;
+                       }
+               }
+       }
+}
+
 /*
- * Return 0 when succesful, < 0 on error and > 0 if aborted by user
+ * Return 0 when successful, < 0 on error and > 0 if aborted by user
  */
 int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
 {
@@ -2111,8 +2327,7 @@ int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
        print_scan_result(&rc);
 
        ret = check_chunks(&rc.chunk, &rc.bg, &rc.devext, &rc.good_chunks,
-                          &rc.bad_chunks, 1);
-       print_check_result(&rc);
+                          &rc.bad_chunks, &rc.rebuild_chunks, 1);
        if (ret) {
                if (!list_empty(&rc.bg.block_groups) ||
                    !list_empty(&rc.devext.no_chunk_orphans)) {
@@ -2120,17 +2335,13 @@ int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
                        if (ret)
                                goto fail_rc;
                }
-               /*
-                * If the chunk is healthy, its block group item and device
-                * extent item should be written on the disks. So, it is very
-                * likely that the bad chunk is a old one that has been
-                * droppped from the fs. Don't deal with them now, we will
-                * check it after the fs is opened.
-                */
        } else {
-               fprintf(stderr, "Check chunks successfully with no orphans\n");
+               print_check_result(&rc);
+               printf("Check chunks successfully with no orphans\n");
                goto fail_rc;
        }
+       validate_rebuild_chunks(&rc);
+       print_check_result(&rc);
 
        root = open_ctree_with_broken_chunk(&rc);
        if (IS_ERR(root)) {
@@ -2160,6 +2371,7 @@ int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
        }
 
        trans = btrfs_start_transaction(root, 1);
+       BUG_ON(IS_ERR(trans));
        ret = remove_chunk_extent_item(trans, &rc, root);
        BUG_ON(ret);
 
@@ -2169,6 +2381,12 @@ int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
        ret = rebuild_sys_array(&rc, root);
        BUG_ON(ret);
 
+       ret = rebuild_block_group(trans, &rc, root);
+       if (ret) {
+               printf("Fail to rebuild block groups.\n");
+               printf("Recommend to run 'btrfs check --init-extent-tree <dev>' after recovery\n");
+       }
+
        btrfs_commit_transaction(trans, root);
 fail_close_ctree:
        close_ctree(root);