btrfs-progs: mkfs/rootdir: Don't follow symbolic link when calculating size
[platform/upstream/btrfs-progs.git] / chunk-recover.c
index 45da9a6..705bcf5 100644
--- a/chunk-recover.c
+++ b/chunk-recover.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Fujitsu.  All rights reserved.
+ * Copyright (C) 2013 FUJITSU LIMITED.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -15,8 +15,9 @@
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA 021110-1307, USA.
  */
-#define _XOPEN_SOURCE 500
-#define _GNU_SOURCE
+
+#include "kerncompat.h"
+#include "androidcompat.h"
 
 #include <stdio.h>
 #include <stdio_ext.h>
@@ -26,8 +27,8 @@
 #include <fcntl.h>
 #include <unistd.h>
 #include <uuid/uuid.h>
+#include <pthread.h>
 
-#include "kerncompat.h"
 #include "list.h"
 #include "radix-tree.h"
 #include "ctree.h"
 #include "transaction.h"
 #include "crc32c.h"
 #include "utils.h"
-#include "version.h"
 #include "btrfsck.h"
 #include "commands.h"
 
-#define BTRFS_STRIPE_LEN                       (64 * 1024)
-#define BTRFS_NUM_MIRRORS                      2
-
 struct recover_control {
        int verbose;
        int yes;
 
        u16 csum_size;
        u32 sectorsize;
-       u32 leafsize;
+       u32 nodesize;
        u64 generation;
        u64 chunk_root_generation;
 
@@ -63,29 +60,37 @@ struct recover_control {
 
        struct list_head good_chunks;
        struct list_head bad_chunks;
+       struct list_head rebuild_chunks;
        struct list_head unrepaired_chunks;
+       pthread_mutex_t rc_lock;
 };
 
 struct extent_record {
        struct cache_extent cache;
        u64 generation;
        u8 csum[BTRFS_CSUM_SIZE];
-       struct btrfs_device *devices[BTRFS_NUM_MIRRORS];
-       u64 offsets[BTRFS_NUM_MIRRORS];
+       struct btrfs_device *devices[BTRFS_MAX_MIRRORS];
+       u64 offsets[BTRFS_MAX_MIRRORS];
        int nmirrors;
 };
 
+struct device_scan {
+       struct recover_control *rc;
+       struct btrfs_device *dev;
+       int fd;
+       u64 bytenr;
+};
+
 static struct extent_record *btrfs_new_extent_record(struct extent_buffer *eb)
 {
        struct extent_record *rec;
 
-       rec = malloc(sizeof(*rec));
+       rec = calloc(1, sizeof(*rec));
        if (!rec) {
                fprintf(stderr, "Fail to allocate memory for extent record.\n");
                exit(1);
        }
 
-       memset(rec, 0, sizeof(*rec));
        rec->cache.start = btrfs_header_bytenr(eb);
        rec->cache.size = eb->len;
        rec->generation = btrfs_header_generation(eb);
@@ -121,7 +126,7 @@ again:
                            memcmp(exist->csum, rec->csum, BTRFS_CSUM_SIZE)) {
                                ret = -EEXIST;
                        } else {
-                               BUG_ON(exist->nmirrors >= BTRFS_NUM_MIRRORS);
+                               BUG_ON(exist->nmirrors >= BTRFS_MAX_MIRRORS);
                                exist->devices[exist->nmirrors] = device;
                                exist->offsets[exist->nmirrors] = offset;
                                exist->nmirrors++;
@@ -198,10 +203,12 @@ static void init_recover_control(struct recover_control *rc, int verbose,
 
        INIT_LIST_HEAD(&rc->good_chunks);
        INIT_LIST_HEAD(&rc->bad_chunks);
+       INIT_LIST_HEAD(&rc->rebuild_chunks);
        INIT_LIST_HEAD(&rc->unrepaired_chunks);
 
        rc->verbose = verbose;
        rc->yes = yes;
+       pthread_mutex_init(&rc->rc_lock, NULL);
 }
 
 static void free_recover_control(struct recover_control *rc)
@@ -210,6 +217,7 @@ static void free_recover_control(struct recover_control *rc)
        free_chunk_cache_tree(&rc->chunk);
        free_device_extent_tree(&rc->devext);
        free_extent_record_tree(&rc->eb_cache);
+       pthread_mutex_destroy(&rc->rc_lock);
 }
 
 static int process_block_group_item(struct block_group_tree *bg_cache,
@@ -239,7 +247,7 @@ again:
                                              generation);
                        /*
                         * According to the current kernel code, the following
-                        * case is impossble, or there is something wrong in
+                        * case is impossible, or there is something wrong in
                         * the kernel code.
                         */
                        if (memcmp(((void *)exist) + offset,
@@ -252,7 +260,7 @@ again:
                list_del_init(&exist->list);
                free(exist);
                /*
-                * We must do seach again to avoid the following cache.
+                * We must do search again to avoid the following cache.
                 * /--old bg 1--//--old bg 2--/
                 *        /--new bg--/
                 */
@@ -443,24 +451,6 @@ static void print_device_extent_tree(struct device_extent_tree *tree)
        printf("\n");
 }
 
-static void print_device_info(struct btrfs_device *device, char *prefix)
-{
-       if (prefix)
-               printf("%s", prefix);
-       printf("Device: id = %llu, name = %s\n",
-              device->devid, device->name);
-}
-
-static void print_all_devices(struct list_head *devices)
-{
-       struct btrfs_device *dev;
-
-       printf("All Devices:\n");
-       list_for_each_entry(dev, devices, dev_list)
-               print_device_info(dev, "\t");
-       printf("\n");
-}
-
 static void print_scan_result(struct recover_control *rc)
 {
        if (!rc->verbose)
@@ -469,7 +459,7 @@ static void print_scan_result(struct recover_control *rc)
        printf("DEVICE SCAN RESULT:\n");
        printf("Filesystem Information:\n");
        printf("\tsectorsize: %d\n", rc->sectorsize);
-       printf("\tleafsize: %d\n", rc->leafsize);
+       printf("\tnodesize: %d\n", rc->nodesize);
        printf("\ttree root generation: %llu\n", rc->generation);
        printf("\tchunk root generation: %llu\n", rc->chunk_root_generation);
        printf("\n");
@@ -522,22 +512,32 @@ static void print_check_result(struct recover_control *rc)
                return;
 
        printf("CHECK RESULT:\n");
-       printf("Healthy Chunks:\n");
+       printf("Recoverable Chunks:\n");
        list_for_each_entry(chunk, &rc->good_chunks, list) {
                print_chunk_info(chunk, "  ");
                good++;
                total++;
        }
-       printf("Bad Chunks:\n");
+       list_for_each_entry(chunk, &rc->rebuild_chunks, list) {
+               print_chunk_info(chunk, "  ");
+               good++;
+               total++;
+       }
+       list_for_each_entry(chunk, &rc->unrepaired_chunks, list) {
+               print_chunk_info(chunk, "  ");
+               good++;
+               total++;
+       }
+       printf("Unrecoverable Chunks:\n");
        list_for_each_entry(chunk, &rc->bad_chunks, list) {
                print_chunk_info(chunk, "  ");
                bad++;
                total++;
        }
        printf("\n");
-       printf("Total Chunks:\t%d\n", total);
-       printf("  Heathy:\t%d\n", good);
-       printf("  Bad:\t%d\n", bad);
+       printf("Total Chunks:\t\t%d\n", total);
+       printf("  Recoverable:\t\t%d\n", good);
+       printf("  Unrecoverable:\t%d\n", bad);
 
        printf("\n");
        printf("Orphan Block Groups:\n");
@@ -548,6 +548,7 @@ static void print_check_result(struct recover_control *rc)
        printf("Orphan Device Extents:\n");
        list_for_each_entry(devext, &rc->devext.no_chunk_orphans, chunk_list)
                print_device_extent_info(devext, "  ");
+       printf("\n");
 }
 
 static int check_chunk_by_metadata(struct recover_control *rc,
@@ -599,7 +600,7 @@ static int check_chunk_by_metadata(struct recover_control *rc,
                    btrfs_dev_extent_chunk_offset(l, dev_extent)) {
                        if (rc->verbose)
                                fprintf(stderr,
-                                       "Device tree unmatch with chunks dev_extent[%llu, %llu], chunk[%llu, %llu]\n",
+                                       "Device tree mismatch with chunks dev_extent[%llu, %llu], chunk[%llu, %llu]\n",
                                        btrfs_dev_extent_chunk_offset(l,
                                                                dev_extent),
                                        btrfs_dev_extent_length(l, dev_extent),
@@ -635,7 +636,7 @@ bg_check:
        if (chunk->type_flags != btrfs_disk_block_group_flags(l, bg_ptr)) {
                if (rc->verbose)
                        fprintf(stderr,
-                               "Chunk[%llu, %llu]'s type(%llu) is differemt with Block Group's type(%llu)\n",
+                               "Chunk[%llu, %llu]'s type(%llu) is different with Block Group's type(%llu)\n",
                                chunk->offset, chunk->length, chunk->type_flags,
                                btrfs_disk_block_group_flags(l, bg_ptr));
                btrfs_release_path(&path);
@@ -694,14 +695,20 @@ static int extract_metadata_record(struct recover_control *rc,
                btrfs_item_key_to_cpu(leaf, &key, i);
                switch (key.type) {
                case BTRFS_BLOCK_GROUP_ITEM_KEY:
+                       pthread_mutex_lock(&rc->rc_lock);
                        ret = process_block_group_item(&rc->bg, leaf, &key, i);
+                       pthread_mutex_unlock(&rc->rc_lock);
                        break;
                case BTRFS_CHUNK_ITEM_KEY:
+                       pthread_mutex_lock(&rc->rc_lock);
                        ret = process_chunk_item(&rc->chunk, leaf, &key, i);
+                       pthread_mutex_unlock(&rc->rc_lock);
                        break;
                case BTRFS_DEV_EXTENT_KEY:
+                       pthread_mutex_lock(&rc->rc_lock);
                        ret = process_device_extent_item(&rc->devext, leaf,
                                                         &key, i);
+                       pthread_mutex_unlock(&rc->rc_lock);
                        break;
                }
                if (ret)
@@ -721,29 +728,39 @@ static inline int is_super_block_address(u64 offset)
        return 0;
 }
 
-static int scan_one_device(struct recover_control *rc, int fd,
-                          struct btrfs_device *device)
+static int scan_one_device(void *dev_scan_struct)
 {
        struct extent_buffer *buf;
        u64 bytenr;
        int ret = 0;
+       struct device_scan *dev_scan = (struct device_scan *)dev_scan_struct;
+       struct recover_control *rc = dev_scan->rc;
+       struct btrfs_device *device = dev_scan->dev;
+       int fd = dev_scan->fd;
+       int oldtype;
+
+       ret = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &oldtype);
+       if (ret)
+               return 1;
 
-       buf = malloc(sizeof(*buf) + rc->leafsize);
+       buf = malloc(sizeof(*buf) + rc->nodesize);
        if (!buf)
                return -ENOMEM;
-       buf->len = rc->leafsize;
+       buf->len = rc->nodesize;
 
        bytenr = 0;
        while (1) {
+               dev_scan->bytenr = bytenr;
+
                if (is_super_block_address(bytenr))
                        bytenr += rc->sectorsize;
 
-               if (pread64(fd, buf->data, rc->leafsize, bytenr) <
-                   rc->leafsize)
+               if (pread64(fd, buf->data, rc->nodesize, bytenr) <
+                   rc->nodesize)
                        break;
 
                if (memcmp_extent_buffer(buf, rc->fs_devices->fsid,
-                                        (unsigned long)btrfs_header_fsid(buf),
+                                        btrfs_header_fsid(),
                                         BTRFS_FSID_SIZE)) {
                        bytenr += rc->sectorsize;
                        continue;
@@ -754,7 +771,9 @@ static int scan_one_device(struct recover_control *rc, int fd,
                        continue;
                }
 
+               pthread_mutex_lock(&rc->rc_lock);
                ret = process_extent_buffer(&rc->eb_cache, buf, device, bytenr);
+               pthread_mutex_unlock(&rc->rc_lock);
                if (ret)
                        goto out;
 
@@ -781,9 +800,10 @@ static int scan_one_device(struct recover_control *rc, int fd,
                        break;
                }
 next_node:
-               bytenr += rc->leafsize;
+               bytenr += rc->nodesize;
        }
 out:
+       close(fd);
        free(buf);
        return ret;
 }
@@ -793,20 +813,107 @@ static int scan_devices(struct recover_control *rc)
        int ret = 0;
        int fd;
        struct btrfs_device *dev;
+       struct device_scan *dev_scans;
+       pthread_t *t_scans;
+       long *t_rets;
+       int devnr = 0;
+       int devidx = 0;
+       int i;
+       int all_done;
+
+       list_for_each_entry(dev, &rc->fs_devices->devices, dev_list)
+               devnr++;
+       dev_scans = (struct device_scan *)malloc(sizeof(struct device_scan)
+                                                * devnr);
+       if (!dev_scans)
+               return -ENOMEM;
+       t_scans = (pthread_t *)malloc(sizeof(pthread_t) * devnr);
+       if (!t_scans) {
+               free(dev_scans);
+               return -ENOMEM;
+       }
+       t_rets = (long *)malloc(sizeof(long) * devnr);
+       if (!t_rets) {
+               free(dev_scans);
+               free(t_scans);
+               return -ENOMEM;
+       }
 
        list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
                fd = open(dev->name, O_RDONLY);
                if (fd < 0) {
                        fprintf(stderr, "Failed to open device %s\n",
                                dev->name);
-                       return -1;
+                       ret = 1;
+                       goto out2;
                }
-               ret = scan_one_device(rc, fd, dev);
-               close(fd);
+               dev_scans[devidx].rc = rc;
+               dev_scans[devidx].dev = dev;
+               dev_scans[devidx].fd = fd;
+               dev_scans[devidx].bytenr = -1;
+               devidx++;
+       }
+
+       for (i = 0; i < devidx; i++) {
+               ret = pthread_create(&t_scans[i], NULL,
+                                    (void *)scan_one_device,
+                                    (void *)&dev_scans[i]);
                if (ret)
-                       return ret;
+                       goto out1;
+
+               dev_scans[i].bytenr = 0;
        }
-       return ret;
+
+       while (1) {
+               all_done = 1;
+               for (i = 0; i < devidx; i++) {
+                       if (dev_scans[i].bytenr == -1)
+                               continue;
+                       ret = pthread_tryjoin_np(t_scans[i],
+                                                (void **)&t_rets[i]);
+                       if (ret == EBUSY) {
+                               all_done = 0;
+                               continue;
+                       }
+                       if (ret || t_rets[i]) {
+                               ret = 1;
+                               goto out1;
+                       }
+                       dev_scans[i].bytenr = -1;
+               }
+
+               printf("\rScanning: ");
+               for (i = 0; i < devidx; i++) {
+                       if (dev_scans[i].bytenr == -1)
+                               printf("%sDONE in dev%d",
+                                      i ? ", " : "", i);
+                       else
+                               printf("%s%llu in dev%d",
+                                      i ? ", " : "", dev_scans[i].bytenr, i);
+               }
+               /* clear chars if exist in tail */
+               printf("                ");
+               printf("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b");
+               fflush(stdout);
+
+               if (all_done) {
+                       printf("\n");
+                       break;
+               }
+
+               sleep(1);
+       }
+out1:
+       for (i = 0; i < devidx; i++) {
+               if (dev_scans[i].bytenr == -1)
+                       continue;
+               pthread_cancel(t_scans[i]);
+       }
+out2:
+       free(dev_scans);
+       free(t_scans);
+       free(t_rets);
+       return !!ret;
 }
 
 static int build_device_map_by_chunk_record(struct btrfs_root *root,
@@ -817,11 +924,12 @@ static int build_device_map_by_chunk_record(struct btrfs_root *root,
        u64 devid;
        u8 uuid[BTRFS_UUID_SIZE];
        u16 num_stripes;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_mapping_tree *map_tree;
        struct map_lookup *map;
        struct stripe *stripe;
 
-       map_tree = &root->fs_info->mapping_tree;
+       map_tree = &fs_info->mapping_tree;
        num_stripes = chunk->num_stripes;
        map = malloc(btrfs_map_lookup_size(num_stripes));
        if (!map)
@@ -840,10 +948,10 @@ static int build_device_map_by_chunk_record(struct btrfs_root *root,
                devid = stripe->devid;
                memcpy(uuid, stripe->dev_uuid, BTRFS_UUID_SIZE);
                map->stripes[i].physical = stripe->offset;
-               map->stripes[i].dev = btrfs_find_device(root, devid,
+               map->stripes[i].dev = btrfs_find_device(fs_info, devid,
                                                        uuid, NULL);
                if (!map->stripes[i].dev) {
-                       kfree(map);
+                       free(map);
                        return -EIO;
                }
        }
@@ -863,6 +971,11 @@ static int build_device_maps_by_chunk_records(struct recover_control *rc,
                if (ret)
                        return ret;
        }
+       list_for_each_entry(chunk, &rc->rebuild_chunks, list) {
+               ret = build_device_map_by_chunk_record(root, chunk);
+               if (ret)
+                       return ret;
+       }
        return ret;
 }
 
@@ -940,7 +1053,7 @@ again:
                    key.type == BTRFS_METADATA_ITEM_KEY) {
                        old_val = btrfs_super_bytes_used(fs_info->super_copy);
                        if (key.type == BTRFS_METADATA_ITEM_KEY)
-                               old_val += root->leafsize;
+                               old_val += fs_info->nodesize;
                        else
                                old_val += key.offset;
                        btrfs_set_super_bytes_used(fs_info->super_copy,
@@ -956,7 +1069,7 @@ again:
 
        if (key.objectid < end) {
                if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
-                       key.objectid += root->sectorsize;
+                       key.objectid += fs_info->sectorsize;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
                        key.offset = 0;
                }
@@ -968,8 +1081,7 @@ err:
        return ret;
 }
 
-static int block_group_free_all_extent(struct btrfs_trans_handle *trans,
-                                      struct btrfs_root *root,
+static int block_group_free_all_extent(struct btrfs_root *root,
                                       struct block_group_record *bg)
 {
        struct btrfs_block_group_cache *cache;
@@ -986,8 +1098,8 @@ static int block_group_free_all_extent(struct btrfs_trans_handle *trans,
        end = start + cache->key.offset - 1;
 
        set_extent_bits(&info->block_group_cache, start, end,
-                       BLOCK_GROUP_DIRTY, GFP_NOFS);
-       set_extent_dirty(&info->free_space_cache, start, end, GFP_NOFS);
+                       BLOCK_GROUP_DIRTY);
+       set_extent_dirty(&info->free_space_cache, start, end);
 
        btrfs_set_block_group_used(&cache->item, 0);
 
@@ -1009,7 +1121,7 @@ static int remove_chunk_extent_item(struct btrfs_trans_handle *trans,
                if (ret)
                        return ret;
 
-               ret = block_group_free_all_extent(trans, root, chunk->bg_rec);
+               ret = block_group_free_all_extent(root, chunk->bg_rec);
                if (ret)
                        return ret;
        }
@@ -1030,11 +1142,11 @@ static int __rebuild_chunk_root(struct btrfs_trans_handle *trans,
                if (min_devid > dev->devid)
                        min_devid = dev->devid;
        }
-       disk_key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
-       disk_key.type = BTRFS_DEV_ITEM_KEY;
-       disk_key.offset = min_devid;
+       btrfs_set_disk_key_objectid(&disk_key, BTRFS_DEV_ITEMS_OBJECTID);
+       btrfs_set_disk_key_type(&disk_key, BTRFS_DEV_ITEM_KEY);
+       btrfs_set_disk_key_offset(&disk_key, min_devid);
 
-       cow = btrfs_alloc_free_block(trans, root, root->sectorsize,
+       cow = btrfs_alloc_free_block(trans, root, root->fs_info->nodesize,
                                     BTRFS_CHUNK_TREE_OBJECTID,
                                     &disk_key, 0, 0, 0);
        btrfs_set_header_bytenr(cow, cow->start);
@@ -1044,11 +1156,10 @@ static int __rebuild_chunk_root(struct btrfs_trans_handle *trans,
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(cow, BTRFS_CHUNK_TREE_OBJECTID);
        write_extent_buffer(cow, root->fs_info->fsid,
-                       (unsigned long)btrfs_header_fsid(cow),
-                       BTRFS_FSID_SIZE);
+                       btrfs_header_fsid(), BTRFS_FSID_SIZE);
 
        write_extent_buffer(cow, root->fs_info->chunk_tree_uuid,
-                       (unsigned long)btrfs_header_chunk_tree_uuid(cow),
+                       btrfs_header_chunk_tree_uuid(cow),
                        BTRFS_UUID_SIZE);
 
        root->node = cow;
@@ -1063,13 +1174,10 @@ static int __rebuild_device_items(struct btrfs_trans_handle *trans,
 {
        struct btrfs_device *dev;
        struct btrfs_key key;
-       struct btrfs_dev_item *dev_item;
+       struct btrfs_dev_item dev_item_tmp;
+       struct btrfs_dev_item *dev_item = &dev_item_tmp;
        int ret = 0;
 
-       dev_item = malloc(sizeof(struct btrfs_dev_item));
-       if (!dev_item)
-               return -ENOMEM;
-
        list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
                key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
                key.type = BTRFS_DEV_ITEM_KEY;
@@ -1090,7 +1198,27 @@ static int __rebuild_device_items(struct btrfs_trans_handle *trans,
                                        dev_item, sizeof(*dev_item));
        }
 
-       free(dev_item);
+       return ret;
+}
+
+static int __insert_chunk_item(struct btrfs_trans_handle *trans,
+                               struct chunk_record *chunk_rec,
+                               struct btrfs_root *chunk_root)
+{
+       struct btrfs_key key;
+       struct btrfs_chunk *chunk = NULL;
+       int ret = 0;
+
+       chunk = create_chunk_item(chunk_rec);
+       if (!chunk)
+               return -ENOMEM;
+       key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+       key.type = BTRFS_CHUNK_ITEM_KEY;
+       key.offset = chunk_rec->offset;
+
+       ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
+                               btrfs_chunk_item_size(chunk_rec->num_stripes));
+       free(chunk);
        return ret;
 }
 
@@ -1098,8 +1226,6 @@ static int __rebuild_chunk_items(struct btrfs_trans_handle *trans,
                                 struct recover_control *rc,
                                 struct btrfs_root *root)
 {
-       struct btrfs_key key;
-       struct btrfs_chunk *chunk = NULL;
        struct btrfs_root *chunk_root;
        struct chunk_record *chunk_rec;
        int ret;
@@ -1107,17 +1233,12 @@ static int __rebuild_chunk_items(struct btrfs_trans_handle *trans,
        chunk_root = root->fs_info->chunk_root;
 
        list_for_each_entry(chunk_rec, &rc->good_chunks, list) {
-               chunk = create_chunk_item(chunk_rec);
-               if (!chunk)
-                       return -ENOMEM;
-
-               key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
-               key.type = BTRFS_CHUNK_ITEM_KEY;
-               key.offset = chunk_rec->offset;
-
-               ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
-                               btrfs_chunk_item_size(chunk->num_stripes));
-               free(chunk);
+               ret = __insert_chunk_item(trans, chunk_rec, chunk_root);
+               if (ret)
+                       return ret;
+       }
+       list_for_each_entry(chunk_rec, &rc->rebuild_chunks, list) {
+               ret = __insert_chunk_item(trans, chunk_rec, chunk_root);
                if (ret)
                        return ret;
        }
@@ -1148,13 +1269,14 @@ static int rebuild_chunk_tree(struct btrfs_trans_handle *trans,
 static int rebuild_sys_array(struct recover_control *rc,
                             struct btrfs_root *root)
 {
+       struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_chunk *chunk;
        struct btrfs_key key;
        struct chunk_record *chunk_rec;
        int ret = 0;
        u16 num_stripes;
 
-       btrfs_set_super_sys_array_size(root->fs_info->super_copy, 0);
+       btrfs_set_super_sys_array_size(fs_info->super_copy, 0);
 
        list_for_each_entry(chunk_rec, &rc->good_chunks, list) {
                if (!(chunk_rec->type_flags & BTRFS_BLOCK_GROUP_SYSTEM))
@@ -1171,7 +1293,7 @@ static int rebuild_sys_array(struct recover_control *rc,
                key.type = BTRFS_CHUNK_ITEM_KEY;
                key.offset = chunk_rec->offset;
 
-               ret = btrfs_add_system_chunk(NULL, root, &key, chunk,
+               ret = btrfs_add_system_chunk(fs_info, &key, chunk,
                                btrfs_chunk_item_size(num_stripes));
                free(chunk);
                if (ret)
@@ -1181,16 +1303,135 @@ static int rebuild_sys_array(struct recover_control *rc,
 
 }
 
+static int calculate_bg_used(struct btrfs_root *extent_root,
+                            struct chunk_record *chunk_rec,
+                            struct btrfs_path *path,
+                            u64 *used)
+{
+       struct extent_buffer *node;
+       struct btrfs_key found_key;
+       int slot;
+       int ret = 0;
+       u64 used_ret = 0;
+
+       while (1) {
+               node = path->nodes[0];
+               slot = path->slots[0];
+               btrfs_item_key_to_cpu(node, &found_key, slot);
+               if (found_key.objectid >= chunk_rec->offset + chunk_rec->length)
+                       break;
+               if (found_key.type != BTRFS_METADATA_ITEM_KEY &&
+                   found_key.type != BTRFS_EXTENT_DATA_KEY)
+                       goto next;
+               if (found_key.type == BTRFS_METADATA_ITEM_KEY)
+                       used_ret += extent_root->fs_info->nodesize;
+               else
+                       used_ret += found_key.offset;
+next:
+               if (slot + 1 < btrfs_header_nritems(node)) {
+                       slot++;
+               } else {
+                       ret = btrfs_next_leaf(extent_root, path);
+                       if (ret > 0) {
+                               ret = 0;
+                               break;
+                       }
+                       if (ret < 0)
+                               break;
+               }
+       }
+       if (!ret)
+               *used = used_ret;
+       return ret;
+}
+
+static int __insert_block_group(struct btrfs_trans_handle *trans,
+                               struct chunk_record *chunk_rec,
+                               struct btrfs_root *extent_root,
+                               u64 used)
+{
+       struct btrfs_block_group_item bg_item;
+       struct btrfs_key key;
+       int ret = 0;
+
+       btrfs_set_block_group_used(&bg_item, used);
+       btrfs_set_block_group_chunk_objectid(&bg_item, used);
+       btrfs_set_block_group_flags(&bg_item, chunk_rec->type_flags);
+       key.objectid = chunk_rec->offset;
+       key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+       key.offset = chunk_rec->length;
+
+       ret = btrfs_insert_item(trans, extent_root, &key, &bg_item,
+                               sizeof(bg_item));
+       return ret;
+}
+
+/*
+ * Search through the extent tree to rebuild the 'used' member of the block
+ * group.
+ * However, since block group and extent item shares the extent tree,
+ * the extent item may also missing.
+ * In that case, we fill the 'used' with the length of the block group to
+ * ensure no write into the block group.
+ * Btrfsck will hate it but we will inform user to call '--init-extent-tree'
+ * if possible, or just salvage as much data as possible from the fs.
+ */
+static int rebuild_block_group(struct btrfs_trans_handle *trans,
+                              struct recover_control *rc,
+                              struct btrfs_root *root)
+{
+       struct chunk_record *chunk_rec;
+       struct btrfs_key search_key;
+       struct btrfs_path path;
+       u64 used = 0;
+       int ret = 0;
+
+       if (list_empty(&rc->rebuild_chunks))
+               return 0;
+
+       btrfs_init_path(&path);
+       list_for_each_entry(chunk_rec, &rc->rebuild_chunks, list) {
+               search_key.objectid = chunk_rec->offset;
+               search_key.type = BTRFS_EXTENT_ITEM_KEY;
+               search_key.offset = 0;
+               ret = btrfs_search_slot(NULL, root->fs_info->extent_root,
+                                       &search_key, &path, 0, 0);
+               if (ret < 0)
+                       goto out;
+               ret = calculate_bg_used(root->fs_info->extent_root,
+                                       chunk_rec, &path, &used);
+               /*
+                * Extent tree is damaged, better to rebuild the whole extent
+                * tree. Currently, change the used to chunk's len to prevent
+                * write/block reserve happening in that block group.
+                */
+               if (ret < 0) {
+                       fprintf(stderr,
+                               "Fail to search extent tree for block group: [%llu,%llu]\n",
+                               chunk_rec->offset,
+                               chunk_rec->offset + chunk_rec->length);
+                       fprintf(stderr,
+                               "Mark the block group full to prevent block rsv problems\n");
+                       used = chunk_rec->length;
+               }
+               btrfs_release_path(&path);
+               ret = __insert_block_group(trans, chunk_rec,
+                                          root->fs_info->extent_root,
+                                          used);
+               if (ret < 0)
+                       goto out;
+       }
+out:
+       btrfs_release_path(&path);
+       return ret;
+}
+
 static struct btrfs_root *
 open_ctree_with_broken_chunk(struct recover_control *rc)
 {
        struct btrfs_fs_info *fs_info;
        struct btrfs_super_block *disk_super;
        struct extent_buffer *eb;
-       u32 sectorsize;
-       u32 nodesize;
-       u32 leafsize;
-       u32 stripesize;
        int ret;
 
        fs_info = btrfs_new_fs_info(1, BTRFS_SUPER_INFO_OFFSET);
@@ -1198,6 +1439,7 @@ open_ctree_with_broken_chunk(struct recover_control *rc)
                fprintf(stderr, "Failed to allocate memory for fs_info\n");
                return ERR_PTR(-ENOMEM);
        }
+       fs_info->is_chunk_recover = 1;
 
        fs_info->fs_devices = rc->fs_devices;
        ret = btrfs_open_devices(fs_info->fs_devices, O_RDWR);
@@ -1206,25 +1448,24 @@ open_ctree_with_broken_chunk(struct recover_control *rc)
 
        disk_super = fs_info->super_copy;
        ret = btrfs_read_dev_super(fs_info->fs_devices->latest_bdev,
-                                  disk_super, fs_info->super_bytenr);
+                                  disk_super, fs_info->super_bytenr,
+                                  SBREAD_RECOVER);
        if (ret) {
                fprintf(stderr, "No valid btrfs found\n");
                goto out_devices;
        }
 
        memcpy(fs_info->fsid, &disk_super->fsid, BTRFS_FSID_SIZE);
+       fs_info->sectorsize = btrfs_super_sectorsize(disk_super);
+       fs_info->nodesize = btrfs_super_nodesize(disk_super);
+       fs_info->stripesize = btrfs_super_stripesize(disk_super);
 
-       ret = btrfs_check_fs_compatibility(disk_super, 1);
+       ret = btrfs_check_fs_compatibility(disk_super, OPEN_CTREE_WRITES);
        if (ret)
                goto out_devices;
 
-       nodesize = btrfs_super_nodesize(disk_super);
-       leafsize = btrfs_super_leafsize(disk_super);
-       sectorsize = btrfs_super_sectorsize(disk_super);
-       stripesize = btrfs_super_stripesize(disk_super);
-
-       __setup_root(nodesize, leafsize, sectorsize, stripesize,
-                    fs_info->chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+       btrfs_setup_root(fs_info->chunk_root, fs_info,
+                        BTRFS_CHUNK_TREE_OBJECTID);
 
        ret = build_device_maps_by_chunk_records(rc, fs_info->chunk_root);
        if (ret)
@@ -1236,7 +1477,7 @@ open_ctree_with_broken_chunk(struct recover_control *rc)
 
        eb = fs_info->tree_root->node;
        read_extent_buffer(eb, fs_info->chunk_tree_uuid,
-                          (unsigned long)btrfs_header_chunk_tree_uuid(eb),
+                          btrfs_header_chunk_tree_uuid(eb),
                           BTRFS_UUID_SIZE);
 
        return fs_info->fs_root;
@@ -1256,6 +1497,7 @@ static int recover_prepare(struct recover_control *rc, char *path)
        int ret;
        int fd;
        struct btrfs_super_block *sb;
+       char buf[BTRFS_SUPER_INFO_SIZE];
        struct btrfs_fs_devices *fs_devices;
 
        ret = 0;
@@ -1265,21 +1507,16 @@ static int recover_prepare(struct recover_control *rc, char *path)
                return -1;
        }
 
-       sb = malloc(sizeof(struct btrfs_super_block));
-       if (!sb) {
-               fprintf(stderr, "allocating memory for sb failed.\n");
-               ret = -ENOMEM;
-               goto fail_close_fd;
-       }
-
-       ret = btrfs_read_dev_super(fd, sb, BTRFS_SUPER_INFO_OFFSET);
+       sb = (struct btrfs_super_block*)buf;
+       ret = btrfs_read_dev_super(fd, sb, BTRFS_SUPER_INFO_OFFSET,
+                       SBREAD_RECOVER);
        if (ret) {
                fprintf(stderr, "read super block error\n");
-               goto fail_free_sb;
+               goto out_close_fd;
        }
 
        rc->sectorsize = btrfs_super_sectorsize(sb);
-       rc->leafsize = btrfs_super_leafsize(sb);
+       rc->nodesize = btrfs_super_nodesize(sb);
        rc->generation = btrfs_super_generation(sb);
        rc->chunk_root_generation = btrfs_super_chunk_root_generation(sb);
        rc->csum_size = btrfs_super_csum_size(sb);
@@ -1288,43 +1525,23 @@ static int recover_prepare(struct recover_control *rc, char *path)
        if (btrfs_super_flags(sb) & BTRFS_SUPER_FLAG_SEEDING) {
                fprintf(stderr, "this device is seed device\n");
                ret = -1;
-               goto fail_free_sb;
+               goto out_close_fd;
        }
 
-       ret = btrfs_scan_fs_devices(fd, path, &fs_devices, 0);
+       ret = btrfs_scan_fs_devices(fd, path, &fs_devices, 0, SBREAD_RECOVER, 0);
        if (ret)
-               goto fail_free_sb;
+               goto out_close_fd;
 
        rc->fs_devices = fs_devices;
 
        if (rc->verbose)
                print_all_devices(&rc->fs_devices->devices);
 
-fail_free_sb:
-       free(sb);
-fail_close_fd:
+out_close_fd:
        close(fd);
        return ret;
 }
 
-/*
- * This reads a line from the stdin and only returns non-zero if the
- * first whitespace delimited token is a case insensitive match with yes
- * or y.
- */
-static int ask_user(char *question)
-{
-       char buf[30] = {0,};
-       char *saveptr = NULL;
-       char *answer;
-
-       printf("%s [y/N]: ", question);
-
-       return fgets(buf, sizeof(buf) - 1, stdin) &&
-              (answer = strtok_r(buf, " \t\n\r", &saveptr)) &&
-              (!strcasecmp(answer, "yes") || !strcasecmp(answer, "y"));
-}
-
 static int btrfs_get_device_extents(u64 chunk_object,
                                    struct list_head *orphan_devexts,
                                    struct list_head *ret_list)
@@ -1368,16 +1585,19 @@ static int btrfs_verify_device_extents(struct block_group_record *bg,
                                       struct list_head *devexts, int ndevexts)
 {
        struct device_extent_record *devext;
-       u64 strpie_length;
+       u64 stripe_length;
        int expected_num_stripes;
 
        expected_num_stripes = calc_num_stripes(bg->flags);
        if (expected_num_stripes && expected_num_stripes != ndevexts)
                return 1;
 
-       strpie_length = calc_stripe_length(bg->flags, bg->offset, ndevexts);
+       if (check_num_stripes(bg->flags, ndevexts) < 0)
+               return 1;
+
+       stripe_length = calc_stripe_length(bg->flags, bg->offset, ndevexts);
        list_for_each_entry(devext, devexts, chunk_list) {
-               if (devext->length != strpie_length)
+               if (devext->length != stripe_length)
                        return 1;
        }
        return 0;
@@ -1434,7 +1654,7 @@ static int btrfs_calc_stripe_index(struct chunk_record *chunk, u64 logical)
                stripe_nr /= nr_data_stripes;
                index = (index + stripe_nr) % chunk->num_stripes;
        } else {
-               BUG_ON(1);
+               return -1;
        }
        return index;
 }
@@ -1497,6 +1717,7 @@ btrfs_rebuild_ordered_meta_chunk_stripes(struct recover_control *rc,
 again:
        er = container_of(cache, struct extent_record, cache);
        index = btrfs_calc_stripe_index(chunk, er->cache.start);
+       BUG_ON(index == -1);
        if (chunk->stripes[index].devid)
                goto next;
        list_for_each_entry_safe(devext, next, &devexts, chunk_list) {
@@ -1584,6 +1805,391 @@ static int btrfs_rebuild_chunk_stripes(struct recover_control *rc,
        return ret;
 }
 
+static int next_csum(struct btrfs_root *root,
+                    struct extent_buffer **leaf,
+                    struct btrfs_path *path,
+                    int *slot,
+                    u64 *csum_offset,
+                    u32 *tree_csum,
+                    u64 end,
+                    struct btrfs_key *key)
+{
+       int ret = 0;
+       struct btrfs_root *csum_root = root->fs_info->csum_root;
+       struct btrfs_csum_item *csum_item;
+       u32 blocksize = root->fs_info->sectorsize;
+       u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+       int csums_in_item = btrfs_item_size_nr(*leaf, *slot) / csum_size;
+
+       if (*csum_offset >= csums_in_item) {
+               ++(*slot);
+               *csum_offset = 0;
+               if (*slot >= btrfs_header_nritems(*leaf)) {
+                       ret = btrfs_next_leaf(csum_root, path);
+                       if (ret < 0)
+                               return -1;
+                       else if (ret > 0)
+                               return 1;
+                       *leaf = path->nodes[0];
+                       *slot = path->slots[0];
+               }
+               btrfs_item_key_to_cpu(*leaf, key, *slot);
+       }
+
+       if (key->offset + (*csum_offset) * blocksize >= end)
+               return 2;
+       csum_item = btrfs_item_ptr(*leaf, *slot, struct btrfs_csum_item);
+       csum_item = (struct btrfs_csum_item *)((unsigned char *)csum_item
+                                            + (*csum_offset) * csum_size);
+       read_extent_buffer(*leaf, tree_csum,
+                         (unsigned long)csum_item, csum_size);
+       return ret;
+}
+
+static u64 calc_data_offset(struct btrfs_key *key,
+                           struct chunk_record *chunk,
+                           u64 dev_offset,
+                           u64 csum_offset,
+                           u32 blocksize)
+{
+       u64 data_offset;
+       int logical_stripe_nr;
+       int dev_stripe_nr;
+       int nr_data_stripes;
+
+       data_offset = key->offset + csum_offset * blocksize - chunk->offset;
+       nr_data_stripes = chunk->num_stripes;
+
+       if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID5)
+               nr_data_stripes -= 1;
+       else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6)
+               nr_data_stripes -= 2;
+
+       logical_stripe_nr = data_offset / chunk->stripe_len;
+       dev_stripe_nr = logical_stripe_nr / nr_data_stripes;
+
+       data_offset -= logical_stripe_nr * chunk->stripe_len;
+       data_offset += dev_stripe_nr * chunk->stripe_len;
+
+       return dev_offset + data_offset;
+}
+
+static int check_one_csum(int fd, u64 start, u32 len, u32 tree_csum)
+{
+       char *data;
+       int ret = 0;
+       u32 csum_result = ~(u32)0;
+
+       data = malloc(len);
+       if (!data)
+               return -1;
+       ret = pread64(fd, data, len, start);
+       if (ret < 0 || ret != len) {
+               ret = -1;
+               goto out;
+       }
+       ret = 0;
+       csum_result = btrfs_csum_data(data, csum_result, len);
+       btrfs_csum_final(csum_result, (u8 *)&csum_result);
+       if (csum_result != tree_csum)
+               ret = 1;
+out:
+       free(data);
+       return ret;
+}
+
+static u64 item_end_offset(struct btrfs_root *root, struct btrfs_key *key,
+                          struct extent_buffer *leaf, int slot) {
+       u32 blocksize = root->fs_info->sectorsize;
+       u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
+       u64 offset = btrfs_item_size_nr(leaf, slot);
+       offset /= csum_size;
+       offset *= blocksize;
+       offset += key->offset;
+
+       return offset;
+}
+
+static int insert_stripe(struct list_head *devexts,
+                        struct recover_control *rc,
+                        struct chunk_record *chunk,
+                        int index) {
+       struct device_extent_record *devext;
+       struct btrfs_device *dev;
+
+       devext = list_entry(devexts->next, struct device_extent_record,
+                           chunk_list);
+       dev = btrfs_find_device_by_devid(rc->fs_devices, devext->objectid,
+                                       0);
+       if (!dev)
+               return -ENOENT;
+       if (btrfs_find_device_by_devid(rc->fs_devices, devext->objectid, 1)) {
+               error("unexpected: found another device with id %llu",
+                               (unsigned long long)devext->objectid);
+               return -EINVAL;
+       }
+
+       chunk->stripes[index].devid = devext->objectid;
+       chunk->stripes[index].offset = devext->offset;
+       memcpy(chunk->stripes[index].dev_uuid, dev->uuid, BTRFS_UUID_SIZE);
+
+       list_move(&devext->chunk_list, &chunk->dextents);
+
+       return 0;
+}
+
+static inline int count_devext_records(struct list_head *record_list)
+{
+       int num_of_records = 0;
+       struct device_extent_record *devext;
+
+       list_for_each_entry(devext, record_list, chunk_list)
+               num_of_records++;
+
+       return num_of_records;
+}
+
+static int fill_chunk_up(struct chunk_record *chunk, struct list_head *devexts,
+                        struct recover_control *rc)
+{
+       int ret = 0;
+       int i;
+
+       for (i = 0; i < chunk->num_stripes; i++) {
+               if (!chunk->stripes[i].devid) {
+                       ret = insert_stripe(devexts, rc, chunk, i);
+                       if (ret)
+                               break;
+               }
+       }
+
+       return ret;
+}
+
+#define EQUAL_STRIPE (1 << 0)
+
+static int rebuild_raid_data_chunk_stripes(struct recover_control *rc,
+                                          struct btrfs_root *root,
+                                          struct chunk_record *chunk,
+                                          u8 *flags)
+{
+       int i;
+       int ret = 0;
+       int slot;
+       struct btrfs_path path;
+       struct btrfs_key prev_key;
+       struct btrfs_key key;
+       struct btrfs_root *csum_root;
+       struct extent_buffer *leaf;
+       struct device_extent_record *devext;
+       struct device_extent_record *next;
+       struct btrfs_device *dev;
+       u64 start = chunk->offset;
+       u64 end = start + chunk->stripe_len;
+       u64 chunk_end = chunk->offset + chunk->length;
+       u64 csum_offset = 0;
+       u64 data_offset;
+       u32 blocksize = root->fs_info->sectorsize;
+       u32 tree_csum;
+       int index = 0;
+       int num_unordered = 0;
+       LIST_HEAD(unordered);
+       LIST_HEAD(candidates);
+
+       csum_root = root->fs_info->csum_root;
+       btrfs_init_path(&path);
+       list_splice_init(&chunk->dextents, &candidates);
+again:
+       if (list_is_last(candidates.next, &candidates))
+               goto out;
+
+       key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+       key.type = BTRFS_EXTENT_CSUM_KEY;
+       key.offset = start;
+
+       ret = btrfs_search_slot(NULL, csum_root, &key, &path, 0, 0);
+       if (ret < 0) {
+               fprintf(stderr, "Search csum failed(%d)\n", ret);
+               goto fail_out;
+       }
+       leaf = path.nodes[0];
+       slot = path.slots[0];
+       if (ret > 0) {
+               if (slot >= btrfs_header_nritems(leaf)) {
+                       ret = btrfs_next_leaf(csum_root, &path);
+                       if (ret < 0) {
+                               fprintf(stderr,
+                                       "Walk tree failed(%d)\n", ret);
+                               goto fail_out;
+                       } else if (ret > 0) {
+                               slot = btrfs_header_nritems(leaf) - 1;
+                               btrfs_item_key_to_cpu(leaf, &key, slot);
+                               if (item_end_offset(root, &key, leaf, slot)
+                                                               > start) {
+                                       csum_offset = start - key.offset;
+                                       csum_offset /= blocksize;
+                                       goto next_csum;
+                               }
+                               goto next_stripe;
+                       }
+                       leaf = path.nodes[0];
+                       slot = path.slots[0];
+               }
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               ret = btrfs_previous_item(csum_root, &path, 0,
+                                         BTRFS_EXTENT_CSUM_KEY);
+               if (ret < 0)
+                       goto fail_out;
+               else if (ret > 0) {
+                       if (key.offset >= end)
+                               goto next_stripe;
+                       else
+                               goto next_csum;
+               }
+               leaf = path.nodes[0];
+               slot = path.slots[0];
+
+               btrfs_item_key_to_cpu(leaf, &prev_key, slot);
+               if (item_end_offset(root, &prev_key, leaf, slot) > start) {
+                       csum_offset = start - prev_key.offset;
+                       csum_offset /= blocksize;
+                       btrfs_item_key_to_cpu(leaf, &key, slot);
+               } else {
+                       if (key.offset >= end)
+                               goto next_stripe;
+               }
+
+               if (key.offset + csum_offset * blocksize > chunk_end)
+                       goto out;
+       }
+next_csum:
+       ret = next_csum(root, &leaf, &path, &slot, &csum_offset, &tree_csum,
+                       end, &key);
+       if (ret < 0) {
+               fprintf(stderr, "Fetch csum failed\n");
+               goto fail_out;
+       } else if (ret == 1) {
+               if (!(*flags & EQUAL_STRIPE))
+                       *flags |= EQUAL_STRIPE;
+               goto out;
+       } else if (ret == 2)
+               goto next_stripe;
+
+       list_for_each_entry_safe(devext, next, &candidates, chunk_list) {
+               data_offset = calc_data_offset(&key, chunk, devext->offset,
+                                              csum_offset, blocksize);
+               dev = btrfs_find_device_by_devid(rc->fs_devices,
+                                                devext->objectid, 0);
+               if (!dev) {
+                       ret = 1;
+                       goto fail_out;
+               }
+               BUG_ON(btrfs_find_device_by_devid(rc->fs_devices,
+                                                 devext->objectid, 1));
+
+               ret = check_one_csum(dev->fd, data_offset, blocksize,
+                                    tree_csum);
+               if (ret < 0)
+                       goto fail_out;
+               else if (ret > 0)
+                       list_move(&devext->chunk_list, &unordered);
+       }
+
+       if (list_empty(&candidates)) {
+               num_unordered = count_devext_records(&unordered);
+               if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6
+                                       && num_unordered == 2) {
+                       btrfs_release_path(&path);
+                       ret = fill_chunk_up(chunk, &unordered, rc);
+                       return ret;
+               }
+
+               goto next_stripe;
+       }
+
+       if (list_is_last(candidates.next, &candidates)) {
+               index = btrfs_calc_stripe_index(chunk,
+                       key.offset + csum_offset * blocksize);
+               BUG_ON(index == -1);
+               if (chunk->stripes[index].devid)
+                       goto next_stripe;
+               ret = insert_stripe(&candidates, rc, chunk, index);
+               if (ret)
+                       goto fail_out;
+       } else {
+               csum_offset++;
+               goto next_csum;
+       }
+next_stripe:
+       start = btrfs_next_stripe_logical_offset(chunk, start);
+       end = min(start + chunk->stripe_len, chunk_end);
+       list_splice_init(&unordered, &candidates);
+       btrfs_release_path(&path);
+       csum_offset = 0;
+       if (end < chunk_end)
+               goto again;
+out:
+       ret = 0;
+       list_splice_init(&candidates, &unordered);
+       num_unordered = count_devext_records(&unordered);
+       if (num_unordered == 1) {
+               for (i = 0; i < chunk->num_stripes; i++) {
+                       if (!chunk->stripes[i].devid) {
+                               index = i;
+                               break;
+                       }
+               }
+               ret = insert_stripe(&unordered, rc, chunk, index);
+               if (ret)
+                       goto fail_out;
+       } else {
+               if ((num_unordered == 2 && chunk->type_flags
+                       & BTRFS_BLOCK_GROUP_RAID5)
+                || (num_unordered == 3 && chunk->type_flags
+                       & BTRFS_BLOCK_GROUP_RAID6)) {
+                       ret = fill_chunk_up(chunk, &unordered, rc);
+               }
+       }
+fail_out:
+       ret = !!ret || (list_empty(&unordered) ? 0 : 1);
+       list_splice_init(&candidates, &chunk->dextents);
+       list_splice_init(&unordered, &chunk->dextents);
+       btrfs_release_path(&path);
+
+       return ret;
+}
+
+static int btrfs_rebuild_ordered_data_chunk_stripes(struct recover_control *rc,
+                                          struct btrfs_root *root)
+{
+       struct chunk_record *chunk;
+       struct chunk_record *next;
+       int ret = 0;
+       int err;
+       u8 flags;
+
+       list_for_each_entry_safe(chunk, next, &rc->unrepaired_chunks, list) {
+               if ((chunk->type_flags & BTRFS_BLOCK_GROUP_DATA)
+                && (chunk->type_flags & BTRFS_ORDERED_RAID)) {
+                       flags = 0;
+                       err = rebuild_raid_data_chunk_stripes(rc, root, chunk,
+                                                             &flags);
+                       if (err) {
+                               list_move(&chunk->list, &rc->bad_chunks);
+                               if (flags & EQUAL_STRIPE)
+                                       fprintf(stderr,
+                       "Failure: too many equal stripes in chunk[%llu %llu]\n",
+                                               chunk->offset, chunk->length);
+                               if (!ret)
+                                       ret = err;
+                       } else
+                               list_move(&chunk->list, &rc->good_chunks);
+               }
+       }
+       return ret;
+}
+
 static int btrfs_recover_chunks(struct recover_control *rc)
 {
        struct chunk_record *chunk;
@@ -1599,10 +2205,9 @@ static int btrfs_recover_chunks(struct recover_control *rc)
                nstripes = btrfs_get_device_extents(bg->objectid,
                                                    &rc->devext.no_chunk_orphans,
                                                    &devexts);
-               chunk = malloc(btrfs_chunk_record_size(nstripes));
+               chunk = calloc(1, btrfs_chunk_record_size(nstripes));
                if (!chunk)
                        return -ENOMEM;
-               memset(chunk, 0, btrfs_chunk_record_size(nstripes));
                INIT_LIST_HEAD(&chunk->dextents);
                chunk->bg_rec = bg;
                chunk->cache.start = bg->objectid;
@@ -1621,8 +2226,16 @@ static int btrfs_recover_chunks(struct recover_control *rc)
                chunk->sub_stripes = calc_sub_nstripes(bg->flags);
 
                ret = insert_cache_extent(&rc->chunk, &chunk->cache);
+               if (ret == -EEXIST) {
+                       error("duplicate entry in cache start %llu size %llu",
+                                       (unsigned long long)chunk->cache.start,
+                                       (unsigned long long)chunk->cache.size);
+                       free(chunk);
+                       return ret;
+               }
                BUG_ON(ret);
 
+               list_del_init(&bg->list);
                if (!nstripes) {
                        list_add_tail(&chunk->list, &rc->bad_chunks);
                        continue;
@@ -1653,8 +2266,35 @@ static int btrfs_recover_chunks(struct recover_control *rc)
        return 0;
 }
 
+static inline int is_chunk_overlap(struct chunk_record *chunk1,
+                                  struct chunk_record *chunk2)
+{
+       if (chunk1->offset >= chunk2->offset + chunk2->length ||
+           chunk1->offset + chunk1->length <= chunk2->offset)
+               return 0;
+       return 1;
+}
+
+/* Move invalid(overlap with good chunks) rebuild chunks to bad chunk list */
+static void validate_rebuild_chunks(struct recover_control *rc)
+{
+       struct chunk_record *good;
+       struct chunk_record *rebuild;
+       struct chunk_record *tmp;
+
+       list_for_each_entry_safe(rebuild, tmp, &rc->rebuild_chunks, list) {
+               list_for_each_entry(good, &rc->good_chunks, list) {
+                       if (is_chunk_overlap(rebuild, good)) {
+                               list_move_tail(&rebuild->list,
+                                              &rc->bad_chunks);
+                               break;
+                       }
+               }
+       }
+}
+
 /*
- * Return 0 when succesful, < 0 on error and > 0 if aborted by user
+ * Return 0 when successful, < 0 on error and > 0 if aborted by user
  */
 int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
 {
@@ -1687,8 +2327,7 @@ int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
        print_scan_result(&rc);
 
        ret = check_chunks(&rc.chunk, &rc.bg, &rc.devext, &rc.good_chunks,
-                          &rc.bad_chunks, 1);
-       print_check_result(&rc);
+                          &rc.bad_chunks, &rc.rebuild_chunks, 1);
        if (ret) {
                if (!list_empty(&rc.bg.block_groups) ||
                    !list_empty(&rc.devext.no_chunk_orphans)) {
@@ -1696,14 +2335,13 @@ int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
                        if (ret)
                                goto fail_rc;
                }
-               /*
-                * If the chunk is healthy, its block group item and device
-                * extent item should be written on the disks. So, it is very
-                * likely that the bad chunk is a old one that has been
-                * droppped from the fs. Don't deal with them now, we will
-                * check it after the fs is opened.
-                */
+       } else {
+               print_check_result(&rc);
+               printf("Check chunks successfully with no orphans\n");
+               goto fail_rc;
        }
+       validate_rebuild_chunks(&rc);
+       print_check_result(&rc);
 
        root = open_ctree_with_broken_chunk(&rc);
        if (IS_ERR(root)) {
@@ -1718,6 +2356,12 @@ int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
                goto fail_close_ctree;
        }
 
+       ret = btrfs_rebuild_ordered_data_chunk_stripes(&rc, root);
+       if (ret) {
+               fprintf(stderr, "Failed to rebuild ordered chunk stripes.\n");
+               goto fail_close_ctree;
+       }
+
        if (!rc.yes) {
                ret = ask_user("We are going to rebuild the chunk tree on disk, it might destroy the old metadata on the disk, Are you sure?");
                if (!ret) {
@@ -1727,6 +2371,7 @@ int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
        }
 
        trans = btrfs_start_transaction(root, 1);
+       BUG_ON(IS_ERR(trans));
        ret = remove_chunk_extent_item(trans, &rc, root);
        BUG_ON(ret);
 
@@ -1736,6 +2381,12 @@ int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
        ret = rebuild_sys_array(&rc, root);
        BUG_ON(ret);
 
+       ret = rebuild_block_group(trans, &rc, root);
+       if (ret) {
+               printf("Fail to rebuild block groups.\n");
+               printf("Recommend to run 'btrfs check --init-extent-tree <dev>' after recovery\n");
+       }
+
        btrfs_commit_transaction(trans, root);
 fail_close_ctree:
        close_ctree(root);