btrfs-progs: build: Do not use cp -a to install library links
[platform/upstream/btrfs-progs.git] / disk-io.c
index 6ef35bb..58eae70 100644 (file)
--- a/disk-io.c
+++ b/disk-io.c
  * Boston, MA 021110-1307, USA.
  */
 
-#define _XOPEN_SOURCE 600
-#define __USE_XOPEN2K
-#define _GNU_SOURCE 1
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <unistd.h>
+#include <uuid/uuid.h>
 #include "kerncompat.h"
 #include "radix-tree.h"
 #include "ctree.h"
 #include "crc32c.h"
 #include "utils.h"
 #include "print-tree.h"
+#include "rbtree-utils.h"
 
-static int close_all_devices(struct btrfs_fs_info *fs_info);
+/* Error codes returned by check_tree_block() (not errno values) */
+#define BTRFS_BAD_BYTENR               (-1)
+#define BTRFS_BAD_FSID                 (-2)
+#define BTRFS_BAD_LEVEL                        (-3)
+#define BTRFS_BAD_NRITEMS              (-4)
 
-static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
+/* Calculate max possible nritems for a leaf/node */
+static u32 max_nritems(u8 level, u32 nodesize)
 {
 
-       struct btrfs_fs_devices *fs_devices;
-       int ret = 1;
+       if (level == 0)
+               return ((nodesize - sizeof(struct btrfs_header)) /
+                       sizeof(struct btrfs_item));
+       return ((nodesize - sizeof(struct btrfs_header)) /
+               sizeof(struct btrfs_key_ptr));
+}
 
-       if (buf->start != btrfs_header_bytenr(buf)) {
-               printk("Check tree block failed, want=%Lu, have=%Lu\n",
-                      buf->start, btrfs_header_bytenr(buf));
-               return ret;
-       }
+static int check_tree_block(struct btrfs_fs_info *fs_info,
+                           struct extent_buffer *buf)
+{
 
-       fs_devices = root->fs_info->fs_devices;
+       struct btrfs_fs_devices *fs_devices;
+       u32 nodesize = fs_info->nodesize;
+       int ret = BTRFS_BAD_FSID;
+
+       if (buf->start != btrfs_header_bytenr(buf))
+               return BTRFS_BAD_BYTENR;
+       if (btrfs_header_level(buf) >= BTRFS_MAX_LEVEL)
+               return BTRFS_BAD_LEVEL;
+       if (btrfs_header_nritems(buf) > max_nritems(btrfs_header_level(buf),
+                                                   nodesize))
+               return BTRFS_BAD_NRITEMS;
+
+       /* Only a leaf (level 0) may have zero items */
+       if (btrfs_header_nritems(buf) == 0 &&
+           btrfs_header_level(buf) != 0)
+               return BTRFS_BAD_NRITEMS;
+
+       fs_devices = fs_info->fs_devices;
        while (fs_devices) {
-               if (!memcmp_extent_buffer(buf, fs_devices->fsid,
-                                         (unsigned long)btrfs_header_fsid(buf),
+               if (fs_info->ignore_fsid_mismatch ||
+                   !memcmp_extent_buffer(buf, fs_devices->fsid,
+                                         btrfs_header_fsid(),
                                          BTRFS_FSID_SIZE)) {
                        ret = 0;
                        break;
@@ -62,93 +86,128 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
        return ret;
 }
 
-u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
+static void print_tree_block_error(struct btrfs_fs_info *fs_info,
+                               struct extent_buffer *eb,
+                               int err)
+{
+       char fs_uuid[BTRFS_UUID_UNPARSED_SIZE] = {'\0'};
+       char found_uuid[BTRFS_UUID_UNPARSED_SIZE] = {'\0'};
+       u8 buf[BTRFS_UUID_SIZE];
+
+       switch (err) {
+       case BTRFS_BAD_FSID:
+               read_extent_buffer(eb, buf, btrfs_header_fsid(),
+                                  BTRFS_UUID_SIZE);
+               uuid_unparse(buf, found_uuid);
+               uuid_unparse(fs_info->fsid, fs_uuid);
+               fprintf(stderr, "fsid mismatch, want=%s, have=%s\n",
+                       fs_uuid, found_uuid);
+               break;
+       case BTRFS_BAD_BYTENR:
+               fprintf(stderr, "bytenr mismatch, want=%llu, have=%llu\n",
+                       eb->start, btrfs_header_bytenr(eb));
+               break;
+       case BTRFS_BAD_LEVEL:
+               fprintf(stderr, "bad level, %u > %u\n",
+                       btrfs_header_level(eb), BTRFS_MAX_LEVEL);
+               break;
+       case BTRFS_BAD_NRITEMS:
+               fprintf(stderr, "invalid nr_items: %u\n",
+                       btrfs_header_nritems(eb));
+               break;
+       }
+}
+
+u32 btrfs_csum_data(char *data, u32 seed, size_t len)
 {
        return crc32c(seed, data, len);
 }
 
-void btrfs_csum_final(u32 crc, char *result)
+void btrfs_csum_final(u32 crc, u8 *result)
 {
-       *(__le32 *)result = ~cpu_to_le32(crc);
+       put_unaligned_le32(~crc, result);
 }
 
-int csum_tree_block_size(struct extent_buffer *buf, u16 csum_size,
-                        int verify)
+static int __csum_tree_block_size(struct extent_buffer *buf, u16 csum_size,
+                                 int verify, int silent)
 {
-       char *result;
+       u8 result[BTRFS_CSUM_SIZE];
        u32 len;
        u32 crc = ~(u32)0;
 
-       result = malloc(csum_size * sizeof(char));
-       if (!result)
-               return 1;
-
        len = buf->len - BTRFS_CSUM_SIZE;
        crc = crc32c(crc, buf->data + BTRFS_CSUM_SIZE, len);
        btrfs_csum_final(crc, result);
 
        if (verify) {
                if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
-                       printk("checksum verify failed on %llu found %X "
-                              "wanted %X\n", (unsigned long long)buf->start,
-                              *((int *)result), *((char *)buf->data));
-                       free(result);
+                       if (!silent)
+                               printk("checksum verify failed on %llu found %08X wanted %08X\n",
+                                      (unsigned long long)buf->start,
+                                      *((u32 *)result),
+                                      *((u32*)(char *)buf->data));
                        return 1;
                }
        } else {
                write_extent_buffer(buf, result, 0, csum_size);
        }
-       free(result);
        return 0;
 }
 
-int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
-                   int verify)
+int csum_tree_block_size(struct extent_buffer *buf, u16 csum_size, int verify)
+{
+       return __csum_tree_block_size(buf, csum_size, verify, 0);
+}
+
+int verify_tree_block_csum_silent(struct extent_buffer *buf, u16 csum_size)
+{
+       return __csum_tree_block_size(buf, csum_size, 1, 1);
+}
+
+int csum_tree_block(struct btrfs_fs_info *fs_info,
+                   struct extent_buffer *buf, int verify)
 {
        u16 csum_size =
-               btrfs_super_csum_size(root->fs_info->super_copy);
+               btrfs_super_csum_size(fs_info->super_copy);
+       if (verify && fs_info->suppress_check_block_errors)
+               return verify_tree_block_csum_silent(buf, csum_size);
        return csum_tree_block_size(buf, csum_size, verify);
 }
 
-struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
                                            u64 bytenr, u32 blocksize)
 {
-       return find_extent_buffer(&root->fs_info->extent_cache,
+       return find_extent_buffer(&fs_info->extent_cache,
                                  bytenr, blocksize);
 }
 
-struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
-                                                u64 bytenr, u32 blocksize)
+struct extent_buffer* btrfs_find_create_tree_block(
+               struct btrfs_fs_info *fs_info, u64 bytenr)
 {
-       return alloc_extent_buffer(&root->fs_info->extent_cache, bytenr,
-                                  blocksize);
+       return alloc_extent_buffer(&fs_info->extent_cache, bytenr,
+                       fs_info->nodesize);
 }
 
-int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
-                        u64 parent_transid)
+void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
+               u64 parent_transid)
 {
-       int ret;
        struct extent_buffer *eb;
        u64 length;
        struct btrfs_multi_bio *multi = NULL;
        struct btrfs_device *device;
 
-       eb = btrfs_find_tree_block(root, bytenr, blocksize);
-       if (eb && btrfs_buffer_uptodate(eb, parent_transid)) {
-               free_extent_buffer(eb);
-               return 0;
+       eb = btrfs_find_tree_block(fs_info, bytenr, fs_info->nodesize);
+       if (!(eb && btrfs_buffer_uptodate(eb, parent_transid)) &&
+           !btrfs_map_block(fs_info, READ, bytenr, &length, &multi, 0,
+                            NULL)) {
+               device = multi->stripes[0].dev;
+               device->total_ios++;
+               readahead(device->fd, multi->stripes[0].physical,
+                               fs_info->nodesize);
        }
 
-       length = blocksize;
-       ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-                             bytenr, &length, &multi, 0, NULL);
-       BUG_ON(ret);
-       device = multi->stripes[0].dev;
-       device->total_ios++;
-       blocksize = min(blocksize, (u32)(64 * 1024));
-       readahead(device->fd, multi->stripes[0].physical, blocksize);
+       free_extent_buffer(eb);
        kfree(multi);
-       return 0;
 }
 
 static int verify_parent_transid(struct extent_io_tree *io_tree,
@@ -170,19 +229,20 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
               (unsigned long long)parent_transid,
               (unsigned long long)btrfs_header_generation(eb));
        if (ignore) {
+               eb->flags |= EXTENT_BAD_TRANSID;
                printk("Ignoring transid failure\n");
                return 0;
        }
 
        ret = 1;
 out:
-       clear_extent_buffer_uptodate(io_tree, eb);
+       clear_extent_buffer_uptodate(eb);
        return ret;
 
 }
 
 
-static int read_whole_eb(struct btrfs_fs_info *info, struct extent_buffer *eb, int mirror)
+int read_whole_eb(struct btrfs_fs_info *info, struct extent_buffer *eb, int mirror)
 {
        unsigned long offset = 0;
        struct btrfs_multi_bio *multi = NULL;
@@ -193,26 +253,40 @@ static int read_whole_eb(struct btrfs_fs_info *info, struct extent_buffer *eb, i
 
        while (bytes_left) {
                read_len = bytes_left;
-               ret = btrfs_map_block(&info->mapping_tree, READ,
-                                     eb->start + offset, &read_len, &multi,
-                                     mirror, NULL);
-               if (ret) {
-                       printk("Couldn't map the block %Lu\n", eb->start + offset);
-                       kfree(multi);
-                       return -EIO;
-               }
-               device = multi->stripes[0].dev;
+               device = NULL;
+
+               if (!info->on_restoring &&
+                   eb->start != BTRFS_SUPER_INFO_OFFSET) {
+                       ret = btrfs_map_block(info, READ, eb->start + offset,
+                                             &read_len, &multi, mirror, NULL);
+                       if (ret) {
+                               printk("Couldn't map the block %Lu\n", eb->start + offset);
+                               kfree(multi);
+                               return -EIO;
+                       }
+                       device = multi->stripes[0].dev;
+
+                       if (device->fd <= 0) {
+                               kfree(multi);
+                               return -EIO;
+                       }
 
-               if (device->fd == 0) {
+                       eb->fd = device->fd;
+                       device->total_ios++;
+                       eb->dev_bytenr = multi->stripes[0].physical;
                        kfree(multi);
-                       return -EIO;
-               }
+                       multi = NULL;
+               } else {
+                       /* special case for restore metadump */
+                       list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
+                               if (device->devid == 1)
+                                       break;
+                       }
 
-               eb->fd = device->fd;
-               device->total_ios++;
-               eb->dev_bytenr = multi->stripes[0].physical;
-               kfree(multi);
-               multi = NULL;
+                       eb->fd = device->fd;
+                       eb->dev_bytenr = eb->start;
+                       device->total_ios++;
+               }
 
                if (read_len > bytes_left)
                        read_len = bytes_left;
@@ -226,47 +300,70 @@ static int read_whole_eb(struct btrfs_fs_info *info, struct extent_buffer *eb, i
        return 0;
 }
 
-struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
-                                    u32 blocksize, u64 parent_transid)
+struct extent_buffer* read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
+               u64 parent_transid)
 {
        int ret;
        struct extent_buffer *eb;
        u64 best_transid = 0;
+       u32 sectorsize = fs_info->sectorsize;
        int mirror_num = 0;
        int good_mirror = 0;
        int num_copies;
        int ignore = 0;
 
-       eb = btrfs_find_create_tree_block(root, bytenr, blocksize);
+       /*
+        * Refuse to create a tree block whose bytenr is below sectorsize
+        * or is not aligned to sectorsize.
+        * Creating such a block would free the overlapping extent buffer,
+        * causing use-after-free bugs on fuzzed images.
+        */
+       if (bytenr < sectorsize || !IS_ALIGNED(bytenr, sectorsize)) {
+               error("tree block bytenr %llu is not aligned to sectorsize %u",
+                     bytenr, sectorsize);
+               return ERR_PTR(-EIO);
+       }
+
+       eb = btrfs_find_create_tree_block(fs_info, bytenr);
        if (!eb)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
 
        if (btrfs_buffer_uptodate(eb, parent_transid))
                return eb;
 
        while (1) {
-               ret = read_whole_eb(root->fs_info, eb, mirror_num);
-               if (ret == 0 && check_tree_block(root, eb) == 0 &&
-                   csum_tree_block(root, eb, 1) == 0 &&
+               ret = read_whole_eb(fs_info, eb, mirror_num);
+               if (ret == 0 && csum_tree_block(fs_info, eb, 1) == 0 &&
+                   check_tree_block(fs_info, eb) == 0 &&
                    verify_parent_transid(eb->tree, eb, parent_transid, ignore)
                    == 0) {
+                       if (eb->flags & EXTENT_BAD_TRANSID &&
+                           list_empty(&eb->recow)) {
+                               list_add_tail(&eb->recow,
+                                             &fs_info->recow_ebs);
+                               eb->refs++;
+                       }
                        btrfs_set_buffer_uptodate(eb);
                        return eb;
                }
                if (ignore) {
-                       if (check_tree_block(root, eb))
-                               printk("read block failed check_tree_block\n");
-                       else
-                               printk("Csum didn't match\n");
+                       if (check_tree_block(fs_info, eb)) {
+                               if (!fs_info->suppress_check_block_errors)
+                                       print_tree_block_error(fs_info, eb,
+                                               check_tree_block(fs_info, eb));
+                       } else {
+                               if (!fs_info->suppress_check_block_errors)
+                                       fprintf(stderr, "Csum didn't match\n");
+                       }
+                       ret = -EIO;
                        break;
                }
-               num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
-                                             eb->start, eb->len);
+               num_copies = btrfs_num_copies(fs_info, eb->start, eb->len);
                if (num_copies == 1) {
                        ignore = 1;
                        continue;
                }
-               if (btrfs_header_generation(eb) > best_transid) {
+               if (btrfs_header_generation(eb) > best_transid && mirror_num) {
                        best_transid = btrfs_header_generation(eb);
                        good_mirror = mirror_num;
                }
@@ -278,154 +375,45 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
                }
        }
        free_extent_buffer(eb);
-       return NULL;
-}
-
-static int rmw_eb(struct btrfs_fs_info *info,
-                 struct extent_buffer *eb, struct extent_buffer *orig_eb)
-{
-       int ret;
-       unsigned long orig_off = 0;
-       unsigned long dest_off = 0;
-       unsigned long copy_len = eb->len;
-
-       ret = read_whole_eb(info, eb, 0);
-       if (ret)
-               return ret;
-
-       if (eb->start + eb->len <= orig_eb->start ||
-           eb->start >= orig_eb->start + orig_eb->len)
-               return 0;
-       /*
-        * | ----- orig_eb ------- |
-        *         | ----- stripe -------  |
-        *         | ----- orig_eb ------- |
-        *              | ----- orig_eb ------- |
-        */
-       if (eb->start > orig_eb->start)
-               orig_off = eb->start - orig_eb->start;
-       if (orig_eb->start > eb->start)
-               dest_off = orig_eb->start - eb->start;
-
-       if (copy_len > orig_eb->len - orig_off)
-               copy_len = orig_eb->len - orig_off;
-       if (copy_len > eb->len - dest_off)
-               copy_len = eb->len - dest_off;
-
-       memcpy(eb->data + dest_off, orig_eb->data + orig_off, copy_len);
-       return 0;
+       return ERR_PTR(ret);
 }
 
-static void split_eb_for_raid56(struct btrfs_fs_info *info,
-                               struct extent_buffer *orig_eb,
-                              struct extent_buffer **ebs,
-                              u64 stripe_len, u64 *raid_map,
-                              int num_stripes)
+int read_extent_data(struct btrfs_fs_info *fs_info, char *data, u64 logical,
+                    u64 *len, int mirror)
 {
-       struct extent_buffer *eb;
-       u64 start = orig_eb->start;
-       u64 this_eb_start;
-       int i;
-       int ret;
-
-       for (i = 0; i < num_stripes; i++) {
-               if (raid_map[i] >= BTRFS_RAID5_P_STRIPE)
-                       break;
-
-               eb = malloc(sizeof(struct extent_buffer) + stripe_len);
-               if (!eb)
-                       BUG();
-               memset(eb, 0, sizeof(struct extent_buffer) + stripe_len);
-
-               eb->start = raid_map[i];
-               eb->len = stripe_len;
-               eb->refs = 1;
-               eb->flags = 0;
-               eb->fd = -1;
-               eb->dev_bytenr = (u64)-1;
-
-               this_eb_start = raid_map[i];
+       u64 offset = 0;
+       struct btrfs_multi_bio *multi = NULL;
+       struct btrfs_device *device;
+       int ret = 0;
+       u64 max_len = *len;
 
-               if (start > this_eb_start ||
-                   start + orig_eb->len < this_eb_start + stripe_len) {
-                       ret = rmw_eb(info, eb, orig_eb);
-                       BUG_ON(ret);
-               } else {
-                       memcpy(eb->data, orig_eb->data + eb->start - start, stripe_len);
-               }
-               ebs[i] = eb;
+       ret = btrfs_map_block(fs_info, READ, logical, len, &multi, mirror,
+                             NULL);
+       if (ret) {
+               fprintf(stderr, "Couldn't map the block %llu\n",
+                               logical + offset);
+               goto err;
        }
-}
-
-static int write_raid56_with_parity(struct btrfs_fs_info *info,
-                                   struct extent_buffer *eb,
-                                   struct btrfs_multi_bio *multi,
-                                   u64 stripe_len, u64 *raid_map)
-{
-       struct extent_buffer *ebs[multi->num_stripes], *p_eb = NULL, *q_eb = NULL;
-       int i;
-       int j;
-       int ret;
-       int alloc_size = eb->len;
-
-       if (stripe_len > alloc_size)
-               alloc_size = stripe_len;
-
-       split_eb_for_raid56(info, eb, ebs, stripe_len, raid_map,
-                           multi->num_stripes);
+       device = multi->stripes[0].dev;
 
-       for (i = 0; i < multi->num_stripes; i++) {
-               struct extent_buffer *new_eb;
-               if (raid_map[i] < BTRFS_RAID5_P_STRIPE) {
-                       ebs[i]->dev_bytenr = multi->stripes[i].physical;
-                       ebs[i]->fd = multi->stripes[i].dev->fd;
-                       multi->stripes[i].dev->total_ios++;
-                       BUG_ON(ebs[i]->start != raid_map[i]);
-                       continue;
-               }
-               new_eb = kmalloc(sizeof(*eb) + alloc_size, GFP_NOFS);
-               BUG_ON(!new_eb);
-               new_eb->dev_bytenr = multi->stripes[i].physical;
-               new_eb->fd = multi->stripes[i].dev->fd;
-               multi->stripes[i].dev->total_ios++;
-               new_eb->len = stripe_len;
-
-               if (raid_map[i] == BTRFS_RAID5_P_STRIPE)
-                       p_eb = new_eb;
-               else if (raid_map[i] == BTRFS_RAID6_Q_STRIPE)
-                       q_eb = new_eb;
-       }
-       if (q_eb) {
-               void *pointers[multi->num_stripes];
-               ebs[multi->num_stripes - 2] = p_eb;
-               ebs[multi->num_stripes - 1] = q_eb;
-
-               for (i = 0; i < multi->num_stripes; i++)
-                       pointers[i] = ebs[i]->data;
-
-               raid6_gen_syndrome(multi->num_stripes, stripe_len, pointers);
-       } else {
-               ebs[multi->num_stripes - 1] = p_eb;
-               memcpy(p_eb->data, ebs[0]->data, stripe_len);
-               for (j = 1; j < multi->num_stripes - 1; j++) {
-                       for (i = 0; i < stripe_len; i += sizeof(unsigned long)) {
-                               *(unsigned long *)(p_eb->data + i) ^=
-                                       *(unsigned long *)(ebs[j]->data + i);
-                       }
-               }
+       if (*len > max_len)
+               *len = max_len;
+       if (device->fd < 0) {
+               ret = -EIO;
+               goto err;
        }
 
-       for (i = 0; i < multi->num_stripes; i++) {
-               ret = write_extent_to_disk(ebs[i]);
-               BUG_ON(ret);
-               if (ebs[i] != eb)
-                       kfree(ebs[i]);
-       }
-       return 0;
+       ret = pread64(device->fd, data, *len, multi->stripes[0].physical);
+       if (ret != *len)
+               ret = -EIO;
+       else
+               ret = 0;
+err:
+       kfree(multi);
+       return ret;
 }
 
-int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                    struct extent_buffer *eb)
+int write_and_map_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 {
        int ret;
        int dev_nr;
@@ -433,21 +421,13 @@ int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        u64 *raid_map = NULL;
        struct btrfs_multi_bio *multi = NULL;
 
-       if (check_tree_block(root, eb))
-               BUG();
-       if (!btrfs_buffer_uptodate(eb, trans->transid))
-               BUG();
-
-       btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-       csum_tree_block(root, eb, 0);
-
        dev_nr = 0;
        length = eb->len;
-       ret = btrfs_map_block(&root->fs_info->mapping_tree, WRITE,
-                             eb->start, &length, &multi, 0, &raid_map);
+       ret = btrfs_map_block(fs_info, WRITE, eb->start, &length,
+                             &multi, 0, &raid_map);
 
        if (raid_map) {
-               ret = write_raid56_with_parity(root->fs_info, eb, multi,
+               ret = write_raid56_with_parity(fs_info, eb, multi,
                                               length, raid_map);
                BUG_ON(ret);
        } else while (dev_nr < multi->num_stripes) {
@@ -459,149 +439,48 @@ int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                ret = write_extent_to_disk(eb);
                BUG_ON(ret);
        }
+       kfree(raid_map);
        kfree(multi);
        return 0;
 }
 
-int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
-                       u32 stripesize, struct btrfs_root *root,
-                       struct btrfs_fs_info *fs_info, u64 objectid)
+int write_tree_block(struct btrfs_trans_handle *trans,
+                    struct btrfs_fs_info *fs_info,
+                    struct extent_buffer *eb)
+{
+       if (check_tree_block(fs_info, eb)) {
+               print_tree_block_error(fs_info, eb,
+                               check_tree_block(fs_info, eb));
+               BUG();
+       }
+
+       if (trans && !btrfs_buffer_uptodate(eb, trans->transid))
+               BUG();
+
+       btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+       csum_tree_block(fs_info, eb, 0);
+
+       return write_and_map_eb(fs_info, eb);
+}
+
+void btrfs_setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
+                     u64 objectid)
 {
        root->node = NULL;
        root->commit_root = NULL;
-       root->sectorsize = sectorsize;
-       root->nodesize = nodesize;
-       root->leafsize = leafsize;
-       root->stripesize = stripesize;
        root->ref_cows = 0;
        root->track_dirty = 0;
 
        root->fs_info = fs_info;
        root->objectid = objectid;
        root->last_trans = 0;
-       root->highest_inode = 0;
        root->last_inode_alloc = 0;
 
        INIT_LIST_HEAD(&root->dirty_list);
+       INIT_LIST_HEAD(&root->orphan_data_extents);
        memset(&root->root_key, 0, sizeof(root->root_key));
        memset(&root->root_item, 0, sizeof(root->root_item));
        root->root_key.objectid = objectid;
-       return 0;
-}
-
-static int update_cowonly_root(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root)
-{
-       int ret;
-       u64 old_root_bytenr;
-       struct btrfs_root *tree_root = root->fs_info->tree_root;
-
-       btrfs_write_dirty_block_groups(trans, root);
-       while(1) {
-               old_root_bytenr = btrfs_root_bytenr(&root->root_item);
-               if (old_root_bytenr == root->node->start)
-                       break;
-               btrfs_set_root_bytenr(&root->root_item,
-                                      root->node->start);
-               btrfs_set_root_generation(&root->root_item,
-                                         trans->transid);
-               root->root_item.level = btrfs_header_level(root->node);
-               ret = btrfs_update_root(trans, tree_root,
-                                       &root->root_key,
-                                       &root->root_item);
-               BUG_ON(ret);
-               btrfs_write_dirty_block_groups(trans, root);
-       }
-       return 0;
-}
-
-static int commit_tree_roots(struct btrfs_trans_handle *trans,
-                            struct btrfs_fs_info *fs_info)
-{
-       struct btrfs_root *root;
-       struct list_head *next;
-       struct extent_buffer *eb;
-       int ret;
-
-       if (fs_info->readonly)
-               return 0;
-
-       eb = fs_info->tree_root->node;
-       extent_buffer_get(eb);
-       ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
-       free_extent_buffer(eb);
-       if (ret)
-               return ret;
-
-       while(!list_empty(&fs_info->dirty_cowonly_roots)) {
-               next = fs_info->dirty_cowonly_roots.next;
-               list_del_init(next);
-               root = list_entry(next, struct btrfs_root, dirty_list);
-               update_cowonly_root(trans, root);
-       }
-       return 0;
-}
-
-static int __commit_transaction(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root)
-{
-       u64 start;
-       u64 end;
-       struct extent_buffer *eb;
-       struct extent_io_tree *tree = &root->fs_info->extent_cache;
-       int ret;
-
-       while(1) {
-               ret = find_first_extent_bit(tree, 0, &start, &end,
-                                           EXTENT_DIRTY);
-               if (ret)
-                       break;
-               while(start <= end) {
-                       eb = find_first_extent_buffer(tree, start);
-                       BUG_ON(!eb || eb->start != start);
-                       ret = write_tree_block(trans, root, eb);
-                       BUG_ON(ret);
-                       start += eb->len;
-                       clear_extent_buffer_dirty(eb);
-                       free_extent_buffer(eb);
-               }
-       }
-       return 0;
-}
-
-int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root)
-{
-       u64 transid = trans->transid;
-       int ret = 0;
-       struct btrfs_fs_info *fs_info = root->fs_info;
-
-       if (root->commit_root == root->node)
-               goto commit_tree;
-
-       free_extent_buffer(root->commit_root);
-       root->commit_root = NULL;
-
-       btrfs_set_root_bytenr(&root->root_item, root->node->start);
-       btrfs_set_root_generation(&root->root_item, trans->transid);
-       root->root_item.level = btrfs_header_level(root->node);
-       ret = btrfs_update_root(trans, root->fs_info->tree_root,
-                               &root->root_key, &root->root_item);
-       BUG_ON(ret);
-commit_tree:
-       ret = commit_tree_roots(trans, fs_info);
-       BUG_ON(ret);
-       ret = __commit_transaction(trans, root);
-       BUG_ON(ret);
-       write_ctree_super(trans, root);
-       btrfs_finish_extent_commit(trans, fs_info->extent_root,
-                                  &fs_info->pinned_extents);
-       btrfs_free_transaction(root, trans);
-       free_extent_buffer(root->commit_root);
-       root->commit_root = NULL;
-       fs_info->running_transaction = NULL;
-       fs_info->last_trans_committed = transid;
-       return 0;
 }
 
 static int find_and_setup_root(struct btrfs_root *tree_root,
@@ -609,21 +488,17 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
                               u64 objectid, struct btrfs_root *root)
 {
        int ret;
-       u32 blocksize;
        u64 generation;
 
-       __setup_root(tree_root->nodesize, tree_root->leafsize,
-                    tree_root->sectorsize, tree_root->stripesize,
-                    root, fs_info, objectid);
+       btrfs_setup_root(root, fs_info, objectid);
        ret = btrfs_find_last_root(tree_root, objectid,
                                   &root->root_item, &root->root_key);
        if (ret)
                return ret;
 
-       blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
        generation = btrfs_root_generation(&root->root_item);
-       root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-                                    blocksize, generation);
+       root->node = read_tree_block(fs_info,
+                       btrfs_root_bytenr(&root->root_item), generation);
        if (!extent_buffer_uptodate(root->node))
                return -EIO;
 
@@ -634,7 +509,6 @@ static int find_and_setup_log_root(struct btrfs_root *tree_root,
                               struct btrfs_fs_info *fs_info,
                               struct btrfs_super_block *disk_super)
 {
-       u32 blocksize;
        u64 blocknr = btrfs_super_log_root(disk_super);
        struct btrfs_root *log_root = malloc(sizeof(struct btrfs_root));
 
@@ -646,15 +520,10 @@ static int find_and_setup_log_root(struct btrfs_root *tree_root,
                return 0;
        }
 
-       blocksize = btrfs_level_size(tree_root,
-                            btrfs_super_log_root_level(disk_super));
-
-       __setup_root(tree_root->nodesize, tree_root->leafsize,
-                    tree_root->sectorsize, tree_root->stripesize,
-                    log_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+       btrfs_setup_root(log_root, fs_info,
+                        BTRFS_TREE_LOG_OBJECTID);
 
-       log_root->node = read_tree_block(tree_root, blocknr,
-                                    blocksize,
+       log_root->node = read_tree_block(fs_info, blocknr,
                                     btrfs_super_generation(disk_super) + 1);
 
        fs_info->log_root_tree = log_root;
@@ -669,9 +538,7 @@ static int find_and_setup_log_root(struct btrfs_root *tree_root,
        return 0;
 }
 
-
-int btrfs_free_fs_root(struct btrfs_fs_info *fs_info,
-                      struct btrfs_root *root)
+int btrfs_free_fs_root(struct btrfs_root *root)
 {
        if (root->node)
                free_extent_buffer(root->node);
@@ -681,22 +548,16 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info,
        return 0;
 }
 
-static int free_fs_roots(struct btrfs_fs_info *fs_info)
+/*
+ * Per-node destructor for the rb-tree of cached fs roots.
+ *
+ * Recovers the btrfs_root that embeds @node through its rb_node member and
+ * passes it to btrfs_free_fs_root().  It is the callback given to
+ * FREE_RB_BASED_TREE(fs_roots, ...) right after this definition.
+ */
+static void __free_fs_root(struct rb_node *node)
 {
-       struct cache_extent *cache;
        struct btrfs_root *root;
 
-       while (1) {
-               cache = find_first_cache_extent(&fs_info->fs_root_cache, 0);
-               if (!cache)
-                       break;
-               root = container_of(cache, struct btrfs_root, cache);
-               remove_cache_extent(&fs_info->fs_root_cache, cache);
-               btrfs_free_fs_root(fs_info, root);
-       }
-       return 0;
+       root = container_of(node, struct btrfs_root, rb_node);
+       btrfs_free_fs_root(root);
 }
 
+FREE_RB_BASED_TREE(fs_roots, __free_fs_root);
+
 struct btrfs_root *btrfs_read_fs_root_no_cache(struct btrfs_fs_info *fs_info,
                                               struct btrfs_key *location)
 {
@@ -705,13 +566,11 @@ struct btrfs_root *btrfs_read_fs_root_no_cache(struct btrfs_fs_info *fs_info,
        struct btrfs_path *path;
        struct extent_buffer *l;
        u64 generation;
-       u32 blocksize;
        int ret = 0;
 
-       root = malloc(sizeof(*root));
+       root = calloc(1, sizeof(*root));
        if (!root)
                return ERR_PTR(-ENOMEM);
-       memset(root, 0, sizeof(*root));
        if (location->offset == (u64)-1) {
                ret = find_and_setup_root(tree_root, fs_info,
                                          location->objectid, root);
@@ -722,12 +581,15 @@ struct btrfs_root *btrfs_read_fs_root_no_cache(struct btrfs_fs_info *fs_info,
                goto insert;
        }
 
-       __setup_root(tree_root->nodesize, tree_root->leafsize,
-                    tree_root->sectorsize, tree_root->stripesize,
-                    root, fs_info, location->objectid);
+       btrfs_setup_root(root, fs_info,
+                        location->objectid);
 
        path = btrfs_alloc_path();
-       BUG_ON(!path);
+       if (!path) {
+               free(root);
+               return ERR_PTR(-ENOMEM);
+       }
+
        ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
        if (ret != 0) {
                if (ret > 0)
@@ -741,28 +603,54 @@ struct btrfs_root *btrfs_read_fs_root_no_cache(struct btrfs_fs_info *fs_info,
        memcpy(&root->root_key, location, sizeof(*location));
        ret = 0;
 out:
-       btrfs_release_path(root, path);
        btrfs_free_path(path);
        if (ret) {
                free(root);
                return ERR_PTR(ret);
        }
        generation = btrfs_root_generation(&root->root_item);
-       blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
-       root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-                                    blocksize, generation);
-       BUG_ON(!root->node);
+       root->node = read_tree_block(fs_info,
+                       btrfs_root_bytenr(&root->root_item), generation);
+       if (!extent_buffer_uptodate(root->node)) {
+               free(root);
+               return ERR_PTR(-EIO);
+       }
 insert:
        root->ref_cows = 1;
        return root;
 }
 
+/*
+ * rb-tree search comparator: orders the objectid pointed to by @data against
+ * the objectid of the btrfs_root that embeds @node.
+ *
+ * Returns 1 when the searched objectid is larger than the node's, -1 when it
+ * is smaller and 0 when both are equal.
+ */
+static int btrfs_fs_roots_compare_objectids(struct rb_node *node,
+                                           void *data)
+{
+       struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
+       u64 wanted = *((u64 *)data);
+
+       if (wanted == root->objectid)
+               return 0;
+       return wanted > root->objectid ? 1 : -1;
+}
+
+/*
+ * rb-tree insert comparator: compares two tree nodes by reusing the objectid
+ * comparator, with the objectid of @node2's root as the key checked against
+ * @node1.
+ */
+static int btrfs_fs_roots_compare_roots(struct rb_node *node1,
+                                       struct rb_node *node2)
+{
+       struct btrfs_root *other = rb_entry(node2, struct btrfs_root, rb_node);
+
+       return btrfs_fs_roots_compare_objectids(node1, &other->objectid);
+}
+
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
                                      struct btrfs_key *location)
 {
        struct btrfs_root *root;
-       struct cache_extent *cache;
+       struct rb_node *node;
        int ret;
+       u64 objectid = location->objectid;
 
        if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
                return fs_info->tree_root;
@@ -774,82 +662,66 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
                return fs_info->dev_root;
        if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
                return fs_info->csum_root;
+       if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
+               return fs_info->quota_enabled ? fs_info->quota_root :
+                               ERR_PTR(-ENOENT);
 
        BUG_ON(location->objectid == BTRFS_TREE_RELOC_OBJECTID ||
               location->offset != (u64)-1);
 
-       cache = find_cache_extent(&fs_info->fs_root_cache,
-                                 location->objectid, 1);
-       if (cache)
-               return container_of(cache, struct btrfs_root, cache);
+       node = rb_search(&fs_info->fs_root_tree, (void *)&objectid,
+                        btrfs_fs_roots_compare_objectids, NULL);
+       if (node)
+               return container_of(node, struct btrfs_root, rb_node);
 
        root = btrfs_read_fs_root_no_cache(fs_info, location);
        if (IS_ERR(root))
                return root;
 
-       root->cache.start = location->objectid;
-       root->cache.size = 1;
-       ret = insert_existing_cache_extent(&fs_info->fs_root_cache,
-                                          &root->cache);
+       ret = rb_insert(&fs_info->fs_root_tree, &root->rb_node,
+                       btrfs_fs_roots_compare_roots);
        BUG_ON(ret);
        return root;
 }
 
-static struct btrfs_fs_info *__open_ctree_fd(int fp, const char *path,
-                                            u64 sb_bytenr,
-                                            u64 root_tree_bytenr, int writes,
-                                            int partial)
+/*
+ * Free @fs_info together with the root structures and the superblock copy
+ * it owns.
+ *
+ * Only the containers themselves are freed here; the ->node buffers hanging
+ * off the roots are dropped separately by btrfs_release_all_roots().
+ * Members that were never allocated (or were already reset to NULL, such as
+ * quota_root when no quota tree exists) are harmless because free(NULL) is a
+ * no-op.  @fs_info itself must not be NULL.
+ */
+void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 {
-       u32 sectorsize;
-       u32 nodesize;
-       u32 leafsize;
-       u32 blocksize;
-       u32 stripesize;
-       u64 generation;
-       struct btrfs_key key;
-       struct btrfs_root *tree_root = malloc(sizeof(struct btrfs_root));
-       struct btrfs_root *extent_root = malloc(sizeof(struct btrfs_root));
-       struct btrfs_root *chunk_root = malloc(sizeof(struct btrfs_root));
-       struct btrfs_root *dev_root = malloc(sizeof(struct btrfs_root));
-       struct btrfs_root *csum_root = malloc(sizeof(struct btrfs_root));
-       struct btrfs_fs_info *fs_info = malloc(sizeof(*fs_info));
-       int ret;
-       struct btrfs_super_block *disk_super;
-       struct btrfs_fs_devices *fs_devices = NULL;
-       u64 total_devs;
-       u64 features;
+       free(fs_info->quota_root);
 
-       if (sb_bytenr == 0)
-               sb_bytenr = BTRFS_SUPER_INFO_OFFSET;
-
-       /* try to drop all the caches */
-       if (posix_fadvise(fp, 0, 0, POSIX_FADV_DONTNEED))
-               fprintf(stderr, "Warning, could not drop caches\n");
-
-       ret = btrfs_scan_one_device(fp, path, &fs_devices,
-                                   &total_devs, sb_bytenr);
+       free(fs_info->tree_root);
+       free(fs_info->extent_root);
+       free(fs_info->chunk_root);
+       free(fs_info->dev_root);
+       free(fs_info->csum_root);
+       free(fs_info->free_space_root);
+       free(fs_info->super_copy);
+       free(fs_info->log_root_tree);
+       free(fs_info);
+}
 
-       if (ret) {
-               fprintf(stderr, "No valid Btrfs found on %s\n", path);
-               goto out;
-       }
+struct btrfs_fs_info *btrfs_new_fs_info(int writable, u64 sb_bytenr)
+{
+       struct btrfs_fs_info *fs_info;
 
-       if (total_devs != 1) {
-               ret = btrfs_scan_for_fsid(fs_devices, total_devs, 1);
-               if (ret)
-                       goto out;
-       }
+       fs_info = calloc(1, sizeof(struct btrfs_fs_info));
+       if (!fs_info)
+               return NULL;
 
-       memset(fs_info, 0, sizeof(*fs_info));
+       fs_info->tree_root = calloc(1, sizeof(struct btrfs_root));
+       fs_info->extent_root = calloc(1, sizeof(struct btrfs_root));
+       fs_info->chunk_root = calloc(1, sizeof(struct btrfs_root));
+       fs_info->dev_root = calloc(1, sizeof(struct btrfs_root));
+       fs_info->csum_root = calloc(1, sizeof(struct btrfs_root));
+       fs_info->quota_root = calloc(1, sizeof(struct btrfs_root));
+       fs_info->free_space_root = calloc(1, sizeof(struct btrfs_root));
        fs_info->super_copy = calloc(1, BTRFS_SUPER_INFO_SIZE);
-       fs_info->tree_root = tree_root;
-       fs_info->extent_root = extent_root;
-       fs_info->chunk_root = chunk_root;
-       fs_info->dev_root = dev_root;
-       fs_info->csum_root = csum_root;
 
-       if (!writes)
-               fs_info->readonly = 1;
+       if (!fs_info->tree_root || !fs_info->extent_root ||
+           !fs_info->chunk_root || !fs_info->dev_root ||
+           !fs_info->csum_root || !fs_info->quota_root ||
+           !fs_info->free_space_root || !fs_info->super_copy)
+               goto free_all;
 
        extent_io_tree_init(&fs_info->extent_cache);
        extent_io_tree_init(&fs_info->free_space_cache);
@@ -857,160 +729,224 @@ static struct btrfs_fs_info *__open_ctree_fd(int fp, const char *path,
        extent_io_tree_init(&fs_info->pinned_extents);
        extent_io_tree_init(&fs_info->pending_del);
        extent_io_tree_init(&fs_info->extent_ins);
-       cache_tree_init(&fs_info->fs_root_cache);
+       fs_info->excluded_extents = NULL;
 
+       fs_info->fs_root_tree = RB_ROOT;
        cache_tree_init(&fs_info->mapping_tree.cache_tree);
 
        mutex_init(&fs_info->fs_mutex);
-       fs_info->fs_devices = fs_devices;
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
        INIT_LIST_HEAD(&fs_info->space_info);
+       INIT_LIST_HEAD(&fs_info->recow_ebs);
 
-       __setup_root(4096, 4096, 4096, 4096, tree_root,
-                    fs_info, BTRFS_ROOT_TREE_OBJECTID);
-
-       if (writes)
-               ret = btrfs_open_devices(fs_devices, O_RDWR);
-       else
-               ret = btrfs_open_devices(fs_devices, O_RDONLY);
-       if (ret)
-               goto out_cleanup;
+       if (!writable)
+               fs_info->readonly = 1;
 
        fs_info->super_bytenr = sb_bytenr;
-       disk_super = fs_info->super_copy;
-       ret = btrfs_read_dev_super(fs_devices->latest_bdev,
-                                  disk_super, sb_bytenr);
-       if (ret) {
-               printk("No valid btrfs found\n");
-               goto out_devices;
-       }
-
-       memcpy(fs_info->fsid, &disk_super->fsid, BTRFS_FSID_SIZE);
+       fs_info->data_alloc_profile = (u64)-1;
+       fs_info->metadata_alloc_profile = (u64)-1;
+       fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+       return fs_info;
+free_all:
+       btrfs_free_fs_info(fs_info);
+       return NULL;
+}
 
+/*
+ * Check the feature bits recorded in superblock @sb against what this code
+ * supports, honouring the OPEN_CTREE_* bits in @flags.
+ *
+ * Returns -ENOTSUP when the incompat flags contain bits outside
+ * BTRFS_FEATURE_INCOMPAT_SUPP, or - only when OPEN_CTREE_WRITES is set - when
+ * the compat_ro flags contain bits outside BTRFS_FEATURE_COMPAT_RO_SUPP.
+ * Returns 0 otherwise.
+ *
+ * Side effects on @sb (the in-memory superblock passed in):
+ *  - BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF is set if it was missing;
+ *  - with OPEN_CTREE_WRITES and OPEN_CTREE_INVALIDATE_FST, the
+ *    FREE_SPACE_TREE_VALID compat_ro bit is cleared.  The FREE_SPACE_TREE bit
+ *    is then masked only in the local copy used for the support check.
+ */
+int btrfs_check_fs_compatibility(struct btrfs_super_block *sb,
+                                unsigned int flags)
+{
+       u64 features;
 
-       features = btrfs_super_incompat_flags(disk_super) &
+       features = btrfs_super_incompat_flags(sb) &
                   ~BTRFS_FEATURE_INCOMPAT_SUPP;
        if (features) {
                printk("couldn't open because of unsupported "
                       "option features (%Lx).\n",
                       (unsigned long long)features);
-               goto out_devices;
+               return -ENOTSUP;
        }
 
-       features = btrfs_super_incompat_flags(disk_super);
+       features = btrfs_super_incompat_flags(sb);
        if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
                features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-               btrfs_set_super_incompat_flags(disk_super, features);
-       }
-
-       features = btrfs_super_compat_ro_flags(disk_super) &
-               ~BTRFS_FEATURE_COMPAT_RO_SUPP;
-       if (writes && features) {
-               printk("couldn't open RDWR because of unsupported "
-                      "option features (%Lx).\n",
-                      (unsigned long long)features);
-               goto out_devices;
+               btrfs_set_super_incompat_flags(sb, features);
        }
 
-       nodesize = btrfs_super_nodesize(disk_super);
-       leafsize = btrfs_super_leafsize(disk_super);
-       sectorsize = btrfs_super_sectorsize(disk_super);
-       stripesize = btrfs_super_stripesize(disk_super);
-       tree_root->nodesize = nodesize;
-       tree_root->leafsize = leafsize;
-       tree_root->sectorsize = sectorsize;
-       tree_root->stripesize = stripesize;
+       features = btrfs_super_compat_ro_flags(sb);
+       if (flags & OPEN_CTREE_WRITES) {
+               if (flags & OPEN_CTREE_INVALIDATE_FST) {
+                       /* Clear the FREE_SPACE_TREE_VALID bit on disk... */
+                       features &= ~BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID;
+                       btrfs_set_super_compat_ro_flags(sb, features);
+                       /* ... and ignore the free space tree bit. */
+                       features &= ~BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE;
+               }
+               if (features & ~BTRFS_FEATURE_COMPAT_RO_SUPP) {
+                       printk("couldn't open RDWR because of unsupported "
+                              "option features (%Lx).\n",
+                              (unsigned long long)features);
+                       return -ENOTSUP;
+               }
 
-       ret = btrfs_read_sys_array(tree_root);
-       if (ret)
-               goto out_devices;
-       blocksize = btrfs_level_size(tree_root,
-                                    btrfs_super_chunk_root_level(disk_super));
-       generation = btrfs_super_chunk_root_generation(disk_super);
-
-       __setup_root(nodesize, leafsize, sectorsize, stripesize,
-                    chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
-
-       chunk_root->node = read_tree_block(chunk_root,
-                                          btrfs_super_chunk_root(disk_super),
-                                          blocksize, generation);
-       if (!extent_buffer_uptodate(chunk_root->node)) {
-               printk("Couldn't read chunk root\n");
-               goto out_devices;
        }
+       return 0;
+}
 
-       read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
-                (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
-                BTRFS_UUID_SIZE);
+/*
+ * Pick the backup root slot in @super to fall back to.
+ *
+ * Scans all BTRFS_NUM_BACKUP_ROOTS slots and returns the index of the one
+ * with the highest tree root generation that differs from the superblock's
+ * own generation.  Returns 0 when no slot qualifies.
+ */
+static int find_best_backup_root(struct btrfs_super_block *super)
+{
+       u64 super_gen = btrfs_super_generation(super);
+       u64 best_gen = 0;
+       int best_index = 0;
+       int i;
 
-       if (!(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_METADUMP)) {
-               ret = btrfs_read_chunk_tree(chunk_root);
-               if (ret)
-                       goto out_failed;
+       for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+               struct btrfs_root_backup *backup = super->super_roots + i;
+               u64 gen = btrfs_backup_tree_root_gen(backup);
+
+               /* Skip the current generation and anything not newer. */
+               if (gen == super_gen || gen <= best_gen)
+                       continue;
+               best_index = i;
+               best_gen = gen;
        }
+       return best_index;
+}
 
-       blocksize = btrfs_level_size(tree_root,
-                                    btrfs_super_root_level(disk_super));
-       generation = btrfs_super_generation(disk_super);
+/*
+ * Set up @info_root for tree @objectid by looking it up through
+ * fs_info->tree_root with find_and_setup_root().
+ *
+ * @str is the tree name used in the "Couldn't setup %s tree" message.
+ *
+ * On lookup failure the message is printed and, unless OPEN_CTREE_PARTIAL is
+ * set in @flags, -EIO is returned.  With OPEN_CTREE_PARTIAL a placeholder
+ * block obtained from btrfs_find_create_tree_block(fs_info, 0) is installed
+ * as ->node and explicitly marked not uptodate; -ENOMEM is returned if that
+ * block cannot be obtained.  Returns 0 in all other cases.
+ */
+static int setup_root_or_create_block(struct btrfs_fs_info *fs_info,
+                                     unsigned flags,
+                                     struct btrfs_root *info_root,
+                                     u64 objectid, char *str)
+{
+       struct btrfs_root *root = fs_info->tree_root;
+       int ret;
 
-       if (!root_tree_bytenr)
-               root_tree_bytenr = btrfs_super_root(disk_super);
-       tree_root->node = read_tree_block(tree_root,
-                                         root_tree_bytenr,
-                                         blocksize, generation);
-       if (!extent_buffer_uptodate(tree_root->node)) {
-               printk("Couldn't read tree root\n");
-               goto out_failed;
-       }
-       ret = find_and_setup_root(tree_root, fs_info,
-                                 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
+       ret = find_and_setup_root(root, fs_info, objectid, info_root);
        if (ret) {
-               printk("Couldn't setup extent tree\n");
-               goto out_failed;
+               printk("Couldn't setup %s tree\n", str);
+               if (!(flags & OPEN_CTREE_PARTIAL))
+                       return -EIO;
+               /*
+                * Need a blank node here just so we don't screw up in the
+                * million of places that assume a root has a valid ->node
+                */
+               info_root->node =
+                       btrfs_find_create_tree_block(fs_info, 0);
+               if (!info_root->node)
+                       return -ENOMEM;
+               clear_extent_buffer_uptodate(info_root->node);
+       }
+
+       return 0;
+}
+
+/*
+ * Read the tree root and set up the roots that hang off it: extent, device,
+ * csum, quota, free space (only when the FREE_SPACE_TREE compat_ro bit is
+ * set), the log root tree and finally the default fs tree.
+ *
+ * @root_tree_bytenr: tree root location to use; 0 means take it from the
+ *                    superblock.  With OPEN_CTREE_BACKUP_ROOT in @flags the
+ *                    location and generation come from the backup slot
+ *                    chosen by find_best_backup_root() instead.
+ * @flags:            OPEN_CTREE_* bits.  OPEN_CTREE_PARTIAL tolerates a
+ *                    missing extent/csum tree and log root failures;
+ *                    OPEN_CTREE_NO_BLOCK_GROUPS skips reading block groups.
+ *
+ * A missing quota tree is not an error: quota_root is freed and set to NULL
+ * and quota_enabled stays unset.
+ *
+ * Returns 0 on success and a negative errno (mostly -EIO) on failure.  On
+ * failure, nodes that were already read stay attached to their roots for the
+ * caller to release.
+ */
+int btrfs_setup_all_roots(struct btrfs_fs_info *fs_info, u64 root_tree_bytenr,
+                         unsigned flags)
+{
+       struct btrfs_super_block *sb = fs_info->super_copy;
+       struct btrfs_root *root;
+       struct btrfs_key key;
+       u64 generation;
+       int ret;
+
+       root = fs_info->tree_root;
+       btrfs_setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
+       generation = btrfs_super_generation(sb);
+
+       if (!root_tree_bytenr && !(flags & OPEN_CTREE_BACKUP_ROOT)) {
+               root_tree_bytenr = btrfs_super_root(sb);
+       } else if (flags & OPEN_CTREE_BACKUP_ROOT) {
+               struct btrfs_root_backup *backup;
+               int index = find_best_backup_root(sb);
+               if (index >= BTRFS_NUM_BACKUP_ROOTS) {
+                       fprintf(stderr, "Invalid backup root number\n");
+                       return -EIO;
+               }
+               backup = fs_info->super_copy->super_roots + index;
+               root_tree_bytenr = btrfs_backup_tree_root(backup);
+               generation = btrfs_backup_tree_root_gen(backup);
+       }
+
+       root->node = read_tree_block(fs_info, root_tree_bytenr, generation);
+       if (!extent_buffer_uptodate(root->node)) {
+               fprintf(stderr, "Couldn't read tree root\n");
+               return -EIO;
        }
-       extent_root->track_dirty = 1;
 
-       ret = find_and_setup_root(tree_root, fs_info,
-                                 BTRFS_DEV_TREE_OBJECTID, dev_root);
+       ret = setup_root_or_create_block(fs_info, flags, fs_info->extent_root,
+                                        BTRFS_EXTENT_TREE_OBJECTID, "extent");
+       if (ret)
+               return ret;
+       fs_info->extent_root->track_dirty = 1;
+
+       ret = find_and_setup_root(root, fs_info, BTRFS_DEV_TREE_OBJECTID,
+                                 fs_info->dev_root);
        if (ret) {
                printk("Couldn't setup device tree\n");
-               goto out_failed;
+               return -EIO;
        }
-       dev_root->track_dirty = 1;
+       fs_info->dev_root->track_dirty = 1;
 
-       ret = find_and_setup_root(tree_root, fs_info,
-                                 BTRFS_CSUM_TREE_OBJECTID, csum_root);
+       ret = setup_root_or_create_block(fs_info, flags, fs_info->csum_root,
+                                        BTRFS_CSUM_TREE_OBJECTID, "csum");
+       if (ret)
+               return ret;
+       fs_info->csum_root->track_dirty = 1;
+
+       ret = find_and_setup_root(root, fs_info, BTRFS_QUOTA_TREE_OBJECTID,
+                                 fs_info->quota_root);
        if (ret) {
-               printk("Couldn't setup csum tree\n");
-               if (!partial)
-                       goto out_failed;
+               free(fs_info->quota_root);
+               fs_info->quota_root = NULL;
+       } else {
+               fs_info->quota_enabled = 1;
        }
-       csum_root->track_dirty = 1;
 
-       find_and_setup_log_root(tree_root, fs_info, disk_super);
+       if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+               ret = find_and_setup_root(root, fs_info, BTRFS_FREE_SPACE_TREE_OBJECTID,
+                                         fs_info->free_space_root);
+               if (ret) {
+                       printk("Couldn't read free space tree\n");
+                       return -EIO;
+               }
+               fs_info->free_space_root->track_dirty = 1;
+       }
+
+       ret = find_and_setup_log_root(root, fs_info, sb);
+       if (ret) {
+               printk("Couldn't setup log root tree\n");
+               if (!(flags & OPEN_CTREE_PARTIAL))
+                       return -EIO;
+       }
 
        fs_info->generation = generation;
        fs_info->last_trans_committed = generation;
-       btrfs_read_block_groups(fs_info->tree_root);
+       if (extent_buffer_uptodate(fs_info->extent_root->node) &&
+           !(flags & OPEN_CTREE_NO_BLOCK_GROUPS)) {
+               ret = btrfs_read_block_groups(fs_info->tree_root);
+               /*
+                * If we don't find any blockgroups (ENOENT) we're either
+                * restoring or creating the filesystem, where it's expected,
+                * anything else is error.  A plain 0 return is not an error
+                * either, so only negative values other than -ENOENT abort
+                * the open.
+                */
+               if (ret < 0 && ret != -ENOENT)
+                       return -EIO;
+       }
 
        key.objectid = BTRFS_FS_TREE_OBJECTID;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = (u64)-1;
        fs_info->fs_root = btrfs_read_fs_root(fs_info, &key);
 
-       if (!fs_info->fs_root)
-               goto out_failed;
-
-       fs_info->data_alloc_profile = (u64)-1;
-       fs_info->metadata_alloc_profile = (u64)-1;
-       fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
-
-       return fs_info;
-
-out_failed:
-       if (partial)
-               return fs_info;
+       if (IS_ERR(fs_info->fs_root))
+               return -EIO;
+       return 0;
+}
 
+void btrfs_release_all_roots(struct btrfs_fs_info *fs_info)
+{
+       if (fs_info->free_space_root)
+               free_extent_buffer(fs_info->free_space_root->node);
+       if (fs_info->quota_root)
+               free_extent_buffer(fs_info->quota_root->node);
        if (fs_info->csum_root)
                free_extent_buffer(fs_info->csum_root->node);
        if (fs_info->dev_root)
@@ -1019,130 +955,547 @@ out_failed:
                free_extent_buffer(fs_info->extent_root->node);
        if (fs_info->tree_root)
                free_extent_buffer(fs_info->tree_root->node);
+       if (fs_info->log_root_tree)
+               free_extent_buffer(fs_info->log_root_tree->node);
        if (fs_info->chunk_root)
                free_extent_buffer(fs_info->chunk_root->node);
-out_devices:
-       close_all_devices(fs_info);
-out_cleanup:
+}
+
+/*
+ * Destructor for one entry of the chunk mapping cache: frees the map_lookup
+ * that embeds cache extent @ce.
+ */
+static void free_map_lookup(struct cache_extent *ce)
+{
+       struct map_lookup *lookup = container_of(ce, struct map_lookup, ce);
+
+       kfree(lookup);
+}
+
+FREE_EXTENT_CACHE_BASED_TREE(mapping_cache, free_map_lookup);
+
+/*
+ * Tear down the caches attached to @fs_info: the list of extent buffers
+ * queued on recow_ebs, the chunk mapping cache tree and every extent io tree.
+ */
+void btrfs_cleanup_all_caches(struct btrfs_fs_info *fs_info)
+{
+       struct list_head *pending = &fs_info->recow_ebs;
+       struct extent_buffer *eb;
+
+       /* Unlink each queued buffer, then hand it to free_extent_buffer(). */
+       while (!list_empty(pending)) {
+               eb = list_first_entry(pending, struct extent_buffer, recow);
+               list_del_init(&eb->recow);
+               free_extent_buffer(eb);
+       }
+
+       free_mapping_cache_tree(&fs_info->mapping_tree.cache_tree);
        extent_io_tree_cleanup(&fs_info->extent_cache);
        extent_io_tree_cleanup(&fs_info->free_space_cache);
        extent_io_tree_cleanup(&fs_info->block_group_cache);
        extent_io_tree_cleanup(&fs_info->pinned_extents);
        extent_io_tree_cleanup(&fs_info->pending_del);
        extent_io_tree_cleanup(&fs_info->extent_ins);
+}
+
+/*
+ * Scan the device open on @fd (named @path, used for messages) for a btrfs
+ * superblock and, if the filesystem spans several devices, scan for the rest.
+ *
+ * @fs_devices:   filled in by btrfs_scan_one_device()
+ * @sb_bytenr:    superblock offset; 0 selects BTRFS_SUPER_INFO_OFFSET
+ * @sbflags:      passed through to btrfs_scan_one_device()
+ * @skip_devices: non-zero suppresses the btrfs_scan_devices() pass
+ *
+ * Returns 0 on success, -errno if the device size cannot be determined,
+ * -EINVAL if @sb_bytenr lies beyond the end of the device, or the error from
+ * the scan helpers.
+ */
+int btrfs_scan_fs_devices(int fd, const char *path,
+                         struct btrfs_fs_devices **fs_devices,
+                         u64 sb_bytenr, unsigned sbflags,
+                         int skip_devices)
+{
+       u64 total_devs;
+       u64 dev_size;
+       off_t end;
+       int ret;
+
+       if (sb_bytenr == 0)
+               sb_bytenr = BTRFS_SUPER_INFO_OFFSET;
+
+       /* Seeking to the end yields the device size; then rewind. */
+       end = lseek(fd, 0, SEEK_END);
+       if (end < 0)
+               return -errno;
+       dev_size = end;
+       lseek(fd, 0, SEEK_SET);
+
+       if (sb_bytenr > dev_size) {
+               error("superblock bytenr %llu is larger than device size %llu",
+                               (unsigned long long)sb_bytenr,
+                               (unsigned long long)dev_size);
+               return -EINVAL;
+       }
+
+       ret = btrfs_scan_one_device(fd, path, fs_devices,
+                                   &total_devs, sb_bytenr, sbflags);
+       if (ret) {
+               fprintf(stderr, "No valid Btrfs found on %s\n", path);
+               return ret;
+       }
+
+       /* A single-device filesystem needs no further scanning. */
+       if (skip_devices || total_devs == 1)
+               return 0;
+       return btrfs_scan_devices();
+}
+
+/*
+ * Initialise fs_info->chunk_root, read the system chunk array from the
+ * superblock copy, read the chunk tree root block and, unless the superblock
+ * carries BTRFS_SUPER_FLAG_METADUMP, read the chunk tree.
+ *
+ * @chunk_root_bytenr: location of the chunk root to use; 0 means take it
+ *                     from the superblock.  A value not aligned to
+ *                     fs_info->sectorsize is ignored with a warning.  When a
+ *                     caller-supplied location is used, the generation passed
+ *                     to read_tree_block() is 0.
+ *
+ * Returns 0 on success or a negative errno.  If the chunk root cannot be read
+ * and fs_info->ignore_chunk_tree_error is set, fs_info->chunk_root is reset
+ * to NULL and 0 is returned.
+ */
+int btrfs_setup_chunk_tree_and_device_map(struct btrfs_fs_info *fs_info,
+                                         u64 chunk_root_bytenr)
+{
+       struct btrfs_super_block *sb = fs_info->super_copy;
+       u64 generation;
+       int ret;
+
+       btrfs_setup_root(fs_info->chunk_root, fs_info,
+                       BTRFS_CHUNK_TREE_OBJECTID);
+
+       ret = btrfs_read_sys_array(fs_info);
+       if (ret)
+               return ret;
+
+       generation = btrfs_super_chunk_root_generation(sb);
+
+       if (chunk_root_bytenr && !IS_ALIGNED(chunk_root_bytenr,
+                                           fs_info->sectorsize)) {
+               warning("chunk_root_bytenr %llu is unaligned to %u, ignore it",
+                       chunk_root_bytenr, fs_info->sectorsize);
+               chunk_root_bytenr = 0;
+       }
+
+       if (chunk_root_bytenr)
+               generation = 0;
+       else
+               chunk_root_bytenr = btrfs_super_chunk_root(sb);
+
+       fs_info->chunk_root->node = read_tree_block(fs_info,
+                                                   chunk_root_bytenr,
+                                                   generation);
+       if (!extent_buffer_uptodate(fs_info->chunk_root->node)) {
+               if (!fs_info->ignore_chunk_tree_error) {
+                       error("cannot read chunk root");
+                       return -EIO;
+               }
+               warning("cannot read chunk root, continue anyway");
+               /*
+                * NOTE(review): the chunk_root allocation is not freed before
+                * the pointer is dropped - confirm whether that is intended.
+                */
+               fs_info->chunk_root = NULL;
+               return 0;
+       }
+
+       /* Nothing more to read for metadump images. */
+       if (btrfs_super_flags(sb) & BTRFS_SUPER_FLAG_METADUMP)
+               return 0;
+
+       ret = btrfs_read_chunk_tree(fs_info);
+       if (ret)
+               fprintf(stderr, "Couldn't read chunk tree\n");
+       return ret;
+}
+
+/*
+ * Open the filesystem on the already opened descriptor @fp.
+ *
+ * @path:              device/image name, used for scanning and messages
+ * @sb_bytenr:         superblock offset; 0 selects BTRFS_SUPER_INFO_OFFSET
+ * @root_tree_bytenr:  passed to btrfs_setup_all_roots()
+ * @chunk_root_bytenr: passed to btrfs_setup_chunk_tree_and_device_map()
+ * @flags:             OPEN_CTREE_* bits controlling write access, recovery
+ *                     and which errors are tolerated
+ *
+ * Steps, in order: allocate fs_info, scan and open the devices, read the
+ * superblock, check feature compatibility, set up the chunk tree and then
+ * all remaining roots.
+ *
+ * Returns the new fs_info, or NULL on failure after undoing whatever had
+ * been set up.  If the chunk root could not be read but that error is being
+ * ignored, the fs_info is returned early with chunk_root == NULL and without
+ * any other root set up.
+ */
+static struct btrfs_fs_info *__open_ctree_fd(int fp, const char *path,
+                                            u64 sb_bytenr,
+                                            u64 root_tree_bytenr,
+                                            u64 chunk_root_bytenr,
+                                            unsigned flags)
+{
+       struct btrfs_fs_info *fs_info;
+       struct btrfs_super_block *disk_super;
+       struct btrfs_fs_devices *fs_devices = NULL;
+       struct extent_buffer *eb;
+       int ret;
+       int oflags;
+       unsigned sbflags = SBREAD_DEFAULT;
+
+       if (sb_bytenr == 0)
+               sb_bytenr = BTRFS_SUPER_INFO_OFFSET;
+
+       /* try to drop all the caches */
+       if (posix_fadvise(fp, 0, 0, POSIX_FADV_DONTNEED))
+               fprintf(stderr, "Warning, could not drop caches\n");
+
+       fs_info = btrfs_new_fs_info(flags & OPEN_CTREE_WRITES, sb_bytenr);
+       if (!fs_info) {
+               fprintf(stderr, "Failed to allocate memory for fs_info\n");
+               return NULL;
+       }
+       if (flags & OPEN_CTREE_RESTORE)
+               fs_info->on_restoring = 1;
+       if (flags & OPEN_CTREE_SUPPRESS_CHECK_BLOCK_ERRORS)
+               fs_info->suppress_check_block_errors = 1;
+       if (flags & OPEN_CTREE_IGNORE_FSID_MISMATCH)
+               fs_info->ignore_fsid_mismatch = 1;
+       if (flags & OPEN_CTREE_IGNORE_CHUNK_TREE_ERROR)
+               fs_info->ignore_chunk_tree_error = 1;
+
+       /* Super block recovery and a partially created fs are exclusive. */
+       if ((flags & OPEN_CTREE_RECOVER_SUPER)
+            && (flags & OPEN_CTREE_FS_PARTIAL)) {
+               fprintf(stderr,
+                   "cannot open a partially created filesystem for recovery\n");
+               goto out;
+       }
+
+       if (flags & OPEN_CTREE_FS_PARTIAL)
+               sbflags = SBREAD_PARTIAL;
+
+       ret = btrfs_scan_fs_devices(fp, path, &fs_devices, sb_bytenr, sbflags,
+                       (flags & OPEN_CTREE_NO_DEVICES));
+       if (ret)
+               goto out;
+
+       fs_info->fs_devices = fs_devices;
+       if (flags & OPEN_CTREE_WRITES)
+               oflags = O_RDWR;
+       else
+               oflags = O_RDONLY;
+
+       if (flags & OPEN_CTREE_EXCLUSIVE)
+               oflags |= O_EXCL;
+
+       ret = btrfs_open_devices(fs_devices, oflags);
+       if (ret)
+               goto out;
+
+       disk_super = fs_info->super_copy;
+       if (flags & OPEN_CTREE_RECOVER_SUPER)
+               ret = btrfs_read_dev_super(fs_devices->latest_bdev, disk_super,
+                               sb_bytenr, SBREAD_RECOVER);
+       else
+               ret = btrfs_read_dev_super(fp, disk_super, sb_bytenr,
+                               sbflags);
+       if (ret) {
+               printk("No valid btrfs found\n");
+               goto out_devices;
+       }
+
+       if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID &&
+           !fs_info->ignore_fsid_mismatch) {
+               fprintf(stderr, "ERROR: Filesystem UUID change in progress\n");
+               goto out_devices;
+       }
+
+       memcpy(fs_info->fsid, &disk_super->fsid, BTRFS_FSID_SIZE);
+       fs_info->sectorsize = btrfs_super_sectorsize(disk_super);
+       fs_info->nodesize = btrfs_super_nodesize(disk_super);
+       fs_info->stripesize = btrfs_super_stripesize(disk_super);
+
+       ret = btrfs_check_fs_compatibility(fs_info->super_copy, flags);
+       if (ret)
+               goto out_devices;
+
+       ret = btrfs_setup_chunk_tree_and_device_map(fs_info, chunk_root_bytenr);
+       if (ret)
+               goto out_chunk;
+
+       /* Chunk tree root is unable to read, return directly */
+       if (!fs_info->chunk_root)
+               return fs_info;
+
+       eb = fs_info->chunk_root->node;
+       read_extent_buffer(eb, fs_info->chunk_tree_uuid,
+                          btrfs_header_chunk_tree_uuid(eb),
+                          BTRFS_UUID_SIZE);
+
+       /*
+        * A failure to set up the remaining roots is tolerated when the
+        * caller only wants the chunk root or asked to ignore chunk tree
+        * errors.
+        */
+       ret = btrfs_setup_all_roots(fs_info, root_tree_bytenr, flags);
+       if (ret && !(flags & __OPEN_CTREE_RETURN_CHUNK_ROOT) &&
+           !fs_info->ignore_chunk_tree_error)
+               goto out_chunk;
+
+       return fs_info;
+
+out_chunk:
+       btrfs_release_all_roots(fs_info);
+       btrfs_cleanup_all_caches(fs_info);
+out_devices:
+       btrfs_close_devices(fs_devices);
 out:
-       free(tree_root);
-       free(extent_root);
-       free(chunk_root);
-       free(dev_root);
-       free(csum_root);
-       free(fs_info);
+       btrfs_free_fs_info(fs_info);
        return NULL;
 }
 
+/*
+ * Open the filesystem on @filename and return its btrfs_fs_info.
+ *
+ * @filename must be a regular file or a block device; any other file type
+ * is rejected before the file is opened.  The file is opened O_RDWR when
+ * OPEN_CTREE_WRITES is set in @flags and O_RDONLY otherwise.
+ * @sb_bytenr, @root_tree_bytenr, @chunk_root_bytenr and @flags are handed
+ * to __open_ctree_fd() unchanged.
+ *
+ * The file descriptor opened here is closed again before returning.
+ *
+ * Returns the result of __open_ctree_fd(), or NULL if @filename cannot be
+ * stat'ed, has the wrong type or cannot be opened.
+ */
 struct btrfs_fs_info *open_ctree_fs_info(const char *filename,
-                                        u64 sb_bytenr, int writes,
-                                        int partial)
+                                        u64 sb_bytenr, u64 root_tree_bytenr,
+                                        u64 chunk_root_bytenr,
+                                        unsigned flags)
 {
        int fp;
+       int ret;
        struct btrfs_fs_info *info;
-       int flags = O_CREAT | O_RDWR;
+       int oflags = O_RDWR;
+       struct stat st;
 
-       if (!writes)
-               flags = O_RDONLY;
+       /* Only regular files and block devices are accepted */
+       ret = stat(filename, &st);
+       if (ret < 0) {
+               error("cannot stat '%s': %m", filename);
+               return NULL;
+       }
+       if (!(((st.st_mode & S_IFMT) == S_IFREG) || ((st.st_mode & S_IFMT) == S_IFBLK))) {
+               error("not a regular file or block device: %s", filename);
+               return NULL;
+       }
+
+       /* Read-write is the default; drop to read-only unless writes are requested */
+       if (!(flags & OPEN_CTREE_WRITES))
+               oflags = O_RDONLY;
 
-       fp = open(filename, flags, 0600);
+       fp = open(filename, oflags);
        if (fp < 0) {
-               fprintf (stderr, "Could not open %s\n", filename);
+               error("cannot open '%s': %m", filename);
                return NULL;
        }
-       info = __open_ctree_fd(fp, filename, sb_bytenr, 0, writes, partial);
+       info = __open_ctree_fd(fp, filename, sb_bytenr, root_tree_bytenr,
+                              chunk_root_bytenr, flags);
        close(fp);
        return info;
 }
 
-struct btrfs_root *open_ctree(const char *filename, u64 sb_bytenr, int writes)
+/*
+ * Open the filesystem on @filename and return one of its roots.
+ *
+ * The root tree and chunk root byte offsets are passed to
+ * open_ctree_fs_info() as 0.
+ *
+ * OPEN_CTREE_IGNORE_CHUNK_TREE_ERROR is not accepted here: it is reported
+ * as an invalid flag and NULL is returned, the same way open_ctree_fd()
+ * handles it.
+ *
+ * Returns fs_info->chunk_root when __OPEN_CTREE_RETURN_CHUNK_ROOT is set in
+ * @flags, fs_info->fs_root otherwise, or NULL on failure.
+ */
+struct btrfs_root *open_ctree(const char *filename, u64 sb_bytenr,
+                             unsigned flags)
 {
        struct btrfs_fs_info *info;
 
-       info = open_ctree_fs_info(filename, sb_bytenr, writes, 0);
+       /* These flags may not return an fs_info with any valid root */
+       if (flags & OPEN_CTREE_IGNORE_CHUNK_TREE_ERROR) {
+               error("invalid open_ctree flags: 0x%llx",
+                               (unsigned long long)flags);
+               return NULL;
+       }
+       info = open_ctree_fs_info(filename, sb_bytenr, 0, 0, flags);
        if (!info)
                return NULL;
+       if (flags & __OPEN_CTREE_RETURN_CHUNK_ROOT)
+               return info->chunk_root;
        return info->fs_root;
 }
 
-struct btrfs_root *open_ctree_recovery(const char *filename, u64 sb_bytenr,
-                                      u64 root_tree_bytenr)
+/*
+ * Open the filesystem through the already opened descriptor @fp and return
+ * one of its roots.
+ *
+ * @fp is not closed here.  The root tree and chunk root byte offsets are
+ * passed to __open_ctree_fd() as 0.
+ *
+ * OPEN_CTREE_IGNORE_CHUNK_TREE_ERROR is rejected with an error message and
+ * a NULL return.
+ *
+ * Returns fs_info->chunk_root when __OPEN_CTREE_RETURN_CHUNK_ROOT is set in
+ * @flags, fs_info->fs_root otherwise, or NULL on failure.
+ */
+struct btrfs_root *open_ctree_fd(int fp, const char *path, u64 sb_bytenr,
+                                unsigned flags)
 {
-       int fp;
        struct btrfs_fs_info *info;
 
-
-       fp = open(filename, O_RDONLY);
-       if (fp < 0) {
-               fprintf (stderr, "Could not open %s\n", filename);
+       /* These flags may not return an fs_info with any valid root */
+       if (flags & OPEN_CTREE_IGNORE_CHUNK_TREE_ERROR) {
+               error("invalid open_ctree flags: 0x%llx",
+                               (unsigned long long)flags);
                return NULL;
        }
-       info = __open_ctree_fd(fp, filename, sb_bytenr,
-                              root_tree_bytenr, 0, 0);
-       close(fp);
-
+       info = __open_ctree_fd(fp, path, sb_bytenr, 0, 0, flags);
        if (!info)
                return NULL;
+       if (flags & __OPEN_CTREE_RETURN_CHUNK_ROOT)
+               return info->chunk_root;
        return info->fs_root;
 }
 
-struct btrfs_root *open_ctree_fd(int fp, const char *path, u64 sb_bytenr,
-                                int writes)
+/*
+ * Check if the super is valid:
+ * - magic               - BTRFS_MAGIC, or BTRFS_MAGIC_PARTIAL when
+ *                         SBREAD_PARTIAL is set in @sbflags
+ * - checksum            - known algorithm, matches the stored value
+ * - tree levels         - below BTRFS_MAX_LEVEL
+ * - nodesize/sectorsize - minimum, alignment
+ * - tree block starts   - alignment
+ * - number of devices   - something sane
+ * - sys array size      - minimum, maximum
+ *
+ * Returns 0 if every check passes and -EIO otherwise.  A message is
+ * printed for each failed check.
+ */
+static int check_super(struct btrfs_super_block *sb, unsigned sbflags)
 {
-       struct btrfs_fs_info *info;
-       info = __open_ctree_fd(fp, path, sb_bytenr, 0, writes, 0);
-       if (!info)
-               return NULL;
-       return info->fs_root;
+       u8 result[BTRFS_CSUM_SIZE];
+       u32 crc;
+       u16 csum_type;
+       int csum_size;
+
+       /*
+        * Any magic other than BTRFS_MAGIC is rejected.  The only exception
+        * is BTRFS_MAGIC_PARTIAL, and only if the caller explicitly asked
+        * for partially created filesystems.
+        */
+       if (btrfs_super_magic(sb) != BTRFS_MAGIC &&
+           !(btrfs_super_magic(sb) == BTRFS_MAGIC_PARTIAL &&
+             (sbflags & SBREAD_PARTIAL))) {
+               error("superblock magic doesn't match");
+               return -EIO;
+       }
+
+       csum_type = btrfs_super_csum_type(sb);
+       if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
+               error("unsupported checksum algorithm %u", csum_type);
+               return -EIO;
+       }
+       csum_size = btrfs_csum_sizes[csum_type];
+
+       /* The checksum covers everything after the checksum field itself */
+       crc = ~(u32)0;
+       crc = btrfs_csum_data((char *)sb + BTRFS_CSUM_SIZE, crc,
+                             BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+       btrfs_csum_final(crc, result);
+
+       if (memcmp(result, sb->csum, csum_size)) {
+               error("superblock checksum mismatch");
+               return -EIO;
+       }
+       if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
+               error("tree_root level too big: %d >= %d",
+                       btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
+               goto error_out;
+       }
+       if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
+               error("chunk_root level too big: %d >= %d",
+                       btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
+               goto error_out;
+       }
+       if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
+               error("log_root level too big: %d >= %d",
+                       btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
+               goto error_out;
+       }
+
+       if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) {
+               error("tree_root block unaligned: %llu", btrfs_super_root(sb));
+               goto error_out;
+       }
+       if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) {
+               error("chunk_root block unaligned: %llu",
+                       btrfs_super_chunk_root(sb));
+               goto error_out;
+       }
+       if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) {
+               error("log_root block unaligned: %llu",
+                       btrfs_super_log_root(sb));
+               goto error_out;
+       }
+       if (btrfs_super_nodesize(sb) < 4096) {
+               error("nodesize too small: %u < 4096",
+                       btrfs_super_nodesize(sb));
+               goto error_out;
+       }
+       if (!IS_ALIGNED(btrfs_super_nodesize(sb), 4096)) {
+               error("nodesize unaligned: %u", btrfs_super_nodesize(sb));
+               goto error_out;
+       }
+       if (btrfs_super_sectorsize(sb) < 4096) {
+               error("sectorsize too small: %u < 4096",
+                       btrfs_super_sectorsize(sb));
+               goto error_out;
+       }
+       if (!IS_ALIGNED(btrfs_super_sectorsize(sb), 4096)) {
+               error("sectorsize unaligned: %u", btrfs_super_sectorsize(sb));
+               goto error_out;
+       }
+       if (btrfs_super_total_bytes(sb) == 0) {
+               error("invalid total_bytes 0");
+               goto error_out;
+       }
+       if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
+               error("invalid bytes_used %llu", btrfs_super_bytes_used(sb));
+               goto error_out;
+       }
+       if ((btrfs_super_stripesize(sb) != 4096)
+               && (btrfs_super_stripesize(sb) != btrfs_super_sectorsize(sb))) {
+               error("invalid stripesize %u", btrfs_super_stripesize(sb));
+               goto error_out;
+       }
+
+       if (memcmp(sb->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
+               char fsid[BTRFS_UUID_UNPARSED_SIZE];
+               char dev_fsid[BTRFS_UUID_UNPARSED_SIZE];
+
+               uuid_unparse(sb->fsid, fsid);
+               uuid_unparse(sb->dev_item.fsid, dev_fsid);
+               error("dev_item UUID does not match fsid: %s != %s",
+                       dev_fsid, fsid);
+               goto error_out;
+       }
+
+       /*
+        * Hint to catch really bogus numbers, bitflips or so
+        */
+       if (btrfs_super_num_devices(sb) > (1UL << 31)) {
+               warning("suspicious number of devices: %llu",
+                       btrfs_super_num_devices(sb));
+       }
+
+       if (btrfs_super_num_devices(sb) == 0) {
+               error("number of devices is 0");
+               goto error_out;
+       }
+
+       /*
+        * Obvious sys_chunk_array corruptions, it must hold at least one key
+        * and one chunk
+        */
+       if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+               error("system chunk array too big %u > %u",
+                     btrfs_super_sys_array_size(sb),
+                     BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+               goto error_out;
+       }
+       if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+                       + sizeof(struct btrfs_chunk)) {
+               error("system chunk array too small %u < %zu",
+                     btrfs_super_sys_array_size(sb),
+                     sizeof(struct btrfs_disk_key) +
+                     sizeof(struct btrfs_chunk));
+               goto error_out;
+       }
+
+       return 0;
+
+error_out:
+       error("superblock checksum matches but it has invalid members");
+       return -EIO;
 }
 
-int btrfs_read_dev_super(int fd, struct btrfs_super_block *sb, u64 sb_bytenr)
+/*
+ * btrfs_read_dev_super - read a valid superblock from a block device
+ * @fd:                file descriptor of the device
+ * @sb:                buffer where the superblock is going to be read in; it must
+ *             hold BTRFS_SUPER_INFO_SIZE bytes, as that many are copied
+ * @sb_bytenr:  offset of the particular superblock copy we want
+ * @sbflags:   flags controlling how the superblock is read
+ *
+ * This function is used by various btrfs commands to obtain a valid superblock.
+ *
+ * Its mode of operation is controlled by the @sb_bytenr and @sbflags
+ * parameters. If SBREAD_RECOVER flag is set and @sb_bytenr is
+ * BTRFS_SUPER_INFO_OFFSET then the function reads all 3 superblock copies and
+ * returns the newest one. If SBREAD_RECOVER is not set then only a single
+ * copy is read, which one is decided by @sb_bytenr. If @sb_bytenr !=
+ * BTRFS_SUPER_INFO_OFFSET then the @sbflags is effectively ignored and only a
+ * single copy is read.
+ *
+ * Returns 0 on success. With an explicit @sb_bytenr the failures are: -errno
+ * if the read fails, -ENOENT on a short read, -EIO if the bytenr stored in
+ * the superblock differs from @sb_bytenr, or the error from check_super().
+ * With the default offset -1 is returned if no acceptable copy was found.
+ */
+int btrfs_read_dev_super(int fd, struct btrfs_super_block *sb, u64 sb_bytenr,
+                        unsigned sbflags)
 {
        u8 fsid[BTRFS_FSID_SIZE];
        int fsid_is_initialized = 0;
-       struct btrfs_super_block buf;
+       char tmp[BTRFS_SUPER_INFO_SIZE];
+       struct btrfs_super_block *buf = (struct btrfs_super_block *)tmp;
        int i;
        int ret;
+       int max_super = sbflags & SBREAD_RECOVER ? BTRFS_SUPER_MIRROR_MAX : 1;
        u64 transid = 0;
        u64 bytenr;
 
        if (sb_bytenr != BTRFS_SUPER_INFO_OFFSET) {
-               ret = pread64(fd, &buf, sizeof(buf), sb_bytenr);
-               if (ret < sizeof(buf))
-                       return -1;
+               ret = pread64(fd, buf, BTRFS_SUPER_INFO_SIZE, sb_bytenr);
+               /* real error */
+               if (ret < 0)
+                       return -errno;
 
-               if (btrfs_super_bytenr(&buf) != sb_bytenr ||
-                   buf.magic != cpu_to_le64(BTRFS_MAGIC))
-                       return -1;
+               /* Not large enough sb, return -ENOENT instead of normal -EIO */
+               if (ret < BTRFS_SUPER_INFO_SIZE)
+                       return -ENOENT;
+
+               if (btrfs_super_bytenr(buf) != sb_bytenr)
+                       return -EIO;
 
-               memcpy(sb, &buf, sizeof(*sb));
+               ret = check_super(buf, sbflags);
+               if (ret < 0)
+                       return ret;
+               memcpy(sb, buf, BTRFS_SUPER_INFO_SIZE);
                return 0;
        }
 
-       for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+       /*
+       * we would like to check all the supers, but that would make
+       * a btrfs mount succeed after a mkfs from a different FS.
+       * So, we need to add a special mount option to scan for
+       * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+       */
+
+       for (i = 0; i < max_super; i++) {
                bytenr = btrfs_sb_offset(i);
-               ret = pread64(fd, &buf, sizeof(buf), bytenr);
-               if (ret < sizeof(buf))
+               ret = pread64(fd, buf, BTRFS_SUPER_INFO_SIZE, bytenr);
+               if (ret < BTRFS_SUPER_INFO_SIZE)
                        break;
 
-               if (btrfs_super_bytenr(&buf) != bytenr )
+               if (btrfs_super_bytenr(buf) != bytenr )
                        continue;
                /* if magic is NULL, the device was removed */
-               if (buf.magic == 0 && i == 0) 
-                       return -1;
-               if (buf.magic != cpu_to_le64(BTRFS_MAGIC))
+               if (btrfs_super_magic(buf) == 0 && i == 0)
+                       break;
+               if (check_super(buf, sbflags))
                        continue;
 
                if (!fsid_is_initialized) {
-                       memcpy(fsid, buf.fsid, sizeof(fsid));
+                       memcpy(fsid, buf->fsid, sizeof(fsid));
                        fsid_is_initialized = 1;
-               } else if (memcmp(fsid, buf.fsid, sizeof(fsid))) {
+               } else if (memcmp(fsid, buf->fsid, sizeof(fsid))) {
                        /*
                         * the superblocks (the original one and
                         * its backups) contain data of different
@@ -1151,37 +1504,39 @@ int btrfs_read_dev_super(int fd, struct btrfs_super_block *sb, u64 sb_bytenr)
                        continue;
                }
 
-               if (btrfs_super_generation(&buf) > transid) {
-                       memcpy(sb, &buf, sizeof(*sb));
-                       transid = btrfs_super_generation(&buf);
+               /* Keep the copy with the highest generation seen so far */
+               if (btrfs_super_generation(buf) > transid) {
+                       memcpy(sb, buf, BTRFS_SUPER_INFO_SIZE);
+                       transid = btrfs_super_generation(buf);
                }
        }
 
        return transid > 0 ? 0 : -1;
 }
 
-int write_dev_supers(struct btrfs_root *root, struct btrfs_super_block *sb,
-                    struct btrfs_device *device)
+/*
+ * Checksum the superblock @sb and write it to @device.
+ *
+ * When fs_info->super_bytenr is not BTRFS_SUPER_INFO_OFFSET only that one
+ * location is written.  Otherwise the superblock is written once per loop
+ * iteration, each copy stamped with its own bytenr and checksum.
+ *
+ * NOTE(review): the checksum is stored in @sb but the buffer written out is
+ * fs_info->super_copy, so @sb is presumably expected to point at
+ * super_copy - confirm against the callers.
+ *
+ * Returns 0 on success, -EIO if fewer than BTRFS_SUPER_INFO_SIZE bytes
+ * were written, or -errno if the write itself failed.
+ */
+static int write_dev_supers(struct btrfs_fs_info *fs_info,
+                           struct btrfs_super_block *sb,
+                           struct btrfs_device *device)
 {
        u64 bytenr;
        u32 crc;
        int i, ret;
 
-       if (root->fs_info->super_bytenr != BTRFS_SUPER_INFO_OFFSET) {
-               btrfs_set_super_bytenr(sb, root->fs_info->super_bytenr);
+       if (fs_info->super_bytenr != BTRFS_SUPER_INFO_OFFSET) {
+               btrfs_set_super_bytenr(sb, fs_info->super_bytenr);
                crc = ~(u32)0;
-               crc = btrfs_csum_data(NULL, (char *)sb + BTRFS_CSUM_SIZE, crc,
+               crc = btrfs_csum_data((char *)sb + BTRFS_CSUM_SIZE, crc,
                                      BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
-               btrfs_csum_final(crc, (char *)&sb->csum[0]);
+               btrfs_csum_final(crc, &sb->csum[0]);
 
                /*
                 * super_copy is BTRFS_SUPER_INFO_SIZE bytes and is
                 * zero filled, we can use it directly
                 */
-               ret = pwrite64(device->fd, root->fs_info->super_copy,
+               ret = pwrite64(device->fd, fs_info->super_copy,
                                BTRFS_SUPER_INFO_SIZE,
-                               root->fs_info->super_bytenr);
-               BUG_ON(ret != BTRFS_SUPER_INFO_SIZE);
+                               fs_info->super_bytenr);
+               if (ret != BTRFS_SUPER_INFO_SIZE)
+                       goto write_err;
                return 0;
        }
 
@@ -1193,36 +1548,42 @@ int write_dev_supers(struct btrfs_root *root, struct btrfs_super_block *sb,
                btrfs_set_super_bytenr(sb, bytenr);
 
                crc = ~(u32)0;
-               crc = btrfs_csum_data(NULL, (char *)sb + BTRFS_CSUM_SIZE, crc,
+               crc = btrfs_csum_data((char *)sb + BTRFS_CSUM_SIZE, crc,
                                      BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
-               btrfs_csum_final(crc, (char *)&sb->csum[0]);
+               btrfs_csum_final(crc, &sb->csum[0]);
 
                /*
                 * super_copy is BTRFS_SUPER_INFO_SIZE bytes and is
                 * zero filled, we can use it directly
                 */
-               ret = pwrite64(device->fd, root->fs_info->super_copy,
+               ret = pwrite64(device->fd, fs_info->super_copy,
                                BTRFS_SUPER_INFO_SIZE, bytenr);
-               BUG_ON(ret != BTRFS_SUPER_INFO_SIZE);
+               if (ret != BTRFS_SUPER_INFO_SIZE)
+                       goto write_err;
        }
 
        return 0;
+
+write_err:
+       if (ret >= 0) {
+               /*
+                * Short write, possibly of zero bytes: errno is not
+                * meaningful and the byte count must not leak out as a
+                * return value (0 would even read as success).
+                */
+               fprintf(stderr, "WARNING: failed to write all sb data\n");
+               return -EIO;
+       }
+       /* Save errno before fprintf() gets a chance to change it */
+       ret = -errno;
+       fprintf(stderr, "WARNING: failed to write sb: %m\n");
+       return ret;
 }
 
-int write_all_supers(struct btrfs_root *root)
+/*
+ * Write fs_info->super_copy to every writeable device on
+ * fs_info->fs_devices->devices.
+ *
+ * Devices that are not writeable are skipped.  For each remaining device
+ * BTRFS_HEADER_FLAG_WRITTEN is set in the superblock flags before
+ * write_dev_supers() is called.
+ *
+ * A failing write_dev_supers() trips BUG_ON(), so when this function
+ * returns the value is always 0.
+ */
+int write_all_supers(struct btrfs_fs_info *fs_info)
 {
-       struct list_head *cur;
-       struct list_head *head = &root->fs_info->fs_devices->devices;
+       struct list_head *head = &fs_info->fs_devices->devices;
        struct btrfs_device *dev;
        struct btrfs_super_block *sb;
        struct btrfs_dev_item *dev_item;
        int ret;
        u64 flags;
 
-       sb = root->fs_info->super_copy;
+       sb = fs_info->super_copy;
        dev_item = &sb->dev_item;
-       list_for_each(cur, head) {
-               dev = list_entry(cur, struct btrfs_device, dev_list);
+       list_for_each_entry(dev, head, dev_list) {
                if (!dev->writeable)
                        continue;
 
@@ -1240,117 +1601,88 @@ int write_all_supers(struct btrfs_root *root)
                flags = btrfs_super_flags(sb);
                btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
 
-               ret = write_dev_supers(root, sb, dev);
+               ret = write_dev_supers(fs_info, sb, dev);
                BUG_ON(ret);
        }
        return 0;
 }
 
+/*
+ * Update fs_info->super_copy from the current tree root and chunk root and
+ * write it out with write_all_supers().
+ *
+ * The superblock generation is set to @trans->transid.  The start and
+ * level of the tree root node, and the start, level and generation of the
+ * chunk root node, are copied into the superblock.
+ *
+ * Does nothing and returns 0 when fs_info->readonly is set.  Otherwise
+ * returns the result of write_all_supers(), after printing a message if it
+ * is non-zero.
+ */
 int write_ctree_super(struct btrfs_trans_handle *trans,
-                     struct btrfs_root *root)
+                     struct btrfs_fs_info *fs_info)
 {
        int ret;
-       struct btrfs_root *tree_root = root->fs_info->tree_root;
-       struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+       struct btrfs_root *tree_root = fs_info->tree_root;
+       struct btrfs_root *chunk_root = fs_info->chunk_root;
 
-       if (root->fs_info->readonly)
+       if (fs_info->readonly)
                return 0;
 
-       btrfs_set_super_generation(root->fs_info->super_copy,
+       btrfs_set_super_generation(fs_info->super_copy,
                                   trans->transid);
-       btrfs_set_super_root(root->fs_info->super_copy,
+       btrfs_set_super_root(fs_info->super_copy,
                             tree_root->node->start);
-       btrfs_set_super_root_level(root->fs_info->super_copy,
+       btrfs_set_super_root_level(fs_info->super_copy,
                                   btrfs_header_level(tree_root->node));
-       btrfs_set_super_chunk_root(root->fs_info->super_copy,
+       btrfs_set_super_chunk_root(fs_info->super_copy,
                                   chunk_root->node->start);
-       btrfs_set_super_chunk_root_level(root->fs_info->super_copy,
+       btrfs_set_super_chunk_root_level(fs_info->super_copy,
                                         btrfs_header_level(chunk_root->node));
-       btrfs_set_super_chunk_root_generation(root->fs_info->super_copy,
+       btrfs_set_super_chunk_root_generation(fs_info->super_copy,
                                btrfs_header_generation(chunk_root->node));
 
-       ret = write_all_supers(root);
+       ret = write_all_supers(fs_info);
        if (ret)
                fprintf(stderr, "failed to write new super block err %d\n", ret);
        return ret;
 }
 
-static int close_all_devices(struct btrfs_fs_info *fs_info)
-{
-       struct list_head *list;
-       struct list_head *next;
-       struct btrfs_device *device;
-
-       return 0;
-
-       list = &fs_info->fs_devices->devices;
-       list_for_each(next, list) {
-               device = list_entry(next, struct btrfs_device, dev_list);
-               if (device->fd) {
-                       fsync(device->fd);
-                       if (posix_fadvise(device->fd, 0, 0, POSIX_FADV_DONTNEED))
-                               fprintf(stderr, "Warning, could not drop caches\n");
-               }
-               close(device->fd);
-       }
-       return 0;
-}
-
-int close_ctree(struct btrfs_root *root)
+/*
+ * Tear down a filesystem opened through one of the open_ctree variants.
+ *
+ * If fs_info->last_trans_committed differs from fs_info->generation the
+ * pending changes are committed and the superblock is written first.  If
+ * fs_info->finalize_on_close is set, the superblock magic is set to
+ * BTRFS_MAGIC and all superblocks are written again.  After that the block
+ * groups, fs roots, tree roots, devices and caches are released and
+ * @fs_info is handed to btrfs_free_fs_info(), so it must not be used after
+ * this call.
+ *
+ * Returns the error from a failed btrfs_start_transaction() if there was
+ * one, otherwise the result of btrfs_close_devices().
+ */
+int close_ctree_fs_info(struct btrfs_fs_info *fs_info)
 {
        int ret;
+       int err = 0;
        struct btrfs_trans_handle *trans;
-       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_root *root = fs_info->tree_root;
 
        if (fs_info->last_trans_committed !=
            fs_info->generation) {
+               BUG_ON(!root);
                trans = btrfs_start_transaction(root, 1);
+               if (IS_ERR(trans)) {
+                       err = PTR_ERR(trans);
+                       goto skip_commit;
+               }
                btrfs_commit_transaction(trans, root);
                trans = btrfs_start_transaction(root, 1);
+               BUG_ON(IS_ERR(trans));
                ret = commit_tree_roots(trans, fs_info);
                BUG_ON(ret);
                ret = __commit_transaction(trans, root);
                BUG_ON(ret);
-               write_ctree_super(trans, root);
-               btrfs_free_transaction(root, trans);
+               write_ctree_super(trans, fs_info);
+               kfree(trans);
        }
-       btrfs_free_block_groups(fs_info);
 
-       free_fs_roots(fs_info);
-
-       if (fs_info->extent_root->node)
-               free_extent_buffer(fs_info->extent_root->node);
-       if (fs_info->tree_root->node)
-               free_extent_buffer(fs_info->tree_root->node);
-       if (fs_info->chunk_root->node)
-               free_extent_buffer(fs_info->chunk_root->node);
-       if (fs_info->dev_root->node)
-               free_extent_buffer(fs_info->dev_root->node);
-       if (fs_info->csum_root->node)
-               free_extent_buffer(fs_info->csum_root->node);
-
-       if (fs_info->log_root_tree) {
-               if (fs_info->log_root_tree->node)
-                       free_extent_buffer(fs_info->log_root_tree->node);
-               free(fs_info->log_root_tree);
+       if (fs_info->finalize_on_close) {
+               btrfs_set_super_magic(fs_info->super_copy, BTRFS_MAGIC);
+               /*
+                * Clear the flag through fs_info itself: root is only
+                * checked for NULL inside the commit branch above, so it
+                * must not be dereferenced here.
+                */
+               fs_info->finalize_on_close = 0;
+               ret = write_all_supers(fs_info);
+               if (ret)
+                       fprintf(stderr,
+                               "failed to write new super block err %d\n", ret);
        }
 
-       close_all_devices(fs_info);
-       extent_io_tree_cleanup(&fs_info->extent_cache);
-       extent_io_tree_cleanup(&fs_info->free_space_cache);
-       extent_io_tree_cleanup(&fs_info->block_group_cache);
-       extent_io_tree_cleanup(&fs_info->pinned_extents);
-       extent_io_tree_cleanup(&fs_info->pending_del);
-       extent_io_tree_cleanup(&fs_info->extent_ins);
+skip_commit:
+       btrfs_free_block_groups(fs_info);
 
-       free(fs_info->tree_root);
-       free(fs_info->extent_root);
-       free(fs_info->chunk_root);
-       free(fs_info->dev_root);
-       free(fs_info->csum_root);
-       free(fs_info);
+       free_fs_roots_tree(&fs_info->fs_root_tree);
 
-       return 0;
+       btrfs_release_all_roots(fs_info);
+       ret = btrfs_close_devices(fs_info->fs_devices);
+       btrfs_cleanup_all_caches(fs_info);
+       btrfs_free_fs_info(fs_info);
+       /* A failed transaction start takes precedence over a close error */
+       if (!err)
+               err = ret;
+       return err;
 }
 
 int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -1359,12 +1691,6 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        return clear_extent_buffer_dirty(eb);
 }
 
-int wait_on_tree_block_writeback(struct btrfs_root *root,
-                                struct extent_buffer *eb)
-{
-       return 0;
-}
-
 void btrfs_mark_buffer_dirty(struct extent_buffer *eb)
 {
        set_extent_buffer_dirty(eb);