btrfs-progs: build btrsfck to keep compatibility
[platform/upstream/btrfs-progs.git] / disk-io.c
index addebe1..5aa9aa3 100644 (file)
--- a/disk-io.c
+++ b/disk-io.c
 #include "utils.h"
 #include "print-tree.h"
 
+static int close_all_devices(struct btrfs_fs_info *fs_info);
+
 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 {
 
        struct btrfs_fs_devices *fs_devices;
        int ret = 1;
 
-       if (buf->start != btrfs_header_bytenr(buf))
+       if (buf->start != btrfs_header_bytenr(buf)) {
+               printk("Check tree block failed, want=%Lu, have=%Lu\n",
+                      buf->start, btrfs_header_bytenr(buf));
                return ret;
+       }
 
        fs_devices = root->fs_info->fs_devices;
        while (fs_devices) {
@@ -84,9 +89,9 @@ int csum_tree_block_size(struct extent_buffer *buf, u16 csum_size,
 
        if (verify) {
                if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
-                       printk("checksum verify failed on %llu wanted %X "
-                              "found %X\n", (unsigned long long)buf->start,
-                              *((int *)result), *((int *)buf));
+                       printk("checksum verify failed on %llu found %X "
+                              "wanted %X\n", (unsigned long long)buf->start,
+                              *((int *)result), *((char *)buf->data));
                        free(result);
                        return 1;
                }
@@ -123,7 +128,6 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
                         u64 parent_transid)
 {
        int ret;
-       int dev_nr;
        struct extent_buffer *eb;
        u64 length;
        struct btrfs_multi_bio *multi = NULL;
@@ -135,10 +139,9 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
                return 0;
        }
 
-       dev_nr = 0;
        length = blocksize;
        ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-                             bytenr, &length, &multi, 0);
+                             bytenr, &length, &multi, 0, NULL);
        BUG_ON(ret);
        device = multi->stripes[0].dev;
        device->total_ios++;
@@ -149,7 +152,8 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 }
 
 static int verify_parent_transid(struct extent_io_tree *io_tree,
-                                struct extent_buffer *eb, u64 parent_transid)
+                                struct extent_buffer *eb, u64 parent_transid,
+                                int ignore)
 {
        int ret;
 
@@ -165,6 +169,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
               (unsigned long long)eb->start,
               (unsigned long long)parent_transid,
               (unsigned long long)btrfs_header_generation(eb));
+       if (ignore) {
+               printk("Ignoring transid failure\n");
+               return 0;
+       }
+
        ret = 1;
 out:
        clear_extent_buffer_uptodate(io_tree, eb);
@@ -173,17 +182,56 @@ out:
 }
 
 
+static int read_whole_eb(struct btrfs_fs_info *info, struct extent_buffer *eb, int mirror)
+{
+       unsigned long offset = 0;
+       struct btrfs_multi_bio *multi = NULL;
+       struct btrfs_device *device;
+       int ret = 0;
+       u64 read_len;
+       unsigned long bytes_left = eb->len;
+
+       while (bytes_left) {
+               read_len = bytes_left;
+               ret = btrfs_map_block(&info->mapping_tree, READ,
+                                     eb->start + offset, &read_len, &multi,
+                                     mirror, NULL);
+               if (ret) {
+                       printk("Couldn't map the block %Lu\n", eb->start + offset);
+                       return -EIO;
+               }
+               device = multi->stripes[0].dev;
+
+               if (device->fd == 0)
+                       return -EIO;
+
+               eb->fd = device->fd;
+               device->total_ios++;
+               eb->dev_bytenr = multi->stripes[0].physical;
+               kfree(multi);
+
+               if (read_len > bytes_left)
+                       read_len = bytes_left;
+
+               ret = read_extent_from_disk(eb, offset, read_len);
+               if (ret)
+                       return -EIO;
+               offset += read_len;
+               bytes_left -= read_len;
+       }
+       return 0;
+}
+
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
                                     u32 blocksize, u64 parent_transid)
 {
        int ret;
-       int dev_nr;
        struct extent_buffer *eb;
-       u64 length;
-       struct btrfs_multi_bio *multi = NULL;
-       struct btrfs_device *device;
+       u64 best_transid = 0;
        int mirror_num = 0;
+       int good_mirror = 0;
        int num_copies;
+       int ignore = 0;
 
        eb = btrfs_find_create_tree_block(root, bytenr, blocksize);
        if (!eb)
@@ -192,44 +240,193 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
        if (btrfs_buffer_uptodate(eb, parent_transid))
                return eb;
 
-       dev_nr = 0;
-       length = blocksize;
        while (1) {
-               ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-                                     eb->start, &length, &multi, mirror_num);
-               BUG_ON(ret);
-               device = multi->stripes[0].dev;
-               eb->fd = device->fd;
-               device->total_ios++;
-               eb->dev_bytenr = multi->stripes[0].physical;
-               kfree(multi);
-               ret = read_extent_from_disk(eb);
+               ret = read_whole_eb(root->fs_info, eb, mirror_num);
                if (ret == 0 && check_tree_block(root, eb) == 0 &&
                    csum_tree_block(root, eb, 1) == 0 &&
-                   verify_parent_transid(eb->tree, eb, parent_transid) == 0) {
+                   verify_parent_transid(eb->tree, eb, parent_transid, ignore)
+                   == 0) {
                        btrfs_set_buffer_uptodate(eb);
                        return eb;
                }
+               if (ignore) {
+                       if (check_tree_block(root, eb))
+                               printk("read block failed check_tree_block\n");
+                       else
+                               printk("Csum didn't match\n");
+                       break;
+               }
                num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
                                              eb->start, eb->len);
                if (num_copies == 1) {
-                       break;
+                       ignore = 1;
+                       continue;
+               }
+               if (btrfs_header_generation(eb) > best_transid) {
+                       best_transid = btrfs_header_generation(eb);
+                       good_mirror = mirror_num;
                }
                mirror_num++;
                if (mirror_num > num_copies) {
-                       break;
+                       mirror_num = good_mirror;
+                       ignore = 1;
+                       continue;
                }
        }
        free_extent_buffer(eb);
        return NULL;
 }
 
+static int rmw_eb(struct btrfs_fs_info *info,
+                 struct extent_buffer *eb, struct extent_buffer *orig_eb)
+{
+       int ret;
+       unsigned long orig_off = 0;
+       unsigned long dest_off = 0;
+       unsigned long copy_len = eb->len;
+
+       ret = read_whole_eb(info, eb, 0);
+       if (ret)
+               return ret;
+
+       if (eb->start + eb->len <= orig_eb->start ||
+           eb->start >= orig_eb->start + orig_eb->len)
+               return 0;
+       /*
+        * | ----- orig_eb ------- |
+        *         | ----- stripe -------  |
+        *         | ----- orig_eb ------- |
+        *              | ----- orig_eb ------- |
+        */
+       if (eb->start > orig_eb->start)
+               orig_off = eb->start - orig_eb->start;
+       if (orig_eb->start > eb->start)
+               dest_off = orig_eb->start - eb->start;
+
+       if (copy_len > orig_eb->len - orig_off)
+               copy_len = orig_eb->len - orig_off;
+       if (copy_len > eb->len - dest_off)
+               copy_len = eb->len - dest_off;
+
+       memcpy(eb->data + dest_off, orig_eb->data + orig_off, copy_len);
+       return 0;
+}
+
+static void split_eb_for_raid56(struct btrfs_fs_info *info,
+                               struct extent_buffer *orig_eb,
+                              struct extent_buffer **ebs,
+                              u64 stripe_len, u64 *raid_map,
+                              int num_stripes)
+{
+       struct extent_buffer *eb;
+       u64 start = orig_eb->start;
+       u64 this_eb_start;
+       int i;
+       int ret;
+
+       for (i = 0; i < num_stripes; i++) {
+               if (raid_map[i] >= BTRFS_RAID5_P_STRIPE)
+                       break;
+
+               eb = malloc(sizeof(struct extent_buffer) + stripe_len);
+               if (!eb)
+                       BUG();
+               memset(eb, 0, sizeof(struct extent_buffer) + stripe_len);
+
+               eb->start = raid_map[i];
+               eb->len = stripe_len;
+               eb->refs = 1;
+               eb->flags = 0;
+               eb->fd = -1;
+               eb->dev_bytenr = (u64)-1;
+
+               this_eb_start = raid_map[i];
+
+               if (start > this_eb_start ||
+                   start + orig_eb->len < this_eb_start + stripe_len) {
+                       ret = rmw_eb(info, eb, orig_eb);
+                       BUG_ON(ret);
+               } else {
+                       memcpy(eb->data, orig_eb->data + eb->start - start, stripe_len);
+               }
+               ebs[i] = eb;
+       }
+}
+
+static int write_raid56_with_parity(struct btrfs_fs_info *info,
+                                   struct extent_buffer *eb,
+                                   struct btrfs_multi_bio *multi,
+                                   u64 stripe_len, u64 *raid_map)
+{
+       struct extent_buffer *ebs[multi->num_stripes], *p_eb = NULL, *q_eb = NULL;
+       int i;
+       int j;
+       int ret;
+       int alloc_size = eb->len;
+
+       if (stripe_len > alloc_size)
+               alloc_size = stripe_len;
+
+       split_eb_for_raid56(info, eb, ebs, stripe_len, raid_map,
+                           multi->num_stripes);
+
+       for (i = 0; i < multi->num_stripes; i++) {
+               struct extent_buffer *new_eb;
+               if (raid_map[i] < BTRFS_RAID5_P_STRIPE) {
+                       ebs[i]->dev_bytenr = multi->stripes[i].physical;
+                       ebs[i]->fd = multi->stripes[i].dev->fd;
+                       multi->stripes[i].dev->total_ios++;
+                       BUG_ON(ebs[i]->start != raid_map[i]);
+                       continue;
+               }
+               new_eb = kmalloc(sizeof(*eb) + alloc_size, GFP_NOFS);
+               BUG_ON(!new_eb);
+               new_eb->dev_bytenr = multi->stripes[i].physical;
+               new_eb->fd = multi->stripes[i].dev->fd;
+               multi->stripes[i].dev->total_ios++;
+               new_eb->len = stripe_len;
+
+               if (raid_map[i] == BTRFS_RAID5_P_STRIPE)
+                       p_eb = new_eb;
+               else if (raid_map[i] == BTRFS_RAID6_Q_STRIPE)
+                       q_eb = new_eb;
+       }
+       if (q_eb) {
+               void *pointers[multi->num_stripes];
+               ebs[multi->num_stripes - 2] = p_eb;
+               ebs[multi->num_stripes - 1] = q_eb;
+
+               for (i = 0; i < multi->num_stripes; i++)
+                       pointers[i] = ebs[i]->data;
+
+               raid6_gen_syndrome(multi->num_stripes, stripe_len, pointers);
+       } else {
+               ebs[multi->num_stripes - 1] = p_eb;
+               memcpy(p_eb->data, ebs[0]->data, stripe_len);
+               for (j = 1; j < multi->num_stripes - 1; j++) {
+                       for (i = 0; i < stripe_len; i += sizeof(unsigned long)) {
+                               *(unsigned long *)(p_eb->data + i) ^=
+                                       *(unsigned long *)(ebs[j]->data + i);
+                       }
+               }
+       }
+
+       for (i = 0; i < multi->num_stripes; i++) {
+               ret = write_extent_to_disk(ebs[i]);
+               BUG_ON(ret);
+               if (ebs[i] != eb)
+                       kfree(ebs[i]);
+       }
+       return 0;
+}
+
 int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                     struct extent_buffer *eb)
 {
        int ret;
        int dev_nr;
        u64 length;
+       u64 *raid_map = NULL;
        struct btrfs_multi_bio *multi = NULL;
 
        if (check_tree_block(root, eb))
@@ -243,9 +440,13 @@ int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        dev_nr = 0;
        length = eb->len;
        ret = btrfs_map_block(&root->fs_info->mapping_tree, WRITE,
-                             eb->start, &length, &multi, 0);
+                             eb->start, &length, &multi, 0, &raid_map);
 
-       while(dev_nr < multi->num_stripes) {
+       if (raid_map) {
+               ret = write_raid56_with_parity(root->fs_info, eb, multi,
+                                              length, raid_map);
+               BUG_ON(ret);
+       } else while (dev_nr < multi->num_stripes) {
                BUG_ON(ret);
                eb->fd = multi->stripes[dev_nr].dev->fd;
                eb->dev_bytenr = multi->stripes[dev_nr].physical;
@@ -258,7 +459,7 @@ int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        return 0;
 }
 
-static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
+int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
                        u32 stripesize, struct btrfs_root *root,
                        struct btrfs_fs_info *fs_info, u64 objectid)
 {
@@ -316,14 +517,17 @@ static int commit_tree_roots(struct btrfs_trans_handle *trans,
        struct btrfs_root *root;
        struct list_head *next;
        struct extent_buffer *eb;
+       int ret;
 
        if (fs_info->readonly)
                return 0;
 
        eb = fs_info->tree_root->node;
        extent_buffer_get(eb);
-       btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
+       ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
        free_extent_buffer(eb);
+       if (ret)
+               return ret;
 
        while(!list_empty(&fs_info->dirty_cowonly_roots)) {
                next = fs_info->dirty_cowonly_roots.next;
@@ -364,6 +568,7 @@ static int __commit_transaction(struct btrfs_trans_handle *trans,
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
 {
+       u64 transid = trans->transid;
        int ret = 0;
        struct btrfs_fs_info *fs_info = root->fs_info;
 
@@ -391,6 +596,7 @@ commit_tree:
        free_extent_buffer(root->commit_root);
        root->commit_root = NULL;
        fs_info->running_transaction = NULL;
+       fs_info->last_trans_committed = transid;
        return 0;
 }
 
@@ -407,13 +613,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
                     root, fs_info, objectid);
        ret = btrfs_find_last_root(tree_root, objectid,
                                   &root->root_item, &root->root_key);
-       BUG_ON(ret);
+       if (ret)
+               return ret;
 
        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
        generation = btrfs_root_generation(&root->root_item);
        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                                     blocksize, generation);
-       BUG_ON(!root->node);
+       if (!extent_buffer_uptodate(root->node))
+               return -EIO;
+
        return 0;
 }
 
@@ -425,8 +634,13 @@ static int find_and_setup_log_root(struct btrfs_root *tree_root,
        u64 blocknr = btrfs_super_log_root(disk_super);
        struct btrfs_root *log_root = malloc(sizeof(struct btrfs_root));
 
-       if (blocknr == 0)
+       if (!log_root)
+               return -ENOMEM;
+
+       if (blocknr == 0) {
+               free(log_root);
                return 0;
+       }
 
        blocksize = btrfs_level_size(tree_root,
                             btrfs_super_log_root_level(disk_super));
@@ -440,7 +654,13 @@ static int find_and_setup_log_root(struct btrfs_root *tree_root,
                                     btrfs_super_generation(disk_super) + 1);
 
        fs_info->log_root_tree = log_root;
-       BUG_ON(!log_root->node);
+
+       if (!extent_buffer_uptodate(log_root->node)) {
+               free(log_root);
+               return -EIO;
+       }
+
+       free(log_root);
        return 0;
 }
 
@@ -549,7 +769,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
                return fs_info->dev_root;
        if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
                return fs_info->csum_root;
-       
+
        BUG_ON(location->objectid == BTRFS_TREE_RELOC_OBJECTID ||
               location->offset != (u64)-1);
 
@@ -570,28 +790,10 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
        return root;
 }
 
-struct btrfs_root *open_ctree(const char *filename, u64 sb_bytenr, int writes)
-{
-       int fp;
-       struct btrfs_root *root;
-       int flags = O_CREAT | O_RDWR;
-
-       if (!writes)
-               flags = O_RDONLY;
-
-       fp = open(filename, flags, 0600);
-       if (fp < 0) {
-               fprintf (stderr, "Could not open %s\n", filename);
-               return NULL;
-       }
-       root = open_ctree_fd(fp, filename, sb_bytenr, writes);
-       close(fp);
-
-       return root;
-}
-
-struct btrfs_root *open_ctree_fd(int fp, const char *path, u64 sb_bytenr,
-                                int writes)
+static struct btrfs_fs_info *__open_ctree_fd(int fp, const char *path,
+                                            u64 sb_bytenr,
+                                            u64 root_tree_bytenr, int writes,
+                                            int partial)
 {
        u32 sectorsize;
        u32 nodesize;
@@ -615,17 +817,21 @@ struct btrfs_root *open_ctree_fd(int fp, const char *path, u64 sb_bytenr,
        if (sb_bytenr == 0)
                sb_bytenr = BTRFS_SUPER_INFO_OFFSET;
 
+       /* try to drop all the caches */
+       posix_fadvise(fp, 0, 0, POSIX_FADV_DONTNEED);
+
        ret = btrfs_scan_one_device(fp, path, &fs_devices,
                                    &total_devs, sb_bytenr);
 
        if (ret) {
                fprintf(stderr, "No valid Btrfs found on %s\n", path);
-               return NULL;
+               goto out;
        }
 
        if (total_devs != 1) {
                ret = btrfs_scan_for_fsid(fs_devices, total_devs, 1);
-               BUG_ON(ret);
+               if (ret)
+                       goto out;
        }
 
        memset(fs_info, 0, sizeof(*fs_info));
@@ -660,7 +866,8 @@ struct btrfs_root *open_ctree_fd(int fp, const char *path, u64 sb_bytenr,
                ret = btrfs_open_devices(fs_devices, O_RDWR);
        else
                ret = btrfs_open_devices(fs_devices, O_RDONLY);
-       BUG_ON(ret);
+       if (ret)
+               goto out_cleanup;
 
        fs_info->super_bytenr = sb_bytenr;
        disk_super = &fs_info->super_copy;
@@ -668,7 +875,7 @@ struct btrfs_root *open_ctree_fd(int fp, const char *path, u64 sb_bytenr,
                                   disk_super, sb_bytenr);
        if (ret) {
                printk("No valid btrfs found\n");
-               BUG_ON(1);
+               goto out_devices;
        }
 
        memcpy(fs_info->fsid, &disk_super->fsid, BTRFS_FSID_SIZE);
@@ -678,8 +885,9 @@ struct btrfs_root *open_ctree_fd(int fp, const char *path, u64 sb_bytenr,
                   ~BTRFS_FEATURE_INCOMPAT_SUPP;
        if (features) {
                printk("couldn't open because of unsupported "
-                      "option features (%Lx).\n", features);
-               BUG_ON(1);
+                      "option features (%Lx).\n",
+                      (unsigned long long)features);
+               goto out_devices;
        }
 
        features = btrfs_super_incompat_flags(disk_super);
@@ -692,8 +900,9 @@ struct btrfs_root *open_ctree_fd(int fp, const char *path, u64 sb_bytenr,
                ~BTRFS_FEATURE_COMPAT_RO_SUPP;
        if (writes && features) {
                printk("couldn't open RDWR because of unsupported "
-                      "option features (%Lx).\n", features);
-               BUG_ON(1);
+                      "option features (%Lx).\n",
+                      (unsigned long long)features);
+               goto out_devices;
        }
 
        nodesize = btrfs_super_nodesize(disk_super);
@@ -706,7 +915,8 @@ struct btrfs_root *open_ctree_fd(int fp, const char *path, u64 sb_bytenr,
        tree_root->stripesize = stripesize;
 
        ret = btrfs_read_sys_array(tree_root);
-       BUG_ON(ret);
+       if (ret)
+               goto out_devices;
        blocksize = btrfs_level_size(tree_root,
                                     btrfs_super_chunk_root_level(disk_super));
        generation = btrfs_super_chunk_root_generation(disk_super);
@@ -717,8 +927,10 @@ struct btrfs_root *open_ctree_fd(int fp, const char *path, u64 sb_bytenr,
        chunk_root->node = read_tree_block(chunk_root,
                                           btrfs_super_chunk_root(disk_super),
                                           blocksize, generation);
-
-       BUG_ON(!chunk_root->node);
+       if (!extent_buffer_uptodate(chunk_root->node)) {
+               printk("Couldn't read chunk root\n");
+               goto out_devices;
+       }
 
        read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
                 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
@@ -726,37 +938,52 @@ struct btrfs_root *open_ctree_fd(int fp, const char *path, u64 sb_bytenr,
 
        if (!(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_METADUMP)) {
                ret = btrfs_read_chunk_tree(chunk_root);
-               BUG_ON(ret);
+               if (ret)
+                       goto out_failed;
        }
 
        blocksize = btrfs_level_size(tree_root,
                                     btrfs_super_root_level(disk_super));
        generation = btrfs_super_generation(disk_super);
 
+       if (!root_tree_bytenr)
+               root_tree_bytenr = btrfs_super_root(disk_super);
        tree_root->node = read_tree_block(tree_root,
-                                         btrfs_super_root(disk_super),
+                                         root_tree_bytenr,
                                          blocksize, generation);
-       BUG_ON(!tree_root->node);
+       if (!extent_buffer_uptodate(tree_root->node)) {
+               printk("Couldn't read tree root\n");
+               goto out_failed;
+       }
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
-       BUG_ON(ret);
+       if (ret) {
+               printk("Couldn't setup extent tree\n");
+               goto out_failed;
+       }
        extent_root->track_dirty = 1;
 
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_DEV_TREE_OBJECTID, dev_root);
-       BUG_ON(ret);
+       if (ret) {
+               printk("Couldn't setup device tree\n");
+               goto out_failed;
+       }
        dev_root->track_dirty = 1;
 
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_CSUM_TREE_OBJECTID, csum_root);
-       BUG_ON(ret);
+       if (ret) {
+               printk("Couldn't setup csum tree\n");
+               if (!partial)
+                       goto out_failed;
+       }
        csum_root->track_dirty = 1;
 
-       BUG_ON(ret);
-
        find_and_setup_log_root(tree_root, fs_info, disk_super);
 
-       fs_info->generation = generation + 1;
+       fs_info->generation = generation;
+       fs_info->last_trans_committed = generation;
        btrfs_read_block_groups(fs_info->tree_root);
 
        key.objectid = BTRFS_FS_TREE_OBJECTID;
@@ -764,16 +991,114 @@ struct btrfs_root *open_ctree_fd(int fp, const char *path, u64 sb_bytenr,
        key.offset = (u64)-1;
        fs_info->fs_root = btrfs_read_fs_root(fs_info, &key);
 
+       if (!fs_info->fs_root)
+               goto out_failed;
+
        fs_info->data_alloc_profile = (u64)-1;
        fs_info->metadata_alloc_profile = (u64)-1;
        fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 
-       return fs_info->fs_root;
+       return fs_info;
+
+out_failed:
+       if (partial)
+               return fs_info;
+
+       if (fs_info->csum_root)
+               free_extent_buffer(fs_info->csum_root->node);
+       if (fs_info->dev_root)
+               free_extent_buffer(fs_info->dev_root->node);
+       if (fs_info->extent_root)
+               free_extent_buffer(fs_info->extent_root->node);
+       if (fs_info->tree_root)
+               free_extent_buffer(fs_info->tree_root->node);
+       if (fs_info->chunk_root)
+               free_extent_buffer(fs_info->chunk_root->node);
+out_devices:
+       close_all_devices(fs_info);
+out_cleanup:
+       extent_io_tree_cleanup(&fs_info->extent_cache);
+       extent_io_tree_cleanup(&fs_info->free_space_cache);
+       extent_io_tree_cleanup(&fs_info->block_group_cache);
+       extent_io_tree_cleanup(&fs_info->pinned_extents);
+       extent_io_tree_cleanup(&fs_info->pending_del);
+       extent_io_tree_cleanup(&fs_info->extent_ins);
+out:
+       free(tree_root);
+       free(extent_root);
+       free(chunk_root);
+       free(dev_root);
+       free(csum_root);
+       free(fs_info);
+       return NULL;
+}
+
+struct btrfs_fs_info *open_ctree_fs_info(const char *filename,
+                                        u64 sb_bytenr, int writes,
+                                        int partial)
+{
+       int fp;
+       struct btrfs_fs_info *info;
+       int flags = O_CREAT | O_RDWR;
+
+       if (!writes)
+               flags = O_RDONLY;
+
+       fp = open(filename, flags, 0600);
+       if (fp < 0) {
+               fprintf (stderr, "Could not open %s\n", filename);
+               return NULL;
+       }
+       info = __open_ctree_fd(fp, filename, sb_bytenr, 0, writes, partial);
+       close(fp);
+       return info;
+}
+
+struct btrfs_root *open_ctree(const char *filename, u64 sb_bytenr, int writes)
+{
+       struct btrfs_fs_info *info;
+
+       info = open_ctree_fs_info(filename, sb_bytenr, writes, 0);
+       if (!info)
+               return NULL;
+       return info->fs_root;
+}
+
+struct btrfs_root *open_ctree_recovery(const char *filename, u64 sb_bytenr,
+                                      u64 root_tree_bytenr)
+{
+       int fp;
+       struct btrfs_fs_info *info;
+
+
+       fp = open(filename, O_RDONLY);
+       if (fp < 0) {
+               fprintf (stderr, "Could not open %s\n", filename);
+               return NULL;
+       }
+       info = __open_ctree_fd(fp, filename, sb_bytenr,
+                              root_tree_bytenr, 0, 0);
+       close(fp);
+
+       if (!info)
+               return NULL;
+       return info->fs_root;
+}
+
+struct btrfs_root *open_ctree_fd(int fp, const char *path, u64 sb_bytenr,
+                                int writes)
+{
+       struct btrfs_fs_info *info;
+       info = __open_ctree_fd(fp, path, sb_bytenr, 0, writes, 0);
+       if (!info)
+               return NULL;
+       return info->fs_root;
 }
 
 int btrfs_read_dev_super(int fd, struct btrfs_super_block *sb, u64 sb_bytenr)
 {
        u8 fsid[BTRFS_FSID_SIZE];
+       int fsid_is_initialized = 0;
        struct btrfs_super_block buf;
        int i;
        int ret;
@@ -786,8 +1111,7 @@ int btrfs_read_dev_super(int fd, struct btrfs_super_block *sb, u64 sb_bytenr)
                        return -1;
 
                if (btrfs_super_bytenr(&buf) != sb_bytenr ||
-                   strncmp((char *)(&buf.magic), BTRFS_MAGIC,
-                           sizeof(buf.magic)))
+                   buf.magic != cpu_to_le64(BTRFS_MAGIC))
                        return -1;
 
                memcpy(sb, &buf, sizeof(*sb));
@@ -800,15 +1124,25 @@ int btrfs_read_dev_super(int fd, struct btrfs_super_block *sb, u64 sb_bytenr)
                if (ret < sizeof(buf))
                        break;
 
-               if (btrfs_super_bytenr(&buf) != bytenr ||
-                   strncmp((char *)(&buf.magic), BTRFS_MAGIC,
-                           sizeof(buf.magic)))
+               if (btrfs_super_bytenr(&buf) != bytenr )
+                       continue;
+               /* if magic is NULL, the device was removed */
+               if (buf.magic == 0 && i == 0) 
+                       return -1;
+               if (buf.magic != cpu_to_le64(BTRFS_MAGIC))
                        continue;
 
-               if (i == 0)
+               if (!fsid_is_initialized) {
                        memcpy(fsid, buf.fsid, sizeof(fsid));
-               else if (memcmp(fsid, buf.fsid, sizeof(fsid)))
+                       fsid_is_initialized = 1;
+               } else if (memcmp(fsid, buf.fsid, sizeof(fsid))) {
+                       /*
+                        * the superblocks (the original one and
+                        * its backups) contain data of different
+                        * filesystems -> the super cannot be trusted
+                        */
                        continue;
+               }
 
                if (btrfs_super_generation(&buf) > transid) {
                        memcpy(sb, &buf, sizeof(*sb));
@@ -825,24 +1159,28 @@ int write_dev_supers(struct btrfs_root *root, struct btrfs_super_block *sb,
        u64 bytenr;
        u32 crc;
        int i, ret;
+       void *buf;
+
+       buf = calloc(1, BTRFS_SUPER_INFO_SIZE);
+       BUG_ON(!buf);
 
        if (root->fs_info->super_bytenr != BTRFS_SUPER_INFO_OFFSET) {
                btrfs_set_super_bytenr(sb, root->fs_info->super_bytenr);
-
                crc = ~(u32)0;
                crc = btrfs_csum_data(NULL, (char *)sb + BTRFS_CSUM_SIZE, crc,
                                      BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
                btrfs_csum_final(crc, (char *)&sb->csum[0]);
 
-               ret = pwrite64(device->fd, sb, BTRFS_SUPER_INFO_SIZE,
+               memcpy(buf, sb, sizeof(*sb));
+               ret = pwrite64(device->fd, buf, BTRFS_SUPER_INFO_SIZE,
                               root->fs_info->super_bytenr);
                BUG_ON(ret != BTRFS_SUPER_INFO_SIZE);
-               return 0;
+               goto out;
        }
 
        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
-               if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+               if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
                        break;
 
                btrfs_set_super_bytenr(sb, bytenr);
@@ -852,9 +1190,12 @@ int write_dev_supers(struct btrfs_root *root, struct btrfs_super_block *sb,
                                      BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
                btrfs_csum_final(crc, (char *)&sb->csum[0]);
 
-               ret = pwrite64(device->fd, sb, BTRFS_SUPER_INFO_SIZE, bytenr);
+               memcpy(buf, sb, sizeof(*sb));
+               ret = pwrite64(device->fd, buf, BTRFS_SUPER_INFO_SIZE, bytenr);
                BUG_ON(ret != BTRFS_SUPER_INFO_SIZE);
        }
+out:
+       free(buf);
        return 0;
 }
 
@@ -935,6 +1276,10 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
        list = &fs_info->fs_devices->devices;
        list_for_each(next, list) {
                device = list_entry(next, struct btrfs_device, dev_list);
+               if (device->fd) {
+                       fsync(device->fd);
+                       posix_fadvise(device->fd, 0, 0, POSIX_FADV_DONTNEED);
+               }
                close(device->fd);
        }
        return 0;
@@ -946,15 +1291,18 @@ int close_ctree(struct btrfs_root *root)
        struct btrfs_trans_handle *trans;
        struct btrfs_fs_info *fs_info = root->fs_info;
 
-       trans = btrfs_start_transaction(root, 1);
-       btrfs_commit_transaction(trans, root);
-       trans = btrfs_start_transaction(root, 1);
-       ret = commit_tree_roots(trans, fs_info);
-       BUG_ON(ret);
-       ret = __commit_transaction(trans, root);
-       BUG_ON(ret);
-       write_ctree_super(trans, root);
-       btrfs_free_transaction(root, trans);
+       if (fs_info->last_trans_committed !=
+           fs_info->generation) {
+               trans = btrfs_start_transaction(root, 1);
+               btrfs_commit_transaction(trans, root);
+               trans = btrfs_start_transaction(root, 1);
+               ret = commit_tree_roots(trans, fs_info);
+               BUG_ON(ret);
+               ret = __commit_transaction(trans, root);
+               BUG_ON(ret);
+               write_ctree_super(trans, root);
+               btrfs_free_transaction(root, trans);
+       }
        btrfs_free_block_groups(fs_info);
 
        free_fs_roots(fs_info);
@@ -970,13 +1318,13 @@ int close_ctree(struct btrfs_root *root)
        if (fs_info->csum_root->node)
                free_extent_buffer(fs_info->csum_root->node);
 
-       if (root->fs_info->log_root_tree) {
-               if (root->fs_info->log_root_tree->node)
-                       free_extent_buffer(root->fs_info->log_root_tree->node);
-               free(root->fs_info->log_root_tree);
+       if (fs_info->log_root_tree) {
+               if (fs_info->log_root_tree->node)
+                       free_extent_buffer(fs_info->log_root_tree->node);
+               free(fs_info->log_root_tree);
        }
 
-       close_all_devices(root->fs_info);
+       close_all_devices(fs_info);
        extent_io_tree_cleanup(&fs_info->extent_cache);
        extent_io_tree_cleanup(&fs_info->free_space_cache);
        extent_io_tree_cleanup(&fs_info->block_group_cache);
@@ -1019,7 +1367,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
        if (!ret)
                return ret;
 
-       ret = verify_parent_transid(buf->tree, buf, parent_transid);
+       ret = verify_parent_transid(buf->tree, buf, parent_transid, 1);
        return !ret;
 }