#include <linux/kobject.h>
#include <trace/events/btrfs.h>
#include <asm/kmap_types.h>
+#include <linux/pagemap.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
#define BTRFS_LABEL_SIZE 256
/*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+ __le64 tree_root;
+ __le64 tree_root_gen;
+
+ __le64 chunk_root;
+ __le64 chunk_root_gen;
+
+ __le64 extent_root;
+ __le64 extent_root_gen;
+
+ __le64 fs_root;
+ __le64 fs_root_gen;
+
+ __le64 dev_root;
+ __le64 dev_root_gen;
+
+ __le64 csum_root;
+ __le64 csum_root_gen;
+
+ __le64 total_bytes;
+ __le64 bytes_used;
+ __le64 num_devices;
+ /* future */
+ __le64 unused_64[4];
+
+ u8 tree_root_level;
+ u8 chunk_root_level;
+ u8 extent_root_level;
+ u8 fs_root_level;
+ u8 dev_root_level;
+ u8 csum_root_level;
+ /* future and to align */
+ u8 unused_8[10];
+} __attribute__ ((__packed__));
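/*
 * A minimal userspace sketch (not part of the patch) mirroring the field
 * sizes above, just to make the on-disk footprint concrete: each backup slot
 * is 168 bytes, so BTRFS_NUM_BACKUP_ROOTS slots consume 672 bytes of the
 * 4096-byte super block (the 4096 figure is an assumption based on
 * BTRFS_SUPER_INFO_SIZE, not taken from this hunk).  The struct name and
 * field grouping below are illustrative only.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_BACKUP_ROOTS 4

struct root_backup_mirror {
	uint64_t roots_and_gens[12];	/* six (bytenr, generation) pairs */
	uint64_t total_bytes;
	uint64_t bytes_used;
	uint64_t num_devices;
	uint64_t unused_64[4];
	uint8_t  levels[6];		/* one level per tree */
	uint8_t  unused_8[10];
} __attribute__ ((__packed__));

int main(void)
{
	/* 15 * 8 + 4 * 8 + 6 + 10 = 168 bytes per slot */
	assert(sizeof(struct root_backup_mirror) == 168);
	printf("backup array: %zu bytes\n",
	       (size_t)NUM_BACKUP_ROOTS * sizeof(struct root_backup_mirror));
	return 0;
}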
+
+/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
*/
/* future expansion */
__le64 reserved[31];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+ struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
} __attribute__ ((__packed__));
/*
struct btrfs_block_rsv {
u64 size;
u64 reserved;
- u64 freed[2];
struct btrfs_space_info *space_info;
- struct list_head list;
spinlock_t lock;
- atomic_t usage;
- unsigned int priority:8;
- unsigned int durable:1;
- unsigned int refill_used:1;
unsigned int full:1;
};
spinlock_t lock;
u64 pinned;
u64 reserved;
- u64 reserved_pinned;
u64 bytes_super;
u64 flags;
u64 sectorsize;
+ u64 cache_generation;
unsigned int ro:1;
unsigned int dirty:1;
unsigned int iref:1;
spinlock_t block_group_cache_lock;
struct rb_root block_group_cache_tree;
+ /* keep track of unallocated space */
+ spinlock_t free_chunk_lock;
+ u64 free_chunk_space;
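/*
 * free_chunk_space counts the bytes on writable devices that are not yet
 * allocated to any chunk: the hunks later in this series grow it when a
 * device is added or a dev extent is freed, and shrink it when a chunk is
 * allocated or a device is shrunk, always under free_chunk_lock.
 */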
+
struct extent_io_tree freed_extents[2];
struct extent_io_tree *pinned_extents;
struct btrfs_block_rsv trans_block_rsv;
/* block reservation for chunk tree */
struct btrfs_block_rsv chunk_block_rsv;
+ /* block reservation for delayed operations */
+ struct btrfs_block_rsv delayed_block_rsv;
struct btrfs_block_rsv empty_block_rsv;
- /* list of block reservations that cross multiple transactions */
- struct list_head durable_block_rsv_list;
-
- struct mutex durable_block_rsv_mutex;
-
u64 generation;
u64 last_trans_committed;
wait_queue_head_t transaction_blocked_wait;
wait_queue_head_t async_submit_wait;
- struct btrfs_super_block super_copy;
- struct btrfs_super_block super_for_commit;
+ struct btrfs_super_block *super_copy;
+ struct btrfs_super_block *super_for_commit;
struct block_device *__bdev;
struct super_block *sb;
struct inode *btree_inode;
struct btrfs_workers endio_freespace_worker;
struct btrfs_workers submit_workers;
struct btrfs_workers caching_workers;
+ struct btrfs_workers readahead_workers;
/*
* fixup workers take dirty pages that didn't properly go through
struct btrfs_delayed_root *delayed_root;
+ /* readahead tree */
+ spinlock_t reada_lock;
+ struct radix_tree_root reada_tree;
+
+ /* next backup root to be overwritten */
+ int backup_root_index;
};
/*
#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
+#define BTRFS_MOUNT_RECOVERY (1 << 18)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
}
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+ tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+ tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+ tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+ chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+ chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+ extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+ extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+ extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+ fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+ fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+ fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+ dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+ dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+ dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+ csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+ csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+ csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+ num_devices, 64);
+
/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
}
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+ return mapping_gfp_mask(mapping) & ~__GFP_FS;
+}
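/*
 * Usage sketch for the helper above (illustrative, not a hunk from this
 * patch): callers that allocate page cache pages while holding fs locks can
 * pass the stripped mask so that memory reclaim triggered by the allocation
 * cannot re-enter the filesystem:
 *
 *	page = find_or_create_page(mapping, index,
 *				   btrfs_alloc_write_mask(mapping));
 */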
+
/* extent-tree.c */
static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
unsigned num_items)
3 * num_items;
}
+/*
+ * Doing a truncate won't result in new nodes or leaves, just what we need for
+ * COW.
+ */
+static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
+ unsigned num_items)
+{
+ return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ num_items;
+}
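/*
 * A worked example of the two reservation formulas as a standalone userspace
 * sketch.  The 4K leaf/node size and the max tree level of 8 are typical
 * values and are assumptions here; only the formulas come from the code
 * above.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_LEVEL 8	/* assumed value of BTRFS_MAX_LEVEL */

static uint64_t trans_metadata_size(uint64_t leafsize, uint64_t nodesize,
				    unsigned int num_items)
{
	/* normal insert/update path: COW plus headroom, hence the "* 3" */
	return (leafsize + nodesize * (MAX_LEVEL - 1)) * 3 * num_items;
}

static uint64_t trunc_metadata_size(uint64_t leafsize, uint64_t nodesize,
				    unsigned int num_items)
{
	/* truncate only COWs the existing path, no new nodes or leaves */
	return (leafsize + nodesize * (MAX_LEVEL - 1)) * num_items;
}

int main(void)
{
	/* 4K nodes: 32KiB per item for truncate vs 96KiB for a normal op */
	printf("trunc: %llu bytes\n",
	       (unsigned long long)trunc_metadata_size(4096, 4096, 1));
	printf("trans: %llu bytes\n",
	       (unsigned long long)trans_metadata_size(4096, 4096, 1));
	return 0;
}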
+
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
u64 num_bytes, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr);
u64 root_objectid, u64 owner, u64 offset);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve, int sinfo);
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len);
int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes);
+int btrfs_block_rsv_check(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
- u64 min_reserved, int min_factor);
+ u64 min_reserved);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes);
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *rsv);
int btrfs_set_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
int btrfs_set_block_group_rw(struct btrfs_root *root,
smp_mb();
return fs_info->closing;
}
+static inline void free_fs_info(struct btrfs_fs_info *fs_info)
+{
+ kfree(fs_info->delayed_root);
+ kfree(fs_info->extent_root);
+ kfree(fs_info->tree_root);
+ kfree(fs_info->chunk_root);
+ kfree(fs_info->dev_root);
+ kfree(fs_info->csum_root);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->super_for_commit);
+ kfree(fs_info);
+}
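/*
 * free_fs_info() consolidates the per-root kfree() calls that later hunks in
 * this patch remove from the open_ctree() error path and from close_ctree();
 * it also frees the two super block copies now that they are allocated
 * separately instead of being embedded in struct btrfs_fs_info.
 */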
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
-void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending,
- u64 *bytes_to_reserve);
-void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending);
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
+ /* reada.c */
+ struct reada_control {
+ struct btrfs_root *root; /* tree to prefetch */
+ struct btrfs_key key_start;
+ struct btrfs_key key_end; /* exclusive */
+ atomic_t elems;
+ struct kref refcnt;
+ wait_queue_head_t wait;
+ };
+ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+ struct btrfs_key *start, struct btrfs_key *end);
+ int btrfs_reada_wait(void *handle);
+ void btrfs_reada_detach(void *handle);
+ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+ u64 start, int err);
+
#endif
static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
int verify)
{
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
char *result = NULL;
unsigned long len;
unsigned long cur_len;
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
while (1) {
- ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+ ret = read_extent_buffer_pages(io_tree, eb, start,
+ WAIT_COMPLETE,
btree_get_extent, mirror_num);
if (!ret &&
!verify_parent_transid(io_tree, eb, parent_transid))
end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
end = eb->start + end - 1;
err:
+ if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+ clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+ btree_readahead_hook(root, eb, eb->start, ret);
+ }
+
free_extent_buffer(eb);
out:
return ret;
}
+ static int btree_io_failed_hook(struct bio *failed_bio,
+ struct page *page, u64 start, u64 end,
+ struct extent_state *state)
+ {
+ struct extent_io_tree *tree;
+ unsigned long len;
+ struct extent_buffer *eb;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (page->private == EXTENT_PAGE_PRIVATE)
+ goto out;
+ if (!page->private)
+ goto out;
+
+ len = page->private >> 2;
+ WARN_ON(len == 0);
+
+ eb = alloc_extent_buffer(tree, start, len, page);
+ if (eb == NULL)
+ goto out;
+
+ if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+ clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+ btree_readahead_hook(root, eb, eb->start, -EIO);
+ }
+
+ out:
+ return -EIO; /* we fixed nothing */
+ }
+
static void end_workqueue_bio(struct bio *bio, int err)
{
struct end_io_wq *end_io_wq = bio->bi_private;
if (!buf)
return 0;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
- buf, 0, 0, btree_get_extent, 0);
+ buf, 0, WAIT_NONE, btree_get_extent, 0);
free_extent_buffer(buf);
return ret;
}
+ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ int mirror_num, struct extent_buffer **eb)
+ {
+ struct extent_buffer *buf = NULL;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
+ int ret;
+
+ buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ if (!buf)
+ return 0;
+
+ set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
+
+ ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
+ btree_get_extent, mirror_num);
+ if (ret) {
+ free_extent_buffer(buf);
+ return ret;
+ }
+
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+ free_extent_buffer(buf);
+ return -EIO;
+ } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+ *eb = buf;
+ } else {
+ free_extent_buffer(buf);
+ }
+ return 0;
+ }
+
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+ root->commit_root = NULL;
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
free_extent_buffer(root->node);
+ root->node = NULL;
return -EIO;
}
root->commit_root = btrfs_root_node(root);
return 0;
}
+/*
+ * this will find the highest generation in the array of
+ * root backups. The index of the newest backup is returned,
+ * or -1 if we can't find anything.
+ *
+ * We check to make sure the array is valid by comparing the
+ * generation of the latest root in the array with the generation
+ * in the super block. If they don't match we pitch it.
+ */
+static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+{
+ u64 cur;
+ int newest_index = -1;
+ struct btrfs_root_backup *root_backup;
+ int i;
+
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+ root_backup = info->super_copy->super_roots + i;
+ cur = btrfs_backup_tree_root_gen(root_backup);
+ if (cur == newest_gen)
+ newest_index = i;
+ }
+
+ /* check to see if we actually wrapped around */
+ if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
+ root_backup = info->super_copy->super_roots;
+ cur = btrfs_backup_tree_root_gen(root_backup);
+ if (cur == newest_gen)
+ newest_index = 0;
+ }
+ return newest_index;
+}
+
+
+/*
+ * find the oldest backup so we know where to store new entries
+ * in the backup array. This will set the backup_root_index
+ * field in the fs_info struct
+ */
+static void find_oldest_super_backup(struct btrfs_fs_info *info,
+ u64 newest_gen)
+{
+ int newest_index = -1;
+
+ newest_index = find_newest_super_backup(info, newest_gen);
+ /* if there was garbage in there, just move along */
+ if (newest_index == -1) {
+ info->backup_root_index = 0;
+ } else {
+ info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
+ }
+}
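/*
 * Worked example of the ring arithmetic above: with BTRFS_NUM_BACKUP_ROOTS
 * of 4, a newest valid backup at index 2 means the next commit writes slot
 * (2 + 1) % 4 = 3; a newest backup at index 3 wraps, so the next write goes
 * to slot 0.  Garbage in the array simply restarts the ring at slot 0.
 */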
+
+/*
+ * copy all the root pointers into the super backup array.
+ * this will bump the backup pointer by one when it is
+ * done
+ */
+static void backup_super_roots(struct btrfs_fs_info *info)
+{
+ int next_backup;
+ struct btrfs_root_backup *root_backup;
+ int last_backup;
+
+ next_backup = info->backup_root_index;
+ last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
+ BTRFS_NUM_BACKUP_ROOTS;
+
+ /*
+ * just overwrite the last backup if we're at the same generation;
+ * this happens only at umount
+ */
+ root_backup = info->super_for_commit->super_roots + last_backup;
+ if (btrfs_backup_tree_root_gen(root_backup) ==
+ btrfs_header_generation(info->tree_root->node))
+ next_backup = last_backup;
+
+ root_backup = info->super_for_commit->super_roots + next_backup;
+
+ /*
+ * make sure all of our padding and empty slots get zero filled
+ * regardless of which ones we use today
+ */
+ memset(root_backup, 0, sizeof(*root_backup));
+
+ info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
+
+ btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
+ btrfs_set_backup_tree_root_gen(root_backup,
+ btrfs_header_generation(info->tree_root->node));
+
+ btrfs_set_backup_tree_root_level(root_backup,
+ btrfs_header_level(info->tree_root->node));
+
+ btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
+ btrfs_set_backup_chunk_root_gen(root_backup,
+ btrfs_header_generation(info->chunk_root->node));
+ btrfs_set_backup_chunk_root_level(root_backup,
+ btrfs_header_level(info->chunk_root->node));
+
+ btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
+ btrfs_set_backup_extent_root_gen(root_backup,
+ btrfs_header_generation(info->extent_root->node));
+ btrfs_set_backup_extent_root_level(root_backup,
+ btrfs_header_level(info->extent_root->node));
+
+ btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start);
+ btrfs_set_backup_fs_root_gen(root_backup,
+ btrfs_header_generation(info->fs_root->node));
+ btrfs_set_backup_fs_root_level(root_backup,
+ btrfs_header_level(info->fs_root->node));
+
+ btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
+ btrfs_set_backup_dev_root_gen(root_backup,
+ btrfs_header_generation(info->dev_root->node));
+ btrfs_set_backup_dev_root_level(root_backup,
+ btrfs_header_level(info->dev_root->node));
+
+ btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
+ btrfs_set_backup_csum_root_gen(root_backup,
+ btrfs_header_generation(info->csum_root->node));
+ btrfs_set_backup_csum_root_level(root_backup,
+ btrfs_header_level(info->csum_root->node));
+
+ btrfs_set_backup_total_bytes(root_backup,
+ btrfs_super_total_bytes(info->super_copy));
+ btrfs_set_backup_bytes_used(root_backup,
+ btrfs_super_bytes_used(info->super_copy));
+ btrfs_set_backup_num_devices(root_backup,
+ btrfs_super_num_devices(info->super_copy));
+
+ /*
+ * if we don't copy this out to the super_copy, it won't get remembered
+ * for the next commit
+ */
+ memcpy(&info->super_copy->super_roots,
+ &info->super_for_commit->super_roots,
+ sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
+}
+
+/*
+ * this copies info out of the root backup array and back into
+ * the in-memory super block. It is meant to help iterate through
+ * the array, so you send it the number of backups you've already
+ * tried and the last backup index you used.
+ *
+ * this returns -1 when it has tried all the backups
+ */
+static noinline int next_root_backup(struct btrfs_fs_info *info,
+ struct btrfs_super_block *super,
+ int *num_backups_tried, int *backup_index)
+{
+ struct btrfs_root_backup *root_backup;
+ int newest = *backup_index;
+
+ if (*num_backups_tried == 0) {
+ u64 gen = btrfs_super_generation(super);
+
+ newest = find_newest_super_backup(info, gen);
+ if (newest == -1)
+ return -1;
+
+ *backup_index = newest;
+ *num_backups_tried = 1;
+ } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
+ /* we've tried all the backups, all done */
+ return -1;
+ } else {
+ /* jump to the next oldest backup */
+ newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
+ BTRFS_NUM_BACKUP_ROOTS;
+ *backup_index = newest;
+ *num_backups_tried += 1;
+ }
+ root_backup = super->super_roots + newest;
+
+ btrfs_set_super_generation(super,
+ btrfs_backup_tree_root_gen(root_backup));
+ btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
+ btrfs_set_super_root_level(super,
+ btrfs_backup_tree_root_level(root_backup));
+ btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
+
+ /*
+ * fixme: the total bytes and num_devices need to match, or else we
+ * need an fsck
+ */
+ btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
+ btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
+ return 0;
+}
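/*
 * Worked example of the iteration above: with four backup slots and the
 * newest at index 2, successive calls return slot 2, then 1, then 0, then 3;
 * the fifth call returns -1 once *num_backups_tried reaches
 * BTRFS_NUM_BACKUP_ROOTS, which is when open_ctree() gives up below.
 */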
+
+/* helper to cleanup tree roots */
+static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+{
+ free_extent_buffer(info->tree_root->node);
+ free_extent_buffer(info->tree_root->commit_root);
+ free_extent_buffer(info->dev_root->node);
+ free_extent_buffer(info->dev_root->commit_root);
+ free_extent_buffer(info->extent_root->node);
+ free_extent_buffer(info->extent_root->commit_root);
+ free_extent_buffer(info->csum_root->node);
+ free_extent_buffer(info->csum_root->commit_root);
+
+ info->tree_root->node = NULL;
+ info->tree_root->commit_root = NULL;
+ info->dev_root->node = NULL;
+ info->dev_root->commit_root = NULL;
+ info->extent_root->node = NULL;
+ info->extent_root->commit_root = NULL;
+ info->csum_root->node = NULL;
+ info->csum_root->commit_root = NULL;
+
+ if (chunk_root) {
+ free_extent_buffer(info->chunk_root->node);
+ free_extent_buffer(info->chunk_root->commit_root);
+ info->chunk_root->node = NULL;
+ info->chunk_root->commit_root = NULL;
+ }
+}
+
+
struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options)
int ret;
int err = -EINVAL;
+ int num_backups_tried = 0;
+ int backup_index = 0;
struct btrfs_super_block *disk_super;
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
+ spin_lock_init(&fs_info->free_chunk_lock);
mutex_init(&fs_info->reloc_mutex);
init_completion(&fs_info->kobj_unregister);
btrfs_init_block_rsv(&fs_info->trans_block_rsv);
btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
btrfs_init_block_rsv(&fs_info->empty_block_rsv);
- INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
- mutex_init(&fs_info->durable_block_rsv_mutex);
+ btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->trans_no_join = 0;
+ fs_info->free_chunk_space = 0;
+ /* readahead state */
+ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+ spin_lock_init(&fs_info->reada_lock);
+
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
goto fail_alloc;
}
- memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
- memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
- sizeof(fs_info->super_for_commit));
+ memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
+ memcpy(fs_info->super_for_commit, fs_info->super_copy,
+ sizeof(*fs_info->super_for_commit));
brelse(bh);
- memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
+ memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
- disk_super = &fs_info->super_copy;
+ disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
goto fail_alloc;
btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
/*
+ * run through our array of backup supers and setup
+ * our ring pointer to the oldest one
+ */
+ generation = btrfs_super_generation(disk_super);
+ find_oldest_super_backup(fs_info, generation);
+
+ /*
* In the long term, we'll store the compression type in the super
* block, and it'll be used for per file compression control.
*/
btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
fs_info->thread_pool_size,
&fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->readahead_workers, "readahead",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
/*
* endios are largely parallel and should have a very
fs_info->endio_write_workers.idle_thresh = 2;
fs_info->endio_meta_write_workers.idle_thresh = 2;
+ fs_info->readahead_workers.idle_thresh = 2;
btrfs_start_workers(&fs_info->workers, 1);
btrfs_start_workers(&fs_info->generic_worker, 1);
btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
btrfs_start_workers(&fs_info->delayed_workers, 1);
btrfs_start_workers(&fs_info->caching_workers, 1);
+ btrfs_start_workers(&fs_info->readahead_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
sb->s_id);
- goto fail_chunk_root;
+ goto fail_tree_roots;
}
btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
chunk_root->commit_root = btrfs_root_node(chunk_root);
if (ret) {
printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
sb->s_id);
- goto fail_chunk_root;
+ goto fail_tree_roots;
}
btrfs_close_extra_devices(fs_devices);
+retry_root_backup:
blocksize = btrfs_level_size(tree_root,
btrfs_super_root_level(disk_super));
generation = btrfs_super_generation(disk_super);
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
blocksize, generation);
- if (!tree_root->node)
- goto fail_chunk_root;
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+ if (!tree_root->node ||
+ !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
sb->s_id);
- goto fail_tree_root;
+
+ goto recovery_tree_root;
}
+
btrfs_set_root_node(&tree_root->root_item, tree_root->node);
tree_root->commit_root = btrfs_root_node(tree_root);
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_EXTENT_TREE_OBJECTID, extent_root);
if (ret)
- goto fail_tree_root;
+ goto recovery_tree_root;
extent_root->track_dirty = 1;
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_DEV_TREE_OBJECTID, dev_root);
if (ret)
- goto fail_extent_root;
+ goto recovery_tree_root;
dev_root->track_dirty = 1;
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_CSUM_TREE_OBJECTID, csum_root);
if (ret)
- goto fail_dev_root;
+ goto recovery_tree_root;
csum_root->track_dirty = 1;
fail_block_groups:
btrfs_free_block_groups(fs_info);
- free_extent_buffer(csum_root->node);
- free_extent_buffer(csum_root->commit_root);
-fail_dev_root:
- free_extent_buffer(dev_root->node);
- free_extent_buffer(dev_root->commit_root);
-fail_extent_root:
- free_extent_buffer(extent_root->node);
- free_extent_buffer(extent_root->commit_root);
-fail_tree_root:
- free_extent_buffer(tree_root->node);
- free_extent_buffer(tree_root->commit_root);
-fail_chunk_root:
- free_extent_buffer(chunk_root->node);
- free_extent_buffer(chunk_root->commit_root);
+
+fail_tree_roots:
+ free_root_pointers(fs_info, 1);
+
fail_sb_buffer:
btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_stop_workers(&fs_info->caching_workers);
fail_alloc:
- kfree(fs_info->delayed_root);
fail_iput:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
- kfree(extent_root);
- kfree(tree_root);
- kfree(fs_info);
- kfree(chunk_root);
- kfree(dev_root);
- kfree(csum_root);
+ free_fs_info(fs_info);
return ERR_PTR(err);
+
+recovery_tree_root:
+
+ if (!btrfs_test_opt(tree_root, RECOVERY))
+ goto fail_tree_roots;
+
+ free_root_pointers(fs_info, 0);
+
+ /* don't use the log in recovery mode, it won't be valid */
+ btrfs_set_super_log_root(disk_super, 0);
+
+ /* we can't trust the free space cache either */
+ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+
+ ret = next_root_backup(fs_info, fs_info->super_copy,
+ &num_backups_tried, &backup_index);
+ if (ret == -1)
+ goto fail_block_groups;
+ goto retry_root_backup;
}
static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
int total_errors = 0;
u64 flags;
- max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+ max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
do_barriers = !btrfs_test_opt(root, NOBARRIER);
+ backup_super_roots(root->fs_info);
- sb = &root->fs_info->super_for_commit;
+ sb = root->fs_info->super_for_commit;
dev_item = &sb->dev_item;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
/* clear out the rbtree of defraggable inodes */
btrfs_run_defrag_inodes(root->fs_info);
- btrfs_put_block_group_cache(fs_info);
-
/*
* Here come 2 situations when btrfs is broken to flip readonly:
*
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
+ btrfs_put_block_group_cache(fs_info);
+
kthread_stop(root->fs_info->transaction_kthread);
kthread_stop(root->fs_info->cleaner_kthread);
del_fs_roots(fs_info);
iput(fs_info->btree_inode);
- kfree(fs_info->delayed_root);
btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->submit_workers);
btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_stop_workers(&fs_info->caching_workers);
+ btrfs_stop_workers(&fs_info->readahead_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
- kfree(fs_info->extent_root);
- kfree(fs_info->tree_root);
- kfree(fs_info->chunk_root);
- kfree(fs_info->dev_root);
- kfree(fs_info->csum_root);
- kfree(fs_info);
+ free_fs_info(fs_info);
return 0;
}
return ret;
}
-int btree_lock_page_hook(struct page *page)
+static int btree_lock_page_hook(struct page *page, void *data,
+ void (*flush_fn)(void *))
{
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
if (!eb)
goto out;
- btrfs_tree_lock(eb);
+ if (!btrfs_try_tree_write_lock(eb)) {
+ flush_fn(data);
+ btrfs_tree_lock(eb);
+ }
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
out:
- lock_page(page);
+ if (!trylock_page(page)) {
+ flush_fn(data);
+ lock_page(page);
+ }
return 0;
}
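/*
 * The try-lock/flush/lock shape above exists because the write_cache_pages
 * path may already have bios queued for pages it has locked; flushing them
 * via flush_fn(data) before blocking on the tree lock (or the page lock in
 * the later hunk) keeps a lock holder that is waiting on that queued IO from
 * deadlocking against us.  This reading is inferred from the code rather
 * than stated in the patch.
 */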
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
+ .readpage_io_failed_hook = btree_io_failed_hook,
.submit_bio_hook = btree_submit_bio_hook,
/* note we're sharing with inode.c for the merge bio hook */
.merge_bio_hook = btrfs_merge_bio_hook,
u32 blocksize, u64 parent_transid);
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
u64 parent_transid);
+ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
int clean_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btree_lock_page_hook(struct page *page);
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
goto again;
}
+/**
+ * convert_extent_bit - convert all bits in a given range from one bit to another
+ * @tree: the io tree to search
+ * @start: the start offset in bytes
+ * @end: the end offset in bytes (inclusive)
+ * @bits: the bits to set in this range
+ * @clear_bits: the bits to clear in this range
+ * @mask: the allocation mask
+ *
+ * This will go through and set bits for the given range. If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits. This is only meant to be used by things that are mergeable, ie
+ * converting from say DELALLOC to DIRTY. This is not meant to be used with
+ * boundary bits like LOCK.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int clear_bits, gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node *node;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+
+again:
+ if (!prealloc && (mask & __GFP_WAIT)) {
+ prealloc = alloc_extent_state(mask);
+ if (!prealloc)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = insert_state(tree, prealloc, start, end, &bits);
+ prealloc = NULL;
+ BUG_ON(err == -EEXIST);
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ struct rb_node *next_node;
+
+ set_state_bits(tree, state, &bits);
+ clear_state_bit(tree, state, &clear_bits, 0);
+
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+
+ start = last_end + 1;
+ next_node = rb_next(&state->rb_node);
+ if (next_node && start < end && prealloc && !need_resched()) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on
+ * second half.
+ *
+ * If the extent we found extends past our
+ * range, we just split and search again. It'll get split
+ * again the next time though.
+ *
+ * If the extent we found is inside our range, we set the
+ * desired bit on it.
+ */
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, start);
+ BUG_ON(err == -EEXIST);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, &bits);
+ clear_state_bit(tree, state, &clear_bits, 0);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and
+ * ignore the extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Avoid freeing 'prealloc' if it can be merged with
+ * the later extent.
+ */
+ err = insert_state(tree, prealloc, start, this_end,
+ &bits);
+ BUG_ON(err == -EEXIST);
+ if (err) {
+ free_extent_state(prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and set the bit
+ * on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = split_state(tree, state, prealloc, end + 1);
+ BUG_ON(err == -EEXIST);
+
+ set_state_bits(tree, prealloc, &bits);
+ clear_state_bit(tree, prealloc, &clear_bits, 0);
+
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+ goto search_again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (mask & __GFP_WAIT)
+ cond_resched();
+ goto again;
+}
+
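/*
 * A hypothetical call sketch for the helper above (not a hunk from this
 * patch): moving a range from EXTENT_DIRTY to the new EXTENT_NEED_WAIT bit
 * in a single pass, instead of a separate clear_extent_bit() plus
 * set_extent_bit():
 *
 *	ret = convert_extent_bit(tree, start, end, EXTENT_NEED_WAIT,
 *				 EXTENT_DIRTY, GFP_NOFS);
 */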
/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
struct extent_state **cached_state, gfp_t mask)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+ EXTENT_DELALLOC | EXTENT_UPTODATE,
0, NULL, cached_state, mask);
}
if (!uptodate && tree->ops &&
tree->ops->readpage_io_failed_hook) {
ret = tree->ops->readpage_io_failed_hook(bio, page,
- start, end, NULL);
+ start, end, state);
if (ret == 0) {
uptodate =
test_bit(BIO_UPTODATE, &bio->bi_flags);
int compressed;
int write_flags;
unsigned long nr_written = 0;
+ bool fill_delalloc = true;
if (wbc->sync_mode == WB_SYNC_ALL)
write_flags = WRITE_SYNC;
trace___extent_writepage(page, inode, wbc);
WARN_ON(!PageLocked(page));
+
+ ClearPageError(page);
+
pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
set_page_extent_mapped(page);
+ if (!tree->ops || !tree->ops->fill_delalloc)
+ fill_delalloc = false;
+
delalloc_start = start;
delalloc_end = 0;
page_started = 0;
- if (!epd->extent_locked) {
+ if (!epd->extent_locked && fill_delalloc) {
u64 delalloc_to_write = 0;
/*
* make sure the wbc mapping index is at least updated
* swizzled back from swapper_space to tmpfs file
* mapping
*/
- if (tree->ops && tree->ops->write_cache_pages_lock_hook)
- tree->ops->write_cache_pages_lock_hook(page);
- else
- lock_page(page);
+ if (tree->ops &&
+ tree->ops->write_cache_pages_lock_hook) {
+ tree->ops->write_cache_pages_lock_hook(page,
+ data, flush_fn);
+ } else {
+ if (!trylock_page(page)) {
+ flush_fn(data);
+ lock_page(page);
+ }
+ }
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
PAGECACHE_TAG_DIRTY);
}
spin_unlock_irq(&page->mapping->tree_lock);
+ ClearPageError(page);
unlock_page(page);
}
return 0;
}
int read_extent_buffer_pages(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- u64 start, int wait,
+ struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num)
{
unsigned long i;
num_pages = num_extent_pages(eb->start, eb->len);
for (i = start_i; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
- if (!wait) {
+ if (wait == WAIT_NONE) {
if (!trylock_page(page))
goto unlock_exit;
} else {
if (bio)
submit_one_bio(READ, bio, mirror_num, bio_flags);
- if (ret || !wait)
+ if (ret || wait != WAIT_COMPLETE)
return ret;
for (i = start_i; i < num_pages; i++) {
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_DO_ACCOUNTING (1 << 11)
#define EXTENT_FIRST_DELALLOC (1 << 12)
+#define EXTENT_NEED_WAIT (1 << 13)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
#define EXTENT_BUFFER_BLOCKING 1
#define EXTENT_BUFFER_DIRTY 2
#define EXTENT_BUFFER_CORRUPT 3
+ #define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
/* these are flags for extent_clear_unlock_delalloc */
#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
struct extent_state *other);
void (*split_extent_hook)(struct inode *inode,
struct extent_state *orig, u64 split);
- int (*write_cache_pages_lock_hook)(struct page *page);
+ int (*write_cache_pages_lock_hook)(struct page *page, void *data,
+ void (*flush_fn)(void *));
};
struct extent_io_tree {
gfp_t mask);
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int clear_bits, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len);
void free_extent_buffer(struct extent_buffer *eb);
+ #define WAIT_NONE 0
+ #define WAIT_COMPLETE 1
+ #define WAIT_PAGE_LOCK 2
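/*
 * How the three modes map onto the callers touched by this patch:
 * WAIT_COMPLETE replaces the old "wait == 1" callers that block until the
 * read finishes, WAIT_NONE replaces "wait == 0" fire-and-forget readahead,
 * and WAIT_PAGE_LOCK (used by reada_tree_block_flagged()) blocks only while
 * locking the pages and submitting the bios, not for IO completion.
 */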
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num);
* any can be found.
*
* Future enhancements:
- * - To enhance the performance, better read-ahead strategies for the
- * extent-tree can be employed.
* - In case an unrepairable extent is encountered, track which files are
* affected and report them
* - In case of a read error on files with nodatasum, map the file and read
* the extent to trigger a writeback of the good copy
* - track and record media errors, throw out bad devices
* - add a mode to also read unallocated space
- * - make the prefetch cancellable
*/
struct scrub_bio;
sdev->curr = -1;
atomic_set(&sdev->in_flight, 0);
atomic_set(&sdev->cancel_req, 0);
- sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+ sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
INIT_LIST_HEAD(&sdev->csum_list);
spin_lock_init(&sdev->list_lock);
int slot;
int i;
u64 nstripes;
- int start_stripe;
struct extent_buffer *l;
struct btrfs_key key;
u64 physical;
u64 logical;
u64 generation;
u64 mirror_num;
+ struct reada_control *reada1;
+ struct reada_control *reada2;
+ struct btrfs_key key_start;
+ struct btrfs_key key_end;
u64 increment = map->stripe_len;
u64 offset;
if (!path)
return -ENOMEM;
- path->reada = 2;
path->search_commit_root = 1;
path->skip_locking = 1;
/*
- * find all extents for each stripe and just read them to get
- * them into the page cache
- * FIXME: we can do better. build a more intelligent prefetching
+ * trigger the readahead for extent tree csum tree and wait for
+ * completion. During readahead, the scrub is officially paused
+ * to not hold off transaction commits
*/
logical = base + offset;
- physical = map->stripes[num].physical;
- ret = 0;
- for (i = 0; i < nstripes; ++i) {
- key.objectid = logical;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = (u64)0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out_noplug;
-
- /*
- * we might miss half an extent here, but that doesn't matter,
- * as it's only the prefetch
- */
- while (1) {
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out_noplug;
-
- break;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
-
- if (key.objectid >= logical + map->stripe_len)
- break;
-
- path->slots[0]++;
- }
- btrfs_release_path(path);
- logical += increment;
- physical += map->stripe_len;
- cond_resched();
+ wait_event(sdev->list_wait,
+ atomic_read(&sdev->in_flight) == 0);
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ /* FIXME it might be better to start readahead at commit root */
+ key_start.objectid = logical;
+ key_start.type = BTRFS_EXTENT_ITEM_KEY;
+ key_start.offset = (u64)0;
+ key_end.objectid = base + offset + nstripes * increment;
+ key_end.type = BTRFS_EXTENT_ITEM_KEY;
+ key_end.offset = (u64)0;
+ reada1 = btrfs_reada_add(root, &key_start, &key_end);
+
+ key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key_start.type = BTRFS_EXTENT_CSUM_KEY;
+ key_start.offset = logical;
+ key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key_end.type = BTRFS_EXTENT_CSUM_KEY;
+ key_end.offset = base + offset + nstripes * increment;
+ reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+
+ if (!IS_ERR(reada1))
+ btrfs_reada_wait(reada1);
+ if (!IS_ERR(reada2))
+ btrfs_reada_wait(reada2);
+
+ mutex_lock(&fs_info->scrub_lock);
+ while (atomic_read(&fs_info->scrub_pause_req)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrub_pause_req) == 0);
+ mutex_lock(&fs_info->scrub_lock);
}
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ wake_up(&fs_info->scrub_pause_wait);
/*
* collect all data csums for the stripe to avoid seeking during
* the scrub. This might currently (crc32) end up to be about 1MB
*/
- start_stripe = 0;
blk_start_plug(&plug);
- again:
- logical = base + offset + start_stripe * increment;
- for (i = start_stripe; i < nstripes; ++i) {
- ret = btrfs_lookup_csums_range(csum_root, logical,
- logical + map->stripe_len - 1,
- &sdev->csum_list, 1);
- if (ret)
- goto out;
- logical += increment;
- cond_resched();
- }
/*
* now find all extents for each stripe and scrub them
*/
- logical = base + offset + start_stripe * increment;
- physical = map->stripes[num].physical + start_stripe * map->stripe_len;
+ logical = base + offset;
+ physical = map->stripes[num].physical;
ret = 0;
- for (i = start_stripe; i < nstripes; ++i) {
+ for (i = 0; i < nstripes; ++i) {
/*
* canceled?
*/
atomic_dec(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
wake_up(&fs_info->scrub_pause_wait);
- scrub_free_csums(sdev);
- start_stripe = i;
- goto again;
}
+ ret = btrfs_lookup_csums_range(csum_root, logical,
+ logical + map->stripe_len - 1,
+ &sdev->csum_list, 1);
+ if (ret)
+ goto out;
+
key.objectid = logical;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = (u64)0;
out:
blk_finish_plug(&plug);
- out_noplug:
btrfs_free_path(path);
return ret < 0 ? ret : 0;
}
}
INIT_LIST_HEAD(&device->dev_alloc_list);
+ /* init readahead state */
+ spin_lock_init(&device->reada_lock);
+ device->reada_curr_zone = NULL;
+ atomic_set(&device->reada_in_flight, 0);
+ device->reada_next = 0;
+ INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+
mutex_lock(&fs_devices->device_list_mutex);
list_add_rcu(&device->dev_list, &fs_devices->devices);
mutex_unlock(&fs_devices->device_list_mutex);
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
- if (!bh) {
- ret = -EINVAL;
+ if (!bh)
goto error_close;
- }
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
continue;
}
if (fs_devices->open_devices == 0) {
- ret = -EIO;
+ ret = -EINVAL;
goto out;
}
fs_devices->seeding = seeding;
}
BUG_ON(ret);
- if (device->bytes_used > 0)
- device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
+ if (device->bytes_used > 0) {
+ u64 len = btrfs_dev_extent_length(leaf, extent);
+ device->bytes_used -= len;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += len;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
ret = btrfs_del_item(trans, root, path);
out:
if (ret)
goto error_undo;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space -= device->total_bytes -
+ device->bytes_used;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
device->in_fs_metadata = 0;
btrfs_scrub_cancel_dev(root, device);
call_rcu(&device->rcu, free_device);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
- num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
- btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
+ num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
+ btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
if (cur_devices->open_devices == 0) {
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
struct btrfs_fs_devices *seed_devices;
- struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct btrfs_super_block *disk_super = root->fs_info->super_copy;
struct btrfs_device *device;
u64 super_flags;
root->fs_info->fs_devices->num_can_discard++;
root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += device->total_bytes;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
root->fs_info->fs_devices->rotating = 1;
- total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
- btrfs_set_super_total_bytes(&root->fs_info->super_copy,
+ total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+ btrfs_set_super_total_bytes(root->fs_info->super_copy,
total_bytes + device->total_bytes);
- total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
- btrfs_set_super_num_devices(&root->fs_info->super_copy,
+ total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
+ btrfs_set_super_num_devices(root->fs_info->super_copy,
total_bytes + 1);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
struct btrfs_device *device, u64 new_size)
{
struct btrfs_super_block *super_copy =
- &device->dev_root->fs_info->super_copy;
+ device->dev_root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 diff = new_size - device->total_bytes;
static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
chunk_offset)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
u8 *ptr;
bool retried = false;
struct extent_buffer *l;
struct btrfs_key key;
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 old_size = device->total_bytes;
u64 diff = device->total_bytes - new_size;
lock_chunks(root);
device->total_bytes = new_size;
- if (device->writeable)
+ if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space -= diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
unlock_chunks(root);
again:
device->total_bytes = old_size;
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
unlock_chunks(root);
goto done;
}
struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key disk_key;
u32 array_size;
u8 *ptr;
index++;
}
+ spin_lock(&extent_root->fs_info->free_chunk_lock);
+ extent_root->fs_info->free_chunk_space -= (stripe_size *
+ map->num_stripes);
+ spin_unlock(&extent_root->fs_info->free_chunk_lock);
+
index = 0;
stripe = &chunk->stripe;
while (index < map->num_stripes) {
fill_device_from_item(leaf, dev_item, device);
device->dev_root = root->fs_info->dev_root;
device->in_fs_metadata = 1;
- if (device->writeable)
+ if (device->writeable) {
device->fs_devices->total_rw_bytes += device->total_bytes;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += device->total_bytes -
+ device->bytes_used;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
ret = 0;
return ret;
}
int btrfs_read_sys_array(struct btrfs_root *root)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;