Btrfs: Online btree defragmentation fixes
author Chris Mason <chris.mason@oracle.com>
Wed, 25 Jun 2008 20:01:31 +0000 (16:01 -0400)
committer Chris Mason <chris.mason@oracle.com>
Thu, 25 Sep 2008 15:04:04 +0000 (11:04 -0400)
The btree defragger wasn't making forward progress because the new key wasn't
being saved by the btrfs_search_forward function.

This also disables the automatic btree defrag because it wasn't scaling well
to huge filesystems.  The auto-defrag needs to be done differently.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-tree.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-defrag.c

index 7f4cc2b..0cb80f3 100644 (file)
@@ -63,10 +63,9 @@ void btrfs_free_path(struct btrfs_path *p)
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
        int i;
-       int keep = p->keep_locks;
-       int skip = p->skip_locking;
 
        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+               p->slots[i] = 0;
                if (!p->nodes[i])
                        continue;
                if (p->locks[i]) {
@@ -74,10 +73,8 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
                        p->locks[i] = 0;
                }
                free_extent_buffer(p->nodes[i]);
+               p->nodes[i] = NULL;
        }
-       memset(p, 0, sizeof(*p));
-       p->keep_locks = keep;
-       p->skip_locking = skip;
 }
 
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
@@ -463,8 +460,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                search_start = cur->start;
                last_block = cur->start;
                *last_ret = search_start;
-               if (parent_level == 1)
-                       btrfs_clear_buffer_defrag(cur);
                btrfs_tree_unlock(cur);
                free_extent_buffer(cur);
        }
@@ -2969,8 +2964,138 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
        return 1;
 }
 
+/*
+ * A helper function to walk down the tree starting at min_key, and looking
+ * for nodes or leaves that are either in cache or have a minimum
+ * transaction id.  This is used by the btree defrag code, but could
+ * also be used to search for blocks that have changed since a given
+ * transaction id.
+ *
+ * This does not cow, but it does stuff the starting key it finds back
+ * into min_key, so you can call btrfs_search_slot with cow=1 on the
+ * key and get a writable path.
+ *
+ * This does lock as it descends, and path->keep_locks should be set
+ * to 1 by the caller.
+ *
+ * This honors path->lowest_level to prevent descent past a given level
+ * of the tree.
+ *
+ * returns zero if something useful was found, < 0 on error and 1 if there
+ * was nothing in the tree that matched the search criteria.
+ */
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+                        struct btrfs_path *path, int cache_only,
+                        u64 min_trans)
+{
+       struct extent_buffer *cur;
+       struct btrfs_key found_key;
+       int slot;
+       u32 nritems;
+       int level;
+       int ret = 1;
+
+again:
+       cur = btrfs_lock_root_node(root);
+       level = btrfs_header_level(cur);
+       path->nodes[level] = cur;
+       path->locks[level] = 1;
+
+       if (btrfs_header_generation(cur) < min_trans) {
+               ret = 1;
+               goto out;
+       }
+       while(1) {
+               nritems = btrfs_header_nritems(cur);
+               level = btrfs_header_level(cur);
+               bin_search(cur, min_key, level, &slot);
+
+               /* at level = 0, we're done, setup the path and exit */
+               if (level == 0) {
+                       ret = 0;
+                       path->slots[level] = slot;
+                       btrfs_item_key_to_cpu(cur, &found_key, slot);
+                       goto out;
+               }
+               /*
+                * check this node pointer against the cache_only and
+                * min_trans parameters.  If it isn't in cache or is too
+                * old, skip to the next one.
+                */
+               while(slot < nritems) {
+                       u64 blockptr;
+                       u64 gen;
+                       struct extent_buffer *tmp;
+                       blockptr = btrfs_node_blockptr(cur, slot);
+                       gen = btrfs_node_ptr_generation(cur, slot);
+                       if (gen < min_trans) {
+                               slot++;
+                               continue;
+                       }
+                       if (!cache_only)
+                               break;
+
+                       tmp = btrfs_find_tree_block(root, blockptr,
+                                           btrfs_level_size(root, level - 1));
+
+                       if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+                               free_extent_buffer(tmp);
+                               break;
+                       }
+                       if (tmp)
+                               free_extent_buffer(tmp);
+                       slot++;
+               }
+               /*
+                * we didn't find a candidate key in this node, walk forward
+                * and find another one
+                */
+               if (slot >= nritems) {
+                       ret = btrfs_find_next_key(root, path, min_key, level,
+                                                 cache_only, min_trans);
+                       if (ret == 0) {
+                               btrfs_release_path(root, path);
+                               goto again;
+                       } else {
+                               goto out;
+                       }
+               }
+               /* save our key for returning back */
+               btrfs_node_key_to_cpu(cur, &found_key, slot);
+               path->slots[level] = slot;
+               if (level == path->lowest_level) {
+                       ret = 0;
+                       unlock_up(path, level, 1);
+                       goto out;
+               }
+               cur = read_node_slot(root, cur, slot);
+
+               btrfs_tree_lock(cur);
+               path->locks[level - 1] = 1;
+               path->nodes[level - 1] = cur;
+               unlock_up(path, level, 1);
+       }
+out:
+       if (ret == 0)
+               memcpy(min_key, &found_key, sizeof(found_key));
+       return ret;
+}
+
+/*
+ * this is similar to btrfs_next_leaf, but does not try to preserve
+ * and fixup the path.  It looks for and returns the next key in the
+ * tree based on the current path and the cache_only and min_trans
+ * parameters.
+ *
+ * 0 is returned if another key is found, < 0 if there are any errors
+ * and 1 is returned if there are no higher keys in the tree
+ *
+ * path->keep_locks should be set to 1 on the search made before
+ * calling this function.
+ */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-                       struct btrfs_key *key, int lowest_level)
+                       struct btrfs_key *key, int lowest_level,
+                       int cache_only, u64 min_trans)
 {
        int level = lowest_level;
        int slot;
@@ -2982,6 +3107,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 
                slot = path->slots[level] + 1;
                c = path->nodes[level];
+next:
                if (slot >= btrfs_header_nritems(c)) {
                        level++;
                        if (level == BTRFS_MAX_LEVEL) {
@@ -2991,8 +3117,28 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
                }
                if (level == 0)
                        btrfs_item_key_to_cpu(c, key, slot);
-               else
+               else {
+                       u64 blockptr = btrfs_node_blockptr(c, slot);
+                       u64 gen = btrfs_node_ptr_generation(c, slot);
+
+                       if (cache_only) {
+                               struct extent_buffer *cur;
+                               cur = btrfs_find_tree_block(root, blockptr,
+                                           btrfs_level_size(root, level - 1));
+                               if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
+                                       slot++;
+                                       if (cur)
+                                               free_extent_buffer(cur);
+                                       goto next;
+                               }
+                               free_extent_buffer(cur);
+                       }
+                       if (gen < min_trans) {
+                               slot++;
+                               goto next;
+                       }
                        btrfs_node_key_to_cpu(c, key, slot);
+               }
                return 0;
        }
        return 1;
@@ -3095,6 +3241,12 @@ done:
        return 0;
 }
 
+/*
+ * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
+ * searching until it gets past min_objectid or finds an item of 'type'
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
 int btrfs_previous_item(struct btrfs_root *root,
                        struct btrfs_path *path, u64 min_objectid,
                        int type)
index ca8e6f1..a287964 100644 (file)
@@ -609,6 +609,7 @@ struct btrfs_root {
        u64 last_inode_alloc;
        int ref_cows;
        int track_dirty;
+       u64 defrag_trans_start;
        struct btrfs_key defrag_progress;
        struct btrfs_key defrag_max;
        int defrag_running;
@@ -1412,7 +1413,11 @@ int btrfs_previous_item(struct btrfs_root *root,
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-                       struct btrfs_key *key, int lowest_level);
+                       struct btrfs_key *key, int lowest_level,
+                       int cache_only, u64 min_trans);
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+                        struct btrfs_path *path, int cache_only,
+                        u64 min_trans);
 
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, struct extent_buffer *buf,
index 31ca9f8..4cdc0b6 100644 (file)
@@ -295,7 +295,6 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
        ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
                                             btrfs_header_generation(eb));
        BUG_ON(ret);
-       btrfs_clear_buffer_defrag(eb);
        found_start = btrfs_header_bytenr(eb);
        if (found_start != start) {
                printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
@@ -355,7 +354,6 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
        }
        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
 
-       btrfs_clear_buffer_defrag(eb);
        found_start = btrfs_header_bytenr(eb);
        if (found_start != start) {
                ret = -EIO;
@@ -736,6 +734,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        memset(&root->root_item, 0, sizeof(root->root_item));
        memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
        memset(&root->root_kobj, 0, sizeof(root->root_kobj));
+       root->defrag_trans_start = fs_info->generation;
        init_completion(&root->kobj_unregister);
        root->defrag_running = 0;
        root->defrag_level = 0;
@@ -1168,7 +1167,6 @@ static int transaction_kthread(void *arg)
                        goto sleep;
                }
                mutex_unlock(&root->fs_info->trans_mutex);
-               btrfs_defrag_dirty_roots(root->fs_info);
                trans = btrfs_start_transaction(root, 1);
                ret = btrfs_commit_transaction(trans, root);
 sleep:
@@ -1434,12 +1432,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                                                   tree_root,
                                                   "btrfs-transaction");
        if (!fs_info->transaction_kthread)
-               goto fail_trans_kthread;
+               goto fail_cleaner;
 
 
        return tree_root;
 
-fail_trans_kthread:
+fail_cleaner:
        kthread_stop(fs_info->cleaner_kthread);
 fail_extent_root:
        free_extent_buffer(extent_root->node);
@@ -1662,7 +1660,6 @@ int close_ctree(struct btrfs_root *root)
        kthread_stop(root->fs_info->transaction_kthread);
        kthread_stop(root->fs_info->cleaner_kthread);
 
-       btrfs_defrag_dirty_roots(root->fs_info);
        btrfs_clean_old_snapshots(root);
        trans = btrfs_start_transaction(root, 1);
        ret = btrfs_commit_transaction(trans, root);
@@ -1794,58 +1791,6 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
        return;
 }
 
-void btrfs_set_buffer_defrag(struct extent_buffer *buf)
-{
-       struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-       struct inode *btree_inode = root->fs_info->btree_inode;
-       set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start,
-                       buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS);
-}
-
-void btrfs_set_buffer_defrag_done(struct extent_buffer *buf)
-{
-       struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-       struct inode *btree_inode = root->fs_info->btree_inode;
-       set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start,
-                       buf->start + buf->len - 1, EXTENT_DEFRAG_DONE,
-                       GFP_NOFS);
-}
-
-int btrfs_buffer_defrag(struct extent_buffer *buf)
-{
-       struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-       struct inode *btree_inode = root->fs_info->btree_inode;
-       return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
-                    buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, 0);
-}
-
-int btrfs_buffer_defrag_done(struct extent_buffer *buf)
-{
-       struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-       struct inode *btree_inode = root->fs_info->btree_inode;
-       return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
-                    buf->start, buf->start + buf->len - 1,
-                    EXTENT_DEFRAG_DONE, 0);
-}
-
-int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf)
-{
-       struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-       struct inode *btree_inode = root->fs_info->btree_inode;
-       return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
-                    buf->start, buf->start + buf->len - 1,
-                    EXTENT_DEFRAG_DONE, GFP_NOFS);
-}
-
-int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
-{
-       struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-       struct inode *btree_inode = root->fs_info->btree_inode;
-       return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
-                    buf->start, buf->start + buf->len - 1,
-                    EXTENT_DEFRAG, GFP_NOFS);
-}
-
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 {
        struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
index deff6b4..353c3c5 100644 (file)
@@ -61,12 +61,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int wait_on_tree_block_writeback(struct btrfs_root *root,
                                 struct extent_buffer *buf);
-void btrfs_set_buffer_defrag(struct extent_buffer *buf);
-void btrfs_set_buffer_defrag_done(struct extent_buffer *buf);
-int btrfs_buffer_defrag(struct extent_buffer *buf);
-int btrfs_buffer_defrag_done(struct extent_buffer *buf);
-int btrfs_clear_buffer_defrag(struct extent_buffer *buf);
-int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf);
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
index dc3c03c..5e0857f 100644 (file)
@@ -2095,8 +2095,6 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 
        set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
                         buf->start + buf->len - 1, GFP_NOFS);
-       if (!btrfs_test_opt(root, SSD))
-               btrfs_set_buffer_defrag(buf);
        trans->blocks_used++;
        return buf;
 }
index 726d687..5e28cf5 100644 (file)
@@ -365,7 +365,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
                return 0;
        }
        btrfs_clean_old_snapshots(root);
-       btrfs_defrag_dirty_roots(root->fs_info);
        trans = btrfs_start_transaction(root, 1);
        ret = btrfs_commit_transaction(trans, root);
        sb->s_dirt = 0;
index 8e909cb..98f422d 100644 (file)
@@ -30,7 +30,6 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 
 #define BTRFS_ROOT_TRANS_TAG 0
-#define BTRFS_ROOT_DEFRAG_TAG 1
 
 static noinline void put_transaction(struct btrfs_transaction *transaction)
 {
@@ -92,9 +91,6 @@ static noinline int record_root_in_trans(struct btrfs_root *root)
                        radix_tree_tag_set(&root->fs_info->fs_roots_radix,
                                   (unsigned long)root->root_key.objectid,
                                   BTRFS_ROOT_TRANS_TAG);
-                       radix_tree_tag_set(&root->fs_info->fs_roots_radix,
-                                  (unsigned long)root->root_key.objectid,
-                                  BTRFS_ROOT_DEFRAG_TAG);
                        root->commit_root = btrfs_root_node(root);
                } else {
                        WARN_ON(1);
@@ -403,44 +399,15 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
                cond_resched();
 
                trans = btrfs_start_transaction(root, 1);
-               if (ret != -EAGAIN)
+               if (root->fs_info->closing || ret != -EAGAIN)
                        break;
        }
        root->defrag_running = 0;
        smp_mb();
-       radix_tree_tag_clear(&info->fs_roots_radix,
-                    (unsigned long)root->root_key.objectid,
-                    BTRFS_ROOT_DEFRAG_TAG);
        btrfs_end_transaction(trans, root);
        return 0;
 }
 
-int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
-{
-       struct btrfs_root *gang[1];
-       struct btrfs_root *root;
-       int i;
-       int ret;
-       int err = 0;
-       u64 last = 0;
-
-       while(1) {
-               ret = radix_tree_gang_lookup_tag(&info->fs_roots_radix,
-                                                (void **)gang, last,
-                                                ARRAY_SIZE(gang),
-                                                BTRFS_ROOT_DEFRAG_TAG);
-               if (ret == 0)
-                       break;
-               for (i = 0; i < ret; i++) {
-                       root = gang[i];
-                       last = root->root_key.objectid + 1;
-                       btrfs_defrag_root(root, 1);
-               }
-       }
-       btrfs_defrag_root(info->extent_root, 1);
-       return err;
-}
-
 static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
                                     struct list_head *list)
 {
index e1e5a06..9ccd5a5 100644 (file)
@@ -84,7 +84,6 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 
 int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest,
                        struct list_head *dead_list);
-int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info);
 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
index b17693f..cc2650b 100644 (file)
@@ -32,10 +32,13 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
        int wret;
        int level;
        int orig_level;
-       int i;
        int is_extent = 0;
        int next_key_ret = 0;
        u64 last_ret = 0;
+       u64 min_trans = 0;
+
+       if (cache_only)
+               goto out;
 
        if (root->fs_info->extent_root == root) {
                /*
@@ -43,10 +46,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
                 * we can't defrag the extent root without deadlock
                 */
                goto out;
-#if 0
-               mutex_lock(&root->fs_info->alloc_mutex);
-               is_extent = 1;
-#endif
        }
 
        if (root->ref_cows == 0 && !is_extent)
@@ -84,6 +83,17 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 
        path->lowest_level = 1;
        path->keep_locks = 1;
+       if (cache_only)
+               min_trans = root->defrag_trans_start;
+
+       ret = btrfs_search_forward(root, &key, path, cache_only, min_trans);
+       if (ret < 0)
+               goto out;
+       if (ret > 0) {
+               ret = 0;
+               goto out;
+       }
+       btrfs_release_path(root, path);
        wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 
        if (wret < 0) {
@@ -95,7 +105,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
                goto out;
        }
        path->slots[1] = btrfs_header_nritems(path->nodes[1]);
-       next_key_ret = btrfs_find_next_key(root, path, &key, 1);
+       next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
+                                          min_trans);
        ret = btrfs_realloc_node(trans, root,
                                 path->nodes[1], 0,
                                 cache_only, &last_ret,
@@ -106,19 +117,9 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
                ret = -EAGAIN;
        }
 
-       for (i = 1; i < BTRFS_MAX_LEVEL; i++) {
-               if (path->locks[i]) {
-                       btrfs_tree_unlock(path->nodes[i]);
-                       path->locks[i] = 0;
-               }
-               if (path->nodes[i]) {
-                       free_extent_buffer(path->nodes[i]);
-                       path->nodes[i] = NULL;
-               }
-       }
+       btrfs_release_path(root, path);
        if (is_extent)
                btrfs_extent_post_op(trans, root);
-
 out:
        if (is_extent)
                mutex_unlock(&root->fs_info->alloc_mutex);
@@ -138,6 +139,7 @@ done:
        if (ret != -EAGAIN) {
                memset(&root->defrag_progress, 0,
                       sizeof(root->defrag_progress));
+               root->defrag_trans_start = trans->transid;
        }
        return ret;
 }