btrfs: qgroup: Use generation-aware subtree swap to mark dirty extents
authorQu Wenruo <wqu@suse.com>
Thu, 27 Sep 2018 06:42:32 +0000 (14:42 +0800)
committerDavid Sterba <dsterba@suse.com>
Mon, 15 Oct 2018 15:23:36 +0000 (17:23 +0200)
Before this patch, with quota enabled during balance, we need to mark
the whole subtree dirty for quota.

E.g.
OO = Old tree blocks (from file tree)
NN = New tree blocks (from reloc tree)

        File tree (src)           Reloc tree (dst)
            OO (a)                              NN (a)
           /  \                                /  \
     (b) OO    OO (c)                    (b) NN    NN (c)
        /  \  /  \                          /  \  /  \
       OO  OO OO OO (d)                    OO  OO OO NN (d)

For old balance + quota case, quota will mark the whole src and dst tree
dirty, including all the 3 old tree blocks in reloc tree.

This is acceptable for a small file tree, or when the new tree blocks
are all located at a lower level.

But for a large file tree, or when the new tree blocks are all located
at a higher level, this leads to marking the whole tree dirty, which is
unbelievably slow.

This patch will change how we handle such balance with quota enabled
case.

Now we will search from (b) and (c) for any new tree blocks whose
generation is not smaller than @last_snapshot, and only mark them dirty.

In above case, we only need to trace tree blocks NN(b), NN(c) and NN(d).
(NN(a) will be traced when COW happens for nodeptr modification).  And
also for tree blocks OO(b), OO(c), OO(d). (OO(a) will be traced when COW
happens for nodeptr modification.)

For above case, we could skip 3 tree blocks, but for larger tree, we can
skip tons of unmodified tree blocks, and hugely speed up balance.

This patch will introduce a new function,
btrfs_qgroup_trace_subtree_swap(), which will do the following main
work:

1) Read out real root eb
   And setup basic dst_path for later calls
2) Call qgroup_trace_new_subtree_blocks()
   To trace all new tree blocks in the reloc tree and their
   counterparts in the file tree.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/qgroup.c
fs/btrfs/qgroup.h
fs/btrfs/relocation.c

index 0b49575698da5712b1577f4a3108ae045d73ec01..6b35b34810854a77d2d5ef070ff5317e23f0ad22 100644 (file)
@@ -2009,6 +2009,110 @@ out:
        return ret;
 }
 
+/*
+ * Inform qgroup to trace subtree swap used in balance.
+ *
+ * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
+ * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
+ *
+ * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
+ * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
+ * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
+ * the counterpart of the tree block, then mark both tree blocks as qgroup dirty,
+ * and skip all tree blocks whose generation is smaller than last_snapshot.
+ *
+ * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
+ * which could be the cause of very slow balance if the file tree is large.
+ *
+ * @src_parent, @src_slot: pointer to src (file tree) eb.
+ * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
+ */
+int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+                               struct extent_buffer *src_parent, int src_slot,
+                               struct extent_buffer *dst_parent, int dst_slot,
+                               u64 last_snapshot)
+{
+       struct btrfs_fs_info *fs_info = trans->fs_info;
+       struct btrfs_path *dst_path = NULL;
+       struct btrfs_key first_key;
+       struct extent_buffer *src_eb = NULL;
+       struct extent_buffer *dst_eb = NULL;
+       u64 child_gen;
+       u64 child_bytenr;
+       int level;
+       int ret;
+
+       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+               return 0;
+
+       /* Check parameter order */
+       if (btrfs_node_ptr_generation(src_parent, src_slot) >
+           btrfs_node_ptr_generation(dst_parent, dst_slot)) {
+               btrfs_err_rl(fs_info,
+               "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
+                       btrfs_node_ptr_generation(src_parent, src_slot),
+                       btrfs_node_ptr_generation(dst_parent, dst_slot));
+               return -EUCLEAN;
+       }
+
+       /* Read out real @src_eb, pointed by @src_parent and @src_slot */
+       child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
+       child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
+       btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
+
+       src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
+                       btrfs_header_level(src_parent) - 1, &first_key);
+       if (IS_ERR(src_eb)) {
+               ret = PTR_ERR(src_eb);
+               goto out;
+       }
+
+       /* Read out real @dst_eb, pointed by @dst_parent and @dst_slot */
+       child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
+       child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
+       btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
+
+       dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
+                       btrfs_header_level(dst_parent) - 1, &first_key);
+       if (IS_ERR(dst_eb)) {
+               ret = PTR_ERR(dst_eb);
+               goto out;
+       }
+
+       if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       level = btrfs_header_level(dst_eb);
+       dst_path = btrfs_alloc_path();
+       if (!dst_path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /* For dst_path */
+       extent_buffer_get(dst_eb);
+       dst_path->nodes[level] = dst_eb;
+       dst_path->slots[level] = 0;
+       dst_path->locks[level] = 0;
+
+       /* Do the generation-aware breadth-first search */
+       ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
+                                             level, last_snapshot);
+       if (ret < 0)
+               goto out;
+       ret = 0;
+
+out:
+       free_extent_buffer(src_eb);
+       free_extent_buffer(dst_eb);
+       btrfs_free_path(dst_path);
+       if (ret < 0)
+               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+       return ret;
+}
+
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
                               struct extent_buffer *root_eb,
                               u64 root_gen, int root_level)
index 54b8bb282c0e0a30f3f3efe1ed3fc970641a7d96..1aaf4c2769008137282dca87ac948237bf4a2241 100644 (file)
@@ -236,6 +236,11 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
                               struct extent_buffer *root_eb,
                               u64 root_gen, int root_level);
+
+int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+                               struct extent_buffer *src_parent, int src_slot,
+                               struct extent_buffer *dst_parent, int dst_slot,
+                               u64 last_snapshot);
 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
                                u64 num_bytes, struct ulist *old_roots,
                                struct ulist *new_roots);
index a5c5e9b3aceb0091633bbec57463e9ca3656d44d..d10357122aa1cf22f4d7b6b8addbaea1a6911f77 100644 (file)
@@ -1888,14 +1888,9 @@ again:
                 *    and tree block numbers, if current trans doesn't free
                 *    data reloc tree inode.
                 */
-               ret = btrfs_qgroup_trace_subtree(trans, parent,
-                               btrfs_header_generation(parent),
-                               btrfs_header_level(parent));
-               if (ret < 0)
-                       break;
-               ret = btrfs_qgroup_trace_subtree(trans, path->nodes[level],
-                               btrfs_header_generation(path->nodes[level]),
-                               btrfs_header_level(path->nodes[level]));
+               ret = btrfs_qgroup_trace_subtree_swap(trans, parent, slot,
+                               path->nodes[level], path->slots[level],
+                               last_snapshot);
                if (ret < 0)
                        break;