Revert "btrfs: turn delayed_nodes_tree into an XArray"
author    David Sterba <dsterba@suse.com>
          Fri, 15 Jul 2022 11:59:45 +0000 (13:59 +0200)
committer David Sterba <dsterba@suse.com>
          Fri, 15 Jul 2022 17:15:19 +0000 (19:15 +0200)
This reverts commit 253bf57555e451dec5a7f09dc95d380ce8b10e5b.

Revert the xarray conversion: there is a problem with a potential
sleep-inside-spinlock [1] when calling xa_insert, which can trigger a
GFP_NOFS allocation. The radix tree used the preloading mechanism to
avoid sleeping, but that is not available in the xarray API.

Converting the spin lock to a mutex would be possible, but at the time
of rc6 that is riskier than a clean revert.

[1] https://lore.kernel.org/linux-btrfs/cover.1657097693.git.fdmanana@suse.com/
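
For illustration only, not part of the original commit message: a minimal
sketch of the two insertion patterns, using the same identifiers as the
btrfs code in the diff below, with error handling simplified.

    /*
     * The xarray conversion effectively inserted like this:
     *
     *     spin_lock(&root->inode_lock);
     *     ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS);
     *     spin_unlock(&root->inode_lock);
     *
     * xa_insert() may allocate with GFP_NOFS and therefore sleep while
     * root->inode_lock (a spinlock) is held.  The radix tree API that
     * this revert restores performs the allocation up front, outside
     * the lock:
     */
    ret = radix_tree_preload(GFP_NOFS);     /* may sleep; no lock held yet */
    if (ret)
            return ERR_PTR(ret);

    spin_lock(&root->inode_lock);
    /* the insert itself does not allocate, so it cannot sleep here */
    ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
    spin_unlock(&root->inode_lock);
    radix_tree_preload_end();               /* re-enables preemption */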

Reported-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/disk-io.c
fs/btrfs/inode.c

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a240e8b83709f8f0281e45e6b9ec08989ba73cf9..9c21e214d29e410883fc82f2297e80e021144321 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1222,10 +1222,10 @@ struct btrfs_root {
        struct rb_root inode_tree;
 
        /*
-        * Xarray that keeps track of delayed nodes of every inode, protected
-        * by inode_lock
+        * radix tree that keeps track of delayed nodes of every inode,
+        * protected by inode_lock
         */
-       struct xarray delayed_nodes;
+       struct radix_tree_root delayed_nodes_tree;
        /*
         * right now this just gets used so that a root has its own devid
         * for stat.  It may be used for more later
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 66779ab3ed4a3f457bdd5617bce91b43c487af67..748bf6b0d8600fcd380e38b91a973c0255c17c76 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -78,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
        }
 
        spin_lock(&root->inode_lock);
-       node = xa_load(&root->delayed_nodes, ino);
+       node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
 
        if (node) {
                if (btrfs_inode->delayed_node) {
@@ -90,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 
                /*
                 * It's possible that we're racing into the middle of removing
-                * this node from the xarray.  In this case, the refcount
+                * this node from the radix tree.  In this case, the refcount
                 * was zero and it should never go back to one.  Just return
-                * NULL like it was never in the xarray at all; our release
+                * NULL like it was never in the radix at all; our release
                 * function is in the process of removing it.
                 *
                 * Some implementations of refcount_inc refuse to bump the
@@ -100,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
                 * here, refcount_inc() may decide to just WARN_ONCE() instead
                 * of actually bumping the refcount.
                 *
-                * If this node is properly in the xarray, we want to bump the
+                * If this node is properly in the radix, we want to bump the
                 * refcount twice, once for the inode and once for this get
                 * operation.
                 */
@@ -128,30 +128,36 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
        u64 ino = btrfs_ino(btrfs_inode);
        int ret;
 
-       do {
-               node = btrfs_get_delayed_node(btrfs_inode);
-               if (node)
-                       return node;
+again:
+       node = btrfs_get_delayed_node(btrfs_inode);
+       if (node)
+               return node;
 
-               node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
-               if (!node)
-                       return ERR_PTR(-ENOMEM);
-               btrfs_init_delayed_node(node, root, ino);
+       node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
+       if (!node)
+               return ERR_PTR(-ENOMEM);
+       btrfs_init_delayed_node(node, root, ino);
 
-               /* Cached in the inode and can be accessed */
-               refcount_set(&node->refs, 2);
+       /* cached in the btrfs inode and can be accessed */
+       refcount_set(&node->refs, 2);
 
-               spin_lock(&root->inode_lock);
-               ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS);
-               if (ret) {
-                       spin_unlock(&root->inode_lock);
-                       kmem_cache_free(delayed_node_cache, node);
-                       if (ret != -EBUSY)
-                               return ERR_PTR(ret);
-               }
-       } while (ret);
+       ret = radix_tree_preload(GFP_NOFS);
+       if (ret) {
+               kmem_cache_free(delayed_node_cache, node);
+               return ERR_PTR(ret);
+       }
+
+       spin_lock(&root->inode_lock);
+       ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
+       if (ret == -EEXIST) {
+               spin_unlock(&root->inode_lock);
+               kmem_cache_free(delayed_node_cache, node);
+               radix_tree_preload_end();
+               goto again;
+       }
        btrfs_inode->delayed_node = node;
        spin_unlock(&root->inode_lock);
+       radix_tree_preload_end();
 
        return node;
 }
@@ -270,7 +276,8 @@ static void __btrfs_release_delayed_node(
                 * back up.  We can delete it now.
                 */
                ASSERT(refcount_read(&delayed_node->refs) == 0);
-               xa_erase(&root->delayed_nodes, delayed_node->inode_id);
+               radix_tree_delete(&root->delayed_nodes_tree,
+                                 delayed_node->inode_id);
                spin_unlock(&root->inode_lock);
                kmem_cache_free(delayed_node_cache, delayed_node);
        }
@@ -1863,35 +1870,34 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
 
 void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
 {
-       unsigned long index = 0;
-       struct btrfs_delayed_node *delayed_node;
+       u64 inode_id = 0;
        struct btrfs_delayed_node *delayed_nodes[8];
+       int i, n;
 
        while (1) {
-               int n = 0;
-
                spin_lock(&root->inode_lock);
-               if (xa_empty(&root->delayed_nodes)) {
+               n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
+                                          (void **)delayed_nodes, inode_id,
+                                          ARRAY_SIZE(delayed_nodes));
+               if (!n) {
                        spin_unlock(&root->inode_lock);
-                       return;
+                       break;
                }
 
-               xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) {
+               inode_id = delayed_nodes[n - 1]->inode_id + 1;
+               for (i = 0; i < n; i++) {
                        /*
                         * Don't increase refs in case the node is dead and
                         * about to be removed from the tree in the loop below
                         */
-                       if (refcount_inc_not_zero(&delayed_node->refs)) {
-                               delayed_nodes[n] = delayed_node;
-                               n++;
-                       }
-                       if (n >= ARRAY_SIZE(delayed_nodes))
-                               break;
+                       if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
+                               delayed_nodes[i] = NULL;
                }
-               index++;
                spin_unlock(&root->inode_lock);
 
-               for (int i = 0; i < n; i++) {
+               for (i = 0; i < n; i++) {
+                       if (!delayed_nodes[i])
+                               continue;
                        __btrfs_kill_delayed_node(delayed_nodes[i]);
                        btrfs_release_delayed_node(delayed_nodes[i]);
                }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f142ab43df367a26ba62fe80c07a0b204101a601..daa756b3606de9794ee54da68ebeaba874753fa4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1159,7 +1159,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
        root->nr_delalloc_inodes = 0;
        root->nr_ordered_extents = 0;
        root->inode_tree = RB_ROOT;
-       xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
+       INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
 
        btrfs_init_root_block_rsv(root);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 306673ef72562114b0d53369815356bcb07870b5..329ad27f091890e7e20887c25fc4e5d92d0dbabb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3910,7 +3910,7 @@ cache_index:
         * cache.
         *
         * This is required for both inode re-read from disk and delayed inode
-        * in the delayed_nodes xarray.
+        * in delayed_nodes_tree.
         */
        if (BTRFS_I(inode)->last_trans == fs_info->generation)
                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,