btrfs: refactor the delayed item deletion entry point
authorFilipe Manana <fdmanana@suse.com>
Tue, 31 May 2022 15:06:37 +0000 (16:06 +0100)
committerDavid Sterba <dsterba@suse.com>
Mon, 25 Jul 2022 15:44:35 +0000 (17:44 +0200)
The delayed item deletion entry point, btrfs_delete_delayed_items(), is a
bit convoluted for a few reasons:

1) It's really a loop disguised with labels and goto statements;

2) There's a 'delete_fail' label which isn't only for error cases, we can
   jump to that label even if no error happened, if we simply don't have
   more delayed items to delete;

3) Unnecessarily keeps track of the current and previous items for no
   good reason, as after getting the next item and releasing the current
   one, it just jumps to the 'again' label just to look again for the
   first delayed item;

4) When a delayed item is not in the tree (because it was already deleted
   before), it releases the item while holding a path locked, which is
   not necessary and adds more contention to the tree, specially taking
   into account that the path came from a deletion search, meaning we have
   write locks for nodes at levels 2, 1 and 0. And releasing the item is
   not computationally trivial (rb tree deletion, a kfree() and some
   trivial things).

So refactor it to use a while loop and add some comments to make it more
obvious why we can have delayed items without a matching item in the tree
as well as why not keep the delayed node locked all the time when running
all its deletion items. This is also a preparation for some upcoming work
involving delayed items.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/delayed-inode.c

index c8deab7..ff986c7 100644 (file)
@@ -867,45 +867,52 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      struct btrfs_delayed_node *node)
 {
-       struct btrfs_delayed_item *curr, *prev;
        int ret = 0;
 
-do_again:
-       mutex_lock(&node->mutex);
-       curr = __btrfs_first_delayed_deletion_item(node);
-       if (!curr)
-               goto delete_fail;
+       while (ret == 0) {
+               struct btrfs_delayed_item *item;
+
+               mutex_lock(&node->mutex);
+               item = __btrfs_first_delayed_deletion_item(node);
+               if (!item) {
+                       mutex_unlock(&node->mutex);
+                       break;
+               }
+
+               ret = btrfs_search_slot(trans, root, &item->key, path, -1, 1);
+               if (ret > 0) {
+                       /*
+                        * There's no matching item in the leaf. This means we
+                        * have already deleted this item in a past run of the
+                        * delayed items. We ignore errors when running delayed
+                        * items from an async context, through a work queue job
+                        * running btrfs_async_run_delayed_root(), and don't
+                        * release delayed items that failed to complete. This
+                        * is because we will retry later, and at transaction
+                        * commit time we always run delayed items and will
+                        * then deal with errors if they fail to run again.
+                        *
+                        * So just release delayed items for which we can't find
+                        * an item in the tree, and move to the next item.
+                        */
+                       btrfs_release_path(path);
+                       btrfs_release_delayed_item(item);
+                       ret = 0;
+               } else if (ret == 0) {
+                       ret = btrfs_batch_delete_items(trans, root, path, item);
+                       btrfs_release_path(path);
+               }
 
-       ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
-       if (ret < 0)
-               goto delete_fail;
-       else if (ret > 0) {
                /*
-                * can't find the item which the node points to, so this node
-                * is invalid, just drop it.
+                * We unlock and relock on each iteration, this is to prevent
+                * blocking other tasks for too long while we are being run from
+                * the async context (work queue job). Those tasks are typically
+                * running system calls like creat/mkdir/rename/unlink/etc which
+                * need to add delayed items to this delayed node.
                 */
-               prev = curr;
-               curr = __btrfs_next_delayed_item(prev);
-               btrfs_release_delayed_item(prev);
-               ret = 0;
-               btrfs_release_path(path);
-               if (curr) {
-                       mutex_unlock(&node->mutex);
-                       goto do_again;
-               } else
-                       goto delete_fail;
+               mutex_unlock(&node->mutex);
        }
 
-       ret = btrfs_batch_delete_items(trans, root, path, curr);
-       if (ret)
-               goto delete_fail;
-       btrfs_release_path(path);
-       mutex_unlock(&node->mutex);
-       goto do_again;
-
-delete_fail:
-       btrfs_release_path(path);
-       mutex_unlock(&node->mutex);
        return ret;
 }