btrfs: avoid unnecessary lock and leaf splits when updating inode in the log

author Filipe Manana <fdmanana@suse.com>

Tue, 20 Jul 2021 15:03:43 +0000 (16:03 +0100)

committer David Sterba <dsterba@suse.com>

Mon, 23 Aug 2021 11:19:01 +0000 (13:19 +0200)
author Filipe Manana <fdmanana@suse.com>
Tue, 20 Jul 2021 15:03:43 +0000 (16:03 +0100)
committer David Sterba <dsterba@suse.com>
Mon, 23 Aug 2021 11:19:01 +0000 (13:19 +0200)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c

index 3e6c8f8..8dde5c0 100644 (file)
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3972,14 +3972,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
  
  static int log_inode_item(struct btrfs_trans_handle *trans,
                           struct btrfs_root *log, struct btrfs_path *path,
-                         struct btrfs_inode *inode)
+                         struct btrfs_inode *inode, bool inode_item_dropped)
  {
         struct btrfs_inode_item *inode_item;
         int ret;
  
-       ret = btrfs_insert_empty_item(trans, log, path,
-                                     &inode->location, sizeof(*inode_item));
-       if (ret && ret != -EEXIST)
+       /*
+        * If we are doing a fast fsync and the inode was logged before in the
+        * current transaction, then we know the inode item already exists in
+        * the log tree. For performance reasons, in this case use
+        * btrfs_search_slot() directly with ins_len set to 0 so that we never
+        * attempt a write lock on the leaf's parent, which adds unnecessary lock
+        * contention in case there are concurrent fsyncs for other inodes of the
+        * same subvolume. Using btrfs_insert_empty_item() when the inode item
+        * already exists can also result in unnecessarily splitting a leaf.
+        */
+       if (!inode_item_dropped && inode->logged_trans == trans->transid) {
+               ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
+               ASSERT(ret <= 0);
+               if (ret > 0)
+                       ret = -ENOENT;
+       } else {
+               /*
+                * This means it is the first fsync in the current transaction,
+                * so the inode item is not in the log and we need to insert it.
+                * We can never get -EEXIST because we are only called for a fast
+                * fsync, and if the inode is evicted after being logged in the
+                * current transaction, then when it is loaded again we set
+                * BTRFS_INODE_NEEDS_FULL_SYNC on its runtime flags and set
+                * ->logged_trans to 0.
+                */
+               ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
+                                             sizeof(*inode_item));
+               ASSERT(ret != -EEXIST);
+       }
+       if (ret)
                 return ret;
         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                     struct btrfs_inode_item);
@@ -5303,6 +5330,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
         bool need_log_inode_item = true;
         bool xattrs_logged = false;
         bool recursive_logging = false;
+       bool inode_item_dropped = true;
  
         path = btrfs_alloc_path();
         if (!path)
@@ -5437,6 +5465,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                 } else {
                         if (inode_only == LOG_INODE_ALL)
                                 fast_search = true;
+                       inode_item_dropped = false;
                         goto log_extents;
                 }
  
@@ -5470,7 +5499,7 @@ log_extents:
         btrfs_release_path(path);
         btrfs_release_path(dst_path);
         if (need_log_inode_item) {
-               err = log_inode_item(trans, log, dst_path, inode);
+               err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
                 if (err)
                         goto out_unlock;
                 /*
author	Filipe Manana <fdmanana@suse.com>
	Tue, 20 Jul 2021 15:03:43 +0000 (16:03 +0100)
committer	David Sterba <dsterba@suse.com>
	Mon, 23 Aug 2021 11:19:01 +0000 (13:19 +0200)