btrfs: improve the batch insertion of delayed items
authorFilipe Manana <fdmanana@suse.com>
Tue, 20 Jul 2021 15:05:22 +0000 (16:05 +0100)
committerDavid Sterba <dsterba@suse.com>
Mon, 23 Aug 2021 11:19:00 +0000 (13:19 +0200)
When we insert the delayed items of an inode, which corresponds to the
directory index keys for a directory (key type BTRFS_DIR_INDEX_KEY), we
do the following:

1) Pick the first delayed item from the rbtree and insert it into the
   fs/subvolume btree, using btrfs_insert_empty_item() for that;

2) Without releasing the path returned by btrfs_insert_empty_item(),
   keep collecting as many consecutive delayed items from the rbtree
   as possible, as long as each one's BTRFS_DIR_INDEX_KEY key is the
   immediate successor of the previously picked item and as long as
   they fit in the available space of the leaf the path points to;

3) Then insert all the collected items into the leaf;

4) Release the reserve metadata space for each collected item and
   release each item (implies deleting from the rbtree);

5) Unlock the path.

While this is much better than inserting items one by one, it can be
improved in a few aspects:

1) Instead of adding items based on the remaining free space of the
   leaf, collect as many items that can fit in a leaf and bulk insert
   them. This results in less and larger batches, reducing the total
   amount of time to insert the delayed items. For example when adding
   100K files to a directory, we ended up creating 1658 batches with
   very variable sizes ranging from 1 item to 118 items, on a filesystem
   with a node/leaf size of 16K. After this change, we end up with 839
   batches, with the vast majority of them having exactly 120 items;

2) We do the search for more items to batch, by iterating the rbtree,
   while holding a write lock on the leaf;

3) While still holding the leaf locked, we are releasing the reserved
   metadata for each item and then deleting each item, keeping a write
   lock on the leaf for longer than necessary. Releasing the delayed items
   one by one can take a significant amount of time, because deleting
   them from the rbtree can often be a bit slow when the deletion results
   in rebalancing the rbtree.

So change this so that we try to create larger batches, with a total
item size up to the maximum a leaf can support, and by unlocking the leaf
immediately after inserting the items, releasing the reserved metadata
space of each item and releasing each item without holding the write lock
on the leaf.

The following script that runs fs_mark was used to test this change:

  $ cat test.sh
  #!/bin/bash

  DEV=/dev/nvme0n1
  MNT=/mnt/nvme0n1
  MOUNT_OPTIONS="-o ssd"
  MKFS_OPTIONS="-m single -d single"
  FILES=1000000
  THREADS=16
  FILE_SIZE=0

  echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

  umount $DEV &> /dev/null
  mkfs.btrfs -f $MKFS_OPTIONS $DEV
  mount $MOUNT_OPTIONS $DEV $MNT

  OPTS="-S 0 -L 5 -n $FILES -s $FILE_SIZE -t 16"
  for ((i = 1; i <= $THREADS; i++)); do
      OPTS="$OPTS -d $MNT/d$i"
  done

  fs_mark $OPTS

  umount $MNT

It was run on machine with 12 cores, 64G of ram, using a NVMe device and
using a non-debug kernel config (Debian's default config).

Results before this change:

FSUse%        Count         Size    Files/sec         App Overhead
     1     16000000            0      76182.1             72223046
     3     32000000            0      62746.9             80776528
     5     48000000            0      77029.0             93022381
     6     64000000            0      73691.6             95251075
     8     80000000            0      66288.0             85089634

Results after this change:

FSUse%        Count         Size    Files/sec         App Overhead
     1     16000000            0      79049.5 (+3.7%)     69700824
     3     32000000            0      65248.9 (+3.9%)     80583693
     5     48000000            0      77991.4 (+1.2%)     90040908
     6     64000000            0      75096.8 (+1.9%)     89862241
     8     80000000            0      66926.8 (+1.0%)     84429169

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/delayed-inode.c

index 257c1e18abd4fa50a74edc8d18a0edc3c75dab69..20f3e748027e32ecfd9769acf6c5d1fa0bbaf360 100644 (file)
@@ -672,176 +672,122 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * This helper will insert some continuous items into the same leaf according
- * to the free space of the leaf.
+ * Insert a single delayed item or a batch of delayed items that have consecutive
+ * keys if they exist.
  */
-static int btrfs_batch_insert_items(struct btrfs_root *root,
-                                   struct btrfs_path *path,
-                                   struct btrfs_delayed_item *item)
+static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct btrfs_path *path,
+                                    struct btrfs_delayed_item *first_item)
 {
-       struct btrfs_delayed_item *curr, *next;
-       int free_space;
-       int total_size = 0;
-       struct extent_buffer *leaf;
-       char *data_ptr;
-       struct btrfs_key *keys;
-       u32 *data_size;
-       struct list_head head;
-       int slot;
+       LIST_HEAD(batch);
+       struct btrfs_delayed_item *curr;
+       struct btrfs_delayed_item *next;
+       const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+       int total_size;
        int nitems;
-       int i;
-       int ret = 0;
-
-       BUG_ON(!path->nodes[0]);
-
-       leaf = path->nodes[0];
-       free_space = btrfs_leaf_free_space(leaf);
-       INIT_LIST_HEAD(&head);
+       unsigned int nofs_flag;
+       char *ins_data = NULL;
+       struct btrfs_key *ins_keys;
+       u32 *ins_sizes;
+       int ret;
 
-       next = item;
-       nitems = 0;
+       list_add_tail(&first_item->tree_list, &batch);
+       nitems = 1;
+       total_size = first_item->data_len + sizeof(struct btrfs_item);
+       curr = first_item;
 
-       /*
-        * count the number of the continuous items that we can insert in batch
-        */
-       while (total_size + next->data_len + sizeof(struct btrfs_item) <=
-              free_space) {
-               total_size += next->data_len + sizeof(struct btrfs_item);
-               list_add_tail(&next->tree_list, &head);
-               nitems++;
+       while (true) {
+               int next_size;
 
-               curr = next;
                next = __btrfs_next_delayed_item(curr);
-               if (!next)
+               if (!next || !btrfs_is_continuous_delayed_item(curr, next))
                        break;
 
-               if (!btrfs_is_continuous_delayed_item(curr, next))
+               next_size = next->data_len + sizeof(struct btrfs_item);
+               if (total_size + next_size > max_size)
                        break;
-       }
 
-       if (!nitems) {
-               ret = 0;
-               goto out;
+               list_add_tail(&next->tree_list, &batch);
+               nitems++;
+               total_size += next_size;
+               curr = next;
        }
 
-       keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
-       if (!keys) {
-               ret = -ENOMEM;
-               goto out;
-       }
+       if (nitems == 1) {
+               ins_keys = &first_item->key;
+               ins_sizes = &first_item->data_len;
+       } else {
+               int i = 0;
 
-       data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
-       if (!data_size) {
-               ret = -ENOMEM;
-               goto error;
+               ins_data = kmalloc(nitems * sizeof(u32) +
+                                  nitems * sizeof(struct btrfs_key), GFP_NOFS);
+               if (!ins_data) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               ins_sizes = (u32 *)ins_data;
+               ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32));
+               list_for_each_entry(curr, &batch, tree_list) {
+                       ins_keys[i] = curr->key;
+                       ins_sizes[i] = curr->data_len;
+                       i++;
+               }
        }
 
-       /* get keys of all the delayed items */
-       i = 0;
-       list_for_each_entry(next, &head, tree_list) {
-               keys[i] = next->key;
-               data_size[i] = next->data_len;
-               i++;
-       }
+       nofs_flag = memalloc_nofs_save();
+       ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
+                                      nitems);
+       memalloc_nofs_restore(nofs_flag);
+       if (ret)
+               goto out;
 
-       /* insert the keys of the items */
-       setup_items_for_insert(root, path, keys, data_size, nitems);
+       list_for_each_entry(curr, &batch, tree_list) {
+               char *data_ptr;
 
-       /* insert the dir index items */
-       slot = path->slots[0];
-       list_for_each_entry_safe(curr, next, &head, tree_list) {
-               data_ptr = btrfs_item_ptr(leaf, slot, char);
-               write_extent_buffer(leaf, &curr->data,
-                                   (unsigned long)data_ptr,
-                                   curr->data_len);
-               slot++;
+               data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+               write_extent_buffer(path->nodes[0], &curr->data,
+                                   (unsigned long)data_ptr, curr->data_len);
+               path->slots[0]++;
+       }
 
-               btrfs_delayed_item_release_metadata(root, curr);
+       /*
+        * Now release our path before releasing the delayed items and their
+        * metadata reservations, so that we don't block other tasks for more
+        * time than needed.
+        */
+       btrfs_release_path(path);
 
+       list_for_each_entry_safe(curr, next, &batch, tree_list) {
                list_del(&curr->tree_list);
+               btrfs_delayed_item_release_metadata(root, curr);
                btrfs_release_delayed_item(curr);
        }
-
-error:
-       kfree(data_size);
-       kfree(keys);
 out:
+       kfree(ins_data);
        return ret;
 }
 
-/*
- * This helper can just do simple insertion that needn't extend item for new
- * data, such as directory name index insertion, inode insertion.
- */
-static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
-                                    struct btrfs_root *root,
-                                    struct btrfs_path *path,
-                                    struct btrfs_delayed_item *delayed_item)
-{
-       struct extent_buffer *leaf;
-       unsigned int nofs_flag;
-       char *ptr;
-       int ret;
-
-       nofs_flag = memalloc_nofs_save();
-       ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
-                                     delayed_item->data_len);
-       memalloc_nofs_restore(nofs_flag);
-       if (ret < 0 && ret != -EEXIST)
-               return ret;
-
-       leaf = path->nodes[0];
-
-       ptr = btrfs_item_ptr(leaf, path->slots[0], char);
-
-       write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
-                           delayed_item->data_len);
-       btrfs_mark_buffer_dirty(leaf);
-
-       btrfs_delayed_item_release_metadata(root, delayed_item);
-       return 0;
-}
-
-/*
- * we insert an item first, then if there are some continuous items, we try
- * to insert those items into the same leaf.
- */
 static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
                                      struct btrfs_path *path,
                                      struct btrfs_root *root,
                                      struct btrfs_delayed_node *node)
 {
-       struct btrfs_delayed_item *curr, *prev;
        int ret = 0;
 
-do_again:
-       mutex_lock(&node->mutex);
-       curr = __btrfs_first_delayed_insertion_item(node);
-       if (!curr)
-               goto insert_end;
-
-       ret = btrfs_insert_delayed_item(trans, root, path, curr);
-       if (ret < 0) {
-               btrfs_release_path(path);
-               goto insert_end;
-       }
+       while (ret == 0) {
+               struct btrfs_delayed_item *curr;
 
-       prev = curr;
-       curr = __btrfs_next_delayed_item(prev);
-       if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
-               /* insert the continuous items into the same leaf */
-               path->slots[0]++;
-               btrfs_batch_insert_items(root, path, curr);
+               mutex_lock(&node->mutex);
+               curr = __btrfs_first_delayed_insertion_item(node);
+               if (!curr) {
+                       mutex_unlock(&node->mutex);
+                       break;
+               }
+               ret = btrfs_insert_delayed_item(trans, root, path, curr);
+               mutex_unlock(&node->mutex);
        }
-       btrfs_release_delayed_item(prev);
-       btrfs_mark_buffer_dirty(path->nodes[0]);
-
-       btrfs_release_path(path);
-       mutex_unlock(&node->mutex);
-       goto do_again;
 
-insert_end:
-       mutex_unlock(&node->mutex);
        return ret;
 }