btrfs: avoid blocking on space reservation when doing nowait dio writes
author Filipe Manana <fdmanana@suse.com>
Wed, 23 Mar 2022 16:19:30 +0000 (16:19 +0000)
committer David Sterba <dsterba@suse.com>
Mon, 16 May 2022 15:03:10 +0000 (17:03 +0200)
When doing a NOWAIT direct IO write, if we can NOCOW then it means we can
proceed with the non-blocking, NOWAIT path. However, reserving the metadata
space and qgroup meta space can often result in blocking - flushing
delalloc, waiting for ordered extents to complete, triggering transaction
commits, etc - which goes against the semantics of a NOWAIT write.

So make the NOWAIT write path try to reserve all the metadata it needs
without resulting in blocking behaviour - if we get -ENOSPC or -EDQUOT
then return -EAGAIN to make the caller fall back to a blocking direct IO
write.

This is part of a patchset comprised of the following patches:

  btrfs: avoid blocking on page locks with nowait dio on compressed range
  btrfs: avoid blocking nowait dio when locking file range
  btrfs: avoid double nocow check when doing nowait dio writes
  btrfs: stop allocating a path when checking if cross reference exists
  btrfs: free path at can_nocow_extent() before checking for checksum items
  btrfs: release path earlier at can_nocow_extent()
  btrfs: avoid blocking when allocating context for nowait dio read/write
  btrfs: avoid blocking on space reservation when doing nowait dio writes

The following test was run before and after applying this patchset:

  $ cat io-uring-nodatacow-test.sh
  #!/bin/bash

  DEV=/dev/sdc
  MNT=/mnt/sdc

  MOUNT_OPTIONS="-o ssd -o nodatacow"
  MKFS_OPTIONS="-R free-space-tree -O no-holes"

  NUM_JOBS=4
  FILE_SIZE=8G
  RUN_TIME=300

  cat <<EOF > /tmp/fio-job.ini
  [io_uring_rw]
  rw=randrw
  fsync=0
  fallocate=posix
  group_reporting=1
  direct=1
  ioengine=io_uring
  iodepth=64
  bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5
  filesize=$FILE_SIZE
  runtime=$RUN_TIME
  time_based
  filename=foobar
  directory=$MNT
  numjobs=$NUM_JOBS
  thread
  EOF

  echo performance | \
     tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

  umount $MNT &> /dev/null
  mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
  mount $MOUNT_OPTIONS $DEV $MNT

  fio /tmp/fio-job.ini

  umount $MNT

The test was run on a 12-core box with 64G of RAM, using a non-debug kernel
config (Debian's default config) and a spinning disk.

Result before the patchset:

 READ: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
WRITE: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec

Result after the patchset:

 READ: bw=436MiB/s (457MB/s), 436MiB/s-436MiB/s (457MB/s-457MB/s), io=128GiB (137GB), run=300044-300044msec
WRITE: bw=435MiB/s (456MB/s), 435MiB/s-435MiB/s (456MB/s-456MB/s), io=128GiB (137GB), run=300044-300044msec

That's about +7.2% throughput for reads and +6.9% for writes.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/ctree.h
fs/btrfs/delalloc-space.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/qgroup.c
fs/btrfs/qgroup.h
fs/btrfs/relocation.c
fs/btrfs/root-tree.c

index 79399eb..be126aa 100644 (file)
@@ -2893,7 +2893,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
-                                   u64 disk_num_bytes);
+                                   u64 disk_num_bytes, bool noflush);
 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
                                   u64 start, u64 end);
index bd8267c..36ab085 100644 (file)
@@ -289,7 +289,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
 }
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
-                                   u64 disk_num_bytes)
+                                   u64 disk_num_bytes, bool noflush)
 {
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -308,7 +308,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
         * If we have a transaction open (can happen if we call truncate_block
         * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
         */
-       if (btrfs_is_free_space_inode(inode)) {
+       if (noflush || btrfs_is_free_space_inode(inode)) {
                flush = BTRFS_RESERVE_NO_FLUSH;
        } else {
                if (current->journal_info)
@@ -333,7 +333,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
         */
        calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
                                &meta_reserve, &qgroup_reserve);
-       ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
+       ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
+                                                noflush);
        if (ret)
                return ret;
        ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
@@ -456,7 +457,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
        ret = btrfs_check_data_free_space(inode, reserved, start, len);
        if (ret < 0)
                return ret;
-       ret = btrfs_delalloc_reserve_metadata(inode, len, len);
+       ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
        if (ret < 0) {
                btrfs_free_reserved_data_space(inode, *reserved, start, len);
                extent_changeset_free(*reserved);
index ceac806..b64fb93 100644 (file)
@@ -1684,7 +1684,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
                WARN_ON(reserve_bytes == 0);
                ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
                                                      reserve_bytes,
-                                                     reserve_bytes);
+                                                     reserve_bytes, false);
                if (ret) {
                        if (!only_release_metadata)
                                btrfs_free_reserved_data_space(BTRFS_I(inode),
index 4254c3c..b3f2010 100644 (file)
@@ -4705,7 +4705,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
                        goto out;
                }
        }
-       ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize);
+       ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
        if (ret < 0) {
                if (!only_release_metadata)
                        btrfs_free_reserved_data_space(inode, data_reserved,
@@ -7415,6 +7415,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
                                         u64 start, u64 len,
                                         unsigned int iomap_flags)
 {
+       const bool nowait = (iomap_flags & IOMAP_NOWAIT);
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct extent_map *em = *map;
        int type;
@@ -7454,12 +7455,15 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
                struct extent_map *em2;
 
                /* We can NOCOW, so only need to reserve metadata space. */
-               ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len);
+               ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+                                                     nowait);
                if (ret < 0) {
                        /* Our caller expects us to free the input extent map. */
                        free_extent_map(em);
                        *map = NULL;
                        btrfs_dec_nocow_writers(fs_info, block_start);
+                       if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
+                               ret = -EAGAIN;
                        goto out;
                }
                space_reserved = true;
@@ -7483,7 +7487,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
                free_extent_map(em);
                *map = NULL;
 
-               if (iomap_flags & IOMAP_NOWAIT)
+               if (nowait)
                        return -EAGAIN;
 
                /* We have to COW, so need to reserve metadata and data space. */
@@ -10801,7 +10805,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
        ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
        if (ret)
                goto out_free_data_space;
-       ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes);
+       ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
+                                             false);
        if (ret)
                goto out_qgroup_free_data;
 
index a9fed81..db723c0 100644 (file)
@@ -3939,12 +3939,13 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
 }
 
 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
-                               enum btrfs_qgroup_rsv_type type, bool enforce)
+                               enum btrfs_qgroup_rsv_type type, bool enforce,
+                               bool noflush)
 {
        int ret;
 
        ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
-       if (ret <= 0 && ret != -EDQUOT)
+       if ((ret <= 0 && ret != -EDQUOT) || noflush)
                return ret;
 
        ret = try_flush_qgroup(root);
index 880e9df..0c4dd2a 100644 (file)
@@ -364,19 +364,23 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode,
 int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
                              enum btrfs_qgroup_rsv_type type, bool enforce);
 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
-                               enum btrfs_qgroup_rsv_type type, bool enforce);
+                               enum btrfs_qgroup_rsv_type type, bool enforce,
+                               bool noflush);
 /* Reserve metadata space for pertrans and prealloc type */
 static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
                                int num_bytes, bool enforce)
 {
        return __btrfs_qgroup_reserve_meta(root, num_bytes,
-                       BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
+                                          BTRFS_QGROUP_RSV_META_PERTRANS,
+                                          enforce, false);
 }
 static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
-                               int num_bytes, bool enforce)
+                                                    int num_bytes, bool enforce,
+                                                    bool noflush)
 {
        return __btrfs_qgroup_reserve_meta(root, num_bytes,
-                       BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
+                                          BTRFS_QGROUP_RSV_META_PREALLOC,
+                                          enforce, noflush);
 }
 
 void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
index fdc2c4b..b1c36fc 100644 (file)
@@ -2997,7 +2997,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
 
                /* Reserve metadata for this range */
                ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
-                                                     clamped_len, clamped_len);
+                                                     clamped_len, clamped_len,
+                                                     false);
                if (ret)
                        goto release_page;
 
index ca7426e..a64b26b 100644 (file)
@@ -509,7 +509,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                /* One for parent inode, two for dir entries */
                qgroup_num_bytes = 3 * fs_info->nodesize;
                ret = btrfs_qgroup_reserve_meta_prealloc(root,
-                               qgroup_num_bytes, true);
+                                                        qgroup_num_bytes, true,
+                                                        false);
                if (ret)
                        return ret;
        }