btrfs: fix deadlock due to page faults during direct IO reads and writes

author Filipe Manana <fdmanana@suse.com>

Thu, 14 Apr 2022 22:28:54 +0000 (06:28 +0800)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Sun, 1 May 2022 15:22:33 +0000 (17:22 +0200)
author Filipe Manana <fdmanana@suse.com>
Thu, 14 Apr 2022 22:28:54 +0000 (06:28 +0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 1 May 2022 15:22:33 +0000 (17:22 +0200)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c

index af890a0c36e31c3daf015cff30842888b2c37419..ff578c934bbcfa46166116809a0be83ae90bc298 100644 (file)
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1903,16 +1903,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
  
  static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
  {
+       const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
         struct file *file = iocb->ki_filp;
         struct inode *inode = file_inode(file);
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
         loff_t pos;
         ssize_t written = 0;
         ssize_t written_buffered;
+       size_t prev_left = 0;
         loff_t endbyte;
         ssize_t err;
         unsigned int ilock_flags = 0;
-       struct iomap_dio *dio = NULL;
  
         if (iocb->ki_flags & IOCB_NOWAIT)
                 ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1955,23 +1956,80 @@ relock:
                 goto buffered;
         }
  
-       dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-                            0, 0);
+       /*
+        * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
+        * calls generic_write_sync() (through iomap_dio_complete()), because
+        * that results in calling fsync (btrfs_sync_file()) which will try to
+        * lock the inode in exclusive/write mode.
+        */
+       if (is_sync_write)
+               iocb->ki_flags &= ~IOCB_DSYNC;
  
-       btrfs_inode_unlock(inode, ilock_flags);
+       /*
+        * The iov_iter can be mapped to the same file range we are writing to.
+        * If that's the case, then we will deadlock in the iomap code, because
+        * it first calls our callback btrfs_dio_iomap_begin(), which will create
+        * an ordered extent, and after that it will fault in the pages that the
+        * iov_iter refers to. During the fault in we end up in the readahead
+        * pages code (starting at btrfs_readahead()), which will lock the range,
+        * find that ordered extent and then wait for it to complete (at
+        * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
+        * obviously the ordered extent can never complete as we didn't submit
+        * yet the respective bio(s). This always happens when the buffer is
+        * memory mapped to the same file range, since the iomap DIO code always
+        * invalidates pages in the target file range (after starting and waiting
+        * for any writeback).
+        *
+        * So here we disable page faults in the iov_iter and then retry if we
+        * got -EFAULT, faulting in the pages before the retry.
+        */
+again:
+       from->nofault = true;
+       err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+                          IOMAP_DIO_PARTIAL, written);
+       from->nofault = false;
  
-       if (IS_ERR_OR_NULL(dio)) {
-               err = PTR_ERR_OR_ZERO(dio);
-               if (err < 0 && err != -ENOTBLK)
-                       goto out;
-       } else {
-               written = iomap_dio_complete(dio);
+       /* No increment (+=) because iomap returns a cumulative value. */
+       if (err > 0)
+               written = err;
+
+       if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
+               const size_t left = iov_iter_count(from);
+               /*
+                * We have more data left to write. Try to fault in as many as
+                * possible of the remainder pages and retry. We do this without
+                * releasing and locking again the inode, to prevent races with
+                * truncate.
+                *
+                * Also, in case the iov refers to pages in the file range of the
+                * file we want to write to (due to a mmap), we could enter an
+                * infinite loop if we retry after faulting the pages in, since
+                * iomap will invalidate any pages in the range early on, before
+                * it tries to fault in the pages of the iov. So we keep track of
+                * how much was left of iov in the previous EFAULT and fallback
+                * to buffered IO in case we haven't made any progress.
+                */
+               if (left == prev_left) {
+                       err = -ENOTBLK;
+               } else {
+                       fault_in_iov_iter_readable(from, left);
+                       prev_left = left;
+                       goto again;
+               }
         }
  
-       if (written < 0 || !iov_iter_count(from)) {
-               err = written;
+       btrfs_inode_unlock(inode, ilock_flags);
+
+       /*
+        * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
+        * the fsync (call generic_write_sync()).
+        */
+       if (is_sync_write)
+               iocb->ki_flags |= IOCB_DSYNC;
+
+       /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
+       if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
                 goto out;
-       }
  
  buffered:
         pos = iocb->ki_pos;
@@ -1996,7 +2054,7 @@ buffered:
         invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
                                  endbyte >> PAGE_SHIFT);
  out:
-       return written ? written : err;
+       return err < 0 ? err : written;
  }
  
  static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
@@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
  static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
  {
         struct inode *inode = file_inode(iocb->ki_filp);
+       size_t prev_left = 0;
+       ssize_t read = 0;
         ssize_t ret;
  
         if (fsverity_active(inode))
@@ -3668,10 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
                 return 0;
  
         btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+again:
+       /*
+        * This is similar to what we do for direct IO writes, see the comment
+        * at btrfs_direct_write(), but we also disable page faults in addition
+        * to disabling them only at the iov_iter level. This is because when
+        * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
+        * which can still trigger page fault ins despite having set ->nofault
+        * to true of our 'to' iov_iter.
+        *
+        * The difference to direct IO writes is that we deadlock when trying
+        * to lock the extent range in the inode's tree during he page reads
+        * triggered by the fault in (while for writes it is due to waiting for
+        * our own ordered extent). This is because for direct IO reads,
+        * btrfs_dio_iomap_begin() returns with the extent range locked, which
+        * is only unlocked in the endio callback (end_bio_extent_readpage()).
+        */
+       pagefault_disable();
+       to->nofault = true;
         ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-                          0, 0);
+                          IOMAP_DIO_PARTIAL, read);
+       to->nofault = false;
+       pagefault_enable();
+
+       /* No increment (+=) because iomap returns a cumulative value. */
+       if (ret > 0)
+               read = ret;
+
+       if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
+               const size_t left = iov_iter_count(to);
+
+               if (left == prev_left) {
+                       /*
+                        * We didn't make any progress since the last attempt,
+                        * fallback to a buffered read for the remainder of the
+                        * range. This is just to avoid any possibility of looping
+                        * for too long.
+                        */
+                       ret = read;
+               } else {
+                       /*
+                        * We made some progress since the last retry or this is
+                        * the first time we are retrying. Fault in as many pages
+                        * as possible and retry.
+                        */
+                       fault_in_iov_iter_writeable(to, left);
+                       prev_left = left;
+                       goto again;
+               }
+       }
         btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
-       return ret;
+       return ret < 0 ? ret : read;
  }
  
  static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
author	Filipe Manana <fdmanana@suse.com>
	Thu, 14 Apr 2022 22:28:54 +0000 (06:28 +0800)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Sun, 1 May 2022 15:22:33 +0000 (17:22 +0200)