btrfs: fix deadlock when cloning inline extent and low on free metadata space

author Filipe Manana <fdmanana@suse.com>
Wed, 2 Dec 2020 11:55:58 +0000 (11:55 +0000)
committer David Sterba <dsterba@suse.com>
Fri, 18 Dec 2020 13:49:50 +0000 (14:49 +0100)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h

index 555cbce..d9bf53d 100644 (file)
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -42,6 +42,15 @@ enum {
          * to an inode.
          */
         BTRFS_INODE_NO_XATTRS,
+       /*
+        * Set when we are in a context where we need to start a transaction and
+        * have dirty pages with the respective file range locked. This is to
+        * ensure that when reserving space for the transaction, if we are low
+        * on available space and need to flush delalloc, we will not flush
+        * delalloc for this inode, because that could result in a deadlock (on
+        * the file range, inode's io_tree).
+        */
+       BTRFS_INODE_NO_DELALLOC_FLUSH,
  };
  
  /* in memory btrfs inode */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h

index 9dde770..2674f24 100644 (file)
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3074,7 +3074,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                                u32 min_type);
  
  int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+                              bool in_reclaim_context);
  int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
                               unsigned int extra_bits,
                               struct extent_state **cached_state);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c

index a98e33f..324f646 100644 (file)
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -715,7 +715,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
          * flush all outstanding I/O and inode extent mappings before the
          * copy operation is declared as being finished
          */
-       ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
+       ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false);
         if (ret) {
                 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
                 return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index 8e23780..0707166 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9390,7 +9390,8 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
   * some fairly slow code that needs optimization. This walks the list
   * of all the inodes with pending delalloc and forces them to disk.
   */
-static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot)
+static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot,
+                                bool in_reclaim_context)
  {
         struct btrfs_inode *binode;
         struct inode *inode;
@@ -9411,6 +9412,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot
  
                 list_move_tail(&binode->delalloc_inodes,
                                &root->delalloc_inodes);
+
+               if (in_reclaim_context &&
+                   test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
+                       continue;
+
                 inode = igrab(&binode->vfs_inode);
                 if (!inode) {
                         cond_resched_lock(&root->delalloc_lock);
@@ -9464,10 +9470,11 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
                 return -EROFS;
  
-       return start_delalloc_inodes(root, &nr, true);
+       return start_delalloc_inodes(root, &nr, true, false);
  }
  
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+                              bool in_reclaim_context)
  {
         struct btrfs_root *root;
         struct list_head splice;
@@ -9490,7 +9497,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr)
                                &fs_info->delalloc_roots);
                 spin_unlock(&fs_info->delalloc_root_lock);
  
-               ret = start_delalloc_inodes(root, &nr, false);
+               ret = start_delalloc_inodes(root, &nr, false, in_reclaim_context);
                 btrfs_put_root(root);
                 if (ret < 0)
                         goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c

index 703212f..dde49a7 100644 (file)
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4951,7 +4951,7 @@ long btrfs_ioctl(struct file *file, unsigned int
         case BTRFS_IOC_SYNC: {
                 int ret;
  
-               ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
+               ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false);
                 if (ret)
                         return ret;
                 ret = btrfs_sync_fs(inode->i_sb, 1);
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c

index ab80896..b03e789 100644 (file)
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -89,6 +89,19 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
         if (ret)
                 goto out_unlock;
  
+       /*
+        * After dirtying the page our caller will need to start a transaction,
+        * and if we are low on metadata free space, that can cause flushing of
+        * delalloc for all inodes in order to get metadata space released.
+        * However we are holding the range locked for the whole duration of
+        * the clone/dedupe operation, so we may deadlock if that happens and no
+        * other task releases enough space. So mark this inode as not being
+        * possible to flush to avoid such deadlock. We will clear that flag
+        * when we finish cloning all extents, since a transaction is started
+        * after finding each extent to clone.
+        */
+       set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
+
         if (comp_type == BTRFS_COMPRESS_NONE) {
                 char *map;
  
@@ -549,6 +562,8 @@ process_slot:
  out:
         btrfs_free_path(path);
         kvfree(buf);
+       clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
+
         return ret;
  }
  
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c

index 6409956..67e55c5 100644 (file)
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -532,7 +532,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
  
         loops = 0;
         while ((delalloc_bytes || dio_bytes) && loops < 3) {
-               btrfs_start_delalloc_roots(fs_info, items);
+               btrfs_start_delalloc_roots(fs_info, items, true);
  
                 loops++;
                 if (wait_ordered && !trans) {
author	Filipe Manana <fdmanana@suse.com>
	Wed, 2 Dec 2020 11:55:58 +0000 (11:55 +0000)
committer	David Sterba <dsterba@suse.com>
	Fri, 18 Dec 2020 13:49:50 +0000 (14:49 +0100)
fs/btrfs/btrfs_inode.h		patch | blob | history
fs/btrfs/ctree.h		patch | blob | history
fs/btrfs/dev-replace.c		patch | blob | history
fs/btrfs/inode.c		patch | blob | history
fs/btrfs/ioctl.c		patch | blob | history
fs/btrfs/reflink.c		patch | blob | history
fs/btrfs/space-info.c		patch | blob | history