btrfs: ensure pages are unlocked on cow_file_range() failure

author Naohiro Aota <naohiro.aota@wdc.com>

Tue, 21 Jun 2022 06:40:59 +0000 (15:40 +0900)

committer David Sterba <dsterba@suse.com>

Mon, 25 Jul 2022 15:45:38 +0000 (17:45 +0200)
author Naohiro Aota <naohiro.aota@wdc.com>
Tue, 21 Jun 2022 06:40:59 +0000 (15:40 +0900)
committer David Sterba <dsterba@suse.com>
Mon, 25 Jul 2022 15:45:38 +0000 (17:45 +0200)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index a00052b..5a58042 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1128,6 +1128,28 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
   * *page_started is set to one if we unlock locked_page and do everything
   * required to start IO on it.  It may be clean and already done with
   * IO when we return.
+ *
+ * When unlock == 1, we unlock the pages in successfully allocated regions.
+ * When unlock == 0, we leave them locked for writing them out.
+ *
+ * However, we unlock all the pages except @locked_page in case of failure.
+ *
+ * In summary, page locking state will be as follows:
+ *
+ * - page_started == 1 (return value)
+ *     - All the pages are unlocked. IO is started.
+ *     - Note that this can happen only on success
+ * - unlock == 1
+ *     - All the pages except @locked_page are unlocked in any case
+ * - unlock == 0
+ *     - On success, all the pages are locked for writing them out
+ *     - On failure, all the pages except @locked_page are unlocked
+ *
+ * When a failure happens in the second or later iteration of the
+ * while-loop, the ordered extents created in previous iterations are kept
+ * intact. So, the caller must clean them up by calling
+ * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
+ * example.
   */
  static noinline int cow_file_range(struct btrfs_inode *inode,
                                    struct page *locked_page,
@@ -1137,6 +1159,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
         struct btrfs_root *root = inode->root;
         struct btrfs_fs_info *fs_info = root->fs_info;
         u64 alloc_hint = 0;
+       u64 orig_start = start;
         u64 num_bytes;
         unsigned long ram_size;
         u64 cur_alloc_size = 0;
@@ -1324,18 +1347,44 @@ out_reserve:
         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
  out_unlock:
+       /*
+        * Now, we have three regions to clean up:
+        *
+        * |-------(1)----|---(2)---|-------------(3)----------|
+        * `- orig_start  `- start  `- start + cur_alloc_size  `- end
+        *
+        * We process each region below.
+        */
+
         clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
                 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
         page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
         /*
-        * If we reserved an extent for our delalloc range (or a subrange) and
-        * failed to create the respective ordered extent, then it means that
-        * when we reserved the extent we decremented the extent's size from
-        * the data space_info's bytes_may_use counter and incremented the
-        * space_info's bytes_reserved counter by the same amount. We must make
-        * sure extent_clear_unlock_delalloc() does not try to decrement again
-        * the data space_info's bytes_may_use counter, therefore we do not pass
-        * it the flag EXTENT_CLEAR_DATA_RESV.
+        * For the range (1). We have already instantiated the ordered extents
+        * for this region. They are cleaned up by
+        * btrfs_cleanup_ordered_extents() in, e.g.,
+        * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
+        * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
+        * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
+        * function.
+        *
+        * However, in case of unlock == 0, we still need to unlock the pages
+        * (except @locked_page) to ensure all the pages are unlocked.
+        */
+       if (!unlock && orig_start < start)
+               extent_clear_unlock_delalloc(inode, orig_start, start - 1,
+                                            locked_page, 0, page_ops);
+
+       /*
+        * For the range (2). If we reserved an extent for our delalloc range
+        * (or a subrange) and failed to create the respective ordered extent,
+        * then it means that when we reserved the extent we decremented the
+        * extent's size from the data space_info's bytes_may_use counter and
+        * incremented the space_info's bytes_reserved counter by the same
+        * amount. We must make sure extent_clear_unlock_delalloc() does not try
+        * to decrement again the data space_info's bytes_may_use counter,
+        * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
          */
         if (extent_reserved) {
                 extent_clear_unlock_delalloc(inode, start,
@@ -1347,6 +1396,13 @@ out_unlock:
                 if (start >= end)
                         goto out;
         }
+
+       /*
+        * For the range (3). We never touched the region. In addition to the
+        * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
+        * space_info's bytes_may_use counter, reserved in
+        * btrfs_check_data_free_space().
+        */
         extent_clear_unlock_delalloc(inode, start, end, locked_page,
                                      clear_bits | EXTENT_CLEAR_DATA_RESV,
                                      page_ops);
author	Naohiro Aota <naohiro.aota@wdc.com>
	Tue, 21 Jun 2022 06:40:59 +0000 (15:40 +0900)
committer	David Sterba <dsterba@suse.com>
	Mon, 25 Jul 2022 15:45:38 +0000 (17:45 +0200)