btrfs: use per-buffer locking for extent_buffer reading
author     Christoph Hellwig <hch@lst.de>
           Wed, 3 May 2023 15:24:40 +0000 (17:24 +0200)
committer  David Sterba <dsterba@suse.com>
           Mon, 19 Jun 2023 11:59:28 +0000 (13:59 +0200)
Instead of locking and unlocking every page (or, for subpage metadata,
the extent range), just add a new EXTENT_BUFFER_READING bit that mirrors
EXTENT_BUFFER_WRITEBACK for synchronizing threads trying to read an
extent_buffer and to wait for I/O completion.
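
As an illustration (not part of the patch description), the change boils
down to the following pattern on eb->bflags; this is a condensed sketch of
the reader and end-I/O sides of the patch, with bio construction and error
handling omitted:

        /* Reader side: claim the read, or wait for the one in flight. */
        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                return 0;
        if (!test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) {
                /*
                 * This thread owns the read: build and submit the read bio
                 * (see the extent_io.c hunk below).
                 */
        }
        if (wait == WAIT_COMPLETE) {
                wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
                               TASK_UNINTERRUPTIBLE);
                if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                        return -EIO;
        }
        return 0;

        /* End-I/O side: publish completion, then wake any waiters. */
        clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
        smp_mb__after_atomic();
        wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);

The clear_bit()/smp_mb__after_atomic()/wake_up_bit() sequence follows the
usual wait-bit idiom, so a waiter woken by wake_up_bit() cannot miss the
cleared bit or the uptodate state set by the end-I/O handler.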

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 40f0639..650cb68 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4049,6 +4049,7 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
 static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
 {
        struct extent_buffer *eb = bbio->private;
+       struct btrfs_fs_info *fs_info = eb->fs_info;
        bool uptodate = !bbio->bio.bi_status;
        struct bvec_iter_all iter_all;
        struct bio_vec *bvec;
@@ -4068,26 +4069,47 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
        }
 
        bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
-               end_page_read(bvec->bv_page, uptodate, eb->start + bio_offset,
-                             bvec->bv_len);
-               bio_offset += bvec->bv_len;
-       }
+               u64 start = eb->start + bio_offset;
+               struct page *page = bvec->bv_page;
+               u32 len = bvec->bv_len;
 
-       if (eb->fs_info->nodesize < PAGE_SIZE) {
-               unlock_extent(&bbio->inode->io_tree, eb->start,
-                             eb->start + bio_offset - 1, NULL);
+               if (uptodate)
+                       btrfs_page_set_uptodate(fs_info, page, start, len);
+               else
+                       btrfs_page_clear_uptodate(fs_info, page, start, len);
+
+               bio_offset += len;
        }
+
+       clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+       smp_mb__after_atomic();
+       wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
        free_extent_buffer(eb);
 
        bio_put(&bbio->bio);
 }
 
-static void __read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
-                                      struct btrfs_tree_parent_check *check)
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
+                            struct btrfs_tree_parent_check *check)
 {
        int num_pages = num_extent_pages(eb), i;
        struct btrfs_bio *bbio;
 
+       if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+               return 0;
+
+       /*
+        * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
+        * operation, which could potentially still be in flight.  In this case
+        * we simply want to return an error.
+        */
+       if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
+               return -EIO;
+
+       /* Someone else is already reading the buffer, just wait for it. */
+       if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags))
+               goto done;
+
        clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
        eb->read_mirror = 0;
        check_buffer_tree_ref(eb);
@@ -4108,117 +4130,15 @@ static void __read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
                        __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0);
        }
        btrfs_submit_bio(bbio, mirror_num);
-}
-
-static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
-                                     int mirror_num,
-                                     struct btrfs_tree_parent_check *check)
-{
-       struct btrfs_fs_info *fs_info = eb->fs_info;
-       struct extent_io_tree *io_tree;
-       struct page *page = eb->pages[0];
-       struct extent_state *cached_state = NULL;
-       int ret;
-
-       ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
-       ASSERT(PagePrivate(page));
-       ASSERT(check);
-       io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
-
-       if (wait == WAIT_NONE) {
-               if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1,
-                                    &cached_state))
-                       return -EAGAIN;
-       } else {
-               ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1,
-                                 &cached_state);
-               if (ret < 0)
-                       return ret;
-       }
-
-       if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) {
-               unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
-                             &cached_state);
-               return 0;
-       }
-
-       btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
 
-       __read_extent_buffer_pages(eb, mirror_num, check);
-       if (wait != WAIT_COMPLETE) {
-               free_extent_state(cached_state);
-               return 0;
-       }
-
-       wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1,
-                       EXTENT_LOCKED, &cached_state);
-       if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
-               return -EIO;
-       return 0;
-}
-
-int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
-                            struct btrfs_tree_parent_check *check)
-{
-       int i;
-       struct page *page;
-       int locked_pages = 0;
-       int num_pages;
-
-       if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
-               return 0;
-
-       /*
-        * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
-        * operation, which could potentially still be in flight.  In this case
-        * we simply want to return an error.
-        */
-       if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
-               return -EIO;
-
-       if (eb->fs_info->nodesize < PAGE_SIZE)
-               return read_extent_buffer_subpage(eb, wait, mirror_num, check);
-
-       num_pages = num_extent_pages(eb);
-       for (i = 0; i < num_pages; i++) {
-               page = eb->pages[i];
-               if (wait == WAIT_NONE) {
-                       /*
-                        * WAIT_NONE is only utilized by readahead. If we can't
-                        * acquire the lock atomically it means either the eb
-                        * is being read out or under modification.
-                        * Either way the eb will be or has been cached,
-                        * readahead can exit safely.
-                        */
-                       if (!trylock_page(page))
-                               goto unlock_exit;
-               } else {
-                       lock_page(page);
-               }
-               locked_pages++;
-       }
-
-       __read_extent_buffer_pages(eb, mirror_num, check);
-
-       if (wait != WAIT_COMPLETE)
-               return 0;
-
-       for (i = 0; i < num_pages; i++) {
-               page = eb->pages[i];
-               wait_on_page_locked(page);
-               if (!PageUptodate(page))
+done:
+       if (wait == WAIT_COMPLETE) {
+               wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
+               if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                        return -EIO;
        }
 
        return 0;
-
-unlock_exit:
-       while (locked_pages > 0) {
-               locked_pages--;
-               page = eb->pages[locked_pages];
-               unlock_page(page);
-       }
-       return 0;
 }
 
 static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 217c433..2d91ca9 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -29,6 +29,8 @@ enum {
        /* write IO error */
        EXTENT_BUFFER_WRITE_ERR,
        EXTENT_BUFFER_NO_CHECK,
+       /* Indicate that extent buffer pages are being read */
+       EXTENT_BUFFER_READING,
 };
 
 /* these are flags for __process_pages_contig */
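
For context, a hypothetical caller sketch (illustrative only, not taken from
this patch): with the new bit, concurrent readers of the same extent_buffer
either start the I/O themselves or simply wait on EXTENT_BUFFER_READING, and
no page or extent-range locks are taken:

        /*
         * Hypothetical helper (the name is made up): kick off the read
         * without blocking, then read synchronously; a reader that lost
         * the test_and_set_bit() race just waits on the bit.
         */
        static int ensure_eb_uptodate(struct extent_buffer *eb,
                                      struct btrfs_tree_parent_check *check)
        {
                /* Fire-and-forget readahead, returns 0 immediately. */
                read_extent_buffer_pages(eb, WAIT_NONE, 0, check);

                /*
                 * Synchronous read: waits on EXTENT_BUFFER_READING and
                 * returns -EIO if the buffer did not become uptodate.
                 */
                return read_extent_buffer_pages(eb, WAIT_COMPLETE, 0, check);
        }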