btrfs: use bios instead of buffer_heads for super block writeout
author Johannes Thumshirn <johannes.thumshirn@wdc.com>
Thu, 13 Feb 2020 15:24:33 +0000 (00:24 +0900)
committer David Sterba <dsterba@suse.com>
Mon, 23 Mar 2020 16:01:39 +0000 (17:01 +0100)
Similar to the superblock read path, change the write path to use bios
and pages instead of buffer_heads. This allows us to skip the
buffer_head code when writing the superblock to disk.

This is based on a patch originally authored by Nikolay Borisov.

Co-developed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/disk-io.c

index ac9554d71a4630f2c3442c3f83b6c59eb2e76926..756bf2ab64cda73f099b63e8cbb6c5a793fb5309 100644 (file)
@@ -7,7 +7,6 @@
 #include <linux/blkdev.h>
 #include <linux/radix-tree.h>
 #include <linux/writeback.h>
-#include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/slab.h>
@@ -3395,25 +3394,34 @@ fail:
 }
 ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
 
-static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+static void btrfs_end_super_write(struct bio *bio)
 {
-       if (uptodate) {
-               set_buffer_uptodate(bh);
-       } else {
-               struct btrfs_device *device = (struct btrfs_device *)
-                       bh->b_private;
-
-               btrfs_warn_rl_in_rcu(device->fs_info,
-                               "lost page write due to IO error on %s",
-                                         rcu_str_deref(device->name));
-               /* note, we don't set_buffer_write_io_error because we have
-                * our own ways of dealing with the IO errors
-                */
-               clear_buffer_uptodate(bh);
-               btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
+       struct btrfs_device *device = bio->bi_private;
+       struct bio_vec *bvec;
+       struct bvec_iter_all iter_all;
+       struct page *page;
+
+       bio_for_each_segment_all(bvec, bio, iter_all) {
+               page = bvec->bv_page;
+
+               if (bio->bi_status) {
+                       btrfs_warn_rl_in_rcu(device->fs_info,
+                               "lost page write due to IO error on %s (%d)",
+                               rcu_str_deref(device->name),
+                               blk_status_to_errno(bio->bi_status));
+                       ClearPageUptodate(page);
+                       SetPageError(page);
+                       btrfs_dev_stat_inc_and_print(device,
+                                                    BTRFS_DEV_STAT_WRITE_ERRS);
+               } else {
+                       SetPageUptodate(page);
+               }
+
+               put_page(page);
+               unlock_page(page);
        }
-       unlock_buffer(bh);
-       put_bh(bh);
+
+       bio_put(bio);
 }
 
 struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
@@ -3473,25 +3481,23 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
 
 /*
  * Write superblock @sb to the @device. Do not wait for completion, all the
- * buffer heads we write are pinned.
+ * pages we use for writing are locked.
  *
  * Write @max_mirrors copies of the superblock, where 0 means default that fit
  * the expected device size at commit time. Note that max_mirrors must be
  * same for write and wait phases.
  *
- * Return number of errors when buffer head is not found or submission fails.
+ * Return 0 when at least one superblock copy was submitted, -1 otherwise.
  */
 static int write_dev_supers(struct btrfs_device *device,
                            struct btrfs_super_block *sb, int max_mirrors)
 {
        struct btrfs_fs_info *fs_info = device->fs_info;
+       struct address_space *mapping = device->bdev->bd_inode->i_mapping;
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-       struct buffer_head *bh;
        int i;
-       int ret;
        int errors = 0;
        u64 bytenr;
-       int op_flags;
 
        if (max_mirrors == 0)
                max_mirrors = BTRFS_SUPER_MIRROR_MAX;
@@ -3499,6 +3505,10 @@ static int write_dev_supers(struct btrfs_device *device,
        shash->tfm = fs_info->csum_shash;
 
        for (i = 0; i < max_mirrors; i++) {
+               struct page *page;
+               struct bio *bio;
+               struct btrfs_super_block *disk_super;
+
                bytenr = btrfs_sb_offset(i);
                if (bytenr + BTRFS_SUPER_INFO_SIZE >=
                    device->commit_total_bytes)
@@ -3511,37 +3521,45 @@ static int write_dev_supers(struct btrfs_device *device,
                                    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
                crypto_shash_final(shash, sb->csum);
 
-               /* One reference for us, and we leave it for the caller */
-               bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
-                             BTRFS_SUPER_INFO_SIZE);
-               if (!bh) {
+               page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
+                                          GFP_NOFS);
+               if (!page) {
                        btrfs_err(device->fs_info,
-                           "couldn't get super buffer head for bytenr %llu",
+                           "couldn't get super block page for bytenr %llu",
                            bytenr);
                        errors++;
                        continue;
                }
 
-               memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+               /* Bump the refcount for wait_dev_supers() */
+               get_page(page);
 
-               /* one reference for submit_bh */
-               get_bh(bh);
+               disk_super = page_address(page);
+               memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
 
-               set_buffer_uptodate(bh);
-               lock_buffer(bh);
-               bh->b_end_io = btrfs_end_buffer_write_sync;
-               bh->b_private = device;
+               /*
+                * Directly use bios here instead of relying on the page cache
+                * to do I/O, so we don't lose the ability to do integrity
+                * checking.
+                */
+               bio = bio_alloc(GFP_NOFS, 1);
+               bio_set_dev(bio, device->bdev);
+               bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
+               bio->bi_private = device;
+               bio->bi_end_io = btrfs_end_super_write;
+               __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
+                              offset_in_page(bytenr));
 
                /*
-                * we fua the first super.  The others we allow
-                * to go down lazy.
+                * We FUA only the first super block.  The others we allow to
+                * go down lazy and there's a short window where the on-disk
+                * copies might still contain the older version.
                 */
-               op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
+               bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
                if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
-                       op_flags |= REQ_FUA;
-               ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
-               if (ret)
-                       errors++;
+                       bio->bi_opf |= REQ_FUA;
+
+               btrfsic_submit_bio(bio);
        }
        return errors < i ? 0 : -1;
 }
@@ -3550,12 +3568,11 @@ static int write_dev_supers(struct btrfs_device *device,
  * Wait for write completion of superblocks done by write_dev_supers,
  * @max_mirrors same for write and wait phases.
  *
- * Return number of errors when buffer head is not found or not marked up to
+ * Return number of errors when page is not found or not marked up to
  * date.
  */
 static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 {
-       struct buffer_head *bh;
        int i;
        int errors = 0;
        bool primary_failed = false;
@@ -3565,32 +3582,34 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
                max_mirrors = BTRFS_SUPER_MIRROR_MAX;
 
        for (i = 0; i < max_mirrors; i++) {
+               struct page *page;
+
                bytenr = btrfs_sb_offset(i);
                if (bytenr + BTRFS_SUPER_INFO_SIZE >=
                    device->commit_total_bytes)
                        break;
 
-               bh = __find_get_block(device->bdev,
-                                     bytenr / BTRFS_BDEV_BLOCKSIZE,
-                                     BTRFS_SUPER_INFO_SIZE);
-               if (!bh) {
+               page = find_get_page(device->bdev->bd_inode->i_mapping,
+                                    bytenr >> PAGE_SHIFT);
+               if (!page) {
                        errors++;
                        if (i == 0)
                                primary_failed = true;
                        continue;
                }
-               wait_on_buffer(bh);
-               if (!buffer_uptodate(bh)) {
+               /* Page is submitted locked and unlocked once the IO completes */
+               wait_on_page_locked(page);
+               if (PageError(page)) {
                        errors++;
                        if (i == 0)
                                primary_failed = true;
                }
 
-               /* drop our reference */
-               brelse(bh);
+               /* Drop our reference */
+               put_page(page);
 
-               /* drop the reference from the writing run */
-               brelse(bh);
+               /* Drop the reference from the writing run */
+               put_page(page);
        }
 
        /* log error, force error return */