btrfs: use chained bios when cloning
authorChristoph Hellwig <hch@lst.de>
Sat, 6 Aug 2022 08:03:24 +0000 (10:03 +0200)
committerDavid Sterba <dsterba@suse.com>
Mon, 26 Sep 2022 10:27:59 +0000 (12:27 +0200)
The stripes_pending in the btrfs_io_context counts number of inflight
low-level bios for an upper btrfs_bio.  For reads this is generally
one as reads are never cloned, while for writes we can trivially use
the bio remaining mechanisms that is used for chained bios.

To be able to make use of that mechanism, split out a separate trivial
end_io handler for the cloned bios that does a minimal amount of error
tracking and which then calls bio_endio on the original bio to transfer
control to that, with the remaining counter making sure it is completed
last.  This then allows to merge btrfs_end_bioc into the original bio
bi_end_io handler.

To make this all work all error handling needs to happen through the
bi_end_io handler, which requires a small amount of reshuffling in
submit_stripe_bio so that the bio is cloned already by the time the
suitability of the device is checked.

This reduces the size of the btrfs_io_context and prepares splitting
the btrfs_bio at the stripe boundary.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/volumes.c
fs/btrfs/volumes.h

index 1d01347..44c7529 100644 (file)
@@ -5898,7 +5898,6 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
                sizeof(u64) * (total_stripes),
                GFP_NOFS|__GFP_NOFAIL);
 
-       atomic_set(&bioc->error, 0);
        refcount_set(&bioc->refs, 1);
 
        bioc->fs_info = fs_info;
@@ -6653,7 +6652,21 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
        bio_trim(bio, offset >> 9, size >> 9);
        bbio->iter = bio->bi_iter;
        return bio;
+}
+
+static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
+{
+       if (!dev || !dev->bdev)
+               return;
+       if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
+               return;
 
+       if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+       if (!(bio->bi_opf & REQ_RAHEAD))
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+       if (bio->bi_opf & REQ_PREFLUSH)
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
 }
 
 static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc)
@@ -6671,62 +6684,54 @@ static void btrfs_end_bio_work(struct work_struct *work)
        bio_endio(&bbio->bio);
 }
 
-static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
+static void btrfs_end_bio(struct bio *bio)
 {
-       struct bio *orig_bio = bioc->orig_bio;
-       struct btrfs_bio *bbio = btrfs_bio(orig_bio);
+       struct btrfs_io_stripe *stripe = bio->bi_private;
+       struct btrfs_io_context *bioc = stripe->bioc;
+       struct btrfs_bio *bbio = btrfs_bio(bio);
 
        btrfs_bio_counter_dec(bioc->fs_info);
 
+       if (bio->bi_status) {
+               atomic_inc(&bioc->error);
+               btrfs_log_dev_io_error(bio, stripe->dev);
+       }
+
        bbio->mirror_num = bioc->mirror_num;
-       orig_bio->bi_private = bioc->private;
-       orig_bio->bi_end_io = bioc->end_io;
+       bio->bi_end_io = bioc->end_io;
+       bio->bi_private = bioc->private;
 
        /*
         * Only send an error to the higher layers if it is beyond the tolerance
         * threshold.
         */
        if (atomic_read(&bioc->error) > bioc->max_errors)
-               orig_bio->bi_status = BLK_STS_IOERR;
+               bio->bi_status = BLK_STS_IOERR;
        else
-               orig_bio->bi_status = BLK_STS_OK;
+               bio->bi_status = BLK_STS_OK;
 
-       if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) {
+       if (btrfs_op(bio) == BTRFS_MAP_READ) {
                INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
                queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work);
        } else {
-               bio_endio(orig_bio);
+               bio_endio(bio);
        }
 
        btrfs_put_bioc(bioc);
 }
 
-static void btrfs_end_bio(struct bio *bio)
+static void btrfs_clone_write_end_io(struct bio *bio)
 {
        struct btrfs_io_stripe *stripe = bio->bi_private;
-       struct btrfs_io_context *bioc = stripe->bioc;
 
        if (bio->bi_status) {
-               atomic_inc(&bioc->error);
-               if (bio->bi_status == BLK_STS_IOERR ||
-                   bio->bi_status == BLK_STS_TARGET) {
-                       if (btrfs_op(bio) == BTRFS_MAP_WRITE)
-                               btrfs_dev_stat_inc_and_print(stripe->dev,
-                                               BTRFS_DEV_STAT_WRITE_ERRS);
-                       else if (!(bio->bi_opf & REQ_RAHEAD))
-                               btrfs_dev_stat_inc_and_print(stripe->dev,
-                                               BTRFS_DEV_STAT_READ_ERRS);
-                       if (bio->bi_opf & REQ_PREFLUSH)
-                               btrfs_dev_stat_inc_and_print(stripe->dev,
-                                               BTRFS_DEV_STAT_FLUSH_ERRS);
-               }
+               atomic_inc(&stripe->bioc->error);
+               btrfs_log_dev_io_error(bio, stripe->dev);
        }
 
-       if (bio != bioc->orig_bio)
-               bio_put(bio);
-
-       if (atomic_dec_and_test(&bioc->stripes_pending))
-               btrfs_end_bioc(bioc, true);
+       /* Pass on control to the original bio this one was cloned from */
+       bio_endio(stripe->bioc->orig_bio);
+       bio_put(bio);
 }
 
 static void submit_stripe_bio(struct btrfs_io_context *bioc,
@@ -6737,28 +6742,30 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc,
        u64 physical = bioc->stripes[dev_nr].physical;
        struct bio *bio;
 
-       if (!dev || !dev->bdev ||
-           test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
-           (btrfs_op(orig_bio) == BTRFS_MAP_WRITE &&
-            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
-               atomic_inc(&bioc->error);
-               if (atomic_dec_and_test(&bioc->stripes_pending))
-                       btrfs_end_bioc(bioc, false);
-               return;
-       }
-
        if (clone) {
-               bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS, &fs_bio_set);
+               bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
+               bio_inc_remaining(orig_bio);
+               bio->bi_end_io = btrfs_clone_write_end_io;
        } else {
                bio = orig_bio;
-               bio_set_dev(bio, dev->bdev);
                btrfs_bio(bio)->device = dev;
+               bio->bi_end_io = btrfs_end_bio;
        }
 
        bioc->stripes[dev_nr].bioc = bioc;
        bio->bi_private = &bioc->stripes[dev_nr];
-       bio->bi_end_io = btrfs_end_bio;
        bio->bi_iter.bi_sector = physical >> 9;
+
+       if (!dev || !dev->bdev ||
+           test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
+           (btrfs_op(bio) == BTRFS_MAP_WRITE &&
+            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
+               bio_io_error(bio);
+               return;
+       }
+
+       bio_set_dev(bio, dev->bdev);
+
        /*
         * For zone append writing, bi_sector must point the beginning of the
         * zone
@@ -6807,7 +6814,6 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror
        bioc->orig_bio = bio;
        bioc->private = bio->bi_private;
        bioc->end_io = bio->bi_end_io;
-       atomic_set(&bioc->stripes_pending, total_devs);
 
        if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
            ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
index 149fb5d..56fc013 100644 (file)
@@ -457,7 +457,6 @@ struct btrfs_discard_stripe {
  */
 struct btrfs_io_context {
        refcount_t refs;
-       atomic_t stripes_pending;
        struct btrfs_fs_info *fs_info;
        u64 map_type; /* get from map_lookup->type */
        bio_end_io_t *end_io;