dm: support bio polling

author Ming Lei <ming.lei@redhat.com>

Sat, 5 Mar 2022 02:08:04 +0000 (21:08 -0500)

committer Mike Snitzer <snitzer@redhat.com>

Wed, 9 Mar 2022 17:21:56 +0000 (12:21 -0500)
author Ming Lei <ming.lei@redhat.com>
Sat, 5 Mar 2022 02:08:04 +0000 (21:08 -0500)
committer Mike Snitzer <snitzer@redhat.com>
Wed, 9 Mar 2022 17:21:56 +0000 (12:21 -0500)
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h

index 8078b6c155eff5aed621389991122edd3de09b1d..8cc03c0c262e20da6e3cdb0159e5ce13ce042059 100644 (file)
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -235,6 +235,8 @@ struct dm_io {
         bool start_io_acct:1;
         int was_accounted;
         unsigned long start_time;
+       void *data;
+       struct hlist_node node;
         spinlock_t endio_lock;
         struct dm_stats_aux stats_aux;
         /* last member of dm_target_io is 'struct bio' */
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c

index f4ed756ab391af3127adbd6007b52de3a5b5a358..c0be4f60b427bda33ebd2306db79a6013fbea082 100644 (file)
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1481,6 +1481,14 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
         return &t->targets[(KEYS_PER_NODE * n) + k];
  }
  
+static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
+                                  sector_t start, sector_t len, void *data)
+{
+       struct request_queue *q = bdev_get_queue(dev->bdev);
+
+       return !test_bit(QUEUE_FLAG_POLL, &q->queue_flags);
+}
+
  /*
   * type->iterate_devices() should be called when the sanity check needs to
   * iterate and check all underlying data devices. iterate_devices() will
@@ -1531,6 +1539,11 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
         return 0;
  }
  
+static int dm_table_supports_poll(struct dm_table *t)
+{
+       return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL);
+}
+
  /*
   * Check whether a table has no data devices attached using each
   * target's iterate_devices method.
@@ -2067,6 +2080,20 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
         dm_update_crypto_profile(q, t);
         disk_update_readahead(t->md->disk);
  
+       /*
+        * The check for request-based devices is left to
+        * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
+        *
+        * For bio-based devices, only set QUEUE_FLAG_POLL when all
+        * underlying devices support polling.
+        */
+       if (__table_type_bio_based(t->type)) {
+               if (dm_table_supports_poll(t))
+                       blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+               else
+                       blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
+       }
+
         return 0;
  }
  
diff --git a/drivers/md/dm.c b/drivers/md/dm.c

index 454d39bc7745997bec828f69fe874d0ead141a64..d9111e17f0fcf0925434b69b439b15d265fc8f2b 100644 (file)
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -40,6 +40,13 @@
  #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  #define DM_COOKIE_LENGTH 24
  
+/*
+ * For a REQ_POLLED fs bio, this flag is set once its mapped dm_io
+ * instances are linked into one list, with bio->bi_private reused as the
+ * list head. Before this fs bio is ended, its ->bi_private is restored.
+ */
+#define REQ_DM_POLL_LIST       REQ_DRV
+
  static const char *_name = DM_NAME;
  
  static unsigned int major = 0;
@@ -73,6 +80,7 @@ struct clone_info {
         struct dm_io *io;
         sector_t sector;
         unsigned sector_count;
+       bool submit_as_polled;
  };
  
  #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
@@ -599,6 +607,9 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
                 if (!clone)
                         return NULL;
  
+               /* REQ_DM_POLL_LIST shouldn't be inherited */
+               clone->bi_opf &= ~REQ_DM_POLL_LIST;
+
                 tio = clone_to_tio(clone);
                 tio->inside_dm_io = false;
         }
@@ -888,8 +899,15 @@ void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
                 if (unlikely(wq_has_sleeper(&md->wait)))
                         wake_up(&md->wait);
  
-               if (io_error == BLK_STS_DM_REQUEUE)
+               if (io_error == BLK_STS_DM_REQUEUE) {
+                       /*
+                        * The upper layer won't help us poll a split bio:
+                        * io->orig_bio may only reflect a subset of the pre-split
+                        * original, so clear REQ_POLLED in case of requeue.
+                        */
+                       bio->bi_opf &= ~REQ_POLLED;
                         return;
+               }
  
                 if (bio_is_flush_with_data(bio)) {
                         /*
@@ -1440,6 +1458,47 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
         return true;
  }
  
+/*
+ * Reuse ->bi_private as the hlist head for storing all dm_io instances
+ * associated with this bio; the bio's original ->bi_private must be
+ * stored in dm_io->data before the reuse.
+ *
+ * bio->bi_private is owned by the fs or upper layer, so the block layer
+ * won't touch it after splitting. Meanwhile, it won't be changed by anyone
+ * after the bio is submitted, so this reuse is safe.
+ */
+static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio)
+{
+       return (struct hlist_head *)&bio->bi_private;
+}
+
+static void dm_queue_poll_io(struct bio *bio, struct dm_io *io)
+{
+       struct hlist_head *head = dm_get_bio_hlist_head(bio);
+
+       if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
+               bio->bi_opf |= REQ_DM_POLL_LIST;
+               /*
+                * Save .bi_private into dm_io, so that we can reuse
+                * .bi_private as hlist head for storing dm_io list
+                */
+               io->data = bio->bi_private;
+
+               INIT_HLIST_HEAD(head);
+
+               /* tell block layer to poll for completion */
+               bio->bi_cookie = ~BLK_QC_T_NONE;
+       } else {
+               /*
+                * bio recursed due to split: reuse the existing poll list and
+                * copy the saved ->bi_private from the list's first entry.
+                */
+               io->data = hlist_entry(head->first, struct dm_io, node)->data;
+       }
+
+       hlist_add_head(&io->node, head);
+}
+
  /*
   * Select the correct strategy for processing a non-flush bio.
   */
@@ -1457,6 +1516,12 @@ static int __split_and_process_bio(struct clone_info *ci)
         if (__process_abnormal_io(ci, ti, &r))
                 return r;
  
+       /*
+        * Only support bio polling for normal IO, where the target io is
+        * embedded in the dm_io instance (verified in dm_poll_dm_io).
+        */
+       ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED;
+
         len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
         clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
         __map_bio(clone);
@@ -1473,6 +1538,7 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
         ci->map = map;
         ci->io = alloc_io(md, bio);
         ci->bio = bio;
+       ci->submit_as_polled = false;
         ci->sector = bio->bi_iter.bi_sector;
         ci->sector_count = bio_sectors(bio);
  
@@ -1522,8 +1588,17 @@ out:
         if (ci.io->start_io_acct)
                 dm_start_io_acct(ci.io, NULL);
  
-       /* drop the extra reference count */
-       dm_io_dec_pending(ci.io, errno_to_blk_status(error));
+       /*
+        * Drop the extra reference on error or for a non-POLLED bio; a POLLED
+        * bio keeps one reference, which will be released in dm_poll_bio.
+        *
+        * Add every dm_io instance into the hlist_head which is stored in
+        * bio->bi_private, so that dm_poll_bio can poll them all.
+        */
+       if (error || !ci.submit_as_polled)
+               dm_io_dec_pending(ci.io, errno_to_blk_status(error));
+       else
+               dm_queue_poll_io(bio, ci.io);
  }
  
  static void dm_submit_bio(struct bio *bio)
@@ -1558,6 +1633,67 @@ out:
         dm_put_live_table(md, srcu_idx);
  }
  
+static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
+                         unsigned int flags)
+{
+       WARN_ON_ONCE(!io->tio.inside_dm_io);
+
+       /* don't poll if the mapped io is done */
+       if (atomic_read(&io->io_count) > 1)
+               bio_poll(&io->tio.clone, iob, flags);
+
+       /* only the reference held for dm_poll_bio remains */
+       return atomic_read(&io->io_count) == 1;
+}
+
+static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
+                      unsigned int flags)
+{
+       struct hlist_head *head = dm_get_bio_hlist_head(bio);
+       struct hlist_head tmp = HLIST_HEAD_INIT;
+       struct hlist_node *next;
+       struct dm_io *io;
+
+       /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */
+       if (!(bio->bi_opf & REQ_DM_POLL_LIST))
+               return 0;
+
+       WARN_ON_ONCE(hlist_empty(head));
+
+       hlist_move_list(head, &tmp);
+
+       /*
+        * Restore .bi_private before possibly completing dm_io.
+        *
+        * bio_poll() is only possible once @bio has been completely
+        * submitted via submit_bio_noacct()'s depth-first submission.
+        * So there is no dm_queue_poll_io() race associated with
+        * clearing REQ_DM_POLL_LIST here.
+        */
+       bio->bi_opf &= ~REQ_DM_POLL_LIST;
+       bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data;
+
+       hlist_for_each_entry_safe(io, next, &tmp, node) {
+               if (dm_poll_dm_io(io, iob, flags)) {
+                       hlist_del_init(&io->node);
+                       /*
+                        * clone_endio() has already occurred, so passing
+                        * error as 0 here doesn't override io->status
+                        */
+                       dm_io_dec_pending(io, 0);
+               }
+       }
+
+       /* Not done? */
+       if (!hlist_empty(&tmp)) {
+               bio->bi_opf |= REQ_DM_POLL_LIST;
+               /* Reset bio->bi_private to dm_io list head */
+               hlist_move_list(&tmp, head);
+               return 0;
+       }
+       return 1;
+}
+
  /*-----------------------------------------------------------------
   * An IDR is used to keep track of allocated minor numbers.
   *---------------------------------------------------------------*/
@@ -2983,6 +3119,7 @@ static const struct pr_ops dm_pr_ops = {
  
  static const struct block_device_operations dm_blk_dops = {
         .submit_bio = dm_submit_bio,
+       .poll_bio = dm_poll_bio,
         .open = dm_blk_open,
         .release = dm_blk_close,
         .ioctl = dm_blk_ioctl,
author	Ming Lei <ming.lei@redhat.com>
	Sat, 5 Mar 2022 02:08:04 +0000 (21:08 -0500)
committer	Mike Snitzer <snitzer@redhat.com>
	Wed, 9 Mar 2022 17:21:56 +0000 (12:21 -0500)
drivers/md/dm-core.h		patch \| blob \| history
drivers/md/dm-table.c		patch \| blob \| history
drivers/md/dm.c		patch \| blob \| history