blk-mq: move srcu from blk_mq_hw_ctx to request_queue
author Ming Lei <ming.lei@redhat.com>
Fri, 3 Dec 2021 13:15:32 +0000 (21:15 +0800)
committer Jens Axboe <axboe@kernel.dk>
Fri, 3 Dec 2021 21:51:29 +0000 (14:51 -0700)
In case of BLK_MQ_F_BLOCKING, a per-hctx srcu is used to protect the
dispatch critical area. However, this srcu instance sits at the end of the
hctx, where it often occupies a standalone cacheline that is usually cold.

Inside srcu_read_lock() and srcu_read_unlock(), the WRITE is always done on
the indirect percpu variable, which is allocated from the heap instead of
being embedded; srcu->srcu_idx is only read (never written) in
srcu_read_lock(). So it doesn't matter whether the srcu structure stays in
the hctx or in the request queue.

So switch to a per-request-queue srcu for protecting dispatch. This
simplifies quiesce a lot, not to mention that quiesce is always done
request-queue wide anyway.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20211203131534.3668411-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
block/blk-core.c
block/blk-mq-sysfs.c
block/blk-mq.c
block/blk-mq.h
block/blk-sysfs.c
block/blk.h
block/genhd.c
include/linux/blk-mq.h
include/linux/blkdev.h

index b0660c9df8526bfeebb9b898d9af5e69ab4253c2..10619fd83c1bc38db8521b61dbbd6e0010c17596 100644 (file)
@@ -66,6 +66,7 @@ DEFINE_IDA(blk_queue_ida);
  * For queue allocation
  */
 struct kmem_cache *blk_requestq_cachep;
+struct kmem_cache *blk_requestq_srcu_cachep;
 
 /*
  * Controlling structure to kblockd
@@ -437,21 +438,27 @@ static void blk_timeout_work(struct work_struct *work)
 {
 }
 
-struct request_queue *blk_alloc_queue(int node_id)
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 {
        struct request_queue *q;
        int ret;
 
-       q = kmem_cache_alloc_node(blk_requestq_cachep,
-                               GFP_KERNEL | __GFP_ZERO, node_id);
+       q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
+                       GFP_KERNEL | __GFP_ZERO, node_id);
        if (!q)
                return NULL;
 
+       if (alloc_srcu) {
+               blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
+               if (init_srcu_struct(q->srcu) != 0)
+                       goto fail_q;
+       }
+
        q->last_merge = NULL;
 
        q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
        if (q->id < 0)
-               goto fail_q;
+               goto fail_srcu;
 
        ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
        if (ret)
@@ -508,8 +515,11 @@ fail_split:
        bioset_exit(&q->bio_split);
 fail_id:
        ida_simple_remove(&blk_queue_ida, q->id);
+fail_srcu:
+       if (alloc_srcu)
+               cleanup_srcu_struct(q->srcu);
 fail_q:
-       kmem_cache_free(blk_requestq_cachep, q);
+       kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
        return NULL;
 }
 
@@ -1301,6 +1311,9 @@ int __init blk_dev_init(void)
                        sizeof_field(struct request, cmd_flags));
        BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
                        sizeof_field(struct bio, bi_opf));
+       BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
+                          __alignof__(struct request_queue)) !=
+                    sizeof(struct request_queue));
 
        /* used for unplugging and affects IO latency/throughput - HIGHPRI */
        kblockd_workqueue = alloc_workqueue("kblockd",
@@ -1311,6 +1324,10 @@ int __init blk_dev_init(void)
        blk_requestq_cachep = kmem_cache_create("request_queue",
                        sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
 
+       blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
+                       sizeof(struct request_queue) +
+                       sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
+
        blk_debugfs_root = debugfs_create_dir("block", NULL);
 
        return 0;
index 253c857cba47c31c1ad746b2713e7b7b2abb544f..6747865740750d4e9d47678490e24d27783f9f48 100644 (file)
@@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
        struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
                                                  kobj);
 
-       if (hctx->flags & BLK_MQ_F_BLOCKING)
-               cleanup_srcu_struct(hctx->srcu);
        blk_free_flush_queue(hctx->fq);
        sbitmap_free(&hctx->ctx_map);
        free_cpumask_var(hctx->cpumask);
index 494da31dc1a55b23871a706e334916d85f51b890..6a2c2704454e03fca96a3914e1467da01fd2121e 100644 (file)
@@ -260,17 +260,9 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
  */
 void blk_mq_wait_quiesce_done(struct request_queue *q)
 {
-       struct blk_mq_hw_ctx *hctx;
-       unsigned int i;
-       bool rcu = false;
-
-       queue_for_each_hw_ctx(q, hctx, i) {
-               if (hctx->flags & BLK_MQ_F_BLOCKING)
-                       synchronize_srcu(hctx->srcu);
-               else
-                       rcu = true;
-       }
-       if (rcu)
+       if (blk_queue_has_srcu(q))
+               synchronize_srcu(q->srcu);
+       else
                synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
@@ -3400,20 +3392,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
        }
 }
 
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
-{
-       int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
-
-       BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
-                          __alignof__(struct blk_mq_hw_ctx)) !=
-                    sizeof(struct blk_mq_hw_ctx));
-
-       if (tag_set->flags & BLK_MQ_F_BLOCKING)
-               hw_ctx_size += sizeof(struct srcu_struct);
-
-       return hw_ctx_size;
-}
-
 static int blk_mq_init_hctx(struct request_queue *q,
                struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -3451,7 +3429,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
        struct blk_mq_hw_ctx *hctx;
        gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
 
-       hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
+       hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
        if (!hctx)
                goto fail_alloc_hctx;
 
@@ -3493,8 +3471,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
        if (!hctx->fq)
                goto free_bitmap;
 
-       if (hctx->flags & BLK_MQ_F_BLOCKING)
-               init_srcu_struct(hctx->srcu);
        blk_mq_hctx_kobj_init(hctx);
 
        return hctx;
@@ -3830,7 +3806,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
        struct request_queue *q;
        int ret;
 
-       q = blk_alloc_queue(set->numa_node);
+       q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
        if (!q)
                return ERR_PTR(-ENOMEM);
        q->queuedata = queuedata;
@@ -3979,6 +3955,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                struct request_queue *q)
 {
+       WARN_ON_ONCE(blk_queue_has_srcu(q) !=
+                       !!(set->flags & BLK_MQ_F_BLOCKING));
+
        /* mark the queue as mq asap */
        q->mq_ops = set->ops;
 
index e4c39620492888cd3603d8c0a10e5ae089353bf1..792f0b29c6eb9347dae7108ec6e101429b4ac206 100644 (file)
@@ -385,9 +385,9 @@ do {                                                                \
                int srcu_idx;                                   \
                                                                \
                might_sleep();                                  \
-               srcu_idx = srcu_read_lock((hctx)->srcu);        \
+               srcu_idx = srcu_read_lock((hctx)->queue->srcu); \
                (dispatch_ops);                                 \
-               srcu_read_unlock((hctx)->srcu, srcu_idx);       \
+               srcu_read_unlock((hctx)->queue->srcu, srcu_idx); \
        }                                                       \
 } while (0)
 
index 4622da4bb9927537e366b0b8f2a8d907fa6de0e9..3e6357321225fbb36afe930edfd844a8022beb70 100644 (file)
@@ -735,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
 {
        struct request_queue *q = container_of(rcu_head, struct request_queue,
                                               rcu_head);
-       kmem_cache_free(blk_requestq_cachep, q);
+
+       kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
 }
 
 /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
index 24d8b333bb03743d917ed45243c65ad923f0f532..7ccb7c7d86b38afd2889acf99af6c261ce520963 100644 (file)
@@ -27,6 +27,7 @@ struct blk_flush_queue {
 };
 
 extern struct kmem_cache *blk_requestq_cachep;
+extern struct kmem_cache *blk_requestq_srcu_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 
@@ -424,7 +425,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
                struct page *page, unsigned int len, unsigned int offset,
                unsigned int max_sectors, bool *same_page);
 
-struct request_queue *blk_alloc_queue(int node_id);
+static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
+{
+       if (srcu)
+               return blk_requestq_srcu_cachep;
+       return blk_requestq_cachep;
+}
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
+
 int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
 
 int disk_alloc_events(struct gendisk *disk);
index 5179a4f00fba5bcc77b9a37cc09b785ead91cfb7..3c139a1b6f04915b9d8fc599a0db1c3e9e05b10d 100644 (file)
@@ -1338,7 +1338,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
        struct request_queue *q;
        struct gendisk *disk;
 
-       q = blk_alloc_queue(node);
+       q = blk_alloc_queue(node, false);
        if (!q)
                return NULL;
 
index 561beb5be7ec07708c0b02dd5b6719131bd52556..ecdc049b52fa134616079eb292d2969373620200 100644 (file)
@@ -4,7 +4,6 @@
 
 #include <linux/blkdev.h>
 #include <linux/sbitmap.h>
-#include <linux/srcu.h>
 #include <linux/lockdep.h>
 #include <linux/scatterlist.h>
 #include <linux/prefetch.h>
@@ -375,13 +374,6 @@ struct blk_mq_hw_ctx {
         * q->unused_hctx_list.
         */
        struct list_head        hctx_list;
-
-       /**
-        * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
-        * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
-        * blk_mq_hw_ctx_size().
-        */
-       struct srcu_struct      srcu[];
 };
 
 /**
index 0a4416ef4fbf8552deafffa795394942e2c82b6e..c80cfaefc0a8f6238e67f5213ad1a85700c75ffe 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/blkzoned.h>
 #include <linux/sbitmap.h>
+#include <linux/srcu.h>
 
 struct module;
 struct request_queue;
@@ -373,11 +374,18 @@ struct request_queue {
         * devices that do not have multiple independent access ranges.
         */
        struct blk_independent_access_ranges *ia_ranges;
+
+       /**
+        * @srcu: Sleepable RCU. Use as lock when type of the request queue
+        * is blocking (BLK_MQ_F_BLOCKING). Must be the last member
+        */
+       struct srcu_struct      srcu[];
 };
 
 /* Keep blk_queue_flag_name[] in sync with the definitions below */
 #define QUEUE_FLAG_STOPPED     0       /* queue is stopped */
 #define QUEUE_FLAG_DYING       1       /* queue being torn down */
+#define QUEUE_FLAG_HAS_SRCU    2       /* SRCU is allocated */
 #define QUEUE_FLAG_NOMERGES     3      /* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP   4       /* complete on same CPU-group */
 #define QUEUE_FLAG_FAIL_IO     5       /* fake timeout */
@@ -415,6 +423,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 
 #define blk_queue_stopped(q)   test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dying(q)     test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
+#define blk_queue_has_srcu(q)  test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags)
 #define blk_queue_dead(q)      test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
 #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
 #define blk_queue_nomerges(q)  test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)