From 5a409b4f56d50b212334f338cb8465d65550cd85 Mon Sep 17 00:00:00 2001
From: Xiao Ni <xni@redhat.com>
Date: Mon, 21 May 2018 11:49:54 +0800
Subject: [PATCH] MD: fix lock contention for flush bios

There is a lock contention when there are many processes which send flush bios
to md device. eg. Create many lvs on one raid device and mkfs.xfs on each lv.

Now it just can handle flush request sequentially. It needs to wait mddev->flush_bio
to be NULL, otherwise get mddev->lock.

This patch remove mddev->flush_bio and handle flush bio asynchronously.
I did a test with command dbench -s 128 -t 300. This is the test result:

=================Without the patch============================
 Operation                Count    AvgLat    MaxLat
 --------------------------------------------------
 Flush                    11165   167.595  5879.560
 Close                   107469     1.391  2231.094
 LockX                      384     0.003     0.019
 Rename                    5944     2.141  1856.001
 ReadX                   208121     0.003     0.074
 WriteX                   98259  1925.402 15204.895
 Unlink                   25198    13.264  3457.268
 UnlockX                    384     0.001     0.009
 FIND_FIRST               47111     0.012     0.076
 SET_FILE_INFORMATION     12966     0.007     0.065
 QUERY_FILE_INFORMATION   27921     0.004     0.085
 QUERY_PATH_INFORMATION  124650     0.005     5.766
 QUERY_FS_INFORMATION     22519     0.003     0.053
 NTCreateX               141086     4.291  2502.812

Throughput 3.7181 MB/sec (sync open)  128 clients  128 procs  max_latency=15204.905 ms

=================With the patch============================
 Operation                Count    AvgLat    MaxLat
 --------------------------------------------------
 Flush                     4500   174.134   406.398
 Close                    48195     0.060   467.062
 LockX                      256     0.003     0.029
 Rename                    2324     0.026     0.360
 ReadX                    78846     0.004     0.504
 WriteX                   66832   562.775  1467.037
 Unlink                    5516     3.665  1141.740
 UnlockX                    256     0.002     0.019
 FIND_FIRST               16428     0.015     0.313
 SET_FILE_INFORMATION      6400     0.009     0.520
 QUERY_FILE_INFORMATION   17865     0.003     0.089
 QUERY_PATH_INFORMATION   47060     0.078   416.299
 QUERY_FS_INFORMATION      7024     0.004     0.032
 NTCreateX                55921     0.854  1141.452

Throughput 11.744 MB/sec (sync open)  128 clients  128 procs  max_latency=1467.041 ms

The test is done on raid1 disk with two rotational disks

V5: V4 is more complicated than the version with memory pool. So revert to the memory pool
version

V4: use address of fbio to do hash to choose free flush info.
V3:
Shaohua suggests mempool is overkill. In v3 it allocs memory during creating raid device
and uses a simple bitmap to record which resource is free.

Fix a bug from v2. It should set flush_pending to 1 at first.

V2:
Neil pointed out two problems. One is counting error problem and another is return value
when allocat memory fails.
1. counting error problem
This isn't safe.  It is only safe to call rdev_dec_pending() on rdevs
that you previously called
                          atomic_inc(&rdev->nr_pending);
If an rdev was added to the list between the start and end of the flush,
this will do something bad.

Now it doesn't use bio_chain. It uses specified call back function for each
flush bio.
2. Returned on IO error when kmalloc fails is wrong.
I use mempool suggested by Neil in V2
3. Fixed some places pointed by Guoqing

Suggested-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 158 +++++++++++++++++++++++++++++++++++++-------------------
 drivers/md/md.h |  22 +++++---
 2 files changed, 120 insertions(+), 60 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index df9eb1a..6b4e2f2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -130,6 +130,24 @@ static inline int speed_max(struct mddev *mddev)
 		mddev->sync_speed_max : sysctl_speed_limit_max;
 }
 
+static void * flush_info_alloc(gfp_t gfp_flags, void *data)
+{
+        return kzalloc(sizeof(struct flush_info), gfp_flags);
+}
+static void flush_info_free(void *flush_info, void *data)
+{
+        kfree(flush_info);
+}
+
+static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
+{
+	return kzalloc(sizeof(struct flush_bio), gfp_flags);
+}
+static void flush_bio_free(void *flush_bio, void *data)
+{
+	kfree(flush_bio);
+}
+
 static struct ctl_table_header *raid_table_header;
 
 static struct ctl_table raid_table[] = {
@@ -412,30 +430,53 @@ static int md_congested(void *data, int bits)
 /*
  * Generic flush handling for md
  */
+static void submit_flushes(struct work_struct *ws)
+{
+	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
+	struct mddev *mddev = fi->mddev;
+	struct bio *bio = fi->bio;
+
+	bio->bi_opf &= ~REQ_PREFLUSH;
+	md_handle_request(mddev, bio);
+
+	mempool_free(fi, mddev->flush_pool);
+}
 
-static void md_end_flush(struct bio *bio)
+static void md_end_flush(struct bio *fbio)
 {
-	struct md_rdev *rdev = bio->bi_private;
-	struct mddev *mddev = rdev->mddev;
+	struct flush_bio *fb = fbio->bi_private;
+	struct md_rdev *rdev = fb->rdev;
+	struct flush_info *fi = fb->fi;
+	struct bio *bio = fi->bio;
+	struct mddev *mddev = fi->mddev;
 
 	rdev_dec_pending(rdev, mddev);
 
-	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		/* The pre-request flush has finished */
-		queue_work(md_wq, &mddev->flush_work);
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		if (bio->bi_iter.bi_size == 0)
+			/* an empty barrier - all done */
+			bio_endio(bio);
+		else {
+			INIT_WORK(&fi->flush_work, submit_flushes);
+			queue_work(md_wq, &fi->flush_work);
+		}
 	}
-	bio_put(bio);
-}
 
-static void md_submit_flush_data(struct work_struct *ws);
+	mempool_free(fb, mddev->flush_bio_pool);
+	bio_put(fbio);
+}
 
-static void submit_flushes(struct work_struct *ws)
+void md_flush_request(struct mddev *mddev, struct bio *bio)
 {
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 	struct md_rdev *rdev;
+	struct flush_info *fi;
+
+	fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
+
+	fi->bio = bio;
+	fi->mddev = mddev;
+	atomic_set(&fi->flush_pending, 1);
 
-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
@@ -445,59 +486,39 @@ static void submit_flushes(struct work_struct *ws)
 			 * we reclaim rcu_read_lock
 			 */
 			struct bio *bi;
+			struct flush_bio *fb;
 			atomic_inc(&rdev->nr_pending);
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
+
+			fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
+			fb->fi = fi;
+			fb->rdev = rdev;
+
 			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
-			bi->bi_end_io = md_end_flush;
-			bi->bi_private = rdev;
 			bio_set_dev(bi, rdev->bdev);
+			bi->bi_end_io = md_end_flush;
+			bi->bi_private = fb;
 			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-			atomic_inc(&mddev->flush_pending);
+
+			atomic_inc(&fi->flush_pending);
 			submit_bio(bi);
+
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-	struct bio *bio = mddev->flush_bio;
 
-	/*
-	 * must reset flush_bio before calling into md_handle_request to avoid a
-	 * deadlock, because other bios passed md_handle_request suspend check
-	 * could wait for this and below md_handle_request could wait for those
-	 * bios because of suspend check
-	 */
-	mddev->flush_bio = NULL;
-	wake_up(&mddev->sb_wait);
-
-	if (bio->bi_iter.bi_size == 0)
-		/* an empty barrier - all done */
-		bio_endio(bio);
-	else {
-		bio->bi_opf &= ~REQ_PREFLUSH;
-		md_handle_request(mddev, bio);
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		if (bio->bi_iter.bi_size == 0)
+			/* an empty barrier - all done */
+			bio_endio(bio);
+		else {
+			INIT_WORK(&fi->flush_work, submit_flushes);
+			queue_work(md_wq, &fi->flush_work);
+		}
 	}
 }
-
-void md_flush_request(struct mddev *mddev, struct bio *bio)
-{
-	spin_lock_irq(&mddev->lock);
-	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->flush_bio,
-			    mddev->lock);
-	mddev->flush_bio = bio;
-	spin_unlock_irq(&mddev->lock);
-
-	INIT_WORK(&mddev->flush_work, submit_flushes);
-	queue_work(md_wq, &mddev->flush_work);
-}
 EXPORT_SYMBOL(md_flush_request);
 
 static inline struct mddev *mddev_get(struct mddev *mddev)
@@ -555,7 +576,6 @@ void mddev_init(struct mddev *mddev)
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
 	spin_lock_init(&mddev->lock);
-	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
@@ -5510,6 +5530,22 @@ int md_run(struct mddev *mddev)
 			goto abort;
 		}
 	}
+	if (mddev->flush_pool == NULL) {
+		mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
+						flush_info_free, mddev);
+		if (!mddev->flush_pool) {
+			err = -ENOMEM;
+			goto abort;
+		}
+	}
+	if (mddev->flush_bio_pool == NULL) {
+		mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
+						flush_bio_free, mddev);
+		if (!mddev->flush_bio_pool) {
+			err = -ENOMEM;
+			goto abort;
+		}
+	}
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
@@ -5669,6 +5705,14 @@ int md_run(struct mddev *mddev)
 	return 0;
 
 abort:
+	if (mddev->flush_bio_pool) {
+		mempool_destroy(mddev->flush_bio_pool);
+		mddev->flush_bio_pool = NULL;
+	}
+	if (mddev->flush_pool){
+		mempool_destroy(mddev->flush_pool);
+		mddev->flush_pool = NULL;
+	}
 	if (mddev->bio_set) {
 		bioset_free(mddev->bio_set);
 		mddev->bio_set = NULL;
@@ -5889,6 +5933,14 @@ void md_stop(struct mddev *mddev)
 	 * This is called from dm-raid
 	 */
 	__md_stop(mddev);
+	if (mddev->flush_bio_pool) {
+		mempool_destroy(mddev->flush_bio_pool);
+		mddev->flush_bio_pool = NULL;
+	}
+	if (mddev->flush_pool) {
+		mempool_destroy(mddev->flush_pool);
+		mddev->flush_pool = NULL;
+	}
 	if (mddev->bio_set) {
 		bioset_free(mddev->bio_set);
 		mddev->bio_set = NULL;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fbc925c..ffcb1ae 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -252,6 +252,19 @@ enum mddev_sb_flags {
 	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
 };
 
+#define NR_FLUSH_INFOS 8
+#define NR_FLUSH_BIOS 64
+struct flush_info {
+	struct bio			*bio;
+	struct mddev			*mddev;
+	struct work_struct		flush_work;
+	atomic_t			flush_pending;
+};
+struct flush_bio {
+	struct flush_info *fi;
+	struct md_rdev *rdev;
+};
+
 struct mddev {
 	void				*private;
 	struct md_personality		*pers;
@@ -457,13 +470,8 @@ struct mddev {
 						   * metadata and bitmap writes
 						   */
 
-	/* Generic flush handling.
-	 * The last to finish preflush schedules a worker to submit
-	 * the rest of the request (without the REQ_PREFLUSH flag).
-	 */
-	struct bio *flush_bio;
-	atomic_t flush_pending;
-	struct work_struct flush_work;
+	mempool_t			*flush_pool;
+	mempool_t			*flush_bio_pool;
 	struct work_struct event_work;	/* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
 	struct md_cluster_info		*cluster_info;
-- 
2.7.4