md/raid5: avoid device_lock in read_one_chunk()
authorGal Ofri <gal.ofri@storing.io>
Mon, 7 Jun 2021 11:07:03 +0000 (14:07 +0300)
committerSong Liu <song@kernel.org>
Tue, 15 Jun 2021 05:32:07 +0000 (22:32 -0700)
There is a lock contention on device_lock in read_one_chunk().
device_lock is taken to sync conf->active_aligned_reads and
conf->quiesce.
read_one_chunk() takes the lock, then waits for quiesce=0 (resumed)
before incrementing active_aligned_reads.
raid5_quiesce() takes the lock, sets quiesce=2 (in-progress), then waits
for active_aligned_reads to be zero before setting quiesce=1
(suspended).

Introduce a fast (lockless) path in read_one_chunk(): activate aligned
read without taking device_lock.  In case quiesce starts while
activating the aligned-read in fast path, deactivate it and revert to
old behavior (take device_lock and wait for quiesce to finish).

Add smp store/load in raid5_quiesce()/read_one_chunk() respectively to
gaurantee that read_one_chunk() does not miss an ongoing quiesce.

My setups:
1. 8 local nvme drives (each up to 250k iops).
2. 8 ram disks (brd).

Each setup with raid6 (6+2), 1024 io threads on a 96 cpu-cores (48 per
socket) system. Record both iops and cpu spent on this contention with
rand-read-4k. Record bw with sequential-read-128k.  Note: in most cases
cpu is still busy but due to "new" bottlenecks.

nvme:
              | iops           | cpu  | bw
-----------------------------------------------
without patch | 1.6M           | ~50% | 5.5GB/s
with patch    | 2M (throttled) | 0%   | 16GB/s (throttled)

ram (brd):
              | iops           | cpu  | bw
-----------------------------------------------
without patch | 2M             | ~80% | 24GB/s
with patch    | 4M             | 0%   | 55GB/s

CC: Song Liu <song@kernel.org>
CC: Neil Brown <neilb@suse.de>
Reviewed-by: NeilBrown <neilb@suse.de>
Signed-off-by: Gal Ofri <gal.ofri@storing.io>
Signed-off-by: Song Liu <song@kernel.org>
drivers/md/raid5.c

index 0ee9aa0..e248532 100644 (file)
@@ -5403,6 +5403,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
        sector_t sector, end_sector, first_bad;
        int bad_sectors, dd_idx;
        struct md_io_acct *md_io_acct;
+       bool did_inc;
 
        if (!in_chunk_boundary(mddev, raid_bio)) {
                pr_debug("%s: non aligned\n", __func__);
@@ -5454,11 +5455,24 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
        /* No reshape active, so we can trust rdev->data_offset */
        align_bio->bi_iter.bi_sector += rdev->data_offset;
 
-       spin_lock_irq(&conf->device_lock);
-       wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
-                           conf->device_lock);
-       atomic_inc(&conf->active_aligned_reads);
-       spin_unlock_irq(&conf->device_lock);
+       did_inc = false;
+       if (conf->quiesce == 0) {
+               atomic_inc(&conf->active_aligned_reads);
+               did_inc = true;
+       }
+       /* need a memory barrier to detect the race with raid5_quiesce() */
+       if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) {
+               /* quiesce is in progress, so we need to undo io activation and wait
+                * for it to finish
+                */
+               if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads))
+                       wake_up(&conf->wait_for_quiescent);
+               spin_lock_irq(&conf->device_lock);
+               wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
+                                   conf->device_lock);
+               atomic_inc(&conf->active_aligned_reads);
+               spin_unlock_irq(&conf->device_lock);
+       }
 
        if (mddev->gendisk)
                trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
@@ -8346,7 +8360,10 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
                 * active stripes can drain
                 */
                r5c_flush_cache(conf, INT_MAX);
-               conf->quiesce = 2;
+               /* need a memory barrier to make sure read_one_chunk() sees
+                * quiesce started and reverts to slow (locked) path.
+                */
+               smp_store_release(&conf->quiesce, 2);
                wait_event_cmd(conf->wait_for_quiescent,
                                    atomic_read(&conf->active_stripes) == 0 &&
                                    atomic_read(&conf->active_aligned_reads) == 0,