dm btree: increase rebalance threshold in __rebalance2()
[platform/kernel/linux-rpi.git] / drivers / md / md-multipath.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * multipath.c : Multiple Devices driver for Linux
4  *
5  * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
6  *
7  * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
8  *
9  * MULTIPATH management functions.
10  *
11  * derived from raid1.c.
12  */
13
14 #include <linux/blkdev.h>
15 #include <linux/module.h>
16 #include <linux/raid/md_u.h>
17 #include <linux/seq_file.h>
18 #include <linux/slab.h>
19 #include "md.h"
20 #include "md-multipath.h"
21
22 #define MAX_WORK_PER_DISK 128
23
24 #define NR_RESERVED_BUFS        32
25
26 static int multipath_map (struct mpconf *conf)
27 {
28         int i, disks = conf->raid_disks;
29
30         /*
31          * Later we do read balancing on the read side
32          * now we use the first available disk.
33          */
34
35         rcu_read_lock();
36         for (i = 0; i < disks; i++) {
37                 struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
38                 if (rdev && test_bit(In_sync, &rdev->flags) &&
39                     !test_bit(Faulty, &rdev->flags)) {
40                         atomic_inc(&rdev->nr_pending);
41                         rcu_read_unlock();
42                         return i;
43                 }
44         }
45         rcu_read_unlock();
46
47         pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
48         return (-1);
49 }
50
51 static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
52 {
53         unsigned long flags;
54         struct mddev *mddev = mp_bh->mddev;
55         struct mpconf *conf = mddev->private;
56
57         spin_lock_irqsave(&conf->device_lock, flags);
58         list_add(&mp_bh->retry_list, &conf->retry_list);
59         spin_unlock_irqrestore(&conf->device_lock, flags);
60         md_wakeup_thread(mddev->thread);
61 }
62
63 /*
64  * multipath_end_bh_io() is called when we have finished servicing a multipathed
65  * operation and are ready to return a success/failure code to the buffer
66  * cache layer.
67  */
68 static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status)
69 {
70         struct bio *bio = mp_bh->master_bio;
71         struct mpconf *conf = mp_bh->mddev->private;
72
73         bio->bi_status = status;
74         bio_endio(bio);
75         mempool_free(mp_bh, &conf->pool);
76 }
77
78 static void multipath_end_request(struct bio *bio)
79 {
80         struct multipath_bh *mp_bh = bio->bi_private;
81         struct mpconf *conf = mp_bh->mddev->private;
82         struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev;
83
84         if (!bio->bi_status)
85                 multipath_end_bh_io(mp_bh, 0);
86         else if (!(bio->bi_opf & REQ_RAHEAD)) {
87                 /*
88                  * oops, IO error:
89                  */
90                 char b[BDEVNAME_SIZE];
91                 md_error (mp_bh->mddev, rdev);
92                 pr_info("multipath: %s: rescheduling sector %llu\n",
93                         bdevname(rdev->bdev,b),
94                         (unsigned long long)bio->bi_iter.bi_sector);
95                 multipath_reschedule_retry(mp_bh);
96         } else
97                 multipath_end_bh_io(mp_bh, bio->bi_status);
98         rdev_dec_pending(rdev, conf->mddev);
99 }
100
101 static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
102 {
103         struct mpconf *conf = mddev->private;
104         struct multipath_bh * mp_bh;
105         struct multipath_info *multipath;
106
107         if (unlikely(bio->bi_opf & REQ_PREFLUSH)
108             && md_flush_request(mddev, bio))
109                 return true;
110
111         mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
112
113         mp_bh->master_bio = bio;
114         mp_bh->mddev = mddev;
115
116         mp_bh->path = multipath_map(conf);
117         if (mp_bh->path < 0) {
118                 bio_io_error(bio);
119                 mempool_free(mp_bh, &conf->pool);
120                 return true;
121         }
122         multipath = conf->multipaths + mp_bh->path;
123
124         bio_init(&mp_bh->bio, NULL, 0);
125         __bio_clone_fast(&mp_bh->bio, bio);
126
127         mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
128         bio_set_dev(&mp_bh->bio, multipath->rdev->bdev);
129         mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT;
130         mp_bh->bio.bi_end_io = multipath_end_request;
131         mp_bh->bio.bi_private = mp_bh;
132         mddev_check_writesame(mddev, &mp_bh->bio);
133         mddev_check_write_zeroes(mddev, &mp_bh->bio);
134         generic_make_request(&mp_bh->bio);
135         return true;
136 }
137
138 static void multipath_status(struct seq_file *seq, struct mddev *mddev)
139 {
140         struct mpconf *conf = mddev->private;
141         int i;
142
143         seq_printf (seq, " [%d/%d] [", conf->raid_disks,
144                     conf->raid_disks - mddev->degraded);
145         rcu_read_lock();
146         for (i = 0; i < conf->raid_disks; i++) {
147                 struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
148                 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
149         }
150         rcu_read_unlock();
151         seq_putc(seq, ']');
152 }
153
154 static int multipath_congested(struct mddev *mddev, int bits)
155 {
156         struct mpconf *conf = mddev->private;
157         int i, ret = 0;
158
159         rcu_read_lock();
160         for (i = 0; i < mddev->raid_disks ; i++) {
161                 struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
162                 if (rdev && !test_bit(Faulty, &rdev->flags)) {
163                         struct request_queue *q = bdev_get_queue(rdev->bdev);
164
165                         ret |= bdi_congested(q->backing_dev_info, bits);
166                         /* Just like multipath_map, we just check the
167                          * first available device
168                          */
169                         break;
170                 }
171         }
172         rcu_read_unlock();
173         return ret;
174 }
175
176 /*
177  * Careful, this can execute in IRQ contexts as well!
178  */
179 static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
180 {
181         struct mpconf *conf = mddev->private;
182         char b[BDEVNAME_SIZE];
183
184         if (conf->raid_disks - mddev->degraded <= 1) {
185                 /*
186                  * Uh oh, we can do nothing if this is our last path, but
187                  * first check if this is a queued request for a device
188                  * which has just failed.
189                  */
190                 pr_warn("multipath: only one IO path left and IO error.\n");
191                 /* leave it active... it's all we have */
192                 return;
193         }
194         /*
195          * Mark disk as unusable
196          */
197         if (test_and_clear_bit(In_sync, &rdev->flags)) {
198                 unsigned long flags;
199                 spin_lock_irqsave(&conf->device_lock, flags);
200                 mddev->degraded++;
201                 spin_unlock_irqrestore(&conf->device_lock, flags);
202         }
203         set_bit(Faulty, &rdev->flags);
204         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
205         pr_err("multipath: IO failure on %s, disabling IO path.\n"
206                "multipath: Operation continuing on %d IO paths.\n",
207                bdevname(rdev->bdev, b),
208                conf->raid_disks - mddev->degraded);
209 }
210
211 static void print_multipath_conf (struct mpconf *conf)
212 {
213         int i;
214         struct multipath_info *tmp;
215
216         pr_debug("MULTIPATH conf printout:\n");
217         if (!conf) {
218                 pr_debug("(conf==NULL)\n");
219                 return;
220         }
221         pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
222                  conf->raid_disks);
223
224         for (i = 0; i < conf->raid_disks; i++) {
225                 char b[BDEVNAME_SIZE];
226                 tmp = conf->multipaths + i;
227                 if (tmp->rdev)
228                         pr_debug(" disk%d, o:%d, dev:%s\n",
229                                  i,!test_bit(Faulty, &tmp->rdev->flags),
230                                  bdevname(tmp->rdev->bdev,b));
231         }
232 }
233
234 static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
235 {
236         struct mpconf *conf = mddev->private;
237         int err = -EEXIST;
238         int path;
239         struct multipath_info *p;
240         int first = 0;
241         int last = mddev->raid_disks - 1;
242
243         if (rdev->raid_disk >= 0)
244                 first = last = rdev->raid_disk;
245
246         print_multipath_conf(conf);
247
248         for (path = first; path <= last; path++)
249                 if ((p=conf->multipaths+path)->rdev == NULL) {
250                         disk_stack_limits(mddev->gendisk, rdev->bdev,
251                                           rdev->data_offset << 9);
252
253                         err = md_integrity_add_rdev(rdev, mddev);
254                         if (err)
255                                 break;
256                         spin_lock_irq(&conf->device_lock);
257                         mddev->degraded--;
258                         rdev->raid_disk = path;
259                         set_bit(In_sync, &rdev->flags);
260                         spin_unlock_irq(&conf->device_lock);
261                         rcu_assign_pointer(p->rdev, rdev);
262                         err = 0;
263                         break;
264                 }
265
266         print_multipath_conf(conf);
267
268         return err;
269 }
270
271 static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
272 {
273         struct mpconf *conf = mddev->private;
274         int err = 0;
275         int number = rdev->raid_disk;
276         struct multipath_info *p = conf->multipaths + number;
277
278         print_multipath_conf(conf);
279
280         if (rdev == p->rdev) {
281                 if (test_bit(In_sync, &rdev->flags) ||
282                     atomic_read(&rdev->nr_pending)) {
283                         pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number);
284                         err = -EBUSY;
285                         goto abort;
286                 }
287                 p->rdev = NULL;
288                 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
289                         synchronize_rcu();
290                         if (atomic_read(&rdev->nr_pending)) {
291                                 /* lost the race, try later */
292                                 err = -EBUSY;
293                                 p->rdev = rdev;
294                                 goto abort;
295                         }
296                 }
297                 err = md_integrity_register(mddev);
298         }
299 abort:
300
301         print_multipath_conf(conf);
302         return err;
303 }
304
305 /*
306  * This is a kernel thread which:
307  *
308  *      1.      Retries failed read operations on working multipaths.
309  *      2.      Updates the raid superblock when problems encounter.
310  *      3.      Performs writes following reads for array syncronising.
311  */
312
313 static void multipathd(struct md_thread *thread)
314 {
315         struct mddev *mddev = thread->mddev;
316         struct multipath_bh *mp_bh;
317         struct bio *bio;
318         unsigned long flags;
319         struct mpconf *conf = mddev->private;
320         struct list_head *head = &conf->retry_list;
321
322         md_check_recovery(mddev);
323         for (;;) {
324                 char b[BDEVNAME_SIZE];
325                 spin_lock_irqsave(&conf->device_lock, flags);
326                 if (list_empty(head))
327                         break;
328                 mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
329                 list_del(head->prev);
330                 spin_unlock_irqrestore(&conf->device_lock, flags);
331
332                 bio = &mp_bh->bio;
333                 bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
334
335                 if ((mp_bh->path = multipath_map (conf))<0) {
336                         pr_err("multipath: %s: unrecoverable IO read error for block %llu\n",
337                                bio_devname(bio, b),
338                                (unsigned long long)bio->bi_iter.bi_sector);
339                         multipath_end_bh_io(mp_bh, BLK_STS_IOERR);
340                 } else {
341                         pr_err("multipath: %s: redirecting sector %llu to another IO path\n",
342                                bio_devname(bio, b),
343                                (unsigned long long)bio->bi_iter.bi_sector);
344                         *bio = *(mp_bh->master_bio);
345                         bio->bi_iter.bi_sector +=
346                                 conf->multipaths[mp_bh->path].rdev->data_offset;
347                         bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev);
348                         bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
349                         bio->bi_end_io = multipath_end_request;
350                         bio->bi_private = mp_bh;
351                         generic_make_request(bio);
352                 }
353         }
354         spin_unlock_irqrestore(&conf->device_lock, flags);
355 }
356
357 static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks)
358 {
359         WARN_ONCE(sectors || raid_disks,
360                   "%s does not support generic reshape\n", __func__);
361
362         return mddev->dev_sectors;
363 }
364
365 static int multipath_run (struct mddev *mddev)
366 {
367         struct mpconf *conf;
368         int disk_idx;
369         struct multipath_info *disk;
370         struct md_rdev *rdev;
371         int working_disks;
372         int ret;
373
374         if (md_check_no_bitmap(mddev))
375                 return -EINVAL;
376
377         if (mddev->level != LEVEL_MULTIPATH) {
378                 pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n",
379                         mdname(mddev), mddev->level);
380                 goto out;
381         }
382         /*
383          * copy the already verified devices into our private MULTIPATH
384          * bookkeeping area. [whatever we allocate in multipath_run(),
385          * should be freed in multipath_free()]
386          */
387
388         conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
389         mddev->private = conf;
390         if (!conf)
391                 goto out;
392
393         conf->multipaths = kcalloc(mddev->raid_disks,
394                                    sizeof(struct multipath_info),
395                                    GFP_KERNEL);
396         if (!conf->multipaths)
397                 goto out_free_conf;
398
399         working_disks = 0;
400         rdev_for_each(rdev, mddev) {
401                 disk_idx = rdev->raid_disk;
402                 if (disk_idx < 0 ||
403                     disk_idx >= mddev->raid_disks)
404                         continue;
405
406                 disk = conf->multipaths + disk_idx;
407                 disk->rdev = rdev;
408                 disk_stack_limits(mddev->gendisk, rdev->bdev,
409                                   rdev->data_offset << 9);
410
411                 if (!test_bit(Faulty, &rdev->flags))
412                         working_disks++;
413         }
414
415         conf->raid_disks = mddev->raid_disks;
416         conf->mddev = mddev;
417         spin_lock_init(&conf->device_lock);
418         INIT_LIST_HEAD(&conf->retry_list);
419
420         if (!working_disks) {
421                 pr_warn("multipath: no operational IO paths for %s\n",
422                         mdname(mddev));
423                 goto out_free_conf;
424         }
425         mddev->degraded = conf->raid_disks - working_disks;
426
427         ret = mempool_init_kmalloc_pool(&conf->pool, NR_RESERVED_BUFS,
428                                         sizeof(struct multipath_bh));
429         if (ret)
430                 goto out_free_conf;
431
432         mddev->thread = md_register_thread(multipathd, mddev,
433                                            "multipath");
434         if (!mddev->thread)
435                 goto out_free_conf;
436
437         pr_info("multipath: array %s active with %d out of %d IO paths\n",
438                 mdname(mddev), conf->raid_disks - mddev->degraded,
439                 mddev->raid_disks);
440         /*
441          * Ok, everything is just fine now
442          */
443         md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
444
445         if (md_integrity_register(mddev))
446                 goto out_free_conf;
447
448         return 0;
449
450 out_free_conf:
451         mempool_exit(&conf->pool);
452         kfree(conf->multipaths);
453         kfree(conf);
454         mddev->private = NULL;
455 out:
456         return -EIO;
457 }
458
459 static void multipath_free(struct mddev *mddev, void *priv)
460 {
461         struct mpconf *conf = priv;
462
463         mempool_exit(&conf->pool);
464         kfree(conf->multipaths);
465         kfree(conf);
466 }
467
468 static struct md_personality multipath_personality =
469 {
470         .name           = "multipath",
471         .level          = LEVEL_MULTIPATH,
472         .owner          = THIS_MODULE,
473         .make_request   = multipath_make_request,
474         .run            = multipath_run,
475         .free           = multipath_free,
476         .status         = multipath_status,
477         .error_handler  = multipath_error,
478         .hot_add_disk   = multipath_add_disk,
479         .hot_remove_disk= multipath_remove_disk,
480         .size           = multipath_size,
481         .congested      = multipath_congested,
482 };
483
484 static int __init multipath_init (void)
485 {
486         return register_md_personality (&multipath_personality);
487 }
488
489 static void __exit multipath_exit (void)
490 {
491         unregister_md_personality (&multipath_personality);
492 }
493
494 module_init(multipath_init);
495 module_exit(multipath_exit);
496 MODULE_LICENSE("GPL");
497 MODULE_DESCRIPTION("simple multi-path personality for MD");
498 MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
499 MODULE_ALIAS("md-multipath");
500 MODULE_ALIAS("md-level--4");