drivers/md/dm-mpath.c
1 /*
2  * Copyright (C) 2003 Sistina Software Limited.
3  * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7
8 #include <linux/device-mapper.h>
9
10 #include "dm-rq.h"
11 #include "dm-bio-record.h"
12 #include "dm-path-selector.h"
13 #include "dm-uevent.h"
14
15 #include <linux/blkdev.h>
16 #include <linux/ctype.h>
17 #include <linux/init.h>
18 #include <linux/mempool.h>
19 #include <linux/module.h>
20 #include <linux/pagemap.h>
21 #include <linux/slab.h>
22 #include <linux/time.h>
23 #include <linux/timer.h>
24 #include <linux/workqueue.h>
25 #include <linux/delay.h>
26 #include <scsi/scsi_dh.h>
27 #include <linux/atomic.h>
28 #include <linux/blk-mq.h>
29
30 #define DM_MSG_PREFIX "multipath"
31 #define DM_PG_INIT_DELAY_MSECS 2000
32 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
33 #define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
34
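/*
 * When non-zero, a multipath device that is queueing I/O because no paths are
 * usable (queue_if_no_path) gives up after this many seconds and fails the
 * queued I/O; see enable_nopath_timeout() and queue_if_no_path_timeout_work()
 * below.  Zero disables the timeout.
 */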
35 static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
36
37 /* Path properties */
38 struct pgpath {
39         struct list_head list;
40
41         struct priority_group *pg;      /* Owning PG */
42         unsigned fail_count;            /* Cumulative failure count */
43
44         struct dm_path path;
45         struct delayed_work activate_path;
46
47         bool is_active:1;               /* Path status */
48 };
49
50 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
51
52 /*
53  * Paths are grouped into Priority Groups and numbered from 1 upwards.
54  * Each has a path selector which controls which path gets used.
55  */
56 struct priority_group {
57         struct list_head list;
58
59         struct multipath *m;            /* Owning multipath instance */
60         struct path_selector ps;
61
62         unsigned pg_num;                /* Reference number */
63         unsigned nr_pgpaths;            /* Number of paths in PG */
64         struct list_head pgpaths;
65
66         bool bypassed:1;                /* Temporarily bypass this PG? */
67 };
68
69 /* Multipath context */
70 struct multipath {
71         unsigned long flags;            /* Multipath state flags */
72
73         spinlock_t lock;
74         enum dm_queue_mode queue_mode;
75
76         struct pgpath *current_pgpath;
77         struct priority_group *current_pg;
78         struct priority_group *next_pg; /* Switch to this PG if set */
79
80         atomic_t nr_valid_paths;        /* Total number of usable paths */
81         unsigned nr_priority_groups;
82         struct list_head priority_groups;
83
84         const char *hw_handler_name;
85         char *hw_handler_params;
86         wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
87         unsigned pg_init_retries;       /* Number of times to retry pg_init */
88         unsigned pg_init_delay_msecs;   /* Number of msecs before pg_init retry */
89         atomic_t pg_init_in_progress;   /* Only one pg_init allowed at once */
90         atomic_t pg_init_count;         /* Number of times pg_init called */
91
92         struct mutex work_mutex;
93         struct work_struct trigger_event;
94         struct dm_target *ti;
95
96         struct work_struct process_queued_bios;
97         struct bio_list queued_bios;
98
99         struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
100 };
101
102 /*
103  * Context information attached to each io we process.
104  */
105 struct dm_mpath_io {
106         struct pgpath *pgpath;
107         size_t nr_bytes;
108 };
109
110 typedef int (*action_fn) (struct pgpath *pgpath);
111
112 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
113 static void trigger_event(struct work_struct *work);
114 static void activate_or_offline_path(struct pgpath *pgpath);
115 static void activate_path_work(struct work_struct *work);
116 static void process_queued_bios(struct work_struct *work);
117 static void queue_if_no_path_timeout_work(struct timer_list *t);
118
119 /*-----------------------------------------------
120  * Multipath state flags.
121  *-----------------------------------------------*/
122
123 #define MPATHF_QUEUE_IO 0                       /* Must we queue all I/O? */
124 #define MPATHF_QUEUE_IF_NO_PATH 1               /* Queue I/O if last path fails? */
125 #define MPATHF_SAVED_QUEUE_IF_NO_PATH 2         /* Saved state during suspension */
126 #define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3     /* If there's already a hw_handler present, don't change it. */
127 #define MPATHF_PG_INIT_DISABLED 4               /* pg_init is not currently allowed */
128 #define MPATHF_PG_INIT_REQUIRED 5               /* pg_init needs calling? */
129 #define MPATHF_PG_INIT_DELAY_RETRY 6            /* Delay pg_init retry? */
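/*
 * These are bit numbers within multipath->flags.  They are set, cleared and
 * tested atomically with set_bit()/clear_bit()/test_bit(); hot paths such as
 * must_push_back_rq() work on a READ_ONCE() snapshot of m->flags rather than
 * taking m->lock.
 */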
130
131 /*-----------------------------------------------
132  * Allocation routines
133  *-----------------------------------------------*/
134
135 static struct pgpath *alloc_pgpath(void)
136 {
137         struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
138
139         if (!pgpath)
140                 return NULL;
141
142         pgpath->is_active = true;
143
144         return pgpath;
145 }
146
147 static void free_pgpath(struct pgpath *pgpath)
148 {
149         kfree(pgpath);
150 }
151
152 static struct priority_group *alloc_priority_group(void)
153 {
154         struct priority_group *pg;
155
156         pg = kzalloc(sizeof(*pg), GFP_KERNEL);
157
158         if (pg)
159                 INIT_LIST_HEAD(&pg->pgpaths);
160
161         return pg;
162 }
163
164 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
165 {
166         struct pgpath *pgpath, *tmp;
167
168         list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
169                 list_del(&pgpath->list);
170                 dm_put_device(ti, pgpath->path.dev);
171                 free_pgpath(pgpath);
172         }
173 }
174
175 static void free_priority_group(struct priority_group *pg,
176                                 struct dm_target *ti)
177 {
178         struct path_selector *ps = &pg->ps;
179
180         if (ps->type) {
181                 ps->type->destroy(ps);
182                 dm_put_path_selector(ps->type);
183         }
184
185         free_pgpaths(&pg->pgpaths, ti);
186         kfree(pg);
187 }
188
189 static struct multipath *alloc_multipath(struct dm_target *ti)
190 {
191         struct multipath *m;
192
193         m = kzalloc(sizeof(*m), GFP_KERNEL);
194         if (m) {
195                 INIT_LIST_HEAD(&m->priority_groups);
196                 spin_lock_init(&m->lock);
197                 atomic_set(&m->nr_valid_paths, 0);
198                 INIT_WORK(&m->trigger_event, trigger_event);
199                 mutex_init(&m->work_mutex);
200
201                 m->queue_mode = DM_TYPE_NONE;
202
203                 m->ti = ti;
204                 ti->private = m;
205
206                 timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0);
207         }
208
209         return m;
210 }
211
212 static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
213 {
214         if (m->queue_mode == DM_TYPE_NONE) {
215                 m->queue_mode = DM_TYPE_REQUEST_BASED;
216         } else if (m->queue_mode == DM_TYPE_BIO_BASED) {
217                 INIT_WORK(&m->process_queued_bios, process_queued_bios);
218                 /*
219                  * bio-based doesn't support any direct scsi_dh management;
220                  * it just discovers if a scsi_dh is attached.
221                  */
222                 set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
223         }
224
225         dm_table_set_type(ti->table, m->queue_mode);
226
227         /*
228          * Init fields that are only used when a scsi_dh is attached
229          * - must do this unconditionally (really doesn't hurt non-SCSI uses)
230          */
231         set_bit(MPATHF_QUEUE_IO, &m->flags);
232         atomic_set(&m->pg_init_in_progress, 0);
233         atomic_set(&m->pg_init_count, 0);
234         m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
235         init_waitqueue_head(&m->pg_init_wait);
236
237         return 0;
238 }
239
240 static void free_multipath(struct multipath *m)
241 {
242         struct priority_group *pg, *tmp;
243
244         list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
245                 list_del(&pg->list);
246                 free_priority_group(pg, m->ti);
247         }
248
249         kfree(m->hw_handler_name);
250         kfree(m->hw_handler_params);
251         mutex_destroy(&m->work_mutex);
252         kfree(m);
253 }
254
255 static struct dm_mpath_io *get_mpio(union map_info *info)
256 {
257         return info->ptr;
258 }
259
260 static size_t multipath_per_bio_data_size(void)
261 {
262         return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details);
263 }
264
265 static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio)
266 {
267         return dm_per_bio_data(bio, multipath_per_bio_data_size());
268 }
269
270 static struct dm_bio_details *get_bio_details_from_mpio(struct dm_mpath_io *mpio)
271 {
272         /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */
273         void *bio_details = mpio + 1;
274         return bio_details;
275 }
276
277 static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p)
278 {
279         struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
280         struct dm_bio_details *bio_details = get_bio_details_from_mpio(mpio);
281
282         mpio->nr_bytes = bio->bi_iter.bi_size;
283         mpio->pgpath = NULL;
284         *mpio_p = mpio;
285
286         dm_bio_record(bio_details, bio);
287 }
288
289 /*-----------------------------------------------
290  * Path selection
291  *-----------------------------------------------*/
292
293 static int __pg_init_all_paths(struct multipath *m)
294 {
295         struct pgpath *pgpath;
296         unsigned long pg_init_delay = 0;
297
298         lockdep_assert_held(&m->lock);
299
300         if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
301                 return 0;
302
303         atomic_inc(&m->pg_init_count);
304         clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
305
306         /* Check here to reset pg_init_required */
307         if (!m->current_pg)
308                 return 0;
309
310         if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags))
311                 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
312                                                  m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
313         list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
314                 /* Skip failed paths */
315                 if (!pgpath->is_active)
316                         continue;
317                 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
318                                        pg_init_delay))
319                         atomic_inc(&m->pg_init_in_progress);
320         }
321         return atomic_read(&m->pg_init_in_progress);
322 }
323
324 static int pg_init_all_paths(struct multipath *m)
325 {
326         int ret;
327         unsigned long flags;
328
329         spin_lock_irqsave(&m->lock, flags);
330         ret = __pg_init_all_paths(m);
331         spin_unlock_irqrestore(&m->lock, flags);
332
333         return ret;
334 }
335
336 static void __switch_pg(struct multipath *m, struct priority_group *pg)
337 {
338         m->current_pg = pg;
339
340         /* Must we initialise the PG first, and queue I/O till it's ready? */
341         if (m->hw_handler_name) {
342                 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
343                 set_bit(MPATHF_QUEUE_IO, &m->flags);
344         } else {
345                 clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
346                 clear_bit(MPATHF_QUEUE_IO, &m->flags);
347         }
348
349         atomic_set(&m->pg_init_count, 0);
350 }
351
352 static struct pgpath *choose_path_in_pg(struct multipath *m,
353                                         struct priority_group *pg,
354                                         size_t nr_bytes)
355 {
356         unsigned long flags;
357         struct dm_path *path;
358         struct pgpath *pgpath;
359
360         path = pg->ps.type->select_path(&pg->ps, nr_bytes);
361         if (!path)
362                 return ERR_PTR(-ENXIO);
363
364         pgpath = path_to_pgpath(path);
365
366         if (unlikely(READ_ONCE(m->current_pg) != pg)) {
367                 /* Only update current_pgpath if pg changed */
368                 spin_lock_irqsave(&m->lock, flags);
369                 m->current_pgpath = pgpath;
370                 __switch_pg(m, pg);
371                 spin_unlock_irqrestore(&m->lock, flags);
372         }
373
374         return pgpath;
375 }
376
377 static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
378 {
379         unsigned long flags;
380         struct priority_group *pg;
381         struct pgpath *pgpath;
382         unsigned bypassed = 1;
383
384         if (!atomic_read(&m->nr_valid_paths)) {
385                 clear_bit(MPATHF_QUEUE_IO, &m->flags);
386                 goto failed;
387         }
388
389         /* Were we instructed to switch PG? */
390         if (READ_ONCE(m->next_pg)) {
391                 spin_lock_irqsave(&m->lock, flags);
392                 pg = m->next_pg;
393                 if (!pg) {
394                         spin_unlock_irqrestore(&m->lock, flags);
395                         goto check_current_pg;
396                 }
397                 m->next_pg = NULL;
398                 spin_unlock_irqrestore(&m->lock, flags);
399                 pgpath = choose_path_in_pg(m, pg, nr_bytes);
400                 if (!IS_ERR_OR_NULL(pgpath))
401                         return pgpath;
402         }
403
404         /* Don't change PG until it has no remaining paths */
405 check_current_pg:
406         pg = READ_ONCE(m->current_pg);
407         if (pg) {
408                 pgpath = choose_path_in_pg(m, pg, nr_bytes);
409                 if (!IS_ERR_OR_NULL(pgpath))
410                         return pgpath;
411         }
412
413         /*
414          * Loop through priority groups until we find a valid path.
415          * First time we skip PGs marked 'bypassed'.
416          * Second time we only try the ones we skipped, but set
417          * pg_init_delay_retry so we do not hammer controllers.
418          */
419         do {
420                 list_for_each_entry(pg, &m->priority_groups, list) {
421                         if (pg->bypassed == !!bypassed)
422                                 continue;
423                         pgpath = choose_path_in_pg(m, pg, nr_bytes);
424                         if (!IS_ERR_OR_NULL(pgpath)) {
425                                 if (!bypassed)
426                                         set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
427                                 return pgpath;
428                         }
429                 }
430         } while (bypassed--);
431
432 failed:
433         spin_lock_irqsave(&m->lock, flags);
434         m->current_pgpath = NULL;
435         m->current_pg = NULL;
436         spin_unlock_irqrestore(&m->lock, flags);
437
438         return NULL;
439 }
440
441 /*
442  * dm_report_EIO() is a macro instead of a function to make pr_debug_ratelimited()
443  * report the function name and line number of the function from which
444  * it has been invoked.
445  */
446 #define dm_report_EIO(m)                                                \
447 do {                                                                    \
448         struct mapped_device *md = dm_table_get_md((m)->ti->table);     \
449                                                                         \
450         DMDEBUG_LIMIT("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d", \
451                       dm_device_name(md),                               \
452                       test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags),   \
453                       test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \
454                       dm_noflush_suspending((m)->ti));                  \
455 } while (0)
456
457 /*
458  * Check whether bios must be queued in the device-mapper core rather
459  * than here in the target.
460  *
461  * If MPATHF_QUEUE_IF_NO_PATH and MPATHF_SAVED_QUEUE_IF_NO_PATH hold
462  * the same value then we are not between multipath_presuspend()
463  * and multipath_resume() calls and we have no need to check
464  * for the DMF_NOFLUSH_SUSPENDING flag.
465  */
466 static bool __must_push_back(struct multipath *m, unsigned long flags)
467 {
468         return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) !=
469                  test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &flags)) &&
470                 dm_noflush_suspending(m->ti));
471 }
472
473 /*
474  * The following functions use READ_ONCE() to take an atomic snapshot of
475  * m->flags so that they can avoid taking the spinlock.
476  */
477 static bool must_push_back_rq(struct multipath *m)
478 {
479         unsigned long flags = READ_ONCE(m->flags);
480         return test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) || __must_push_back(m, flags);
481 }
482
483 static bool must_push_back_bio(struct multipath *m)
484 {
485         unsigned long flags = READ_ONCE(m->flags);
486         return __must_push_back(m, flags);
487 }
488
489 /*
490  * Map cloned requests (request-based multipath)
491  */
492 static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
493                                    union map_info *map_context,
494                                    struct request **__clone)
495 {
496         struct multipath *m = ti->private;
497         size_t nr_bytes = blk_rq_bytes(rq);
498         struct pgpath *pgpath;
499         struct block_device *bdev;
500         struct dm_mpath_io *mpio = get_mpio(map_context);
501         struct request_queue *q;
502         struct request *clone;
503
504         /* Do we need to select a new pgpath? */
505         pgpath = READ_ONCE(m->current_pgpath);
506         if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
507                 pgpath = choose_pgpath(m, nr_bytes);
508
509         if (!pgpath) {
510                 if (must_push_back_rq(m))
511                         return DM_MAPIO_DELAY_REQUEUE;
512                 dm_report_EIO(m);       /* Failed */
513                 return DM_MAPIO_KILL;
514         } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
515                    test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
516                 pg_init_all_paths(m);
517                 return DM_MAPIO_DELAY_REQUEUE;
518         }
519
520         mpio->pgpath = pgpath;
521         mpio->nr_bytes = nr_bytes;
522
523         bdev = pgpath->path.dev->bdev;
524         q = bdev_get_queue(bdev);
525         clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE,
526                         BLK_MQ_REQ_NOWAIT);
527         if (IS_ERR(clone)) {
528                 /* EBUSY, ENODEV or EWOULDBLOCK: requeue */
529                 if (blk_queue_dying(q)) {
530                         atomic_inc(&m->pg_init_in_progress);
531                         activate_or_offline_path(pgpath);
532                         return DM_MAPIO_DELAY_REQUEUE;
533                 }
534
535                 /*
536                  * blk-mq's SCHED_RESTART can cover this requeue, so we
537                  * needn't deal with it by DELAY_REQUEUE. More importantly,
538                  * we have to return DM_MAPIO_REQUEUE so that blk-mq can
539                  * get the queue busy feedback (via BLK_STS_RESOURCE),
540                  * otherwise I/O merging can suffer.
541                  */
542                 return DM_MAPIO_REQUEUE;
543         }
544         clone->bio = clone->biotail = NULL;
545         clone->rq_disk = bdev->bd_disk;
546         clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
547         *__clone = clone;
548
549         if (pgpath->pg->ps.type->start_io)
550                 pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
551                                               &pgpath->path,
552                                               nr_bytes);
553         return DM_MAPIO_REMAPPED;
554 }
555
556 static void multipath_release_clone(struct request *clone,
557                                     union map_info *map_context)
558 {
559         if (unlikely(map_context)) {
560                 /*
561                  * A non-NULL map_context means the caller is still inside
562                  * the map method, so we must undo multipath_clone_and_map().
563                  */
564                 struct dm_mpath_io *mpio = get_mpio(map_context);
565                 struct pgpath *pgpath = mpio->pgpath;
566
567                 if (pgpath && pgpath->pg->ps.type->end_io)
568                         pgpath->pg->ps.type->end_io(&pgpath->pg->ps,
569                                                     &pgpath->path,
570                                                     mpio->nr_bytes,
571                                                     clone->io_start_time_ns);
572         }
573
574         blk_put_request(clone);
575 }
576
577 /*
578  * Map cloned bios (bio-based multipath)
579  */
580
581 static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
582 {
583         struct pgpath *pgpath;
584         unsigned long flags;
585         bool queue_io;
586
587         /* Do we need to select a new pgpath? */
588         pgpath = READ_ONCE(m->current_pgpath);
589         if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
590                 pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
591
592         /* MPATHF_QUEUE_IO might have been cleared by choose_pgpath. */
593         queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
594
595         if ((pgpath && queue_io) ||
596             (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
597                 /* Queue for the daemon to resubmit */
598                 spin_lock_irqsave(&m->lock, flags);
599                 bio_list_add(&m->queued_bios, bio);
600                 spin_unlock_irqrestore(&m->lock, flags);
601
602                 /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */
603                 if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
604                         pg_init_all_paths(m);
605                 else if (!queue_io)
606                         queue_work(kmultipathd, &m->process_queued_bios);
607
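                /*
                 * Tell the caller the bio has been queued; it will be mapped
                 * and resubmitted later by process_queued_bios().
                 */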
608                 return ERR_PTR(-EAGAIN);
609         }
610
611         return pgpath;
612 }
613
614 static int __multipath_map_bio(struct multipath *m, struct bio *bio,
615                                struct dm_mpath_io *mpio)
616 {
617         struct pgpath *pgpath = __map_bio(m, bio);
618
619         if (IS_ERR(pgpath))
620                 return DM_MAPIO_SUBMITTED;
621
622         if (!pgpath) {
623                 if (must_push_back_bio(m))
624                         return DM_MAPIO_REQUEUE;
625                 dm_report_EIO(m);
626                 return DM_MAPIO_KILL;
627         }
628
629         mpio->pgpath = pgpath;
630
631         bio->bi_status = 0;
632         bio_set_dev(bio, pgpath->path.dev->bdev);
633         bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
634
635         if (pgpath->pg->ps.type->start_io)
636                 pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
637                                               &pgpath->path,
638                                               mpio->nr_bytes);
639         return DM_MAPIO_REMAPPED;
640 }
641
642 static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
643 {
644         struct multipath *m = ti->private;
645         struct dm_mpath_io *mpio = NULL;
646
647         multipath_init_per_bio_data(bio, &mpio);
648         return __multipath_map_bio(m, bio, mpio);
649 }
650
651 static void process_queued_io_list(struct multipath *m)
652 {
653         if (m->queue_mode == DM_TYPE_REQUEST_BASED)
654                 dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table));
655         else if (m->queue_mode == DM_TYPE_BIO_BASED)
656                 queue_work(kmultipathd, &m->process_queued_bios);
657 }
658
659 static void process_queued_bios(struct work_struct *work)
660 {
661         int r;
662         unsigned long flags;
663         struct bio *bio;
664         struct bio_list bios;
665         struct blk_plug plug;
666         struct multipath *m =
667                 container_of(work, struct multipath, process_queued_bios);
668
669         bio_list_init(&bios);
670
671         spin_lock_irqsave(&m->lock, flags);
672
673         if (bio_list_empty(&m->queued_bios)) {
674                 spin_unlock_irqrestore(&m->lock, flags);
675                 return;
676         }
677
678         bio_list_merge(&bios, &m->queued_bios);
679         bio_list_init(&m->queued_bios);
680
681         spin_unlock_irqrestore(&m->lock, flags);
682
683         blk_start_plug(&plug);
684         while ((bio = bio_list_pop(&bios))) {
685                 struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
686                 dm_bio_restore(get_bio_details_from_mpio(mpio), bio);
687                 r = __multipath_map_bio(m, bio, mpio);
688                 switch (r) {
689                 case DM_MAPIO_KILL:
690                         bio->bi_status = BLK_STS_IOERR;
691                         bio_endio(bio);
692                         break;
693                 case DM_MAPIO_REQUEUE:
694                         bio->bi_status = BLK_STS_DM_REQUEUE;
695                         bio_endio(bio);
696                         break;
697                 case DM_MAPIO_REMAPPED:
698                         generic_make_request(bio);
699                         break;
700                 case DM_MAPIO_SUBMITTED:
701                         break;
702                 default:
703                         WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r);
704                 }
705         }
706         blk_finish_plug(&plug);
707 }
708
709 /*
710  * If we run out of usable paths, should we queue I/O or error it?
711  */
712 static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
713                             bool save_old_value)
714 {
715         unsigned long flags;
716
717         spin_lock_irqsave(&m->lock, flags);
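        /*
         * With save_old_value set, SAVED_QUEUE_IF_NO_PATH remembers the
         * current QUEUE_IF_NO_PATH setting (the state saved across a
         * suspend/resume cycle).  Otherwise both bits track the new value,
         * so __must_push_back() sees them equal outside that window.
         */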
718         assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags,
719                    (save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) ||
720                    (!save_old_value && queue_if_no_path));
721         assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path);
722         spin_unlock_irqrestore(&m->lock, flags);
723
724         if (!queue_if_no_path) {
725                 dm_table_run_md_queue_async(m->ti->table);
726                 process_queued_io_list(m);
727         }
728
729         return 0;
730 }
731
732 /*
733  * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
734  * process any queued I/O.
735  */
736 static void queue_if_no_path_timeout_work(struct timer_list *t)
737 {
738         struct multipath *m = from_timer(m, t, nopath_timer);
739         struct mapped_device *md = dm_table_get_md(m->ti->table);
740
741         DMWARN("queue_if_no_path timeout on %s, failing queued IO", dm_device_name(md));
742         queue_if_no_path(m, false, false);
743 }
744
745 /*
746  * Enable the queue_if_no_path timeout if necessary.
747  * Called with m->lock held.
748  */
749 static void enable_nopath_timeout(struct multipath *m)
750 {
751         unsigned long queue_if_no_path_timeout =
752                 READ_ONCE(queue_if_no_path_timeout_secs) * HZ;
753
754         lockdep_assert_held(&m->lock);
755
756         if (queue_if_no_path_timeout > 0 &&
757             atomic_read(&m->nr_valid_paths) == 0 &&
758             test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
759                 mod_timer(&m->nopath_timer,
760                           jiffies + queue_if_no_path_timeout);
761         }
762 }
763
764 static void disable_nopath_timeout(struct multipath *m)
765 {
766         del_timer_sync(&m->nopath_timer);
767 }
768
769 /*
770  * An event is triggered whenever a path is taken out of use.
771  * Includes path failure and PG bypass.
772  */
773 static void trigger_event(struct work_struct *work)
774 {
775         struct multipath *m =
776                 container_of(work, struct multipath, trigger_event);
777
778         dm_table_event(m->ti->table);
779 }
780
781 /*-----------------------------------------------------------------
782  * Constructor/argument parsing:
783  * <#multipath feature args> [<arg>]*
784  * <#hw_handler args> [hw_handler [<arg>]*]
785  * <#priority groups>
786  * <initial priority group>
787  *     [<selector> <#selector args> [<arg>]*
788  *      <#paths> <#per-path selector args>
789  *         [<path> [<arg>]* ]+ ]+
790  *---------------------------------------------------------------*/
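/*
 * Illustrative example only (device numbers and values are made up): a table
 * line with no feature or hardware handler args, a single priority group
 * using the round-robin selector with two paths and a per-path repeat_count
 * of 100 might look like:
 *
 *     0 0 1 1 round-robin 0 2 1 8:16 100 8:32 100
 *
 * and could be loaded with e.g.:
 *
 *     dmsetup create mpath0 --table "0 1024000 multipath 0 0 1 1 round-robin 0 2 1 8:16 100 8:32 100"
 */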
791 static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
792                                struct dm_target *ti)
793 {
794         int r;
795         struct path_selector_type *pst;
796         unsigned ps_argc;
797
798         static const struct dm_arg _args[] = {
799                 {0, 1024, "invalid number of path selector args"},
800         };
801
802         pst = dm_get_path_selector(dm_shift_arg(as));
803         if (!pst) {
804                 ti->error = "unknown path selector type";
805                 return -EINVAL;
806         }
807
808         r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
809         if (r) {
810                 dm_put_path_selector(pst);
811                 return -EINVAL;
812         }
813
814         r = pst->create(&pg->ps, ps_argc, as->argv);
815         if (r) {
816                 dm_put_path_selector(pst);
817                 ti->error = "path selector constructor failed";
818                 return r;
819         }
820
821         pg->ps.type = pst;
822         dm_consume_args(as, ps_argc);
823
824         return 0;
825 }
826
827 static int setup_scsi_dh(struct block_device *bdev, struct multipath *m,
828                          const char **attached_handler_name, char **error)
829 {
830         struct request_queue *q = bdev_get_queue(bdev);
831         int r;
832
833         if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) {
834 retain:
835                 if (*attached_handler_name) {
836                         /*
837                          * Clear any hw_handler_params associated with a
838                          * handler that isn't already attached.
839                          */
840                         if (m->hw_handler_name && strcmp(*attached_handler_name, m->hw_handler_name)) {
841                                 kfree(m->hw_handler_params);
842                                 m->hw_handler_params = NULL;
843                         }
844
845                         /*
846                          * Reset hw_handler_name to match the attached handler
847                          *
848                          * NB. This modifies the table line to show the actual
849                          * handler instead of the original table passed in.
850                          */
851                         kfree(m->hw_handler_name);
852                         m->hw_handler_name = *attached_handler_name;
853                         *attached_handler_name = NULL;
854                 }
855         }
856
857         if (m->hw_handler_name) {
858                 r = scsi_dh_attach(q, m->hw_handler_name);
859                 if (r == -EBUSY) {
860                         char b[BDEVNAME_SIZE];
861
862                         printk(KERN_INFO "dm-mpath: retaining handler on device %s\n",
863                                bdevname(bdev, b));
864                         goto retain;
865                 }
866                 if (r < 0) {
867                         *error = "error attaching hardware handler";
868                         return r;
869                 }
870
871                 if (m->hw_handler_params) {
872                         r = scsi_dh_set_params(q, m->hw_handler_params);
873                         if (r < 0) {
874                                 *error = "unable to set hardware handler parameters";
875                                 return r;
876                         }
877                 }
878         }
879
880         return 0;
881 }
882
883 static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
884                                  struct dm_target *ti)
885 {
886         int r;
887         struct pgpath *p;
888         struct multipath *m = ti->private;
889         struct request_queue *q;
890         const char *attached_handler_name = NULL;
891
892         /* we need at least a path arg */
893         if (as->argc < 1) {
894                 ti->error = "no device given";
895                 return ERR_PTR(-EINVAL);
896         }
897
898         p = alloc_pgpath();
899         if (!p)
900                 return ERR_PTR(-ENOMEM);
901
902         r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
903                           &p->path.dev);
904         if (r) {
905                 ti->error = "error getting device";
906                 goto bad;
907         }
908
909         q = bdev_get_queue(p->path.dev->bdev);
910         attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
911         if (attached_handler_name || m->hw_handler_name) {
912                 INIT_DELAYED_WORK(&p->activate_path, activate_path_work);
913                 r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error);
914                 kfree(attached_handler_name);
915                 if (r) {
916                         dm_put_device(ti, p->path.dev);
917                         goto bad;
918                 }
919         }
920
921         r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
922         if (r) {
923                 dm_put_device(ti, p->path.dev);
924                 goto bad;
925         }
926
927         return p;
928  bad:
929         free_pgpath(p);
930         return ERR_PTR(r);
931 }
932
933 static struct priority_group *parse_priority_group(struct dm_arg_set *as,
934                                                    struct multipath *m)
935 {
936         static const struct dm_arg _args[] = {
937                 {1, 1024, "invalid number of paths"},
938                 {0, 1024, "invalid number of selector args"}
939         };
940
941         int r;
942         unsigned i, nr_selector_args, nr_args;
943         struct priority_group *pg;
944         struct dm_target *ti = m->ti;
945
946         if (as->argc < 2) {
947                 as->argc = 0;
948                 ti->error = "not enough priority group arguments";
949                 return ERR_PTR(-EINVAL);
950         }
951
952         pg = alloc_priority_group();
953         if (!pg) {
954                 ti->error = "couldn't allocate priority group";
955                 return ERR_PTR(-ENOMEM);
956         }
957         pg->m = m;
958
959         r = parse_path_selector(as, pg, ti);
960         if (r)
961                 goto bad;
962
963         /*
964          * read the paths
965          */
966         r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
967         if (r)
968                 goto bad;
969
970         r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
971         if (r)
972                 goto bad;
973
974         nr_args = 1 + nr_selector_args;
975         for (i = 0; i < pg->nr_pgpaths; i++) {
976                 struct pgpath *pgpath;
977                 struct dm_arg_set path_args;
978
979                 if (as->argc < nr_args) {
980                         ti->error = "not enough path parameters";
981                         r = -EINVAL;
982                         goto bad;
983                 }
984
985                 path_args.argc = nr_args;
986                 path_args.argv = as->argv;
987
988                 pgpath = parse_path(&path_args, &pg->ps, ti);
989                 if (IS_ERR(pgpath)) {
990                         r = PTR_ERR(pgpath);
991                         goto bad;
992                 }
993
994                 pgpath->pg = pg;
995                 list_add_tail(&pgpath->list, &pg->pgpaths);
996                 dm_consume_args(as, nr_args);
997         }
998
999         return pg;
1000
1001  bad:
1002         free_priority_group(pg, ti);
1003         return ERR_PTR(r);
1004 }
1005
1006 static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
1007 {
1008         unsigned hw_argc;
1009         int ret;
1010         struct dm_target *ti = m->ti;
1011
1012         static const struct dm_arg _args[] = {
1013                 {0, 1024, "invalid number of hardware handler args"},
1014         };
1015
1016         if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
1017                 return -EINVAL;
1018
1019         if (!hw_argc)
1020                 return 0;
1021
1022         if (m->queue_mode == DM_TYPE_BIO_BASED) {
1023                 dm_consume_args(as, hw_argc);
1024                 DMERR("bio-based multipath doesn't allow hardware handler args");
1025                 return 0;
1026         }
1027
1028         m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
1029         if (!m->hw_handler_name)
1030                 return -EINVAL;
1031
1032         if (hw_argc > 1) {
1033                 char *p;
1034                 int i, j, len = 4;
1035
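                /*
                 * Pack the handler parameters into a single buffer: the
                 * argument count is written first, then each argument, with
                 * the strings laid out back to back (each sprintf()
                 * NUL-terminates its string and p is advanced past that
                 * terminator).
                 */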
1036                 for (i = 0; i <= hw_argc - 2; i++)
1037                         len += strlen(as->argv[i]) + 1;
1038                 p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
1039                 if (!p) {
1040                         ti->error = "memory allocation failed";
1041                         ret = -ENOMEM;
1042                         goto fail;
1043                 }
1044                 j = sprintf(p, "%d", hw_argc - 1);
1045                 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
1046                         j = sprintf(p, "%s", as->argv[i]);
1047         }
1048         dm_consume_args(as, hw_argc - 1);
1049
1050         return 0;
1051 fail:
1052         kfree(m->hw_handler_name);
1053         m->hw_handler_name = NULL;
1054         return ret;
1055 }
1056
1057 static int parse_features(struct dm_arg_set *as, struct multipath *m)
1058 {
1059         int r;
1060         unsigned argc;
1061         struct dm_target *ti = m->ti;
1062         const char *arg_name;
1063
1064         static const struct dm_arg _args[] = {
1065                 {0, 8, "invalid number of feature args"},
1066                 {1, 50, "pg_init_retries must be between 1 and 50"},
1067                 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
1068         };
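        /*
         * Illustrative feature clause only (argument count followed by the
         * feature args), e.g.:
         *
         *     3 queue_if_no_path pg_init_retries 3
         */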
1069
1070         r = dm_read_arg_group(_args, as, &argc, &ti->error);
1071         if (r)
1072                 return -EINVAL;
1073
1074         if (!argc)
1075                 return 0;
1076
1077         do {
1078                 arg_name = dm_shift_arg(as);
1079                 argc--;
1080
1081                 if (!strcasecmp(arg_name, "queue_if_no_path")) {
1082                         r = queue_if_no_path(m, true, false);
1083                         continue;
1084                 }
1085
1086                 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
1087                         set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
1088                         continue;
1089                 }
1090
1091                 if (!strcasecmp(arg_name, "pg_init_retries") &&
1092                     (argc >= 1)) {
1093                         r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
1094                         argc--;
1095                         continue;
1096                 }
1097
1098                 if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
1099                     (argc >= 1)) {
1100                         r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
1101                         argc--;
1102                         continue;
1103                 }
1104
1105                 if (!strcasecmp(arg_name, "queue_mode") &&
1106                     (argc >= 1)) {
1107                         const char *queue_mode_name = dm_shift_arg(as);
1108
1109                         if (!strcasecmp(queue_mode_name, "bio"))
1110                                 m->queue_mode = DM_TYPE_BIO_BASED;
1111                         else if (!strcasecmp(queue_mode_name, "rq") ||
1112                                  !strcasecmp(queue_mode_name, "mq"))
1113                                 m->queue_mode = DM_TYPE_REQUEST_BASED;
1114                         else {
1115                                 ti->error = "Unknown 'queue_mode' requested";
1116                                 r = -EINVAL;
1117                         }
1118                         argc--;
1119                         continue;
1120                 }
1121
1122                 ti->error = "Unrecognised multipath feature request";
1123                 r = -EINVAL;
1124         } while (argc && !r);
1125
1126         return r;
1127 }
1128
1129 static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
1130 {
1131         /* target arguments */
1132         static const struct dm_arg _args[] = {
1133                 {0, 1024, "invalid number of priority groups"},
1134                 {0, 1024, "invalid initial priority group number"},
1135         };
1136
1137         int r;
1138         struct multipath *m;
1139         struct dm_arg_set as;
1140         unsigned pg_count = 0;
1141         unsigned next_pg_num;
1142         unsigned long flags;
1143
1144         as.argc = argc;
1145         as.argv = argv;
1146
1147         m = alloc_multipath(ti);
1148         if (!m) {
1149                 ti->error = "can't allocate multipath";
1150                 return -EINVAL;
1151         }
1152
1153         r = parse_features(&as, m);
1154         if (r)
1155                 goto bad;
1156
1157         r = alloc_multipath_stage2(ti, m);
1158         if (r)
1159                 goto bad;
1160
1161         r = parse_hw_handler(&as, m);
1162         if (r)
1163                 goto bad;
1164
1165         r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
1166         if (r)
1167                 goto bad;
1168
1169         r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
1170         if (r)
1171                 goto bad;
1172
1173         if ((!m->nr_priority_groups && next_pg_num) ||
1174             (m->nr_priority_groups && !next_pg_num)) {
1175                 ti->error = "invalid initial priority group";
1176                 r = -EINVAL;
1177                 goto bad;
1178         }
1179
1180         /* parse the priority groups */
1181         while (as.argc) {
1182                 struct priority_group *pg;
1183                 unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths);
1184
1185                 pg = parse_priority_group(&as, m);
1186                 if (IS_ERR(pg)) {
1187                         r = PTR_ERR(pg);
1188                         goto bad;
1189                 }
1190
1191                 nr_valid_paths += pg->nr_pgpaths;
1192                 atomic_set(&m->nr_valid_paths, nr_valid_paths);
1193
1194                 list_add_tail(&pg->list, &m->priority_groups);
1195                 pg_count++;
1196                 pg->pg_num = pg_count;
1197                 if (!--next_pg_num)
1198                         m->next_pg = pg;
1199         }
1200
1201         if (pg_count != m->nr_priority_groups) {
1202                 ti->error = "priority group count mismatch";
1203                 r = -EINVAL;
1204                 goto bad;
1205         }
1206
1207         spin_lock_irqsave(&m->lock, flags);
1208         enable_nopath_timeout(m);
1209         spin_unlock_irqrestore(&m->lock, flags);
1210
1211         ti->num_flush_bios = 1;
1212         ti->num_discard_bios = 1;
1213         ti->num_write_same_bios = 1;
1214         ti->num_write_zeroes_bios = 1;
1215         if (m->queue_mode == DM_TYPE_BIO_BASED)
1216                 ti->per_io_data_size = multipath_per_bio_data_size();
1217         else
1218                 ti->per_io_data_size = sizeof(struct dm_mpath_io);
1219
1220         return 0;
1221
1222  bad:
1223         free_multipath(m);
1224         return r;
1225 }
1226
1227 static void multipath_wait_for_pg_init_completion(struct multipath *m)
1228 {
1229         DEFINE_WAIT(wait);
1230
1231         while (1) {
1232                 prepare_to_wait(&m->pg_init_wait, &wait, TASK_UNINTERRUPTIBLE);
1233
1234                 if (!atomic_read(&m->pg_init_in_progress))
1235                         break;
1236
1237                 io_schedule();
1238         }
1239         finish_wait(&m->pg_init_wait, &wait);
1240 }
1241
1242 static void flush_multipath_work(struct multipath *m)
1243 {
1244         if (m->hw_handler_name) {
1245                 set_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
1246                 smp_mb__after_atomic();
1247
1248                 if (atomic_read(&m->pg_init_in_progress))
1249                         flush_workqueue(kmpath_handlerd);
1250                 multipath_wait_for_pg_init_completion(m);
1251
1252                 clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
1253                 smp_mb__after_atomic();
1254         }
1255
1256         if (m->queue_mode == DM_TYPE_BIO_BASED)
1257                 flush_work(&m->process_queued_bios);
1258         flush_work(&m->trigger_event);
1259 }
1260
1261 static void multipath_dtr(struct dm_target *ti)
1262 {
1263         struct multipath *m = ti->private;
1264
1265         disable_nopath_timeout(m);
1266         flush_multipath_work(m);
1267         free_multipath(m);
1268 }
1269
1270 /*
1271  * Take a path out of use.
1272  */
1273 static int fail_path(struct pgpath *pgpath)
1274 {
1275         unsigned long flags;
1276         struct multipath *m = pgpath->pg->m;
1277
1278         spin_lock_irqsave(&m->lock, flags);
1279
1280         if (!pgpath->is_active)
1281                 goto out;
1282
1283         DMWARN("Failing path %s.", pgpath->path.dev->name);
1284
1285         pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
1286         pgpath->is_active = false;
1287         pgpath->fail_count++;
1288
1289         atomic_dec(&m->nr_valid_paths);
1290
1291         if (pgpath == m->current_pgpath)
1292                 m->current_pgpath = NULL;
1293
1294         dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
1295                        pgpath->path.dev->name, atomic_read(&m->nr_valid_paths));
1296
1297         schedule_work(&m->trigger_event);
1298
1299         enable_nopath_timeout(m);
1300
1301 out:
1302         spin_unlock_irqrestore(&m->lock, flags);
1303
1304         return 0;
1305 }
1306
1307 /*
1308  * Reinstate a previously-failed path
1309  */
1310 static int reinstate_path(struct pgpath *pgpath)
1311 {
1312         int r = 0, run_queue = 0;
1313         unsigned long flags;
1314         struct multipath *m = pgpath->pg->m;
1315         unsigned nr_valid_paths;
1316
1317         spin_lock_irqsave(&m->lock, flags);
1318
1319         if (pgpath->is_active)
1320                 goto out;
1321
1322         DMWARN("Reinstating path %s.", pgpath->path.dev->name);
1323
1324         r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
1325         if (r)
1326                 goto out;
1327
1328         pgpath->is_active = true;
1329
1330         nr_valid_paths = atomic_inc_return(&m->nr_valid_paths);
1331         if (nr_valid_paths == 1) {
1332                 m->current_pgpath = NULL;
1333                 run_queue = 1;
1334         } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1335                 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1336                         atomic_inc(&m->pg_init_in_progress);
1337         }
1338
1339         dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
1340                        pgpath->path.dev->name, nr_valid_paths);
1341
1342         schedule_work(&m->trigger_event);
1343
1344 out:
1345         spin_unlock_irqrestore(&m->lock, flags);
1346         if (run_queue) {
1347                 dm_table_run_md_queue_async(m->ti->table);
1348                 process_queued_io_list(m);
1349         }
1350
1351         if (pgpath->is_active)
1352                 disable_nopath_timeout(m);
1353
1354         return r;
1355 }
1356
1357 /*
1358  * Fail or reinstate all paths that match the provided struct dm_dev.
1359  */
1360 static int action_dev(struct multipath *m, struct dm_dev *dev,
1361                       action_fn action)
1362 {
1363         int r = -EINVAL;
1364         struct pgpath *pgpath;
1365         struct priority_group *pg;
1366
1367         list_for_each_entry(pg, &m->priority_groups, list) {
1368                 list_for_each_entry(pgpath, &pg->pgpaths, list) {
1369                         if (pgpath->path.dev == dev)
1370                                 r = action(pgpath);
1371                 }
1372         }
1373
1374         return r;
1375 }
1376
1377 /*
1378  * Temporarily try to avoid having to use the specified PG
1379  */
1380 static void bypass_pg(struct multipath *m, struct priority_group *pg,
1381                       bool bypassed)
1382 {
1383         unsigned long flags;
1384
1385         spin_lock_irqsave(&m->lock, flags);
1386
1387         pg->bypassed = bypassed;
1388         m->current_pgpath = NULL;
1389         m->current_pg = NULL;
1390
1391         spin_unlock_irqrestore(&m->lock, flags);
1392
1393         schedule_work(&m->trigger_event);
1394 }
1395
1396 /*
1397  * Switch to using the specified PG from the next I/O that gets mapped
1398  */
1399 static int switch_pg_num(struct multipath *m, const char *pgstr)
1400 {
1401         struct priority_group *pg;
1402         unsigned pgnum;
1403         unsigned long flags;
1404         char dummy;
1405
1406         if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1407             !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
1408                 DMWARN("invalid PG number supplied to switch_pg_num");
1409                 return -EINVAL;
1410         }
1411
1412         spin_lock_irqsave(&m->lock, flags);
1413         list_for_each_entry(pg, &m->priority_groups, list) {
1414                 pg->bypassed = false;
1415                 if (--pgnum)
1416                         continue;
1417
1418                 m->current_pgpath = NULL;
1419                 m->current_pg = NULL;
1420                 m->next_pg = pg;
1421         }
1422         spin_unlock_irqrestore(&m->lock, flags);
1423
1424         schedule_work(&m->trigger_event);
1425         return 0;
1426 }
1427
1428 /*
1429  * Set/clear bypassed status of a PG.
1430  * PGs are numbered upwards from 1 in the order they were declared.
1431  */
1432 static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
1433 {
1434         struct priority_group *pg;
1435         unsigned pgnum;
1436         char dummy;
1437
1438         if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1439             !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
1440                 DMWARN("invalid PG number supplied to bypass_pg");
1441                 return -EINVAL;
1442         }
1443
1444         list_for_each_entry(pg, &m->priority_groups, list) {
1445                 if (!--pgnum)
1446                         break;
1447         }
1448
1449         bypass_pg(m, pg, bypassed);
1450         return 0;
1451 }
1452
1453 /*
1454  * Should we retry pg_init immediately?
1455  */
1456 static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1457 {
1458         unsigned long flags;
1459         bool limit_reached = false;
1460
1461         spin_lock_irqsave(&m->lock, flags);
1462
1463         if (atomic_read(&m->pg_init_count) <= m->pg_init_retries &&
1464             !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
1465                 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
1466         else
1467                 limit_reached = true;
1468
1469         spin_unlock_irqrestore(&m->lock, flags);
1470
1471         return limit_reached;
1472 }
1473
1474 static void pg_init_done(void *data, int errors)
1475 {
1476         struct pgpath *pgpath = data;
1477         struct priority_group *pg = pgpath->pg;
1478         struct multipath *m = pg->m;
1479         unsigned long flags;
1480         bool delay_retry = false;
1481
1482         /* device or driver problems */
1483         switch (errors) {
1484         case SCSI_DH_OK:
1485                 break;
1486         case SCSI_DH_NOSYS:
1487                 if (!m->hw_handler_name) {
1488                         errors = 0;
1489                         break;
1490                 }
1491                 DMERR("Could not failover the device: Handler scsi_dh_%s "
1492                       "Error %d.", m->hw_handler_name, errors);
1493                 /*
1494                  * Fail path for now, so we do not ping pong
1495                  */
1496                 fail_path(pgpath);
1497                 break;
1498         case SCSI_DH_DEV_TEMP_BUSY:
1499                 /*
1500                  * Probably doing something like FW upgrade on the
1501                  * controller so try the other pg.
1502                  */
1503                 bypass_pg(m, pg, true);
1504                 break;
1505         case SCSI_DH_RETRY:
1506                 /* Wait before retrying. */
1507                 delay_retry = true;
1508                 /* fall through */
1509         case SCSI_DH_IMM_RETRY:
1510         case SCSI_DH_RES_TEMP_UNAVAIL:
1511                 if (pg_init_limit_reached(m, pgpath))
1512                         fail_path(pgpath);
1513                 errors = 0;
1514                 break;
1515         case SCSI_DH_DEV_OFFLINED:
1516         default:
1517                 /*
1518                  * We probably do not want to fail the path for a device
1519                  * error, but this is what the old dm did. In future
1520                  * patches we can do more advanced handling.
1521                  */
1522                 fail_path(pgpath);
1523         }
1524
1525         spin_lock_irqsave(&m->lock, flags);
1526         if (errors) {
1527                 if (pgpath == m->current_pgpath) {
1528                         DMERR("Could not failover device. Error %d.", errors);
1529                         m->current_pgpath = NULL;
1530                         m->current_pg = NULL;
1531                 }
1532         } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
1533                 pg->bypassed = false;
1534
1535         if (atomic_dec_return(&m->pg_init_in_progress) > 0)
1536                 /* Activations of other paths are still ongoing */
1537                 goto out;
1538
1539         if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
1540                 if (delay_retry)
1541                         set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
1542                 else
1543                         clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
1544
1545                 if (__pg_init_all_paths(m))
1546                         goto out;
1547         }
1548         clear_bit(MPATHF_QUEUE_IO, &m->flags);
1549
1550         process_queued_io_list(m);
1551
1552         /*
1553          * Wake up any thread waiting to suspend.
1554          */
1555         wake_up(&m->pg_init_wait);
1556
1557 out:
1558         spin_unlock_irqrestore(&m->lock, flags);
1559 }
1560
1561 static void activate_or_offline_path(struct pgpath *pgpath)
1562 {
1563         struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1564
1565         if (pgpath->is_active && !blk_queue_dying(q))
1566                 scsi_dh_activate(q, pg_init_done, pgpath);
1567         else
1568                 pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
1569 }
1570
1571 static void activate_path_work(struct work_struct *work)
1572 {
1573         struct pgpath *pgpath =
1574                 container_of(work, struct pgpath, activate_path.work);
1575
1576         activate_or_offline_path(pgpath);
1577 }
1578
1579 static int multipath_end_io(struct dm_target *ti, struct request *clone,
1580                             blk_status_t error, union map_info *map_context)
1581 {
1582         struct dm_mpath_io *mpio = get_mpio(map_context);
1583         struct pgpath *pgpath = mpio->pgpath;
1584         int r = DM_ENDIO_DONE;
1585
1586         /*
1587          * We don't queue any clone request inside the multipath target
1588          * during end I/O handling, since those clone requests don't have
1589          * bio clones.  If we queue them inside the multipath target,
1590          * we would need to make bio clones, which requires memory allocation.
1591          * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests
1592          *  don't have bio clones.)
1593          * Instead of queueing the clone request here, we queue the original
1594          * request into dm core, which will remake a clone request and
1595          * clone bios for it and resubmit it later.
1596          */
1597         if (error && blk_path_error(error)) {
1598                 struct multipath *m = ti->private;
1599
1600                 if (error == BLK_STS_RESOURCE)
1601                         r = DM_ENDIO_DELAY_REQUEUE;
1602                 else
1603                         r = DM_ENDIO_REQUEUE;
1604
1605                 if (pgpath)
1606                         fail_path(pgpath);
1607
1608                 if (atomic_read(&m->nr_valid_paths) == 0 &&
1609                     !must_push_back_rq(m)) {
1610                         if (error == BLK_STS_IOERR)
1611                                 dm_report_EIO(m);
1612                         /* complete with the original error */
1613                         r = DM_ENDIO_DONE;
1614                 }
1615         }
1616
1617         if (pgpath) {
1618                 struct path_selector *ps = &pgpath->pg->ps;
1619
1620                 if (ps->type->end_io)
1621                         ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
1622                                          clone->io_start_time_ns);
1623         }
1624
1625         return r;
1626 }
1627
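/*
 * bio-based counterpart of multipath_end_io().  Rather than asking dm
 * core to requeue, failed bios are put back on m->queued_bios and the
 * process_queued_bios work is kicked so they can be retried on another
 * path.
 */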
1628 static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
1629                                 blk_status_t *error)
1630 {
1631         struct multipath *m = ti->private;
1632         struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
1633         struct pgpath *pgpath = mpio->pgpath;
1634         unsigned long flags;
1635         int r = DM_ENDIO_DONE;
1636
1637         if (!*error || !blk_path_error(*error))
1638                 goto done;
1639
1640         if (pgpath)
1641                 fail_path(pgpath);
1642
1643         if (atomic_read(&m->nr_valid_paths) == 0 &&
1644             !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
1645                 if (must_push_back_bio(m)) {
1646                         r = DM_ENDIO_REQUEUE;
1647                 } else {
1648                         dm_report_EIO(m);
1649                         *error = BLK_STS_IOERR;
1650                 }
1651                 goto done;
1652         }
1653
1654         spin_lock_irqsave(&m->lock, flags);
1655         bio_list_add(&m->queued_bios, clone);
1656         spin_unlock_irqrestore(&m->lock, flags);
1657         if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
1658                 queue_work(kmultipathd, &m->process_queued_bios);
1659
1660         r = DM_ENDIO_INCOMPLETE;
1661 done:
1662         if (pgpath) {
1663                 struct path_selector *ps = &pgpath->pg->ps;
1664
1665                 if (ps->type->end_io)
1666                         ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
1667                                          dm_start_time_ns_from_clone(clone));
1668         }
1669
1670         return r;
1671 }
1672
1673 /*
1674  * Suspend can't complete until all the I/O is processed, so if
1675  * the last path fails we must error any remaining I/O.
1676  * Note that if freeze_bdev() fails while suspending, the
1677  * queue_if_no_path state is lost; userspace should reset it.
1678  */
1679 static void multipath_presuspend(struct dm_target *ti)
1680 {
1681         struct multipath *m = ti->private;
1682
1683         queue_if_no_path(m, false, true);
1684 }
1685
1686 static void multipath_postsuspend(struct dm_target *ti)
1687 {
1688         struct multipath *m = ti->private;
1689
1690         mutex_lock(&m->work_mutex);
1691         flush_multipath_work(m);
1692         mutex_unlock(&m->work_mutex);
1693 }
1694
1695 /*
1696  * Restore the queue_if_no_path setting.
1697  */
1698 static void multipath_resume(struct dm_target *ti)
1699 {
1700         struct multipath *m = ti->private;
1701         unsigned long flags;
1702
1703         spin_lock_irqsave(&m->lock, flags);
1704         assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags,
1705                    test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags));
1706         spin_unlock_irqrestore(&m->lock, flags);
1707 }
1708
1709 /*
1710  * Info output has the following format:
1711  * num_multipath_feature_args [multipath_feature_args]*
1712  * num_handler_status_args [handler_status_args]*
1713  * num_groups init_group_number
1714  *            [A|D|E num_ps_status_args [ps_status_args]*
1715  *             num_paths num_selector_args
1716  *             [path_dev A|F fail_count [selector_args]* ]+ ]+
1717  *
1718  * Table output has the following format (identical to the constructor string):
1719  * num_feature_args [features_args]*
1720  * num_handler_args hw_handler [hw_handler_args]*
1721  * num_groups init_group_number
1722  *     [priority selector-name num_ps_args [ps_args]*
1723  *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1724  */
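/*
 * For example, a hypothetical (made-up) table line for a map with the
 * queue_if_no_path feature, no hardware handler and a single
 * round-robin priority group of two paths (round-robin takes one
 * selector arg per path, the repeat_count) could look like:
 *
 *     1 queue_if_no_path 0 1 1 round-robin 0 2 1 8:16 1000 8:32 1000
 */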
1725 static void multipath_status(struct dm_target *ti, status_type_t type,
1726                              unsigned status_flags, char *result, unsigned maxlen)
1727 {
1728         int sz = 0;
1729         unsigned long flags;
1730         struct multipath *m = ti->private;
1731         struct priority_group *pg;
1732         struct pgpath *p;
1733         unsigned pg_num;
1734         char state;
1735
1736         spin_lock_irqsave(&m->lock, flags);
1737
1738         /* Features */
1739         if (type == STATUSTYPE_INFO)
1740                 DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags),
1741                        atomic_read(&m->pg_init_count));
1742         else {
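                /*
                 * Feature argument count: flag-only features contribute
                 * one argument each; features that take a value
                 * (pg_init_retries, pg_init_delay_msecs, queue_mode)
                 * contribute two.
                 */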
1743                 DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
1744                               (m->pg_init_retries > 0) * 2 +
1745                               (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
1746                               test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) +
1747                               (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2);
1748
1749                 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1750                         DMEMIT("queue_if_no_path ");
1751                 if (m->pg_init_retries)
1752                         DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1753                 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1754                         DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1755                 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
1756                         DMEMIT("retain_attached_hw_handler ");
1757                 if (m->queue_mode != DM_TYPE_REQUEST_BASED) {
1758                         switch (m->queue_mode) {
1759                         case DM_TYPE_BIO_BASED:
1760                                 DMEMIT("queue_mode bio ");
1761                                 break;
1762                         default:
1763                                 WARN_ON_ONCE(true);
1764                                 break;
1765                         }
1766                 }
1767         }
1768
1769         if (!m->hw_handler_name || type == STATUSTYPE_INFO)
1770                 DMEMIT("0 ");
1771         else
1772                 DMEMIT("1 %s ", m->hw_handler_name);
1773
1774         DMEMIT("%u ", m->nr_priority_groups);
1775
1776         if (m->next_pg)
1777                 pg_num = m->next_pg->pg_num;
1778         else if (m->current_pg)
1779                 pg_num = m->current_pg->pg_num;
1780         else
1781                 pg_num = (m->nr_priority_groups ? 1 : 0);
1782
1783         DMEMIT("%u ", pg_num);
1784
1785         switch (type) {
1786         case STATUSTYPE_INFO:
1787                 list_for_each_entry(pg, &m->priority_groups, list) {
1788                         if (pg->bypassed)
1789                                 state = 'D';    /* Disabled */
1790                         else if (pg == m->current_pg)
1791                                 state = 'A';    /* Currently Active */
1792                         else
1793                                 state = 'E';    /* Enabled */
1794
1795                         DMEMIT("%c ", state);
1796
1797                         if (pg->ps.type->status)
1798                                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1799                                                           result + sz,
1800                                                           maxlen - sz);
1801                         else
1802                                 DMEMIT("0 ");
1803
1804                         DMEMIT("%u %u ", pg->nr_pgpaths,
1805                                pg->ps.type->info_args);
1806
1807                         list_for_each_entry(p, &pg->pgpaths, list) {
1808                                 DMEMIT("%s %s %u ", p->path.dev->name,
1809                                        p->is_active ? "A" : "F",
1810                                        p->fail_count);
1811                                 if (pg->ps.type->status)
1812                                         sz += pg->ps.type->status(&pg->ps,
1813                                               &p->path, type, result + sz,
1814                                               maxlen - sz);
1815                         }
1816                 }
1817                 break;
1818
1819         case STATUSTYPE_TABLE:
1820                 list_for_each_entry(pg, &m->priority_groups, list) {
1821                         DMEMIT("%s ", pg->ps.type->name);
1822
1823                         if (pg->ps.type->status)
1824                                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1825                                                           result + sz,
1826                                                           maxlen - sz);
1827                         else
1828                                 DMEMIT("0 ");
1829
1830                         DMEMIT("%u %u ", pg->nr_pgpaths,
1831                                pg->ps.type->table_args);
1832
1833                         list_for_each_entry(p, &pg->pgpaths, list) {
1834                                 DMEMIT("%s ", p->path.dev->name);
1835                                 if (pg->ps.type->status)
1836                                         sz += pg->ps.type->status(&pg->ps,
1837                                               &p->path, type, result + sz,
1838                                               maxlen - sz);
1839                         }
1840                 }
1841                 break;
1842         }
1843
1844         spin_unlock_irqrestore(&m->lock, flags);
1845 }
1846
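/*
 * Handle messages sent through the DM message interface.  Illustrative
 * usage from userspace (device and path names are made up):
 *
 *     dmsetup message mpatha 0 queue_if_no_path
 *     dmsetup message mpatha 0 fail_path 8:32
 */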
1847 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
1848                              char *result, unsigned maxlen)
1849 {
1850         int r = -EINVAL;
1851         struct dm_dev *dev;
1852         struct multipath *m = ti->private;
1853         action_fn action;
1854         unsigned long flags;
1855
1856         mutex_lock(&m->work_mutex);
1857
1858         if (dm_suspended(ti)) {
1859                 r = -EBUSY;
1860                 goto out;
1861         }
1862
1863         if (argc == 1) {
1864                 if (!strcasecmp(argv[0], "queue_if_no_path")) {
1865                         r = queue_if_no_path(m, true, false);
1866                         spin_lock_irqsave(&m->lock, flags);
1867                         enable_nopath_timeout(m);
1868                         spin_unlock_irqrestore(&m->lock, flags);
1869                         goto out;
1870                 } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
1871                         r = queue_if_no_path(m, false, false);
1872                         disable_nopath_timeout(m);
1873                         goto out;
1874                 }
1875         }
1876
1877         if (argc != 2) {
1878                 DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
1879                 goto out;
1880         }
1881
1882         if (!strcasecmp(argv[0], "disable_group")) {
1883                 r = bypass_pg_num(m, argv[1], true);
1884                 goto out;
1885         } else if (!strcasecmp(argv[0], "enable_group")) {
1886                 r = bypass_pg_num(m, argv[1], false);
1887                 goto out;
1888         } else if (!strcasecmp(argv[0], "switch_group")) {
1889                 r = switch_pg_num(m, argv[1]);
1890                 goto out;
1891         } else if (!strcasecmp(argv[0], "reinstate_path"))
1892                 action = reinstate_path;
1893         else if (!strcasecmp(argv[0], "fail_path"))
1894                 action = fail_path;
1895         else {
1896                 DMWARN("Unrecognised multipath message received: %s", argv[0]);
1897                 goto out;
1898         }
1899
1900         r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
1901         if (r) {
1902                 DMWARN("message: error getting device %s",
1903                        argv[1]);
1904                 goto out;
1905         }
1906
1907         r = action_dev(m, dev, action);
1908
1909         dm_put_device(ti, dev);
1910
1911 out:
1912         mutex_unlock(&m->work_mutex);
1913         return r;
1914 }
1915
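/*
 * Pick the block device that ioctls should be passed through to
 * (normally the current path).  Returns -ENOTCONN while no usable path
 * is ready but I/O is still being queued, -EIO when all paths are down
 * and queue_if_no_path is off, and a positive value when the path
 * device's size does not match ti->len exactly.
 */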
1916 static int multipath_prepare_ioctl(struct dm_target *ti,
1917                                    struct block_device **bdev)
1918 {
1919         struct multipath *m = ti->private;
1920         struct pgpath *current_pgpath;
1921         int r;
1922
1923         current_pgpath = READ_ONCE(m->current_pgpath);
1924         if (!current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
1925                 current_pgpath = choose_pgpath(m, 0);
1926
1927         if (current_pgpath) {
1928                 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
1929                         *bdev = current_pgpath->path.dev->bdev;
1930                         r = 0;
1931                 } else {
1932                         /* pg_init has not started or completed */
1933                         r = -ENOTCONN;
1934                 }
1935         } else {
1936                 /* No path is available */
1937                 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1938                         r = -ENOTCONN;
1939                 else
1940                         r = -EIO;
1941         }
1942
1943         if (r == -ENOTCONN) {
1944                 if (!READ_ONCE(m->current_pg)) {
1945                         /* Path status changed, redo selection */
1946                         (void) choose_pgpath(m, 0);
1947                 }
1948                 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
1949                         pg_init_all_paths(m);
1950                 dm_table_run_md_queue_async(m->ti->table);
1951                 process_queued_io_list(m);
1952         }
1953
1954         /*
1955          * Only pass ioctls through if the device sizes match exactly.
1956          */
1957         if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
1958                 return 1;
1959         return r;
1960 }
1961
1962 static int multipath_iterate_devices(struct dm_target *ti,
1963                                      iterate_devices_callout_fn fn, void *data)
1964 {
1965         struct multipath *m = ti->private;
1966         struct priority_group *pg;
1967         struct pgpath *p;
1968         int ret = 0;
1969
1970         list_for_each_entry(pg, &m->priority_groups, list) {
1971                 list_for_each_entry(p, &pg->pgpaths, list) {
1972                         ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
1973                         if (ret)
1974                                 goto out;
1975                 }
1976         }
1977
1978 out:
1979         return ret;
1980 }
1981
1982 static int pgpath_busy(struct pgpath *pgpath)
1983 {
1984         struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1985
1986         return blk_lld_busy(q);
1987 }
1988
1989 /*
1990  * We return "busy" only when we can map I/Os but the underlying
1991  * devices are busy (so even if we map I/Os now, the I/Os will wait
1992  * on the underlying queue).
1993  * In other words, if we want to kill I/Os or queue them inside us
1994  * due to map unavailability, we don't return "busy".  Otherwise,
1995  * dm core won't give us the I/Os and we can't do what we want.
1996  */
1997 static int multipath_busy(struct dm_target *ti)
1998 {
1999         bool busy = false, has_active = false;
2000         struct multipath *m = ti->private;
2001         struct priority_group *pg, *next_pg;
2002         struct pgpath *pgpath;
2003
2004         /* pg_init in progress */
2005         if (atomic_read(&m->pg_init_in_progress))
2006                 return true;
2007
2008         /* no paths available, for blk-mq: rely on IO mapping to delay requeue */
2009         if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
2010                 return (m->queue_mode != DM_TYPE_REQUEST_BASED);
2011
2012         /* Guess which priority_group will be used at next mapping time */
2013         pg = READ_ONCE(m->current_pg);
2014         next_pg = READ_ONCE(m->next_pg);
2015         if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg))
2016                 pg = next_pg;
2017
2018         if (!pg) {
2019                 /*
2020                  * We don't know which pg will be used at the next mapping time.
2021                  * We don't call choose_pgpath() here to avoid triggering
2022                  * pg_init just by a busy check.
2023                  * So we don't know whether the underlying devices we will be
2024                  * using at the next mapping time are busy or not. Just try mapping.
2025                  */
2026                 return busy;
2027         }
2028
2029         /*
2030          * If there is at least one non-busy active path, the path selector
2031          * will be able to select it, so we consider such a pg as not busy.
2032          */
2033         busy = true;
2034         list_for_each_entry(pgpath, &pg->pgpaths, list) {
2035                 if (pgpath->is_active) {
2036                         has_active = true;
2037                         if (!pgpath_busy(pgpath)) {
2038                                 busy = false;
2039                                 break;
2040                         }
2041                 }
2042         }
2043
2044         if (!has_active) {
2045                 /*
2046                  * No active path in this pg, so this pg won't be used and
2047                  * the current_pg will be changed at the next mapping time.
2048                  * We need to try mapping to find out.
2049                  */
2050                 busy = false;
2051         }
2052
2053         return busy;
2054 }
2055
2056 /*-----------------------------------------------------------------
2057  * Module setup
2058  *---------------------------------------------------------------*/
2059 static struct target_type multipath_target = {
2060         .name = "multipath",
2061         .version = {1, 14, 0},
2062         .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE |
2063                     DM_TARGET_PASSES_INTEGRITY,
2064         .module = THIS_MODULE,
2065         .ctr = multipath_ctr,
2066         .dtr = multipath_dtr,
2067         .clone_and_map_rq = multipath_clone_and_map,
2068         .release_clone_rq = multipath_release_clone,
2069         .rq_end_io = multipath_end_io,
2070         .map = multipath_map_bio,
2071         .end_io = multipath_end_io_bio,
2072         .presuspend = multipath_presuspend,
2073         .postsuspend = multipath_postsuspend,
2074         .resume = multipath_resume,
2075         .status = multipath_status,
2076         .message = multipath_message,
2077         .prepare_ioctl = multipath_prepare_ioctl,
2078         .iterate_devices = multipath_iterate_devices,
2079         .busy = multipath_busy,
2080 };
2081
2082 static int __init dm_multipath_init(void)
2083 {
2084         int r;
2085
2086         kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
2087         if (!kmultipathd) {
2088                 DMERR("failed to create workqueue kmpathd");
2089                 r = -ENOMEM;
2090                 goto bad_alloc_kmultipathd;
2091         }
2092
2093         /*
2094          * A separate workqueue is used to handle the device handlers,
2095          * to avoid overloading the existing workqueue (kmultipathd).
2096          * Overloading that workqueue would also create a bottleneck in
2097          * the storage hardware device activation path.
2098          */
2099         kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
2100                                                   WQ_MEM_RECLAIM);
2101         if (!kmpath_handlerd) {
2102                 DMERR("failed to create workqueue kmpath_handlerd");
2103                 r = -ENOMEM;
2104                 goto bad_alloc_kmpath_handlerd;
2105         }
2106
2107         r = dm_register_target(&multipath_target);
2108         if (r < 0) {
2109                 DMERR("request-based register failed %d", r);
2110                 r = -EINVAL;
2111                 goto bad_register_target;
2112         }
2113
2114         return 0;
2115
2116 bad_register_target:
2117         destroy_workqueue(kmpath_handlerd);
2118 bad_alloc_kmpath_handlerd:
2119         destroy_workqueue(kmultipathd);
2120 bad_alloc_kmultipathd:
2121         return r;
2122 }
2123
2124 static void __exit dm_multipath_exit(void)
2125 {
2126         destroy_workqueue(kmpath_handlerd);
2127         destroy_workqueue(kmultipathd);
2128
2129         dm_unregister_target(&multipath_target);
2130 }
2131
2132 module_init(dm_multipath_init);
2133 module_exit(dm_multipath_exit);
2134
2135 module_param_named(queue_if_no_path_timeout_secs,
2136                    queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR);
2137 MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds");
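/*
 * The timeout can also be changed at runtime through sysfs, e.g.
 * (assuming the module is built as dm_multipath, so the parameter shows
 * up under /sys/module/dm_multipath/parameters/):
 *
 *     echo 120 > /sys/module/dm_multipath/parameters/queue_if_no_path_timeout_secs
 */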
2138
2139 MODULE_DESCRIPTION(DM_NAME " multipath target");
2140 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
2141 MODULE_LICENSE("GPL");