drivers/md/dm-mpath.c
1 /*
2  * Copyright (C) 2003 Sistina Software Limited.
3  * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7
8 #include <linux/device-mapper.h>
9
10 #include "dm.h"
11 #include "dm-path-selector.h"
12 #include "dm-uevent.h"
13
14 #include <linux/ctype.h>
15 #include <linux/init.h>
16 #include <linux/mempool.h>
17 #include <linux/module.h>
18 #include <linux/pagemap.h>
19 #include <linux/slab.h>
20 #include <linux/time.h>
21 #include <linux/workqueue.h>
22 #include <linux/delay.h>
23 #include <scsi/scsi_dh.h>
24 #include <linux/atomic.h>
25
26 #define DM_MSG_PREFIX "multipath"
27 #define DM_PG_INIT_DELAY_MSECS 2000
28 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
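/*
 * DM_PG_INIT_DELAY_DEFAULT is a sentinel meaning "no pg_init_delay_msecs
 * feature arg was supplied"; __pg_init_all_paths() then falls back to
 * DM_PG_INIT_DELAY_MSECS whenever a delayed pg_init retry is needed.
 */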
29
30 /* Path properties */
31 struct pgpath {
32         struct list_head list;
33
34         struct priority_group *pg;      /* Owning PG */
35         unsigned is_active;             /* Path status */
36         unsigned fail_count;            /* Cumulative failure count */
37
38         struct dm_path path;
39         struct delayed_work activate_path;
40 };
41
42 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
43
44 /*
45  * Paths are grouped into Priority Groups, which are numbered from 1 upwards.
46  * Each PG has a path selector which controls which of its paths gets used.
47  */
48 struct priority_group {
49         struct list_head list;
50
51         struct multipath *m;            /* Owning multipath instance */
52         struct path_selector ps;
53
54         unsigned pg_num;                /* Reference number */
55         unsigned bypassed;              /* Temporarily bypass this PG? */
56
57         unsigned nr_pgpaths;            /* Number of paths in PG */
58         struct list_head pgpaths;
59 };
60
61 /* Multipath context */
62 struct multipath {
63         struct list_head list;
64         struct dm_target *ti;
65
66         const char *hw_handler_name;
67         char *hw_handler_params;
68
69         spinlock_t lock;
70
71         unsigned nr_priority_groups;
72         struct list_head priority_groups;
73
74         wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
75
76         unsigned pg_init_required;      /* pg_init needs calling? */
77         unsigned pg_init_in_progress;   /* Only one pg_init allowed at once */
78         unsigned pg_init_delay_retry;   /* Delay pg_init retry? */
79
80         unsigned nr_valid_paths;        /* Total number of usable paths */
81         struct pgpath *current_pgpath;
82         struct priority_group *current_pg;
83         struct priority_group *next_pg; /* Switch to this PG if set */
84         unsigned repeat_count;          /* I/Os left before calling PS again */
85
86         unsigned queue_io:1;            /* Must we queue all I/O? */
87         unsigned queue_if_no_path:1;    /* Queue I/O if last path fails? */
88         unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
89         unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
90         unsigned pg_init_disabled:1;    /* pg_init is not currently allowed */
91
92         unsigned pg_init_retries;       /* Number of times to retry pg_init */
93         unsigned pg_init_count;         /* Number of times pg_init called */
94         unsigned pg_init_delay_msecs;   /* Number of msecs before pg_init retry */
95
96         unsigned queue_size;
97         struct work_struct process_queued_ios;
98         struct list_head queued_ios;
99
100         struct work_struct trigger_event;
101
102         /*
103          * We must use a mempool of dm_mpath_io structs so that we
104          * can resubmit bios on error.
105          */
106         mempool_t *mpio_pool;
107
108         struct mutex work_mutex;
109 };
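
/*
 * Summary of the object hierarchy above: a multipath instance owns a list
 * of priority_groups; each priority_group owns a list of pgpaths; and each
 * pgpath wraps one dm_path, i.e. one underlying block device.  The path
 * selector attached to a PG decides which of its pgpaths services the
 * next I/O.
 */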
110
111 /*
112  * Context information attached to each request we process.
113  */
114 struct dm_mpath_io {
115         struct pgpath *pgpath;
116         size_t nr_bytes;
117 };
118
119 typedef int (*action_fn) (struct pgpath *pgpath);
120
121 static struct kmem_cache *_mpio_cache;
122
123 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
124 static void process_queued_ios(struct work_struct *work);
125 static void trigger_event(struct work_struct *work);
126 static void activate_path(struct work_struct *work);
127
128
129 /*-----------------------------------------------
130  * Allocation routines
131  *-----------------------------------------------*/
132
133 static struct pgpath *alloc_pgpath(void)
134 {
135         struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
136
137         if (pgpath) {
138                 pgpath->is_active = 1;
139                 INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
140         }
141
142         return pgpath;
143 }
144
145 static void free_pgpath(struct pgpath *pgpath)
146 {
147         kfree(pgpath);
148 }
149
150 static struct priority_group *alloc_priority_group(void)
151 {
152         struct priority_group *pg;
153
154         pg = kzalloc(sizeof(*pg), GFP_KERNEL);
155
156         if (pg)
157                 INIT_LIST_HEAD(&pg->pgpaths);
158
159         return pg;
160 }
161
162 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
163 {
164         struct pgpath *pgpath, *tmp;
165         struct multipath *m = ti->private;
166
167         list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
168                 list_del(&pgpath->list);
169                 if (m->hw_handler_name)
170                         scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
171                 dm_put_device(ti, pgpath->path.dev);
172                 free_pgpath(pgpath);
173         }
174 }
175
176 static void free_priority_group(struct priority_group *pg,
177                                 struct dm_target *ti)
178 {
179         struct path_selector *ps = &pg->ps;
180
181         if (ps->type) {
182                 ps->type->destroy(ps);
183                 dm_put_path_selector(ps->type);
184         }
185
186         free_pgpaths(&pg->pgpaths, ti);
187         kfree(pg);
188 }
189
190 static struct multipath *alloc_multipath(struct dm_target *ti)
191 {
192         struct multipath *m;
193         unsigned min_ios = dm_get_reserved_rq_based_ios();
194
195         m = kzalloc(sizeof(*m), GFP_KERNEL);
196         if (m) {
197                 INIT_LIST_HEAD(&m->priority_groups);
198                 INIT_LIST_HEAD(&m->queued_ios);
199                 spin_lock_init(&m->lock);
200                 m->queue_io = 1;
201                 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
202                 INIT_WORK(&m->process_queued_ios, process_queued_ios);
203                 INIT_WORK(&m->trigger_event, trigger_event);
204                 init_waitqueue_head(&m->pg_init_wait);
205                 mutex_init(&m->work_mutex);
206                 m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
207                 if (!m->mpio_pool) {
208                         kfree(m);
209                         return NULL;
210                 }
211                 m->ti = ti;
212                 ti->private = m;
213         }
214
215         return m;
216 }
217
218 static void free_multipath(struct multipath *m)
219 {
220         struct priority_group *pg, *tmp;
221
222         list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
223                 list_del(&pg->list);
224                 free_priority_group(pg, m->ti);
225         }
226
227         kfree(m->hw_handler_name);
228         kfree(m->hw_handler_params);
229         mempool_destroy(m->mpio_pool);
230         kfree(m);
231 }
232
233 static int set_mapinfo(struct multipath *m, union map_info *info)
234 {
235         struct dm_mpath_io *mpio;
236
237         mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
238         if (!mpio)
239                 return -ENOMEM;
240
241         memset(mpio, 0, sizeof(*mpio));
242         info->ptr = mpio;
243
244         return 0;
245 }
246
247 static void clear_mapinfo(struct multipath *m, union map_info *info)
248 {
249         struct dm_mpath_io *mpio = info->ptr;
250
251         info->ptr = NULL;
252         mempool_free(mpio, m->mpio_pool);
253 }
254
255 /*-----------------------------------------------
256  * Path selection
257  *-----------------------------------------------*/
258
259 static void __pg_init_all_paths(struct multipath *m)
260 {
261         struct pgpath *pgpath;
262         unsigned long pg_init_delay = 0;
263
264         m->pg_init_count++;
265         m->pg_init_required = 0;
266         if (m->pg_init_delay_retry)
267                 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
268                                                  m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
269         list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
270                 /* Skip failed paths */
271                 if (!pgpath->is_active)
272                         continue;
273                 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
274                                        pg_init_delay))
275                         m->pg_init_in_progress++;
276         }
277 }
278
279 static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
280 {
281         m->current_pg = pgpath->pg;
282
283         /* Must we initialise the PG first, and queue I/O till it's ready? */
284         if (m->hw_handler_name) {
285                 m->pg_init_required = 1;
286                 m->queue_io = 1;
287         } else {
288                 m->pg_init_required = 0;
289                 m->queue_io = 0;
290         }
291
292         m->pg_init_count = 0;
293 }
294
295 static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
296                                size_t nr_bytes)
297 {
298         struct dm_path *path;
299
300         path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
301         if (!path)
302                 return -ENXIO;
303
304         m->current_pgpath = path_to_pgpath(path);
305
306         if (m->current_pg != pg)
307                 __switch_pg(m, m->current_pgpath);
308
309         return 0;
310 }
311
312 static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
313 {
314         struct priority_group *pg;
315         unsigned bypassed = 1;
316
317         if (!m->nr_valid_paths)
318                 goto failed;
319
320         /* Were we instructed to switch PG? */
321         if (m->next_pg) {
322                 pg = m->next_pg;
323                 m->next_pg = NULL;
324                 if (!__choose_path_in_pg(m, pg, nr_bytes))
325                         return;
326         }
327
328         /* Don't change PG until it has no remaining paths */
329         if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
330                 return;
331
332         /*
333          * Loop through priority groups until we find a valid path.
334          * First time we skip PGs marked 'bypassed'.
335          * Second time we only try the ones we skipped, but set
336          * pg_init_delay_retry so we do not hammer controllers.
337          */
338         do {
339                 list_for_each_entry(pg, &m->priority_groups, list) {
340                         if (pg->bypassed == bypassed)
341                                 continue;
342                         if (!__choose_path_in_pg(m, pg, nr_bytes)) {
343                                 if (!bypassed)
344                                         m->pg_init_delay_retry = 1;
345                                 return;
346                         }
347                 }
348         } while (bypassed--);
349
350 failed:
351         m->current_pgpath = NULL;
352         m->current_pg = NULL;
353 }
354
355 /*
356  * Check whether bios must be queued in the device-mapper core rather
357  * than here in the target.
358  *
359  * m->lock must be held on entry.
360  *
361  * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
362  * same value then we are not between multipath_presuspend()
363  * and multipath_resume() calls and we have no need to check
364  * for the DMF_NOFLUSH_SUSPENDING flag.
365  */
366 static int __must_push_back(struct multipath *m)
367 {
368         return (m->queue_if_no_path != m->saved_queue_if_no_path &&
369                 dm_noflush_suspending(m->ti));
370 }
371
372 static int map_io(struct multipath *m, struct request *clone,
373                   union map_info *map_context, unsigned was_queued)
374 {
375         int r = DM_MAPIO_REMAPPED;
376         size_t nr_bytes = blk_rq_bytes(clone);
377         unsigned long flags;
378         struct pgpath *pgpath;
379         struct block_device *bdev;
380         struct dm_mpath_io *mpio = map_context->ptr;
381
382         spin_lock_irqsave(&m->lock, flags);
383
384         /* Do we need to select a new pgpath? */
385         if (!m->current_pgpath ||
386             (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
387                 __choose_pgpath(m, nr_bytes);
388
389         pgpath = m->current_pgpath;
390
391         if (was_queued)
392                 m->queue_size--;
393
394         if (m->pg_init_required) {
395                 if (!m->pg_init_in_progress)
396                         queue_work(kmultipathd, &m->process_queued_ios);
397                 r = DM_MAPIO_REQUEUE;
398         } else if ((pgpath && m->queue_io) ||
399                    (!pgpath && m->queue_if_no_path)) {
400                 /* Queue for the daemon to resubmit */
401                 list_add_tail(&clone->queuelist, &m->queued_ios);
402                 m->queue_size++;
403                 if (!m->queue_io)
404                         queue_work(kmultipathd, &m->process_queued_ios);
405                 pgpath = NULL;
406                 r = DM_MAPIO_SUBMITTED;
407         } else if (pgpath) {
408                 bdev = pgpath->path.dev->bdev;
409                 clone->q = bdev_get_queue(bdev);
410                 clone->rq_disk = bdev->bd_disk;
411         } else if (__must_push_back(m))
412                 r = DM_MAPIO_REQUEUE;
413         else
414                 r = -EIO;       /* Failed */
415
416         mpio->pgpath = pgpath;
417         mpio->nr_bytes = nr_bytes;
418
419         if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io)
420                 pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path,
421                                               nr_bytes);
422
423         spin_unlock_irqrestore(&m->lock, flags);
424
425         return r;
426 }
427
428 /*
429  * If we run out of usable paths, should we queue I/O or error it?
430  */
431 static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
432                             unsigned save_old_value)
433 {
434         unsigned long flags;
435
436         spin_lock_irqsave(&m->lock, flags);
437
438         if (save_old_value)
439                 m->saved_queue_if_no_path = m->queue_if_no_path;
440         else
441                 m->saved_queue_if_no_path = queue_if_no_path;
442         m->queue_if_no_path = queue_if_no_path;
443         if (!m->queue_if_no_path && m->queue_size)
444                 queue_work(kmultipathd, &m->process_queued_ios);
445
446         spin_unlock_irqrestore(&m->lock, flags);
447
448         return 0;
449 }
450
451 /*-----------------------------------------------------------------
452  * The multipath daemon is responsible for resubmitting queued I/Os.
453  *---------------------------------------------------------------*/
454
455 static void dispatch_queued_ios(struct multipath *m)
456 {
457         int r;
458         unsigned long flags;
459         union map_info *info;
460         struct request *clone, *n;
461         LIST_HEAD(cl);
462
463         spin_lock_irqsave(&m->lock, flags);
464         list_splice_init(&m->queued_ios, &cl);
465         spin_unlock_irqrestore(&m->lock, flags);
466
467         list_for_each_entry_safe(clone, n, &cl, queuelist) {
468                 list_del_init(&clone->queuelist);
469
470                 info = dm_get_rq_mapinfo(clone);
471
472                 r = map_io(m, clone, info, 1);
473                 if (r < 0) {
474                         clear_mapinfo(m, info);
475                         dm_kill_unmapped_request(clone, r);
476                 } else if (r == DM_MAPIO_REMAPPED)
477                         dm_dispatch_request(clone);
478                 else if (r == DM_MAPIO_REQUEUE) {
479                         clear_mapinfo(m, info);
480                         dm_requeue_unmapped_request(clone);
481                 }
482         }
483 }
484
485 static void process_queued_ios(struct work_struct *work)
486 {
487         struct multipath *m =
488                 container_of(work, struct multipath, process_queued_ios);
489         struct pgpath *pgpath = NULL;
490         unsigned must_queue = 1;
491         unsigned long flags;
492
493         spin_lock_irqsave(&m->lock, flags);
494
495         if (!m->current_pgpath)
496                 __choose_pgpath(m, 0);
497
498         pgpath = m->current_pgpath;
499
500         if ((pgpath && !m->queue_io) ||
501             (!pgpath && !m->queue_if_no_path))
502                 must_queue = 0;
503
504         if (m->pg_init_required && !m->pg_init_in_progress && pgpath &&
505             !m->pg_init_disabled)
506                 __pg_init_all_paths(m);
507
508         spin_unlock_irqrestore(&m->lock, flags);
509         if (!must_queue)
510                 dispatch_queued_ios(m);
511 }
512
513 /*
514  * An event is triggered whenever a path is taken out of use.
515  * Includes path failure and PG bypass.
516  */
517 static void trigger_event(struct work_struct *work)
518 {
519         struct multipath *m =
520                 container_of(work, struct multipath, trigger_event);
521
522         dm_table_event(m->ti->table);
523 }
524
525 /*-----------------------------------------------------------------
526  * Constructor/argument parsing:
527  * <#multipath feature args> [<arg>]*
528  * <#hw_handler args> [hw_handler [<arg>]*]
529  * <#priority groups>
530  * <initial priority group>
531  *     [<selector> <#selector args> [<arg>]*
532  *      <#paths> <#per-path selector args>
533  *         [<path> [<arg>]* ]+ ]+
534  *---------------------------------------------------------------*/
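/*
 * Illustrative target parameter string matching the format above (device
 * numbers and repeat counts are hypothetical, not taken from this file):
 *
 *   0 0 2 1 round-robin 0 2 1 8:16 1000 8:32 1000 round-robin 0 1 1 8:48 1000
 *
 * i.e. no feature args, no hw_handler args, two priority groups starting
 * with PG 1, each PG using round-robin with no selector args and one
 * per-path selector arg (the repeat count) per path.
 */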
535 static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
536                                struct dm_target *ti)
537 {
538         int r;
539         struct path_selector_type *pst;
540         unsigned ps_argc;
541
542         static struct dm_arg _args[] = {
543                 {0, 1024, "invalid number of path selector args"},
544         };
545
546         pst = dm_get_path_selector(dm_shift_arg(as));
547         if (!pst) {
548                 ti->error = "unknown path selector type";
549                 return -EINVAL;
550         }
551
552         r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
553         if (r) {
554                 dm_put_path_selector(pst);
555                 return -EINVAL;
556         }
557
558         r = pst->create(&pg->ps, ps_argc, as->argv);
559         if (r) {
560                 dm_put_path_selector(pst);
561                 ti->error = "path selector constructor failed";
562                 return r;
563         }
564
565         pg->ps.type = pst;
566         dm_consume_args(as, ps_argc);
567
568         return 0;
569 }
570
571 static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
572                                struct dm_target *ti)
573 {
574         int r;
575         struct pgpath *p;
576         struct multipath *m = ti->private;
577         struct request_queue *q = NULL;
578         const char *attached_handler_name;
579
580         /* we need at least a path arg */
581         if (as->argc < 1) {
582                 ti->error = "no device given";
583                 return ERR_PTR(-EINVAL);
584         }
585
586         p = alloc_pgpath();
587         if (!p)
588                 return ERR_PTR(-ENOMEM);
589
590         r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
591                           &p->path.dev);
592         if (r) {
593                 ti->error = "error getting device";
594                 goto bad;
595         }
596
597         if (m->retain_attached_hw_handler || m->hw_handler_name)
598                 q = bdev_get_queue(p->path.dev->bdev);
599
600         if (m->retain_attached_hw_handler) {
601                 attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
602                 if (attached_handler_name) {
603                         /*
604                          * Reset hw_handler_name to match the attached handler
605                          * and clear any hw_handler_params associated with the
606                          * ignored handler.
607                          *
608                          * NB. This modifies the table line to show the actual
609                          * handler instead of the original table passed in.
610                          */
611                         kfree(m->hw_handler_name);
612                         m->hw_handler_name = attached_handler_name;
613
614                         kfree(m->hw_handler_params);
615                         m->hw_handler_params = NULL;
616                 }
617         }
618
619         if (m->hw_handler_name) {
620                 /*
621                  * Increments scsi_dh reference, even when using an
622                  * already-attached handler.
623                  */
624                 r = scsi_dh_attach(q, m->hw_handler_name);
625                 if (r == -EBUSY) {
626                         /*
627                          * Already attached to different hw_handler:
628                          * try to reattach with correct one.
629                          */
630                         scsi_dh_detach(q);
631                         r = scsi_dh_attach(q, m->hw_handler_name);
632                 }
633
634                 if (r < 0) {
635                         ti->error = "error attaching hardware handler";
636                         dm_put_device(ti, p->path.dev);
637                         goto bad;
638                 }
639
640                 if (m->hw_handler_params) {
641                         r = scsi_dh_set_params(q, m->hw_handler_params);
642                         if (r < 0) {
643                                 ti->error = "unable to set hardware "
644                                                         "handler parameters";
645                                 scsi_dh_detach(q);
646                                 dm_put_device(ti, p->path.dev);
647                                 goto bad;
648                         }
649                 }
650         }
651
652         r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
653         if (r) {
654                 dm_put_device(ti, p->path.dev);
655                 goto bad;
656         }
657
658         return p;
659
660  bad:
661         free_pgpath(p);
662         return ERR_PTR(r);
663 }
664
665 static struct priority_group *parse_priority_group(struct dm_arg_set *as,
666                                                    struct multipath *m)
667 {
668         static struct dm_arg _args[] = {
669                 {1, 1024, "invalid number of paths"},
670                 {0, 1024, "invalid number of selector args"}
671         };
672
673         int r;
674         unsigned i, nr_selector_args, nr_args;
675         struct priority_group *pg;
676         struct dm_target *ti = m->ti;
677
678         if (as->argc < 2) {
679                 as->argc = 0;
680                 ti->error = "not enough priority group arguments";
681                 return ERR_PTR(-EINVAL);
682         }
683
684         pg = alloc_priority_group();
685         if (!pg) {
686                 ti->error = "couldn't allocate priority group";
687                 return ERR_PTR(-ENOMEM);
688         }
689         pg->m = m;
690
691         r = parse_path_selector(as, pg, ti);
692         if (r)
693                 goto bad;
694
695         /*
696          * read the paths
697          */
698         r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
699         if (r)
700                 goto bad;
701
702         r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
703         if (r)
704                 goto bad;
705
706         nr_args = 1 + nr_selector_args;
707         for (i = 0; i < pg->nr_pgpaths; i++) {
708                 struct pgpath *pgpath;
709                 struct dm_arg_set path_args;
710
711                 if (as->argc < nr_args) {
712                         ti->error = "not enough path parameters";
713                         r = -EINVAL;
714                         goto bad;
715                 }
716
717                 path_args.argc = nr_args;
718                 path_args.argv = as->argv;
719
720                 pgpath = parse_path(&path_args, &pg->ps, ti);
721                 if (IS_ERR(pgpath)) {
722                         r = PTR_ERR(pgpath);
723                         goto bad;
724                 }
725
726                 pgpath->pg = pg;
727                 list_add_tail(&pgpath->list, &pg->pgpaths);
728                 dm_consume_args(as, nr_args);
729         }
730
731         return pg;
732
733  bad:
734         free_priority_group(pg, ti);
735         return ERR_PTR(r);
736 }
737
738 static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
739 {
740         unsigned hw_argc;
741         int ret;
742         struct dm_target *ti = m->ti;
743
744         static struct dm_arg _args[] = {
745                 {0, 1024, "invalid number of hardware handler args"},
746         };
747
748         if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
749                 return -EINVAL;
750
751         if (!hw_argc)
752                 return 0;
753
754         m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
755         if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name),
756                                      "scsi_dh_%s", m->hw_handler_name)) {
757                 ti->error = "unknown hardware handler type";
758                 ret = -EINVAL;
759                 goto fail;
760         }
761
762         if (hw_argc > 1) {
763                 char *p;
764                 int i, j, len = 4;
765
766                 for (i = 0; i <= hw_argc - 2; i++)
767                         len += strlen(as->argv[i]) + 1;
768                 p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
769                 if (!p) {
770                         ti->error = "memory allocation failed";
771                         ret = -ENOMEM;
772                         goto fail;
773                 }
774                 j = sprintf(p, "%d", hw_argc - 1);
775                 for (i = 0, p += j + 1; i <= hw_argc - 2; i++, p += j + 1)
776                         j = sprintf(p, "%s", as->argv[i]);
777         }
778         dm_consume_args(as, hw_argc - 1);
779
780         return 0;
781 fail:
782         kfree(m->hw_handler_name);
783         m->hw_handler_name = NULL;
784         return ret;
785 }
786
787 static int parse_features(struct dm_arg_set *as, struct multipath *m)
788 {
789         int r;
790         unsigned argc;
791         struct dm_target *ti = m->ti;
792         const char *arg_name;
793
794         static struct dm_arg _args[] = {
795                 {0, 6, "invalid number of feature args"},
796                 {1, 50, "pg_init_retries must be between 1 and 50"},
797                 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
798         };
799
800         r = dm_read_arg_group(_args, as, &argc, &ti->error);
801         if (r)
802                 return -EINVAL;
803
804         if (!argc)
805                 return 0;
806
807         do {
808                 arg_name = dm_shift_arg(as);
809                 argc--;
810
811                 if (!strcasecmp(arg_name, "queue_if_no_path")) {
812                         r = queue_if_no_path(m, 1, 0);
813                         continue;
814                 }
815
816                 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
817                         m->retain_attached_hw_handler = 1;
818                         continue;
819                 }
820
821                 if (!strcasecmp(arg_name, "pg_init_retries") &&
822                     (argc >= 1)) {
823                         r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
824                         argc--;
825                         continue;
826                 }
827
828                 if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
829                     (argc >= 1)) {
830                         r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
831                         argc--;
832                         continue;
833                 }
834
835                 ti->error = "Unrecognised multipath feature request";
836                 r = -EINVAL;
837         } while (argc && !r);
838
839         return r;
840 }
841
842 static int multipath_ctr(struct dm_target *ti, unsigned int argc,
843                          char **argv)
844 {
845         /* target arguments */
846         static struct dm_arg _args[] = {
847                 {0, 1024, "invalid number of priority groups"},
848                 {0, 1024, "invalid initial priority group number"},
849         };
850
851         int r;
852         struct multipath *m;
853         struct dm_arg_set as;
854         unsigned pg_count = 0;
855         unsigned next_pg_num;
856
857         as.argc = argc;
858         as.argv = argv;
859
860         m = alloc_multipath(ti);
861         if (!m) {
862                 ti->error = "can't allocate multipath";
863                 return -EINVAL;
864         }
865
866         r = parse_features(&as, m);
867         if (r)
868                 goto bad;
869
870         r = parse_hw_handler(&as, m);
871         if (r)
872                 goto bad;
873
874         r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
875         if (r)
876                 goto bad;
877
878         r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
879         if (r)
880                 goto bad;
881
882         if ((!m->nr_priority_groups && next_pg_num) ||
883             (m->nr_priority_groups && !next_pg_num)) {
884                 ti->error = "invalid initial priority group";
885                 r = -EINVAL;
886                 goto bad;
887         }
888
889         /* parse the priority groups */
890         while (as.argc) {
891                 struct priority_group *pg;
892
893                 pg = parse_priority_group(&as, m);
894                 if (IS_ERR(pg)) {
895                         r = PTR_ERR(pg);
896                         goto bad;
897                 }
898
899                 m->nr_valid_paths += pg->nr_pgpaths;
900                 list_add_tail(&pg->list, &m->priority_groups);
901                 pg_count++;
902                 pg->pg_num = pg_count;
903                 if (!--next_pg_num)
904                         m->next_pg = pg;
905         }
906
907         if (pg_count != m->nr_priority_groups) {
908                 ti->error = "priority group count mismatch";
909                 r = -EINVAL;
910                 goto bad;
911         }
912
913         ti->num_flush_bios = 1;
914         ti->num_discard_bios = 1;
915         ti->num_write_same_bios = 1;
916
917         return 0;
918
919  bad:
920         free_multipath(m);
921         return r;
922 }
923
924 static void multipath_wait_for_pg_init_completion(struct multipath *m)
925 {
926         DECLARE_WAITQUEUE(wait, current);
927         unsigned long flags;
928
929         add_wait_queue(&m->pg_init_wait, &wait);
930
931         while (1) {
932                 set_current_state(TASK_UNINTERRUPTIBLE);
933
934                 spin_lock_irqsave(&m->lock, flags);
935                 if (!m->pg_init_in_progress) {
936                         spin_unlock_irqrestore(&m->lock, flags);
937                         break;
938                 }
939                 spin_unlock_irqrestore(&m->lock, flags);
940
941                 io_schedule();
942         }
943         set_current_state(TASK_RUNNING);
944
945         remove_wait_queue(&m->pg_init_wait, &wait);
946 }
947
948 static void flush_multipath_work(struct multipath *m)
949 {
950         unsigned long flags;
951
952         spin_lock_irqsave(&m->lock, flags);
953         m->pg_init_disabled = 1;
954         spin_unlock_irqrestore(&m->lock, flags);
955
956         flush_workqueue(kmpath_handlerd);
957         multipath_wait_for_pg_init_completion(m);
958         flush_workqueue(kmultipathd);
959         flush_work(&m->trigger_event);
960
961         spin_lock_irqsave(&m->lock, flags);
962         m->pg_init_disabled = 0;
963         spin_unlock_irqrestore(&m->lock, flags);
964 }
965
966 static void multipath_dtr(struct dm_target *ti)
967 {
968         struct multipath *m = ti->private;
969
970         flush_multipath_work(m);
971         free_multipath(m);
972 }
973
974 /*
975  * Map cloned requests
976  */
977 static int multipath_map(struct dm_target *ti, struct request *clone,
978                          union map_info *map_context)
979 {
980         int r;
981         struct multipath *m = (struct multipath *) ti->private;
982
983         if (set_mapinfo(m, map_context) < 0)
984                 /* ENOMEM, requeue */
985                 return DM_MAPIO_REQUEUE;
986
987         clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
988         r = map_io(m, clone, map_context, 0);
989         if (r < 0 || r == DM_MAPIO_REQUEUE)
990                 clear_mapinfo(m, map_context);
991
992         return r;
993 }
994
995 /*
996  * Take a path out of use.
997  */
998 static int fail_path(struct pgpath *pgpath)
999 {
1000         unsigned long flags;
1001         struct multipath *m = pgpath->pg->m;
1002
1003         spin_lock_irqsave(&m->lock, flags);
1004
1005         if (!pgpath->is_active)
1006                 goto out;
1007
1008         DMWARN("Failing path %s.", pgpath->path.dev->name);
1009
1010         pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
1011         pgpath->is_active = 0;
1012         pgpath->fail_count++;
1013
1014         m->nr_valid_paths--;
1015
1016         if (pgpath == m->current_pgpath)
1017                 m->current_pgpath = NULL;
1018
1019         dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
1020                       pgpath->path.dev->name, m->nr_valid_paths);
1021
1022         schedule_work(&m->trigger_event);
1023
1024 out:
1025         spin_unlock_irqrestore(&m->lock, flags);
1026
1027         return 0;
1028 }
1029
1030 /*
1031  * Reinstate a previously-failed path
1032  */
1033 static int reinstate_path(struct pgpath *pgpath)
1034 {
1035         int r = 0;
1036         unsigned long flags;
1037         struct multipath *m = pgpath->pg->m;
1038
1039         spin_lock_irqsave(&m->lock, flags);
1040
1041         if (pgpath->is_active)
1042                 goto out;
1043
1044         if (!pgpath->pg->ps.type->reinstate_path) {
1045                 DMWARN("Reinstate path not supported by path selector %s",
1046                        pgpath->pg->ps.type->name);
1047                 r = -EINVAL;
1048                 goto out;
1049         }
1050
1051         r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
1052         if (r)
1053                 goto out;
1054
1055         pgpath->is_active = 1;
1056
1057         if (!m->nr_valid_paths++ && m->queue_size) {
1058                 m->current_pgpath = NULL;
1059                 queue_work(kmultipathd, &m->process_queued_ios);
1060         } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1061                 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1062                         m->pg_init_in_progress++;
1063         }
1064
1065         dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
1066                       pgpath->path.dev->name, m->nr_valid_paths);
1067
1068         schedule_work(&m->trigger_event);
1069
1070 out:
1071         spin_unlock_irqrestore(&m->lock, flags);
1072
1073         return r;
1074 }
1075
1076 /*
1077  * Fail or reinstate all paths that match the provided struct dm_dev.
1078  */
1079 static int action_dev(struct multipath *m, struct dm_dev *dev,
1080                       action_fn action)
1081 {
1082         int r = -EINVAL;
1083         struct pgpath *pgpath;
1084         struct priority_group *pg;
1085
1086         list_for_each_entry(pg, &m->priority_groups, list) {
1087                 list_for_each_entry(pgpath, &pg->pgpaths, list) {
1088                         if (pgpath->path.dev == dev)
1089                                 r = action(pgpath);
1090                 }
1091         }
1092
1093         return r;
1094 }
1095
1096 /*
1097  * Temporarily try to avoid having to use the specified PG
1098  */
1099 static void bypass_pg(struct multipath *m, struct priority_group *pg,
1100                       int bypassed)
1101 {
1102         unsigned long flags;
1103
1104         spin_lock_irqsave(&m->lock, flags);
1105
1106         pg->bypassed = bypassed;
1107         m->current_pgpath = NULL;
1108         m->current_pg = NULL;
1109
1110         spin_unlock_irqrestore(&m->lock, flags);
1111
1112         schedule_work(&m->trigger_event);
1113 }
1114
1115 /*
1116  * Switch to using the specified PG from the next I/O that gets mapped
1117  */
1118 static int switch_pg_num(struct multipath *m, const char *pgstr)
1119 {
1120         struct priority_group *pg;
1121         unsigned pgnum;
1122         unsigned long flags;
1123         char dummy;
1124
1125         if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1126             (pgnum > m->nr_priority_groups)) {
1127                 DMWARN("invalid PG number supplied to switch_pg_num");
1128                 return -EINVAL;
1129         }
1130
1131         spin_lock_irqsave(&m->lock, flags);
1132         list_for_each_entry(pg, &m->priority_groups, list) {
1133                 pg->bypassed = 0;
1134                 if (--pgnum)
1135                         continue;
1136
1137                 m->current_pgpath = NULL;
1138                 m->current_pg = NULL;
1139                 m->next_pg = pg;
1140         }
1141         spin_unlock_irqrestore(&m->lock, flags);
1142
1143         schedule_work(&m->trigger_event);
1144         return 0;
1145 }
1146
1147 /*
1148  * Set/clear bypassed status of a PG.
1149  * PGs are numbered upwards from 1 in the order they were declared.
1150  */
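/*
 * Both the bypass and switch operations are normally driven from userspace
 * via target messages, e.g. (the device name "mpatha" is illustrative):
 *
 *   dmsetup message mpatha 0 disable_group 2
 *   dmsetup message mpatha 0 enable_group 2
 *   dmsetup message mpatha 0 switch_group 1
 *
 * See multipath_message() below for the full set of accepted messages.
 */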
1151 static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
1152 {
1153         struct priority_group *pg;
1154         unsigned pgnum;
1155         char dummy;
1156
1157         if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1158             (pgnum > m->nr_priority_groups)) {
1159                 DMWARN("invalid PG number supplied to bypass_pg");
1160                 return -EINVAL;
1161         }
1162
1163         list_for_each_entry(pg, &m->priority_groups, list) {
1164                 if (!--pgnum)
1165                         break;
1166         }
1167
1168         bypass_pg(m, pg, bypassed);
1169         return 0;
1170 }
1171
1172 /*
1173  * Should we retry pg_init immediately?
1174  */
1175 static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1176 {
1177         unsigned long flags;
1178         int limit_reached = 0;
1179
1180         spin_lock_irqsave(&m->lock, flags);
1181
1182         if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
1183                 m->pg_init_required = 1;
1184         else
1185                 limit_reached = 1;
1186
1187         spin_unlock_irqrestore(&m->lock, flags);
1188
1189         return limit_reached;
1190 }
1191
1192 static void pg_init_done(void *data, int errors)
1193 {
1194         struct pgpath *pgpath = data;
1195         struct priority_group *pg = pgpath->pg;
1196         struct multipath *m = pg->m;
1197         unsigned long flags;
1198         unsigned delay_retry = 0;
1199
1200         /* device or driver problems */
1201         switch (errors) {
1202         case SCSI_DH_OK:
1203                 break;
1204         case SCSI_DH_NOSYS:
1205                 if (!m->hw_handler_name) {
1206                         errors = 0;
1207                         break;
1208                 }
1209                 DMERR("Could not failover the device: Handler scsi_dh_%s "
1210                       "Error %d.", m->hw_handler_name, errors);
1211                 /*
1212                  * Fail the path for now, so we do not ping-pong
1213                  */
1214                 fail_path(pgpath);
1215                 break;
1216         case SCSI_DH_DEV_TEMP_BUSY:
1217                 /*
1218                  * Probably doing something like FW upgrade on the
1219                  * controller so try the other pg.
1220                  */
1221                 bypass_pg(m, pg, 1);
1222                 break;
1223         case SCSI_DH_RETRY:
1224                 /* Wait before retrying. */
1225                 delay_retry = 1;
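                /* fall through */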
1226         case SCSI_DH_IMM_RETRY:
1227         case SCSI_DH_RES_TEMP_UNAVAIL:
1228                 if (pg_init_limit_reached(m, pgpath))
1229                         fail_path(pgpath);
1230                 errors = 0;
1231                 break;
1232         default:
1233                 /*
1234                  * We probably do not want to fail the path for a device
1235                  * error, but this is what the old dm did. In future
1236                  * patches we can do more advanced handling.
1237                  */
1238                 fail_path(pgpath);
1239         }
1240
1241         spin_lock_irqsave(&m->lock, flags);
1242         if (errors) {
1243                 if (pgpath == m->current_pgpath) {
1244                         DMERR("Could not failover device. Error %d.", errors);
1245                         m->current_pgpath = NULL;
1246                         m->current_pg = NULL;
1247                 }
1248         } else if (!m->pg_init_required)
1249                 pg->bypassed = 0;
1250
1251         if (--m->pg_init_in_progress)
1252                 /* Activations of other paths are still ongoing */
1253                 goto out;
1254
1255         if (!m->pg_init_required)
1256                 m->queue_io = 0;
1257
1258         m->pg_init_delay_retry = delay_retry;
1259         queue_work(kmultipathd, &m->process_queued_ios);
1260
1261         /*
1262          * Wake up any thread waiting to suspend.
1263          */
1264         wake_up(&m->pg_init_wait);
1265
1266 out:
1267         spin_unlock_irqrestore(&m->lock, flags);
1268 }
1269
1270 static void activate_path(struct work_struct *work)
1271 {
1272         struct pgpath *pgpath =
1273                 container_of(work, struct pgpath, activate_path.work);
1274
1275         scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
1276                                 pg_init_done, pgpath);
1277 }
1278
1279 static int noretry_error(int error)
1280 {
1281         switch (error) {
1282         case -EOPNOTSUPP:
1283         case -EREMOTEIO:
1284         case -EILSEQ:
1285         case -ENODATA:
1286         case -ENOSPC:
1287                 return 1;
1288         }
1289
1290         /* Anything else could be a path failure, so should be retried */
1291         return 0;
1292 }
1293
1294 /*
1295  * end_io handling
1296  */
1297 static int do_end_io(struct multipath *m, struct request *clone,
1298                      int error, struct dm_mpath_io *mpio)
1299 {
1300         /*
1301          * We don't queue any clone request inside the multipath target
1302          * during end I/O handling, since those clone requests don't have
1303          * bio clones.  If we queue them inside the multipath target,
1304          * we need to make bio clones, that requires memory allocation.
1305          * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
1306          *  don't have bio clones.)
1307          * Instead of queueing the clone request here, we queue the original
1308          * request into dm core, which will remake a clone request and
1309          * clone bios for it and resubmit it later.
1310          */
1311         int r = DM_ENDIO_REQUEUE;
1312         unsigned long flags;
1313
1314         if (!error && !clone->errors)
1315                 return 0;       /* I/O complete */
1316
1317         if (noretry_error(error)) {
1318                 if ((clone->cmd_flags & REQ_WRITE_SAME) &&
1319                     !clone->q->limits.max_write_same_sectors) {
1320                         struct queue_limits *limits;
1321
1322                         /* device doesn't really support WRITE SAME, disable it */
1323                         limits = dm_get_queue_limits(dm_table_get_md(m->ti->table));
1324                         limits->max_write_same_sectors = 0;
1325                 }
1326                 return error;
1327         }
1328
1329         if (mpio->pgpath)
1330                 fail_path(mpio->pgpath);
1331
1332         spin_lock_irqsave(&m->lock, flags);
1333         if (!m->nr_valid_paths) {
1334                 if (!m->queue_if_no_path) {
1335                         if (!__must_push_back(m))
1336                                 r = -EIO;
1337                 } else {
1338                         if (error == -EBADE)
1339                                 r = error;
1340                 }
1341         }
1342         spin_unlock_irqrestore(&m->lock, flags);
1343
1344         return r;
1345 }
1346
1347 static int multipath_end_io(struct dm_target *ti, struct request *clone,
1348                             int error, union map_info *map_context)
1349 {
1350         struct multipath *m = ti->private;
1351         struct dm_mpath_io *mpio = map_context->ptr;
1352         struct pgpath *pgpath;
1353         struct path_selector *ps;
1354         int r;
1355
1356         BUG_ON(!mpio);
1357
1358         r  = do_end_io(m, clone, error, mpio);
1359         pgpath = mpio->pgpath;
1360         if (pgpath) {
1361                 ps = &pgpath->pg->ps;
1362                 if (ps->type->end_io)
1363                         ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1364         }
1365         clear_mapinfo(m, map_context);
1366
1367         return r;
1368 }
1369
1370 /*
1371  * Suspend can't complete until all the I/O is processed, so if
1372  * the last path fails we must error any remaining I/O.
1373  * Note that if freeze_bdev fails while suspending, the
1374  * queue_if_no_path state is lost - userspace should reset it.
1375  */
1376 static void multipath_presuspend(struct dm_target *ti)
1377 {
1378         struct multipath *m = (struct multipath *) ti->private;
1379
1380         queue_if_no_path(m, 0, 1);
1381 }
1382
1383 static void multipath_postsuspend(struct dm_target *ti)
1384 {
1385         struct multipath *m = ti->private;
1386
1387         mutex_lock(&m->work_mutex);
1388         flush_multipath_work(m);
1389         mutex_unlock(&m->work_mutex);
1390 }
1391
1392 /*
1393  * Restore the queue_if_no_path setting.
1394  */
1395 static void multipath_resume(struct dm_target *ti)
1396 {
1397         struct multipath *m = (struct multipath *) ti->private;
1398         unsigned long flags;
1399
1400         spin_lock_irqsave(&m->lock, flags);
1401         m->queue_if_no_path = m->saved_queue_if_no_path;
1402         spin_unlock_irqrestore(&m->lock, flags);
1403 }
1404
1405 /*
1406  * Info output has the following format:
1407  * num_multipath_feature_args [multipath_feature_args]*
1408  * num_handler_status_args [handler_status_args]*
1409  * num_groups init_group_number
1410  *            [A|D|E num_ps_status_args [ps_status_args]*
1411  *             num_paths num_selector_args
1412  *             [path_dev A|F fail_count [selector_args]* ]+ ]+
1413  *
1414  * Table output has the following format (identical to the constructor string):
1415  * num_feature_args [features_args]*
1416  * num_handler_args hw_handler [hw_handler_args]*
1417  * num_groups init_group_number
1418  *     [priority selector-name num_ps_args [ps_args]*
1419  *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1420  */
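/*
 * Illustrative STATUSTYPE_INFO output for the two-group round-robin example
 * shown above parse_path_selector() (hypothetical device numbers, and
 * assuming the round-robin selector reports no per-path info args):
 *
 *   2 0 0 0 2 1 A 0 2 0 8:16 A 0 8:32 A 0 E 0 1 0 8:48 A 0
 *
 * The STATUSTYPE_TABLE output mirrors the constructor string itself.
 */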
1421 static void multipath_status(struct dm_target *ti, status_type_t type,
1422                              unsigned status_flags, char *result, unsigned maxlen)
1423 {
1424         int sz = 0;
1425         unsigned long flags;
1426         struct multipath *m = (struct multipath *) ti->private;
1427         struct priority_group *pg;
1428         struct pgpath *p;
1429         unsigned pg_num;
1430         char state;
1431
1432         spin_lock_irqsave(&m->lock, flags);
1433
1434         /* Features */
1435         if (type == STATUSTYPE_INFO)
1436                 DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
1437         else {
1438                 DMEMIT("%u ", m->queue_if_no_path +
1439                               (m->pg_init_retries > 0) * 2 +
1440                               (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
1441                               m->retain_attached_hw_handler);
1442                 if (m->queue_if_no_path)
1443                         DMEMIT("queue_if_no_path ");
1444                 if (m->pg_init_retries)
1445                         DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1446                 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1447                         DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1448                 if (m->retain_attached_hw_handler)
1449                         DMEMIT("retain_attached_hw_handler ");
1450         }
1451
1452         if (!m->hw_handler_name || type == STATUSTYPE_INFO)
1453                 DMEMIT("0 ");
1454         else
1455                 DMEMIT("1 %s ", m->hw_handler_name);
1456
1457         DMEMIT("%u ", m->nr_priority_groups);
1458
1459         if (m->next_pg)
1460                 pg_num = m->next_pg->pg_num;
1461         else if (m->current_pg)
1462                 pg_num = m->current_pg->pg_num;
1463         else
1464                 pg_num = (m->nr_priority_groups ? 1 : 0);
1465
1466         DMEMIT("%u ", pg_num);
1467
1468         switch (type) {
1469         case STATUSTYPE_INFO:
1470                 list_for_each_entry(pg, &m->priority_groups, list) {
1471                         if (pg->bypassed)
1472                                 state = 'D';    /* Disabled */
1473                         else if (pg == m->current_pg)
1474                                 state = 'A';    /* Currently Active */
1475                         else
1476                                 state = 'E';    /* Enabled */
1477
1478                         DMEMIT("%c ", state);
1479
1480                         if (pg->ps.type->status)
1481                                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1482                                                           result + sz,
1483                                                           maxlen - sz);
1484                         else
1485                                 DMEMIT("0 ");
1486
1487                         DMEMIT("%u %u ", pg->nr_pgpaths,
1488                                pg->ps.type->info_args);
1489
1490                         list_for_each_entry(p, &pg->pgpaths, list) {
1491                                 DMEMIT("%s %s %u ", p->path.dev->name,
1492                                        p->is_active ? "A" : "F",
1493                                        p->fail_count);
1494                                 if (pg->ps.type->status)
1495                                         sz += pg->ps.type->status(&pg->ps,
1496                                               &p->path, type, result + sz,
1497                                               maxlen - sz);
1498                         }
1499                 }
1500                 break;
1501
1502         case STATUSTYPE_TABLE:
1503                 list_for_each_entry(pg, &m->priority_groups, list) {
1504                         DMEMIT("%s ", pg->ps.type->name);
1505
1506                         if (pg->ps.type->status)
1507                                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1508                                                           result + sz,
1509                                                           maxlen - sz);
1510                         else
1511                                 DMEMIT("0 ");
1512
1513                         DMEMIT("%u %u ", pg->nr_pgpaths,
1514                                pg->ps.type->table_args);
1515
1516                         list_for_each_entry(p, &pg->pgpaths, list) {
1517                                 DMEMIT("%s ", p->path.dev->name);
1518                                 if (pg->ps.type->status)
1519                                         sz += pg->ps.type->status(&pg->ps,
1520                                               &p->path, type, result + sz,
1521                                               maxlen - sz);
1522                         }
1523                 }
1524                 break;
1525         }
1526
1527         spin_unlock_irqrestore(&m->lock, flags);
1528 }
1529
1530 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1531 {
1532         int r = -EINVAL;
1533         struct dm_dev *dev;
1534         struct multipath *m = (struct multipath *) ti->private;
1535         action_fn action;
1536
1537         mutex_lock(&m->work_mutex);
1538
1539         if (dm_suspended(ti)) {
1540                 r = -EBUSY;
1541                 goto out;
1542         }
1543
1544         if (argc == 1) {
1545                 if (!strcasecmp(argv[0], "queue_if_no_path")) {
1546                         r = queue_if_no_path(m, 1, 0);
1547                         goto out;
1548                 } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
1549                         r = queue_if_no_path(m, 0, 0);
1550                         goto out;
1551                 }
1552         }
1553
1554         if (argc != 2) {
1555                 DMWARN("Unrecognised multipath message received.");
1556                 goto out;
1557         }
1558
1559         if (!strcasecmp(argv[0], "disable_group")) {
1560                 r = bypass_pg_num(m, argv[1], 1);
1561                 goto out;
1562         } else if (!strcasecmp(argv[0], "enable_group")) {
1563                 r = bypass_pg_num(m, argv[1], 0);
1564                 goto out;
1565         } else if (!strcasecmp(argv[0], "switch_group")) {
1566                 r = switch_pg_num(m, argv[1]);
1567                 goto out;
1568         } else if (!strcasecmp(argv[0], "reinstate_path"))
1569                 action = reinstate_path;
1570         else if (!strcasecmp(argv[0], "fail_path"))
1571                 action = fail_path;
1572         else {
1573                 DMWARN("Unrecognised multipath message received.");
1574                 goto out;
1575         }
1576
1577         r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
1578         if (r) {
1579                 DMWARN("message: error getting device %s",
1580                        argv[1]);
1581                 goto out;
1582         }
1583
1584         r = action_dev(m, dev, action);
1585
1586         dm_put_device(ti, dev);
1587
1588 out:
1589         mutex_unlock(&m->work_mutex);
1590         return r;
1591 }
1592
1593 static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1594                            unsigned long arg)
1595 {
1596         struct multipath *m = ti->private;
1597         struct pgpath *pgpath;
1598         struct block_device *bdev;
1599         fmode_t mode;
1600         unsigned long flags;
1601         int r;
1602
1603         bdev = NULL;
1604         mode = 0;
1605         r = 0;
1606
1607         spin_lock_irqsave(&m->lock, flags);
1608
1609         if (!m->current_pgpath)
1610                 __choose_pgpath(m, 0);
1611
1612         pgpath = m->current_pgpath;
1613
1614         if (pgpath) {
1615                 bdev = pgpath->path.dev->bdev;
1616                 mode = pgpath->path.dev->mode;
1617         }
1618
1619         if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
1620                 r = -ENOTCONN;
1621         else if (!bdev)
1622                 r = -EIO;
1623
1624         spin_unlock_irqrestore(&m->lock, flags);
1625
1626         /* Only pass ioctls through unfiltered if the device sizes match
1627          * exactly; otherwise scsi_verify_blk_ioctl() limits them to the
1628          * subset that is safe when the target does not cover the whole device. */
1629         if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
1630                 r = scsi_verify_blk_ioctl(NULL, cmd);
1631
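             /*
              * Paths are still being initialized or all I/O is being queued:
              * kick the worker that processes queued I/O so that a usable
              * path can be activated before the ioctl is retried.
              */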
1632         if (r == -ENOTCONN && !fatal_signal_pending(current))
1633                 queue_work(kmultipathd, &m->process_queued_ios);
1634
1635         return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
1636 }
1637
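     /*
      * Call @fn for every path device in every priority group; stop early
      * if the callback returns a non-zero value.
      */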
1638 static int multipath_iterate_devices(struct dm_target *ti,
1639                                      iterate_devices_callout_fn fn, void *data)
1640 {
1641         struct multipath *m = ti->private;
1642         struct priority_group *pg;
1643         struct pgpath *p;
1644         int ret = 0;
1645
1646         list_for_each_entry(pg, &m->priority_groups, list) {
1647                 list_for_each_entry(p, &pg->pgpaths, list) {
1648                         ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
1649                         if (ret)
1650                                 goto out;
1651                 }
1652         }
1653
1654 out:
1655         return ret;
1656 }
1657
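     /* Report whether the path's underlying request queue is busy. */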
1658 static int __pgpath_busy(struct pgpath *pgpath)
1659 {
1660         struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1661
1662         return dm_underlying_device_busy(q);
1663 }
1664
1665 /*
1666  * Return "busy" only when we can map I/Os but the underlying devices
1667  * are busy (so even if we mapped I/Os now, they would just wait on
1668  * the underlying queues).
1669  * In other words, when we want to fail I/Os or queue them internally
1670  * because no map is available, we must not return "busy"; otherwise
1671  * dm core won't hand us the I/Os and we can't do either.
1672  */
1673 static int multipath_busy(struct dm_target *ti)
1674 {
1675         int busy = 0, has_active = 0;
1676         struct multipath *m = ti->private;
1677         struct priority_group *pg;
1678         struct pgpath *pgpath;
1679         unsigned long flags;
1680
1681         spin_lock_irqsave(&m->lock, flags);
1682
1683         /* pg_init in progress, requeue until done */
1684         if (m->pg_init_in_progress) {
1685                 busy = 1;
1686                 goto out;
1687         }
1688         /* Guess which priority_group will be used at the next mapping time */
1689         if (unlikely(!m->current_pgpath && m->next_pg))
1690                 pg = m->next_pg;
1691         else if (likely(m->current_pg))
1692                 pg = m->current_pg;
1693         else
1694                 /*
1695                  * We don't know which pg will be used at the next mapping time.
1696                  * We don't call __choose_pgpath() here to avoid triggering
1697                  * pg_init just for a busy check.
1698                  * So we can't tell whether the underlying devices we will use
1699                  * at the next mapping time are busy or not.  Just try mapping.
1700                  */
1701                 goto out;
1702
1703         /*
1704          * If there is at least one non-busy active path, the path selector
1705          * will be able to select it, so we consider such a pg as not busy.
1706          */
1707         busy = 1;
1708         list_for_each_entry(pgpath, &pg->pgpaths, list)
1709                 if (pgpath->is_active) {
1710                         has_active = 1;
1711
1712                         if (!__pgpath_busy(pgpath)) {
1713                                 busy = 0;
1714                                 break;
1715                         }
1716                 }
1717
1718         if (!has_active)
1719                 /*
1720                  * No active path in this pg, so this pg won't be used and
1721                  * current_pg will be changed at the next mapping time.
1722                  * We need to try mapping in order to determine the new pg.
1723                  */
1724                 busy = 0;
1725
1726 out:
1727         spin_unlock_irqrestore(&m->lock, flags);
1728
1729         return busy;
1730 }
1731
1732 /*-----------------------------------------------------------------
1733  * Module setup
1734  *---------------------------------------------------------------*/
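     /*
      * Request-based target: dm core hands us cloned requests through
      * .map_rq and completes them through .rq_end_io.
      */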
1735 static struct target_type multipath_target = {
1736         .name = "multipath",
1737         .version = {1, 6, 0},
1738         .module = THIS_MODULE,
1739         .ctr = multipath_ctr,
1740         .dtr = multipath_dtr,
1741         .map_rq = multipath_map,
1742         .rq_end_io = multipath_end_io,
1743         .presuspend = multipath_presuspend,
1744         .postsuspend = multipath_postsuspend,
1745         .resume = multipath_resume,
1746         .status = multipath_status,
1747         .message = multipath_message,
1748         .ioctl  = multipath_ioctl,
1749         .iterate_devices = multipath_iterate_devices,
1750         .busy = multipath_busy,
1751 };
1752
1753 static int __init dm_multipath_init(void)
1754 {
1755         int r;
1756
1757         /* allocate a slab cache for struct dm_mpath_io (per-request multipath data) */
1758         _mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
1759         if (!_mpio_cache)
1760                 return -ENOMEM;
1761
1762         r = dm_register_target(&multipath_target);
1763         if (r < 0) {
1764                 DMERR("register failed %d", r);
1765                 kmem_cache_destroy(_mpio_cache);
1766                 return r;
1767         }
1768
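             /*
              * Workqueue for deferred multipath work, such as processing
              * queued I/O (see process_queued_ios above).
              */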
1769         kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
1770         if (!kmultipathd) {
1771                 DMERR("failed to create workqueue kmpathd");
1772                 dm_unregister_target(&multipath_target);
1773                 kmem_cache_destroy(_mpio_cache);
1774                 return -ENOMEM;
1775         }
1776
1777         /*
1778          * A separate, ordered workqueue is used for the hardware handlers
1779          * so that the existing kmultipathd workqueue is not overloaded.
1780          * Overloading it would also create a bottleneck in the storage
1781          * hardware device activation path.
1782          */
1783         kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
1784                                                   WQ_MEM_RECLAIM);
1785         if (!kmpath_handlerd) {
1786                 DMERR("failed to create workqueue kmpath_handlerd");
1787                 destroy_workqueue(kmultipathd);
1788                 dm_unregister_target(&multipath_target);
1789                 kmem_cache_destroy(_mpio_cache);
1790                 return -ENOMEM;
1791         }
1792
1793         DMINFO("version %u.%u.%u loaded",
1794                multipath_target.version[0], multipath_target.version[1],
1795                multipath_target.version[2]);
1796
1797         return r;
1798 }
1799
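     /* Tear everything down in the reverse order of dm_multipath_init(). */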
1800 static void __exit dm_multipath_exit(void)
1801 {
1802         destroy_workqueue(kmpath_handlerd);
1803         destroy_workqueue(kmultipathd);
1804
1805         dm_unregister_target(&multipath_target);
1806         kmem_cache_destroy(_mpio_cache);
1807 }
1808
1809 module_init(dm_multipath_init);
1810 module_exit(dm_multipath_exit);
1811
1812 MODULE_DESCRIPTION(DM_NAME " multipath target");
1813 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
1814 MODULE_LICENSE("GPL");