dm snapshot: permit only one merge at once
drivers/md/dm-snap.c
1 /*
2  * dm-snapshot.c
3  *
4  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5  *
6  * This file is released under the GPL.
7  */
8
9 #include <linux/blkdev.h>
10 #include <linux/device-mapper.h>
11 #include <linux/delay.h>
12 #include <linux/fs.h>
13 #include <linux/init.h>
14 #include <linux/kdev_t.h>
15 #include <linux/list.h>
16 #include <linux/mempool.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20 #include <linux/log2.h>
21 #include <linux/dm-kcopyd.h>
22 #include <linux/workqueue.h>
23
24 #include "dm-exception-store.h"
25
26 #define DM_MSG_PREFIX "snapshots"
27
28 static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
29
30 #define dm_target_is_snapshot_merge(ti) \
31         ((ti)->type->name == dm_snapshot_merge_target_name)
32
33 /*
34  * The percentage increment at which we wake up users
35  */
36 #define WAKE_UP_PERCENT 5
37
38 /*
39  * kcopyd priority of snapshot operations
40  */
41 #define SNAPSHOT_COPY_PRIORITY 2
42
43 /*
44  * Reserve 1MB for each snapshot initially (with minimum of 1 page).
45  */
46 #define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1)
47
48 /*
49  * The size of the mempool used to track chunks in use.
50  */
51 #define MIN_IOS 256
52
53 #define DM_TRACKED_CHUNK_HASH_SIZE      16
54 #define DM_TRACKED_CHUNK_HASH(x)        ((unsigned long)(x) & \
55                                          (DM_TRACKED_CHUNK_HASH_SIZE - 1))
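/*
 * Illustrative note: with DM_TRACKED_CHUNK_HASH_SIZE == 16 the hash above
 * simply keeps the low four bits of the chunk number, e.g.
 *
 *	DM_TRACKED_CHUNK_HASH(0x25)  == 0x5
 *	DM_TRACKED_CHUNK_HASH(0x1f5) == 0x5	(same bucket as 0x25)
 *
 * so chunks that differ only above bit 3 share a bucket and are told
 * apart by comparing c->chunk in __chunk_is_tracked().
 */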
56
57 struct dm_exception_table {
58         uint32_t hash_mask;
59         unsigned hash_shift;
60         struct list_head *table;
61 };
62
63 struct dm_snapshot {
64         struct rw_semaphore lock;
65
66         struct dm_dev *origin;
67         struct dm_dev *cow;
68
69         struct dm_target *ti;
70
71         /* List of snapshots per Origin */
72         struct list_head list;
73
74         /* You can't use a snapshot if this is 0 (e.g. if full) */
75         int valid;
76
77         /* Origin writes don't trigger exceptions until this is set */
78         int active;
79
80         /* Whether or not owning mapped_device is suspended */
81         int suspended;
82
83         mempool_t *pending_pool;
84
85         atomic_t pending_exceptions_count;
86
87         struct dm_exception_table pending;
88         struct dm_exception_table complete;
89
90         /*
91          * pe_lock protects all pending_exception operations and access
92          * as well as the snapshot_bios list.
93          */
94         spinlock_t pe_lock;
95
96         /* The on disk metadata handler */
97         struct dm_exception_store *store;
98
99         struct dm_kcopyd_client *kcopyd_client;
100
101         /* Queue of snapshot writes for ksnapd to flush */
102         struct bio_list queued_bios;
103         struct work_struct queued_bios_work;
104
105         /* Chunks with outstanding reads */
106         mempool_t *tracked_chunk_pool;
107         spinlock_t tracked_chunk_lock;
108         struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
109 };
110
111 struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
112 {
113         return s->cow;
114 }
115 EXPORT_SYMBOL(dm_snap_cow);
116
117 static struct workqueue_struct *ksnapd;
118 static void flush_queued_bios(struct work_struct *work);
119
120 static sector_t chunk_to_sector(struct dm_exception_store *store,
121                                 chunk_t chunk)
122 {
123         return chunk << store->chunk_shift;
124 }
125
126 static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
127 {
128         /*
129          * There is only ever one instance of a particular block
130          * device so we can compare pointers safely.
131          */
132         return lhs == rhs;
133 }
134
135 struct dm_snap_pending_exception {
136         struct dm_exception e;
137
138         /*
139          * Origin buffers waiting for this to complete are held
140          * in a bio list
141          */
142         struct bio_list origin_bios;
143         struct bio_list snapshot_bios;
144
145         /* Pointer back to snapshot context */
146         struct dm_snapshot *snap;
147
148         /*
149          * 1 indicates the exception has already been sent to
150          * kcopyd.
151          */
152         int started;
153 };
154
155 /*
156  * Slab caches for completed and pending exceptions.  (The hash table
157  * mapping origin volumes to lists of snapshots, and its lock, are the
158  * _origins and _origins_lock declared further down.)
159  */
159 static struct kmem_cache *exception_cache;
160 static struct kmem_cache *pending_cache;
161
162 struct dm_snap_tracked_chunk {
163         struct hlist_node node;
164         chunk_t chunk;
165 };
166
167 static struct kmem_cache *tracked_chunk_cache;
168
169 static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s,
170                                                  chunk_t chunk)
171 {
172         struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool,
173                                                         GFP_NOIO);
174         unsigned long flags;
175
176         c->chunk = chunk;
177
178         spin_lock_irqsave(&s->tracked_chunk_lock, flags);
179         hlist_add_head(&c->node,
180                        &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
181         spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
182
183         return c;
184 }
185
186 static void stop_tracking_chunk(struct dm_snapshot *s,
187                                 struct dm_snap_tracked_chunk *c)
188 {
189         unsigned long flags;
190
191         spin_lock_irqsave(&s->tracked_chunk_lock, flags);
192         hlist_del(&c->node);
193         spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
194
195         mempool_free(c, s->tracked_chunk_pool);
196 }
197
198 static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
199 {
200         struct dm_snap_tracked_chunk *c;
201         struct hlist_node *hn;
202         int found = 0;
203
204         spin_lock_irq(&s->tracked_chunk_lock);
205
206         hlist_for_each_entry(c, hn,
207             &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
208                 if (c->chunk == chunk) {
209                         found = 1;
210                         break;
211                 }
212         }
213
214         spin_unlock_irq(&s->tracked_chunk_lock);
215
216         return found;
217 }
218
219 /*
220  * This conflicting I/O is extremely improbable in the caller,
221  * so msleep(1) is sufficient and there is no need for a wait queue.
222  */
223 static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
224 {
225         while (__chunk_is_tracked(s, chunk))
226                 msleep(1);
227 }
228
229 /*
230  * One of these per registered origin, held in the snapshot_origins hash
231  */
232 struct origin {
233         /* The origin device */
234         struct block_device *bdev;
235
236         struct list_head hash_list;
237
238         /* List of snapshots for this origin */
239         struct list_head snapshots;
240 };
241
242 /*
243  * Size of the hash table for origin volumes. If we make this
244  * the size of the minors list then it should be nearly perfect
245  */
246 #define ORIGIN_HASH_SIZE 256
247 #define ORIGIN_MASK      0xFF
248 static struct list_head *_origins;
249 static struct rw_semaphore _origins_lock;
250
251 static int init_origin_hash(void)
252 {
253         int i;
254
255         _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
256                            GFP_KERNEL);
257         if (!_origins) {
258                 DMERR("unable to allocate memory");
259                 return -ENOMEM;
260         }
261
262         for (i = 0; i < ORIGIN_HASH_SIZE; i++)
263                 INIT_LIST_HEAD(_origins + i);
264         init_rwsem(&_origins_lock);
265
266         return 0;
267 }
268
269 static void exit_origin_hash(void)
270 {
271         kfree(_origins);
272 }
273
274 static unsigned origin_hash(struct block_device *bdev)
275 {
276         return bdev->bd_dev & ORIGIN_MASK;
277 }
278
279 static struct origin *__lookup_origin(struct block_device *origin)
280 {
281         struct list_head *ol;
282         struct origin *o;
283
284         ol = &_origins[origin_hash(origin)];
285         list_for_each_entry (o, ol, hash_list)
286                 if (bdev_equal(o->bdev, origin))
287                         return o;
288
289         return NULL;
290 }
291
292 static void __insert_origin(struct origin *o)
293 {
294         struct list_head *sl = &_origins[origin_hash(o->bdev)];
295         list_add_tail(&o->hash_list, sl);
296 }
297
298 /*
299  * _origins_lock must be held when calling this function.
300  * Returns number of snapshots registered using the supplied cow device, plus:
301  * snap_src - a snapshot suitable for use as a source of exception handover
302  * snap_dest - a snapshot capable of receiving exception handover.
303  * snap_merge - an existing snapshot-merge target linked to the same origin.
304  *   There can be at most one snapshot-merge target. The parameter is optional.
305  *
306  * Possible return values and states of snap_src and snap_dest.
307  *   0: NULL, NULL  - first new snapshot
308  *   1: snap_src, NULL - normal snapshot
309  *   2: snap_src, snap_dest  - waiting for handover
310  *   2: snap_src, NULL - handed over, waiting for old to be deleted
311  *   1: NULL, snap_dest - source got destroyed without handover
312  */
313 static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
314                                         struct dm_snapshot **snap_src,
315                                         struct dm_snapshot **snap_dest,
316                                         struct dm_snapshot **snap_merge)
317 {
318         struct dm_snapshot *s;
319         struct origin *o;
320         int count = 0;
321         int active;
322
323         o = __lookup_origin(snap->origin->bdev);
324         if (!o)
325                 goto out;
326
327         list_for_each_entry(s, &o->snapshots, list) {
328                 if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
329                         *snap_merge = s;
330                 if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
331                         continue;
332
333                 down_read(&s->lock);
334                 active = s->active;
335                 up_read(&s->lock);
336
337                 if (active) {
338                         if (snap_src)
339                                 *snap_src = s;
340                 } else if (snap_dest)
341                         *snap_dest = s;
342
343                 count++;
344         }
345
346 out:
347         return count;
348 }
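/*
 * Illustrative caller sketch (not part of the driver; "handover_pending"
 * is a made-up name): this is how the out-parameters are typically
 * interpreted, mirroring snapshot_dtr() and snapshot_preresume() below.
 *
 *	struct dm_snapshot *src = NULL, *dest = NULL;
 *	int handover_pending;
 *
 *	down_read(&_origins_lock);
 *	(void) __find_snapshots_sharing_cow(s, &src, &dest, NULL);
 *	handover_pending = (src && dest && s == src);
 *	up_read(&_origins_lock);
 *
 * A non-NULL src together with a non-NULL dest means the pair is still
 * waiting for handover, and s == src identifies s as the source side.
 */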
349
350 /*
351  * On success, returns 1 if this snapshot is a handover destination,
352  * otherwise returns 0.
353  */
354 static int __validate_exception_handover(struct dm_snapshot *snap)
355 {
356         struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
357         struct dm_snapshot *snap_merge = NULL;
358
359         /* Does snapshot need exceptions handed over to it? */
360         if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
361                                           &snap_merge) == 2) ||
362             snap_dest) {
363                 snap->ti->error = "Snapshot cow pairing for exception "
364                                   "table handover failed";
365                 return -EINVAL;
366         }
367
368         /*
369          * If no snap_src was found, snap cannot become a handover
370          * destination.
371          */
372         if (!snap_src)
373                 return 0;
374
375         /*
376          * Non-snapshot-merge handover?
377          */
378         if (!dm_target_is_snapshot_merge(snap->ti))
379                 return 1;
380
381         /*
382          * Do not allow more than one merging snapshot.
383          */
384         if (snap_merge) {
385                 snap->ti->error = "A snapshot is already merging.";
386                 return -EINVAL;
387         }
388
389         return 1;
390 }
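/*
 * Illustrative scenario for the "only one merge at once" rule above
 * (device names invented): with two snapshots of the same origin,
 * starting the first merge by loading
 *
 *	0 2097152 snapshot-merge /dev/vg/origin /dev/vg/cow1 P 8
 *
 * succeeds, but loading a second merging table for that origin, e.g.
 *
 *	0 2097152 snapshot-merge /dev/vg/origin /dev/vg/cow2 P 8
 *
 * is rejected during handover validation with
 * "A snapshot is already merging."
 */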
391
392 static void __insert_snapshot(struct origin *o, struct dm_snapshot *s)
393 {
394         struct dm_snapshot *l;
395
396         /* Sort the list according to chunk size, largest first, smallest last */
397         list_for_each_entry(l, &o->snapshots, list)
398                 if (l->store->chunk_size < s->store->chunk_size)
399                         break;
400         list_add_tail(&s->list, &l->list);
401 }
402
403 /*
404  * Make a note of the snapshot and its origin so we can look it
405  * up when the origin has a write on it.
406  *
407  * Also validate snapshot exception store handovers.
408  * On success, returns 1 if this registration is a handover destination,
409  * otherwise returns 0.
410  */
411 static int register_snapshot(struct dm_snapshot *snap)
412 {
413         struct origin *o, *new_o = NULL;
414         struct block_device *bdev = snap->origin->bdev;
415         int r = 0;
416
417         new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
418         if (!new_o)
419                 return -ENOMEM;
420
421         down_write(&_origins_lock);
422
423         r = __validate_exception_handover(snap);
424         if (r < 0) {
425                 kfree(new_o);
426                 goto out;
427         }
428
429         o = __lookup_origin(bdev);
430         if (o)
431                 kfree(new_o);
432         else {
433                 /* New origin */
434                 o = new_o;
435
436                 /* Initialise the struct */
437                 INIT_LIST_HEAD(&o->snapshots);
438                 o->bdev = bdev;
439
440                 __insert_origin(o);
441         }
442
443         __insert_snapshot(o, snap);
444
445 out:
446         up_write(&_origins_lock);
447
448         return r;
449 }
450
451 /*
452  * Move snapshot to correct place in list according to chunk size.
453  */
454 static void reregister_snapshot(struct dm_snapshot *s)
455 {
456         struct block_device *bdev = s->origin->bdev;
457
458         down_write(&_origins_lock);
459
460         list_del(&s->list);
461         __insert_snapshot(__lookup_origin(bdev), s);
462
463         up_write(&_origins_lock);
464 }
465
466 static void unregister_snapshot(struct dm_snapshot *s)
467 {
468         struct origin *o;
469
470         down_write(&_origins_lock);
471         o = __lookup_origin(s->origin->bdev);
472
473         list_del(&s->list);
474         if (o && list_empty(&o->snapshots)) {
475                 list_del(&o->hash_list);
476                 kfree(o);
477         }
478
479         up_write(&_origins_lock);
480 }
481
482 /*
483  * Implementation of the exception hash tables.
484  * The lowest hash_shift bits of the chunk number are ignored, allowing
485  * some consecutive chunks to be grouped together.
486  */
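/*
 * Worked example (illustrative values): with hash_shift == 2 and a
 * 256-entry table (hash_mask == 0xff), exception_hash() below gives
 *
 *	chunk  8 -> ( 8 >> 2) & 0xff == 2
 *	chunk  9 -> ( 9 >> 2) & 0xff == 2
 *	chunk 11 -> (11 >> 2) & 0xff == 2
 *	chunk 12 -> (12 >> 2) & 0xff == 3
 *
 * so a run of consecutive chunks stays within one bucket, which is what
 * allows dm_insert_exception() to coalesce it into a single entry and
 * dm_lookup_exception() to find every chunk of the run in that bucket.
 */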
487 static int dm_exception_table_init(struct dm_exception_table *et,
488                                    uint32_t size, unsigned hash_shift)
489 {
490         unsigned int i;
491
492         et->hash_shift = hash_shift;
493         et->hash_mask = size - 1;
494         et->table = dm_vcalloc(size, sizeof(struct list_head));
495         if (!et->table)
496                 return -ENOMEM;
497
498         for (i = 0; i < size; i++)
499                 INIT_LIST_HEAD(et->table + i);
500
501         return 0;
502 }
503
504 static void dm_exception_table_exit(struct dm_exception_table *et,
505                                     struct kmem_cache *mem)
506 {
507         struct list_head *slot;
508         struct dm_exception *ex, *next;
509         int i, size;
510
511         size = et->hash_mask + 1;
512         for (i = 0; i < size; i++) {
513                 slot = et->table + i;
514
515                 list_for_each_entry_safe (ex, next, slot, hash_list)
516                         kmem_cache_free(mem, ex);
517         }
518
519         vfree(et->table);
520 }
521
522 static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
523 {
524         return (chunk >> et->hash_shift) & et->hash_mask;
525 }
526
527 static void dm_remove_exception(struct dm_exception *e)
528 {
529         list_del(&e->hash_list);
530 }
531
532 /*
533  * Return the exception data for a sector, or NULL if not
534  * remapped.
535  */
536 static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
537                                                 chunk_t chunk)
538 {
539         struct list_head *slot;
540         struct dm_exception *e;
541
542         slot = &et->table[exception_hash(et, chunk)];
543         list_for_each_entry (e, slot, hash_list)
544                 if (chunk >= e->old_chunk &&
545                     chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
546                         return e;
547
548         return NULL;
549 }
550
551 static struct dm_exception *alloc_completed_exception(void)
552 {
553         struct dm_exception *e;
554
555         e = kmem_cache_alloc(exception_cache, GFP_NOIO);
556         if (!e)
557                 e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
558
559         return e;
560 }
561
562 static void free_completed_exception(struct dm_exception *e)
563 {
564         kmem_cache_free(exception_cache, e);
565 }
566
567 static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
568 {
569         struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool,
570                                                              GFP_NOIO);
571
572         atomic_inc(&s->pending_exceptions_count);
573         pe->snap = s;
574
575         return pe;
576 }
577
578 static void free_pending_exception(struct dm_snap_pending_exception *pe)
579 {
580         struct dm_snapshot *s = pe->snap;
581
582         mempool_free(pe, s->pending_pool);
583         smp_mb__before_atomic_dec();
584         atomic_dec(&s->pending_exceptions_count);
585 }
586
587 static void dm_insert_exception(struct dm_exception_table *eh,
588                                 struct dm_exception *new_e)
589 {
590         struct list_head *l;
591         struct dm_exception *e = NULL;
592
593         l = &eh->table[exception_hash(eh, new_e->old_chunk)];
594
595         /* Add immediately if this table doesn't support consecutive chunks */
596         if (!eh->hash_shift)
597                 goto out;
598
599         /* List is ordered by old_chunk */
600         list_for_each_entry_reverse(e, l, hash_list) {
601                 /* Insert after an existing chunk? */
602                 if (new_e->old_chunk == (e->old_chunk +
603                                          dm_consecutive_chunk_count(e) + 1) &&
604                     new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
605                                          dm_consecutive_chunk_count(e) + 1)) {
606                         dm_consecutive_chunk_count_inc(e);
607                         free_completed_exception(new_e);
608                         return;
609                 }
610
611                 /* Insert before an existing chunk? */
612                 if (new_e->old_chunk == (e->old_chunk - 1) &&
613                     new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
614                         dm_consecutive_chunk_count_inc(e);
615                         e->old_chunk--;
616                         e->new_chunk--;
617                         free_completed_exception(new_e);
618                         return;
619                 }
620
621                 if (new_e->old_chunk > e->old_chunk)
622                         break;
623         }
624
625 out:
626         list_add(&new_e->hash_list, e ? &e->hash_list : l);
627 }
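/*
 * Illustrative example of the coalescing above: suppose the table already
 * holds an exception with old_chunk == 100, new_chunk == 7 and a
 * consecutive count of 1, i.e. it covers old chunks 100-101 mapped to new
 * chunks 7-8.  Inserting new_e with old_chunk == 102 and new_chunk == 9
 * matches the "insert after" test (100 + 1 + 1 == 102, 7 + 1 + 1 == 9),
 * so the count is bumped to 2 and new_e is freed; the single entry now
 * covers old chunks 100-102.  dm_lookup_exception() then matches any
 * chunk in that range via old_chunk <= chunk <= old_chunk + count.
 */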
628
629 /*
630  * Callback used by the exception stores to load exceptions when
631  * initialising.
632  */
633 static int dm_add_exception(void *context, chunk_t old, chunk_t new)
634 {
635         struct dm_snapshot *s = context;
636         struct dm_exception *e;
637
638         e = alloc_completed_exception();
639         if (!e)
640                 return -ENOMEM;
641
642         e->old_chunk = old;
643
644         /* Consecutive_count is implicitly initialised to zero */
645         e->new_chunk = new;
646
647         dm_insert_exception(&s->complete, e);
648
649         return 0;
650 }
651
652 #define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
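/*
 * min_not_zero() treats zero as "unset" rather than as a candidate
 * minimum, e.g. min_not_zero(0, 8) == 8, min_not_zero(8, 0) == 8 and
 * min_not_zero(4, 8) == 4.  __minimum_chunk_size() below relies on this
 * so that an origin with no snapshots reports a chunk size of zero.
 */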
653
654 /*
655  * Return a minimum chunk size of all snapshots that have the specified origin.
656  * Return zero if the origin has no snapshots.
657  */
658 static sector_t __minimum_chunk_size(struct origin *o)
659 {
660         struct dm_snapshot *snap;
661         unsigned chunk_size = 0;
662
663         if (o)
664                 list_for_each_entry(snap, &o->snapshots, list)
665                         chunk_size = min_not_zero(chunk_size,
666                                                   snap->store->chunk_size);
667
668         return chunk_size;
669 }
670
671 /*
672  * Hard coded magic.
673  */
674 static int calc_max_buckets(void)
675 {
676         /* use a fixed size of 2MB */
677         unsigned long mem = 2 * 1024 * 1024;
678         mem /= sizeof(struct list_head);
679
680         return mem;
681 }
682
683 /*
684  * Allocate room for a suitable hash table.
685  */
686 static int init_hash_tables(struct dm_snapshot *s)
687 {
688         sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
689
690         /*
691          * Calculate based on the size of the original volume or
692          * the COW volume...
693          */
694         cow_dev_size = get_dev_size(s->cow->bdev);
695         origin_dev_size = get_dev_size(s->origin->bdev);
696         max_buckets = calc_max_buckets();
697
698         hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
699         hash_size = min(hash_size, max_buckets);
700
701         if (hash_size < 64)
702                 hash_size = 64;
703         hash_size = rounddown_pow_of_two(hash_size);
704         if (dm_exception_table_init(&s->complete, hash_size,
705                                     DM_CHUNK_CONSECUTIVE_BITS))
706                 return -ENOMEM;
707
708         /*
709          * Allocate hash table for in-flight exceptions
710          * Make this smaller than the real hash table
711          */
712         hash_size >>= 3;
713         if (hash_size < 64)
714                 hash_size = 64;
715
716         if (dm_exception_table_init(&s->pending, hash_size, 0)) {
717                 dm_exception_table_exit(&s->complete, exception_cache);
718                 return -ENOMEM;
719         }
720
721         return 0;
722 }
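/*
 * Sizing example (illustrative numbers, 64-bit build assumed): a 4 GiB
 * origin with a 1 GiB COW device and 8-sector (4 KiB) chunks gives
 *
 *	min(origin, cow) >> chunk_shift = 2097152 >> 3 = 262144
 *	calc_max_buckets()              = 2 MiB / 16   = 131072
 *
 * so the completed-exception table gets min(262144, 131072) == 131072
 * buckets (already a power of two) and the pending table
 * 131072 >> 3 == 16384 buckets.
 */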
723
724 /*
725  * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
726  */
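/*
 * Example table line (illustrative device names): a persistent snapshot
 * of /dev/vg/origin backed by /dev/vg/cow with 8-sector (4 KiB) chunks:
 *
 *	0 2097152 snapshot /dev/vg/origin /dev/vg/cow P 8
 *
 * snapshot_ctr() then sees argc == 4 with
 * argv == { "/dev/vg/origin", "/dev/vg/cow", "P", "8" }; the last two
 * arguments are consumed by dm_exception_store_create().
 */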
727 static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
728 {
729         struct dm_snapshot *s;
730         int i;
731         int r = -EINVAL;
732         char *origin_path, *cow_path;
733         unsigned args_used, num_flush_requests = 1;
734         fmode_t origin_mode = FMODE_READ;
735
736         if (argc != 4) {
737                 ti->error = "requires exactly 4 arguments";
738                 r = -EINVAL;
739                 goto bad;
740         }
741
742         if (dm_target_is_snapshot_merge(ti)) {
743                 num_flush_requests = 2;
744                 origin_mode = FMODE_WRITE;
745         }
746
747         origin_path = argv[0];
748         argv++;
749         argc--;
750
751         s = kmalloc(sizeof(*s), GFP_KERNEL);
752         if (!s) {
753                 ti->error = "Cannot allocate snapshot context private "
754                     "structure";
755                 r = -ENOMEM;
756                 goto bad;
757         }
758
759         cow_path = argv[0];
760         argv++;
761         argc--;
762
763         r = dm_get_device(ti, cow_path, 0, 0,
764                           FMODE_READ | FMODE_WRITE, &s->cow);
765         if (r) {
766                 ti->error = "Cannot get COW device";
767                 goto bad_cow;
768         }
769
770         r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
771         if (r) {
772                 ti->error = "Couldn't create exception store";
773                 r = -EINVAL;
774                 goto bad_store;
775         }
776
777         argv += args_used;
778         argc -= args_used;
779
780         r = dm_get_device(ti, origin_path, 0, ti->len, origin_mode, &s->origin);
781         if (r) {
782                 ti->error = "Cannot get origin device";
783                 goto bad_origin;
784         }
785
786         s->ti = ti;
787         s->valid = 1;
788         s->active = 0;
789         s->suspended = 0;
790         atomic_set(&s->pending_exceptions_count, 0);
791         init_rwsem(&s->lock);
792         INIT_LIST_HEAD(&s->list);
793         spin_lock_init(&s->pe_lock);
794
795         /* Allocate hash table for COW data */
796         if (init_hash_tables(s)) {
797                 ti->error = "Unable to allocate hash table space";
798                 r = -ENOMEM;
799                 goto bad_hash_tables;
800         }
801
802         r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
803         if (r) {
804                 ti->error = "Could not create kcopyd client";
805                 goto bad_kcopyd;
806         }
807
808         s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
809         if (!s->pending_pool) {
810                 ti->error = "Could not allocate mempool for pending exceptions";
811                 goto bad_pending_pool;
812         }
813
814         s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS,
815                                                          tracked_chunk_cache);
816         if (!s->tracked_chunk_pool) {
817                 ti->error = "Could not allocate tracked_chunk mempool for "
818                             "tracking reads";
819                 goto bad_tracked_chunk_pool;
820         }
821
822         for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
823                 INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
824
825         spin_lock_init(&s->tracked_chunk_lock);
826
827         bio_list_init(&s->queued_bios);
828         INIT_WORK(&s->queued_bios_work, flush_queued_bios);
829
830         ti->private = s;
831         ti->num_flush_requests = num_flush_requests;
832
833         /* Add snapshot to the list of snapshots for this origin */
834         /* Exceptions aren't triggered till snapshot_resume() is called */
835         r = register_snapshot(s);
836         if (r == -ENOMEM) {
837                 ti->error = "Snapshot origin struct allocation failed";
838                 goto bad_load_and_register;
839         } else if (r < 0) {
840                 /* invalid handover, register_snapshot has set ti->error */
841                 goto bad_load_and_register;
842         }
843
844         /*
845          * Metadata must only be loaded into one table at once, so skip this
846          * if metadata will be handed over during resume.
847          * Chunk size will be set during the handover - set it to zero to
848          * ensure it's ignored.
849          */
850         if (r > 0) {
851                 s->store->chunk_size = 0;
852                 return 0;
853         }
854
855         r = s->store->type->read_metadata(s->store, dm_add_exception,
856                                           (void *)s);
857         if (r < 0) {
858                 ti->error = "Failed to read snapshot metadata";
859                 goto bad_read_metadata;
860         } else if (r > 0) {
861                 s->valid = 0;
862                 DMWARN("Snapshot is marked invalid.");
863         }
864
865         if (!s->store->chunk_size) {
866                 ti->error = "Chunk size not set";
867                 goto bad_read_metadata;
868         }
869         ti->split_io = s->store->chunk_size;
870
871         return 0;
872
873 bad_read_metadata:
874         unregister_snapshot(s);
875
876 bad_load_and_register:
877         mempool_destroy(s->tracked_chunk_pool);
878
879 bad_tracked_chunk_pool:
880         mempool_destroy(s->pending_pool);
881
882 bad_pending_pool:
883         dm_kcopyd_client_destroy(s->kcopyd_client);
884
885 bad_kcopyd:
886         dm_exception_table_exit(&s->pending, pending_cache);
887         dm_exception_table_exit(&s->complete, exception_cache);
888
889 bad_hash_tables:
890         dm_put_device(ti, s->origin);
891
892 bad_origin:
893         dm_exception_store_destroy(s->store);
894
895 bad_store:
896         dm_put_device(ti, s->cow);
897
898 bad_cow:
899         kfree(s);
900
901 bad:
902         return r;
903 }
904
905 static void __free_exceptions(struct dm_snapshot *s)
906 {
907         dm_kcopyd_client_destroy(s->kcopyd_client);
908         s->kcopyd_client = NULL;
909
910         dm_exception_table_exit(&s->pending, pending_cache);
911         dm_exception_table_exit(&s->complete, exception_cache);
912 }
913
914 static void __handover_exceptions(struct dm_snapshot *snap_src,
915                                   struct dm_snapshot *snap_dest)
916 {
917         union {
918                 struct dm_exception_table table_swap;
919                 struct dm_exception_store *store_swap;
920         } u;
921
922         /*
923          * Swap all snapshot context information between the two instances.
924          */
925         u.table_swap = snap_dest->complete;
926         snap_dest->complete = snap_src->complete;
927         snap_src->complete = u.table_swap;
928
929         u.store_swap = snap_dest->store;
930         snap_dest->store = snap_src->store;
931         snap_src->store = u.store_swap;
932
933         snap_dest->store->snap = snap_dest;
934         snap_src->store->snap = snap_src;
935
936         snap_dest->ti->split_io = snap_dest->store->chunk_size;
937         snap_dest->valid = snap_src->valid;
938
939         /*
940          * Set source invalid to ensure it receives no further I/O.
941          */
942         snap_src->valid = 0;
943 }
944
945 static void snapshot_dtr(struct dm_target *ti)
946 {
947 #ifdef CONFIG_DM_DEBUG
948         int i;
949 #endif
950         struct dm_snapshot *s = ti->private;
951         struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
952
953         flush_workqueue(ksnapd);
954
955         down_read(&_origins_lock);
956         /* Check whether exception handover must be cancelled */
957         (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
958         if (snap_src && snap_dest && (s == snap_src)) {
959                 down_write(&snap_dest->lock);
960                 snap_dest->valid = 0;
961                 up_write(&snap_dest->lock);
962                 DMERR("Cancelling snapshot handover.");
963         }
964         up_read(&_origins_lock);
965
966         /* Prevent further origin writes from using this snapshot. */
967         /* After this returns there can be no new kcopyd jobs. */
968         unregister_snapshot(s);
969
970         while (atomic_read(&s->pending_exceptions_count))
971                 msleep(1);
972         /*
973          * Ensure instructions in mempool_destroy aren't reordered
974          * before atomic_read.
975          */
976         smp_mb();
977
978 #ifdef CONFIG_DM_DEBUG
979         for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
980                 BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
981 #endif
982
983         mempool_destroy(s->tracked_chunk_pool);
984
985         __free_exceptions(s);
986
987         mempool_destroy(s->pending_pool);
988
989         dm_put_device(ti, s->origin);
990
991         dm_exception_store_destroy(s->store);
992
993         dm_put_device(ti, s->cow);
994
995         kfree(s);
996 }
997
998 /*
999  * Flush a list of buffers.
1000  */
1001 static void flush_bios(struct bio *bio)
1002 {
1003         struct bio *n;
1004
1005         while (bio) {
1006                 n = bio->bi_next;
1007                 bio->bi_next = NULL;
1008                 generic_make_request(bio);
1009                 bio = n;
1010         }
1011 }
1012
1013 static void flush_queued_bios(struct work_struct *work)
1014 {
1015         struct dm_snapshot *s =
1016                 container_of(work, struct dm_snapshot, queued_bios_work);
1017         struct bio *queued_bios;
1018         unsigned long flags;
1019
1020         spin_lock_irqsave(&s->pe_lock, flags);
1021         queued_bios = bio_list_get(&s->queued_bios);
1022         spin_unlock_irqrestore(&s->pe_lock, flags);
1023
1024         flush_bios(queued_bios);
1025 }
1026
1027 static int do_origin(struct dm_dev *origin, struct bio *bio);
1028
1029 /*
1030  * Flush a list of buffers.
1031  */
1032 static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
1033 {
1034         struct bio *n;
1035         int r;
1036
1037         while (bio) {
1038                 n = bio->bi_next;
1039                 bio->bi_next = NULL;
1040                 r = do_origin(s->origin, bio);
1041                 if (r == DM_MAPIO_REMAPPED)
1042                         generic_make_request(bio);
1043                 bio = n;
1044         }
1045 }
1046
1047 /*
1048  * Error a list of buffers.
1049  */
1050 static void error_bios(struct bio *bio)
1051 {
1052         struct bio *n;
1053
1054         while (bio) {
1055                 n = bio->bi_next;
1056                 bio->bi_next = NULL;
1057                 bio_io_error(bio);
1058                 bio = n;
1059         }
1060 }
1061
1062 static void __invalidate_snapshot(struct dm_snapshot *s, int err)
1063 {
1064         if (!s->valid)
1065                 return;
1066
1067         if (err == -EIO)
1068                 DMERR("Invalidating snapshot: Error reading/writing.");
1069         else if (err == -ENOMEM)
1070                 DMERR("Invalidating snapshot: Unable to allocate exception.");
1071
1072         if (s->store->type->drop_snapshot)
1073                 s->store->type->drop_snapshot(s->store);
1074
1075         s->valid = 0;
1076
1077         dm_table_event(s->ti->table);
1078 }
1079
1080 static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1081 {
1082         struct dm_exception *e;
1083         struct dm_snapshot *s = pe->snap;
1084         struct bio *origin_bios = NULL;
1085         struct bio *snapshot_bios = NULL;
1086         int error = 0;
1087
1088         if (!success) {
1089                 /* Read/write error - snapshot is unusable */
1090                 down_write(&s->lock);
1091                 __invalidate_snapshot(s, -EIO);
1092                 error = 1;
1093                 goto out;
1094         }
1095
1096         e = alloc_completed_exception();
1097         if (!e) {
1098                 down_write(&s->lock);
1099                 __invalidate_snapshot(s, -ENOMEM);
1100                 error = 1;
1101                 goto out;
1102         }
1103         *e = pe->e;
1104
1105         down_write(&s->lock);
1106         if (!s->valid) {
1107                 free_completed_exception(e);
1108                 error = 1;
1109                 goto out;
1110         }
1111
1112         /* Check for conflicting reads */
1113         __check_for_conflicting_io(s, pe->e.old_chunk);
1114
1115         /*
1116          * Add a proper exception, and remove the
1117          * in-flight exception from the list.
1118          */
1119         dm_insert_exception(&s->complete, e);
1120
1121  out:
1122         dm_remove_exception(&pe->e);
1123         snapshot_bios = bio_list_get(&pe->snapshot_bios);
1124         origin_bios = bio_list_get(&pe->origin_bios);
1125         free_pending_exception(pe);
1126
1127         up_write(&s->lock);
1128
1129         /* Submit any pending write bios */
1130         if (error)
1131                 error_bios(snapshot_bios);
1132         else
1133                 flush_bios(snapshot_bios);
1134
1135         retry_origin_bios(s, origin_bios);
1136 }
1137
1138 static void commit_callback(void *context, int success)
1139 {
1140         struct dm_snap_pending_exception *pe = context;
1141
1142         pending_complete(pe, success);
1143 }
1144
1145 /*
1146  * Called when the copy I/O has finished.  kcopyd actually runs
1147  * this code so don't block.
1148  */
1149 static void copy_callback(int read_err, unsigned long write_err, void *context)
1150 {
1151         struct dm_snap_pending_exception *pe = context;
1152         struct dm_snapshot *s = pe->snap;
1153
1154         if (read_err || write_err)
1155                 pending_complete(pe, 0);
1156
1157         else
1158                 /* Update the metadata if we are persistent */
1159                 s->store->type->commit_exception(s->store, &pe->e,
1160                                                  commit_callback, pe);
1161 }
1162
1163 /*
1164  * Dispatches the copy operation to kcopyd.
1165  */
1166 static void start_copy(struct dm_snap_pending_exception *pe)
1167 {
1168         struct dm_snapshot *s = pe->snap;
1169         struct dm_io_region src, dest;
1170         struct block_device *bdev = s->origin->bdev;
1171         sector_t dev_size;
1172
1173         dev_size = get_dev_size(bdev);
1174
1175         src.bdev = bdev;
1176         src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
1177         src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
1178
1179         dest.bdev = s->cow->bdev;
1180         dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
1181         dest.count = src.count;
1182
1183         /* Hand over to kcopyd */
1184         dm_kcopyd_copy(s->kcopyd_client,
1185                     &src, 1, &dest, 0, copy_callback, pe);
1186 }
1187
1188 static struct dm_snap_pending_exception *
1189 __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
1190 {
1191         struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
1192
1193         if (!e)
1194                 return NULL;
1195
1196         return container_of(e, struct dm_snap_pending_exception, e);
1197 }
1198
1199 /*
1200  * Looks to see if this snapshot already has a pending exception
1201  * for this chunk, otherwise it allocates a new one and inserts
1202  * it into the pending table.
1203  *
1204  * NOTE: a write lock must be held on snap->lock before calling
1205  * this.
1206  */
1207 static struct dm_snap_pending_exception *
1208 __find_pending_exception(struct dm_snapshot *s,
1209                          struct dm_snap_pending_exception *pe, chunk_t chunk)
1210 {
1211         struct dm_snap_pending_exception *pe2;
1212
1213         pe2 = __lookup_pending_exception(s, chunk);
1214         if (pe2) {
1215                 free_pending_exception(pe);
1216                 return pe2;
1217         }
1218
1219         pe->e.old_chunk = chunk;
1220         bio_list_init(&pe->origin_bios);
1221         bio_list_init(&pe->snapshot_bios);
1222         pe->started = 0;
1223
1224         if (s->store->type->prepare_exception(s->store, &pe->e)) {
1225                 free_pending_exception(pe);
1226                 return NULL;
1227         }
1228
1229         dm_insert_exception(&s->pending, &pe->e);
1230
1231         return pe;
1232 }
1233
1234 static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
1235                             struct bio *bio, chunk_t chunk)
1236 {
1237         bio->bi_bdev = s->cow->bdev;
1238         bio->bi_sector = chunk_to_sector(s->store,
1239                                          dm_chunk_number(e->new_chunk) +
1240                                          (chunk - e->old_chunk)) +
1241                                          (bio->bi_sector &
1242                                           s->store->chunk_mask);
1243 }
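/*
 * Remapping example (illustrative numbers): with 8-sector chunks
 * (chunk_shift == 3, chunk_mask == 7), a bio at sector 453 lies in
 * chunk 453 >> 3 == 56.  If chunk 56 is covered by an exception with
 * old_chunk == 56 and new_chunk == 10, the bio is redirected to the COW
 * device at sector (10 << 3) + (453 & 7) == 85, preserving the offset
 * within the chunk.
 */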
1244
1245 static int snapshot_map(struct dm_target *ti, struct bio *bio,
1246                         union map_info *map_context)
1247 {
1248         struct dm_exception *e;
1249         struct dm_snapshot *s = ti->private;
1250         int r = DM_MAPIO_REMAPPED;
1251         chunk_t chunk;
1252         struct dm_snap_pending_exception *pe = NULL;
1253
1254         if (unlikely(bio_empty_barrier(bio))) {
1255                 bio->bi_bdev = s->cow->bdev;
1256                 return DM_MAPIO_REMAPPED;
1257         }
1258
1259         chunk = sector_to_chunk(s->store, bio->bi_sector);
1260
1261         /* Full snapshots are not usable */
1262         /* To get here the table must be live so s->active is always set. */
1263         if (!s->valid)
1264                 return -EIO;
1265
1266         /* FIXME: should only take write lock if we need
1267          * to copy an exception */
1268         down_write(&s->lock);
1269
1270         if (!s->valid) {
1271                 r = -EIO;
1272                 goto out_unlock;
1273         }
1274
1275         /* If the block is already remapped - use that, else remap it */
1276         e = dm_lookup_exception(&s->complete, chunk);
1277         if (e) {
1278                 remap_exception(s, e, bio, chunk);
1279                 goto out_unlock;
1280         }
1281
1282         /*
1283          * Write to snapshot - higher level takes care of RW/RO
1284          * flags so we should only get this if we are
1285          * writeable.
1286          */
1287         if (bio_rw(bio) == WRITE) {
1288                 pe = __lookup_pending_exception(s, chunk);
1289                 if (!pe) {
1290                         up_write(&s->lock);
1291                         pe = alloc_pending_exception(s);
1292                         down_write(&s->lock);
1293
1294                         if (!s->valid) {
1295                                 free_pending_exception(pe);
1296                                 r = -EIO;
1297                                 goto out_unlock;
1298                         }
1299
1300                         e = dm_lookup_exception(&s->complete, chunk);
1301                         if (e) {
1302                                 free_pending_exception(pe);
1303                                 remap_exception(s, e, bio, chunk);
1304                                 goto out_unlock;
1305                         }
1306
1307                         pe = __find_pending_exception(s, pe, chunk);
1308                         if (!pe) {
1309                                 __invalidate_snapshot(s, -ENOMEM);
1310                                 r = -EIO;
1311                                 goto out_unlock;
1312                         }
1313                 }
1314
1315                 remap_exception(s, &pe->e, bio, chunk);
1316                 bio_list_add(&pe->snapshot_bios, bio);
1317
1318                 r = DM_MAPIO_SUBMITTED;
1319
1320                 if (!pe->started) {
1321                         /* this is protected by snap->lock */
1322                         pe->started = 1;
1323                         up_write(&s->lock);
1324                         start_copy(pe);
1325                         goto out;
1326                 }
1327         } else {
1328                 bio->bi_bdev = s->origin->bdev;
1329                 map_context->ptr = track_chunk(s, chunk);
1330         }
1331
1332  out_unlock:
1333         up_write(&s->lock);
1334  out:
1335         return r;
1336 }
1337
1338 /*
1339  * A snapshot-merge target behaves like a combination of a snapshot
1340  * target and a snapshot-origin target.  It only generates new
1341  * exceptions in other snapshots and not in the one that is being
1342  * merged.
1343  *
1344  * For each chunk, if there is an existing exception, it is used to
1345  * redirect I/O to the cow device.  Otherwise I/O is sent to the origin,
1346  * which in turn might generate exceptions in other snapshots.
1347  */
1348 static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
1349                               union map_info *map_context)
1350 {
1351         struct dm_exception *e;
1352         struct dm_snapshot *s = ti->private;
1353         int r = DM_MAPIO_REMAPPED;
1354         chunk_t chunk;
1355
1356         if (unlikely(bio_empty_barrier(bio))) {
1357                 if (!map_context->flush_request)
1358                         bio->bi_bdev = s->origin->bdev;
1359                 else
1360                         bio->bi_bdev = s->cow->bdev;
1361                 map_context->ptr = NULL;
1362                 return DM_MAPIO_REMAPPED;
1363         }
1364
1365         chunk = sector_to_chunk(s->store, bio->bi_sector);
1366
1367         down_read(&s->lock);
1368
1369         /* Full snapshots are not usable */
1370         if (!s->valid) {
1371                 r = -EIO;
1372                 goto out_unlock;
1373         }
1374
1375         /* If the block is already remapped - use that */
1376         e = dm_lookup_exception(&s->complete, chunk);
1377         if (e) {
1378                 remap_exception(s, e, bio, chunk);
1379                 goto out_unlock;
1380         }
1381
1382         bio->bi_bdev = s->origin->bdev;
1383
1384         if (bio_rw(bio) == WRITE) {
1385                 up_read(&s->lock);
1386                 return do_origin(s->origin, bio);
1387         }
1388
1389 out_unlock:
1390         up_read(&s->lock);
1391
1392         return r;
1393 }
1394
1395 static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
1396                            int error, union map_info *map_context)
1397 {
1398         struct dm_snapshot *s = ti->private;
1399         struct dm_snap_tracked_chunk *c = map_context->ptr;
1400
1401         if (c)
1402                 stop_tracking_chunk(s, c);
1403
1404         return 0;
1405 }
1406
1407 static void snapshot_postsuspend(struct dm_target *ti)
1408 {
1409         struct dm_snapshot *s = ti->private;
1410
1411         down_write(&s->lock);
1412         s->suspended = 1;
1413         up_write(&s->lock);
1414 }
1415
1416 static int snapshot_preresume(struct dm_target *ti)
1417 {
1418         int r = 0;
1419         struct dm_snapshot *s = ti->private;
1420         struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1421
1422         down_read(&_origins_lock);
1423         (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1424         if (snap_src && snap_dest) {
1425                 down_read(&snap_src->lock);
1426                 if (s == snap_src) {
1427                         DMERR("Unable to resume snapshot source until "
1428                               "handover completes.");
1429                         r = -EINVAL;
1430                 } else if (!snap_src->suspended) {
1431                         DMERR("Unable to perform snapshot handover until "
1432                               "source is suspended.");
1433                         r = -EINVAL;
1434                 }
1435                 up_read(&snap_src->lock);
1436         }
1437         up_read(&_origins_lock);
1438
1439         return r;
1440 }
1441
1442 static void snapshot_resume(struct dm_target *ti)
1443 {
1444         struct dm_snapshot *s = ti->private;
1445         struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1446
1447         down_read(&_origins_lock);
1448         (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1449         if (snap_src && snap_dest) {
1450                 down_write(&snap_src->lock);
1451                 down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
1452                 __handover_exceptions(snap_src, snap_dest);
1453                 up_write(&snap_dest->lock);
1454                 up_write(&snap_src->lock);
1455         }
1456         up_read(&_origins_lock);
1457
1458         /* Now we have correct chunk size, reregister */
1459         reregister_snapshot(s);
1460
1461         down_write(&s->lock);
1462         s->active = 1;
1463         s->suspended = 0;
1464         up_write(&s->lock);
1465 }
1466
1467 static int snapshot_status(struct dm_target *ti, status_type_t type,
1468                            char *result, unsigned int maxlen)
1469 {
1470         unsigned sz = 0;
1471         struct dm_snapshot *snap = ti->private;
1472
1473         switch (type) {
1474         case STATUSTYPE_INFO:
1475
1476                 down_write(&snap->lock);
1477
1478                 if (!snap->valid)
1479                         DMEMIT("Invalid");
1480                 else {
1481                         if (snap->store->type->usage) {
1482                                 sector_t total_sectors, sectors_allocated,
1483                                          metadata_sectors;
1484                                 snap->store->type->usage(snap->store,
1485                                                          &total_sectors,
1486                                                          &sectors_allocated,
1487                                                          &metadata_sectors);
1488                                 DMEMIT("%llu/%llu %llu",
1489                                        (unsigned long long)sectors_allocated,
1490                                        (unsigned long long)total_sectors,
1491                                        (unsigned long long)metadata_sectors);
1492                         }
1493                         else
1494                                 DMEMIT("Unknown");
1495                 }
1496
1497                 up_write(&snap->lock);
1498
1499                 break;
1500
1501         case STATUSTYPE_TABLE:
1502                 /*
1503                  * kdevname returns a static pointer so we need
1504                  * to make private copies if the output is to
1505                  * make sense.
1506                  */
1507                 DMEMIT("%s %s", snap->origin->name, snap->cow->name);
1508                 snap->store->type->status(snap->store, type, result + sz,
1509                                           maxlen - sz);
1510                 break;
1511         }
1512
1513         return 0;
1514 }
1515
1516 static int snapshot_iterate_devices(struct dm_target *ti,
1517                                     iterate_devices_callout_fn fn, void *data)
1518 {
1519         struct dm_snapshot *snap = ti->private;
1520
1521         return fn(ti, snap->origin, 0, ti->len, data);
1522 }
1523
1524
1525 /*-----------------------------------------------------------------
1526  * Origin methods
1527  *---------------------------------------------------------------*/
1528
1529 /*
1530  * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
1531  * supplied bio is ignored.  The caller may submit it immediately.
1532  * (No remapping actually occurs as the origin is always a direct linear
1533  * map.)
1534  *
1535  * If further exceptions are required, DM_MAPIO_SUBMITTED is returned
1536  * and any supplied bio is added to a list to be submitted once all
1537  * the necessary exceptions exist.
1538  */
1539 static int __origin_write(struct list_head *snapshots, sector_t sector,
1540                           struct bio *bio)
1541 {
1542         int r = DM_MAPIO_REMAPPED;
1543         struct dm_snapshot *snap;
1544         struct dm_exception *e;
1545         struct dm_snap_pending_exception *pe;
1546         struct dm_snap_pending_exception *pe_to_start_now = NULL;
1547         struct dm_snap_pending_exception *pe_to_start_last = NULL;
1548         chunk_t chunk;
1549
1550         /* Do all the snapshots on this origin */
1551         list_for_each_entry (snap, snapshots, list) {
1552                 /*
1553                  * Don't make new exceptions in a merging snapshot
1554                  * because it has effectively been deleted
1555                  */
1556                 if (dm_target_is_snapshot_merge(snap->ti))
1557                         continue;
1558
1559                 down_write(&snap->lock);
1560
1561                 /* Only deal with valid and active snapshots */
1562                 if (!snap->valid || !snap->active)
1563                         goto next_snapshot;
1564
1565                 /* Nothing to do if writing beyond end of snapshot */
1566                 if (sector >= dm_table_get_size(snap->ti->table))
1567                         goto next_snapshot;
1568
1569                 /*
1570                  * Remember, different snapshots can have
1571                  * different chunk sizes.
1572                  */
1573                 chunk = sector_to_chunk(snap->store, sector);
1574
1575                 /*
1576                  * Check exception table to see if block
1577                  * is already remapped in this snapshot
1578                  * and trigger an exception if not.
1579                  */
1580                 e = dm_lookup_exception(&snap->complete, chunk);
1581                 if (e)
1582                         goto next_snapshot;
1583
1584                 pe = __lookup_pending_exception(snap, chunk);
1585                 if (!pe) {
1586                         up_write(&snap->lock);
1587                         pe = alloc_pending_exception(snap);
1588                         down_write(&snap->lock);
1589
1590                         if (!snap->valid) {
1591                                 free_pending_exception(pe);
1592                                 goto next_snapshot;
1593                         }
1594
1595                         e = dm_lookup_exception(&snap->complete, chunk);
1596                         if (e) {
1597                                 free_pending_exception(pe);
1598                                 goto next_snapshot;
1599                         }
1600
1601                         pe = __find_pending_exception(snap, pe, chunk);
1602                         if (!pe) {
1603                                 __invalidate_snapshot(snap, -ENOMEM);
1604                                 goto next_snapshot;
1605                         }
1606                 }
1607
1608                 r = DM_MAPIO_SUBMITTED;
1609
1610                 /*
1611                  * If an origin bio was supplied, queue it to wait for the
1612                  * completion of this exception, and start this one last,
1613                  * at the end of the function.
1614                  */
1615                 if (bio) {
1616                         bio_list_add(&pe->origin_bios, bio);
1617                         bio = NULL;
1618
1619                         if (!pe->started) {
1620                                 pe->started = 1;
1621                                 pe_to_start_last = pe;
1622                         }
1623                 }
1624
1625                 if (!pe->started) {
1626                         pe->started = 1;
1627                         pe_to_start_now = pe;
1628                 }
1629
1630  next_snapshot:
1631                 up_write(&snap->lock);
1632
1633                 if (pe_to_start_now) {
1634                         start_copy(pe_to_start_now);
1635                         pe_to_start_now = NULL;
1636                 }
1637         }
1638
1639         /*
1640          * Submit the exception against which the bio is queued last,
1641          * to give the other exceptions a head start.
1642          */
1643         if (pe_to_start_last)
1644                 start_copy(pe_to_start_last);
1645
1646         return r;
1647 }
1648
1649 /*
1650  * Called on a write from the origin driver.
1651  */
1652 static int do_origin(struct dm_dev *origin, struct bio *bio)
1653 {
1654         struct origin *o;
1655         int r = DM_MAPIO_REMAPPED;
1656
1657         down_read(&_origins_lock);
1658         o = __lookup_origin(origin->bdev);
1659         if (o)
1660                 r = __origin_write(&o->snapshots, bio->bi_sector, bio);
1661         up_read(&_origins_lock);
1662
1663         return r;
1664 }
1665
1666 /*
1667  * Origin: maps a linear range of a device, with hooks for snapshotting.
1668  */
1669
1670 /*
1671  * Construct an origin mapping: <dev_path>
1672  * The context for an origin is merely a 'struct dm_dev *'
1673  * pointing to the real device.
1674  */
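/*
 * Example table line (illustrative device name): exposing all of
 * /dev/vg/origin through the origin target, so that writes to it trigger
 * exceptions in any snapshots registered against it:
 *
 *	0 2097152 snapshot-origin /dev/vg/origin
 */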
1675 static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1676 {
1677         int r;
1678         struct dm_dev *dev;
1679
1680         if (argc != 1) {
1681                 ti->error = "origin: incorrect number of arguments";
1682                 return -EINVAL;
1683         }
1684
1685         r = dm_get_device(ti, argv[0], 0, ti->len,
1686                           dm_table_get_mode(ti->table), &dev);
1687         if (r) {
1688                 ti->error = "Cannot get target device";
1689                 return r;
1690         }
1691
1692         ti->private = dev;
1693         ti->num_flush_requests = 1;
1694
1695         return 0;
1696 }
1697
1698 static void origin_dtr(struct dm_target *ti)
1699 {
1700         struct dm_dev *dev = ti->private;
1701         dm_put_device(ti, dev);
1702 }
1703
1704 static int origin_map(struct dm_target *ti, struct bio *bio,
1705                       union map_info *map_context)
1706 {
1707         struct dm_dev *dev = ti->private;
1708         bio->bi_bdev = dev->bdev;
1709
1710         if (unlikely(bio_empty_barrier(bio)))
1711                 return DM_MAPIO_REMAPPED;
1712
1713         /* Only tell snapshots if this is a write */
1714         return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
1715 }
1716
1717 /*
1718  * Set the target "split_io" field to the minimum of all the snapshots'
1719  * chunk sizes.
1720  */
1721 static void origin_resume(struct dm_target *ti)
1722 {
1723         struct dm_dev *dev = ti->private;
1724
1725         down_read(&_origins_lock);
1726
1727         ti->split_io = __minimum_chunk_size(__lookup_origin(dev->bdev));
1728
1729         up_read(&_origins_lock);
1730 }
1731
1732 static int origin_status(struct dm_target *ti, status_type_t type, char *result,
1733                          unsigned int maxlen)
1734 {
1735         struct dm_dev *dev = ti->private;
1736
1737         switch (type) {
1738         case STATUSTYPE_INFO:
1739                 result[0] = '\0';
1740                 break;
1741
1742         case STATUSTYPE_TABLE:
1743                 snprintf(result, maxlen, "%s", dev->name);
1744                 break;
1745         }
1746
1747         return 0;
1748 }
1749
1750 static int origin_iterate_devices(struct dm_target *ti,
1751                                   iterate_devices_callout_fn fn, void *data)
1752 {
1753         struct dm_dev *dev = ti->private;
1754
1755         return fn(ti, dev, 0, ti->len, data);
1756 }
1757
1758 static struct target_type origin_target = {
1759         .name    = "snapshot-origin",
1760         .version = {1, 7, 0},
1761         .module  = THIS_MODULE,
1762         .ctr     = origin_ctr,
1763         .dtr     = origin_dtr,
1764         .map     = origin_map,
1765         .resume  = origin_resume,
1766         .status  = origin_status,
1767         .iterate_devices = origin_iterate_devices,
1768 };
1769
1770 static struct target_type snapshot_target = {
1771         .name    = "snapshot",
1772         .version = {1, 9, 0},
1773         .module  = THIS_MODULE,
1774         .ctr     = snapshot_ctr,
1775         .dtr     = snapshot_dtr,
1776         .map     = snapshot_map,
1777         .end_io  = snapshot_end_io,
1778         .postsuspend = snapshot_postsuspend,
1779         .preresume  = snapshot_preresume,
1780         .resume  = snapshot_resume,
1781         .status  = snapshot_status,
1782         .iterate_devices = snapshot_iterate_devices,
1783 };
1784
1785 static struct target_type merge_target = {
1786         .name    = dm_snapshot_merge_target_name,
1787         .version = {1, 0, 0},
1788         .module  = THIS_MODULE,
1789         .ctr     = snapshot_ctr,
1790         .dtr     = snapshot_dtr,
1791         .map     = snapshot_merge_map,
1792         .end_io  = snapshot_end_io,
1793         .postsuspend = snapshot_postsuspend,
1794         .preresume  = snapshot_preresume,
1795         .resume  = snapshot_resume,
1796         .status  = snapshot_status,
1797         .iterate_devices = snapshot_iterate_devices,
1798 };
1799
1800 static int __init dm_snapshot_init(void)
1801 {
1802         int r;
1803
1804         r = dm_exception_store_init();
1805         if (r) {
1806                 DMERR("Failed to initialize exception stores");
1807                 return r;
1808         }
1809
1810         r = dm_register_target(&snapshot_target);
1811         if (r < 0) {
1812                 DMERR("snapshot target register failed %d", r);
1813                 goto bad_register_snapshot_target;
1814         }
1815
1816         r = dm_register_target(&origin_target);
1817         if (r < 0) {
1818                 DMERR("Origin target register failed %d", r);
1819                 goto bad_register_origin_target;
1820         }
1821
1822         r = dm_register_target(&merge_target);
1823         if (r < 0) {
1824                 DMERR("Merge target register failed %d", r);
1825                 goto bad_register_merge_target;
1826         }
1827
1828         r = init_origin_hash();
1829         if (r) {
1830                 DMERR("init_origin_hash failed.");
1831                 goto bad_origin_hash;
1832         }
1833
1834         exception_cache = KMEM_CACHE(dm_exception, 0);
1835         if (!exception_cache) {
1836                 DMERR("Couldn't create exception cache.");
1837                 r = -ENOMEM;
1838                 goto bad_exception_cache;
1839         }
1840
1841         pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
1842         if (!pending_cache) {
1843                 DMERR("Couldn't create pending cache.");
1844                 r = -ENOMEM;
1845                 goto bad_pending_cache;
1846         }
1847
1848         tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
1849         if (!tracked_chunk_cache) {
1850                 DMERR("Couldn't create cache to track chunks in use.");
1851                 r = -ENOMEM;
1852                 goto bad_tracked_chunk_cache;
1853         }
1854
1855         ksnapd = create_singlethread_workqueue("ksnapd");
1856         if (!ksnapd) {
1857                 DMERR("Failed to create ksnapd workqueue.");
1858                 r = -ENOMEM;
1859                 goto bad_pending_pool;
1860         }
1861
1862         return 0;
1863
1864 bad_pending_pool:
1865         kmem_cache_destroy(tracked_chunk_cache);
1866 bad_tracked_chunk_cache:
1867         kmem_cache_destroy(pending_cache);
1868 bad_pending_cache:
1869         kmem_cache_destroy(exception_cache);
1870 bad_exception_cache:
1871         exit_origin_hash();
1872 bad_origin_hash:
1873         dm_unregister_target(&merge_target);
1874 bad_register_merge_target:
1875         dm_unregister_target(&origin_target);
1876 bad_register_origin_target:
1877         dm_unregister_target(&snapshot_target);
1878 bad_register_snapshot_target:
1879         dm_exception_store_exit();
1880
1881         return r;
1882 }
1883
1884 static void __exit dm_snapshot_exit(void)
1885 {
1886         destroy_workqueue(ksnapd);
1887
1888         dm_unregister_target(&snapshot_target);
1889         dm_unregister_target(&origin_target);
1890         dm_unregister_target(&merge_target);
1891
1892         exit_origin_hash();
1893         kmem_cache_destroy(pending_cache);
1894         kmem_cache_destroy(exception_cache);
1895         kmem_cache_destroy(tracked_chunk_cache);
1896
1897         dm_exception_store_exit();
1898 }
1899
1900 /* Module hooks */
1901 module_init(dm_snapshot_init);
1902 module_exit(dm_snapshot_exit);
1903
1904 MODULE_DESCRIPTION(DM_NAME " snapshot target");
1905 MODULE_AUTHOR("Joe Thornber");
1906 MODULE_LICENSE("GPL");