dm cache: wake the worker thread every time we free a migration object
platform/kernel/linux-exynos.git: drivers/md/dm-cache-target.c
1 /*
2  * Copyright (C) 2012 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6
7 #include "dm.h"
8 #include "dm-bio-prison.h"
9 #include "dm-bio-record.h"
10 #include "dm-cache-metadata.h"
11
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/jiffies.h>
15 #include <linux/init.h>
16 #include <linux/mempool.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20
21 #define DM_MSG_PREFIX "cache"
22
23 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
24         "A percentage of time allocated for copying to and/or from cache");
25
26 /*----------------------------------------------------------------*/
27
28 #define IOT_RESOLUTION 4
29
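/*
 * Tracks the number of sectors of in-flight IO to a device and records
 * when the device last went idle, so callers can ask whether it has been
 * idle for a given number of jiffies (see iot_idle_for()).
 */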
30 struct io_tracker {
31         spinlock_t lock;
32
33         /*
34          * Sectors of in-flight IO.
35          */
36         sector_t in_flight;
37
38         /*
39          * The time, in jiffies, when this device became idle (if it is
40          * indeed idle).
41          */
42         unsigned long idle_time;
43         unsigned long last_update_time;
44 };
45
46 static void iot_init(struct io_tracker *iot)
47 {
48         spin_lock_init(&iot->lock);
49         iot->in_flight = 0ul;
50         iot->idle_time = 0ul;
51         iot->last_update_time = jiffies;
52 }
53
54 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
55 {
56         if (iot->in_flight)
57                 return false;
58
59         return time_after(jiffies, iot->idle_time + jifs);
60 }
61
62 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
63 {
64         bool r;
65         unsigned long flags;
66
67         spin_lock_irqsave(&iot->lock, flags);
68         r = __iot_idle_for(iot, jifs);
69         spin_unlock_irqrestore(&iot->lock, flags);
70
71         return r;
72 }
73
74 static void iot_io_begin(struct io_tracker *iot, sector_t len)
75 {
76         unsigned long flags;
77
78         spin_lock_irqsave(&iot->lock, flags);
79         iot->in_flight += len;
80         spin_unlock_irqrestore(&iot->lock, flags);
81 }
82
83 static void __iot_io_end(struct io_tracker *iot, sector_t len)
84 {
85         iot->in_flight -= len;
86         if (!iot->in_flight)
87                 iot->idle_time = jiffies;
88 }
89
90 static void iot_io_end(struct io_tracker *iot, sector_t len)
91 {
92         unsigned long flags;
93
94         spin_lock_irqsave(&iot->lock, flags);
95         __iot_io_end(iot, len);
96         spin_unlock_irqrestore(&iot->lock, flags);
97 }
98
99 /*----------------------------------------------------------------*/
100
101 /*
102  * Glossary:
103  *
104  * oblock: index of an origin block
105  * cblock: index of a cache block
106  * promotion: movement of a block from origin to cache
107  * demotion: movement of a block from cache to origin
108  * migration: movement of a block between the origin and cache device,
109  *            either direction
110  */
111
112 /*----------------------------------------------------------------*/
113
114 /*
115  * There are a couple of places where we let a bio run, but want to do some
116  * work before calling its endio function.  We do this by temporarily
117  * changing the endio fn.
118  */
119 struct dm_hook_info {
120         bio_end_io_t *bi_end_io;
121         void *bi_private;
122 };
123
124 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
125                         bio_end_io_t *bi_end_io, void *bi_private)
126 {
127         h->bi_end_io = bio->bi_end_io;
128         h->bi_private = bio->bi_private;
129
130         bio->bi_end_io = bi_end_io;
131         bio->bi_private = bi_private;
132 }
133
134 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
135 {
136         bio->bi_end_io = h->bi_end_io;
137         bio->bi_private = h->bi_private;
138 }
139
140 /*----------------------------------------------------------------*/
141
142 #define MIGRATION_POOL_SIZE 128
143 #define COMMIT_PERIOD HZ
144 #define MIGRATION_COUNT_WINDOW 10
145
146 /*
147  * The block size of the device holding cache data must be
148  * between 32KB and 1GB.
149  */
150 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
151 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
152
153 /*
154  * FIXME: the cache is read/write for the time being.
155  */
156 enum cache_metadata_mode {
157         CM_WRITE,               /* metadata may be changed */
158         CM_READ_ONLY,           /* metadata may not be changed */
159 };
160
161 enum cache_io_mode {
162         /*
163          * Data is written to cached blocks only.  These blocks are marked
164          * dirty.  If you lose the cache device you will lose data.
165          * Potential performance increase for both reads and writes.
166          */
167         CM_IO_WRITEBACK,
168
169         /*
170          * Data is written to both cache and origin.  Blocks are never
171          * dirty.  Potential performance benefit for reads only.
172          */
173         CM_IO_WRITETHROUGH,
174
175         /*
176          * A degraded mode useful for various cache coherency situations
177          * (eg, rolling back snapshots).  Reads and writes always go to the
178          * origin.  If a write goes to a cached oblock, then the cache
179          * block is invalidated.
180          */
181         CM_IO_PASSTHROUGH
182 };
183
184 struct cache_features {
185         enum cache_metadata_mode mode;
186         enum cache_io_mode io_mode;
187 };
188
189 struct cache_stats {
190         atomic_t read_hit;
191         atomic_t read_miss;
192         atomic_t write_hit;
193         atomic_t write_miss;
194         atomic_t demotion;
195         atomic_t promotion;
196         atomic_t copies_avoided;
197         atomic_t cache_cell_clash;
198         atomic_t commit_count;
199         atomic_t discard_count;
200 };
201
202 /*
203  * Defines a range of cblocks; begin to (end - 1) are in the range and
204  * end is the one-past-the-end value.
205  */
206 struct cblock_range {
207         dm_cblock_t begin;
208         dm_cblock_t end;
209 };
210
211 struct invalidation_request {
212         struct list_head list;
213         struct cblock_range *cblocks;
214
215         atomic_t complete;
216         int err;
217
218         wait_queue_head_t result_wait;
219 };
220
221 struct cache {
222         struct dm_target *ti;
223         struct dm_target_callbacks callbacks;
224
225         struct dm_cache_metadata *cmd;
226
227         /*
228          * Metadata is written to this device.
229          */
230         struct dm_dev *metadata_dev;
231
232         /*
233          * The slower of the two data devices.  Typically a spindle.
234          */
235         struct dm_dev *origin_dev;
236
237         /*
238          * The faster of the two data devices.  Typically an SSD.
239          */
240         struct dm_dev *cache_dev;
241
242         /*
243          * Size of the origin device in _complete_ blocks and native sectors.
244          */
245         dm_oblock_t origin_blocks;
246         sector_t origin_sectors;
247
248         /*
249          * Size of the cache device in blocks.
250          */
251         dm_cblock_t cache_size;
252
253         /*
254          * Fields for converting from sectors to blocks.
255          */
256         uint32_t sectors_per_block;
257         int sectors_per_block_shift;
258
259         spinlock_t lock;
260         struct list_head deferred_cells;
261         struct bio_list deferred_bios;
262         struct bio_list deferred_flush_bios;
263         struct bio_list deferred_writethrough_bios;
264         struct list_head quiesced_migrations;
265         struct list_head completed_migrations;
266         struct list_head need_commit_migrations;
267         sector_t migration_threshold;
268         wait_queue_head_t migration_wait;
269         atomic_t nr_allocated_migrations;
270
271         /*
272          * The number of in flight migrations that are performing
273          * background io. eg, promotion, writeback.
274          */
275         atomic_t nr_io_migrations;
276
277         wait_queue_head_t quiescing_wait;
278         atomic_t quiescing;
279         atomic_t quiescing_ack;
280
281         /*
282          * cache_size entries, dirty if set
283          */
284         atomic_t nr_dirty;
285         unsigned long *dirty_bitset;
286
287         /*
288          * origin_blocks entries, discarded if set.
289          */
290         dm_dblock_t discard_nr_blocks;
291         unsigned long *discard_bitset;
292         uint32_t discard_block_size; /* a power of 2 times sectors per block */
293
294         /*
295          * Rather than reconstructing the table line for the status we just
296          * save it and regurgitate.
297          */
298         unsigned nr_ctr_args;
299         const char **ctr_args;
300
301         struct dm_kcopyd_client *copier;
302         struct workqueue_struct *wq;
303         struct work_struct worker;
304
305         struct delayed_work waker;
306         unsigned long last_commit_jiffies;
307
308         struct dm_bio_prison *prison;
309         struct dm_deferred_set *all_io_ds;
310
311         mempool_t *migration_pool;
312
313         struct dm_cache_policy *policy;
314         unsigned policy_nr_args;
315
316         bool need_tick_bio:1;
317         bool sized:1;
318         bool invalidate:1;
319         bool commit_requested:1;
320         bool loaded_mappings:1;
321         bool loaded_discards:1;
322
323         /*
324          * Cache features such as write-through.
325          */
326         struct cache_features features;
327
328         struct cache_stats stats;
329
330         /*
331          * Invalidation fields.
332          */
333         spinlock_t invalidation_lock;
334         struct list_head invalidation_requests;
335
336         struct io_tracker origin_tracker;
337 };
338
339 struct per_bio_data {
340         bool tick:1;
341         unsigned req_nr:2;
342         struct dm_deferred_entry *all_io_entry;
343         struct dm_hook_info hook_info;
344         sector_t len;
345
346         /*
347          * writethrough fields.  These MUST remain at the end of this
348          * structure and the 'cache' member must be the first as it
349          * is used to determine the offset of the writethrough fields.
350          */
351         struct cache *cache;
352         dm_cblock_t cblock;
353         struct dm_bio_details bio_details;
354 };
355
356 struct dm_cache_migration {
357         struct list_head list;
358         struct cache *cache;
359
360         unsigned long start_jiffies;
361         dm_oblock_t old_oblock;
362         dm_oblock_t new_oblock;
363         dm_cblock_t cblock;
364
365         bool err:1;
366         bool discard:1;
367         bool writeback:1;
368         bool demote:1;
369         bool promote:1;
370         bool requeue_holder:1;
371         bool invalidate:1;
372
373         struct dm_bio_prison_cell *old_ocell;
374         struct dm_bio_prison_cell *new_ocell;
375 };
376
377 /*
378  * Processing a bio in the worker thread may require these memory
379  * allocations.  We prealloc to avoid deadlocks (the same worker thread
380  * frees them back to the mempool).
381  */
382 struct prealloc {
383         struct dm_cache_migration *mg;
384         struct dm_bio_prison_cell *cell1;
385         struct dm_bio_prison_cell *cell2;
386 };
387
388 static void wake_worker(struct cache *cache)
389 {
390         queue_work(cache->wq, &cache->worker);
391 }
392
393 /*----------------------------------------------------------------*/
394
395 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
396 {
397         /* FIXME: change to use a local slab. */
398         return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
399 }
400
401 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
402 {
403         dm_bio_prison_free_cell(cache->prison, cell);
404 }
405
406 static struct dm_cache_migration *alloc_migration(struct cache *cache)
407 {
408         struct dm_cache_migration *mg;
409
410         mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
411         if (mg) {
412                 mg->cache = cache;
413                 atomic_inc(&mg->cache->nr_allocated_migrations);
414         }
415
416         return mg;
417 }
418
419 static void free_migration(struct dm_cache_migration *mg)
420 {
421         struct cache *cache = mg->cache;
422
423         if (atomic_dec_and_test(&cache->nr_allocated_migrations))
424                 wake_up(&cache->migration_wait);
425
426         mempool_free(mg, cache->migration_pool);
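        /*
         * The worker may be stalled waiting for a migration object to
         * become available (see prealloc_data_structs()), so wake it
         * whenever one is returned to the mempool.
         */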
427         wake_worker(cache);
428 }
429
430 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
431 {
432         if (!p->mg) {
433                 p->mg = alloc_migration(cache);
434                 if (!p->mg)
435                         return -ENOMEM;
436         }
437
438         if (!p->cell1) {
439                 p->cell1 = alloc_prison_cell(cache);
440                 if (!p->cell1)
441                         return -ENOMEM;
442         }
443
444         if (!p->cell2) {
445                 p->cell2 = alloc_prison_cell(cache);
446                 if (!p->cell2)
447                         return -ENOMEM;
448         }
449
450         return 0;
451 }
452
453 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
454 {
455         if (p->cell2)
456                 free_prison_cell(cache, p->cell2);
457
458         if (p->cell1)
459                 free_prison_cell(cache, p->cell1);
460
461         if (p->mg)
462                 free_migration(p->mg);
463 }
464
465 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
466 {
467         struct dm_cache_migration *mg = p->mg;
468
469         BUG_ON(!mg);
470         p->mg = NULL;
471
472         return mg;
473 }
474
475 /*
476  * You must have a cell within the prealloc struct to return.  If not, this
477  * function will BUG() rather than returning NULL.
478  */
479 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
480 {
481         struct dm_bio_prison_cell *r = NULL;
482
483         if (p->cell1) {
484                 r = p->cell1;
485                 p->cell1 = NULL;
486
487         } else if (p->cell2) {
488                 r = p->cell2;
489                 p->cell2 = NULL;
490         } else
491                 BUG();
492
493         return r;
494 }
495
496 /*
497  * You can't have more than two cells in a prealloc struct.  BUG() will be
498  * called if you try and overfill.
499  */
500 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
501 {
502         if (!p->cell2)
503                 p->cell2 = cell;
504
505         else if (!p->cell1)
506                 p->cell1 = cell;
507
508         else
509                 BUG();
510 }
511
512 /*----------------------------------------------------------------*/
513
514 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
515 {
516         key->virtual = 0;
517         key->dev = 0;
518         key->block_begin = from_oblock(begin);
519         key->block_end = from_oblock(end);
520 }
521
522 /*
523  * The caller hands in a preallocated cell, and a free function for it.
524  * The cell will be freed if there's an error, or if it wasn't used because
525  * a cell with that key already exists.
526  */
527 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
528
529 static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
530                             struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
531                             cell_free_fn free_fn, void *free_context,
532                             struct dm_bio_prison_cell **cell_result)
533 {
534         int r;
535         struct dm_cell_key key;
536
537         build_key(oblock_begin, oblock_end, &key);
538         r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
539         if (r)
540                 free_fn(free_context, cell_prealloc);
541
542         return r;
543 }
544
545 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
546                       struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
547                       cell_free_fn free_fn, void *free_context,
548                       struct dm_bio_prison_cell **cell_result)
549 {
550         dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
551         return bio_detain_range(cache, oblock, end, bio,
552                                 cell_prealloc, free_fn, free_context, cell_result);
553 }
554
555 static int get_cell(struct cache *cache,
556                     dm_oblock_t oblock,
557                     struct prealloc *structs,
558                     struct dm_bio_prison_cell **cell_result)
559 {
560         int r;
561         struct dm_cell_key key;
562         struct dm_bio_prison_cell *cell_prealloc;
563
564         cell_prealloc = prealloc_get_cell(structs);
565
566         build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
567         r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
568         if (r)
569                 prealloc_put_cell(structs, cell_prealloc);
570
571         return r;
572 }
573
574 /*----------------------------------------------------------------*/
575
576 static bool is_dirty(struct cache *cache, dm_cblock_t b)
577 {
578         return test_bit(from_cblock(b), cache->dirty_bitset);
579 }
580
581 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
582 {
583         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
584                 atomic_inc(&cache->nr_dirty);
585                 policy_set_dirty(cache->policy, oblock);
586         }
587 }
588
589 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
590 {
591         if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
592                 policy_clear_dirty(cache->policy, oblock);
593                 if (atomic_dec_return(&cache->nr_dirty) == 0)
594                         dm_table_event(cache->ti->table);
595         }
596 }
597
598 /*----------------------------------------------------------------*/
599
600 static bool block_size_is_power_of_two(struct cache *cache)
601 {
602         return cache->sectors_per_block_shift >= 0;
603 }
604
605 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
606 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
607 __always_inline
608 #endif
609 static dm_block_t block_div(dm_block_t b, uint32_t n)
610 {
611         do_div(b, n);
612
613         return b;
614 }
615
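/*
 * How many origin blocks a single discard block covers
 * (discard_block_size is held in sectors).
 */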
616 static dm_block_t oblocks_per_dblock(struct cache *cache)
617 {
618         dm_block_t oblocks = cache->discard_block_size;
619
620         if (block_size_is_power_of_two(cache))
621                 oblocks >>= cache->sectors_per_block_shift;
622         else
623                 oblocks = block_div(oblocks, cache->sectors_per_block);
624
625         return oblocks;
626 }
627
628 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
629 {
630         return to_dblock(block_div(from_oblock(oblock),
631                                    oblocks_per_dblock(cache)));
632 }
633
634 static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
635 {
636         return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
637 }
638
639 static void set_discard(struct cache *cache, dm_dblock_t b)
640 {
641         unsigned long flags;
642
643         BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
644         atomic_inc(&cache->stats.discard_count);
645
646         spin_lock_irqsave(&cache->lock, flags);
647         set_bit(from_dblock(b), cache->discard_bitset);
648         spin_unlock_irqrestore(&cache->lock, flags);
649 }
650
651 static void clear_discard(struct cache *cache, dm_dblock_t b)
652 {
653         unsigned long flags;
654
655         spin_lock_irqsave(&cache->lock, flags);
656         clear_bit(from_dblock(b), cache->discard_bitset);
657         spin_unlock_irqrestore(&cache->lock, flags);
658 }
659
660 static bool is_discarded(struct cache *cache, dm_dblock_t b)
661 {
662         int r;
663         unsigned long flags;
664
665         spin_lock_irqsave(&cache->lock, flags);
666         r = test_bit(from_dblock(b), cache->discard_bitset);
667         spin_unlock_irqrestore(&cache->lock, flags);
668
669         return r;
670 }
671
672 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
673 {
674         int r;
675         unsigned long flags;
676
677         spin_lock_irqsave(&cache->lock, flags);
678         r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
679                      cache->discard_bitset);
680         spin_unlock_irqrestore(&cache->lock, flags);
681
682         return r;
683 }
684
685 /*----------------------------------------------------------------*/
686
687 static void load_stats(struct cache *cache)
688 {
689         struct dm_cache_statistics stats;
690
691         dm_cache_metadata_get_stats(cache->cmd, &stats);
692         atomic_set(&cache->stats.read_hit, stats.read_hits);
693         atomic_set(&cache->stats.read_miss, stats.read_misses);
694         atomic_set(&cache->stats.write_hit, stats.write_hits);
695         atomic_set(&cache->stats.write_miss, stats.write_misses);
696 }
697
698 static void save_stats(struct cache *cache)
699 {
700         struct dm_cache_statistics stats;
701
702         stats.read_hits = atomic_read(&cache->stats.read_hit);
703         stats.read_misses = atomic_read(&cache->stats.read_miss);
704         stats.write_hits = atomic_read(&cache->stats.write_hit);
705         stats.write_misses = atomic_read(&cache->stats.write_miss);
706
707         dm_cache_metadata_set_stats(cache->cmd, &stats);
708 }
709
710 /*----------------------------------------------------------------
711  * Per bio data
712  *--------------------------------------------------------------*/
713
714 /*
715  * The writethrough fields of struct per_bio_data are only needed in writethrough mode.
716  */
717 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
718 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
719
720 static bool writethrough_mode(struct cache_features *f)
721 {
722         return f->io_mode == CM_IO_WRITETHROUGH;
723 }
724
725 static bool writeback_mode(struct cache_features *f)
726 {
727         return f->io_mode == CM_IO_WRITEBACK;
728 }
729
730 static bool passthrough_mode(struct cache_features *f)
731 {
732         return f->io_mode == CM_IO_PASSTHROUGH;
733 }
734
735 static size_t get_per_bio_data_size(struct cache *cache)
736 {
737         return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
738 }
739
740 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
741 {
742         struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
743         BUG_ON(!pb);
744         return pb;
745 }
746
747 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
748 {
749         struct per_bio_data *pb = get_per_bio_data(bio, data_size);
750
751         pb->tick = false;
752         pb->req_nr = dm_bio_get_target_bio_nr(bio);
753         pb->all_io_entry = NULL;
754         pb->len = 0;
755
756         return pb;
757 }
758
759 /*----------------------------------------------------------------
760  * Remapping
761  *--------------------------------------------------------------*/
762 static void remap_to_origin(struct cache *cache, struct bio *bio)
763 {
764         bio->bi_bdev = cache->origin_dev->bdev;
765 }
766
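/*
 * Remap a bio to the cache device, preserving its offset within the
 * block.  The power-of-two branch is just a cheaper way of computing the
 * same block * sectors_per_block + offset arithmetic.
 */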
767 static void remap_to_cache(struct cache *cache, struct bio *bio,
768                            dm_cblock_t cblock)
769 {
770         sector_t bi_sector = bio->bi_iter.bi_sector;
771         sector_t block = from_cblock(cblock);
772
773         bio->bi_bdev = cache->cache_dev->bdev;
774         if (!block_size_is_power_of_two(cache))
775                 bio->bi_iter.bi_sector =
776                         (block * cache->sectors_per_block) +
777                         sector_div(bi_sector, cache->sectors_per_block);
778         else
779                 bio->bi_iter.bi_sector =
780                         (block << cache->sectors_per_block_shift) |
781                         (bi_sector & (cache->sectors_per_block - 1));
782 }
783
784 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
785 {
786         unsigned long flags;
787         size_t pb_data_size = get_per_bio_data_size(cache);
788         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
789
790         spin_lock_irqsave(&cache->lock, flags);
791         if (cache->need_tick_bio &&
792             !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
793                 pb->tick = true;
794                 cache->need_tick_bio = false;
795         }
796         spin_unlock_irqrestore(&cache->lock, flags);
797 }
798
799 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
800                                   dm_oblock_t oblock)
801 {
802         check_if_tick_bio_needed(cache, bio);
803         remap_to_origin(cache, bio);
804         if (bio_data_dir(bio) == WRITE)
805                 clear_discard(cache, oblock_to_dblock(cache, oblock));
806 }
807
808 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
809                                  dm_oblock_t oblock, dm_cblock_t cblock)
810 {
811         check_if_tick_bio_needed(cache, bio);
812         remap_to_cache(cache, bio, cblock);
813         if (bio_data_dir(bio) == WRITE) {
814                 set_dirty(cache, oblock, cblock);
815                 clear_discard(cache, oblock_to_dblock(cache, oblock));
816         }
817 }
818
819 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
820 {
821         sector_t block_nr = bio->bi_iter.bi_sector;
822
823         if (!block_size_is_power_of_two(cache))
824                 (void) sector_div(block_nr, cache->sectors_per_block);
825         else
826                 block_nr >>= cache->sectors_per_block_shift;
827
828         return to_oblock(block_nr);
829 }
830
831 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
832 {
833         return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
834 }
835
836 /*
837  * You must increment the deferred set whilst the prison cell is held.  To
838  * encourage this, we ask for 'cell' to be passed in.
839  */
840 static void inc_ds(struct cache *cache, struct bio *bio,
841                    struct dm_bio_prison_cell *cell)
842 {
843         size_t pb_data_size = get_per_bio_data_size(cache);
844         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
845
846         BUG_ON(!cell);
847         BUG_ON(pb->all_io_entry);
848
849         pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
850 }
851
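/*
 * Only non-discard IO aimed at the origin device is accounted by the
 * origin io_tracker.
 */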
852 static bool accountable_bio(struct cache *cache, struct bio *bio)
853 {
854         return ((bio->bi_bdev == cache->origin_dev->bdev) &&
855                 !(bio->bi_rw & REQ_DISCARD));
856 }
857
858 static void accounted_begin(struct cache *cache, struct bio *bio)
859 {
860         size_t pb_data_size = get_per_bio_data_size(cache);
861         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
862
863         if (accountable_bio(cache, bio)) {
864                 pb->len = bio_sectors(bio);
865                 iot_io_begin(&cache->origin_tracker, pb->len);
866         }
867 }
868
869 static void accounted_complete(struct cache *cache, struct bio *bio)
870 {
871         size_t pb_data_size = get_per_bio_data_size(cache);
872         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
873
874         iot_io_end(&cache->origin_tracker, pb->len);
875 }
876
877 static void accounted_request(struct cache *cache, struct bio *bio)
878 {
879         accounted_begin(cache, bio);
880         generic_make_request(bio);
881 }
882
883 static void issue(struct cache *cache, struct bio *bio)
884 {
885         unsigned long flags;
886
887         if (!bio_triggers_commit(cache, bio)) {
888                 accounted_request(cache, bio);
889                 return;
890         }
891
892         /*
893          * Batch together any bios that trigger commits and then issue a
894          * single commit for them in do_worker().
895          */
896         spin_lock_irqsave(&cache->lock, flags);
897         cache->commit_requested = true;
898         bio_list_add(&cache->deferred_flush_bios, bio);
899         spin_unlock_irqrestore(&cache->lock, flags);
900 }
901
902 static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
903 {
904         inc_ds(cache, bio, cell);
905         issue(cache, bio);
906 }
907
908 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
909 {
910         unsigned long flags;
911
912         spin_lock_irqsave(&cache->lock, flags);
913         bio_list_add(&cache->deferred_writethrough_bios, bio);
914         spin_unlock_irqrestore(&cache->lock, flags);
915
916         wake_worker(cache);
917 }
918
919 static void writethrough_endio(struct bio *bio, int err)
920 {
921         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
922
923         dm_unhook_bio(&pb->hook_info, bio);
924
925         if (err) {
926                 bio_endio(bio, err);
927                 return;
928         }
929
930         dm_bio_restore(&pb->bio_details, bio);
931         remap_to_cache(pb->cache, bio, pb->cblock);
932
933         /*
934          * We can't issue this bio directly, since we're in interrupt
935          * context.  So it gets put on a bio list for processing by the
936          * worker thread.
937          */
938         defer_writethrough_bio(pb->cache, bio);
939 }
940
941 /*
942  * When running in writethrough mode we need to send writes to clean blocks
943  * to both the cache and origin devices.  In future we'd like to clone the
944  * bio and send the copies in parallel, but for now we issue them in
945  * series as this is easier.
946  */
947 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
948                                        dm_oblock_t oblock, dm_cblock_t cblock)
949 {
950         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
951
952         pb->cache = cache;
953         pb->cblock = cblock;
954         dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
955         dm_bio_record(&pb->bio_details, bio);
956
957         remap_to_origin_clear_discard(pb->cache, bio, oblock);
958 }
959
960 /*----------------------------------------------------------------
961  * Migration processing
962  *
963  * Migration covers moving data from the origin device to the cache, or
964  * vice versa.
965  *--------------------------------------------------------------*/
966 static void inc_io_migrations(struct cache *cache)
967 {
968         atomic_inc(&cache->nr_io_migrations);
969 }
970
971 static void dec_io_migrations(struct cache *cache)
972 {
973         atomic_dec(&cache->nr_io_migrations);
974 }
975
976 static void __cell_release(struct cache *cache, struct dm_bio_prison_cell *cell,
977                            bool holder, struct bio_list *bios)
978 {
979         (holder ? dm_cell_release : dm_cell_release_no_holder)
980                 (cache->prison, cell, bios);
981         free_prison_cell(cache, cell);
982 }
983
984 static bool discard_or_flush(struct bio *bio)
985 {
986         return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD);
987 }
988
989 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
990 {
991         if (discard_or_flush(cell->holder))
992                 /*
993                  * We have to handle these bios
994                  * individually.
995                  */
996                 __cell_release(cache, cell, true, &cache->deferred_bios);
997
998         else
999                 list_add_tail(&cell->user_list, &cache->deferred_cells);
1000 }
1001
1002 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
1003 {
1004         unsigned long flags;
1005
1006         if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
1007                 /*
1008                  * There was no prisoner to promote to holder; the
1009                  * cell has been released.
1010                  */
1011                 free_prison_cell(cache, cell);
1012                 return;
1013         }
1014
1015         spin_lock_irqsave(&cache->lock, flags);
1016         __cell_defer(cache, cell);
1017         spin_unlock_irqrestore(&cache->lock, flags);
1018
1019         wake_worker(cache);
1020 }
1021
1022 static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
1023 {
1024         dm_cell_error(cache->prison, cell, err);
1025         dm_bio_prison_free_cell(cache->prison, cell);
1026 }
1027
1028 static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
1029 {
1030         cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
1031 }
1032
1033 static void free_io_migration(struct dm_cache_migration *mg)
1034 {
1035         dec_io_migrations(mg->cache);
1036         free_migration(mg);
1037 }
1038
1039 static void migration_failure(struct dm_cache_migration *mg)
1040 {
1041         struct cache *cache = mg->cache;
1042
1043         if (mg->writeback) {
1044                 DMWARN_LIMIT("writeback failed; couldn't copy block");
1045                 set_dirty(cache, mg->old_oblock, mg->cblock);
1046                 cell_defer(cache, mg->old_ocell, false);
1047
1048         } else if (mg->demote) {
1049                 DMWARN_LIMIT("demotion failed; couldn't copy block");
1050                 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
1051
1052                 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
1053                 if (mg->promote)
1054                         cell_defer(cache, mg->new_ocell, true);
1055         } else {
1056                 DMWARN_LIMIT("promotion failed; couldn't copy block");
1057                 policy_remove_mapping(cache->policy, mg->new_oblock);
1058                 cell_defer(cache, mg->new_ocell, true);
1059         }
1060
1061         free_io_migration(mg);
1062 }
1063
1064 static void migration_success_pre_commit(struct dm_cache_migration *mg)
1065 {
1066         unsigned long flags;
1067         struct cache *cache = mg->cache;
1068
1069         if (mg->writeback) {
1070                 clear_dirty(cache, mg->old_oblock, mg->cblock);
1071                 cell_defer(cache, mg->old_ocell, false);
1072                 free_io_migration(mg);
1073                 return;
1074
1075         } else if (mg->demote) {
1076                 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
1077                         DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
1078                         policy_force_mapping(cache->policy, mg->new_oblock,
1079                                              mg->old_oblock);
1080                         if (mg->promote)
1081                                 cell_defer(cache, mg->new_ocell, true);
1082                         free_io_migration(mg);
1083                         return;
1084                 }
1085         } else {
1086                 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
1087                         DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
1088                         policy_remove_mapping(cache->policy, mg->new_oblock);
1089                         free_io_migration(mg);
1090                         return;
1091                 }
1092         }
1093
1094         spin_lock_irqsave(&cache->lock, flags);
1095         list_add_tail(&mg->list, &cache->need_commit_migrations);
1096         cache->commit_requested = true;
1097         spin_unlock_irqrestore(&cache->lock, flags);
1098 }
1099
1100 static void migration_success_post_commit(struct dm_cache_migration *mg)
1101 {
1102         unsigned long flags;
1103         struct cache *cache = mg->cache;
1104
1105         if (mg->writeback) {
1106                 DMWARN("writeback unexpectedly triggered commit");
1107                 return;
1108
1109         } else if (mg->demote) {
1110                 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
1111
1112                 if (mg->promote) {
1113                         mg->demote = false;
1114
1115                         spin_lock_irqsave(&cache->lock, flags);
1116                         list_add_tail(&mg->list, &cache->quiesced_migrations);
1117                         spin_unlock_irqrestore(&cache->lock, flags);
1118
1119                 } else {
1120                         if (mg->invalidate)
1121                                 policy_remove_mapping(cache->policy, mg->old_oblock);
1122                         free_io_migration(mg);
1123                 }
1124
1125         } else {
1126                 if (mg->requeue_holder) {
1127                         clear_dirty(cache, mg->new_oblock, mg->cblock);
1128                         cell_defer(cache, mg->new_ocell, true);
1129                 } else {
1130                         /*
1131                          * The block was promoted via an overwrite, so it's dirty.
1132                          */
1133                         set_dirty(cache, mg->new_oblock, mg->cblock);
1134                         bio_endio(mg->new_ocell->holder, 0);
1135                         cell_defer(cache, mg->new_ocell, false);
1136                 }
1137                 free_io_migration(mg);
1138         }
1139 }
1140
1141 static void copy_complete(int read_err, unsigned long write_err, void *context)
1142 {
1143         unsigned long flags;
1144         struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
1145         struct cache *cache = mg->cache;
1146
1147         if (read_err || write_err)
1148                 mg->err = true;
1149
1150         spin_lock_irqsave(&cache->lock, flags);
1151         list_add_tail(&mg->list, &cache->completed_migrations);
1152         spin_unlock_irqrestore(&cache->lock, flags);
1153
1154         wake_worker(cache);
1155 }
1156
1157 static void issue_copy(struct dm_cache_migration *mg)
1158 {
1159         int r;
1160         struct dm_io_region o_region, c_region;
1161         struct cache *cache = mg->cache;
1162         sector_t cblock = from_cblock(mg->cblock);
1163
1164         o_region.bdev = cache->origin_dev->bdev;
1165         o_region.count = cache->sectors_per_block;
1166
1167         c_region.bdev = cache->cache_dev->bdev;
1168         c_region.sector = cblock * cache->sectors_per_block;
1169         c_region.count = cache->sectors_per_block;
1170
1171         if (mg->writeback || mg->demote) {
1172                 /* demote */
1173                 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
1174                 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
1175         } else {
1176                 /* promote */
1177                 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
1178                 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
1179         }
1180
1181         if (r < 0) {
1182                 DMERR_LIMIT("issuing migration failed");
1183                 migration_failure(mg);
1184         }
1185 }
1186
1187 static void overwrite_endio(struct bio *bio, int err)
1188 {
1189         struct dm_cache_migration *mg = bio->bi_private;
1190         struct cache *cache = mg->cache;
1191         size_t pb_data_size = get_per_bio_data_size(cache);
1192         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1193         unsigned long flags;
1194
1195         dm_unhook_bio(&pb->hook_info, bio);
1196
1197         if (err)
1198                 mg->err = true;
1199
1200         mg->requeue_holder = false;
1201
1202         spin_lock_irqsave(&cache->lock, flags);
1203         list_add_tail(&mg->list, &cache->completed_migrations);
1204         spin_unlock_irqrestore(&cache->lock, flags);
1205
1206         wake_worker(cache);
1207 }
1208
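/*
 * Overwrite optimisation: instead of copying the origin block, service a
 * promotion by writing the bio's data (which covers the whole block)
 * straight to the cache and marking the block dirty.
 */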
1209 static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
1210 {
1211         size_t pb_data_size = get_per_bio_data_size(mg->cache);
1212         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1213
1214         dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1215         remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1216
1217         /*
1218          * No need to inc_ds() here, since the cell will be held for the
1219          * duration of the io.
1220          */
1221         accounted_request(mg->cache, bio);
1222 }
1223
1224 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1225 {
1226         return (bio_data_dir(bio) == WRITE) &&
1227                 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1228 }
1229
1230 static void avoid_copy(struct dm_cache_migration *mg)
1231 {
1232         atomic_inc(&mg->cache->stats.copies_avoided);
1233         migration_success_pre_commit(mg);
1234 }
1235
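/*
 * Round the bio's range inwards to whole discard blocks: *b is the first
 * discard block fully covered by the bio and *e is one past the last.
 * If no block is fully covered, *b == *e.
 */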
1236 static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1237                                      dm_dblock_t *b, dm_dblock_t *e)
1238 {
1239         sector_t sb = bio->bi_iter.bi_sector;
1240         sector_t se = bio_end_sector(bio);
1241
1242         *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1243
1244         if (se - sb < cache->discard_block_size)
1245                 *e = *b;
1246         else
1247                 *e = to_dblock(block_div(se, cache->discard_block_size));
1248 }
1249
1250 static void issue_discard(struct dm_cache_migration *mg)
1251 {
1252         dm_dblock_t b, e;
1253         struct bio *bio = mg->new_ocell->holder;
1254
1255         calc_discard_block_range(mg->cache, bio, &b, &e);
1256         while (b != e) {
1257                 set_discard(mg->cache, b);
1258                 b = to_dblock(from_dblock(b) + 1);
1259         }
1260
1261         bio_endio(bio, 0);
1262         cell_defer(mg->cache, mg->new_ocell, false);
1263         free_migration(mg);
1264 }
1265
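/*
 * Entry point for quiesced migrations: discards are actioned directly,
 * copies are avoided when the data cannot matter (the block is clean or
 * has been discarded), and whole-block writes in writeback mode become
 * overwrites.
 */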
1266 static void issue_copy_or_discard(struct dm_cache_migration *mg)
1267 {
1268         bool avoid;
1269         struct cache *cache = mg->cache;
1270
1271         if (mg->discard) {
1272                 issue_discard(mg);
1273                 return;
1274         }
1275
1276         if (mg->writeback || mg->demote)
1277                 avoid = !is_dirty(cache, mg->cblock) ||
1278                         is_discarded_oblock(cache, mg->old_oblock);
1279         else {
1280                 struct bio *bio = mg->new_ocell->holder;
1281
1282                 avoid = is_discarded_oblock(cache, mg->new_oblock);
1283
1284                 if (writeback_mode(&cache->features) &&
1285                     !avoid && bio_writes_complete_block(cache, bio)) {
1286                         issue_overwrite(mg, bio);
1287                         return;
1288                 }
1289         }
1290
1291         avoid ? avoid_copy(mg) : issue_copy(mg);
1292 }
1293
1294 static void complete_migration(struct dm_cache_migration *mg)
1295 {
1296         if (mg->err)
1297                 migration_failure(mg);
1298         else
1299                 migration_success_pre_commit(mg);
1300 }
1301
1302 static void process_migrations(struct cache *cache, struct list_head *head,
1303                                void (*fn)(struct dm_cache_migration *))
1304 {
1305         unsigned long flags;
1306         struct list_head list;
1307         struct dm_cache_migration *mg, *tmp;
1308
1309         INIT_LIST_HEAD(&list);
1310         spin_lock_irqsave(&cache->lock, flags);
1311         list_splice_init(head, &list);
1312         spin_unlock_irqrestore(&cache->lock, flags);
1313
1314         list_for_each_entry_safe(mg, tmp, &list, list)
1315                 fn(mg);
1316 }
1317
1318 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
1319 {
1320         list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
1321 }
1322
1323 static void queue_quiesced_migration(struct dm_cache_migration *mg)
1324 {
1325         unsigned long flags;
1326         struct cache *cache = mg->cache;
1327
1328         spin_lock_irqsave(&cache->lock, flags);
1329         __queue_quiesced_migration(mg);
1330         spin_unlock_irqrestore(&cache->lock, flags);
1331
1332         wake_worker(cache);
1333 }
1334
1335 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
1336 {
1337         unsigned long flags;
1338         struct dm_cache_migration *mg, *tmp;
1339
1340         spin_lock_irqsave(&cache->lock, flags);
1341         list_for_each_entry_safe(mg, tmp, work, list)
1342                 __queue_quiesced_migration(mg);
1343         spin_unlock_irqrestore(&cache->lock, flags);
1344
1345         wake_worker(cache);
1346 }
1347
1348 static void check_for_quiesced_migrations(struct cache *cache,
1349                                           struct per_bio_data *pb)
1350 {
1351         struct list_head work;
1352
1353         if (!pb->all_io_entry)
1354                 return;
1355
1356         INIT_LIST_HEAD(&work);
1357         dm_deferred_entry_dec(pb->all_io_entry, &work);
1358
1359         if (!list_empty(&work))
1360                 queue_quiesced_migrations(cache, &work);
1361 }
1362
1363 static void quiesce_migration(struct dm_cache_migration *mg)
1364 {
1365         if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
1366                 queue_quiesced_migration(mg);
1367 }
1368
1369 static void promote(struct cache *cache, struct prealloc *structs,
1370                     dm_oblock_t oblock, dm_cblock_t cblock,
1371                     struct dm_bio_prison_cell *cell)
1372 {
1373         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1374
1375         mg->err = false;
1376         mg->discard = false;
1377         mg->writeback = false;
1378         mg->demote = false;
1379         mg->promote = true;
1380         mg->requeue_holder = true;
1381         mg->invalidate = false;
1382         mg->cache = cache;
1383         mg->new_oblock = oblock;
1384         mg->cblock = cblock;
1385         mg->old_ocell = NULL;
1386         mg->new_ocell = cell;
1387         mg->start_jiffies = jiffies;
1388
1389         inc_io_migrations(cache);
1390         quiesce_migration(mg);
1391 }
1392
1393 static void writeback(struct cache *cache, struct prealloc *structs,
1394                       dm_oblock_t oblock, dm_cblock_t cblock,
1395                       struct dm_bio_prison_cell *cell)
1396 {
1397         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1398
1399         mg->err = false;
1400         mg->discard = false;
1401         mg->writeback = true;
1402         mg->demote = false;
1403         mg->promote = false;
1404         mg->requeue_holder = true;
1405         mg->invalidate = false;
1406         mg->cache = cache;
1407         mg->old_oblock = oblock;
1408         mg->cblock = cblock;
1409         mg->old_ocell = cell;
1410         mg->new_ocell = NULL;
1411         mg->start_jiffies = jiffies;
1412
1413         inc_io_migrations(cache);
1414         quiesce_migration(mg);
1415 }
1416
1417 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1418                                 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1419                                 dm_cblock_t cblock,
1420                                 struct dm_bio_prison_cell *old_ocell,
1421                                 struct dm_bio_prison_cell *new_ocell)
1422 {
1423         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1424
1425         mg->err = false;
1426         mg->discard = false;
1427         mg->writeback = false;
1428         mg->demote = true;
1429         mg->promote = true;
1430         mg->requeue_holder = true;
1431         mg->invalidate = false;
1432         mg->cache = cache;
1433         mg->old_oblock = old_oblock;
1434         mg->new_oblock = new_oblock;
1435         mg->cblock = cblock;
1436         mg->old_ocell = old_ocell;
1437         mg->new_ocell = new_ocell;
1438         mg->start_jiffies = jiffies;
1439
1440         inc_io_migrations(cache);
1441         quiesce_migration(mg);
1442 }
1443
1444 /*
1445  * Invalidate a cache entry.  No writeback occurs; any changes in the cache
1446  * block are thrown away.
1447  */
1448 static void invalidate(struct cache *cache, struct prealloc *structs,
1449                        dm_oblock_t oblock, dm_cblock_t cblock,
1450                        struct dm_bio_prison_cell *cell)
1451 {
1452         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1453
1454         mg->err = false;
1455         mg->discard = false;
1456         mg->writeback = false;
1457         mg->demote = true;
1458         mg->promote = false;
1459         mg->requeue_holder = true;
1460         mg->invalidate = true;
1461         mg->cache = cache;
1462         mg->old_oblock = oblock;
1463         mg->cblock = cblock;
1464         mg->old_ocell = cell;
1465         mg->new_ocell = NULL;
1466         mg->start_jiffies = jiffies;
1467
1468         inc_io_migrations(cache);
1469         quiesce_migration(mg);
1470 }
1471
1472 static void discard(struct cache *cache, struct prealloc *structs,
1473                     struct dm_bio_prison_cell *cell)
1474 {
1475         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1476
1477         mg->err = false;
1478         mg->discard = true;
1479         mg->writeback = false;
1480         mg->demote = false;
1481         mg->promote = false;
1482         mg->requeue_holder = false;
1483         mg->invalidate = false;
1484         mg->cache = cache;
1485         mg->old_ocell = NULL;
1486         mg->new_ocell = cell;
1487         mg->start_jiffies = jiffies;
1488
1489         quiesce_migration(mg);
1490 }
1491
1492 /*----------------------------------------------------------------
1493  * bio processing
1494  *--------------------------------------------------------------*/
1495 static void defer_bio(struct cache *cache, struct bio *bio)
1496 {
1497         unsigned long flags;
1498
1499         spin_lock_irqsave(&cache->lock, flags);
1500         bio_list_add(&cache->deferred_bios, bio);
1501         spin_unlock_irqrestore(&cache->lock, flags);
1502
1503         wake_worker(cache);
1504 }
1505
1506 static void process_flush_bio(struct cache *cache, struct bio *bio)
1507 {
1508         size_t pb_data_size = get_per_bio_data_size(cache);
1509         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1510
1511         BUG_ON(bio->bi_iter.bi_size);
1512         if (!pb->req_nr)
1513                 remap_to_origin(cache, bio);
1514         else
1515                 remap_to_cache(cache, bio, 0);
1516
1517         /*
1518          * REQ_FLUSH is not directed at any particular block so we don't
1519          * need to inc_ds().  REQ_FUA's are split into a write + REQ_FLUSH
1520          * by dm-core.
1521          */
1522         issue(cache, bio);
1523 }
1524
1525 static void process_discard_bio(struct cache *cache, struct prealloc *structs,
1526                                 struct bio *bio)
1527 {
1528         int r;
1529         dm_dblock_t b, e;
1530         struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1531
1532         calc_discard_block_range(cache, bio, &b, &e);
1533         if (b == e) {
1534                 bio_endio(bio, 0);
1535                 return;
1536         }
1537
1538         cell_prealloc = prealloc_get_cell(structs);
1539         r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
1540                              (cell_free_fn) prealloc_put_cell,
1541                              structs, &new_ocell);
1542         if (r > 0)
1543                 return;
1544
1545         discard(cache, structs, new_ocell);
1546 }
1547
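/*
 * Throttle background work: a new migration is only allowed if the
 * volume of IO already being migrated stays below migration_threshold
 * (expressed in sectors).
 */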
1548 static bool spare_migration_bandwidth(struct cache *cache)
1549 {
1550         sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1551                 cache->sectors_per_block;
1552         return current_volume < cache->migration_threshold;
1553 }
1554
1555 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1556 {
1557         atomic_inc(bio_data_dir(bio) == READ ?
1558                    &cache->stats.read_hit : &cache->stats.write_hit);
1559 }
1560
1561 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1562 {
1563         atomic_inc(bio_data_dir(bio) == READ ?
1564                    &cache->stats.read_miss : &cache->stats.write_miss);
1565 }
1566
1567 /*----------------------------------------------------------------*/
1568
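/*
 * Used when visiting a cell's bios: bios that can be remapped along with
 * the holder are collected on bios_for_issue, whereas discards and
 * flushes need individual handling and go back on unhandled_bios.
 */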
1569 struct inc_detail {
1570         struct cache *cache;
1571         struct bio_list bios_for_issue;
1572         struct bio_list unhandled_bios;
1573         bool any_writes;
1574 };
1575
1576 static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
1577 {
1578         struct bio *bio;
1579         struct inc_detail *detail = context;
1580         struct cache *cache = detail->cache;
1581
1582         inc_ds(cache, cell->holder, cell);
1583         if (bio_data_dir(cell->holder) == WRITE)
1584                 detail->any_writes = true;
1585
1586         while ((bio = bio_list_pop(&cell->bios))) {
1587                 if (discard_or_flush(bio)) {
1588                         bio_list_add(&detail->unhandled_bios, bio);
1589                         continue;
1590                 }
1591
1592                 if (bio_data_dir(bio) == WRITE)
1593                         detail->any_writes = true;
1594
1595                 bio_list_add(&detail->bios_for_issue, bio);
1596                 inc_ds(cache, bio, cell);
1597         }
1598 }
1599
1600 /* FIXME: refactor these two */
1601 static void remap_cell_to_origin_clear_discard(struct cache *cache,
1602                                                struct dm_bio_prison_cell *cell,
1603                                                dm_oblock_t oblock, bool issue_holder)
1604 {
1605         struct bio *bio;
1606         unsigned long flags;
1607         struct inc_detail detail;
1608
1609         detail.cache = cache;
1610         bio_list_init(&detail.bios_for_issue);
1611         bio_list_init(&detail.unhandled_bios);
1612         detail.any_writes = false;
1613
1614         spin_lock_irqsave(&cache->lock, flags);
1615         dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1616         bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1617         spin_unlock_irqrestore(&cache->lock, flags);
1618
1619         remap_to_origin(cache, cell->holder);
1620         if (issue_holder)
1621                 issue(cache, cell->holder);
1622         else
1623                 accounted_begin(cache, cell->holder);
1624
1625         if (detail.any_writes)
1626                 clear_discard(cache, oblock_to_dblock(cache, oblock));
1627
1628         while ((bio = bio_list_pop(&detail.bios_for_issue))) {
1629                 remap_to_origin(cache, bio);
1630                 issue(cache, bio);
1631         }
1632 }
1633
1634 static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
1635                                       dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
1636 {
1637         struct bio *bio;
1638         unsigned long flags;
1639         struct inc_detail detail;
1640
1641         detail.cache = cache;
1642         bio_list_init(&detail.bios_for_issue);
1643         bio_list_init(&detail.unhandled_bios);
1644         detail.any_writes = false;
1645
1646         spin_lock_irqsave(&cache->lock, flags);
1647         dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1648         bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1649         spin_unlock_irqrestore(&cache->lock, flags);
1650
1651         remap_to_cache(cache, cell->holder, cblock);
1652         if (issue_holder)
1653                 issue(cache, cell->holder);
1654         else
1655                 accounted_begin(cache, cell->holder);
1656
1657         if (detail.any_writes) {
1658                 set_dirty(cache, oblock, cblock);
1659                 clear_discard(cache, oblock_to_dblock(cache, oblock));
1660         }
1661
1662         while ((bio = bio_list_pop(&detail.bios_for_issue))) {
1663                 remap_to_cache(cache, bio, cblock);
1664                 issue(cache, bio);
1665         }
1666 }
1667
1668 /*----------------------------------------------------------------*/
1669
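/*
 * Locker handed to the policy so that, if it chooses POLICY_REPLACE,
 * the old origin block can be detained in a preallocated cell before
 * the decision is returned to us.
 */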
1670 struct old_oblock_lock {
1671         struct policy_locker locker;
1672         struct cache *cache;
1673         struct prealloc *structs;
1674         struct dm_bio_prison_cell *cell;
1675 };
1676
1677 static int null_locker(struct policy_locker *locker, dm_oblock_t b)
1678 {
1679         /* This should never be called */
1680         BUG();
1681         return 0;
1682 }
1683
1684 static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
1685 {
1686         struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
1687         struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
1688
1689         return bio_detain(l->cache, b, NULL, cell_prealloc,
1690                           (cell_free_fn) prealloc_put_cell,
1691                           l->structs, &l->cell);
1692 }
1693
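/*
 * Run the cell's holder through the policy and act on the result:
 * hits are remapped to the cache (or handled specially in passthrough
 * and writethrough modes), misses go to the origin, and POLICY_NEW /
 * POLICY_REPLACE kick off promotions and demotions.
 */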
1694 static void process_cell(struct cache *cache, struct prealloc *structs,
1695                          struct dm_bio_prison_cell *new_ocell)
1696 {
1697         int r;
1698         bool release_cell = true;
1699         struct bio *bio = new_ocell->holder;
1700         dm_oblock_t block = get_bio_block(cache, bio);
1701         struct policy_result lookup_result;
1702         bool passthrough = passthrough_mode(&cache->features);
1703         bool fast_promotion, can_migrate;
1704         struct old_oblock_lock ool;
1705
1706         fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
1707         can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
1708
1709         ool.locker.fn = cell_locker;
1710         ool.cache = cache;
1711         ool.structs = structs;
1712         ool.cell = NULL;
1713         r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
1714                        bio, &ool.locker, &lookup_result);
1715
1716         if (r == -EWOULDBLOCK)
1717                 /* migration has been denied */
1718                 lookup_result.op = POLICY_MISS;
1719
1720         switch (lookup_result.op) {
1721         case POLICY_HIT:
1722                 if (passthrough) {
1723                         inc_miss_counter(cache, bio);
1724
1725                         /*
1726                          * Passthrough always maps to the origin,
1727                          * invalidating any cache blocks that are written
1728                          * to.
1729                          */
1730
1731                         if (bio_data_dir(bio) == WRITE) {
1732                                 atomic_inc(&cache->stats.demotion);
1733                                 invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
1734                                 release_cell = false;
1735
1736                         } else {
1737                                 /* FIXME: factor out issue_origin() */
1738                                 remap_to_origin_clear_discard(cache, bio, block);
1739                                 inc_and_issue(cache, bio, new_ocell);
1740                         }
1741                 } else {
1742                         inc_hit_counter(cache, bio);
1743
1744                         if (bio_data_dir(bio) == WRITE &&
1745                             writethrough_mode(&cache->features) &&
1746                             !is_dirty(cache, lookup_result.cblock)) {
1747                                 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1748                                 inc_and_issue(cache, bio, new_ocell);
1749
1750                         } else {
1751                                 remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
1752                                 release_cell = false;
1753                         }
1754                 }
1755
1756                 break;
1757
1758         case POLICY_MISS:
1759                 inc_miss_counter(cache, bio);
1760                 remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
1761                 release_cell = false;
1762                 break;
1763
1764         case POLICY_NEW:
1765                 atomic_inc(&cache->stats.promotion);
1766                 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1767                 release_cell = false;
1768                 break;
1769
1770         case POLICY_REPLACE:
1771                 atomic_inc(&cache->stats.demotion);
1772                 atomic_inc(&cache->stats.promotion);
1773                 demote_then_promote(cache, structs, lookup_result.old_oblock,
1774                                     block, lookup_result.cblock,
1775                                     ool.cell, new_ocell);
1776                 release_cell = false;
1777                 break;
1778
1779         default:
1780                 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1781                             (unsigned) lookup_result.op);
1782                 bio_io_error(bio);
1783         }
1784
1785         if (release_cell)
1786                 cell_defer(cache, new_ocell, false);
1787 }
1788
1789 static void process_bio(struct cache *cache, struct prealloc *structs,
1790                         struct bio *bio)
1791 {
1792         int r;
1793         dm_oblock_t block = get_bio_block(cache, bio);
1794         struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1795
1796         /*
1797          * Check to see if that block is currently migrating.
1798          */
1799         cell_prealloc = prealloc_get_cell(structs);
1800         r = bio_detain(cache, block, bio, cell_prealloc,
1801                        (cell_free_fn) prealloc_put_cell,
1802                        structs, &new_ocell);
1803         if (r > 0)
1804                 return;
1805
1806         process_cell(cache, structs, new_ocell);
1807 }
1808
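/*
 * A commit is due once COMMIT_PERIOD jiffies have passed since the last
 * commit; the first comparison also catches jiffies wrapping around.
 */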
1809 static int need_commit_due_to_time(struct cache *cache)
1810 {
1811         return jiffies < cache->last_commit_jiffies ||
1812                jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1813 }
1814
1815 static int commit_if_needed(struct cache *cache)
1816 {
1817         int r = 0;
1818
1819         if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
1820             dm_cache_changed_this_transaction(cache->cmd)) {
1821                 atomic_inc(&cache->stats.commit_count);
1822                 cache->commit_requested = false;
1823                 r = dm_cache_commit(cache->cmd, false);
1824                 cache->last_commit_jiffies = jiffies;
1825         }
1826
1827         return r;
1828 }
1829
1830 static void process_deferred_bios(struct cache *cache)
1831 {
1832         unsigned long flags;
1833         struct bio_list bios;
1834         struct bio *bio;
1835         struct prealloc structs;
1836
1837         memset(&structs, 0, sizeof(structs));
1838         bio_list_init(&bios);
1839
1840         spin_lock_irqsave(&cache->lock, flags);
1841         bio_list_merge(&bios, &cache->deferred_bios);
1842         bio_list_init(&cache->deferred_bios);
1843         spin_unlock_irqrestore(&cache->lock, flags);
1844
1845         while (!bio_list_empty(&bios)) {
1846                 /*
1847                  * If we've got no free migration structs, and processing
1848                  * this bio might require one, we pause until there are some
1849                  * prepared mappings to process.
1850                  */
1851                 if (prealloc_data_structs(cache, &structs)) {
1852                         spin_lock_irqsave(&cache->lock, flags);
1853                         bio_list_merge(&cache->deferred_bios, &bios);
1854                         spin_unlock_irqrestore(&cache->lock, flags);
1855                         break;
1856                 }
1857
1858                 bio = bio_list_pop(&bios);
1859
1860                 if (bio->bi_rw & REQ_FLUSH)
1861                         process_flush_bio(cache, bio);
1862                 else if (bio->bi_rw & REQ_DISCARD)
1863                         process_discard_bio(cache, &structs, bio);
1864                 else
1865                         process_bio(cache, &structs, bio);
1866         }
1867
1868         prealloc_free_structs(cache, &structs);
1869 }
1870
1871 static void process_deferred_cells(struct cache *cache)
1872 {
1873         unsigned long flags;
1874         struct dm_bio_prison_cell *cell, *tmp;
1875         struct list_head cells;
1876         struct prealloc structs;
1877
1878         memset(&structs, 0, sizeof(structs));
1879
1880         INIT_LIST_HEAD(&cells);
1881
1882         spin_lock_irqsave(&cache->lock, flags);
1883         list_splice_init(&cache->deferred_cells, &cells);
1884         spin_unlock_irqrestore(&cache->lock, flags);
1885
1886         list_for_each_entry_safe(cell, tmp, &cells, user_list) {
1887                 /*
1888                  * If we've got no free migration structs, and processing
1889          * this cell might require one, we pause until there are some
1890                  * prepared mappings to process.
1891                  */
1892                 if (prealloc_data_structs(cache, &structs)) {
1893                         spin_lock_irqsave(&cache->lock, flags);
1894                         list_splice(&cells, &cache->deferred_cells);
1895                         spin_unlock_irqrestore(&cache->lock, flags);
1896                         break;
1897                 }
1898
1899                 process_cell(cache, &structs, cell);
1900         }
1901
1902         prealloc_free_structs(cache, &structs);
1903 }
1904
1905 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1906 {
1907         unsigned long flags;
1908         struct bio_list bios;
1909         struct bio *bio;
1910
1911         bio_list_init(&bios);
1912
1913         spin_lock_irqsave(&cache->lock, flags);
1914         bio_list_merge(&bios, &cache->deferred_flush_bios);
1915         bio_list_init(&cache->deferred_flush_bios);
1916         spin_unlock_irqrestore(&cache->lock, flags);
1917
1918         /*
1919          * These bios have already been through inc_ds()
1920          */
1921         while ((bio = bio_list_pop(&bios)))
1922                 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
1923 }
1924
1925 static void process_deferred_writethrough_bios(struct cache *cache)
1926 {
1927         unsigned long flags;
1928         struct bio_list bios;
1929         struct bio *bio;
1930
1931         bio_list_init(&bios);
1932
1933         spin_lock_irqsave(&cache->lock, flags);
1934         bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1935         bio_list_init(&cache->deferred_writethrough_bios);
1936         spin_unlock_irqrestore(&cache->lock, flags);
1937
1938         /*
1939          * These bios have already been through inc_ds()
1940          */
1941         while ((bio = bio_list_pop(&bios)))
1942                 accounted_request(cache, bio);
1943 }
1944
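/*
 * Opportunistically write back dirty blocks while there is spare
 * migration bandwidth.  'busy' tells the policy whether the origin has
 * seen IO recently (roughly within the last second), so it can decide
 * how much writeback work to hand out.
 */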
1945 static void writeback_some_dirty_blocks(struct cache *cache)
1946 {
1947         int r = 0;
1948         dm_oblock_t oblock;
1949         dm_cblock_t cblock;
1950         struct prealloc structs;
1951         struct dm_bio_prison_cell *old_ocell;
1952         bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
1953
1954         memset(&structs, 0, sizeof(structs));
1955
1956         while (spare_migration_bandwidth(cache)) {
1957                 if (prealloc_data_structs(cache, &structs))
1958                         break;
1959
1960                 r = policy_writeback_work(cache->policy, &oblock, &cblock, busy);
1961                 if (r)
1962                         break;
1963
1964                 r = get_cell(cache, oblock, &structs, &old_ocell);
1965                 if (r) {
1966                         policy_set_dirty(cache->policy, oblock);
1967                         break;
1968                 }
1969
1970                 writeback(cache, &structs, oblock, cblock, old_ocell);
1971         }
1972
1973         prealloc_free_structs(cache, &structs);
1974 }
1975
1976 /*----------------------------------------------------------------
1977  * Invalidations.
1978  * Dropping something from the cache *without* writing back.
1979  *--------------------------------------------------------------*/
1980
1981 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
1982 {
1983         int r = 0;
1984         uint64_t begin = from_cblock(req->cblocks->begin);
1985         uint64_t end = from_cblock(req->cblocks->end);
1986
1987         while (begin != end) {
1988                 r = policy_remove_cblock(cache->policy, to_cblock(begin));
1989                 if (!r) {
1990                         r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
1991                         if (r)
1992                                 break;
1993
1994                 } else if (r == -ENODATA) {
1995                         /* harmless, already unmapped */
1996                         r = 0;
1997
1998                 } else {
1999                         DMERR("policy_remove_cblock failed");
2000                         break;
2001                 }
2002
2003                 begin++;
2004         }
2005
2006         cache->commit_requested = true;
2007
2008         req->err = r;
2009         atomic_set(&req->complete, 1);
2010
2011         wake_up(&req->result_wait);
2012 }
2013
2014 static void process_invalidation_requests(struct cache *cache)
2015 {
2016         struct list_head list;
2017         struct invalidation_request *req, *tmp;
2018
2019         INIT_LIST_HEAD(&list);
2020         spin_lock(&cache->invalidation_lock);
2021         list_splice_init(&cache->invalidation_requests, &list);
2022         spin_unlock(&cache->invalidation_lock);
2023
2024         list_for_each_entry_safe (req, tmp, &list, list)
2025                 process_invalidation_request(cache, req);
2026 }
2027
2028 /*----------------------------------------------------------------
2029  * Main worker loop
2030  *--------------------------------------------------------------*/
2031 static bool is_quiescing(struct cache *cache)
2032 {
2033         return atomic_read(&cache->quiescing);
2034 }
2035
2036 static void ack_quiescing(struct cache *cache)
2037 {
2038         if (is_quiescing(cache)) {
2039                 atomic_inc(&cache->quiescing_ack);
2040                 wake_up(&cache->quiescing_wait);
2041         }
2042 }
2043
2044 static void wait_for_quiescing_ack(struct cache *cache)
2045 {
2046         wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
2047 }
2048
2049 static void start_quiescing(struct cache *cache)
2050 {
2051         atomic_inc(&cache->quiescing);
2052         wait_for_quiescing_ack(cache);
2053 }
2054
2055 static void stop_quiescing(struct cache *cache)
2056 {
2057         atomic_set(&cache->quiescing, 0);
2058         atomic_set(&cache->quiescing_ack, 0);
2059 }
2060
2061 static void wait_for_migrations(struct cache *cache)
2062 {
2063         wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
2064 }
2065
2066 static void stop_worker(struct cache *cache)
2067 {
2068         cancel_delayed_work(&cache->waker);
2069         flush_workqueue(cache->wq);
2070 }
2071
2072 static void requeue_deferred_cells(struct cache *cache)
2073 {
2074         unsigned long flags;
2075         struct list_head cells;
2076         struct dm_bio_prison_cell *cell, *tmp;
2077
2078         INIT_LIST_HEAD(&cells);
2079         spin_lock_irqsave(&cache->lock, flags);
2080         list_splice_init(&cache->deferred_cells, &cells);
2081         spin_unlock_irqrestore(&cache->lock, flags);
2082
2083         list_for_each_entry_safe(cell, tmp, &cells, user_list)
2084                 cell_requeue(cache, cell);
2085 }
2086
2087 static void requeue_deferred_bios(struct cache *cache)
2088 {
2089         struct bio *bio;
2090         struct bio_list bios;
2091
2092         bio_list_init(&bios);
2093         bio_list_merge(&bios, &cache->deferred_bios);
2094         bio_list_init(&cache->deferred_bios);
2095
2096         while ((bio = bio_list_pop(&bios)))
2097                 bio_endio(bio, DM_ENDIO_REQUEUE);
2098 }
2099
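/*
 * While quiescing only outstanding migrations count as work; otherwise
 * any deferred bios or cells, pending flush/writethrough bios,
 * migrations, or an outstanding invalidation keep the worker looping.
 */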
2100 static int more_work(struct cache *cache)
2101 {
2102         if (is_quiescing(cache))
2103                 return !list_empty(&cache->quiesced_migrations) ||
2104                         !list_empty(&cache->completed_migrations) ||
2105                         !list_empty(&cache->need_commit_migrations);
2106         else
2107                 return !bio_list_empty(&cache->deferred_bios) ||
2108                         !list_empty(&cache->deferred_cells) ||
2109                         !bio_list_empty(&cache->deferred_flush_bios) ||
2110                         !bio_list_empty(&cache->deferred_writethrough_bios) ||
2111                         !list_empty(&cache->quiesced_migrations) ||
2112                         !list_empty(&cache->completed_migrations) ||
2113                         !list_empty(&cache->need_commit_migrations) ||
2114                         cache->invalidate;
2115 }
2116
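/*
 * The main worker: drains the deferred queues, drives migrations
 * through their states, commits metadata when needed and acknowledges
 * quiescing, looping until there is no more work.
 */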
2117 static void do_worker(struct work_struct *ws)
2118 {
2119         struct cache *cache = container_of(ws, struct cache, worker);
2120
2121         do {
2122                 if (!is_quiescing(cache)) {
2123                         writeback_some_dirty_blocks(cache);
2124                         process_deferred_writethrough_bios(cache);
2125                         process_deferred_bios(cache);
2126                         process_deferred_cells(cache);
2127                         process_invalidation_requests(cache);
2128                 }
2129
2130                 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
2131                 process_migrations(cache, &cache->completed_migrations, complete_migration);
2132
2133                 if (commit_if_needed(cache)) {
2134                         process_deferred_flush_bios(cache, false);
2135                         process_migrations(cache, &cache->need_commit_migrations, migration_failure);
2136
2137                         /*
2138                          * FIXME: rollback metadata or just go into a
2139                          * failure mode and error everything
2140                          */
2141
2142                 } else {
2143                         process_deferred_flush_bios(cache, true);
2144                         process_migrations(cache, &cache->need_commit_migrations,
2145                                            migration_success_post_commit);
2146                 }
2147
2148                 ack_quiescing(cache);
2149
2150         } while (more_work(cache));
2151 }
2152
2153 /*
2154  * The waker ticks the policy and wakes the worker every COMMIT_PERIOD,
2155  * so commits happen regularly and not too much unwritten metadata builds up.
2156  */
2157 static void do_waker(struct work_struct *ws)
2158 {
2159         struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
2160         policy_tick(cache->policy);
2161         wake_worker(cache);
2162         queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
2163 }
2164
2165 /*----------------------------------------------------------------*/
2166
2167 static int is_congested(struct dm_dev *dev, int bdi_bits)
2168 {
2169         struct request_queue *q = bdev_get_queue(dev->bdev);
2170         return bdi_congested(&q->backing_dev_info, bdi_bits);
2171 }
2172
2173 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2174 {
2175         struct cache *cache = container_of(cb, struct cache, callbacks);
2176
2177         return is_congested(cache->origin_dev, bdi_bits) ||
2178                 is_congested(cache->cache_dev, bdi_bits);
2179 }
2180
2181 /*----------------------------------------------------------------
2182  * Target methods
2183  *--------------------------------------------------------------*/
2184
2185 /*
2186  * This function gets called on the error paths of the constructor, so we
2187  * have to cope with a partially initialised struct.
2188  */
2189 static void destroy(struct cache *cache)
2190 {
2191         unsigned i;
2192
2193         if (cache->migration_pool)
2194                 mempool_destroy(cache->migration_pool);
2195
2196         if (cache->all_io_ds)
2197                 dm_deferred_set_destroy(cache->all_io_ds);
2198
2199         if (cache->prison)
2200                 dm_bio_prison_destroy(cache->prison);
2201
2202         if (cache->wq)
2203                 destroy_workqueue(cache->wq);
2204
2205         if (cache->dirty_bitset)
2206                 free_bitset(cache->dirty_bitset);
2207
2208         if (cache->discard_bitset)
2209                 free_bitset(cache->discard_bitset);
2210
2211         if (cache->copier)
2212                 dm_kcopyd_client_destroy(cache->copier);
2213
2214         if (cache->cmd)
2215                 dm_cache_metadata_close(cache->cmd);
2216
2217         if (cache->metadata_dev)
2218                 dm_put_device(cache->ti, cache->metadata_dev);
2219
2220         if (cache->origin_dev)
2221                 dm_put_device(cache->ti, cache->origin_dev);
2222
2223         if (cache->cache_dev)
2224                 dm_put_device(cache->ti, cache->cache_dev);
2225
2226         if (cache->policy)
2227                 dm_cache_policy_destroy(cache->policy);
2228
2229         for (i = 0; i < cache->nr_ctr_args; i++)
2230                 kfree(cache->ctr_args[i]);
2231         kfree(cache->ctr_args);
2232
2233         kfree(cache);
2234 }
2235
2236 static void cache_dtr(struct dm_target *ti)
2237 {
2238         struct cache *cache = ti->private;
2239
2240         destroy(cache);
2241 }
2242
2243 static sector_t get_dev_size(struct dm_dev *dev)
2244 {
2245         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
2246 }
2247
2248 /*----------------------------------------------------------------*/
2249
2250 /*
2251  * Construct a cache device mapping.
2252  *
2253  * cache <metadata dev> <cache dev> <origin dev> <block size>
2254  *       <#feature args> [<feature arg>]*
2255  *       <policy> <#policy args> [<policy arg>]*
2256  *
2257  * metadata dev    : fast device holding the persistent metadata
2258  * cache dev       : fast device holding cached data blocks
2259  * origin dev      : slow device holding original data blocks
2260  * block size      : cache unit size in sectors
2261  *
2262  * #feature args   : number of feature arguments passed
2263  * feature args    : writethrough or passthrough.  (The default is writeback.)
2264  *
2265  * policy          : the replacement policy to use
2266  * #policy args    : an even number of policy arguments corresponding
2267  *                   to key/value pairs passed to the policy
2268  * policy args     : key/value pairs passed to the policy
2269  *                   E.g. 'sequential_threshold 1024'
2270  *                   See cache-policies.txt for details.
2271  *
2272  * Optional feature arguments are:
2273  *   writethrough  : write through caching that prohibits cache block
2274  *                   content from being different from origin block content.
2275  *                   Without this argument, the default behaviour is to write
2276  *                   back cache block contents later for performance reasons,
2277  *                   so they may differ from the corresponding origin blocks.
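 *
 * e.g. (hypothetical devices; 512 sector / 256KiB blocks, writethrough,
 *       default policy with no policy args):
 *
 *   cache /dev/mapper/fast-meta /dev/mapper/fast-data /dev/sdb 512 1 writethrough default 0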
2278  */
2279 struct cache_args {
2280         struct dm_target *ti;
2281
2282         struct dm_dev *metadata_dev;
2283
2284         struct dm_dev *cache_dev;
2285         sector_t cache_sectors;
2286
2287         struct dm_dev *origin_dev;
2288         sector_t origin_sectors;
2289
2290         uint32_t block_size;
2291
2292         const char *policy_name;
2293         int policy_argc;
2294         const char **policy_argv;
2295
2296         struct cache_features features;
2297 };
2298
2299 static void destroy_cache_args(struct cache_args *ca)
2300 {
2301         if (ca->metadata_dev)
2302                 dm_put_device(ca->ti, ca->metadata_dev);
2303
2304         if (ca->cache_dev)
2305                 dm_put_device(ca->ti, ca->cache_dev);
2306
2307         if (ca->origin_dev)
2308                 dm_put_device(ca->ti, ca->origin_dev);
2309
2310         kfree(ca);
2311 }
2312
2313 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2314 {
2315         if (!as->argc) {
2316                 *error = "Insufficient args";
2317                 return false;
2318         }
2319
2320         return true;
2321 }
2322
2323 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2324                               char **error)
2325 {
2326         int r;
2327         sector_t metadata_dev_size;
2328         char b[BDEVNAME_SIZE];
2329
2330         if (!at_least_one_arg(as, error))
2331                 return -EINVAL;
2332
2333         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2334                           &ca->metadata_dev);
2335         if (r) {
2336                 *error = "Error opening metadata device";
2337                 return r;
2338         }
2339
2340         metadata_dev_size = get_dev_size(ca->metadata_dev);
2341         if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2342                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2343                        bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
2344
2345         return 0;
2346 }
2347
2348 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2349                            char **error)
2350 {
2351         int r;
2352
2353         if (!at_least_one_arg(as, error))
2354                 return -EINVAL;
2355
2356         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2357                           &ca->cache_dev);
2358         if (r) {
2359                 *error = "Error opening cache device";
2360                 return r;
2361         }
2362         ca->cache_sectors = get_dev_size(ca->cache_dev);
2363
2364         return 0;
2365 }
2366
2367 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2368                             char **error)
2369 {
2370         int r;
2371
2372         if (!at_least_one_arg(as, error))
2373                 return -EINVAL;
2374
2375         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2376                           &ca->origin_dev);
2377         if (r) {
2378                 *error = "Error opening origin device";
2379                 return r;
2380         }
2381
2382         ca->origin_sectors = get_dev_size(ca->origin_dev);
2383         if (ca->ti->len > ca->origin_sectors) {
2384                 *error = "Device size larger than cached device";
2385                 return -EINVAL;
2386         }
2387
2388         return 0;
2389 }
2390
2391 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2392                             char **error)
2393 {
2394         unsigned long block_size;
2395
2396         if (!at_least_one_arg(as, error))
2397                 return -EINVAL;
2398
2399         if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2400             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2401             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2402             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2403                 *error = "Invalid data block size";
2404                 return -EINVAL;
2405         }
2406
2407         if (block_size > ca->cache_sectors) {
2408                 *error = "Data block size is larger than the cache device";
2409                 return -EINVAL;
2410         }
2411
2412         ca->block_size = block_size;
2413
2414         return 0;
2415 }
2416
2417 static void init_features(struct cache_features *cf)
2418 {
2419         cf->mode = CM_WRITE;
2420         cf->io_mode = CM_IO_WRITEBACK;
2421 }
2422
2423 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2424                           char **error)
2425 {
2426         static struct dm_arg _args[] = {
2427                 {0, 1, "Invalid number of cache feature arguments"},
2428         };
2429
2430         int r;
2431         unsigned argc;
2432         const char *arg;
2433         struct cache_features *cf = &ca->features;
2434
2435         init_features(cf);
2436
2437         r = dm_read_arg_group(_args, as, &argc, error);
2438         if (r)
2439                 return -EINVAL;
2440
2441         while (argc--) {
2442                 arg = dm_shift_arg(as);
2443
2444                 if (!strcasecmp(arg, "writeback"))
2445                         cf->io_mode = CM_IO_WRITEBACK;
2446
2447                 else if (!strcasecmp(arg, "writethrough"))
2448                         cf->io_mode = CM_IO_WRITETHROUGH;
2449
2450                 else if (!strcasecmp(arg, "passthrough"))
2451                         cf->io_mode = CM_IO_PASSTHROUGH;
2452
2453                 else {
2454                         *error = "Unrecognised cache feature requested";
2455                         return -EINVAL;
2456                 }
2457         }
2458
2459         return 0;
2460 }
2461
2462 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2463                         char **error)
2464 {
2465         static struct dm_arg _args[] = {
2466                 {0, 1024, "Invalid number of policy arguments"},
2467         };
2468
2469         int r;
2470
2471         if (!at_least_one_arg(as, error))
2472                 return -EINVAL;
2473
2474         ca->policy_name = dm_shift_arg(as);
2475
2476         r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2477         if (r)
2478                 return -EINVAL;
2479
2480         ca->policy_argv = (const char **)as->argv;
2481         dm_consume_args(as, ca->policy_argc);
2482
2483         return 0;
2484 }
2485
2486 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2487                             char **error)
2488 {
2489         int r;
2490         struct dm_arg_set as;
2491
2492         as.argc = argc;
2493         as.argv = argv;
2494
2495         r = parse_metadata_dev(ca, &as, error);
2496         if (r)
2497                 return r;
2498
2499         r = parse_cache_dev(ca, &as, error);
2500         if (r)
2501                 return r;
2502
2503         r = parse_origin_dev(ca, &as, error);
2504         if (r)
2505                 return r;
2506
2507         r = parse_block_size(ca, &as, error);
2508         if (r)
2509                 return r;
2510
2511         r = parse_features(ca, &as, error);
2512         if (r)
2513                 return r;
2514
2515         r = parse_policy(ca, &as, error);
2516         if (r)
2517                 return r;
2518
2519         return 0;
2520 }
2521
2522 /*----------------------------------------------------------------*/
2523
2524 static struct kmem_cache *migration_cache;
2525
2526 #define NOT_CORE_OPTION 1
2527
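/*
 * Core configuration keys are handled here; anything the core target
 * doesn't recognise is handed on to the policy by set_config_value().
 */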
2528 static int process_config_option(struct cache *cache, const char *key, const char *value)
2529 {
2530         unsigned long tmp;
2531
2532         if (!strcasecmp(key, "migration_threshold")) {
2533                 if (kstrtoul(value, 10, &tmp))
2534                         return -EINVAL;
2535
2536                 cache->migration_threshold = tmp;
2537                 return 0;
2538         }
2539
2540         return NOT_CORE_OPTION;
2541 }
2542
2543 static int set_config_value(struct cache *cache, const char *key, const char *value)
2544 {
2545         int r = process_config_option(cache, key, value);
2546
2547         if (r == NOT_CORE_OPTION)
2548                 r = policy_set_config_value(cache->policy, key, value);
2549
2550         if (r)
2551                 DMWARN("bad config value for %s: %s", key, value);
2552
2553         return r;
2554 }
2555
2556 static int set_config_values(struct cache *cache, int argc, const char **argv)
2557 {
2558         int r = 0;
2559
2560         if (argc & 1) {
2561                 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2562                 return -EINVAL;
2563         }
2564
2565         while (argc) {
2566                 r = set_config_value(cache, argv[0], argv[1]);
2567                 if (r)
2568                         break;
2569
2570                 argc -= 2;
2571                 argv += 2;
2572         }
2573
2574         return r;
2575 }
2576
2577 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2578                                char **error)
2579 {
2580         struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2581                                                            cache->cache_size,
2582                                                            cache->origin_sectors,
2583                                                            cache->sectors_per_block);
2584         if (IS_ERR(p)) {
2585                 *error = "Error creating cache's policy";
2586                 return PTR_ERR(p);
2587         }
2588         cache->policy = p;
2589
2590         return 0;
2591 }
2592
2593 /*
2594  * We want the discard block size to be at least the cache block size
2595  * and to have no more than 2^14 discard blocks across the origin.
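 *
 * For example, with 512 sector cache blocks and a 2^31 sector (1 TiB)
 * origin there would be 2^22 cache-block-sized chunks, so
 * calculate_discard_block_size() doubles the discard block size eight
 * times, to 2^17 sectors, giving 2^14 discard blocks.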
2596  */
2597 #define MAX_DISCARD_BLOCKS (1 << 14)
2598
2599 static bool too_many_discard_blocks(sector_t discard_block_size,
2600                                     sector_t origin_size)
2601 {
2602         (void) sector_div(origin_size, discard_block_size);
2603
2604         return origin_size > MAX_DISCARD_BLOCKS;
2605 }
2606
2607 static sector_t calculate_discard_block_size(sector_t cache_block_size,
2608                                              sector_t origin_size)
2609 {
2610         sector_t discard_block_size = cache_block_size;
2611
2612         if (origin_size)
2613                 while (too_many_discard_blocks(discard_block_size, origin_size))
2614                         discard_block_size *= 2;
2615
2616         return discard_block_size;
2617 }
2618
2619 static void set_cache_size(struct cache *cache, dm_cblock_t size)
2620 {
2621         dm_block_t nr_blocks = from_cblock(size);
2622
2623         if (nr_blocks > (1 << 20) && cache->cache_size != size)
2624                 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2625                              "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2626                              "Please consider increasing the cache block size to reduce the overall cache block count.",
2627                              (unsigned long long) nr_blocks);
2628
2629         cache->cache_size = size;
2630 }
2631
2632 #define DEFAULT_MIGRATION_THRESHOLD 2048
2633
2634 static int cache_create(struct cache_args *ca, struct cache **result)
2635 {
2636         int r = 0;
2637         char **error = &ca->ti->error;
2638         struct cache *cache;
2639         struct dm_target *ti = ca->ti;
2640         dm_block_t origin_blocks;
2641         struct dm_cache_metadata *cmd;
2642         bool may_format = ca->features.mode == CM_WRITE;
2643
2644         cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2645         if (!cache)
2646                 return -ENOMEM;
2647
2648         cache->ti = ca->ti;
2649         ti->private = cache;
2650         ti->num_flush_bios = 2;
2651         ti->flush_supported = true;
2652
2653         ti->num_discard_bios = 1;
2654         ti->discards_supported = true;
2655         ti->discard_zeroes_data_unsupported = true;
2656         ti->split_discard_bios = false;
2657
2658         cache->features = ca->features;
2659         ti->per_bio_data_size = get_per_bio_data_size(cache);
2660
2661         cache->callbacks.congested_fn = cache_is_congested;
2662         dm_table_add_target_callbacks(ti->table, &cache->callbacks);
2663
2664         cache->metadata_dev = ca->metadata_dev;
2665         cache->origin_dev = ca->origin_dev;
2666         cache->cache_dev = ca->cache_dev;
2667
2668         ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2669
2670         /* FIXME: factor out this whole section */
2671         origin_blocks = cache->origin_sectors = ca->origin_sectors;
2672         origin_blocks = block_div(origin_blocks, ca->block_size);
2673         cache->origin_blocks = to_oblock(origin_blocks);
2674
2675         cache->sectors_per_block = ca->block_size;
2676         if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2677                 r = -EINVAL;
2678                 goto bad;
2679         }
2680
2681         if (ca->block_size & (ca->block_size - 1)) {
2682                 dm_block_t cache_size = ca->cache_sectors;
2683
2684                 cache->sectors_per_block_shift = -1;
2685                 cache_size = block_div(cache_size, ca->block_size);
2686                 set_cache_size(cache, to_cblock(cache_size));
2687         } else {
2688                 cache->sectors_per_block_shift = __ffs(ca->block_size);
2689                 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2690         }
2691
2692         r = create_cache_policy(cache, ca, error);
2693         if (r)
2694                 goto bad;
2695
2696         cache->policy_nr_args = ca->policy_argc;
2697         cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2698
2699         r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2700         if (r) {
2701                 *error = "Error setting cache policy's config values";
2702                 goto bad;
2703         }
2704
2705         cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2706                                      ca->block_size, may_format,
2707                                      dm_cache_policy_get_hint_size(cache->policy));
2708         if (IS_ERR(cmd)) {
2709                 *error = "Error creating metadata object";
2710                 r = PTR_ERR(cmd);
2711                 goto bad;
2712         }
2713         cache->cmd = cmd;
2714
2715         if (passthrough_mode(&cache->features)) {
2716                 bool all_clean;
2717
2718                 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2719                 if (r) {
2720                         *error = "dm_cache_metadata_all_clean() failed";
2721                         goto bad;
2722                 }
2723
2724                 if (!all_clean) {
2725                         *error = "Cannot enter passthrough mode unless all blocks are clean";
2726                         r = -EINVAL;
2727                         goto bad;
2728                 }
2729         }
2730
2731         spin_lock_init(&cache->lock);
2732         INIT_LIST_HEAD(&cache->deferred_cells);
2733         bio_list_init(&cache->deferred_bios);
2734         bio_list_init(&cache->deferred_flush_bios);
2735         bio_list_init(&cache->deferred_writethrough_bios);
2736         INIT_LIST_HEAD(&cache->quiesced_migrations);
2737         INIT_LIST_HEAD(&cache->completed_migrations);
2738         INIT_LIST_HEAD(&cache->need_commit_migrations);
2739         atomic_set(&cache->nr_allocated_migrations, 0);
2740         atomic_set(&cache->nr_io_migrations, 0);
2741         init_waitqueue_head(&cache->migration_wait);
2742
2743         init_waitqueue_head(&cache->quiescing_wait);
2744         atomic_set(&cache->quiescing, 0);
2745         atomic_set(&cache->quiescing_ack, 0);
2746
2747         r = -ENOMEM;
2748         atomic_set(&cache->nr_dirty, 0);
2749         cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2750         if (!cache->dirty_bitset) {
2751                 *error = "could not allocate dirty bitset";
2752                 goto bad;
2753         }
2754         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2755
2756         cache->discard_block_size =
2757                 calculate_discard_block_size(cache->sectors_per_block,
2758                                              cache->origin_sectors);
2759         cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2760                                                               cache->discard_block_size));
2761         cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2762         if (!cache->discard_bitset) {
2763                 *error = "could not allocate discard bitset";
2764                 goto bad;
2765         }
2766         clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2767
2768         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2769         if (IS_ERR(cache->copier)) {
2770                 *error = "could not create kcopyd client";
2771                 r = PTR_ERR(cache->copier);
2772                 goto bad;
2773         }
2774
2775         cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2776         if (!cache->wq) {
2777                 *error = "could not create workqueue";
2778                 goto bad;
2779         }
2780         INIT_WORK(&cache->worker, do_worker);
2781         INIT_DELAYED_WORK(&cache->waker, do_waker);
2782         cache->last_commit_jiffies = jiffies;
2783
2784         cache->prison = dm_bio_prison_create();
2785         if (!cache->prison) {
2786                 *error = "could not create bio prison";
2787                 goto bad;
2788         }
2789
2790         cache->all_io_ds = dm_deferred_set_create();
2791         if (!cache->all_io_ds) {
2792                 *error = "could not create all_io deferred set";
2793                 goto bad;
2794         }
2795
2796         cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2797                                                          migration_cache);
2798         if (!cache->migration_pool) {
2799                 *error = "Error creating cache's migration mempool";
2800                 goto bad;
2801         }
2802
2803         cache->need_tick_bio = true;
2804         cache->sized = false;
2805         cache->invalidate = false;
2806         cache->commit_requested = false;
2807         cache->loaded_mappings = false;
2808         cache->loaded_discards = false;
2809
2810         load_stats(cache);
2811
2812         atomic_set(&cache->stats.demotion, 0);
2813         atomic_set(&cache->stats.promotion, 0);
2814         atomic_set(&cache->stats.copies_avoided, 0);
2815         atomic_set(&cache->stats.cache_cell_clash, 0);
2816         atomic_set(&cache->stats.commit_count, 0);
2817         atomic_set(&cache->stats.discard_count, 0);
2818
2819         spin_lock_init(&cache->invalidation_lock);
2820         INIT_LIST_HEAD(&cache->invalidation_requests);
2821
2822         iot_init(&cache->origin_tracker);
2823
2824         *result = cache;
2825         return 0;
2826
2827 bad:
2828         destroy(cache);
2829         return r;
2830 }
2831
2832 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2833 {
2834         unsigned i;
2835         const char **copy;
2836
2837         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2838         if (!copy)
2839                 return -ENOMEM;
2840         for (i = 0; i < argc; i++) {
2841                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2842                 if (!copy[i]) {
2843                         while (i--)
2844                                 kfree(copy[i]);
2845                         kfree(copy);
2846                         return -ENOMEM;
2847                 }
2848         }
2849
2850         cache->nr_ctr_args = argc;
2851         cache->ctr_args = copy;
2852
2853         return 0;
2854 }
2855
2856 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2857 {
2858         int r = -EINVAL;
2859         struct cache_args *ca;
2860         struct cache *cache = NULL;
2861
2862         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2863         if (!ca) {
2864                 ti->error = "Error allocating memory for cache";
2865                 return -ENOMEM;
2866         }
2867         ca->ti = ti;
2868
2869         r = parse_cache_args(ca, argc, argv, &ti->error);
2870         if (r)
2871                 goto out;
2872
2873         r = cache_create(ca, &cache);
2874         if (r)
2875                 goto out;
2876
2877         r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2878         if (r) {
2879                 destroy(cache);
2880                 goto out;
2881         }
2882
2883         ti->private = cache;
2884
2885 out:
2886         destroy_cache_args(ca);
2887         return r;
2888 }
2889
2890 /*----------------------------------------------------------------*/
2891
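/*
 * Fast-path mapping: simple hits and misses are remapped here; anything
 * that needs a migration, an invalidation, or resources only the worker
 * thread holds is deferred to it instead.
 */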
2892 static int cache_map(struct dm_target *ti, struct bio *bio)
2893 {
2894         struct cache *cache = ti->private;
2895
2896         int r;
2897         struct dm_bio_prison_cell *cell = NULL;
2898         dm_oblock_t block = get_bio_block(cache, bio);
2899         size_t pb_data_size = get_per_bio_data_size(cache);
2900         bool can_migrate = false;
2901         bool fast_promotion;
2902         struct policy_result lookup_result;
2903         struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
2904         struct old_oblock_lock ool;
2905
2906         ool.locker.fn = null_locker;
2907
2908         if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2909                 /*
2910                  * This can only occur if the io goes to a partial block at
2911                  * the end of the origin device.  We don't cache these.
2912                  * Just remap to the origin and carry on.
2913                  */
2914                 remap_to_origin(cache, bio);
2915                 accounted_begin(cache, bio);
2916                 return DM_MAPIO_REMAPPED;
2917         }
2918
2919         if (discard_or_flush(bio)) {
2920                 defer_bio(cache, bio);
2921                 return DM_MAPIO_SUBMITTED;
2922         }
2923
2924         /*
2925          * Check to see if that block is currently migrating.
2926          */
2927         cell = alloc_prison_cell(cache);
2928         if (!cell) {
2929                 defer_bio(cache, bio);
2930                 return DM_MAPIO_SUBMITTED;
2931         }
2932
2933         r = bio_detain(cache, block, bio, cell,
2934                        (cell_free_fn) free_prison_cell,
2935                        cache, &cell);
2936         if (r) {
2937                 if (r < 0)
2938                         defer_bio(cache, bio);
2939
2940                 return DM_MAPIO_SUBMITTED;
2941         }
2942
2943         fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
2944
2945         r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
2946                        bio, &ool.locker, &lookup_result);
2947         if (r == -EWOULDBLOCK) {
2948                 cell_defer(cache, cell, true);
2949                 return DM_MAPIO_SUBMITTED;
2950
2951         } else if (r) {
2952                 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2953                 cell_defer(cache, cell, false);
2954                 bio_io_error(bio);
2955                 return DM_MAPIO_SUBMITTED;
2956         }
2957
2958         r = DM_MAPIO_REMAPPED;
2959         switch (lookup_result.op) {
2960         case POLICY_HIT:
2961                 if (passthrough_mode(&cache->features)) {
2962                         if (bio_data_dir(bio) == WRITE) {
2963                                 /*
2964                                  * We need to invalidate this block, so
2965                                  * defer for the worker thread.
2966                                  */
2967                                 cell_defer(cache, cell, true);
2968                                 r = DM_MAPIO_SUBMITTED;
2969
2970                         } else {
2971                                 inc_miss_counter(cache, bio);
2972                                 remap_to_origin_clear_discard(cache, bio, block);
2973                                 accounted_begin(cache, bio);
2974                                 inc_ds(cache, bio, cell);
2975                                 /* FIXME: we want to remap hits or misses straight
2976                                  * away rather than passing over to the worker. */
2977                                 cell_defer(cache, cell, false);
2978                         }
2979
2980                 } else {
2981                         inc_hit_counter(cache, bio);
2982                         if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
2983                             !is_dirty(cache, lookup_result.cblock)) {
2984                                 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2985                                 accounted_begin(cache, bio);
2986                                 inc_ds(cache, bio, cell);
2987                                 cell_defer(cache, cell, false);
2988
2989                         } else
2990                                 remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
2991                 }
2992                 break;
2993
2994         case POLICY_MISS:
2995                 inc_miss_counter(cache, bio);
2996                 if (pb->req_nr != 0) {
2997                         /*
2998                          * This is a duplicate writethrough io that is no
2999                          * longer needed because the block has been demoted.
3000                          */
3001                         bio_endio(bio, 0);
3002                         /* FIXME: remap everything as a miss */
3003                         cell_defer(cache, cell, false);
3004                         r = DM_MAPIO_SUBMITTED;
3005
3006                 } else
3007                         remap_cell_to_origin_clear_discard(cache, cell, block, false);
3008                 break;
3009
3010         default:
3011                 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
3012                             (unsigned) lookup_result.op);
3013                 cell_defer(cache, cell, false);
3014                 bio_io_error(bio);
3015                 r = DM_MAPIO_SUBMITTED;
3016         }
3017
3018         return r;
3019 }
3020
3021 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
3022 {
3023         struct cache *cache = ti->private;
3024         unsigned long flags;
3025         size_t pb_data_size = get_per_bio_data_size(cache);
3026         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
3027
3028         if (pb->tick) {
3029                 policy_tick(cache->policy);
3030
3031                 spin_lock_irqsave(&cache->lock, flags);
3032                 cache->need_tick_bio = true;
3033                 spin_unlock_irqrestore(&cache->lock, flags);
3034         }
3035
3036         check_for_quiesced_migrations(cache, pb);
3037         accounted_complete(cache, bio);
3038
3039         return 0;
3040 }
3041
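/*
 * Metadata writeback helpers used from sync_metadata(): mirror the
 * in-core dirty and discard bitsets into the on-disk metadata.
 */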
3042 static int write_dirty_bitset(struct cache *cache)
3043 {
3044         unsigned i, r;
3045
3046         for (i = 0; i < from_cblock(cache->cache_size); i++) {
3047                 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
3048                                        is_dirty(cache, to_cblock(i)));
3049                 if (r)
3050                         return r;
3051         }
3052
3053         return 0;
3054 }
3055
3056 static int write_discard_bitset(struct cache *cache)
3057 {
3058         unsigned i, r;
3059
3060         r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
3061                                            cache->discard_nr_blocks);
3062         if (r) {
3063                 DMERR("could not resize on-disk discard bitset");
3064                 return r;
3065         }
3066
3067         for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
3068                 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
3069                                          is_discarded(cache, to_dblock(i)));
3070                 if (r)
3071                         return r;
3072         }
3073
3074         return 0;
3075 }
3076
3077 /*
3078  * returns true on success
3079  */
3080 static bool sync_metadata(struct cache *cache)
3081 {
3082         int r1, r2, r3, r4;
3083
3084         r1 = write_dirty_bitset(cache);
3085         if (r1)
3086                 DMERR("could not write dirty bitset");
3087
3088         r2 = write_discard_bitset(cache);
3089         if (r2)
3090                 DMERR("could not write discard bitset");
3091
3092         save_stats(cache);
3093
3094         r3 = dm_cache_write_hints(cache->cmd, cache->policy);
3095         if (r3)
3096                 DMERR("could not write hints");
3097
3098         /*
3099          * If writing the above metadata failed, we still commit, but don't
3100          * set the clean shutdown flag.  This will effectively force every
3101          * dirty bit to be set on reload.
3102          */
3103         r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
3104         if (r4)
3105                 DMERR("could not write cache metadata.  Data loss may occur.");
3106
3107         return !r1 && !r2 && !r3 && !r4;
3108 }
3109
3110 static void cache_postsuspend(struct dm_target *ti)
3111 {
3112         struct cache *cache = ti->private;
3113
3114         start_quiescing(cache);
3115         wait_for_migrations(cache);
3116         stop_worker(cache);
3117         requeue_deferred_bios(cache);
3118         requeue_deferred_cells(cache);
3119         stop_quiescing(cache);
3120
3121         (void) sync_metadata(cache);
3122 }
3123
3124 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
3125                         bool dirty, uint32_t hint, bool hint_valid)
3126 {
3127         int r;
3128         struct cache *cache = context;
3129
3130         r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
3131         if (r)
3132                 return r;
3133
3134         if (dirty)
3135                 set_dirty(cache, oblock, cblock);
3136         else
3137                 clear_dirty(cache, oblock, cblock);
3138
3139         return 0;
3140 }
3141
3142 /*
3143  * The discard block size in the on-disk metadata is not
3144  * necessarily the same as the one we're currently using.  So we have to
3145  * be careful to only set the discarded attribute if we know it
3146  * covers a complete block of the new size.
3147  */
3148 struct discard_load_info {
3149         struct cache *cache;
3150
3151         /*
3152          * These blocks are sized using the on disk dblock size, rather
3153          * than the current one.
3154          */
3155         dm_block_t block_size;
3156         dm_block_t discard_begin, discard_end;
3157 };
3158
3159 static void discard_load_info_init(struct cache *cache,
3160                                    struct discard_load_info *li)
3161 {
3162         li->cache = cache;
3163         li->discard_begin = li->discard_end = 0;
3164 }
3165
3166 static void set_discard_range(struct discard_load_info *li)
3167 {
3168         sector_t b, e;
3169
3170         if (li->discard_begin == li->discard_end)
3171                 return;
3172
3173         /*
3174          * Convert to sectors.
3175          */
3176         b = li->discard_begin * li->block_size;
3177         e = li->discard_end * li->block_size;
3178
3179         /*
3180          * Then convert back to the current dblock size.
3181          */
3182         b = dm_sector_div_up(b, li->cache->discard_block_size);
3183         sector_div(e, li->cache->discard_block_size);
3184
3185         /*
3186          * The origin may have shrunk, so we need to check we're still in
3187          * bounds.
3188          */
3189         if (e > from_dblock(li->cache->discard_nr_blocks))
3190                 e = from_dblock(li->cache->discard_nr_blocks);
3191
3192         for (; b < e; b++)
3193                 set_discard(li->cache, to_dblock(b));
3194 }
3195
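/*
 * Called for every dblock in the metadata.  Consecutive discarded
 * blocks are coalesced into a [discard_begin, discard_end) run, which
 * is converted to the current discard block size and applied whenever
 * the run ends.
 */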
3196 static int load_discard(void *context, sector_t discard_block_size,
3197                         dm_dblock_t dblock, bool discard)
3198 {
3199         struct discard_load_info *li = context;
3200
3201         li->block_size = discard_block_size;
3202
3203         if (discard) {
3204                 if (from_dblock(dblock) == li->discard_end)
3205                         /*
3206                          * We're already in a discard range, just extend it.
3207                          */
3208                         li->discard_end = li->discard_end + 1ULL;
3209
3210                 else {
3211                         /*
3212                          * Emit the old range and start a new one.
3213                          */
3214                         set_discard_range(li);
3215                         li->discard_begin = from_dblock(dblock);
3216                         li->discard_end = li->discard_begin + 1ULL;
3217                 }
3218         } else {
3219                 set_discard_range(li);
3220                 li->discard_begin = li->discard_end = 0;
3221         }
3222
3223         return 0;
3224 }
3225
3226 static dm_cblock_t get_cache_dev_size(struct cache *cache)
3227 {
3228         sector_t size = get_dev_size(cache->cache_dev);
3229         (void) sector_div(size, cache->sectors_per_block);
3230         return to_cblock(size);
3231 }
3232
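/*
 * Growing the cache is always allowed.  Shrinking is only allowed if none
 * of the cache blocks that would be dropped, i.e. [new_size, cache_size),
 * are dirty.
 */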
3233 static bool can_resize(struct cache *cache, dm_cblock_t new_size)
3234 {
3235         if (from_cblock(new_size) > from_cblock(cache->cache_size))
3236                 return true;
3237
3238         /*
3239          * We can't drop a dirty block when shrinking the cache.
3240          */
3241         while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
3242                 if (is_dirty(cache, new_size)) {
3243                         DMERR("unable to shrink cache; cache block %llu is dirty",
3244                               (unsigned long long) from_cblock(new_size));
3245                         return false;
3246                 }
3247                 new_size = to_cblock(from_cblock(new_size) + 1);
3248         }
3249
3250         return true;
3251 }
3252
3253 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
3254 {
3255         int r;
3256
3257         r = dm_cache_resize(cache->cmd, new_size);
3258         if (r) {
3259                 DMERR("could not resize cache metadata");
3260                 return r;
3261         }
3262
3263         set_cache_size(cache, new_size);
3264
3265         return 0;
3266 }
3267
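/*
 * Preresume brings the in-core state up to date with the devices: resize
 * the cache to match the cache device (refusing to shrink over dirty
 * blocks), then, on the first resume only, load the mappings and discards
 * from the metadata device.
 */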
3268 static int cache_preresume(struct dm_target *ti)
3269 {
3270         int r = 0;
3271         struct cache *cache = ti->private;
3272         dm_cblock_t csize = get_cache_dev_size(cache);
3273
3274         /*
3275          * Check to see if the cache has resized.
3276          */
3277         if (!cache->sized) {
3278                 r = resize_cache_dev(cache, csize);
3279                 if (r)
3280                         return r;
3281
3282                 cache->sized = true;
3283
3284         } else if (csize != cache->cache_size) {
3285                 if (!can_resize(cache, csize))
3286                         return -EINVAL;
3287
3288                 r = resize_cache_dev(cache, csize);
3289                 if (r)
3290                         return r;
3291         }
3292
3293         if (!cache->loaded_mappings) {
3294                 r = dm_cache_load_mappings(cache->cmd, cache->policy,
3295                                            load_mapping, cache);
3296                 if (r) {
3297                         DMERR("could not load cache mappings");
3298                         return r;
3299                 }
3300
3301                 cache->loaded_mappings = true;
3302         }
3303
3304         if (!cache->loaded_discards) {
3305                 struct discard_load_info li;
3306
3307                 /*
3308                  * The discard bitset could have been resized, or the
3309                  * discard block size changed.  To be safe we start by
3310                  * setting every dblock to not discarded.
3311                  */
3312                 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
3313
3314                 discard_load_info_init(cache, &li);
3315                 r = dm_cache_load_discards(cache->cmd, load_discard, &li);
3316                 if (r) {
3317                         DMERR("could not load origin discards");
3318                         return r;
3319                 }
3320                 set_discard_range(&li);
3321
3322                 cache->loaded_discards = true;
3323         }
3324
3325         return r;
3326 }
3327
3328 static void cache_resume(struct dm_target *ti)
3329 {
3330         struct cache *cache = ti->private;
3331
3332         cache->need_tick_bio = true;
3333         do_waker(&cache->waker.work);
3334 }
3335
3336 /*
3337  * Status format:
3338  *
3339  * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3340  * <cache block size> <#used cache blocks>/<#total cache blocks>
3341  * <#read hits> <#read misses> <#write hits> <#write misses>
3342  * <#demotions> <#promotions> <#dirty>
3343  * <#features> <features>*
3344  * <#core args> <core args>
3345  * <policy name> <#policy args> <policy args>*
3346  */
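/*
 * STATUSTYPE_INFO corresponds to "dmsetup status" and emits the fields
 * above; STATUSTYPE_TABLE corresponds to "dmsetup table" and re-emits the
 * constructor arguments.
 */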
3347 static void cache_status(struct dm_target *ti, status_type_t type,
3348                          unsigned status_flags, char *result, unsigned maxlen)
3349 {
3350         int r = 0;
3351         unsigned i;
3352         ssize_t sz = 0;
3353         dm_block_t nr_free_blocks_metadata = 0;
3354         dm_block_t nr_blocks_metadata = 0;
3355         char buf[BDEVNAME_SIZE];
3356         struct cache *cache = ti->private;
3357         dm_cblock_t residency;
3358
3359         switch (type) {
3360         case STATUSTYPE_INFO:
3361                 /* Commit to ensure statistics aren't out-of-date */
3362                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
3363                         r = dm_cache_commit(cache->cmd, false);
3364                         if (r)
3365                                 DMERR("could not commit metadata for accurate status");
3366                 }
3367
3368                 r = dm_cache_get_free_metadata_block_count(cache->cmd,
3369                                                            &nr_free_blocks_metadata);
3370                 if (r) {
3371                         DMERR("could not get metadata free block count");
3372                         goto err;
3373                 }
3374
3375                 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3376                 if (r) {
3377                         DMERR("could not get metadata device size");
3378                         goto err;
3379                 }
3380
3381                 residency = policy_residency(cache->policy);
3382
3383                 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
3384                        (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
3385                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3386                        (unsigned long long)nr_blocks_metadata,
3387                        cache->sectors_per_block,
3388                        (unsigned long long) from_cblock(residency),
3389                        (unsigned long long) from_cblock(cache->cache_size),
3390                        (unsigned) atomic_read(&cache->stats.read_hit),
3391                        (unsigned) atomic_read(&cache->stats.read_miss),
3392                        (unsigned) atomic_read(&cache->stats.write_hit),
3393                        (unsigned) atomic_read(&cache->stats.write_miss),
3394                        (unsigned) atomic_read(&cache->stats.demotion),
3395                        (unsigned) atomic_read(&cache->stats.promotion),
3396                        (unsigned long) atomic_read(&cache->nr_dirty));
3397
3398                 if (writethrough_mode(&cache->features))
3399                         DMEMIT("1 writethrough ");
3400
3401                 else if (passthrough_mode(&cache->features))
3402                         DMEMIT("1 passthrough ");
3403
3404                 else if (writeback_mode(&cache->features))
3405                         DMEMIT("1 writeback ");
3406
3407                 else {
3408                         DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
3409                         goto err;
3410                 }
3411
3412                 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3413
3414                 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3415                 if (sz < maxlen) {
3416                         r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
3417                         if (r)
3418                                 DMERR("policy_emit_config_values returned %d", r);
3419                 }
3420
3421                 break;
3422
3423         case STATUSTYPE_TABLE:
3424                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3425                 DMEMIT("%s ", buf);
3426                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3427                 DMEMIT("%s ", buf);
3428                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3429                 DMEMIT("%s", buf);
3430
3431                 for (i = 0; i < cache->nr_ctr_args - 1; i++)
3432                         DMEMIT(" %s", cache->ctr_args[i]);
3433                 if (cache->nr_ctr_args)
3434                         DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3435         }
3436
3437         return;
3438
3439 err:
3440         DMEMIT("Error");
3441 }
3442
3443 /*
3444  * A cache block range can take two forms:
3445  *
3446  * i) A single cblock, eg. '3456'
3447  * ii) A begin and end cblock with a dash between, eg. 123-234
3448  */
3449 static int parse_cblock_range(struct cache *cache, const char *str,
3450                               struct cblock_range *result)
3451 {
3452         char dummy;
3453         uint64_t b, e;
3454         int r;
3455
3456         /*
3457          * Try and parse form (ii) first.
3458          */
3459         r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3460         if (r < 0)
3461                 return r;
3462
3463         if (r == 2) {
3464                 result->begin = to_cblock(b);
3465                 result->end = to_cblock(e);
3466                 return 0;
3467         }
3468
3469         /*
3470          * That didn't work, try form (i).
3471          */
3472         r = sscanf(str, "%llu%c", &b, &dummy);
3473         if (r < 0)
3474                 return r;
3475
3476         if (r == 1) {
3477                 result->begin = to_cblock(b);
3478                 result->end = to_cblock(from_cblock(result->begin) + 1u);
3479                 return 0;
3480         }
3481
3482         DMERR("invalid cblock range '%s'", str);
3483         return -EINVAL;
3484 }
3485
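/*
 * Ranges are half open, [begin, end): end may equal the cache size, but
 * begin must lie strictly below it, and the range must be non-empty.
 */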
3486 static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3487 {
3488         uint64_t b = from_cblock(range->begin);
3489         uint64_t e = from_cblock(range->end);
3490         uint64_t n = from_cblock(cache->cache_size);
3491
3492         if (b >= n) {
3493                 DMERR("begin cblock out of range: %llu >= %llu", b, n);
3494                 return -EINVAL;
3495         }
3496
3497         if (e > n) {
3498                 DMERR("end cblock out of range: %llu > %llu", e, n);
3499                 return -EINVAL;
3500         }
3501
3502         if (b >= e) {
3503                 DMERR("invalid cblock range: %llu >= %llu", b, e);
3504                 return -EINVAL;
3505         }
3506
3507         return 0;
3508 }
3509
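/*
 * Queue an invalidation request for the worker thread and block until the
 * worker signals completion; this sleeps, so it is only called from the
 * message handler (process context).
 */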
3510 static int request_invalidation(struct cache *cache, struct cblock_range *range)
3511 {
3512         struct invalidation_request req;
3513
3514         INIT_LIST_HEAD(&req.list);
3515         req.cblocks = range;
3516         atomic_set(&req.complete, 0);
3517         req.err = 0;
3518         init_waitqueue_head(&req.result_wait);
3519
3520         spin_lock(&cache->invalidation_lock);
3521         list_add(&req.list, &cache->invalidation_requests);
3522         spin_unlock(&cache->invalidation_lock);
3523         wake_worker(cache);
3524
3525         wait_event(req.result_wait, atomic_read(&req.complete));
3526         return req.err;
3527 }
3528
3529 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3530                                               const char **cblock_ranges)
3531 {
3532         int r = 0;
3533         unsigned i;
3534         struct cblock_range range;
3535
3536         if (!passthrough_mode(&cache->features)) {
3537                 DMERR("cache has to be in passthrough mode for invalidation");
3538                 return -EPERM;
3539         }
3540
3541         for (i = 0; i < count; i++) {
3542                 r = parse_cblock_range(cache, cblock_ranges[i], &range);
3543                 if (r)
3544                         break;
3545
3546                 r = validate_cblock_range(cache, &range);
3547                 if (r)
3548                         break;
3549
3550                 /*
3551                  * Pass begin and end cache blocks to the worker and wake it.
3552                  */
3553                 r = request_invalidation(cache, &range);
3554                 if (r)
3555                         break;
3556         }
3557
3558         return r;
3559 }
3560
3561 /*
3562  * Supports
3563  *      "<key> <value>"
3564  * and
3565  *      "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
3566  *
3567  * The key migration_threshold is supported by the cache target core.
3568  */
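/*
 * Illustrative usage (the device name is hypothetical):
 *     dmsetup message my-cache 0 migration_threshold 204800
 *     dmsetup message my-cache 0 invalidate_cblocks 2345 3456-4567
 */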
3569 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
3570 {
3571         struct cache *cache = ti->private;
3572
3573         if (!argc)
3574                 return -EINVAL;
3575
3576         if (!strcasecmp(argv[0], "invalidate_cblocks"))
3577                 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3578
3579         if (argc != 2)
3580                 return -EINVAL;
3581
3582         return set_config_value(cache, argv[0], argv[1]);
3583 }
3584
3585 static int cache_iterate_devices(struct dm_target *ti,
3586                                  iterate_devices_callout_fn fn, void *data)
3587 {
3588         int r = 0;
3589         struct cache *cache = ti->private;
3590
3591         r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3592         if (!r)
3593                 r = fn(ti, cache->origin_dev, 0, ti->len, data);
3594
3595         return r;
3596 }
3597
3598 /*
3599  * We assume I/O is going to the origin (which is the volume
3600  * more likely to have restrictions e.g. by being striped).
3601  * (Looking up the exact location of the data would be expensive
3602  * and could always be out of date by the time the bio is submitted.)
3603  */
3604 static int cache_bvec_merge(struct dm_target *ti,
3605                             struct bvec_merge_data *bvm,
3606                             struct bio_vec *biovec, int max_size)
3607 {
3608         struct cache *cache = ti->private;
3609         struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
3610
3611         if (!q->merge_bvec_fn)
3612                 return max_size;
3613
3614         bvm->bi_bdev = cache->origin_dev->bdev;
3615         return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3616 }
3617
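/*
 * Discards are expressed in units of the cache's discard_block_size; a
 * single discard is capped at 1024 discard blocks, or the whole origin if
 * that is smaller.
 */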
3618 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3619 {
3620         /*
3621          * FIXME: these limits may be incompatible with the cache device
3622          */
3623         limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3624                                             cache->origin_sectors);
3625         limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3626 }
3627
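/*
 * For example (sizes are illustrative): with 512 sector (256 KiB) cache
 * blocks, a stacked io_opt of 384 KiB is not a multiple of the block size,
 * so io_min and io_opt are both overridden to 256 KiB; a stacked io_opt of
 * 512 KiB already is a multiple and is left alone.
 */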
3628 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3629 {
3630         struct cache *cache = ti->private;
3631         uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3632
3633         /*
3634          * If the system-determined stacked limits are compatible with the
3635          * cache's blocksize (io_opt is a factor) do not override them.
3636          */
3637         if (io_opt_sectors < cache->sectors_per_block ||
3638             do_div(io_opt_sectors, cache->sectors_per_block)) {
3639                 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3640                 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3641         }
3642         set_discard_limits(cache, limits);
3643 }
3644
3645 /*----------------------------------------------------------------*/
3646
3647 static struct target_type cache_target = {
3648         .name = "cache",
3649         .version = {1, 6, 0},
3650         .module = THIS_MODULE,
3651         .ctr = cache_ctr,
3652         .dtr = cache_dtr,
3653         .map = cache_map,
3654         .end_io = cache_end_io,
3655         .postsuspend = cache_postsuspend,
3656         .preresume = cache_preresume,
3657         .resume = cache_resume,
3658         .status = cache_status,
3659         .message = cache_message,
3660         .iterate_devices = cache_iterate_devices,
3661         .merge = cache_bvec_merge,
3662         .io_hints = cache_io_hints,
3663 };
3664
3665 static int __init dm_cache_init(void)
3666 {
3667         int r;
3668
3669         r = dm_register_target(&cache_target);
3670         if (r) {
3671                 DMERR("cache target registration failed: %d", r);
3672                 return r;
3673         }
3674
3675         migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3676         if (!migration_cache) {
3677                 dm_unregister_target(&cache_target);
3678                 return -ENOMEM;
3679         }
3680
3681         return 0;
3682 }
3683
3684 static void __exit dm_cache_exit(void)
3685 {
3686         dm_unregister_target(&cache_target);
3687         kmem_cache_destroy(migration_cache);
3688 }
3689
3690 module_init(dm_cache_init);
3691 module_exit(dm_cache_exit);
3692
3693 MODULE_DESCRIPTION(DM_NAME " cache target");
3694 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
3695 MODULE_LICENSE("GPL");