drivers/md/dm-cache-target.c

   1 /*
   2  * Copyright (C) 2012 Red Hat. All rights reserved.
   3  *
   4  * This file is released under the GPL.
   5  */
   6
   7 #include "dm.h"
   8 #include "dm-bio-prison-v2.h"
   9 #include "dm-bio-record.h"
  10 #include "dm-cache-metadata.h"
  11
  12 #include <linux/dm-io.h>
  13 #include <linux/dm-kcopyd.h>
  14 #include <linux/jiffies.h>
  15 #include <linux/init.h>
  16 #include <linux/mempool.h>
  17 #include <linux/module.h>
  18 #include <linux/rwsem.h>
  19 #include <linux/slab.h>
  20 #include <linux/vmalloc.h>
  21
  22 #define DM_MSG_PREFIX "cache"
  23
  24 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
  25         "A percentage of time allocated for copying to and/or from cache");
     /*
      * NOTE(review): the macro above declares 'cache_copy_throttle' together
      * with a module parameter whose description is the string given.  The
      * place where the throttle is handed to kcopyd is outside this view --
      * TODO confirm against the constructor.
      */
  26
  27 /*----------------------------------------------------------------*/
  28
  29 /*
  30  * Glossary:
  31  *
  32  * oblock: index of an origin block
  33  * cblock: index of a cache block
  34  * promotion: movement of a block from origin to cache
  35  * demotion: movement of a block from cache to origin
  36  * migration: movement of a block between the origin and cache device,
  37  *            either direction
  38  */
  39
  40 /*----------------------------------------------------------------*/
  41
  42 struct io_tracker {
             /*
              * Accounting of IO that is currently in flight.  'lock' is taken
              * by iot_idle_for(), iot_io_begin() and iot_io_end() around every
              * access to the fields below.
              */
  43         spinlock_t lock;
  44
  45         /*
  46          * Sectors of in-flight IO.
  47          */
  48         sector_t in_flight;
  49
  50         /*
  51          * The time, in jiffies, when this device became idle (if it is
  52          * indeed idle).
  53          */
  54         unsigned long idle_time;
             /* Set to jiffies by iot_init(); no other use is visible here. */
  55         unsigned long last_update_time;
  56 };
  57
  58 static void iot_init(struct io_tracker *iot)
  59 {
  60         spin_lock_init(&iot->lock);
  61         iot->in_flight = 0ul;
  62         iot->idle_time = 0ul;
  63         iot->last_update_time = jiffies;
  64 }
  65
  66 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
  67 {
  68         if (iot->in_flight)
  69                 return false;
  70
  71         return time_after(jiffies, iot->idle_time + jifs);
  72 }
  73
  74 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
  75 {
  76         bool r;
  77
  78         spin_lock_irq(&iot->lock);
  79         r = __iot_idle_for(iot, jifs);
  80         spin_unlock_irq(&iot->lock);
  81
  82         return r;
  83 }
  84
  85 static void iot_io_begin(struct io_tracker *iot, sector_t len)
  86 {
  87         spin_lock_irq(&iot->lock);
  88         iot->in_flight += len;
  89         spin_unlock_irq(&iot->lock);
  90 }
  91
  92 static void __iot_io_end(struct io_tracker *iot, sector_t len)
  93 {
  94         if (!len)
  95                 return;
  96
  97         iot->in_flight -= len;
  98         if (!iot->in_flight)
  99                 iot->idle_time = jiffies;
 100 }
 101
 102 static void iot_io_end(struct io_tracker *iot, sector_t len)
 103 {
 104         unsigned long flags;
 105
 106         spin_lock_irqsave(&iot->lock, flags);
 107         __iot_io_end(iot, len);
 108         spin_unlock_irqrestore(&iot->lock, flags);
 109 }
 110
 111 /*----------------------------------------------------------------*/
 112
 113 /*
 114  * Represents a chunk of future work.  'input' allows continuations to pass
 115  * values between themselves, typically error values.
 116  */
 117 struct continuation {
             /* The work item that is queued; set up by init_continuation(). */
 118         struct work_struct ws;
             /* Zeroed by init_continuation(); __commit() stores the commit result here. */
 119         blk_status_t input;
 120 };
 121
 122 static inline void init_continuation(struct continuation *k,
 123                                      void (*fn)(struct work_struct *))
 124 {
 125         INIT_WORK(&k->ws, fn);
 126         k->input = 0;
 127 }
 128
 129 static inline void queue_continuation(struct workqueue_struct *wq,
 130                                       struct continuation *k)
 131 {
 132         queue_work(wq, &k->ws);
 133 }
 134
 135 /*----------------------------------------------------------------*/
 136
 137 /*
 138  * The batcher collects together pieces of work that need a particular
 139  * operation to occur before they can proceed (typically a commit).
 140  */
 141 struct batcher {
 142         /*
 143          * The operation that everyone is waiting for.
 144          */
 145         blk_status_t (*commit_op)(void *context);
 146         void *commit_context;
 147
 148         /*
 149          * This is how bios should be issued once the commit op is complete
 150          * (accounted_request).
 151          */
 152         void (*issue_op)(struct bio *bio, void *context);
 153         void *issue_context;
 154
 155         /*
 156          * Queued work gets put on here after commit.
 157          */
 158         struct workqueue_struct *wq;
 159
             /*
              * 'lock' is held whenever work_items, bios or commit_scheduled
              * are read or changed.
              */
 160         spinlock_t lock;
             /* Continuations added by continue_after_commit(). */
 161         struct list_head work_items;
             /* Bios added by issue_after_commit(). */
 162         struct bio_list bios;
             /* Work item that runs __commit(); queued by async_commit(). */
 163         struct work_struct commit_work;
 164
             /* Set by schedule_commit(), cleared by __commit(). */
 165         bool commit_scheduled;
 166 };
 167
 168 static void __commit(struct work_struct *_ws)
 169 {
 170         struct batcher *b = container_of(_ws, struct batcher, commit_work);
 171         blk_status_t r;
 172         struct list_head work_items;
 173         struct work_struct *ws, *tmp;
 174         struct continuation *k;
 175         struct bio *bio;
 176         struct bio_list bios;
 177
 178         INIT_LIST_HEAD(&work_items);
 179         bio_list_init(&bios);
 180
 181         /*
 182          * We have to grab these before the commit_op to avoid a race
 183          * condition.
 184          */
 185         spin_lock_irq(&b->lock);
 186         list_splice_init(&b->work_items, &work_items);
 187         bio_list_merge(&bios, &b->bios);
 188         bio_list_init(&b->bios);
 189         b->commit_scheduled = false;
 190         spin_unlock_irq(&b->lock);
 191
 192         r = b->commit_op(b->commit_context);
 193
 194         list_for_each_entry_safe(ws, tmp, &work_items, entry) {
 195                 k = container_of(ws, struct continuation, ws);
 196                 k->input = r;
 197                 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
 198                 queue_work(b->wq, ws);
 199         }
 200
 201         while ((bio = bio_list_pop(&bios))) {
 202                 if (r) {
 203                         bio->bi_status = r;
 204                         bio_endio(bio);
 205                 } else
 206                         b->issue_op(bio, b->issue_context);
 207         }
 208 }
 209
 210 static void batcher_init(struct batcher *b,
 211                          blk_status_t (*commit_op)(void *),
 212                          void *commit_context,
 213                          void (*issue_op)(struct bio *bio, void *),
 214                          void *issue_context,
 215                          struct workqueue_struct *wq)
 216 {
 217         b->commit_op = commit_op;
 218         b->commit_context = commit_context;
 219         b->issue_op = issue_op;
 220         b->issue_context = issue_context;
 221         b->wq = wq;
 222
 223         spin_lock_init(&b->lock);
 224         INIT_LIST_HEAD(&b->work_items);
 225         bio_list_init(&b->bios);
 226         INIT_WORK(&b->commit_work, __commit);
 227         b->commit_scheduled = false;
 228 }
 229
 230 static void async_commit(struct batcher *b)
 231 {
 232         queue_work(b->wq, &b->commit_work);
 233 }
 234
 235 static void continue_after_commit(struct batcher *b, struct continuation *k)
 236 {
 237         bool commit_scheduled;
 238
 239         spin_lock_irq(&b->lock);
 240         commit_scheduled = b->commit_scheduled;
 241         list_add_tail(&k->ws.entry, &b->work_items);
 242         spin_unlock_irq(&b->lock);
 243
 244         if (commit_scheduled)
 245                 async_commit(b);
 246 }
 247
 248 /*
 249  * Bios are errored if commit failed.
 250  */
 251 static void issue_after_commit(struct batcher *b, struct bio *bio)
 252 {
 253        bool commit_scheduled;
 254
 255        spin_lock_irq(&b->lock);
 256        commit_scheduled = b->commit_scheduled;
 257        bio_list_add(&b->bios, bio);
 258        spin_unlock_irq(&b->lock);
 259
 260        if (commit_scheduled)
 261                async_commit(b);
 262 }
 263
 264 /*
 265  * Call this if some urgent work is waiting for the commit to complete.
 266  */
 267 static void schedule_commit(struct batcher *b)
 268 {
 269         bool immediate;
 270
 271         spin_lock_irq(&b->lock);
 272         immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
 273         b->commit_scheduled = true;
 274         spin_unlock_irq(&b->lock);
 275
 276         if (immediate)
 277                 async_commit(b);
 278 }
 279
 280 /*
 281  * There are a couple of places where we let a bio run, but want to do some
 282  * work before calling its endio function.  We do this by temporarily
 283  * changing the endio fn.
 284  */
 285 struct dm_hook_info {
             /*
              * The bio's previous bi_end_io, saved by dm_hook_bio() and put
              * back by dm_unhook_bio().
              */
 286         bio_end_io_t *bi_end_io;
 287 };
 288
 289 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
 290                         bio_end_io_t *bi_end_io, void *bi_private)
 291 {
 292         h->bi_end_io = bio->bi_end_io;
 293
 294         bio->bi_end_io = bi_end_io;
 295         bio->bi_private = bi_private;
 296 }
 297
 298 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 299 {
 300         bio->bi_end_io = h->bi_end_io;
 301 }
 302
 303 /*----------------------------------------------------------------*/
 304
 305 #define MIGRATION_POOL_SIZE 128
 306 #define COMMIT_PERIOD HZ
 307 #define MIGRATION_COUNT_WINDOW 10
     /*
      * NOTE(review): none of the three constants above is used in the part of
      * the file visible here.  MIGRATION_POOL_SIZE is presumably the minimum
      * size of cache->migration_pool, and COMMIT_PERIOD (HZ jiffies, i.e. one
      * second) presumably the interval between periodic commits -- TODO
      * confirm at the points of use.
      */
 308
 309 /*
 310  * The block size of the device holding cache data must be
 311  * between 32KB and 1GB.  Both limits are expressed in sectors: the byte
      * counts are shifted right by SECTOR_SHIFT.
 312  */
 313 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
 314 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
 315
 316 enum cache_metadata_mode {
             /*
              * The order of the enumerators matters: abort_transaction()
              * tests the current mode with '>= CM_READ_ONLY'.
              */
 317         CM_WRITE,               /* metadata may be changed */
 318         CM_READ_ONLY,           /* metadata may not be changed */
 319         CM_FAIL                 /* set_cache_mode() never leaves this mode */
 320 };
 321
 322 enum cache_io_mode {
 323         /*
 324          * Data is written to cached blocks only.  These blocks are marked
 325          * dirty.  If you lose the cache device you will lose data.
 326          * Potential performance increase for both reads and writes.
 327          */
 328         CM_IO_WRITEBACK,
 329
 330         /*
 331          * Data is written to both cache and origin.  Blocks are never
 332          * dirty.  Potential performance benefit for reads only.
 333          */
 334         CM_IO_WRITETHROUGH,
 335
 336         /*
 337          * A degraded mode useful for various cache coherency situations
 338          * (eg, rolling back snapshots).  Reads and writes always go to the
 339          * origin.  If a write goes to a cached oblock, then the cache
 340          * block is invalidated.
 341          */
 342         CM_IO_PASSTHROUGH
 343 };
 344
 345 struct cache_features {
             /* Metadata mode: read by get_cache_mode(), written by set_cache_mode(). */
 346         enum cache_metadata_mode mode;
             /* Tested by writethrough_mode(), writeback_mode() and passthrough_mode(). */
 347         enum cache_io_mode io_mode;
             /* NOTE(review): the next two fields are not used in the code visible here. */
 348         unsigned metadata_version;
 349         bool discard_passdown:1;
 350 };
 351
 352 struct cache_stats {
             /*
              * Event counters.  load_stats() seeds the first four from the
              * values stored in the metadata; set_discard() increments
              * discard_count.  Updates to the remaining counters are outside
              * the code visible here.
              */
 353         atomic_t read_hit;
 354         atomic_t read_miss;
 355         atomic_t write_hit;
 356         atomic_t write_miss;
 357         atomic_t demotion;
 358         atomic_t promotion;
 359         atomic_t writeback;
 360         atomic_t copies_avoided;
 361         atomic_t cache_cell_clash;
 362         atomic_t commit_count;
 363         atomic_t discard_count;
 364 };
 365
 366 struct cache {
 367         struct dm_target *ti;
             /*
              * Held by defer_bio()/defer_bios() around deferred_bios, by the
              * discard helpers around discard_bitset, and by
              * check_if_tick_bio_needed() around need_tick_bio.
              */
 368         spinlock_t lock;
 369
 370         /*
 371          * Fields for converting from sectors to blocks.
              * block_size_is_power_of_two() treats a negative shift as "the
              * block size is not a power of two"; sectors_per_block is then
              * used with a division instead.
 372          */
 373         int sectors_per_block_shift;
 374         sector_t sectors_per_block;
 375
 376         struct dm_cache_metadata *cmd;
 377
 378         /*
 379          * Metadata is written to this device.
 380          */
 381         struct dm_dev *metadata_dev;
 382
 383         /*
 384          * The slower of the two data devices.  Typically a spindle.
 385          */
 386         struct dm_dev *origin_dev;
 387
 388         /*
 389          * The faster of the two data devices.  Typically an SSD.
 390          */
 391         struct dm_dev *cache_dev;
 392
 393         /*
 394          * Size of the origin device in _complete_ blocks and native sectors.
 395          */
 396         dm_oblock_t origin_blocks;
 397         sector_t origin_sectors;
 398
 399         /*
 400          * Size of the cache device in blocks.
 401          */
 402         dm_cblock_t cache_size;
 403
 404         /*
 405          * Invalidation fields.
 406          */
 407         spinlock_t invalidation_lock;
 408         struct list_head invalidation_requests;
 409
 410         sector_t migration_threshold;
             /* Woken by free_migration() when nr_allocated_migrations drops to zero. */
 411         wait_queue_head_t migration_wait;
             /* Incremented by alloc_migration(), decremented by free_migration(). */
 412         atomic_t nr_allocated_migrations;
 413
 414         /*
 415          * The number of in flight migrations that are performing
 416          * background io. eg, promotion, writeback.
 417          */
 418         atomic_t nr_io_migrations;
 419
             /* Bios queued by defer_bio()/defer_bios() for deferred_bio_worker. */
 420         struct bio_list deferred_bios;
 421
 422         struct rw_semaphore quiesce_lock;
 423
 424         struct dm_target_callbacks callbacks;
 425
 426         /*
 427          * origin_blocks entries, discarded if set.
 428          */
 429         dm_dblock_t discard_nr_blocks;
 430         unsigned long *discard_bitset;
 431         uint32_t discard_block_size; /* a power of 2 times sectors per block */
 432
 433         /*
 434          * Rather than reconstructing the table line for the status we just
 435          * save it and regurgitate.
 436          */
 437         unsigned nr_ctr_args;
 438         const char **ctr_args;
 439
 440         struct dm_kcopyd_client *copier;
             /* Queued on 'wq' by wake_deferred_bio_worker(). */
 441         struct work_struct deferred_bio_worker;
             /* Queued on 'wq' by wake_migration_worker(), except in passthrough mode. */
 442         struct work_struct migration_worker;
 443         struct workqueue_struct *wq;
 444         struct delayed_work waker;
 445         struct dm_bio_prison_v2 *prison;
 446
 447         /*
 448          * cache_size entries, dirty if set
 449          */
 450         unsigned long *dirty_bitset;
             /* Count of bits set in dirty_bitset; maintained by the *_dirty() helpers. */
 451         atomic_t nr_dirty;
 452
 453         unsigned policy_nr_args;
 454         struct dm_cache_policy *policy;
 455
 456         /*
 457          * Cache features such as write-through.
 458          */
 459         struct cache_features features;
 460
 461         struct cache_stats stats;
 462
             /*
              * need_tick_bio: consumed by check_if_tick_bio_needed(), which
              * marks one eligible bio with pb->tick and clears the flag.
              * NOTE(review): the other flags are not used in the code visible
              * here.
              */
 463         bool need_tick_bio:1;
 464         bool sized:1;
 465         bool invalidate:1;
 466         bool commit_requested:1;
 467         bool loaded_mappings:1;
 468         bool loaded_discards:1;
 469
 470         struct rw_semaphore background_work_lock;
 471
 472         struct batcher committer;
 473         struct work_struct commit_ws;
 474
             /* In-flight sector accounting; see accounted_begin()/accounted_complete(). */
 475         struct io_tracker tracker;
 476
             /* Backing pool for alloc_migration()/free_migration(). */
 477         mempool_t migration_pool;
 478
             /* bio_set that remap_to_origin_and_cache() clones bios from. */
 479         struct bio_set bs;
 480 };
 481
 482 struct per_bio_data {
             /* Set by check_if_tick_bio_needed(); cleared by init_per_bio_data(). */
 483         bool tick:1;
             /* Result of dm_bio_get_target_bio_nr(), stored by init_per_bio_data(). */
 484         unsigned req_nr:2;
             /* Prison cell recorded by bio_detain_shared(); NULL after init. */
 485         struct dm_bio_prison_cell_v2 *cell;
 486         struct dm_hook_info hook_info;
             /*
              * Sectors accounted by accounted_begin(); passed back to the io
              * tracker by accounted_complete().  Zero after init.
              */
 487         sector_t len;
 488 };
 489
 490 struct dm_cache_migration {
             /*
              * Allocated and zero-filled by alloc_migration(), which also sets
              * 'cache'; released by free_migration().
              * NOTE(review): how the remaining fields are used is outside the
              * code visible here.
              */
 491         struct continuation k;
 492         struct cache *cache;
 493
 494         struct policy_work *op;
 495         struct bio *overwrite_bio;
 496         struct dm_bio_prison_cell_v2 *cell;
 497
 498         dm_cblock_t invalidate_cblock;
 499         dm_oblock_t invalidate_oblock;
 500 };
 501
 502 /*----------------------------------------------------------------*/
 503
 504 static bool writethrough_mode(struct cache *cache)
 505 {
 506         return cache->features.io_mode == CM_IO_WRITETHROUGH;
 507 }
 508
 509 static bool writeback_mode(struct cache *cache)
 510 {
 511         return cache->features.io_mode == CM_IO_WRITEBACK;
 512 }
 513
 514 static inline bool passthrough_mode(struct cache *cache)
 515 {
 516         return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
 517 }
 518
 519 /*----------------------------------------------------------------*/
 520
 521 static void wake_deferred_bio_worker(struct cache *cache)
 522 {
 523         queue_work(cache->wq, &cache->deferred_bio_worker);
 524 }
 525
 526 static void wake_migration_worker(struct cache *cache)
 527 {
 528         if (passthrough_mode(cache))
 529                 return;
 530
 531         queue_work(cache->wq, &cache->migration_worker);
 532 }
 533
 534 /*----------------------------------------------------------------*/
 535
 536 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
 537 {
 538         return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO);
 539 }
 540
 541 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
 542 {
 543         dm_bio_prison_free_cell_v2(cache->prison, cell);
 544 }
 545
 546 static struct dm_cache_migration *alloc_migration(struct cache *cache)
 547 {
 548         struct dm_cache_migration *mg;
 549
 550         mg = mempool_alloc(&cache->migration_pool, GFP_NOIO);
 551
 552         memset(mg, 0, sizeof(*mg));
 553
 554         mg->cache = cache;
 555         atomic_inc(&cache->nr_allocated_migrations);
 556
 557         return mg;
 558 }
 559
 560 static void free_migration(struct dm_cache_migration *mg)
 561 {
 562         struct cache *cache = mg->cache;
 563
 564         if (atomic_dec_and_test(&cache->nr_allocated_migrations))
 565                 wake_up(&cache->migration_wait);
 566
 567         mempool_free(mg, &cache->migration_pool);
 568 }
 569
 570 /*----------------------------------------------------------------*/
 571
 572 static inline dm_oblock_t oblock_succ(dm_oblock_t b)
 573 {
 574         return to_oblock(from_oblock(b) + 1ull);
 575 }
 576
 577 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
 578 {
 579         key->virtual = 0;
 580         key->dev = 0;
 581         key->block_begin = from_oblock(begin);
 582         key->block_end = from_oblock(end);
 583 }
 584
 585 /*
 586  * We have two lock levels.  Level 0, which is used to prevent WRITEs, and
 587  * level 1 which prevents *both* READs and WRITEs.
      *
      * lock_level() picks the level for a bio: WRITE_LOCK_LEVEL for bios whose
      * data direction is WRITE, READ_WRITE_LOCK_LEVEL for all others.
 588  */
 589 #define WRITE_LOCK_LEVEL 0
 590 #define READ_WRITE_LOCK_LEVEL 1
 591
 592 static unsigned lock_level(struct bio *bio)
 593 {
 594         return bio_data_dir(bio) == WRITE ?
 595                 WRITE_LOCK_LEVEL :
 596                 READ_WRITE_LOCK_LEVEL;
 597 }
 598
 599 /*----------------------------------------------------------------
 600  * Per bio data
 601  *--------------------------------------------------------------*/
 602
 603 static struct per_bio_data *get_per_bio_data(struct bio *bio)
 604 {
 605         struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
 606         BUG_ON(!pb);
 607         return pb;
 608 }
 609
 610 static struct per_bio_data *init_per_bio_data(struct bio *bio)
 611 {
 612         struct per_bio_data *pb = get_per_bio_data(bio);
 613
 614         pb->tick = false;
 615         pb->req_nr = dm_bio_get_target_bio_nr(bio);
 616         pb->cell = NULL;
 617         pb->len = 0;
 618
 619         return pb;
 620 }
 621
 622 /*----------------------------------------------------------------*/
 623
 624 static void defer_bio(struct cache *cache, struct bio *bio)
 625 {
 626         spin_lock_irq(&cache->lock);
 627         bio_list_add(&cache->deferred_bios, bio);
 628         spin_unlock_irq(&cache->lock);
 629
 630         wake_deferred_bio_worker(cache);
 631 }
 632
 633 static void defer_bios(struct cache *cache, struct bio_list *bios)
 634 {
 635         spin_lock_irq(&cache->lock);
 636         bio_list_merge(&cache->deferred_bios, bios);
 637         bio_list_init(bios);
 638         spin_unlock_irq(&cache->lock);
 639
 640         wake_deferred_bio_worker(cache);
 641 }
 642
 643 /*----------------------------------------------------------------*/
 644
 645 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
 646 {
 647         bool r;
 648         struct per_bio_data *pb;
 649         struct dm_cell_key_v2 key;
 650         dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
 651         struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
 652
 653         cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
 654
 655         build_key(oblock, end, &key);
 656         r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
 657         if (!r) {
 658                 /*
 659                  * Failed to get the lock.
 660                  */
 661                 free_prison_cell(cache, cell_prealloc);
 662                 return r;
 663         }
 664
 665         if (cell != cell_prealloc)
 666                 free_prison_cell(cache, cell_prealloc);
 667
 668         pb = get_per_bio_data(bio);
 669         pb->cell = cell;
 670
 671         return r;
 672 }
 673
 674 /*----------------------------------------------------------------*/
 675
 676 static bool is_dirty(struct cache *cache, dm_cblock_t b)
 677 {
 678         return test_bit(from_cblock(b), cache->dirty_bitset);
 679 }
 680
 681 static void set_dirty(struct cache *cache, dm_cblock_t cblock)
 682 {
 683         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
 684                 atomic_inc(&cache->nr_dirty);
 685                 policy_set_dirty(cache->policy, cblock);
 686         }
 687 }
 688
 689 /*
 690  * These two are called when setting after migrations to force the policy
 691  * and dirty bitset to be in sync.
 692  */
 693 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
 694 {
 695         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
 696                 atomic_inc(&cache->nr_dirty);
 697         policy_set_dirty(cache->policy, cblock);
 698 }
 699
 700 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
 701 {
 702         if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
 703                 if (atomic_dec_return(&cache->nr_dirty) == 0)
 704                         dm_table_event(cache->ti->table);
 705         }
 706
 707         policy_clear_dirty(cache->policy, cblock);
 708 }
 709
 710 /*----------------------------------------------------------------*/
 711
 712 static bool block_size_is_power_of_two(struct cache *cache)
 713 {
 714         return cache->sectors_per_block_shift >= 0;
 715 }
 716
 717 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
 718 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
 719 __always_inline
 720 #endif
 721 static dm_block_t block_div(dm_block_t b, uint32_t n)
 722 {
 723         do_div(b, n);
 724
 725         return b;
 726 }
 727
 728 static dm_block_t oblocks_per_dblock(struct cache *cache)
 729 {
 730         dm_block_t oblocks = cache->discard_block_size;
 731
 732         if (block_size_is_power_of_two(cache))
 733                 oblocks >>= cache->sectors_per_block_shift;
 734         else
 735                 oblocks = block_div(oblocks, cache->sectors_per_block);
 736
 737         return oblocks;
 738 }
 739
 740 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
 741 {
 742         return to_dblock(block_div(from_oblock(oblock),
 743                                    oblocks_per_dblock(cache)));
 744 }
 745
 746 static void set_discard(struct cache *cache, dm_dblock_t b)
 747 {
 748         BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
 749         atomic_inc(&cache->stats.discard_count);
 750
 751         spin_lock_irq(&cache->lock);
 752         set_bit(from_dblock(b), cache->discard_bitset);
 753         spin_unlock_irq(&cache->lock);
 754 }
 755
 756 static void clear_discard(struct cache *cache, dm_dblock_t b)
 757 {
 758         spin_lock_irq(&cache->lock);
 759         clear_bit(from_dblock(b), cache->discard_bitset);
 760         spin_unlock_irq(&cache->lock);
 761 }
 762
 763 static bool is_discarded(struct cache *cache, dm_dblock_t b)
 764 {
 765         int r;
 766         spin_lock_irq(&cache->lock);
 767         r = test_bit(from_dblock(b), cache->discard_bitset);
 768         spin_unlock_irq(&cache->lock);
 769
 770         return r;
 771 }
 772
 773 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
 774 {
 775         int r;
 776         spin_lock_irq(&cache->lock);
 777         r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
 778                      cache->discard_bitset);
 779         spin_unlock_irq(&cache->lock);
 780
 781         return r;
 782 }
 783
 784 /*----------------------------------------------------------------
 785  * Remapping
 786  *--------------------------------------------------------------*/
 787 static void remap_to_origin(struct cache *cache, struct bio *bio)
 788 {
 789         bio_set_dev(bio, cache->origin_dev->bdev);
 790 }
 791
 792 static void remap_to_cache(struct cache *cache, struct bio *bio,
 793                            dm_cblock_t cblock)
 794 {
 795         sector_t bi_sector = bio->bi_iter.bi_sector;
 796         sector_t block = from_cblock(cblock);
 797
 798         bio_set_dev(bio, cache->cache_dev->bdev);
 799         if (!block_size_is_power_of_two(cache))
 800                 bio->bi_iter.bi_sector =
 801                         (block * cache->sectors_per_block) +
 802                         sector_div(bi_sector, cache->sectors_per_block);
 803         else
 804                 bio->bi_iter.bi_sector =
 805                         (block << cache->sectors_per_block_shift) |
 806                         (bi_sector & (cache->sectors_per_block - 1));
 807 }
 808
 809 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
 810 {
 811         struct per_bio_data *pb;
 812
 813         spin_lock_irq(&cache->lock);
 814         if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
 815             bio_op(bio) != REQ_OP_DISCARD) {
 816                 pb = get_per_bio_data(bio);
 817                 pb->tick = true;
 818                 cache->need_tick_bio = false;
 819         }
 820         spin_unlock_irq(&cache->lock);
 821 }
 822
 823 static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
 824                                             dm_oblock_t oblock, bool bio_has_pbd)
 825 {
 826         if (bio_has_pbd)
 827                 check_if_tick_bio_needed(cache, bio);
 828         remap_to_origin(cache, bio);
 829         if (bio_data_dir(bio) == WRITE)
 830                 clear_discard(cache, oblock_to_dblock(cache, oblock));
 831 }
 832
 833 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
 834                                           dm_oblock_t oblock)
 835 {
 836         // FIXME: check_if_tick_bio_needed() is called way too much through this interface
 837         __remap_to_origin_clear_discard(cache, bio, oblock, true);
 838 }
 839
 840 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
 841                                  dm_oblock_t oblock, dm_cblock_t cblock)
 842 {
 843         check_if_tick_bio_needed(cache, bio);
 844         remap_to_cache(cache, bio, cblock);
 845         if (bio_data_dir(bio) == WRITE) {
 846                 set_dirty(cache, cblock);
 847                 clear_discard(cache, oblock_to_dblock(cache, oblock));
 848         }
 849 }
 850
 851 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
 852 {
 853         sector_t block_nr = bio->bi_iter.bi_sector;
 854
 855         if (!block_size_is_power_of_two(cache))
 856                 (void) sector_div(block_nr, cache->sectors_per_block);
 857         else
 858                 block_nr >>= cache->sectors_per_block_shift;
 859
 860         return to_oblock(block_nr);
 861 }
 862
 863 static bool accountable_bio(struct cache *cache, struct bio *bio)
 864 {
 865         return bio_op(bio) != REQ_OP_DISCARD;
 866 }
 867
 868 static void accounted_begin(struct cache *cache, struct bio *bio)
 869 {
 870         struct per_bio_data *pb;
 871
 872         if (accountable_bio(cache, bio)) {
 873                 pb = get_per_bio_data(bio);
 874                 pb->len = bio_sectors(bio);
 875                 iot_io_begin(&cache->tracker, pb->len);
 876         }
 877 }
 878
 879 static void accounted_complete(struct cache *cache, struct bio *bio)
 880 {
 881         struct per_bio_data *pb = get_per_bio_data(bio);
 882
 883         iot_io_end(&cache->tracker, pb->len);
 884 }
 885
 886 static void accounted_request(struct cache *cache, struct bio *bio)
 887 {
 888         accounted_begin(cache, bio);
 889         generic_make_request(bio);
 890 }
 891
 892 static void issue_op(struct bio *bio, void *context)
 893 {
 894         struct cache *cache = context;
 895         accounted_request(cache, bio);
 896 }
 897
 898 /*
 899  * When running in writethrough mode we need to send writes to clean blocks
 900  * to both the cache and origin devices.  Clone the bio and send them in parallel.
 901  */
 902 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
 903                                       dm_oblock_t oblock, dm_cblock_t cblock)
 904 {
 905         struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs);
 906
 907         BUG_ON(!origin_bio);
 908
 909         bio_chain(origin_bio, bio);
 910         /*
 911          * Passing false to __remap_to_origin_clear_discard() skips
 912          * all code that might use per_bio_data (since clone doesn't have it)
 913          */
 914         __remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
 915         submit_bio(origin_bio);
 916
 917         remap_to_cache(cache, bio, cblock);
 918 }
 919
 920 /*----------------------------------------------------------------
 921  * Failure modes
 922  *--------------------------------------------------------------*/
 923 static enum cache_metadata_mode get_cache_mode(struct cache *cache)
 924 {
 925         return cache->features.mode;
 926 }
 927
 928 static const char *cache_device_name(struct cache *cache)
 929 {
 930         return dm_device_name(dm_table_get_md(cache->ti->table));
 931 }
 932
 933 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
 934 {
 935         const char *descs[] = {
 936                 "write",
 937                 "read-only",
 938                 "fail"
 939         };
 940
 941         dm_table_event(cache->ti->table);
 942         DMINFO("%s: switching cache to %s mode",
 943                cache_device_name(cache), descs[(int)mode]);
 944 }
 945
 946 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
 947 {
 948         bool needs_check;
 949         enum cache_metadata_mode old_mode = get_cache_mode(cache);
 950
 951         if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
 952                 DMERR("%s: unable to read needs_check flag, setting failure mode.",
 953                       cache_device_name(cache));
 954                 new_mode = CM_FAIL;
 955         }
 956
 957         if (new_mode == CM_WRITE && needs_check) {
 958                 DMERR("%s: unable to switch cache to write mode until repaired.",
 959                       cache_device_name(cache));
 960                 if (old_mode != new_mode)
 961                         new_mode = old_mode;
 962                 else
 963                         new_mode = CM_READ_ONLY;
 964         }
 965
 966         /* Never move out of fail mode */
 967         if (old_mode == CM_FAIL)
 968                 new_mode = CM_FAIL;
 969
 970         switch (new_mode) {
 971         case CM_FAIL:
 972         case CM_READ_ONLY:
 973                 dm_cache_metadata_set_read_only(cache->cmd);
 974                 break;
 975
 976         case CM_WRITE:
 977                 dm_cache_metadata_set_read_write(cache->cmd);
 978                 break;
 979         }
 980
 981         cache->features.mode = new_mode;
 982
 983         if (new_mode != old_mode)
 984                 notify_mode_switch(cache, new_mode);
 985 }
 986
 987 static void abort_transaction(struct cache *cache)
 988 {
 989         const char *dev_name = cache_device_name(cache);
 990
 991         if (get_cache_mode(cache) >= CM_READ_ONLY)
 992                 return;
 993
 994         if (dm_cache_metadata_set_needs_check(cache->cmd)) {
 995                 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
 996                 set_cache_mode(cache, CM_FAIL);
 997         }
 998
 999         DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1000         if (dm_cache_metadata_abort(cache->cmd)) {
1001                 DMERR("%s: failed to abort metadata transaction", dev_name);
1002                 set_cache_mode(cache, CM_FAIL);
1003         }
1004 }
1005
1006 static void metadata_operation_failed(struct cache *cache, const char *op, int r)
1007 {
1008         DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1009                     cache_device_name(cache), op, r);
1010         abort_transaction(cache);
1011         set_cache_mode(cache, CM_READ_ONLY);
1012 }
1013
1014 /*----------------------------------------------------------------*/
1015
1016 static void load_stats(struct cache *cache)
1017 {
1018         struct dm_cache_statistics stats;
1019
1020         dm_cache_metadata_get_stats(cache->cmd, &stats);
1021         atomic_set(&cache->stats.read_hit, stats.read_hits);
1022         atomic_set(&cache->stats.read_miss, stats.read_misses);
1023         atomic_set(&cache->stats.write_hit, stats.write_hits);
1024         atomic_set(&cache->stats.write_miss, stats.write_misses);
1025 }
1026
1027 static void save_stats(struct cache *cache)
1028 {
1029         struct dm_cache_statistics stats;
1030
1031         if (get_cache_mode(cache) >= CM_READ_ONLY)
1032                 return;
1033
1034         stats.read_hits = atomic_read(&cache->stats.read_hit);
1035         stats.read_misses = atomic_read(&cache->stats.read_miss);
1036         stats.write_hits = atomic_read(&cache->stats.write_hit);
1037         stats.write_misses = atomic_read(&cache->stats.write_miss);
1038
1039         dm_cache_metadata_set_stats(cache->cmd, &stats);
1040 }
1041
1042 static void update_stats(struct cache_stats *stats, enum policy_operation op)
1043 {
1044         switch (op) {
1045         case POLICY_PROMOTE:
1046                 atomic_inc(&stats->promotion);
1047                 break;
1048
1049         case POLICY_DEMOTE:
1050                 atomic_inc(&stats->demotion);
1051                 break;
1052
1053         case POLICY_WRITEBACK:
1054                 atomic_inc(&stats->writeback);
1055                 break;
1056         }
1057 }
1058
1059 /*----------------------------------------------------------------
1060  * Migration processing
1061  *
1062  * Migration covers moving data from the origin device to the cache, or
1063  * vice versa.
1064  *--------------------------------------------------------------*/
1065
1066 static void inc_io_migrations(struct cache *cache)
1067 {
1068         atomic_inc(&cache->nr_io_migrations);
1069 }
1070
1071 static void dec_io_migrations(struct cache *cache)
1072 {
1073         atomic_dec(&cache->nr_io_migrations);
1074 }
1075
1076 static bool discard_or_flush(struct bio *bio)
1077 {
1078         return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
1079 }
1080
1081 static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1082                                      dm_dblock_t *b, dm_dblock_t *e)
1083 {
1084         sector_t sb = bio->bi_iter.bi_sector;
1085         sector_t se = bio_end_sector(bio);
1086
1087         *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1088
1089         if (se - sb < cache->discard_block_size)
1090                 *e = *b;
1091         else
1092                 *e = to_dblock(block_div(se, cache->discard_block_size));
1093 }
1094
1095 /*----------------------------------------------------------------*/
1096
1097 static void prevent_background_work(struct cache *cache)
1098 {
1099         lockdep_off();
1100         down_write(&cache->background_work_lock);
1101         lockdep_on();
1102 }
1103
1104 static void allow_background_work(struct cache *cache)
1105 {
1106         lockdep_off();
1107         up_write(&cache->background_work_lock);
1108         lockdep_on();
1109 }
1110
1111 static bool background_work_begin(struct cache *cache)
1112 {
1113         bool r;
1114
1115         lockdep_off();
1116         r = down_read_trylock(&cache->background_work_lock);
1117         lockdep_on();
1118
1119         return r;
1120 }
1121
1122 static void background_work_end(struct cache *cache)
1123 {
1124         lockdep_off();
1125         up_read(&cache->background_work_lock);
1126         lockdep_on();
1127 }
1128
1129 /*----------------------------------------------------------------*/
1130
1131 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1132 {
1133         return (bio_data_dir(bio) == WRITE) &&
1134                 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1135 }
1136
1137 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
1138 {
1139         return writeback_mode(cache) &&
1140                 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
1141 }
1142
1143 static void quiesce(struct dm_cache_migration *mg,
1144                     void (*continuation)(struct work_struct *))
1145 {
1146         init_continuation(&mg->k, continuation);
1147         dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
1148 }
1149
1150 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
1151 {
1152         struct continuation *k = container_of(ws, struct continuation, ws);
1153         return container_of(k, struct dm_cache_migration, k);
1154 }
1155
1156 static void copy_complete(int read_err, unsigned long write_err, void *context)
1157 {
1158         struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
1159
1160         if (read_err || write_err)
1161                 mg->k.input = BLK_STS_IOERR;
1162
1163         queue_continuation(mg->cache->wq, &mg->k);
1164 }
1165
1166 static void copy(struct dm_cache_migration *mg, bool promote)
1167 {
1168         struct dm_io_region o_region, c_region;
1169         struct cache *cache = mg->cache;
1170
1171         o_region.bdev = cache->origin_dev->bdev;
1172         o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
1173         o_region.count = cache->sectors_per_block;
1174
1175         c_region.bdev = cache->cache_dev->bdev;
1176         c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
1177         c_region.count = cache->sectors_per_block;
1178
1179         if (promote)
1180                 dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
1181         else
1182                 dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
1183 }
1184
1185 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
1186 {
1187         struct per_bio_data *pb = get_per_bio_data(bio);
1188
1189         if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
1190                 free_prison_cell(cache, pb->cell);
1191         pb->cell = NULL;
1192 }
1193
1194 static void overwrite_endio(struct bio *bio)
1195 {
1196         struct dm_cache_migration *mg = bio->bi_private;
1197         struct cache *cache = mg->cache;
1198         struct per_bio_data *pb = get_per_bio_data(bio);
1199
1200         dm_unhook_bio(&pb->hook_info, bio);
1201
1202         if (bio->bi_status)
1203                 mg->k.input = bio->bi_status;
1204
1205         queue_continuation(cache->wq, &mg->k);
1206 }
1207
1208 static void overwrite(struct dm_cache_migration *mg,
1209                       void (*continuation)(struct work_struct *))
1210 {
1211         struct bio *bio = mg->overwrite_bio;
1212         struct per_bio_data *pb = get_per_bio_data(bio);
1213
1214         dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1215
1216         /*
1217          * The overwrite bio is part of the copy operation, as such it does
1218          * not set/clear discard or dirty flags.
1219          */
1220         if (mg->op->op == POLICY_PROMOTE)
1221                 remap_to_cache(mg->cache, bio, mg->op->cblock);
1222         else
1223                 remap_to_origin(mg->cache, bio);
1224
1225         init_continuation(&mg->k, continuation);
1226         accounted_request(mg->cache, bio);
1227 }
1228
1229 /*
1230  * Migration steps:
1231  *
1232  * 1) exclusive lock preventing WRITEs
1233  * 2) quiesce
1234  * 3) copy or issue overwrite bio
1235  * 4) upgrade to exclusive lock preventing READs and WRITEs
1236  * 5) quiesce
1237  * 6) update metadata and commit
1238  * 7) unlock
1239  */
1240 static void mg_complete(struct dm_cache_migration *mg, bool success)
1241 {
1242         struct bio_list bios;
1243         struct cache *cache = mg->cache;
1244         struct policy_work *op = mg->op;
1245         dm_cblock_t cblock = op->cblock;
1246
1247         if (success)
1248                 update_stats(&cache->stats, op->op);
1249
1250         switch (op->op) {
1251         case POLICY_PROMOTE:
1252                 clear_discard(cache, oblock_to_dblock(cache, op->oblock));
1253                 policy_complete_background_work(cache->policy, op, success);
1254
1255                 if (mg->overwrite_bio) {
1256                         if (success)
1257                                 force_set_dirty(cache, cblock);
1258                         else if (mg->k.input)
1259                                 mg->overwrite_bio->bi_status = mg->k.input;
1260                         else
1261                                 mg->overwrite_bio->bi_status = BLK_STS_IOERR;
1262                         bio_endio(mg->overwrite_bio);
1263                 } else {
1264                         if (success)
1265                                 force_clear_dirty(cache, cblock);
1266                         dec_io_migrations(cache);
1267                 }
1268                 break;
1269
1270         case POLICY_DEMOTE:
1271                 /*
1272                  * We clear dirty here to update the nr_dirty counter.
1273                  */
1274                 if (success)
1275                         force_clear_dirty(cache, cblock);
1276                 policy_complete_background_work(cache->policy, op, success);
1277                 dec_io_migrations(cache);
1278                 break;
1279
1280         case POLICY_WRITEBACK:
1281                 if (success)
1282                         force_clear_dirty(cache, cblock);
1283                 policy_complete_background_work(cache->policy, op, success);
1284                 dec_io_migrations(cache);
1285                 break;
1286         }
1287
1288         bio_list_init(&bios);
1289         if (mg->cell) {
1290                 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1291                         free_prison_cell(cache, mg->cell);
1292         }
1293
1294         free_migration(mg);
1295         defer_bios(cache, &bios);
1296         wake_migration_worker(cache);
1297
1298         background_work_end(cache);
1299 }
1300
1301 static void mg_success(struct work_struct *ws)
1302 {
1303         struct dm_cache_migration *mg = ws_to_mg(ws);
1304         mg_complete(mg, mg->k.input == 0);
1305 }
1306
1307 static void mg_update_metadata(struct work_struct *ws)
1308 {
1309         int r;
1310         struct dm_cache_migration *mg = ws_to_mg(ws);
1311         struct cache *cache = mg->cache;
1312         struct policy_work *op = mg->op;
1313
1314         switch (op->op) {
1315         case POLICY_PROMOTE:
1316                 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
1317                 if (r) {
1318                         DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
1319                                     cache_device_name(cache));
1320                         metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1321
1322                         mg_complete(mg, false);
1323                         return;
1324                 }
1325                 mg_complete(mg, true);
1326                 break;
1327
1328         case POLICY_DEMOTE:
1329                 r = dm_cache_remove_mapping(cache->cmd, op->cblock);
1330                 if (r) {
1331                         DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
1332                                     cache_device_name(cache));
1333                         metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1334
1335                         mg_complete(mg, false);
1336                         return;
1337                 }
1338
1339                 /*
1340                  * It would be nice if we only had to commit when a REQ_FLUSH
1341                  * comes through.  But there's one scenario that we have to
1342                  * look out for:
1343                  *
1344                  * - vblock x in a cache block
1345                  * - domotion occurs
1346                  * - cache block gets reallocated and over written
1347                  * - crash
1348                  *
1349                  * When we recover, because there was no commit the cache will
1350                  * rollback to having the data for vblock x in the cache block.
1351                  * But the cache block has since been overwritten, so it'll end
1352                  * up pointing to data that was never in 'x' during the history
1353                  * of the device.
1354                  *
1355                  * To avoid this issue we require a commit as part of the
1356                  * demotion operation.
1357                  */
1358                 init_continuation(&mg->k, mg_success);
1359                 continue_after_commit(&cache->committer, &mg->k);
1360                 schedule_commit(&cache->committer);
1361                 break;
1362
1363         case POLICY_WRITEBACK:
1364                 mg_complete(mg, true);
1365                 break;
1366         }
1367 }
1368
1369 static void mg_update_metadata_after_copy(struct work_struct *ws)
1370 {
1371         struct dm_cache_migration *mg = ws_to_mg(ws);
1372
1373         /*
1374          * Did the copy succeed?
1375          */
1376         if (mg->k.input)
1377                 mg_complete(mg, false);
1378         else
1379                 mg_update_metadata(ws);
1380 }
1381
1382 static void mg_upgrade_lock(struct work_struct *ws)
1383 {
1384         int r;
1385         struct dm_cache_migration *mg = ws_to_mg(ws);
1386
1387         /*
1388          * Did the copy succeed?
1389          */
1390         if (mg->k.input)
1391                 mg_complete(mg, false);
1392
1393         else {
1394                 /*
1395                  * Now we want the lock to prevent both reads and writes.
1396                  */
1397                 r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
1398                                             READ_WRITE_LOCK_LEVEL);
1399                 if (r < 0)
1400                         mg_complete(mg, false);
1401
1402                 else if (r)
1403                         quiesce(mg, mg_update_metadata);
1404
1405                 else
1406                         mg_update_metadata(ws);
1407         }
1408 }
1409
1410 static void mg_full_copy(struct work_struct *ws)
1411 {
1412         struct dm_cache_migration *mg = ws_to_mg(ws);
1413         struct cache *cache = mg->cache;
1414         struct policy_work *op = mg->op;
1415         bool is_policy_promote = (op->op == POLICY_PROMOTE);
1416
1417         if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
1418             is_discarded_oblock(cache, op->oblock)) {
1419                 mg_upgrade_lock(ws);
1420                 return;
1421         }
1422
1423         init_continuation(&mg->k, mg_upgrade_lock);
1424         copy(mg, is_policy_promote);
1425 }
1426
1427 static void mg_copy(struct work_struct *ws)
1428 {
1429         struct dm_cache_migration *mg = ws_to_mg(ws);
1430
1431         if (mg->overwrite_bio) {
1432                 /*
1433                  * No exclusive lock was held when we last checked if the bio
1434                  * was optimisable.  So we have to check again in case things
1435                  * have changed (eg, the block may no longer be discarded).
1436                  */
1437                 if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
1438                         /*
1439                          * Fallback to a real full copy after doing some tidying up.
1440                          */
1441                         bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
1442                         BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */
1443                         mg->overwrite_bio = NULL;
1444                         inc_io_migrations(mg->cache);
1445                         mg_full_copy(ws);
1446                         return;
1447                 }
1448
1449                 /*
1450                  * It's safe to do this here, even though it's new data
1451                  * because all IO has been locked out of the block.
1452                  *
1453                  * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
1454                  * so _not_ using mg_upgrade_lock() as continutation.
1455                  */
1456                 overwrite(mg, mg_update_metadata_after_copy);
1457
1458         } else
1459                 mg_full_copy(ws);
1460 }
1461
1462 static int mg_lock_writes(struct dm_cache_migration *mg)
1463 {
1464         int r;
1465         struct dm_cell_key_v2 key;
1466         struct cache *cache = mg->cache;
1467         struct dm_bio_prison_cell_v2 *prealloc;
1468
1469         prealloc = alloc_prison_cell(cache);
1470
1471         /*
1472          * Prevent writes to the block, but allow reads to continue.
1473          * Unless we're using an overwrite bio, in which case we lock
1474          * everything.
1475          */
1476         build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
1477         r = dm_cell_lock_v2(cache->prison, &key,
1478                             mg->overwrite_bio ?  READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
1479                             prealloc, &mg->cell);
1480         if (r < 0) {
1481                 free_prison_cell(cache, prealloc);
1482                 mg_complete(mg, false);
1483                 return r;
1484         }
1485
1486         if (mg->cell != prealloc)
1487                 free_prison_cell(cache, prealloc);
1488
1489         if (r == 0)
1490                 mg_copy(&mg->k.ws);
1491         else
1492                 quiesce(mg, mg_copy);
1493
1494         return 0;
1495 }
1496
1497 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
1498 {
1499         struct dm_cache_migration *mg;
1500
1501         if (!background_work_begin(cache)) {
1502                 policy_complete_background_work(cache->policy, op, false);
1503                 return -EPERM;
1504         }
1505
1506         mg = alloc_migration(cache);
1507
1508         mg->op = op;
1509         mg->overwrite_bio = bio;
1510
1511         if (!bio)
1512                 inc_io_migrations(cache);
1513
1514         return mg_lock_writes(mg);
1515 }
1516
1517 /*----------------------------------------------------------------
1518  * invalidation processing
1519  *--------------------------------------------------------------*/
1520
1521 static void invalidate_complete(struct dm_cache_migration *mg, bool success)
1522 {
1523         struct bio_list bios;
1524         struct cache *cache = mg->cache;
1525
1526         bio_list_init(&bios);
1527         if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1528                 free_prison_cell(cache, mg->cell);
1529
1530         if (!success && mg->overwrite_bio)
1531                 bio_io_error(mg->overwrite_bio);
1532
1533         free_migration(mg);
1534         defer_bios(cache, &bios);
1535
1536         background_work_end(cache);
1537 }
1538
1539 static void invalidate_completed(struct work_struct *ws)
1540 {
1541         struct dm_cache_migration *mg = ws_to_mg(ws);
1542         invalidate_complete(mg, !mg->k.input);
1543 }
1544
1545 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
1546 {
1547         int r = policy_invalidate_mapping(cache->policy, cblock);
1548         if (!r) {
1549                 r = dm_cache_remove_mapping(cache->cmd, cblock);
1550                 if (r) {
1551                         DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
1552                                     cache_device_name(cache));
1553                         metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1554                 }
1555
1556         } else if (r == -ENODATA) {
1557                 /*
1558                  * Harmless, already unmapped.
1559                  */
1560                 r = 0;
1561
1562         } else
1563                 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
1564
1565         return r;
1566 }
1567
1568 static void invalidate_remove(struct work_struct *ws)
1569 {
1570         int r;
1571         struct dm_cache_migration *mg = ws_to_mg(ws);
1572         struct cache *cache = mg->cache;
1573
1574         r = invalidate_cblock(cache, mg->invalidate_cblock);
1575         if (r) {
1576                 invalidate_complete(mg, false);
1577                 return;
1578         }
1579
1580         init_continuation(&mg->k, invalidate_completed);
1581         continue_after_commit(&cache->committer, &mg->k);
1582         remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
1583         mg->overwrite_bio = NULL;
1584         schedule_commit(&cache->committer);
1585 }
1586
1587 static int invalidate_lock(struct dm_cache_migration *mg)
1588 {
1589         int r;
1590         struct dm_cell_key_v2 key;
1591         struct cache *cache = mg->cache;
1592         struct dm_bio_prison_cell_v2 *prealloc;
1593
1594         prealloc = alloc_prison_cell(cache);
1595
1596         build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
1597         r = dm_cell_lock_v2(cache->prison, &key,
1598                             READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
1599         if (r < 0) {
1600                 free_prison_cell(cache, prealloc);
1601                 invalidate_complete(mg, false);
1602                 return r;
1603         }
1604
1605         if (mg->cell != prealloc)
1606                 free_prison_cell(cache, prealloc);
1607
1608         if (r)
1609                 quiesce(mg, invalidate_remove);
1610
1611         else {
1612                 /*
1613                  * We can't call invalidate_remove() directly here because we
1614                  * might still be in request context.
1615                  */
1616                 init_continuation(&mg->k, invalidate_remove);
1617                 queue_work(cache->wq, &mg->k.ws);
1618         }
1619
1620         return 0;
1621 }
1622
1623 static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
1624                             dm_oblock_t oblock, struct bio *bio)
1625 {
1626         struct dm_cache_migration *mg;
1627
1628         if (!background_work_begin(cache))
1629                 return -EPERM;
1630
1631         mg = alloc_migration(cache);
1632
1633         mg->overwrite_bio = bio;
1634         mg->invalidate_cblock = cblock;
1635         mg->invalidate_oblock = oblock;
1636
1637         return invalidate_lock(mg);
1638 }
1639
1640 /*----------------------------------------------------------------
1641  * bio processing
1642  *--------------------------------------------------------------*/
1643
/* Result of spare_migration_bandwidth(). */
enum busy {
	IDLE = 0,	/* there is room for another migration */
	BUSY = 1	/* there is not */
};
1648
1649 static enum busy spare_migration_bandwidth(struct cache *cache)
1650 {
1651         bool idle = iot_idle_for(&cache->tracker, HZ);
1652         sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1653                 cache->sectors_per_block;
1654
1655         if (idle && current_volume <= cache->migration_threshold)
1656                 return IDLE;
1657         else
1658                 return BUSY;
1659 }
1660
1661 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1662 {
1663         atomic_inc(bio_data_dir(bio) == READ ?
1664                    &cache->stats.read_hit : &cache->stats.write_hit);
1665 }
1666
1667 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1668 {
1669         atomic_inc(bio_data_dir(bio) == READ ?
1670                    &cache->stats.read_miss : &cache->stats.write_miss);
1671 }
1672
1673 /*----------------------------------------------------------------*/
1674
1675 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
1676                    bool *commit_needed)
1677 {
1678         int r, data_dir;
1679         bool rb, background_queued;
1680         dm_cblock_t cblock;
1681
1682         *commit_needed = false;
1683
1684         rb = bio_detain_shared(cache, block, bio);
1685         if (!rb) {
1686                 /*
1687                  * An exclusive lock is held for this block, so we have to
1688                  * wait.  We set the commit_needed flag so the current
1689                  * transaction will be committed asap, allowing this lock
1690                  * to be dropped.
1691                  */
1692                 *commit_needed = true;
1693                 return DM_MAPIO_SUBMITTED;
1694         }
1695
1696         data_dir = bio_data_dir(bio);
1697
1698         if (optimisable_bio(cache, bio, block)) {
1699                 struct policy_work *op = NULL;
1700
1701                 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
1702                 if (unlikely(r && r != -ENOENT)) {
1703                         DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
1704                                     cache_device_name(cache), r);
1705                         bio_io_error(bio);
1706                         return DM_MAPIO_SUBMITTED;
1707                 }
1708
1709                 if (r == -ENOENT && op) {
1710                         bio_drop_shared_lock(cache, bio);
1711                         BUG_ON(op->op != POLICY_PROMOTE);
1712                         mg_start(cache, op, bio);
1713                         return DM_MAPIO_SUBMITTED;
1714                 }
1715         } else {
1716                 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
1717                 if (unlikely(r && r != -ENOENT)) {
1718                         DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
1719                                     cache_device_name(cache), r);
1720                         bio_io_error(bio);
1721                         return DM_MAPIO_SUBMITTED;
1722                 }
1723
1724                 if (background_queued)
1725                         wake_migration_worker(cache);
1726         }
1727
1728         if (r == -ENOENT) {
1729                 struct per_bio_data *pb = get_per_bio_data(bio);
1730
1731                 /*
1732                  * Miss.
1733                  */
1734                 inc_miss_counter(cache, bio);
1735                 if (pb->req_nr == 0) {
1736                         accounted_begin(cache, bio);
1737                         remap_to_origin_clear_discard(cache, bio, block);
1738                 } else {
1739                         /*
1740                          * This is a duplicate writethrough io that is no
1741                          * longer needed because the block has been demoted.
1742                          */
1743                         bio_endio(bio);
1744                         return DM_MAPIO_SUBMITTED;
1745                 }
1746         } else {
1747                 /*
1748                  * Hit.
1749                  */
1750                 inc_hit_counter(cache, bio);
1751
1752                 /*
1753                  * Passthrough always maps to the origin, invalidating any
1754                  * cache blocks that are written to.
1755                  */
1756                 if (passthrough_mode(cache)) {
1757                         if (bio_data_dir(bio) == WRITE) {
1758                                 bio_drop_shared_lock(cache, bio);
1759                                 atomic_inc(&cache->stats.demotion);
1760                                 invalidate_start(cache, cblock, block, bio);
1761                         } else
1762                                 remap_to_origin_clear_discard(cache, bio, block);
1763                 } else {
1764                         if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
1765                             !is_dirty(cache, cblock)) {
1766                                 remap_to_origin_and_cache(cache, bio, block, cblock);
1767                                 accounted_begin(cache, bio);
1768                         } else
1769                                 remap_to_cache_dirty(cache, bio, block, cblock);
1770                 }
1771         }
1772
1773         /*
1774          * dm core turns FUA requests into a separate payload and FLUSH req.
1775          */
1776         if (bio->bi_opf & REQ_FUA) {
1777                 /*
1778                  * issue_after_commit will call accounted_begin a second time.  So
1779                  * we call accounted_complete() to avoid double accounting.
1780                  */
1781                 accounted_complete(cache, bio);
1782                 issue_after_commit(&cache->committer, bio);
1783                 *commit_needed = true;
1784                 return DM_MAPIO_SUBMITTED;
1785         }
1786
1787         return DM_MAPIO_REMAPPED;
1788 }
1789
1790 static bool process_bio(struct cache *cache, struct bio *bio)
1791 {
1792         bool commit_needed;
1793
1794         if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
1795                 generic_make_request(bio);
1796
1797         return commit_needed;
1798 }
1799
1800 /*
1801  * A non-zero return indicates read_only or fail_io mode.
1802  */
1803 static int commit(struct cache *cache, bool clean_shutdown)
1804 {
1805         int r;
1806
1807         if (get_cache_mode(cache) >= CM_READ_ONLY)
1808                 return -EINVAL;
1809
1810         atomic_inc(&cache->stats.commit_count);
1811         r = dm_cache_commit(cache->cmd, clean_shutdown);
1812         if (r)
1813                 metadata_operation_failed(cache, "dm_cache_commit", r);
1814
1815         return r;
1816 }
1817
1818 /*
1819  * Used by the batcher.
1820  */
1821 static blk_status_t commit_op(void *context)
1822 {
1823         struct cache *cache = context;
1824
1825         if (dm_cache_changed_this_transaction(cache->cmd))
1826                 return errno_to_blk_status(commit(cache, false));
1827
1828         return 0;
1829 }
1830
1831 /*----------------------------------------------------------------*/
1832
1833 static bool process_flush_bio(struct cache *cache, struct bio *bio)
1834 {
1835         struct per_bio_data *pb = get_per_bio_data(bio);
1836
1837         if (!pb->req_nr)
1838                 remap_to_origin(cache, bio);
1839         else
1840                 remap_to_cache(cache, bio, 0);
1841
1842         issue_after_commit(&cache->committer, bio);
1843         return true;
1844 }
1845
1846 static bool process_discard_bio(struct cache *cache, struct bio *bio)
1847 {
1848         dm_dblock_t b, e;
1849
1850         // FIXME: do we need to lock the region?  Or can we just assume the
1851         // user wont be so foolish as to issue discard concurrently with
1852         // other IO?
1853         calc_discard_block_range(cache, bio, &b, &e);
1854         while (b != e) {
1855                 set_discard(cache, b);
1856                 b = to_dblock(from_dblock(b) + 1);
1857         }
1858
1859         if (cache->features.discard_passdown) {
1860                 remap_to_origin(cache, bio);
1861                 generic_make_request(bio);
1862         } else
1863                 bio_endio(bio);
1864
1865         return false;
1866 }
1867
1868 static void process_deferred_bios(struct work_struct *ws)
1869 {
1870         struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
1871
1872         bool commit_needed = false;
1873         struct bio_list bios;
1874         struct bio *bio;
1875
1876         bio_list_init(&bios);
1877
1878         spin_lock_irq(&cache->lock);
1879         bio_list_merge(&bios, &cache->deferred_bios);
1880         bio_list_init(&cache->deferred_bios);
1881         spin_unlock_irq(&cache->lock);
1882
1883         while ((bio = bio_list_pop(&bios))) {
1884                 if (bio->bi_opf & REQ_PREFLUSH)
1885                         commit_needed = process_flush_bio(cache, bio) || commit_needed;
1886
1887                 else if (bio_op(bio) == REQ_OP_DISCARD)
1888                         commit_needed = process_discard_bio(cache, bio) || commit_needed;
1889
1890                 else
1891                         commit_needed = process_bio(cache, bio) || commit_needed;
1892         }
1893
1894         if (commit_needed)
1895                 schedule_commit(&cache->committer);
1896 }
1897
1898 /*----------------------------------------------------------------
1899  * Main worker loop
1900  *--------------------------------------------------------------*/
1901
1902 static void requeue_deferred_bios(struct cache *cache)
1903 {
1904         struct bio *bio;
1905         struct bio_list bios;
1906
1907         bio_list_init(&bios);
1908         bio_list_merge(&bios, &cache->deferred_bios);
1909         bio_list_init(&cache->deferred_bios);
1910
1911         while ((bio = bio_list_pop(&bios))) {
1912                 bio->bi_status = BLK_STS_DM_REQUEUE;
1913                 bio_endio(bio);
1914         }
1915 }
1916
1917 /*
1918  * We want to commit periodically so that not too much
1919  * unwritten metadata builds up.
1920  */
1921 static void do_waker(struct work_struct *ws)
1922 {
1923         struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1924
1925         policy_tick(cache->policy, true);
1926         wake_migration_worker(cache);
1927         schedule_commit(&cache->committer);
1928         queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1929 }
1930
1931 static void check_migrations(struct work_struct *ws)
1932 {
1933         int r;
1934         struct policy_work *op;
1935         struct cache *cache = container_of(ws, struct cache, migration_worker);
1936         enum busy b;
1937
1938         for (;;) {
1939                 b = spare_migration_bandwidth(cache);
1940
1941                 r = policy_get_background_work(cache->policy, b == IDLE, &op);
1942                 if (r == -ENODATA)
1943                         break;
1944
1945                 if (r) {
1946                         DMERR_LIMIT("%s: policy_background_work failed",
1947                                     cache_device_name(cache));
1948                         break;
1949                 }
1950
1951                 r = mg_start(cache, op, NULL);
1952                 if (r)
1953                         break;
1954         }
1955 }
1956
1957 /*----------------------------------------------------------------
1958  * Target methods
1959  *--------------------------------------------------------------*/
1960
1961 /*
1962  * This function gets called on the error paths of the constructor, so we
1963  * have to cope with a partially initialised struct.
1964  */
1965 static void destroy(struct cache *cache)
1966 {
1967         unsigned i;
1968
1969         mempool_exit(&cache->migration_pool);
1970
1971         if (cache->prison)
1972                 dm_bio_prison_destroy_v2(cache->prison);
1973
1974         if (cache->wq)
1975                 destroy_workqueue(cache->wq);
1976
1977         if (cache->dirty_bitset)
1978                 free_bitset(cache->dirty_bitset);
1979
1980         if (cache->discard_bitset)
1981                 free_bitset(cache->discard_bitset);
1982
1983         if (cache->copier)
1984                 dm_kcopyd_client_destroy(cache->copier);
1985
1986         if (cache->cmd)
1987                 dm_cache_metadata_close(cache->cmd);
1988
1989         if (cache->metadata_dev)
1990                 dm_put_device(cache->ti, cache->metadata_dev);
1991
1992         if (cache->origin_dev)
1993                 dm_put_device(cache->ti, cache->origin_dev);
1994
1995         if (cache->cache_dev)
1996                 dm_put_device(cache->ti, cache->cache_dev);
1997
1998         if (cache->policy)
1999                 dm_cache_policy_destroy(cache->policy);
2000
2001         for (i = 0; i < cache->nr_ctr_args ; i++)
2002                 kfree(cache->ctr_args[i]);
2003         kfree(cache->ctr_args);
2004
2005         bioset_exit(&cache->bs);
2006
2007         kfree(cache);
2008 }
2009
2010 static void cache_dtr(struct dm_target *ti)
2011 {
2012         struct cache *cache = ti->private;
2013
2014         destroy(cache);
2015 }
2016
2017 static sector_t get_dev_size(struct dm_dev *dev)
2018 {
2019         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
2020 }
2021
2022 /*----------------------------------------------------------------*/
2023
/*
 * Construct a cache device mapping.
 *
 * cache <metadata dev> <cache dev> <origin dev> <block size>
 *       <#feature args> [<feature arg>]*
 *       <policy> <#policy args> [<policy arg>]*
 *
 * metadata dev    : fast device holding the persistent metadata
 * cache dev       : fast device holding cached data blocks
 * origin dev      : slow device holding original data blocks
 * block size      : cache unit size in sectors
 *
 * #feature args   : number of feature arguments passed
 * feature args    : writeback (the default), writethrough, passthrough,
 *                   metadata2, no_discard_passdown.  At most one of
 *                   writeback, writethrough and passthrough may be given.
 *
 * policy          : the replacement policy to use
 * #policy args    : an even number of policy arguments corresponding
 *                   to key/value pairs passed to the policy
 * policy args     : key/value pairs passed to the policy
 *                   E.g. 'sequential_threshold 1024'
 *                   See cache-policies.txt for details.
 *
 * Optional feature arguments are:
 *   writethrough  : write through caching that prohibits cache block
 *                   content from being different from origin block content.
 *                   Without this argument, the default behaviour is to write
 *                   back cache block contents later for performance reasons,
 *                   so they may differ from the corresponding origin blocks.
 *   passthrough   : only accepted when every cache block is clean; the
 *                   policy is told not to allow migrations.
 *   metadata2     : use version 2 of the metadata format (the default is
 *                   version 1).
 *   no_discard_passdown : complete discards in the cache target instead of
 *                   passing them down to the origin device.
 */
/*
 * Constructor arguments as parsed by parse_cache_args().  The devices are
 * handed over to the struct cache by cache_create(); anything still held
 * here is dropped by destroy_cache_args().
 */
struct cache_args {
        struct dm_target *ti;           /* target being constructed */

        struct dm_dev *metadata_dev;    /* opened by parse_metadata_dev() */

        struct dm_dev *cache_dev;       /* opened by parse_cache_dev() */
        sector_t cache_sectors;         /* size of cache_dev, in sectors */

        struct dm_dev *origin_dev;      /* opened by parse_origin_dev() */
        sector_t origin_sectors;        /* size of origin_dev, in sectors */

        uint32_t block_size;            /* cache block size, in sectors */

        const char *policy_name;        /* points into the ctr argv, not a copy */
        int policy_argc;                /* number of policy arguments */
        const char **policy_argv;       /* points into the ctr argv, not a copy */

        struct cache_features features; /* defaults from init_features() plus feature args */
};
2072
2073 static void destroy_cache_args(struct cache_args *ca)
2074 {
2075         if (ca->metadata_dev)
2076                 dm_put_device(ca->ti, ca->metadata_dev);
2077
2078         if (ca->cache_dev)
2079                 dm_put_device(ca->ti, ca->cache_dev);
2080
2081         if (ca->origin_dev)
2082                 dm_put_device(ca->ti, ca->origin_dev);
2083
2084         kfree(ca);
2085 }
2086
2087 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2088 {
2089         if (!as->argc) {
2090                 *error = "Insufficient args";
2091                 return false;
2092         }
2093
2094         return true;
2095 }
2096
2097 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2098                               char **error)
2099 {
2100         int r;
2101         sector_t metadata_dev_size;
2102         char b[BDEVNAME_SIZE];
2103
2104         if (!at_least_one_arg(as, error))
2105                 return -EINVAL;
2106
2107         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2108                           &ca->metadata_dev);
2109         if (r) {
2110                 *error = "Error opening metadata device";
2111                 return r;
2112         }
2113
2114         metadata_dev_size = get_dev_size(ca->metadata_dev);
2115         if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2116                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2117                        bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
2118
2119         return 0;
2120 }
2121
2122 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2123                            char **error)
2124 {
2125         int r;
2126
2127         if (!at_least_one_arg(as, error))
2128                 return -EINVAL;
2129
2130         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2131                           &ca->cache_dev);
2132         if (r) {
2133                 *error = "Error opening cache device";
2134                 return r;
2135         }
2136         ca->cache_sectors = get_dev_size(ca->cache_dev);
2137
2138         return 0;
2139 }
2140
2141 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2142                             char **error)
2143 {
2144         int r;
2145
2146         if (!at_least_one_arg(as, error))
2147                 return -EINVAL;
2148
2149         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2150                           &ca->origin_dev);
2151         if (r) {
2152                 *error = "Error opening origin device";
2153                 return r;
2154         }
2155
2156         ca->origin_sectors = get_dev_size(ca->origin_dev);
2157         if (ca->ti->len > ca->origin_sectors) {
2158                 *error = "Device size larger than cached device";
2159                 return -EINVAL;
2160         }
2161
2162         return 0;
2163 }
2164
2165 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2166                             char **error)
2167 {
2168         unsigned long block_size;
2169
2170         if (!at_least_one_arg(as, error))
2171                 return -EINVAL;
2172
2173         if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2174             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2175             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2176             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2177                 *error = "Invalid data block size";
2178                 return -EINVAL;
2179         }
2180
2181         if (block_size > ca->cache_sectors) {
2182                 *error = "Data block size is larger than the cache device";
2183                 return -EINVAL;
2184         }
2185
2186         ca->block_size = block_size;
2187
2188         return 0;
2189 }
2190
2191 static void init_features(struct cache_features *cf)
2192 {
2193         cf->mode = CM_WRITE;
2194         cf->io_mode = CM_IO_WRITEBACK;
2195         cf->metadata_version = 1;
2196         cf->discard_passdown = true;
2197 }
2198
2199 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2200                           char **error)
2201 {
2202         static const struct dm_arg _args[] = {
2203                 {0, 3, "Invalid number of cache feature arguments"},
2204         };
2205
2206         int r, mode_ctr = 0;
2207         unsigned argc;
2208         const char *arg;
2209         struct cache_features *cf = &ca->features;
2210
2211         init_features(cf);
2212
2213         r = dm_read_arg_group(_args, as, &argc, error);
2214         if (r)
2215                 return -EINVAL;
2216
2217         while (argc--) {
2218                 arg = dm_shift_arg(as);
2219
2220                 if (!strcasecmp(arg, "writeback")) {
2221                         cf->io_mode = CM_IO_WRITEBACK;
2222                         mode_ctr++;
2223                 }
2224
2225                 else if (!strcasecmp(arg, "writethrough")) {
2226                         cf->io_mode = CM_IO_WRITETHROUGH;
2227                         mode_ctr++;
2228                 }
2229
2230                 else if (!strcasecmp(arg, "passthrough")) {
2231                         cf->io_mode = CM_IO_PASSTHROUGH;
2232                         mode_ctr++;
2233                 }
2234
2235                 else if (!strcasecmp(arg, "metadata2"))
2236                         cf->metadata_version = 2;
2237
2238                 else if (!strcasecmp(arg, "no_discard_passdown"))
2239                         cf->discard_passdown = false;
2240
2241                 else {
2242                         *error = "Unrecognised cache feature requested";
2243                         return -EINVAL;
2244                 }
2245         }
2246
2247         if (mode_ctr > 1) {
2248                 *error = "Duplicate cache io_mode features requested";
2249                 return -EINVAL;
2250         }
2251
2252         return 0;
2253 }
2254
2255 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2256                         char **error)
2257 {
2258         static const struct dm_arg _args[] = {
2259                 {0, 1024, "Invalid number of policy arguments"},
2260         };
2261
2262         int r;
2263
2264         if (!at_least_one_arg(as, error))
2265                 return -EINVAL;
2266
2267         ca->policy_name = dm_shift_arg(as);
2268
2269         r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2270         if (r)
2271                 return -EINVAL;
2272
2273         ca->policy_argv = (const char **)as->argv;
2274         dm_consume_args(as, ca->policy_argc);
2275
2276         return 0;
2277 }
2278
2279 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2280                             char **error)
2281 {
2282         int r;
2283         struct dm_arg_set as;
2284
2285         as.argc = argc;
2286         as.argv = argv;
2287
2288         r = parse_metadata_dev(ca, &as, error);
2289         if (r)
2290                 return r;
2291
2292         r = parse_cache_dev(ca, &as, error);
2293         if (r)
2294                 return r;
2295
2296         r = parse_origin_dev(ca, &as, error);
2297         if (r)
2298                 return r;
2299
2300         r = parse_block_size(ca, &as, error);
2301         if (r)
2302                 return r;
2303
2304         r = parse_features(ca, &as, error);
2305         if (r)
2306                 return r;
2307
2308         r = parse_policy(ca, &as, error);
2309         if (r)
2310                 return r;
2311
2312         return 0;
2313 }
2314
2315 /*----------------------------------------------------------------*/
2316
2317 static struct kmem_cache *migration_cache;
2318
2319 #define NOT_CORE_OPTION 1
2320
2321 static int process_config_option(struct cache *cache, const char *key, const char *value)
2322 {
2323         unsigned long tmp;
2324
2325         if (!strcasecmp(key, "migration_threshold")) {
2326                 if (kstrtoul(value, 10, &tmp))
2327                         return -EINVAL;
2328
2329                 cache->migration_threshold = tmp;
2330                 return 0;
2331         }
2332
2333         return NOT_CORE_OPTION;
2334 }
2335
2336 static int set_config_value(struct cache *cache, const char *key, const char *value)
2337 {
2338         int r = process_config_option(cache, key, value);
2339
2340         if (r == NOT_CORE_OPTION)
2341                 r = policy_set_config_value(cache->policy, key, value);
2342
2343         if (r)
2344                 DMWARN("bad config value for %s: %s", key, value);
2345
2346         return r;
2347 }
2348
2349 static int set_config_values(struct cache *cache, int argc, const char **argv)
2350 {
2351         int r = 0;
2352
2353         if (argc & 1) {
2354                 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2355                 return -EINVAL;
2356         }
2357
2358         while (argc) {
2359                 r = set_config_value(cache, argv[0], argv[1]);
2360                 if (r)
2361                         break;
2362
2363                 argc -= 2;
2364                 argv += 2;
2365         }
2366
2367         return r;
2368 }
2369
2370 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2371                                char **error)
2372 {
2373         struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2374                                                            cache->cache_size,
2375                                                            cache->origin_sectors,
2376                                                            cache->sectors_per_block);
2377         if (IS_ERR(p)) {
2378                 *error = "Error creating cache's policy";
2379                 return PTR_ERR(p);
2380         }
2381         cache->policy = p;
2382         BUG_ON(!cache->policy);
2383
2384         return 0;
2385 }
2386
2387 /*
2388  * We want the discard block size to be at least the size of the cache
2389  * block size and have no more than 2^14 discard blocks across the origin.
2390  */
2391 #define MAX_DISCARD_BLOCKS (1 << 14)
2392
2393 static bool too_many_discard_blocks(sector_t discard_block_size,
2394                                     sector_t origin_size)
2395 {
2396         (void) sector_div(origin_size, discard_block_size);
2397
2398         return origin_size > MAX_DISCARD_BLOCKS;
2399 }
2400
2401 static sector_t calculate_discard_block_size(sector_t cache_block_size,
2402                                              sector_t origin_size)
2403 {
2404         sector_t discard_block_size = cache_block_size;
2405
2406         if (origin_size)
2407                 while (too_many_discard_blocks(discard_block_size, origin_size))
2408                         discard_block_size *= 2;
2409
2410         return discard_block_size;
2411 }
2412
2413 static void set_cache_size(struct cache *cache, dm_cblock_t size)
2414 {
2415         dm_block_t nr_blocks = from_cblock(size);
2416
2417         if (nr_blocks > (1 << 20) && cache->cache_size != size)
2418                 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2419                              "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2420                              "Please consider increasing the cache block size to reduce the overall cache block count.",
2421                              (unsigned long long) nr_blocks);
2422
2423         cache->cache_size = size;
2424 }
2425
2426 static int is_congested(struct dm_dev *dev, int bdi_bits)
2427 {
2428         struct request_queue *q = bdev_get_queue(dev->bdev);
2429         return bdi_congested(q->backing_dev_info, bdi_bits);
2430 }
2431
2432 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2433 {
2434         struct cache *cache = container_of(cb, struct cache, callbacks);
2435
2436         return is_congested(cache->origin_dev, bdi_bits) ||
2437                 is_congested(cache->cache_dev, bdi_bits);
2438 }
2439
2440 #define DEFAULT_MIGRATION_THRESHOLD 2048
2441
2442 static int cache_create(struct cache_args *ca, struct cache **result)
2443 {
2444         int r = 0;
2445         char **error = &ca->ti->error;
2446         struct cache *cache;
2447         struct dm_target *ti = ca->ti;
2448         dm_block_t origin_blocks;
2449         struct dm_cache_metadata *cmd;
2450         bool may_format = ca->features.mode == CM_WRITE;
2451
2452         cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2453         if (!cache)
2454                 return -ENOMEM;
2455
2456         cache->ti = ca->ti;
2457         ti->private = cache;
2458         ti->num_flush_bios = 2;
2459         ti->flush_supported = true;
2460
2461         ti->num_discard_bios = 1;
2462         ti->discards_supported = true;
2463
2464         ti->per_io_data_size = sizeof(struct per_bio_data);
2465
2466         cache->features = ca->features;
2467         if (writethrough_mode(cache)) {
2468                 /* Create bioset for writethrough bios issued to origin */
2469                 r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0);
2470                 if (r)
2471                         goto bad;
2472         }
2473
2474         cache->callbacks.congested_fn = cache_is_congested;
2475         dm_table_add_target_callbacks(ti->table, &cache->callbacks);
2476
2477         cache->metadata_dev = ca->metadata_dev;
2478         cache->origin_dev = ca->origin_dev;
2479         cache->cache_dev = ca->cache_dev;
2480
2481         ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2482
2483         origin_blocks = cache->origin_sectors = ca->origin_sectors;
2484         origin_blocks = block_div(origin_blocks, ca->block_size);
2485         cache->origin_blocks = to_oblock(origin_blocks);
2486
2487         cache->sectors_per_block = ca->block_size;
2488         if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2489                 r = -EINVAL;
2490                 goto bad;
2491         }
2492
2493         if (ca->block_size & (ca->block_size - 1)) {
2494                 dm_block_t cache_size = ca->cache_sectors;
2495
2496                 cache->sectors_per_block_shift = -1;
2497                 cache_size = block_div(cache_size, ca->block_size);
2498                 set_cache_size(cache, to_cblock(cache_size));
2499         } else {
2500                 cache->sectors_per_block_shift = __ffs(ca->block_size);
2501                 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2502         }
2503
2504         r = create_cache_policy(cache, ca, error);
2505         if (r)
2506                 goto bad;
2507
2508         cache->policy_nr_args = ca->policy_argc;
2509         cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2510
2511         r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2512         if (r) {
2513                 *error = "Error setting cache policy's config values";
2514                 goto bad;
2515         }
2516
2517         cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2518                                      ca->block_size, may_format,
2519                                      dm_cache_policy_get_hint_size(cache->policy),
2520                                      ca->features.metadata_version);
2521         if (IS_ERR(cmd)) {
2522                 *error = "Error creating metadata object";
2523                 r = PTR_ERR(cmd);
2524                 goto bad;
2525         }
2526         cache->cmd = cmd;
2527         set_cache_mode(cache, CM_WRITE);
2528         if (get_cache_mode(cache) != CM_WRITE) {
2529                 *error = "Unable to get write access to metadata, please check/repair metadata.";
2530                 r = -EINVAL;
2531                 goto bad;
2532         }
2533
2534         if (passthrough_mode(cache)) {
2535                 bool all_clean;
2536
2537                 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2538                 if (r) {
2539                         *error = "dm_cache_metadata_all_clean() failed";
2540                         goto bad;
2541                 }
2542
2543                 if (!all_clean) {
2544                         *error = "Cannot enter passthrough mode unless all blocks are clean";
2545                         r = -EINVAL;
2546                         goto bad;
2547                 }
2548
2549                 policy_allow_migrations(cache->policy, false);
2550         }
2551
2552         spin_lock_init(&cache->lock);
2553         bio_list_init(&cache->deferred_bios);
2554         atomic_set(&cache->nr_allocated_migrations, 0);
2555         atomic_set(&cache->nr_io_migrations, 0);
2556         init_waitqueue_head(&cache->migration_wait);
2557
2558         r = -ENOMEM;
2559         atomic_set(&cache->nr_dirty, 0);
2560         cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2561         if (!cache->dirty_bitset) {
2562                 *error = "could not allocate dirty bitset";
2563                 goto bad;
2564         }
2565         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2566
2567         cache->discard_block_size =
2568                 calculate_discard_block_size(cache->sectors_per_block,
2569                                              cache->origin_sectors);
2570         cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2571                                                               cache->discard_block_size));
2572         cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2573         if (!cache->discard_bitset) {
2574                 *error = "could not allocate discard bitset";
2575                 goto bad;
2576         }
2577         clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2578
2579         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2580         if (IS_ERR(cache->copier)) {
2581                 *error = "could not create kcopyd client";
2582                 r = PTR_ERR(cache->copier);
2583                 goto bad;
2584         }
2585
2586         cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
2587         if (!cache->wq) {
2588                 *error = "could not create workqueue for metadata object";
2589                 goto bad;
2590         }
2591         INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2592         INIT_WORK(&cache->migration_worker, check_migrations);
2593         INIT_DELAYED_WORK(&cache->waker, do_waker);
2594
2595         cache->prison = dm_bio_prison_create_v2(cache->wq);
2596         if (!cache->prison) {
2597                 *error = "could not create bio prison";
2598                 goto bad;
2599         }
2600
2601         r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE,
2602                                    migration_cache);
2603         if (r) {
2604                 *error = "Error creating cache's migration mempool";
2605                 goto bad;
2606         }
2607
2608         cache->need_tick_bio = true;
2609         cache->sized = false;
2610         cache->invalidate = false;
2611         cache->commit_requested = false;
2612         cache->loaded_mappings = false;
2613         cache->loaded_discards = false;
2614
2615         load_stats(cache);
2616
2617         atomic_set(&cache->stats.demotion, 0);
2618         atomic_set(&cache->stats.promotion, 0);
2619         atomic_set(&cache->stats.copies_avoided, 0);
2620         atomic_set(&cache->stats.cache_cell_clash, 0);
2621         atomic_set(&cache->stats.commit_count, 0);
2622         atomic_set(&cache->stats.discard_count, 0);
2623
2624         spin_lock_init(&cache->invalidation_lock);
2625         INIT_LIST_HEAD(&cache->invalidation_requests);
2626
2627         batcher_init(&cache->committer, commit_op, cache,
2628                      issue_op, cache, cache->wq);
2629         iot_init(&cache->tracker);
2630
2631         init_rwsem(&cache->background_work_lock);
2632         prevent_background_work(cache);
2633
2634         *result = cache;
2635         return 0;
2636 bad:
2637         destroy(cache);
2638         return r;
2639 }
2640
2641 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2642 {
2643         unsigned i;
2644         const char **copy;
2645
2646         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2647         if (!copy)
2648                 return -ENOMEM;
2649         for (i = 0; i < argc; i++) {
2650                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2651                 if (!copy[i]) {
2652                         while (i--)
2653                                 kfree(copy[i]);
2654                         kfree(copy);
2655                         return -ENOMEM;
2656                 }
2657         }
2658
2659         cache->nr_ctr_args = argc;
2660         cache->ctr_args = copy;
2661
2662         return 0;
2663 }
2664
2665 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2666 {
2667         int r = -EINVAL;
2668         struct cache_args *ca;
2669         struct cache *cache = NULL;
2670
2671         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2672         if (!ca) {
2673                 ti->error = "Error allocating memory for cache";
2674                 return -ENOMEM;
2675         }
2676         ca->ti = ti;
2677
2678         r = parse_cache_args(ca, argc, argv, &ti->error);
2679         if (r)
2680                 goto out;
2681
2682         r = cache_create(ca, &cache);
2683         if (r)
2684                 goto out;
2685
2686         r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2687         if (r) {
2688                 destroy(cache);
2689                 goto out;
2690         }
2691
2692         ti->private = cache;
2693 out:
2694         destroy_cache_args(ca);
2695         return r;
2696 }
2697
2698 /*----------------------------------------------------------------*/
2699
2700 static int cache_map(struct dm_target *ti, struct bio *bio)
2701 {
2702         struct cache *cache = ti->private;
2703
2704         int r;
2705         bool commit_needed;
2706         dm_oblock_t block = get_bio_block(cache, bio);
2707
2708         init_per_bio_data(bio);
2709         if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2710                 /*
2711                  * This can only occur if the io goes to a partial block at
2712                  * the end of the origin device.  We don't cache these.
2713                  * Just remap to the origin and carry on.
2714                  */
2715                 remap_to_origin(cache, bio);
2716                 accounted_begin(cache, bio);
2717                 return DM_MAPIO_REMAPPED;
2718         }
2719
2720         if (discard_or_flush(bio)) {
2721                 defer_bio(cache, bio);
2722                 return DM_MAPIO_SUBMITTED;
2723         }
2724
2725         r = map_bio(cache, bio, block, &commit_needed);
2726         if (commit_needed)
2727                 schedule_commit(&cache->committer);
2728
2729         return r;
2730 }
2731
2732 static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
2733 {
2734         struct cache *cache = ti->private;
2735         unsigned long flags;
2736         struct per_bio_data *pb = get_per_bio_data(bio);
2737
2738         if (pb->tick) {
2739                 policy_tick(cache->policy, false);
2740
2741                 spin_lock_irqsave(&cache->lock, flags);
2742                 cache->need_tick_bio = true;
2743                 spin_unlock_irqrestore(&cache->lock, flags);
2744         }
2745
2746         bio_drop_shared_lock(cache, bio);
2747         accounted_complete(cache, bio);
2748
2749         return DM_ENDIO_DONE;
2750 }
2751
2752 static int write_dirty_bitset(struct cache *cache)
2753 {
2754         int r;
2755
2756         if (get_cache_mode(cache) >= CM_READ_ONLY)
2757                 return -EINVAL;
2758
2759         r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
2760         if (r)
2761                 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
2762
2763         return r;
2764 }
2765
2766 static int write_discard_bitset(struct cache *cache)
2767 {
2768         unsigned i, r;
2769
2770         if (get_cache_mode(cache) >= CM_READ_ONLY)
2771                 return -EINVAL;
2772
2773         r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2774                                            cache->discard_nr_blocks);
2775         if (r) {
2776                 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
2777                 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
2778                 return r;
2779         }
2780
2781         for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2782                 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2783                                          is_discarded(cache, to_dblock(i)));
2784                 if (r) {
2785                         metadata_operation_failed(cache, "dm_cache_set_discard", r);
2786                         return r;
2787                 }
2788         }
2789
2790         return 0;
2791 }
2792
2793 static int write_hints(struct cache *cache)
2794 {
2795         int r;
2796
2797         if (get_cache_mode(cache) >= CM_READ_ONLY)
2798                 return -EINVAL;
2799
2800         r = dm_cache_write_hints(cache->cmd, cache->policy);
2801         if (r) {
2802                 metadata_operation_failed(cache, "dm_cache_write_hints", r);
2803                 return r;
2804         }
2805
2806         return 0;
2807 }
2808
2809 /*
2810  * returns true on success
2811  */
2812 static bool sync_metadata(struct cache *cache)
2813 {
2814         int r1, r2, r3, r4;
2815
2816         r1 = write_dirty_bitset(cache);
2817         if (r1)
2818                 DMERR("%s: could not write dirty bitset", cache_device_name(cache));
2819
2820         r2 = write_discard_bitset(cache);
2821         if (r2)
2822                 DMERR("%s: could not write discard bitset", cache_device_name(cache));
2823
2824         save_stats(cache);
2825
2826         r3 = write_hints(cache);
2827         if (r3)
2828                 DMERR("%s: could not write hints", cache_device_name(cache));
2829
2830         /*
2831          * If writing the above metadata failed, we still commit, but don't
2832          * set the clean shutdown flag.  This will effectively force every
2833          * dirty bit to be set on reload.
2834          */
2835         r4 = commit(cache, !r1 && !r2 && !r3);
2836         if (r4)
2837                 DMERR("%s: could not write cache metadata", cache_device_name(cache));
2838
2839         return !r1 && !r2 && !r3 && !r4;
2840 }
2841
2842 static void cache_postsuspend(struct dm_target *ti)
2843 {
2844         struct cache *cache = ti->private;
2845
2846         prevent_background_work(cache);
2847         BUG_ON(atomic_read(&cache->nr_io_migrations));
2848
2849         cancel_delayed_work(&cache->waker);
2850         flush_workqueue(cache->wq);
2851         WARN_ON(cache->tracker.in_flight);
2852
2853         /*
2854          * If it's a flush suspend there won't be any deferred bios, so this
2855          * call is harmless.
2856          */
2857         requeue_deferred_bios(cache);
2858
2859         if (get_cache_mode(cache) == CM_WRITE)
2860                 (void) sync_metadata(cache);
2861 }
2862
2863 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2864                         bool dirty, uint32_t hint, bool hint_valid)
2865 {
2866         int r;
2867         struct cache *cache = context;
2868
2869         if (dirty) {
2870                 set_bit(from_cblock(cblock), cache->dirty_bitset);
2871                 atomic_inc(&cache->nr_dirty);
2872         } else
2873                 clear_bit(from_cblock(cblock), cache->dirty_bitset);
2874
2875         r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
2876         if (r)
2877                 return r;
2878
2879         return 0;
2880 }
2881
2882 /*
2883  * The discard block size in the on disk metadata is not
2884  * neccessarily the same as we're currently using.  So we have to
2885  * be careful to only set the discarded attribute if we know it
2886  * covers a complete block of the new size.
2887  */
2888 struct discard_load_info {
2889         struct cache *cache;
2890
2891         /*
2892          * These blocks are sized using the on disk dblock size, rather
2893          * than the current one.
2894          */
2895         dm_block_t block_size;
2896         dm_block_t discard_begin, discard_end;
2897 };
2898
2899 static void discard_load_info_init(struct cache *cache,
2900                                    struct discard_load_info *li)
2901 {
2902         li->cache = cache;
2903         li->discard_begin = li->discard_end = 0;
2904 }
2905
2906 static void set_discard_range(struct discard_load_info *li)
2907 {
2908         sector_t b, e;
2909
2910         if (li->discard_begin == li->discard_end)
2911                 return;
2912
2913         /*
2914          * Convert to sectors.
2915          */
2916         b = li->discard_begin * li->block_size;
2917         e = li->discard_end * li->block_size;
2918
2919         /*
2920          * Then convert back to the current dblock size.
2921          */
2922         b = dm_sector_div_up(b, li->cache->discard_block_size);
2923         sector_div(e, li->cache->discard_block_size);
2924
2925         /*
2926          * The origin may have shrunk, so we need to check we're still in
2927          * bounds.
2928          */
2929         if (e > from_dblock(li->cache->discard_nr_blocks))
2930                 e = from_dblock(li->cache->discard_nr_blocks);
2931
2932         for (; b < e; b++)
2933                 set_discard(li->cache, to_dblock(b));
2934 }
2935
2936 static int load_discard(void *context, sector_t discard_block_size,
2937                         dm_dblock_t dblock, bool discard)
2938 {
2939         struct discard_load_info *li = context;
2940
2941         li->block_size = discard_block_size;
2942
2943         if (discard) {
2944                 if (from_dblock(dblock) == li->discard_end)
2945                         /*
2946                          * We're already in a discard range, just extend it.
2947                          */
2948                         li->discard_end = li->discard_end + 1ULL;
2949
2950                 else {
2951                         /*
2952                          * Emit the old range and start a new one.
2953                          */
2954                         set_discard_range(li);
2955                         li->discard_begin = from_dblock(dblock);
2956                         li->discard_end = li->discard_begin + 1ULL;
2957                 }
2958         } else {
2959                 set_discard_range(li);
2960                 li->discard_begin = li->discard_end = 0;
2961         }
2962
2963         return 0;
2964 }
2965
2966 static dm_cblock_t get_cache_dev_size(struct cache *cache)
2967 {
2968         sector_t size = get_dev_size(cache->cache_dev);
2969         (void) sector_div(size, cache->sectors_per_block);
2970         return to_cblock(size);
2971 }
2972
2973 static bool can_resize(struct cache *cache, dm_cblock_t new_size)
2974 {
2975         if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
2976                 if (cache->sized) {
2977                         DMERR("%s: unable to extend cache due to missing cache table reload",
2978                               cache_device_name(cache));
2979                         return false;
2980                 }
2981         }
2982
2983         /*
2984          * We can't drop a dirty block when shrinking the cache.
2985          */
2986         while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
2987                 new_size = to_cblock(from_cblock(new_size) + 1);
2988                 if (is_dirty(cache, new_size)) {
2989                         DMERR("%s: unable to shrink cache; cache block %llu is dirty",
2990                               cache_device_name(cache),
2991                               (unsigned long long) from_cblock(new_size));
2992                         return false;
2993                 }
2994         }
2995
2996         return true;
2997 }
2998
2999 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
3000 {
3001         int r;
3002
3003         r = dm_cache_resize(cache->cmd, new_size);
3004         if (r) {
3005                 DMERR("%s: could not resize cache metadata", cache_device_name(cache));
3006                 metadata_operation_failed(cache, "dm_cache_resize", r);
3007                 return r;
3008         }
3009
3010         set_cache_size(cache, new_size);
3011
3012         return 0;
3013 }
3014
3015 static int cache_preresume(struct dm_target *ti)
3016 {
3017         int r = 0;
3018         struct cache *cache = ti->private;
3019         dm_cblock_t csize = get_cache_dev_size(cache);
3020
3021         /*
3022          * Check to see if the cache has resized.
3023          */
3024         if (!cache->sized) {
3025                 r = resize_cache_dev(cache, csize);
3026                 if (r)
3027                         return r;
3028
3029                 cache->sized = true;
3030
3031         } else if (csize != cache->cache_size) {
3032                 if (!can_resize(cache, csize))
3033                         return -EINVAL;
3034
3035                 r = resize_cache_dev(cache, csize);
3036                 if (r)
3037                         return r;
3038         }
3039
3040         if (!cache->loaded_mappings) {
3041                 r = dm_cache_load_mappings(cache->cmd, cache->policy,
3042                                            load_mapping, cache);
3043                 if (r) {
3044                         DMERR("%s: could not load cache mappings", cache_device_name(cache));
3045                         metadata_operation_failed(cache, "dm_cache_load_mappings", r);
3046                         return r;
3047                 }
3048
3049                 cache->loaded_mappings = true;
3050         }
3051
3052         if (!cache->loaded_discards) {
3053                 struct discard_load_info li;
3054
3055                 /*
3056                  * The discard bitset could have been resized, or the
3057                  * discard block size changed.  To be safe we start by
3058                  * setting every dblock to not discarded.
3059                  */
3060                 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
3061
3062                 discard_load_info_init(cache, &li);
3063                 r = dm_cache_load_discards(cache->cmd, load_discard, &li);
3064                 if (r) {
3065                         DMERR("%s: could not load origin discards", cache_device_name(cache));
3066                         metadata_operation_failed(cache, "dm_cache_load_discards", r);
3067                         return r;
3068                 }
3069                 set_discard_range(&li);
3070
3071                 cache->loaded_discards = true;
3072         }
3073
3074         return r;
3075 }
3076
/*
 * Target resume hook: set need_tick_bio, re-enable background work and run
 * the waker work function once, directly in the caller's context.
 */
static void cache_resume(struct dm_target *ti)
{
        struct cache *cache = ti->private;

        cache->need_tick_bio = true;
        allow_background_work(cache);
        do_waker(&cache->waker.work);
}
3085
3086 static void emit_flags(struct cache *cache, char *result,
3087                        unsigned maxlen, ssize_t *sz_ptr)
3088 {
3089         ssize_t sz = *sz_ptr;
3090         struct cache_features *cf = &cache->features;
3091         unsigned count = (cf->metadata_version == 2) + !cf->discard_passdown + 1;
3092
3093         DMEMIT("%u ", count);
3094
3095         if (cf->metadata_version == 2)
3096                 DMEMIT("metadata2 ");
3097
3098         if (writethrough_mode(cache))
3099                 DMEMIT("writethrough ");
3100
3101         else if (passthrough_mode(cache))
3102                 DMEMIT("passthrough ");
3103
3104         else if (writeback_mode(cache))
3105                 DMEMIT("writeback ");
3106
3107         else {
3108                 DMEMIT("unknown ");
3109                 DMERR("%s: internal error: unknown io mode: %d",
3110                       cache_device_name(cache), (int) cf->io_mode);
3111         }
3112
3113         if (!cf->discard_passdown)
3114                 DMEMIT("no_discard_passdown ");
3115
3116         *sz_ptr = sz;
3117 }
3118
3119 /*
3120  * Status format:
3121  *
3122  * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3123  * <cache block size> <#used cache blocks>/<#total cache blocks>
3124  * <#read hits> <#read misses> <#write hits> <#write misses>
3125  * <#demotions> <#promotions> <#dirty>
3126  * <#features> <features>*
3127  * <#core args> <core args>
3128  * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
3129  */
3130 static void cache_status(struct dm_target *ti, status_type_t type,
3131                          unsigned status_flags, char *result, unsigned maxlen)
3132 {
3133         int r = 0;
3134         unsigned i;
3135         ssize_t sz = 0;
3136         dm_block_t nr_free_blocks_metadata = 0;
3137         dm_block_t nr_blocks_metadata = 0;
3138         char buf[BDEVNAME_SIZE];
3139         struct cache *cache = ti->private;
3140         dm_cblock_t residency;
3141         bool needs_check;
3142
3143         switch (type) {
3144         case STATUSTYPE_INFO:
3145                 if (get_cache_mode(cache) == CM_FAIL) {
3146                         DMEMIT("Fail");
3147                         break;
3148                 }
3149
3150                 /* Commit to ensure statistics aren't out-of-date */
3151                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3152                         (void) commit(cache, false);
3153
3154                 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
3155                 if (r) {
3156                         DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
3157                               cache_device_name(cache), r);
3158                         goto err;
3159                 }
3160
3161                 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3162                 if (r) {
3163                         DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
3164                               cache_device_name(cache), r);
3165                         goto err;
3166                 }
3167
3168                 residency = policy_residency(cache->policy);
3169
3170                 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
3171                        (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
3172                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3173                        (unsigned long long)nr_blocks_metadata,
3174                        (unsigned long long)cache->sectors_per_block,
3175                        (unsigned long long) from_cblock(residency),
3176                        (unsigned long long) from_cblock(cache->cache_size),
3177                        (unsigned) atomic_read(&cache->stats.read_hit),
3178                        (unsigned) atomic_read(&cache->stats.read_miss),
3179                        (unsigned) atomic_read(&cache->stats.write_hit),
3180                        (unsigned) atomic_read(&cache->stats.write_miss),
3181                        (unsigned) atomic_read(&cache->stats.demotion),
3182                        (unsigned) atomic_read(&cache->stats.promotion),
3183                        (unsigned long) atomic_read(&cache->nr_dirty));
3184
3185                 emit_flags(cache, result, maxlen, &sz);
3186
3187                 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3188
3189                 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3190                 if (sz < maxlen) {
3191                         r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
3192                         if (r)
3193                                 DMERR("%s: policy_emit_config_values returned %d",
3194                                       cache_device_name(cache), r);
3195                 }
3196
3197                 if (get_cache_mode(cache) == CM_READ_ONLY)
3198                         DMEMIT("ro ");
3199                 else
3200                         DMEMIT("rw ");
3201
3202                 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
3203
3204                 if (r || needs_check)
3205                         DMEMIT("needs_check ");
3206                 else
3207                         DMEMIT("- ");
3208
3209                 break;
3210
3211         case STATUSTYPE_TABLE:
3212                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3213                 DMEMIT("%s ", buf);
3214                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3215                 DMEMIT("%s ", buf);
3216                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3217                 DMEMIT("%s", buf);
3218
3219                 for (i = 0; i < cache->nr_ctr_args - 1; i++)
3220                         DMEMIT(" %s", cache->ctr_args[i]);
3221                 if (cache->nr_ctr_args)
3222                         DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3223         }
3224
3225         return;
3226
3227 err:
3228         DMEMIT("Error");
3229 }
3230
/*
 * A half-open range of cblocks: begin is the first cblock in the range and
 * end is the one-past-the-end value, so begin to (end - 1) are covered.
 */
struct cblock_range {
        dm_cblock_t begin;
        dm_cblock_t end;
};
3239
3240 /*
3241  * A cache block range can take two forms:
3242  *
3243  * i) A single cblock, eg. '3456'
3244  * ii) A begin and end cblock with a dash between, eg. 123-234
3245  */
3246 static int parse_cblock_range(struct cache *cache, const char *str,
3247                               struct cblock_range *result)
3248 {
3249         char dummy;
3250         uint64_t b, e;
3251         int r;
3252
3253         /*
3254          * Try and parse form (ii) first.
3255          */
3256         r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3257         if (r < 0)
3258                 return r;
3259
3260         if (r == 2) {
3261                 result->begin = to_cblock(b);
3262                 result->end = to_cblock(e);
3263                 return 0;
3264         }
3265
3266         /*
3267          * That didn't work, try form (i).
3268          */
3269         r = sscanf(str, "%llu%c", &b, &dummy);
3270         if (r < 0)
3271                 return r;
3272
3273         if (r == 1) {
3274                 result->begin = to_cblock(b);
3275                 result->end = to_cblock(from_cblock(result->begin) + 1u);
3276                 return 0;
3277         }
3278
3279         DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
3280         return -EINVAL;
3281 }
3282
3283 static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3284 {
3285         uint64_t b = from_cblock(range->begin);
3286         uint64_t e = from_cblock(range->end);
3287         uint64_t n = from_cblock(cache->cache_size);
3288
3289         if (b >= n) {
3290                 DMERR("%s: begin cblock out of range: %llu >= %llu",
3291                       cache_device_name(cache), b, n);
3292                 return -EINVAL;
3293         }
3294
3295         if (e > n) {
3296                 DMERR("%s: end cblock out of range: %llu > %llu",
3297                       cache_device_name(cache), e, n);
3298                 return -EINVAL;
3299         }
3300
3301         if (b >= e) {
3302                 DMERR("%s: invalid cblock range: %llu >= %llu",
3303                       cache_device_name(cache), b, e);
3304                 return -EINVAL;
3305         }
3306
3307         return 0;
3308 }
3309
3310 static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3311 {
3312         return to_cblock(from_cblock(b) + 1);
3313 }
3314
3315 static int request_invalidation(struct cache *cache, struct cblock_range *range)
3316 {
3317         int r = 0;
3318
3319         /*
3320          * We don't need to do any locking here because we know we're in
3321          * passthrough mode.  There's is potential for a race between an
3322          * invalidation triggered by an io and an invalidation message.  This
3323          * is harmless, we must not worry if the policy call fails.
3324          */
3325         while (range->begin != range->end) {
3326                 r = invalidate_cblock(cache, range->begin);
3327                 if (r)
3328                         return r;
3329
3330                 range->begin = cblock_succ(range->begin);
3331         }
3332
3333         cache->commit_requested = true;
3334         return r;
3335 }
3336
3337 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3338                                               const char **cblock_ranges)
3339 {
3340         int r = 0;
3341         unsigned i;
3342         struct cblock_range range;
3343
3344         if (!passthrough_mode(cache)) {
3345                 DMERR("%s: cache has to be in passthrough mode for invalidation",
3346                       cache_device_name(cache));
3347                 return -EPERM;
3348         }
3349
3350         for (i = 0; i < count; i++) {
3351                 r = parse_cblock_range(cache, cblock_ranges[i], &range);
3352                 if (r)
3353                         break;
3354
3355                 r = validate_cblock_range(cache, &range);
3356                 if (r)
3357                         break;
3358
3359                 /*
3360                  * Pass begin and end origin blocks to the worker and wake it.
3361                  */
3362                 r = request_invalidation(cache, &range);
3363                 if (r)
3364                         break;
3365         }
3366
3367         return r;
3368 }
3369
3370 /*
3371  * Supports
3372  *      "<key> <value>"
3373  * and
3374  *     "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*
3375  *
3376  * The key migration_threshold is supported by the cache target core.
3377  */
3378 static int cache_message(struct dm_target *ti, unsigned argc, char **argv,
3379                          char *result, unsigned maxlen)
3380 {
3381         struct cache *cache = ti->private;
3382
3383         if (!argc)
3384                 return -EINVAL;
3385
3386         if (get_cache_mode(cache) >= CM_READ_ONLY) {
3387                 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
3388                       cache_device_name(cache));
3389                 return -EOPNOTSUPP;
3390         }
3391
3392         if (!strcasecmp(argv[0], "invalidate_cblocks"))
3393                 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3394
3395         if (argc != 2)
3396                 return -EINVAL;
3397
3398         return set_config_value(cache, argv[0], argv[1]);
3399 }
3400
3401 static int cache_iterate_devices(struct dm_target *ti,
3402                                  iterate_devices_callout_fn fn, void *data)
3403 {
3404         int r = 0;
3405         struct cache *cache = ti->private;
3406
3407         r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3408         if (!r)
3409                 r = fn(ti, cache->origin_dev, 0, ti->len, data);
3410
3411         return r;
3412 }
3413
3414 static bool origin_dev_supports_discard(struct block_device *origin_bdev)
3415 {
3416         struct request_queue *q = bdev_get_queue(origin_bdev);
3417
3418         return q && blk_queue_discard(q);
3419 }
3420
3421 /*
3422  * If discard_passdown was enabled verify that the origin device
3423  * supports discards.  Disable discard_passdown if not.
3424  */
3425 static void disable_passdown_if_not_supported(struct cache *cache)
3426 {
3427         struct block_device *origin_bdev = cache->origin_dev->bdev;
3428         struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
3429         const char *reason = NULL;
3430         char buf[BDEVNAME_SIZE];
3431
3432         if (!cache->features.discard_passdown)
3433                 return;
3434
3435         if (!origin_dev_supports_discard(origin_bdev))
3436                 reason = "discard unsupported";
3437
3438         else if (origin_limits->max_discard_sectors < cache->sectors_per_block)
3439                 reason = "max discard sectors smaller than a block";
3440
3441         if (reason) {
3442                 DMWARN("Origin device (%s) %s: Disabling discard passdown.",
3443                        bdevname(origin_bdev, buf), reason);
3444                 cache->features.discard_passdown = false;
3445         }
3446 }
3447
3448 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3449 {
3450         struct block_device *origin_bdev = cache->origin_dev->bdev;
3451         struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
3452
3453         if (!cache->features.discard_passdown) {
3454                 /* No passdown is done so setting own virtual limits */
3455                 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3456                                                     cache->origin_sectors);
3457                 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3458                 return;
3459         }
3460
3461         /*
3462          * cache_iterate_devices() is stacking both origin and fast device limits
3463          * but discards aren't passed to fast device, so inherit origin's limits.
3464          */
3465         limits->max_discard_sectors = origin_limits->max_discard_sectors;
3466         limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors;
3467         limits->discard_granularity = origin_limits->discard_granularity;
3468         limits->discard_alignment = origin_limits->discard_alignment;
3469         limits->discard_misaligned = origin_limits->discard_misaligned;
3470 }
3471
3472 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3473 {
3474         struct cache *cache = ti->private;
3475         uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3476
3477         /*
3478          * If the system-determined stacked limits are compatible with the
3479          * cache's blocksize (io_opt is a factor) do not override them.
3480          */
3481         if (io_opt_sectors < cache->sectors_per_block ||
3482             do_div(io_opt_sectors, cache->sectors_per_block)) {
3483                 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3484                 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3485         }
3486
3487         disable_passdown_if_not_supported(cache);
3488         set_discard_limits(cache, limits);
3489 }
3490
3491 /*----------------------------------------------------------------*/
3492
3493 static struct target_type cache_target = {
3494         .name = "cache",
3495         .version = {2, 1, 0},
3496         .module = THIS_MODULE,
3497         .ctr = cache_ctr,
3498         .dtr = cache_dtr,
3499         .map = cache_map,
3500         .end_io = cache_end_io,
3501         .postsuspend = cache_postsuspend,
3502         .preresume = cache_preresume,
3503         .resume = cache_resume,
3504         .status = cache_status,
3505         .message = cache_message,
3506         .iterate_devices = cache_iterate_devices,
3507         .io_hints = cache_io_hints,
3508 };
3509
3510 static int __init dm_cache_init(void)
3511 {
3512         int r;
3513
3514         migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3515         if (!migration_cache)
3516                 return -ENOMEM;
3517
3518         r = dm_register_target(&cache_target);
3519         if (r) {
3520                 DMERR("cache target registration failed: %d", r);
3521                 kmem_cache_destroy(migration_cache);
3522                 return r;
3523         }
3524
3525         return 0;
3526 }
3527
/*
 * Module exit: undo dm_cache_init() in reverse order, unregistering the
 * target before destroying the migration slab cache.
 */
static void __exit dm_cache_exit(void)
{
        dm_unregister_target(&cache_target);
        kmem_cache_destroy(migration_cache);
}
3533
/* Module entry and exit points. */
module_init(dm_cache_init);
module_exit(dm_cache_exit);

/* Module metadata. */
MODULE_DESCRIPTION(DM_NAME " cache target");
MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
MODULE_LICENSE("GPL");