drivers/md/dm-bufio.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright (C) 2009-2011 Red Hat, Inc.
   4  *
   5  * Author: Mikulas Patocka <mpatocka@redhat.com>
   6  *
   7  * This file is released under the GPL.
   8  */
   9
  10 #include <linux/dm-bufio.h>
  11
  12 #include <linux/device-mapper.h>
  13 #include <linux/dm-io.h>
  14 #include <linux/slab.h>
  15 #include <linux/sched/mm.h>
  16 #include <linux/jiffies.h>
  17 #include <linux/vmalloc.h>
  18 #include <linux/shrinker.h>
  19 #include <linux/module.h>
  20 #include <linux/rbtree.h>
  21 #include <linux/stacktrace.h>
  22 #include <linux/jump_label.h>
  23
  24 #include "dm.h"
  25
/*
 * Message prefix for this module.
 * NOTE(review): presumably picked up by the device-mapper logging macros;
 * the consumers are not visible in this part of the file.
 */
#define DM_MSG_PREFIX "bufio"
  27
/*
 * Memory management policy:
 *      Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *      or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *      Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *      Background writeback of dirty buffers is governed by
 *      DM_BUFIO_WRITEBACK_RATIO.
 *
 * NOTE(review): this comment used to name DM_BUFIO_WRITEBACK_PERCENT, which
 * is not defined here.  The code consuming DM_BUFIO_WRITEBACK_RATIO and
 * DM_BUFIO_LOW_WATERMARK_RATIO is outside this part of the file; confirm
 * their exact meaning against those users.
 */
#define DM_BUFIO_MIN_BUFFERS            8

#define DM_BUFIO_MEMORY_PERCENT         2
#define DM_BUFIO_VMALLOC_PERCENT        25
#define DM_BUFIO_WRITEBACK_RATIO        3
#define DM_BUFIO_LOW_WATERMARK_RATIO    16
  42
/*
 * Interval, in seconds, between checks of buffer ages.
 */
#define DM_BUFIO_WORK_TIMER_SECS        30

/*
 * Default age, in seconds, after which a buffer is freed.
 * Used as the initial value of dm_bufio_max_age.
 */
#define DM_BUFIO_DEFAULT_AGE_SECS       300

/*
 * Default number of bytes of cached data to keep around.
 * Used as the initial value of dm_bufio_retain_bytes.
 */
#define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)

/*
 * Align buffer writes to this boundary (bytes).
 * Tests show that SSDs have the highest IOPS when using 4k writes.
 */
#define DM_BUFIO_WRITE_ALIGN            4096
  63
/*
 * Values for dm_buffer->list_mode.  They also index the lru[] array in
 * struct dm_buffer_cache, which is why LIST_SIZE (the number of lists)
 * comes last.
 */
#define LIST_CLEAN      0
#define LIST_DIRTY      1
#define LIST_SIZE       2
  70
  71 /*--------------------------------------------------------------*/
  72
/*
 * Rather than use an LRU list, we use a clock algorithm where entries
 * are held in a circular list.  When an entry is 'hit' a reference bit
 * is set.  The least recently used entry is approximated by running a
 * cursor around the list selecting unreferenced entries. Referenced
 * entries have their reference bit cleared as the cursor passes them.
 *
 * An lru_entry is the per-object hook embedded in whatever is being
 * tracked (struct dm_buffer embeds one as its 'lru' member).
 */
struct lru_entry {
        /* linkage in the circular list of the owning lru */
        struct list_head list;
        /* reference bit: set by lru_reference(), cleared by lru_insert() and lru_evict() */
        atomic_t referenced;
};
  84
/*
 * Cursor for walking an lru with lru_iter_begin()/lru_iter_next()/
 * lru_iter_end().  Registered iterators are fixed up by
 * lru_iter_invalidate() when an entry is removed from the lru.
 */
struct lru_iter {
        /* the lru being walked */
        struct lru *lru;
        /* linkage on lru->iterators */
        struct list_head list;
        /* last entry to visit; NULL if the lru was empty at begin time */
        struct lru_entry *stop;
        /* next entry to hand out; NULL once the walk is complete */
        struct lru_entry *e;
};
  91
/*
 * A clock-style lru: a circular list of lru_entry objects plus a cursor.
 */
struct lru {
        /* current clock position in the ring; NULL when the lru is empty */
        struct list_head *cursor;
        /* number of entries currently in the ring */
        unsigned long count;

        /* all lru_iter objects currently walking this lru */
        struct list_head iterators;
};
  98
  99 /*--------------*/
 100
 101 static void lru_init(struct lru *lru)
 102 {
 103         lru->cursor = NULL;
 104         lru->count = 0;
 105         INIT_LIST_HEAD(&lru->iterators);
 106 }
 107
/*
 * Sanity checks for an lru that is about to go away.  Nothing is freed
 * here; a warning is emitted (once) if the lru still has a cursor, i.e.
 * still holds entries, or if an iterator is still registered.
 */
static void lru_destroy(struct lru *lru)
{
        WARN_ON_ONCE(lru->cursor);
        WARN_ON_ONCE(!list_empty(&lru->iterators));
}
 113
 114 /*
 115  * Insert a new entry into the lru.
 116  */
 117 static void lru_insert(struct lru *lru, struct lru_entry *le)
 118 {
 119         /*
 120          * Don't be tempted to set to 1, makes the lru aspect
 121          * perform poorly.
 122          */
 123         atomic_set(&le->referenced, 0);
 124
 125         if (lru->cursor) {
 126                 list_add_tail(&le->list, lru->cursor);
 127         } else {
 128                 INIT_LIST_HEAD(&le->list);
 129                 lru->cursor = &le->list;
 130         }
 131         lru->count++;
 132 }
 133
 134 /*--------------*/
 135
 136 /*
 137  * Convert a list_head pointer to an lru_entry pointer.
 138  */
 139 static inline struct lru_entry *to_le(struct list_head *l)
 140 {
 141         return container_of(l, struct lru_entry, list);
 142 }
 143
 144 /*
 145  * Initialize an lru_iter and add it to the list of cursors in the lru.
 146  */
 147 static void lru_iter_begin(struct lru *lru, struct lru_iter *it)
 148 {
 149         it->lru = lru;
 150         it->stop = lru->cursor ? to_le(lru->cursor->prev) : NULL;
 151         it->e = lru->cursor ? to_le(lru->cursor) : NULL;
 152         list_add(&it->list, &lru->iterators);
 153 }
 154
/*
 * Unregister an lru_iter from its lru's list of iterators.  Must pair
 * with a previous lru_iter_begin() on the same iterator.
 */
static inline void lru_iter_end(struct lru_iter *it)
{
        list_del(&it->list);
}
 162
/*
 * Predicate function type to be used with lru_iter_next.  A true return
 * makes lru_iter_next() return the entry; false makes it move on.
 */
typedef bool (*iter_predicate)(struct lru_entry *le, void *context);
 165
 166 /*
 167  * Advance the cursor to the next entry that passes the
 168  * predicate, and return that entry.  Returns NULL if the
 169  * iteration is complete.
 170  */
 171 static struct lru_entry *lru_iter_next(struct lru_iter *it,
 172                                        iter_predicate pred, void *context)
 173 {
 174         struct lru_entry *e;
 175
 176         while (it->e) {
 177                 e = it->e;
 178
 179                 /* advance the cursor */
 180                 if (it->e == it->stop)
 181                         it->e = NULL;
 182                 else
 183                         it->e = to_le(it->e->list.next);
 184
 185                 if (pred(e, context))
 186                         return e;
 187         }
 188
 189         return NULL;
 190 }
 191
 192 /*
 193  * Invalidate a specific lru_entry and update all cursors in
 194  * the lru accordingly.
 195  */
 196 static void lru_iter_invalidate(struct lru *lru, struct lru_entry *e)
 197 {
 198         struct lru_iter *it;
 199
 200         list_for_each_entry(it, &lru->iterators, list) {
 201                 /* Move c->e forwards if necc. */
 202                 if (it->e == e) {
 203                         it->e = to_le(it->e->list.next);
 204                         if (it->e == e)
 205                                 it->e = NULL;
 206                 }
 207
 208                 /* Move it->stop backwards if necc. */
 209                 if (it->stop == e) {
 210                         it->stop = to_le(it->stop->list.prev);
 211                         if (it->stop == e)
 212                                 it->stop = NULL;
 213                 }
 214         }
 215 }
 216
 217 /*--------------*/
 218
 219 /*
 220  * Remove a specific entry from the lru.
 221  */
 222 static void lru_remove(struct lru *lru, struct lru_entry *le)
 223 {
 224         lru_iter_invalidate(lru, le);
 225         if (lru->count == 1) {
 226                 lru->cursor = NULL;
 227         } else {
 228                 if (lru->cursor == &le->list)
 229                         lru->cursor = lru->cursor->next;
 230                 list_del(&le->list);
 231         }
 232         lru->count--;
 233 }
 234
/*
 * Mark as referenced.  lru_evict() skips a referenced entry once,
 * clearing the bit as it passes.
 */
static inline void lru_reference(struct lru_entry *le)
{
        atomic_set(&le->referenced, 1);
}
 242
 243 /*--------------*/
 244
/*
 * Verdict returned by the predicate passed to lru_evict(), which removes
 * the least recently used entry (approx) that passes the predicate and
 * returns NULL on failure.
 */
enum evict_result {
        ER_EVICT,       /* remove this entry and return it */
        ER_DONT_EVICT,  /* leave this entry alone and keep searching */
        ER_STOP, /* stop looking for something to evict */
};

/* Predicate type consumed by lru_evict(). */
typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context);
 256
 257 static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context)
 258 {
 259         unsigned long tested = 0;
 260         struct list_head *h = lru->cursor;
 261         struct lru_entry *le;
 262
 263         if (!h)
 264                 return NULL;
 265         /*
 266          * In the worst case we have to loop around twice. Once to clear
 267          * the reference flags, and then again to discover the predicate
 268          * fails for all entries.
 269          */
 270         while (tested < lru->count) {
 271                 le = container_of(h, struct lru_entry, list);
 272
 273                 if (atomic_read(&le->referenced)) {
 274                         atomic_set(&le->referenced, 0);
 275                 } else {
 276                         tested++;
 277                         switch (pred(le, context)) {
 278                         case ER_EVICT:
 279                                 /*
 280                                  * Adjust the cursor, so we start the next
 281                                  * search from here.
 282                                  */
 283                                 lru->cursor = le->list.next;
 284                                 lru_remove(lru, le);
 285                                 return le;
 286
 287                         case ER_DONT_EVICT:
 288                                 break;
 289
 290                         case ER_STOP:
 291                                 lru->cursor = le->list.next;
 292                                 return NULL;
 293                         }
 294                 }
 295
 296                 h = h->next;
 297
 298                 cond_resched();
 299         }
 300
 301         return NULL;
 302 }
 303
 304 /*--------------------------------------------------------------*/
 305
/*
 * Buffer state bits.
 * NOTE(review): presumably bit numbers within dm_buffer->state; the code
 * that sets and tests them is outside this part of the file.
 */
#define B_READING       0
#define B_WRITING       1
#define B_DIRTY         2
 312
/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 *
 * Stored in dm_buffer->data_mode.  DATA_MODE_LIMIT is the number of
 * real modes, not a mode itself.
 */
enum data_mode {
        DATA_MODE_SLAB = 0,
        DATA_MODE_GET_FREE_PAGES = 1,
        DATA_MODE_VMALLOC = 2,
        DATA_MODE_LIMIT = 3
};
 324
/*
 * One cached block.  Buffers are indexed by 'block' in one of the
 * rb-trees of a dm_buffer_cache and linked onto one of its lrus through
 * the embedded 'lru' entry.
 */
struct dm_buffer {
        /* protected by the locks in dm_buffer_cache */
        struct rb_node node;

        /* immutable, so don't need protecting */
        sector_t block;
        void *data;
        unsigned char data_mode;                /* DATA_MODE_* */

        /*
         * These two fields are used in isolation, so do not need
         * a surrounding lock.
         */
        atomic_t hold_count;            /* number of current holders */
        unsigned long last_accessed;    /* jiffies, see __cache_inc_buffer() */

        /*
         * Everything else is protected by the mutex in
         * dm_bufio_client
         */
        unsigned long state;
        struct lru_entry lru;
        unsigned char list_mode;                /* LIST_* */
        blk_status_t read_error;
        blk_status_t write_error;
        unsigned int dirty_start;
        unsigned int dirty_end;
        unsigned int write_start;
        unsigned int write_end;
        struct list_head write_list;
        struct dm_bufio_client *c;
        void (*end_io)(struct dm_buffer *b, blk_status_t bs);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
#define MAX_STACK 10
        unsigned int stack_len;
        unsigned long stack_entries[MAX_STACK];
#endif
};
 363
 364 /*--------------------------------------------------------------*/
 365
 366 /*
 367  * The buffer cache manages buffers, particularly:
 368  *  - inc/dec of holder count
 369  *  - setting the last_accessed field
 370  *  - maintains clean/dirty state along with lru
 371  *  - selecting buffers that match predicates
 372  *
 373  * It does *not* handle:
 374  *  - allocation/freeing of buffers.
 375  *  - IO
 376  *  - Eviction or cache sizing.
 377  *
 378  * cache_get() and cache_put() are threadsafe, you do not need to
 379  * protect these calls with a surrounding mutex.  All the other
 380  * methods are not threadsafe; they do use locking primitives, but
 381  * only enough to ensure get/put are threadsafe.
 382  */
 383
/*
 * One shard of the buffer index: an rb-tree of dm_buffers keyed by block
 * number, together with the rw_semaphore that guards it.
 */
struct buffer_tree {
        struct rw_semaphore lock;
        struct rb_root root;
} ____cacheline_aligned_in_smp;
 388
/*
 * The buffer cache proper: one lru per list mode (LIST_CLEAN and
 * LIST_DIRTY) and a set of rb-trees indexing the buffers by block.
 */
struct dm_buffer_cache {
        struct lru lru[LIST_SIZE];
        /*
         * We spread entries across multiple trees to reduce contention
         * on the locks.
         */
        unsigned int num_locks;         /* number of elements in trees[] */
        struct buffer_tree trees[];
};
 398
 399 static inline unsigned int cache_index(sector_t block, unsigned int num_locks)
 400 {
 401         return dm_hash_locks_index(block, num_locks);
 402 }
 403
 404 static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block)
 405 {
 406         down_read(&bc->trees[cache_index(block, bc->num_locks)].lock);
 407 }
 408
 409 static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block)
 410 {
 411         up_read(&bc->trees[cache_index(block, bc->num_locks)].lock);
 412 }
 413
 414 static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block)
 415 {
 416         down_write(&bc->trees[cache_index(block, bc->num_locks)].lock);
 417 }
 418
 419 static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block)
 420 {
 421         up_write(&bc->trees[cache_index(block, bc->num_locks)].lock);
 422 }
 423
/*
 * Sometimes we want to repeatedly get and drop locks as part of an iteration.
 * This struct helps avoid redundant drop and gets of the same lock.
 */
struct lock_history {
        struct dm_buffer_cache *cache;
        /* true: take the tree locks for writing; false: for reading */
        bool write;
        /* index of the tree lock currently held, or no_previous if none */
        unsigned int previous;
        /* sentinel meaning "nothing held"; set to cache->num_locks by lh_init() */
        unsigned int no_previous;
};
 434
 435 static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool write)
 436 {
 437         lh->cache = cache;
 438         lh->write = write;
 439         lh->no_previous = cache->num_locks;
 440         lh->previous = lh->no_previous;
 441 }
 442
 443 static void __lh_lock(struct lock_history *lh, unsigned int index)
 444 {
 445         if (lh->write)
 446                 down_write(&lh->cache->trees[index].lock);
 447         else
 448                 down_read(&lh->cache->trees[index].lock);
 449 }
 450
 451 static void __lh_unlock(struct lock_history *lh, unsigned int index)
 452 {
 453         if (lh->write)
 454                 up_write(&lh->cache->trees[index].lock);
 455         else
 456                 up_read(&lh->cache->trees[index].lock);
 457 }
 458
 459 /*
 460  * Make sure you call this since it will unlock the final lock.
 461  */
 462 static void lh_exit(struct lock_history *lh)
 463 {
 464         if (lh->previous != lh->no_previous) {
 465                 __lh_unlock(lh, lh->previous);
 466                 lh->previous = lh->no_previous;
 467         }
 468 }
 469
 470 /*
 471  * Named 'next' because there is no corresponding
 472  * 'up/unlock' call since it's done automatically.
 473  */
 474 static void lh_next(struct lock_history *lh, sector_t b)
 475 {
 476         unsigned int index = cache_index(b, lh->no_previous); /* no_previous is num_locks */
 477
 478         if (lh->previous != lh->no_previous) {
 479                 if (lh->previous != index) {
 480                         __lh_unlock(lh, lh->previous);
 481                         __lh_lock(lh, index);
 482                         lh->previous = index;
 483                 }
 484         } else {
 485                 __lh_lock(lh, index);
 486                 lh->previous = index;
 487         }
 488 }
 489
/*
 * Map an lru_entry embedded in a dm_buffer (its 'lru' member) back to
 * that dm_buffer.
 */
static inline struct dm_buffer *le_to_buffer(struct lru_entry *le)
{
        return container_of(le, struct dm_buffer, lru);
}
 494
 495 static struct dm_buffer *list_to_buffer(struct list_head *l)
 496 {
 497         struct lru_entry *le = list_entry(l, struct lru_entry, list);
 498
 499         if (!le)
 500                 return NULL;
 501
 502         return le_to_buffer(le);
 503 }
 504
 505 static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks)
 506 {
 507         unsigned int i;
 508
 509         bc->num_locks = num_locks;
 510
 511         for (i = 0; i < bc->num_locks; i++) {
 512                 init_rwsem(&bc->trees[i].lock);
 513                 bc->trees[i].root = RB_ROOT;
 514         }
 515
 516         lru_init(&bc->lru[LIST_CLEAN]);
 517         lru_init(&bc->lru[LIST_DIRTY]);
 518 }
 519
 520 static void cache_destroy(struct dm_buffer_cache *bc)
 521 {
 522         unsigned int i;
 523
 524         for (i = 0; i < bc->num_locks; i++)
 525                 WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc->trees[i].root));
 526
 527         lru_destroy(&bc->lru[LIST_CLEAN]);
 528         lru_destroy(&bc->lru[LIST_DIRTY]);
 529 }
 530
 531 /*--------------*/
 532
 533 /*
 534  * not threadsafe, or racey depending how you look at it
 535  */
 536 static inline unsigned long cache_count(struct dm_buffer_cache *bc, int list_mode)
 537 {
 538         return bc->lru[list_mode].count;
 539 }
 540
 541 static inline unsigned long cache_total(struct dm_buffer_cache *bc)
 542 {
 543         return cache_count(bc, LIST_CLEAN) + cache_count(bc, LIST_DIRTY);
 544 }
 545
 546 /*--------------*/
 547
 548 /*
 549  * Gets a specific buffer, indexed by block.
 550  * If the buffer is found then its holder count will be incremented and
 551  * lru_reference will be called.
 552  *
 553  * threadsafe
 554  */
 555 static struct dm_buffer *__cache_get(const struct rb_root *root, sector_t block)
 556 {
 557         struct rb_node *n = root->rb_node;
 558         struct dm_buffer *b;
 559
 560         while (n) {
 561                 b = container_of(n, struct dm_buffer, node);
 562
 563                 if (b->block == block)
 564                         return b;
 565
 566                 n = block < b->block ? n->rb_left : n->rb_right;
 567         }
 568
 569         return NULL;
 570 }
 571
/*
 * Take a hold on 'b' and stamp it with the current jiffies as its last
 * access time.
 */
static void __cache_inc_buffer(struct dm_buffer *b)
{
        atomic_inc(&b->hold_count);
        WRITE_ONCE(b->last_accessed, jiffies);
}
 577
 578 static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, sector_t block)
 579 {
 580         struct dm_buffer *b;
 581
 582         cache_read_lock(bc, block);
 583         b = __cache_get(&bc->trees[cache_index(block, bc->num_locks)].root, block);
 584         if (b) {
 585                 lru_reference(&b->lru);
 586                 __cache_inc_buffer(b);
 587         }
 588         cache_read_unlock(bc, block);
 589
 590         return b;
 591 }
 592
 593 /*--------------*/
 594
 595 /*
 596  * Returns true if the hold count hits zero.
 597  * threadsafe
 598  */
 599 static bool cache_put(struct dm_buffer_cache *bc, struct dm_buffer *b)
 600 {
 601         bool r;
 602
 603         cache_read_lock(bc, b->block);
 604         BUG_ON(!atomic_read(&b->hold_count));
 605         r = atomic_dec_and_test(&b->hold_count);
 606         cache_read_unlock(bc, b->block);
 607
 608         return r;
 609 }
 610
 611 /*--------------*/
 612
/* Buffer-level predicate: decides what should happen to buffer 'b'. */
typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *);

/*
 * Eviction picks a buffer based on a predicate.  The oldest buffer that
 * matches the predicate will be selected.  In addition to the
 * predicate the hold_count of the selected buffer will be zero.
 *
 * evict_wrapper is the context handed to __evict_pred(): it carries the
 * lock history plus the caller's buffer predicate and its context.
 */
struct evict_wrapper {
        struct lock_history *lh;
        b_predicate pred;
        void *context;
};
 625
 626 /*
 627  * Wraps the buffer predicate turning it into an lru predicate.  Adds
 628  * extra test for hold_count.
 629  */
 630 static enum evict_result __evict_pred(struct lru_entry *le, void *context)
 631 {
 632         struct evict_wrapper *w = context;
 633         struct dm_buffer *b = le_to_buffer(le);
 634
 635         lh_next(w->lh, b->block);
 636
 637         if (atomic_read(&b->hold_count))
 638                 return ER_DONT_EVICT;
 639
 640         return w->pred(b, w->context);
 641 }
 642
 643 static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode,
 644                                        b_predicate pred, void *context,
 645                                        struct lock_history *lh)
 646 {
 647         struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
 648         struct lru_entry *le;
 649         struct dm_buffer *b;
 650
 651         le = lru_evict(&bc->lru[list_mode], __evict_pred, &w);
 652         if (!le)
 653                 return NULL;
 654
 655         b = le_to_buffer(le);
 656         /* __evict_pred will have locked the appropriate tree. */
 657         rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);
 658
 659         return b;
 660 }
 661
 662 static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode,
 663                                      b_predicate pred, void *context)
 664 {
 665         struct dm_buffer *b;
 666         struct lock_history lh;
 667
 668         lh_init(&lh, bc, true);
 669         b = __cache_evict(bc, list_mode, pred, context, &lh);
 670         lh_exit(&lh);
 671
 672         return b;
 673 }
 674
 675 /*--------------*/
 676
 677 /*
 678  * Mark a buffer as clean or dirty. Not threadsafe.
 679  */
 680 static void cache_mark(struct dm_buffer_cache *bc, struct dm_buffer *b, int list_mode)
 681 {
 682         cache_write_lock(bc, b->block);
 683         if (list_mode != b->list_mode) {
 684                 lru_remove(&bc->lru[b->list_mode], &b->lru);
 685                 b->list_mode = list_mode;
 686                 lru_insert(&bc->lru[b->list_mode], &b->lru);
 687         }
 688         cache_write_unlock(bc, b->block);
 689 }
 690
 691 /*--------------*/
 692
 693 /*
 694  * Runs through the lru associated with 'old_mode', if the predicate matches then
 695  * it moves them to 'new_mode'.  Not threadsafe.
 696  */
 697 static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
 698                               b_predicate pred, void *context, struct lock_history *lh)
 699 {
 700         struct lru_entry *le;
 701         struct dm_buffer *b;
 702         struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
 703
 704         while (true) {
 705                 le = lru_evict(&bc->lru[old_mode], __evict_pred, &w);
 706                 if (!le)
 707                         break;
 708
 709                 b = le_to_buffer(le);
 710                 b->list_mode = new_mode;
 711                 lru_insert(&bc->lru[b->list_mode], &b->lru);
 712         }
 713 }
 714
/*
 * Locked wrapper around __cache_mark_many(): tree locks are taken for
 * writing through a private lock history and released before returning.
 */
static void cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
                            b_predicate pred, void *context)
{
        struct lock_history lh;

        lh_init(&lh, bc, true);
        __cache_mark_many(bc, old_mode, new_mode, pred, context, &lh);
        lh_exit(&lh);
}
 724
 725 /*--------------*/
 726
/*
 * Cache iteration runs through all clean or dirty entries calling a
 * function for each entry.  The callback may terminate the iteration
 * early.  Not threadsafe.
 */

/*
 * Iterator functions should return one of these actions to indicate
 * how the iteration should proceed.
 */
enum it_action {
        IT_NEXT,        /* carry on with the next entry */
        IT_COMPLETE,    /* stop iterating now */
};

/* Per-buffer callback type used by the cache iteration functions. */
typedef enum it_action (*iter_fn)(struct dm_buffer *b, void *context);
 742
 743 static void __cache_iterate(struct dm_buffer_cache *bc, int list_mode,
 744                             iter_fn fn, void *context, struct lock_history *lh)
 745 {
 746         struct lru *lru = &bc->lru[list_mode];
 747         struct lru_entry *le, *first;
 748
 749         if (!lru->cursor)
 750                 return;
 751
 752         first = le = to_le(lru->cursor);
 753         do {
 754                 struct dm_buffer *b = le_to_buffer(le);
 755
 756                 lh_next(lh, b->block);
 757
 758                 switch (fn(b, context)) {
 759                 case IT_NEXT:
 760                         break;
 761
 762                 case IT_COMPLETE:
 763                         return;
 764                 }
 765                 cond_resched();
 766
 767                 le = to_le(le->list.next);
 768         } while (le != first);
 769 }
 770
/*
 * Locked wrapper around __cache_iterate(): tree locks are taken for
 * reading through a private lock history and released before returning.
 */
static void cache_iterate(struct dm_buffer_cache *bc, int list_mode,
                          iter_fn fn, void *context)
{
        struct lock_history lh;

        lh_init(&lh, bc, false);
        __cache_iterate(bc, list_mode, fn, context, &lh);
        lh_exit(&lh);
}
 780
 781 /*--------------*/
 782
 783 /*
 784  * Passes ownership of the buffer to the cache. Returns false if the
 785  * buffer was already present (in which case ownership does not pass).
 786  * eg, a race with another thread.
 787  *
 788  * Holder count should be 1 on insertion.
 789  *
 790  * Not threadsafe.
 791  */
 792 static bool __cache_insert(struct rb_root *root, struct dm_buffer *b)
 793 {
 794         struct rb_node **new = &root->rb_node, *parent = NULL;
 795         struct dm_buffer *found;
 796
 797         while (*new) {
 798                 found = container_of(*new, struct dm_buffer, node);
 799
 800                 if (found->block == b->block)
 801                         return false;
 802
 803                 parent = *new;
 804                 new = b->block < found->block ?
 805                         &found->node.rb_left : &found->node.rb_right;
 806         }
 807
 808         rb_link_node(&b->node, parent, new);
 809         rb_insert_color(&b->node, root);
 810
 811         return true;
 812 }
 813
 814 static bool cache_insert(struct dm_buffer_cache *bc, struct dm_buffer *b)
 815 {
 816         bool r;
 817
 818         if (WARN_ON_ONCE(b->list_mode >= LIST_SIZE))
 819                 return false;
 820
 821         cache_write_lock(bc, b->block);
 822         BUG_ON(atomic_read(&b->hold_count) != 1);
 823         r = __cache_insert(&bc->trees[cache_index(b->block, bc->num_locks)].root, b);
 824         if (r)
 825                 lru_insert(&bc->lru[b->list_mode], &b->lru);
 826         cache_write_unlock(bc, b->block);
 827
 828         return r;
 829 }
 830
 831 /*--------------*/
 832
 833 /*
 834  * Removes buffer from cache, ownership of the buffer passes back to the caller.
 835  * Fails if the hold_count is not one (ie. the caller holds the only reference).
 836  *
 837  * Not threadsafe.
 838  */
 839 static bool cache_remove(struct dm_buffer_cache *bc, struct dm_buffer *b)
 840 {
 841         bool r;
 842
 843         cache_write_lock(bc, b->block);
 844
 845         if (atomic_read(&b->hold_count) != 1) {
 846                 r = false;
 847         } else {
 848                 r = true;
 849                 rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);
 850                 lru_remove(&bc->lru[b->list_mode], &b->lru);
 851         }
 852
 853         cache_write_unlock(bc, b->block);
 854
 855         return r;
 856 }
 857
 858 /*--------------*/
 859
/*
 * Callback invoked by the range removal code on each buffer it has
 * taken out of the cache.
 */
typedef void (*b_release)(struct dm_buffer *);
 861
 862 static struct dm_buffer *__find_next(struct rb_root *root, sector_t block)
 863 {
 864         struct rb_node *n = root->rb_node;
 865         struct dm_buffer *b;
 866         struct dm_buffer *best = NULL;
 867
 868         while (n) {
 869                 b = container_of(n, struct dm_buffer, node);
 870
 871                 if (b->block == block)
 872                         return b;
 873
 874                 if (block <= b->block) {
 875                         n = n->rb_left;
 876                         best = b;
 877                 } else {
 878                         n = n->rb_right;
 879                 }
 880         }
 881
 882         return best;
 883 }
 884
 885 static void __remove_range(struct dm_buffer_cache *bc,
 886                            struct rb_root *root,
 887                            sector_t begin, sector_t end,
 888                            b_predicate pred, b_release release)
 889 {
 890         struct dm_buffer *b;
 891
 892         while (true) {
 893                 cond_resched();
 894
 895                 b = __find_next(root, begin);
 896                 if (!b || (b->block >= end))
 897                         break;
 898
 899                 begin = b->block + 1;
 900
 901                 if (atomic_read(&b->hold_count))
 902                         continue;
 903
 904                 if (pred(b, NULL) == ER_EVICT) {
 905                         rb_erase(&b->node, root);
 906                         lru_remove(&bc->lru[b->list_mode], &b->lru);
 907                         release(b);
 908                 }
 909         }
 910 }
 911
 912 static void cache_remove_range(struct dm_buffer_cache *bc,
 913                                sector_t begin, sector_t end,
 914                                b_predicate pred, b_release release)
 915 {
 916         unsigned int i;
 917
 918         for (i = 0; i < bc->num_locks; i++) {
 919                 down_write(&bc->trees[i].lock);
 920                 __remove_range(bc, &bc->trees[i].root, begin, end, pred, release);
 921                 up_write(&bc->trees[i].lock);
 922         }
 923 }
 924
 925 /*----------------------------------------------------------------*/
 926
/*
 * Linking of buffers:
 *      All buffers are linked to buffer_cache with their node field.
 *
 *      Clean buffers that are not being written (B_WRITING not set)
 *      are linked to lru[LIST_CLEAN] with their lru field.
 *
 *      Dirty and clean buffers that are being written are linked to
 *      lru[LIST_DIRTY] with their lru field. When the write
 *      finishes, the buffer cannot be relinked immediately (because we
 *      are in an interrupt context and relinking requires process
 *      context), so some clean-not-writing buffers can be held on
 *      lru[LIST_DIRTY] too.  They are later moved to lru[LIST_CLEAN]
 *      in the process context.
 */
struct dm_bufio_client {
        struct block_device *bdev;
        unsigned int block_size;
        s8 sectors_per_block_bits;

        /* selects which of the two locks below dm_bufio_lock() takes */
        bool no_sleep;
        struct mutex lock;
        spinlock_t spinlock;

        int async_write_error;

        void (*alloc_callback)(struct dm_buffer *buf);
        void (*write_callback)(struct dm_buffer *buf);
        struct kmem_cache *slab_buffer;
        struct kmem_cache *slab_cache;
        struct dm_io_client *dm_io;

        struct list_head reserved_buffers;
        unsigned int need_reserved_buffers;

        unsigned int minimum_buffers;

        sector_t start;

        struct shrinker shrinker;
        struct work_struct shrink_work;
        atomic_long_t need_shrink;

        wait_queue_head_t free_buffer_wait;

        struct list_head client_list;

        /*
         * Used by global_cleanup to sort the clients list.
         */
        unsigned long oldest_buffer;

        /* ends in a flexible array (trees[]), hence the placement */
        struct dm_buffer_cache cache; /* must be last member */
};
 981
/*
 * Static branch key, initially false.  dm_bufio_lock()/dm_bufio_unlock()
 * only consider a client's spinlock once this key is enabled; where it
 * gets enabled is not visible in this part of the file.
 */
static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);
 983
 984 /*----------------------------------------------------------------*/
 985
/*
 * 1 if the current task has a non-NULL bio_list, 0 otherwise.  Used as
 * the lock subclass passed to mutex_lock_nested() in dm_bufio_lock().
 */
#define dm_bufio_in_request()   (!!current->bio_list)
 987
 988 static void dm_bufio_lock(struct dm_bufio_client *c)
 989 {
 990         if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
 991                 spin_lock_bh(&c->spinlock);
 992         else
 993                 mutex_lock_nested(&c->lock, dm_bufio_in_request());
 994 }
 995
 996 static void dm_bufio_unlock(struct dm_bufio_client *c)
 997 {
 998         if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
 999                 spin_unlock_bh(&c->spinlock);
1000         else
1001                 mutex_unlock(&c->lock);
1002 }
1003
1004 /*----------------------------------------------------------------*/
1005
1006 /*
1007  * Default cache size: available memory divided by the ratio.
1008  * Used by __cache_size_refresh() when dm_bufio_cache_size is 0.
1009  */
1009 static unsigned long dm_bufio_default_cache_size;
1010
1011 /*
1012  * Total cache size in bytes set by the user; 0 selects the default.
1013  */
1014 static unsigned long dm_bufio_cache_size;
1015
1016 /*
1017  * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
1018  * at any time.  If it disagrees, the user has changed cache size.
1019  */
1020 static unsigned long dm_bufio_cache_size_latch;
1021
1022 static DEFINE_SPINLOCK(global_spinlock);	/* held by adjust_total_allocated() while it updates the allocation counters below */
1023
1024 /*
1025  * Buffers are freed after this timeout (seconds)
1026  */
1027 static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
1028 static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
1029
1030 static unsigned long dm_bufio_peak_allocated;	/* highest value dm_bufio_current_allocated has reached */
1031 static unsigned long dm_bufio_allocated_kmem_cache;	/* per-data_mode byte counter, see class_ptr[] in adjust_total_allocated() */
1032 static unsigned long dm_bufio_allocated_get_free_pages;	/* per-data_mode byte counter */
1033 static unsigned long dm_bufio_allocated_vmalloc;	/* per-data_mode byte counter */
1034 static unsigned long dm_bufio_current_allocated;	/* bytes of buffer data currently allocated, all modes */
1035
1036 /*----------------------------------------------------------------*/
1037
1038 /*
1039  * The current number of clients.
1040  */
1041 static int dm_bufio_client_count;
1042
1043 /*
1044  * The list of all clients.
1045  */
1046 static LIST_HEAD(dm_bufio_all_clients);
1047
1048 /*
1049  * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
1050  */
1051 static DEFINE_MUTEX(dm_bufio_clients_lock);
1052
1053 static struct workqueue_struct *dm_bufio_wq;	/* workqueue that dm_bufio_replacement_work is queued on */
1054 static struct delayed_work dm_bufio_cleanup_old_work;
1055 static struct work_struct dm_bufio_replacement_work;	/* queued when an allocation pushes usage above dm_bufio_cache_size */
1056
1057
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
/*
 * Save the current call stack into @b, skipping the two innermost frames.
 */
static void buffer_record_stack(struct dm_buffer *b)
{
	const unsigned int skip_frames = 2;

	b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK,
					skip_frames);
}
#endif
1064
1065 /*----------------------------------------------------------------*/
1066
1067 static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
1068 {
1069         unsigned char data_mode;
1070         long diff;
1071
1072         static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
1073                 &dm_bufio_allocated_kmem_cache,
1074                 &dm_bufio_allocated_get_free_pages,
1075                 &dm_bufio_allocated_vmalloc,
1076         };
1077
1078         data_mode = b->data_mode;
1079         diff = (long)b->c->block_size;
1080         if (unlink)
1081                 diff = -diff;
1082
1083         spin_lock(&global_spinlock);
1084
1085         *class_ptr[data_mode] += diff;
1086
1087         dm_bufio_current_allocated += diff;
1088
1089         if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
1090                 dm_bufio_peak_allocated = dm_bufio_current_allocated;
1091
1092         if (!unlink) {
1093                 if (dm_bufio_current_allocated > dm_bufio_cache_size)
1094                         queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
1095         }
1096
1097         spin_unlock(&global_spinlock);
1098 }
1099
1100 /*
1101  * Change the number of clients and recalculate per-client limit.
1102  */
1103 static void __cache_size_refresh(void)
1104 {
1105         if (WARN_ON(!mutex_is_locked(&dm_bufio_clients_lock)))
1106                 return;
1107         if (WARN_ON(dm_bufio_client_count < 0))
1108                 return;
1109
1110         dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
1111
1112         /*
1113          * Use default if set to 0 and report the actual cache size used.
1114          */
1115         if (!dm_bufio_cache_size_latch) {
1116                 (void)cmpxchg(&dm_bufio_cache_size, 0,
1117                               dm_bufio_default_cache_size);
1118                 dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
1119         }
1120 }
1121
1122 /*
1123  * Allocating buffer data.
1124  *
1125  * Small buffers are allocated with kmem_cache, to use space optimally.
1126  *
1127  * For large buffers, we choose between get_free_pages and vmalloc.
1128  * Each has advantages and disadvantages.
1129  *
1130  * __get_free_pages can randomly fail if the memory is fragmented.
1131  * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
1132  * as low as 128M) so using it for caching is not appropriate.
1133  *
1134  * If the allocation may fail we use __get_free_pages. Memory fragmentation
1135  * won't have a fatal effect here, but it just causes flushes of some other
1136  * buffers and more I/O will be performed. Don't use __get_free_pages if it
1137  * always fails (i.e. order > MAX_ORDER).
1138  *
1139  * If the allocation shouldn't fail we use __vmalloc. This is only for the
1140  * initial reserve allocation, so there's no risk of wasting all vmalloc
1141  * space.
1142  */
1143 static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
1144                                unsigned char *data_mode)
1145 {
1146         if (unlikely(c->slab_cache != NULL)) {
1147                 *data_mode = DATA_MODE_SLAB;
1148                 return kmem_cache_alloc(c->slab_cache, gfp_mask);
1149         }
1150
1151         if (c->block_size <= KMALLOC_MAX_SIZE &&
1152             gfp_mask & __GFP_NORETRY) {
1153                 *data_mode = DATA_MODE_GET_FREE_PAGES;
1154                 return (void *)__get_free_pages(gfp_mask,
1155                                                 c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
1156         }
1157
1158         *data_mode = DATA_MODE_VMALLOC;
1159
1160         return __vmalloc(c->block_size, gfp_mask);
1161 }
1162
1163 /*
1164  * Free buffer's data.
1165  */
1166 static void free_buffer_data(struct dm_bufio_client *c,
1167                              void *data, unsigned char data_mode)
1168 {
1169         switch (data_mode) {
1170         case DATA_MODE_SLAB:
1171                 kmem_cache_free(c->slab_cache, data);
1172                 break;
1173
1174         case DATA_MODE_GET_FREE_PAGES:
1175                 free_pages((unsigned long)data,
1176                            c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
1177                 break;
1178
1179         case DATA_MODE_VMALLOC:
1180                 vfree(data);
1181                 break;
1182
1183         default:
1184                 DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
1185                        data_mode);
1186                 BUG();
1187         }
1188 }
1189
1190 /*
1191  * Allocate buffer and its data.
1192  */
1193 static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
1194 {
1195         struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
1196
1197         if (!b)
1198                 return NULL;
1199
1200         b->c = c;
1201
1202         b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
1203         if (!b->data) {
1204                 kmem_cache_free(c->slab_buffer, b);
1205                 return NULL;
1206         }
1207         adjust_total_allocated(b, false);
1208
1209 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1210         b->stack_len = 0;
1211 #endif
1212         return b;
1213 }
1214
1215 /*
1216  * Free buffer and its data.
1217  */
1218 static void free_buffer(struct dm_buffer *b)
1219 {
1220         struct dm_bufio_client *c = b->c;
1221
1222         adjust_total_allocated(b, true);
1223         free_buffer_data(c, b->data, b->data_mode);
1224         kmem_cache_free(c->slab_buffer, b);
1225 }
1226
1227 /*
1228  *--------------------------------------------------------------------------
1229  * Submit I/O on the buffer.
1230  *
1231  * Bio interface is faster but it has some problems:
1232  *      the vector list is limited (increasing this limit increases
1233  *      memory-consumption per buffer, so it is not viable);
1234  *
1235  *      the memory must be direct-mapped, not vmalloced;
1236  *
1237  * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
1238  * it is not vmalloced, try using the bio interface.
1239  *
1240  * If the buffer is big, if it is vmalloced or if the underlying device
1241  * rejects the bio because it is too large, use dm-io layer to do the I/O.
1242  * The dm-io layer splits the I/O into multiple requests, avoiding the above
1243  * shortcomings.
1244  *--------------------------------------------------------------------------
1245  */
1246
1247 /*
1248  * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
1249  * that the request was handled directly with bio interface.
1250  */
1251 static void dmio_complete(unsigned long error, void *context)
1252 {
1253         struct dm_buffer *b = context;
1254
1255         b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
1256 }
1257
1258 static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector,
1259                      unsigned int n_sectors, unsigned int offset)
1260 {
1261         int r;
1262         struct dm_io_request io_req = {
1263                 .bi_opf = op,
1264                 .notify.fn = dmio_complete,
1265                 .notify.context = b,
1266                 .client = b->c->dm_io,
1267         };
1268         struct dm_io_region region = {
1269                 .bdev = b->c->bdev,
1270                 .sector = sector,
1271                 .count = n_sectors,
1272         };
1273
1274         if (b->data_mode != DATA_MODE_VMALLOC) {
1275                 io_req.mem.type = DM_IO_KMEM;
1276                 io_req.mem.ptr.addr = (char *)b->data + offset;
1277         } else {
1278                 io_req.mem.type = DM_IO_VMA;
1279                 io_req.mem.ptr.vma = (char *)b->data + offset;
1280         }
1281
1282         r = dm_io(&io_req, 1, &region, NULL);
1283         if (unlikely(r))
1284                 b->end_io(b, errno_to_blk_status(r));
1285 }
1286
1287 static void bio_complete(struct bio *bio)
1288 {
1289         struct dm_buffer *b = bio->bi_private;
1290         blk_status_t status = bio->bi_status;
1291
1292         bio_uninit(bio);
1293         kfree(bio);
1294         b->end_io(b, status);
1295 }
1296
1297 static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
1298                     unsigned int n_sectors, unsigned int offset)
1299 {
1300         struct bio *bio;
1301         char *ptr;
1302         unsigned int len;
1303
1304         bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
1305         if (!bio) {
1306                 use_dmio(b, op, sector, n_sectors, offset);
1307                 return;
1308         }
1309         bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op);
1310         bio->bi_iter.bi_sector = sector;
1311         bio->bi_end_io = bio_complete;
1312         bio->bi_private = b;
1313
1314         ptr = (char *)b->data + offset;
1315         len = n_sectors << SECTOR_SHIFT;
1316
1317         __bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
1318
1319         submit_bio(bio);
1320 }
1321
1322 static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
1323 {
1324         sector_t sector;
1325
1326         if (likely(c->sectors_per_block_bits >= 0))
1327                 sector = block << c->sectors_per_block_bits;
1328         else
1329                 sector = block * (c->block_size >> SECTOR_SHIFT);
1330         sector += c->start;
1331
1332         return sector;
1333 }
1334
1335 static void submit_io(struct dm_buffer *b, enum req_op op,
1336                       void (*end_io)(struct dm_buffer *, blk_status_t))
1337 {
1338         unsigned int n_sectors;
1339         sector_t sector;
1340         unsigned int offset, end;
1341
1342         b->end_io = end_io;
1343
1344         sector = block_to_sector(b->c, b->block);
1345
1346         if (op != REQ_OP_WRITE) {
1347                 n_sectors = b->c->block_size >> SECTOR_SHIFT;
1348                 offset = 0;
1349         } else {
1350                 if (b->c->write_callback)
1351                         b->c->write_callback(b);
1352                 offset = b->write_start;
1353                 end = b->write_end;
1354                 offset &= -DM_BUFIO_WRITE_ALIGN;
1355                 end += DM_BUFIO_WRITE_ALIGN - 1;
1356                 end &= -DM_BUFIO_WRITE_ALIGN;
1357                 if (unlikely(end > b->c->block_size))
1358                         end = b->c->block_size;
1359
1360                 sector += offset >> SECTOR_SHIFT;
1361                 n_sectors = (end - offset) >> SECTOR_SHIFT;
1362         }
1363
1364         if (b->data_mode != DATA_MODE_VMALLOC)
1365                 use_bio(b, op, sector, n_sectors, offset);
1366         else
1367                 use_dmio(b, op, sector, n_sectors, offset);
1368 }
1369
1370 /*
1371  *--------------------------------------------------------------
1372  * Writing dirty buffers
1373  *--------------------------------------------------------------
1374  */
1375
1376 /*
1377  * The endio routine for write.
1378  *
1379  * Set the error, clear B_WRITING bit and wake anyone who was waiting on
1380  * it.
1381  */
1382 static void write_endio(struct dm_buffer *b, blk_status_t status)
1383 {
1384         b->write_error = status;
1385         if (unlikely(status)) {
1386                 struct dm_bufio_client *c = b->c;
1387
1388                 (void)cmpxchg(&c->async_write_error, 0,
1389                                 blk_status_to_errno(status));
1390         }
1391
1392         BUG_ON(!test_bit(B_WRITING, &b->state));
1393
1394         smp_mb__before_atomic();
1395         clear_bit(B_WRITING, &b->state);
1396         smp_mb__after_atomic();
1397
1398         wake_up_bit(&b->state, B_WRITING);
1399 }
1400
1401 /*
1402  * Initiate a write on a dirty buffer, but don't wait for it.
1403  *
1404  * - If the buffer is not dirty, exit.
1405  * - If there some previous write going on, wait for it to finish (we can't
1406  *   have two writes on the same buffer simultaneously).
1407  * - Submit our write and don't wait on it. We set B_WRITING indicating
1408  *   that there is a write in progress.
1409  */
1410 static void __write_dirty_buffer(struct dm_buffer *b,
1411                                  struct list_head *write_list)
1412 {
1413         if (!test_bit(B_DIRTY, &b->state))
1414                 return;
1415
1416         clear_bit(B_DIRTY, &b->state);
1417         wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
1418
1419         b->write_start = b->dirty_start;
1420         b->write_end = b->dirty_end;
1421
1422         if (!write_list)
1423                 submit_io(b, REQ_OP_WRITE, write_endio);
1424         else
1425                 list_add_tail(&b->write_list, write_list);
1426 }
1427
1428 static void __flush_write_list(struct list_head *write_list)
1429 {
1430         struct blk_plug plug;
1431
1432         blk_start_plug(&plug);
1433         while (!list_empty(write_list)) {
1434                 struct dm_buffer *b =
1435                         list_entry(write_list->next, struct dm_buffer, write_list);
1436                 list_del(&b->write_list);
1437                 submit_io(b, REQ_OP_WRITE, write_endio);
1438                 cond_resched();
1439         }
1440         blk_finish_plug(&plug);
1441 }
1442
1443 /*
1444  * Wait until any activity on the buffer finishes.  Possibly write the
1445  * buffer if it is dirty.  When this function finishes, there is no I/O
1446  * running on the buffer and the buffer is not dirty.
1447  */
1448 static void __make_buffer_clean(struct dm_buffer *b)
1449 {
1450         BUG_ON(atomic_read(&b->hold_count));
1451
1452         /* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */
1453         if (!smp_load_acquire(&b->state))       /* fast case */
1454                 return;
1455
1456         wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1457         __write_dirty_buffer(b, NULL);
1458         wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
1459 }
1460
1461 static enum evict_result is_clean(struct dm_buffer *b, void *context)
1462 {
1463         struct dm_bufio_client *c = context;
1464
1465         /* These should never happen */
1466         if (WARN_ON_ONCE(test_bit(B_WRITING, &b->state)))
1467                 return ER_DONT_EVICT;
1468         if (WARN_ON_ONCE(test_bit(B_DIRTY, &b->state)))
1469                 return ER_DONT_EVICT;
1470         if (WARN_ON_ONCE(b->list_mode != LIST_CLEAN))
1471                 return ER_DONT_EVICT;
1472
1473         if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep &&
1474             unlikely(test_bit(B_READING, &b->state)))
1475                 return ER_DONT_EVICT;
1476
1477         return ER_EVICT;
1478 }
1479
1480 static enum evict_result is_dirty(struct dm_buffer *b, void *context)
1481 {
1482         /* These should never happen */
1483         if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
1484                 return ER_DONT_EVICT;
1485         if (WARN_ON_ONCE(b->list_mode != LIST_DIRTY))
1486                 return ER_DONT_EVICT;
1487
1488         return ER_EVICT;
1489 }
1490
1491 /*
1492  * Find some buffer that is not held by anybody, clean it, unlink it and
1493  * return it.
1494  */
1495 static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
1496 {
1497         struct dm_buffer *b;
1498
1499         b = cache_evict(&c->cache, LIST_CLEAN, is_clean, c);
1500         if (b) {
1501                 /* this also waits for pending reads */
1502                 __make_buffer_clean(b);
1503                 return b;
1504         }
1505
1506         if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
1507                 return NULL;
1508
1509         b = cache_evict(&c->cache, LIST_DIRTY, is_dirty, NULL);
1510         if (b) {
1511                 __make_buffer_clean(b);
1512                 return b;
1513         }
1514
1515         return NULL;
1516 }
1517
1518 /*
1519  * Wait until some other threads free some buffer or release hold count on
1520  * some buffer.
1521  *
1522  * This function is entered with c->lock held, drops it and regains it
1523  * before exiting.
1524  */
1525 static void __wait_for_free_buffer(struct dm_bufio_client *c)
1526 {
1527         DECLARE_WAITQUEUE(wait, current);
1528
1529         add_wait_queue(&c->free_buffer_wait, &wait);
1530         set_current_state(TASK_UNINTERRUPTIBLE);
1531         dm_bufio_unlock(c);
1532
1533         /*
1534          * It's possible to miss a wake up event since we don't always
1535          * hold c->lock when wake_up is called.  So we have a timeout here,
1536          * just in case.
1537          */
1538         io_schedule_timeout(5 * HZ);
1539
1540         remove_wait_queue(&c->free_buffer_wait, &wait);
1541
1542         dm_bufio_lock(c);
1543 }
1544
/* How a buffer is being requested; see new_read() and __bufio_new(). */
enum new_flag {
	NF_FRESH,	/* 0: new buffer, contents are not read from disk */
	NF_READ,	/* 1: read the block if it is not cached */
	NF_GET,		/* 2: cache lookup only */
	NF_PREFETCH,	/* 3: start a read, never wait for a buffer */
};
1551
1552 /*
1553  * Allocate a new buffer. If the allocation is not possible, wait until
1554  * some other thread frees a buffer.
1555  *
1556  * May drop the lock and regain it.
1557  */
1558 static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
1559 {
1560         struct dm_buffer *b;
1561         bool tried_noio_alloc = false;
1562
1563         /*
1564          * dm-bufio is resistant to allocation failures (it just keeps
1565          * one buffer reserved in cases all the allocations fail).
1566          * So set flags to not try too hard:
1567          *      GFP_NOWAIT: don't wait; if we need to sleep we'll release our
1568          *                  mutex and wait ourselves.
1569          *      __GFP_NORETRY: don't retry and rather return failure
1570          *      __GFP_NOMEMALLOC: don't use emergency reserves
1571          *      __GFP_NOWARN: don't print a warning in case of failure
1572          *
1573          * For debugging, if we set the cache size to 1, no new buffers will
1574          * be allocated.
1575          */
1576         while (1) {
1577                 if (dm_bufio_cache_size_latch != 1) {
1578                         b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1579                         if (b)
1580                                 return b;
1581                 }
1582
1583                 if (nf == NF_PREFETCH)
1584                         return NULL;
1585
1586                 if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
1587                         dm_bufio_unlock(c);
1588                         b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1589                         dm_bufio_lock(c);
1590                         if (b)
1591                                 return b;
1592                         tried_noio_alloc = true;
1593                 }
1594
1595                 if (!list_empty(&c->reserved_buffers)) {
1596                         b = list_to_buffer(c->reserved_buffers.next);
1597                         list_del(&b->lru.list);
1598                         c->need_reserved_buffers++;
1599
1600                         return b;
1601                 }
1602
1603                 b = __get_unclaimed_buffer(c);
1604                 if (b)
1605                         return b;
1606
1607                 __wait_for_free_buffer(c);
1608         }
1609 }
1610
1611 static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
1612 {
1613         struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
1614
1615         if (!b)
1616                 return NULL;
1617
1618         if (c->alloc_callback)
1619                 c->alloc_callback(b);
1620
1621         return b;
1622 }
1623
1624 /*
1625  * Free a buffer and wake other threads waiting for free buffers.
1626  */
1627 static void __free_buffer_wake(struct dm_buffer *b)
1628 {
1629         struct dm_bufio_client *c = b->c;
1630
1631         b->block = -1;
1632         if (!c->need_reserved_buffers)
1633                 free_buffer(b);
1634         else {
1635                 list_add(&b->lru.list, &c->reserved_buffers);
1636                 c->need_reserved_buffers--;
1637         }
1638
1639         /*
1640          * We hold the bufio lock here, so no one can add entries to the
1641          * wait queue anyway.
1642          */
1643         if (unlikely(waitqueue_active(&c->free_buffer_wait)))
1644                 wake_up(&c->free_buffer_wait);
1645 }
1646
1647 static enum evict_result cleaned(struct dm_buffer *b, void *context)
1648 {
1649         if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
1650                 return ER_DONT_EVICT; /* should never happen */
1651
1652         if (test_bit(B_DIRTY, &b->state) || test_bit(B_WRITING, &b->state))
1653                 return ER_DONT_EVICT;
1654         else
1655                 return ER_EVICT;
1656 }
1657
1658 static void __move_clean_buffers(struct dm_bufio_client *c)
1659 {
1660         cache_mark_many(&c->cache, LIST_DIRTY, LIST_CLEAN, cleaned, NULL);
1661 }
1662
1663 struct write_context {	/* argument block handed to write_one() by __write_dirty_buffers_async() */
1664         int no_wait;	/* non-zero: stop iterating at the first buffer that is already being written */
1665         struct list_head *write_list;	/* passed on to __write_dirty_buffer(); NULL submits writes immediately */
1666 };
1667
1668 static enum it_action write_one(struct dm_buffer *b, void *context)
1669 {
1670         struct write_context *wc = context;
1671
1672         if (wc->no_wait && test_bit(B_WRITING, &b->state))
1673                 return IT_COMPLETE;
1674
1675         __write_dirty_buffer(b, wc->write_list);
1676         return IT_NEXT;
1677 }
1678
1679 static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
1680                                         struct list_head *write_list)
1681 {
1682         struct write_context wc = {.no_wait = no_wait, .write_list = write_list};
1683
1684         __move_clean_buffers(c);
1685         cache_iterate(&c->cache, LIST_DIRTY, write_one, &wc);
1686 }
1687
1688 /*
1689  * Check if we're over watermark.
1690  * If we are over threshold_buffers, start freeing buffers.
1691  * If we're over "limit_buffers", block until we get under the limit.
1692  */
1693 static void __check_watermark(struct dm_bufio_client *c,
1694                               struct list_head *write_list)
1695 {
1696         if (cache_count(&c->cache, LIST_DIRTY) >
1697             cache_count(&c->cache, LIST_CLEAN) * DM_BUFIO_WRITEBACK_RATIO)
1698                 __write_dirty_buffers_async(c, 1, write_list);
1699 }
1700
1701 /*
1702  *--------------------------------------------------------------
1703  * Getting a buffer
1704  *--------------------------------------------------------------
1705  */
1706
1707 static void cache_put_and_wake(struct dm_bufio_client *c, struct dm_buffer *b)
1708 {
1709         /*
1710          * Relying on waitqueue_active() is racey, but we sleep
1711          * with schedule_timeout anyway.
1712          */
1713         if (cache_put(&c->cache, b) &&
1714             unlikely(waitqueue_active(&c->free_buffer_wait)))
1715                 wake_up(&c->free_buffer_wait);
1716 }
1717
1718 /*
1719  * This assumes you have already checked the cache to see if the buffer
1720  * is already present (it will recheck after dropping the lock for allocation).
1721  */
1722 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
1723                                      enum new_flag nf, int *need_submit,
1724                                      struct list_head *write_list)
1725 {
1726         struct dm_buffer *b, *new_b = NULL;
1727
1728         *need_submit = 0;
1729
1730         /* This can't be called with NF_GET */
1731         if (WARN_ON_ONCE(nf == NF_GET))
1732                 return NULL;
1733
1734         new_b = __alloc_buffer_wait(c, nf);
1735         if (!new_b)
1736                 return NULL;
1737
1738         /*
1739          * We've had a period where the mutex was unlocked, so need to
1740          * recheck the buffer tree.
1741          */
1742         b = cache_get(&c->cache, block);
1743         if (b) {
1744                 __free_buffer_wake(new_b);
1745                 goto found_buffer;
1746         }
1747
1748         __check_watermark(c, write_list);
1749
1750         b = new_b;
1751         atomic_set(&b->hold_count, 1);
1752         WRITE_ONCE(b->last_accessed, jiffies);
1753         b->block = block;
1754         b->read_error = 0;
1755         b->write_error = 0;
1756         b->list_mode = LIST_CLEAN;
1757
1758         if (nf == NF_FRESH)
1759                 b->state = 0;
1760         else {
1761                 b->state = 1 << B_READING;
1762                 *need_submit = 1;
1763         }
1764
1765         /*
1766          * We mustn't insert into the cache until the B_READING state
1767          * is set.  Otherwise another thread could get it and use
1768          * it before it had been read.
1769          */
1770         cache_insert(&c->cache, b);
1771
1772         return b;
1773
1774 found_buffer:
1775         if (nf == NF_PREFETCH) {
1776                 cache_put_and_wake(c, b);
1777                 return NULL;
1778         }
1779
1780         /*
1781          * Note: it is essential that we don't wait for the buffer to be
1782          * read if dm_bufio_get function is used. Both dm_bufio_get and
1783          * dm_bufio_prefetch can be used in the driver request routine.
1784          * If the user called both dm_bufio_prefetch and dm_bufio_get on
1785          * the same buffer, it would deadlock if we waited.
1786          */
1787         if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
1788                 cache_put_and_wake(c, b);
1789                 return NULL;
1790         }
1791
1792         return b;
1793 }
1794
1795 /*
1796  * The endio routine for reading: set the error, clear the bit and wake up
1797  * anyone waiting on the buffer.
1798  */
1799 static void read_endio(struct dm_buffer *b, blk_status_t status)
1800 {
1801         b->read_error = status;
1802
1803         BUG_ON(!test_bit(B_READING, &b->state));
1804
1805         smp_mb__before_atomic();
1806         clear_bit(B_READING, &b->state);
1807         smp_mb__after_atomic();
1808
1809         wake_up_bit(&b->state, B_READING);
1810 }
1811
1812 /*
1813  * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
1814  * functions is similar except that dm_bufio_new doesn't read the
1815  * buffer from the disk (assuming that the caller overwrites all the data
1816  * and uses dm_bufio_mark_buffer_dirty to write new data back).
1817  */
1818 static void *new_read(struct dm_bufio_client *c, sector_t block,
1819                       enum new_flag nf, struct dm_buffer **bp)
1820 {
1821         int need_submit = 0;
1822         struct dm_buffer *b;
1823
1824         LIST_HEAD(write_list);
1825
1826         *bp = NULL;
1827
1828         /*
1829          * Fast path, hopefully the block is already in the cache.  No need
1830          * to get the client lock for this.
1831          */
1832         b = cache_get(&c->cache, block);
1833         if (b) {
1834                 if (nf == NF_PREFETCH) {
1835                         cache_put_and_wake(c, b);
1836                         return NULL;
1837                 }
1838
1839                 /*
1840                  * Note: it is essential that we don't wait for the buffer to be
1841                  * read if dm_bufio_get function is used. Both dm_bufio_get and
1842                  * dm_bufio_prefetch can be used in the driver request routine.
1843                  * If the user called both dm_bufio_prefetch and dm_bufio_get on
1844                  * the same buffer, it would deadlock if we waited.
1845                  */
1846                 if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
1847                         cache_put_and_wake(c, b);
1848                         return NULL;
1849                 }
1850         }
1851
1852         if (!b) {
1853                 if (nf == NF_GET)
1854                         return NULL;
1855
1856                 dm_bufio_lock(c);
1857                 b = __bufio_new(c, block, nf, &need_submit, &write_list);
1858                 dm_bufio_unlock(c);
1859         }
1860
1861 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1862         if (b && (atomic_read(&b->hold_count) == 1))
1863                 buffer_record_stack(b);
1864 #endif
1865
1866         __flush_write_list(&write_list);
1867
1868         if (!b)
1869                 return NULL;
1870
1871         if (need_submit)
1872                 submit_io(b, REQ_OP_READ, read_endio);
1873
1874         wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1875
1876         if (b->read_error) {
1877                 int error = blk_status_to_errno(b->read_error);
1878
1879                 dm_bufio_release(b);
1880
1881                 return ERR_PTR(error);
1882         }
1883
1884         *bp = b;
1885
1886         return b->data;
1887 }
1888
1889 void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1890                    struct dm_buffer **bp)
1891 {
1892         return new_read(c, block, NF_GET, bp);
1893 }
1894 EXPORT_SYMBOL_GPL(dm_bufio_get);
1895
1896 void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1897                     struct dm_buffer **bp)
1898 {
1899         if (WARN_ON_ONCE(dm_bufio_in_request()))
1900                 return ERR_PTR(-EINVAL);
1901
1902         return new_read(c, block, NF_READ, bp);
1903 }
1904 EXPORT_SYMBOL_GPL(dm_bufio_read);
1905
1906 void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1907                    struct dm_buffer **bp)
1908 {
1909         if (WARN_ON_ONCE(dm_bufio_in_request()))
1910                 return ERR_PTR(-EINVAL);
1911
1912         return new_read(c, block, NF_FRESH, bp);
1913 }
1914 EXPORT_SYMBOL_GPL(dm_bufio_new);
1915
/*
 * Start reads for n_blocks consecutive blocks beginning at 'block' without
 * waiting for any of them to complete.
 *
 * Blocks that are already in the cache are skipped.  For the others a buffer
 * is requested from __bufio_new() in NF_PREFETCH mode; when one is returned
 * the read is submitted (if __bufio_new() asked for it) and the hold is
 * dropped again straight away.  All submissions happen inside a block-layer
 * plug.
 *
 * Must not be called while dm_bufio_in_request() is true; that case only
 * warns once and returns.
 */
void dm_bufio_prefetch(struct dm_bufio_client *c,
                       sector_t block, unsigned int n_blocks)
{
        struct blk_plug plug;

        /* writes that __bufio_new() queues for us to submit */
        LIST_HEAD(write_list);

        if (WARN_ON_ONCE(dm_bufio_in_request()))
                return; /* should never happen */

        blk_start_plug(&plug);

        /* n_blocks is decremented by the loop test, so it counts blocks left */
        for (; n_blocks--; block++) {
                int need_submit;
                struct dm_buffer *b;

                b = cache_get(&c->cache, block);
                if (b) {
                        /* already in cache */
                        cache_put_and_wake(c, b);
                        continue;
                }

                dm_bufio_lock(c);
                b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
                                &write_list);
                if (unlikely(!list_empty(&write_list))) {
                        /*
                         * Submit the queued writes with the client lock
                         * released and the plug finished, then re-establish
                         * both before carrying on.
                         */
                        dm_bufio_unlock(c);
                        blk_finish_plug(&plug);
                        __flush_write_list(&write_list);
                        blk_start_plug(&plug);
                        dm_bufio_lock(c);
                }
                if (unlikely(b != NULL)) {
                        /* I/O submission and release happen without the lock */
                        dm_bufio_unlock(c);

                        if (need_submit)
                                submit_io(b, REQ_OP_READ, read_endio);
                        dm_bufio_release(b);

                        cond_resched();

                        /*
                         * Last block: the lock is not held here, so skip the
                         * unlock at the bottom of the loop.
                         */
                        if (!n_blocks)
                                goto flush_plug;
                        dm_bufio_lock(c);
                }
                dm_bufio_unlock(c);
        }

flush_plug:
        blk_finish_plug(&plug);
}
EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
1969
1970 void dm_bufio_release(struct dm_buffer *b)
1971 {
1972         struct dm_bufio_client *c = b->c;
1973
1974         /*
1975          * If there were errors on the buffer, and the buffer is not
1976          * to be written, free the buffer. There is no point in caching
1977          * invalid buffer.
1978          */
1979         if ((b->read_error || b->write_error) &&
1980             !test_bit_acquire(B_READING, &b->state) &&
1981             !test_bit(B_WRITING, &b->state) &&
1982             !test_bit(B_DIRTY, &b->state)) {
1983                 dm_bufio_lock(c);
1984
1985                 /* cache remove can fail if there are other holders */
1986                 if (cache_remove(&c->cache, b)) {
1987                         __free_buffer_wake(b);
1988                         dm_bufio_unlock(c);
1989                         return;
1990                 }
1991
1992                 dm_bufio_unlock(c);
1993         }
1994
1995         cache_put_and_wake(c, b);
1996 }
1997 EXPORT_SYMBOL_GPL(dm_bufio_release);
1998
1999 void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
2000                                         unsigned int start, unsigned int end)
2001 {
2002         struct dm_bufio_client *c = b->c;
2003
2004         BUG_ON(start >= end);
2005         BUG_ON(end > b->c->block_size);
2006
2007         dm_bufio_lock(c);
2008
2009         BUG_ON(test_bit(B_READING, &b->state));
2010
2011         if (!test_and_set_bit(B_DIRTY, &b->state)) {
2012                 b->dirty_start = start;
2013                 b->dirty_end = end;
2014                 cache_mark(&c->cache, b, LIST_DIRTY);
2015         } else {
2016                 if (start < b->dirty_start)
2017                         b->dirty_start = start;
2018                 if (end > b->dirty_end)
2019                         b->dirty_end = end;
2020         }
2021
2022         dm_bufio_unlock(c);
2023 }
2024 EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
2025
/*
 * Mark the whole buffer dirty: shorthand for
 * dm_bufio_mark_partial_buffer_dirty() over [0, block_size).
 */
void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
        dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
2031
/*
 * Kick off writeback of the client's dirty buffers without waiting for it.
 *
 * __write_dirty_buffers_async() runs under the client lock and may queue
 * buffers on write_list; those are submitted by __flush_write_list() after
 * the lock has been dropped.
 *
 * Must not be called while dm_bufio_in_request() is true; that case only
 * warns once and returns.
 */
void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
        LIST_HEAD(write_list);

        if (WARN_ON_ONCE(dm_bufio_in_request()))
                return; /* should never happen */

        dm_bufio_lock(c);
        __write_dirty_buffers_async(c, 0, &write_list);
        dm_bufio_unlock(c);
        __flush_write_list(&write_list);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
2045
2046 /*
2047  * For performance, it is essential that the buffers are written asynchronously
2048  * and simultaneously (so that the block layer can merge the writes) and then
2049  * waited upon.
2050  *
2051  * Finally, we flush hardware disk cache.
2052  */
2053 static bool is_writing(struct lru_entry *e, void *context)
2054 {
2055         struct dm_buffer *b = le_to_buffer(e);
2056
2057         return test_bit(B_WRITING, &b->state);
2058 }
2059
/*
 * Write out the client's dirty buffers, wait for the writes that are in
 * flight, then flush the device.
 *
 * Returns the asynchronous write error recorded on the client if there is
 * one (the recorded error is reset to 0 in the process), otherwise the
 * result of dm_bufio_issue_flush().
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
        int a, f;
        unsigned long nr_buffers;
        struct lru_entry *e;
        struct lru_iter it;

        LIST_HEAD(write_list);

        /* start all the writes; the actual submission happens unlocked */
        dm_bufio_lock(c);
        __write_dirty_buffers_async(c, 0, &write_list);
        dm_bufio_unlock(c);
        __flush_write_list(&write_list);
        dm_bufio_lock(c);

        /*
         * Snapshot of the dirty list size; it limits how many of the waits
         * below are done with the client lock dropped.
         */
        nr_buffers = cache_count(&c->cache, LIST_DIRTY);
        lru_iter_begin(&c->cache.lru[LIST_DIRTY], &it);
        while ((e = lru_iter_next(&it, is_writing, c))) {
                struct dm_buffer *b = le_to_buffer(e);
                /* take a hold on the buffer for the duration of the wait */
                __cache_inc_buffer(b);

                BUG_ON(test_bit(B_READING, &b->state));

                if (nr_buffers) {
                        nr_buffers--;
                        dm_bufio_unlock(c);
                        wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
                        dm_bufio_lock(c);
                } else {
                        /*
                         * Budget used up: wait with the lock held.
                         * NOTE(review): presumably this bounds the walk when
                         * other threads keep dirtying buffers - confirm.
                         */
                        wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
                }

                /* written and not redirtied in the meantime: relist as clean */
                if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, &b->state))
                        cache_mark(&c->cache, b, LIST_CLEAN);

                cache_put_and_wake(c, b);

                cond_resched();
        }
        lru_iter_end(&it);

        wake_up(&c->free_buffer_wait);
        dm_bufio_unlock(c);

        /* fetch and clear the async error; the flush is issued regardless */
        a = xchg(&c->async_write_error, 0);
        f = dm_bufio_issue_flush(c);
        if (a)
                return a;

        return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
2112
2113 /*
2114  * Use dm-io to send an empty barrier to flush the device.
2115  */
2116 int dm_bufio_issue_flush(struct dm_bufio_client *c)
2117 {
2118         struct dm_io_request io_req = {
2119                 .bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
2120                 .mem.type = DM_IO_KMEM,
2121                 .mem.ptr.addr = NULL,
2122                 .client = c->dm_io,
2123         };
2124         struct dm_io_region io_reg = {
2125                 .bdev = c->bdev,
2126                 .sector = 0,
2127                 .count = 0,
2128         };
2129
2130         if (WARN_ON_ONCE(dm_bufio_in_request()))
2131                 return -EINVAL;
2132
2133         return dm_io(&io_req, 1, &io_reg, NULL);
2134 }
2135 EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
2136
2137 /*
2138  * Use dm-io to send a discard request to flush the device.
2139  */
2140 int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
2141 {
2142         struct dm_io_request io_req = {
2143                 .bi_opf = REQ_OP_DISCARD | REQ_SYNC,
2144                 .mem.type = DM_IO_KMEM,
2145                 .mem.ptr.addr = NULL,
2146                 .client = c->dm_io,
2147         };
2148         struct dm_io_region io_reg = {
2149                 .bdev = c->bdev,
2150                 .sector = block_to_sector(c, block),
2151                 .count = block_to_sector(c, count),
2152         };
2153
2154         if (WARN_ON_ONCE(dm_bufio_in_request()))
2155                 return -EINVAL; /* discards are optional */
2156
2157         return dm_io(&io_req, 1, &io_reg, NULL);
2158 }
2159 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
2160
2161 static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
2162 {
2163         struct dm_buffer *b;
2164
2165         b = cache_get(&c->cache, block);
2166         if (b) {
2167                 if (likely(!smp_load_acquire(&b->state))) {
2168                         if (cache_remove(&c->cache, b))
2169                                 __free_buffer_wake(b);
2170                         else
2171                                 cache_put_and_wake(c, b);
2172                 } else {
2173                         cache_put_and_wake(c, b);
2174                 }
2175         }
2176
2177         return b ? true : false;
2178 }
2179
/*
 * Free the given buffer.
 *
 * This is only a hint: forget_buffer() frees the buffer only when its state
 * word is zero and it can be removed from the cache; otherwise this function
 * leaves the buffer cached.  The call is made with the client lock held.
 */
void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
{
        dm_bufio_lock(c);
        forget_buffer(c, block);
        dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_forget);
2193
2194 static enum evict_result idle(struct dm_buffer *b, void *context)
2195 {
2196         return b->state ? ER_DONT_EVICT : ER_EVICT;
2197 }
2198
/*
 * Range variant of dm_bufio_forget(): under the client lock, ask
 * cache_remove_range() to drop the blocks from 'block' up to
 * 'block + n_blocks', using idle() to decide which buffers may go and
 * __free_buffer_wake() to free them.
 */
void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
{
        dm_bufio_lock(c);
        cache_remove_range(&c->cache, block, block + n_blocks, idle, __free_buffer_wake);
        dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
2206
/*
 * Set the client's minimum_buffers value.  The store is done without taking
 * any lock.
 */
void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned int n)
{
        c->minimum_buffers = n;
}
EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
2212
/* Return the client's block size (the block_size field, in bytes). */
unsigned int dm_bufio_get_block_size(struct dm_bufio_client *c)
{
        return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
2218
2219 sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
2220 {
2221         sector_t s = bdev_nr_sectors(c->bdev);
2222
2223         if (s >= c->start)
2224                 s -= c->start;
2225         else
2226                 s = 0;
2227         if (likely(c->sectors_per_block_bits >= 0))
2228                 s >>= c->sectors_per_block_bits;
2229         else
2230                 sector_div(s, c->block_size >> SECTOR_SHIFT);
2231         return s;
2232 }
2233 EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
2234
/* Return the dm-io client stored in this bufio client's dm_io field. */
struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
{
        return c->dm_io;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
2240
/* Return the block number recorded in the buffer. */
sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
        return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
2246
/* Return the buffer's data pointer. */
void *dm_bufio_get_block_data(struct dm_buffer *b)
{
        return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
2252
/*
 * Return a pointer to the memory immediately following struct dm_buffer.
 * dm_bufio_client_create() sizes each buffer object as
 * sizeof(struct dm_buffer) + aux_size, so this is where the aux_size bytes
 * of per-buffer auxiliary data live.
 */
void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
        return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
2258
/* Return the bufio client that the buffer belongs to. */
struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
        return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);
2264
2265 static enum it_action warn_leak(struct dm_buffer *b, void *context)
2266 {
2267         bool *warned = context;
2268
2269         WARN_ON(!(*warned));
2270         *warned = true;
2271         DMERR("leaked buffer %llx, hold count %u, list %d",
2272               (unsigned long long)b->block, atomic_read(&b->hold_count), b->list_mode);
2273 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2274         stack_trace_print(b->stack_entries, b->stack_len, 1);
2275         /* mark unclaimed to avoid WARN_ON at end of drop_buffers() */
2276         atomic_set(&b->hold_count, 0);
2277 #endif
2278         return IT_NEXT;
2279 }
2280
2281 static void drop_buffers(struct dm_bufio_client *c)
2282 {
2283         int i;
2284         struct dm_buffer *b;
2285
2286         if (WARN_ON(dm_bufio_in_request()))
2287                 return; /* should never happen */
2288
2289         /*
2290          * An optimization so that the buffers are not written one-by-one.
2291          */
2292         dm_bufio_write_dirty_buffers_async(c);
2293
2294         dm_bufio_lock(c);
2295
2296         while ((b = __get_unclaimed_buffer(c)))
2297                 __free_buffer_wake(b);
2298
2299         for (i = 0; i < LIST_SIZE; i++) {
2300                 bool warned = false;
2301
2302                 cache_iterate(&c->cache, i, warn_leak, &warned);
2303         }
2304
2305 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2306         while ((b = __get_unclaimed_buffer(c)))
2307                 __free_buffer_wake(b);
2308 #endif
2309
2310         for (i = 0; i < LIST_SIZE; i++)
2311                 WARN_ON(cache_count(&c->cache, i));
2312
2313         dm_bufio_unlock(c);
2314 }
2315
2316 static unsigned long get_retain_buffers(struct dm_bufio_client *c)
2317 {
2318         unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
2319
2320         if (likely(c->sectors_per_block_bits >= 0))
2321                 retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
2322         else
2323                 retain_bytes /= c->block_size;
2324
2325         return retain_bytes;
2326 }
2327
2328 static void __scan(struct dm_bufio_client *c)
2329 {
2330         int l;
2331         struct dm_buffer *b;
2332         unsigned long freed = 0;
2333         unsigned long retain_target = get_retain_buffers(c);
2334         unsigned long count = cache_total(&c->cache);
2335
2336         for (l = 0; l < LIST_SIZE; l++) {
2337                 while (true) {
2338                         if (count - freed <= retain_target)
2339                                 atomic_long_set(&c->need_shrink, 0);
2340                         if (!atomic_long_read(&c->need_shrink))
2341                                 break;
2342
2343                         b = cache_evict(&c->cache, l,
2344                                         l == LIST_CLEAN ? is_clean : is_dirty, c);
2345                         if (!b)
2346                                 break;
2347
2348                         __make_buffer_clean(b);
2349                         __free_buffer_wake(b);
2350
2351                         atomic_long_dec(&c->need_shrink);
2352                         freed++;
2353                         cond_resched();
2354                 }
2355         }
2356 }
2357
/*
 * Work item queued by dm_bufio_shrink_scan(): runs __scan() for the owning
 * client with the client lock held.
 */
static void shrink_work(struct work_struct *w)
{
        struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);

        dm_bufio_lock(c);
        __scan(c);
        dm_bufio_unlock(c);
}
2366
2367 static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
2368 {
2369         struct dm_bufio_client *c;
2370
2371         c = container_of(shrink, struct dm_bufio_client, shrinker);
2372         atomic_long_add(sc->nr_to_scan, &c->need_shrink);
2373         queue_work(dm_bufio_wq, &c->shrink_work);
2374
2375         return sc->nr_to_scan;
2376 }
2377
2378 static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
2379 {
2380         struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
2381         unsigned long count = cache_total(&c->cache);
2382         unsigned long retain_target = get_retain_buffers(c);
2383         unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
2384
2385         if (unlikely(count < retain_target))
2386                 count = 0;
2387         else
2388                 count -= retain_target;
2389
2390         if (unlikely(count < queued_for_cleanup))
2391                 count = 0;
2392         else
2393                 count -= queued_for_cleanup;
2394
2395         return count;
2396 }
2397
/*
 * Create the buffering interface
 *
 * bdev:             block device the client does its I/O on
 * block_size:       buffer size in bytes; must be a non-zero multiple of 512
 * reserved_buffers: number of buffers allocated up front
 * aux_size:         extra bytes allocated after each struct dm_buffer
 *                   (see dm_bufio_get_aux_data())
 * alloc_callback,
 * write_callback:   stored in the client as given
 * flags:            DM_BUFIO_CLIENT_NO_SLEEP puts the client in no_sleep mode
 *
 * Returns the new client, or an ERR_PTR(): -EINVAL for a bad block size,
 * -ENOMEM for allocation failures, otherwise the error reported by
 * dm_io_client_create() or register_shrinker().
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned int block_size,
                                               unsigned int reserved_buffers, unsigned int aux_size,
                                               void (*alloc_callback)(struct dm_buffer *),
                                               void (*write_callback)(struct dm_buffer *),
                                               unsigned int flags)
{
        int r;
        unsigned int num_locks;
        struct dm_bufio_client *c;
        /* longest name: "dm_bufio_buffer-" + 10 digits of %u + NUL = 27 */
        char slab_name[27];

        if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
                DMERR("%s: block size not specified or is not multiple of 512b", __func__);
                r = -EINVAL;
                goto bad_client;
        }

        /* the client is followed by one struct buffer_tree per hash lock */
        num_locks = dm_num_hash_locks();
        c = kzalloc(sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL);
        if (!c) {
                r = -ENOMEM;
                goto bad_client;
        }
        cache_init(&c->cache, num_locks);

        c->bdev = bdev;
        c->block_size = block_size;
        /* -1 marks a non-power-of-two block size; users then divide instead of shifting */
        if (is_power_of_2(block_size))
                c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
        else
                c->sectors_per_block_bits = -1;

        c->alloc_callback = alloc_callback;
        c->write_callback = write_callback;

        if (flags & DM_BUFIO_CLIENT_NO_SLEEP) {
                c->no_sleep = true;
                /* undone on the error path below and in dm_bufio_client_destroy() */
                static_branch_inc(&no_sleep_enabled);
        }

        mutex_init(&c->lock);
        spin_lock_init(&c->spinlock);
        INIT_LIST_HEAD(&c->reserved_buffers);
        c->need_reserved_buffers = reserved_buffers;

        dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);

        init_waitqueue_head(&c->free_buffer_wait);
        c->async_write_error = 0;

        c->dm_io = dm_io_client_create();
        if (IS_ERR(c->dm_io)) {
                r = PTR_ERR(c->dm_io);
                goto bad_dm_io;
        }

        /*
         * A dedicated slab cache for the data is only created for block
         * sizes up to KMALLOC_MAX_SIZE that are smaller than a page or not
         * a power of two.  Objects are aligned to the lowest set bit of
         * block_size, capped at PAGE_SIZE.
         */
        if (block_size <= KMALLOC_MAX_SIZE &&
            (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
                unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE);

                snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u", block_size);
                c->slab_cache = kmem_cache_create(slab_name, block_size, align,
                                                  SLAB_RECLAIM_ACCOUNT, NULL);
                if (!c->slab_cache) {
                        r = -ENOMEM;
                        goto bad;
                }
        }
        /* slab cache for struct dm_buffer plus the caller's aux data */
        if (aux_size)
                snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u", aux_size);
        else
                snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer");
        c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
                                           0, SLAB_RECLAIM_ACCOUNT, NULL);
        if (!c->slab_buffer) {
                r = -ENOMEM;
                goto bad;
        }

        /*
         * Preallocate the reserved buffers.
         * NOTE(review): the loop relies on __free_buffer_wake() to decrement
         * need_reserved_buffers and park the buffer on c->reserved_buffers;
         * that helper is not visible here - confirm.
         */
        while (c->need_reserved_buffers) {
                struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

                if (!b) {
                        r = -ENOMEM;
                        goto bad;
                }
                __free_buffer_wake(b);
        }

        INIT_WORK(&c->shrink_work, shrink_work);
        atomic_long_set(&c->need_shrink, 0);

        c->shrinker.count_objects = dm_bufio_shrink_count;
        c->shrinker.scan_objects = dm_bufio_shrink_scan;
        c->shrinker.seeks = 1;
        c->shrinker.batch = 0;
        r = register_shrinker(&c->shrinker, "dm-bufio:(%u:%u)",
                              MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
        if (r)
                goto bad;

        /* publish the client on the global list and refresh the cache size */
        mutex_lock(&dm_bufio_clients_lock);
        dm_bufio_client_count++;
        list_add(&c->client_list, &dm_bufio_all_clients);
        __cache_size_refresh();
        mutex_unlock(&dm_bufio_clients_lock);

        return c;

bad:
        /* free whatever reserved buffers were already set aside */
        while (!list_empty(&c->reserved_buffers)) {
                struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);

                list_del(&b->lru.list);
                free_buffer(b);
        }
        /* slab pointers that were never set are still NULL from kzalloc() */
        kmem_cache_destroy(c->slab_cache);
        kmem_cache_destroy(c->slab_buffer);
        dm_io_client_destroy(c->dm_io);
bad_dm_io:
        mutex_destroy(&c->lock);
        if (c->no_sleep)
                static_branch_dec(&no_sleep_enabled);
        kfree(c);
bad_client:
        return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);
2529
2530 /*
2531  * Free the buffering interface.
2532  * It is required that there are no references on any buffers.
2533  */
2534 void dm_bufio_client_destroy(struct dm_bufio_client *c)
2535 {
2536         unsigned int i;
2537
2538         drop_buffers(c);
2539
2540         unregister_shrinker(&c->shrinker);
2541         flush_work(&c->shrink_work);
2542
2543         mutex_lock(&dm_bufio_clients_lock);
2544
2545         list_del(&c->client_list);
2546         dm_bufio_client_count--;
2547         __cache_size_refresh();
2548
2549         mutex_unlock(&dm_bufio_clients_lock);
2550
2551         WARN_ON(c->need_reserved_buffers);
2552
2553         while (!list_empty(&c->reserved_buffers)) {
2554                 struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);
2555
2556                 list_del(&b->lru.list);
2557                 free_buffer(b);
2558         }
2559
2560         for (i = 0; i < LIST_SIZE; i++)
2561                 if (cache_count(&c->cache, i))
2562                         DMERR("leaked buffer count %d: %lu", i, cache_count(&c->cache, i));
2563
2564         for (i = 0; i < LIST_SIZE; i++)
2565                 WARN_ON(cache_count(&c->cache, i));
2566
2567         cache_destroy(&c->cache);
2568         kmem_cache_destroy(c->slab_cache);
2569         kmem_cache_destroy(c->slab_buffer);
2570         dm_io_client_destroy(c->dm_io);
2571         mutex_destroy(&c->lock);
2572         if (c->no_sleep)
2573                 static_branch_dec(&no_sleep_enabled);
2574         kfree(c);
2575 }
2576 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
2577
/*
 * Empty the client's cache but keep the client itself: drop all buffers,
 * then wait for any shrink work to finish.
 */
void dm_bufio_client_reset(struct dm_bufio_client *c)
{
        drop_buffers(c);
        flush_work(&c->shrink_work);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_reset);
2584
/*
 * Set the client's start offset, in sectors.  dm_bufio_get_device_size()
 * subtracts it from the device size.  The store is done without taking any
 * lock.
 */
void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
{
        c->start = start;
}
EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
2590
2591 /*--------------------------------------------------------------*/
2592
2593 static unsigned int get_max_age_hz(void)
2594 {
2595         unsigned int max_age = READ_ONCE(dm_bufio_max_age);
2596
2597         if (max_age > UINT_MAX / HZ)
2598                 max_age = UINT_MAX / HZ;
2599
2600         return max_age * HZ;
2601 }
2602
2603 static bool older_than(struct dm_buffer *b, unsigned long age_hz)
2604 {
2605         return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz);
2606 }
2607
/* Parameters handed to select_for_evict() and updated by __evict_many(). */
struct evict_params {
        /* allocation context; select_for_evict() tests it for __GFP_FS */
        gfp_t gfp;
        /* minimum buffer age, in jiffies, that older_than() checks against */
        unsigned long age_hz;

        /*
         * Tracks the last_accessed time of the evicted buffers:
         * __evict_many() overwrites it with an evicted buffer's
         * last_accessed whenever the current value is time_after_eq() that
         * buffer's, so it only ever moves towards older timestamps.  It will
         * not be reinitialised by __evict_many(), so you can use it across
         * multiple invocations; the caller seeds it.
         */
        unsigned long last_accessed;
};
2619
2620 /*
2621  * We may not be able to evict this buffer if IO pending or the client
2622  * is still using it.
2623  *
2624  * And if GFP_NOFS is used, we must not do any I/O because we hold
2625  * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
2626  * rerouted to different bufio client.
2627  */
2628 static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
2629 {
2630         struct evict_params *params = context;
2631
2632         if (!(params->gfp & __GFP_FS) ||
2633             (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) {
2634                 if (test_bit_acquire(B_READING, &b->state) ||
2635                     test_bit(B_WRITING, &b->state) ||
2636                     test_bit(B_DIRTY, &b->state))
2637                         return ER_DONT_EVICT;
2638         }
2639
2640         return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP;
2641 }
2642
2643 static unsigned long __evict_many(struct dm_bufio_client *c,
2644                                   struct evict_params *params,
2645                                   int list_mode, unsigned long max_count)
2646 {
2647         unsigned long count;
2648         unsigned long last_accessed;
2649         struct dm_buffer *b;
2650
2651         for (count = 0; count < max_count; count++) {
2652                 b = cache_evict(&c->cache, list_mode, select_for_evict, params);
2653                 if (!b)
2654                         break;
2655
2656                 last_accessed = READ_ONCE(b->last_accessed);
2657                 if (time_after_eq(params->last_accessed, last_accessed))
2658                         params->last_accessed = last_accessed;
2659
2660                 __make_buffer_clean(b);
2661                 __free_buffer_wake(b);
2662
2663                 cond_resched();
2664         }
2665
2666         return count;
2667 }
2668
2669 static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
2670 {
2671         struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0};
2672         unsigned long retain = get_retain_buffers(c);
2673         unsigned long count;
2674         LIST_HEAD(write_list);
2675
2676         dm_bufio_lock(c);
2677
2678         __check_watermark(c, &write_list);
2679         if (unlikely(!list_empty(&write_list))) {
2680                 dm_bufio_unlock(c);
2681                 __flush_write_list(&write_list);
2682                 dm_bufio_lock(c);
2683         }
2684
2685         count = cache_total(&c->cache);
2686         if (count > retain)
2687                 __evict_many(c, &params, LIST_CLEAN, count - retain);
2688
2689         dm_bufio_unlock(c);
2690 }
2691
/*
 * Refresh the cache size settings and evict buffers older than the
 * configured maximum age from every registered client.  The maximum age is
 * sampled once, before dm_bufio_clients_lock is taken; the lock is held for
 * the whole walk.
 */
static void cleanup_old_buffers(void)
{
        unsigned long max_age_hz = get_max_age_hz();
        struct dm_bufio_client *c;

        mutex_lock(&dm_bufio_clients_lock);

        __cache_size_refresh();

        list_for_each_entry(c, &dm_bufio_all_clients, client_list)
                evict_old_buffers(c, max_age_hz);

        mutex_unlock(&dm_bufio_clients_lock);
}
2706
/*
 * Delayed-work handler for dm_bufio_cleanup_old_work: clean up old buffers,
 * then re-arm itself to run again in DM_BUFIO_WORK_TIMER_SECS seconds.
 */
static void work_fn(struct work_struct *w)
{
        cleanup_old_buffers();

        queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
                           DM_BUFIO_WORK_TIMER_SECS * HZ);
}
2714
2715 /*--------------------------------------------------------------*/
2716
2717 /*
2718  * Global cleanup tries to evict the oldest buffers from across _all_
2719  * the clients.  It does this by repeatedly evicting a few buffers from
2720  * the client that holds the oldest buffer.  It's approximate, but hopefully
2721  * good enough.
2722  */
2723 static struct dm_bufio_client *__pop_client(void)
2724 {
2725         struct list_head *h;
2726
2727         if (list_empty(&dm_bufio_all_clients))
2728                 return NULL;
2729
2730         h = dm_bufio_all_clients.next;
2731         list_del(h);
2732         return container_of(h, struct dm_bufio_client, client_list);
2733 }
2734
2735 /*
2736  * Inserts the client in the global client list based on its
2737  * 'oldest_buffer' field.
2738  */
2739 static void __insert_client(struct dm_bufio_client *new_client)
2740 {
2741         struct dm_bufio_client *c;
2742         struct list_head *h = dm_bufio_all_clients.next;
2743
2744         while (h != &dm_bufio_all_clients) {
2745                 c = container_of(h, struct dm_bufio_client, client_list);
2746                 if (time_after_eq(c->oldest_buffer, new_client->oldest_buffer))
2747                         break;
2748                 h = h->next;
2749         }
2750
2751         list_add_tail(&new_client->client_list, h);
2752 }
2753
2754 static unsigned long __evict_a_few(unsigned long nr_buffers)
2755 {
2756         unsigned long count;
2757         struct dm_bufio_client *c;
2758         struct evict_params params = {
2759                 .gfp = GFP_KERNEL,
2760                 .age_hz = 0,
2761                 /* set to jiffies in case there are no buffers in this client */
2762                 .last_accessed = jiffies
2763         };
2764
2765         c = __pop_client();
2766         if (!c)
2767                 return 0;
2768
2769         dm_bufio_lock(c);
2770         count = __evict_many(c, &params, LIST_CLEAN, nr_buffers);
2771         dm_bufio_unlock(c);
2772
2773         if (count)
2774                 c->oldest_buffer = params.last_accessed;
2775         __insert_client(c);
2776
2777         return count;
2778 }
2779
/*
 * Run __check_watermark() on every registered client, each under its own
 * client lock.  Writes queued by any of them are collected on one list and
 * submitted only after dm_bufio_clients_lock has been released.
 */
static void check_watermarks(void)
{
        LIST_HEAD(write_list);
        struct dm_bufio_client *c;

        mutex_lock(&dm_bufio_clients_lock);
        list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
                dm_bufio_lock(c);
                __check_watermark(c, &write_list);
                dm_bufio_unlock(c);
        }
        mutex_unlock(&dm_bufio_clients_lock);

        __flush_write_list(&write_list);
}
2795
2796 static void evict_old(void)
2797 {
2798         unsigned long threshold = dm_bufio_cache_size -
2799                 dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
2800
2801         mutex_lock(&dm_bufio_clients_lock);
2802         while (dm_bufio_current_allocated > threshold) {
2803                 if (!__evict_a_few(64))
2804                         break;
2805                 cond_resched();
2806         }
2807         mutex_unlock(&dm_bufio_clients_lock);
2808 }
2809
/*
 * Work handler for dm_bufio_replacement_work: check the watermarks of all
 * clients first, then evict old buffers across clients.
 */
static void do_global_cleanup(struct work_struct *w)
{
        check_watermarks();
        evict_old();
}
2815
2816 /*
2817  *--------------------------------------------------------------
2818  * Module setup
2819  *--------------------------------------------------------------
2820  */
2821
2822 /*
2823  * This is called only once for the whole dm_bufio module.
2824  * It initializes memory limit.
2825  */
2826 static int __init dm_bufio_init(void)
2827 {
2828         __u64 mem;
2829
2830         dm_bufio_allocated_kmem_cache = 0;
2831         dm_bufio_allocated_get_free_pages = 0;
2832         dm_bufio_allocated_vmalloc = 0;
2833         dm_bufio_current_allocated = 0;
2834
2835         mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
2836                                DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
2837
2838         if (mem > ULONG_MAX)
2839                 mem = ULONG_MAX;
2840
2841 #ifdef CONFIG_MMU
2842         if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
2843                 mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
2844 #endif
2845
2846         dm_bufio_default_cache_size = mem;
2847
2848         mutex_lock(&dm_bufio_clients_lock);
2849         __cache_size_refresh();
2850         mutex_unlock(&dm_bufio_clients_lock);
2851
2852         dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
2853         if (!dm_bufio_wq)
2854                 return -ENOMEM;
2855
2856         INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
2857         INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
2858         queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2859                            DM_BUFIO_WORK_TIMER_SECS * HZ);
2860
2861         return 0;
2862 }
2863
2864 /*
2865  * This is called once when unloading the dm_bufio module.
2866  */
2867 static void __exit dm_bufio_exit(void)
2868 {
2869         int bug = 0;
2870
2871         cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
2872         destroy_workqueue(dm_bufio_wq);
2873
2874         if (dm_bufio_client_count) {
2875                 DMCRIT("%s: dm_bufio_client_count leaked: %d",
2876                         __func__, dm_bufio_client_count);
2877                 bug = 1;
2878         }
2879
2880         if (dm_bufio_current_allocated) {
2881                 DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2882                         __func__, dm_bufio_current_allocated);
2883                 bug = 1;
2884         }
2885
2886         if (dm_bufio_allocated_get_free_pages) {
2887                 DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2888                        __func__, dm_bufio_allocated_get_free_pages);
2889                 bug = 1;
2890         }
2891
2892         if (dm_bufio_allocated_vmalloc) {
2893                 DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
2894                        __func__, dm_bufio_allocated_vmalloc);
2895                 bug = 1;
2896         }
2897
2898         WARN_ON(bug); /* leaks are not worth crashing the system */
2899 }
2900
2901 module_init(dm_bufio_init)
2902 module_exit(dm_bufio_exit)
2903
2904 module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644);
2905 MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
2906
2907 module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644);
2908 MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
2909
2910 module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644);
2911 MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2912
2913 module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, 0644);
2914 MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2915
2916 module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, 0444);
2917 MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2918
2919 module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, 0444);
2920 MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2921
2922 module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, 0444);
2923 MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2924
2925 module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, 0444);
2926 MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
2927
2928 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2929 MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
2930 MODULE_LICENSE("GPL");