Merge tag 'scsi-misc' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi
[platform/kernel/linux-rpi.git] / mm / zswap.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * zswap.c - zswap driver file
4  *
5  * zswap is a cache that takes pages that are in the process
6  * of being swapped out and attempts to compress and store them in a
7  * RAM-based memory pool.  This can result in a significant I/O reduction on
8  * the swap device and, in the case where decompressing from RAM is faster
9  * than reading from the swap device, can also improve workload performance.
10  *
11  * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
12 */
13
14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15
16 #include <linux/module.h>
17 #include <linux/cpu.h>
18 #include <linux/highmem.h>
19 #include <linux/slab.h>
20 #include <linux/spinlock.h>
21 #include <linux/types.h>
22 #include <linux/atomic.h>
23 #include <linux/rbtree.h>
24 #include <linux/swap.h>
25 #include <linux/crypto.h>
26 #include <linux/scatterlist.h>
27 #include <linux/mempool.h>
28 #include <linux/zpool.h>
29 #include <crypto/acompress.h>
30 #include <linux/zswap.h>
31 #include <linux/mm_types.h>
32 #include <linux/page-flags.h>
33 #include <linux/swapops.h>
34 #include <linux/writeback.h>
35 #include <linux/pagemap.h>
36 #include <linux/workqueue.h>
37
38 #include "swap.h"
39 #include "internal.h"
40
41 /*********************************
42 * statistics
43 **********************************/
/* Total bytes used by the compressed storage */
u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons, so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
 */

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;

/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
/* Pool limit was hit, we need to calm down */
static bool zswap_pool_reached_full;
77
78 /*********************************
79 * tunables
80 **********************************/
81
/*
 * Sentinel (empty string) stored in the compressor/zpool parameters when no
 * usable value is available.  While no pool exists, zswap_pool_create()
 * refuses to create one if either parameter equals this sentinel.
 */
#define ZSWAP_PARAM_UNSET ""

static int zswap_setup(void);

/* Enable/disable zswap */
static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *,
				   const struct kernel_param *);
static const struct kernel_param_ops zswap_enabled_param_ops = {
	.set =		zswap_enabled_param_set,
	.get =		param_get_bool,
};
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);

/* Crypto compressor to use */
static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
				      const struct kernel_param *);
static const struct kernel_param_ops zswap_compressor_param_ops = {
	.set =		zswap_compressor_param_set,
	.get =		param_get_charp,
	.free =		param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
		&zswap_compressor, 0644);

/* Compressed storage zpool to use */
static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static const struct kernel_param_ops zswap_zpool_param_ops = {
	.set =		zswap_zpool_param_set,
	.get =		param_get_charp,
	.free =		param_free_charp,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);

/* The threshold for accepting new pages after the max_pool_percent was hit */
static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
		   uint, 0644);

/*
 * Enable/disable handling same-value filled pages (enabled by default).
 * If disabled every page is considered non-same-value filled.
 */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
		   bool, 0644);

/* Enable/disable handling non-same-value filled pages (enabled by default) */
static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
		   bool, 0644);

/*
 * Exclusive-loads switch; the default follows
 * CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON.
 * NOTE(review): the load path that consumes this flag is outside this part
 * of the file — confirm the exact semantics there.
 */
static bool zswap_exclusive_loads_enabled = IS_ENABLED(
		CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);

/*
 * Number of zpools in zswap_pool (empirically determined for scalability).
 * zswap_find_zpool() uses ilog2() of this value as a hash width, so it is
 * presumably expected to be a power of two.
 */
#define ZSWAP_NR_ZPOOLS 32
146
147 /*********************************
148 * data structures
149 **********************************/
150
/*
 * Per-CPU, per-pool compression context.  It is filled in by
 * zswap_cpu_comp_prepare() and its acomp/req members are released by
 * zswap_cpu_comp_dead().
 *
 * acomp  - compression transform allocated for the pool's tfm_name
 * req    - request object allocated against @acomp
 * wait   - completion helper wired to @req via crypto_req_done()
 * dstmem - this CPU's zswap_dstmem buffer (shared by all pools on the CPU)
 * mutex  - this CPU's zswap_mutex (shared by all pools on the CPU)
 */
struct crypto_acomp_ctx {
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	struct crypto_wait wait;
	u8 *dstmem;
	struct mutex *mutex;
};
158
/*
 * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
 * The only case where lru_lock is not acquired while holding tree.lock is
 * when a zswap_entry is taken off the lru for writeback, in that case it
 * needs to be verified that it's still valid in the tree.
 *
 * zpools       - backing zpool instances; zswap_find_zpool() picks one per
 *                entry by hashing the entry pointer
 * acomp_ctx    - per-CPU compression contexts (see struct crypto_acomp_ctx)
 * kref         - reference count; __zswap_pool_empty() runs when it hits zero
 * list         - linkage on the global zswap_pools list
 * release_work - work item that runs __zswap_pool_release()
 * shrink_work  - work item that runs shrink_worker()
 * node         - CPU hotplug instance node (CPUHP_MM_ZSWP_POOL_PREPARE)
 * tfm_name     - name of the compressor used by this pool
 * lru          - list of entries; zswap_reclaim_entry() takes from the tail
 * lru_lock     - protects @lru
 */
struct zswap_pool {
	struct zpool *zpools[ZSWAP_NR_ZPOOLS];
	struct crypto_acomp_ctx __percpu *acomp_ctx;
	struct kref kref;
	struct list_head list;
	struct work_struct release_work;
	struct work_struct shrink_work;
	struct hlist_node node;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
	struct list_head lru;
	spinlock_t lru_lock;
};
177
/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * swpentry - associated swap entry, the offset indexes into the red-black tree
 * refcount - the number of outstanding references to the entry. This is
 *            needed to protect against premature freeing of the entry by
 *            concurrent calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression. For a same-value filled page length is 0, and both
 *          pool and lru are invalid and must be ignored.
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 * value - value of the same-value filled pages which have same content
 *         (shares storage with handle; which one is valid depends on length)
 * objcg - the obj_cgroup that the compressed memory is charged to
 * lru - handle to the pool's lru used to evict pages.
 */
struct zswap_entry {
	struct rb_node rbnode;
	swp_entry_t swpentry;
	int refcount;
	unsigned int length;
	struct zswap_pool *pool;
	union {
		unsigned long handle;
		unsigned long value;
	};
	struct obj_cgroup *objcg;
	struct list_head lru;
};
214
/*
 * Per swap type tree of zswap entries, keyed by swap offset.
 *
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
};
224
/* One tree per swap type; indexed with swp_type() of the entry's swap entry */
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];

/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
/* pool counter to provide unique names to zpool */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);

/* Initialization progress, consulted by the parameter setters */
enum zswap_init_type {
	ZSWAP_UNINIT,
	ZSWAP_INIT_SUCCEED,
	ZSWAP_INIT_FAILED
};

static enum zswap_init_type zswap_init_state;

/* used to ensure the integrity of initialization */
static DEFINE_MUTEX(zswap_init_lock);

/*
 * init completed, but couldn't create the initial pool
 *
 * NOTE(review): the users visible here warn when no pool is found while this
 * flag is true, which suggests it is true while a pool exists and false in
 * the "couldn't create the initial pool" case — confirm against the setter.
 */
static bool zswap_has_pool;
247
248 /*********************************
249 * helpers and fwd declarations
250 **********************************/
251
/*
 * Debug-print @msg followed by the pool's compressor name and zpool type.
 * The type is read from zpools[0]; zswap_pool_create() creates every zpool
 * of a pool with the same type.
 */
#define zswap_pool_debug(msg, p)				\
	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
		 zpool_get_type((p)->zpools[0]))

/* Defined later in the file; needed by zswap_reclaim_entry() */
static int zswap_writeback_entry(struct zswap_entry *entry,
				 struct zswap_tree *tree);
/* Pool reference counting helpers, defined with the other pool functions */
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);
260
261 static bool zswap_is_full(void)
262 {
263         return totalram_pages() * zswap_max_pool_percent / 100 <
264                         DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
265 }
266
267 static bool zswap_can_accept(void)
268 {
269         return totalram_pages() * zswap_accept_thr_percent / 100 *
270                                 zswap_max_pool_percent / 100 >
271                         DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
272 }
273
274 static void zswap_update_total_size(void)
275 {
276         struct zswap_pool *pool;
277         u64 total = 0;
278         int i;
279
280         rcu_read_lock();
281
282         list_for_each_entry_rcu(pool, &zswap_pools, list)
283                 for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
284                         total += zpool_get_total_size(pool->zpools[i]);
285
286         rcu_read_unlock();
287
288         zswap_pool_total_size = total;
289 }
290
291 /*********************************
292 * zswap entry functions
293 **********************************/
294 static struct kmem_cache *zswap_entry_cache;
295
296 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
297 {
298         struct zswap_entry *entry;
299         entry = kmem_cache_alloc(zswap_entry_cache, gfp);
300         if (!entry)
301                 return NULL;
302         entry->refcount = 1;
303         RB_CLEAR_NODE(&entry->rbnode);
304         return entry;
305 }
306
307 static void zswap_entry_cache_free(struct zswap_entry *entry)
308 {
309         kmem_cache_free(zswap_entry_cache, entry);
310 }
311
312 /*********************************
313 * rbtree functions
314 **********************************/
315 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
316 {
317         struct rb_node *node = root->rb_node;
318         struct zswap_entry *entry;
319         pgoff_t entry_offset;
320
321         while (node) {
322                 entry = rb_entry(node, struct zswap_entry, rbnode);
323                 entry_offset = swp_offset(entry->swpentry);
324                 if (entry_offset > offset)
325                         node = node->rb_left;
326                 else if (entry_offset < offset)
327                         node = node->rb_right;
328                 else
329                         return entry;
330         }
331         return NULL;
332 }
333
334 /*
335  * In the case that a entry with the same offset is found, a pointer to
336  * the existing entry is stored in dupentry and the function returns -EEXIST
337  */
338 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
339                         struct zswap_entry **dupentry)
340 {
341         struct rb_node **link = &root->rb_node, *parent = NULL;
342         struct zswap_entry *myentry;
343         pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
344
345         while (*link) {
346                 parent = *link;
347                 myentry = rb_entry(parent, struct zswap_entry, rbnode);
348                 myentry_offset = swp_offset(myentry->swpentry);
349                 if (myentry_offset > entry_offset)
350                         link = &(*link)->rb_left;
351                 else if (myentry_offset < entry_offset)
352                         link = &(*link)->rb_right;
353                 else {
354                         *dupentry = myentry;
355                         return -EEXIST;
356                 }
357         }
358         rb_link_node(&entry->rbnode, parent, link);
359         rb_insert_color(&entry->rbnode, root);
360         return 0;
361 }
362
363 static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
364 {
365         if (!RB_EMPTY_NODE(&entry->rbnode)) {
366                 rb_erase(&entry->rbnode, root);
367                 RB_CLEAR_NODE(&entry->rbnode);
368                 return true;
369         }
370         return false;
371 }
372
373 static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
374 {
375         int i = 0;
376
377         if (ZSWAP_NR_ZPOOLS > 1)
378                 i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
379
380         return entry->pool->zpools[i];
381 }
382
383 /*
384  * Carries out the common pattern of freeing and entry's zpool allocation,
385  * freeing the entry itself, and decrementing the number of stored pages.
386  */
387 static void zswap_free_entry(struct zswap_entry *entry)
388 {
389         if (entry->objcg) {
390                 obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
391                 obj_cgroup_put(entry->objcg);
392         }
393         if (!entry->length)
394                 atomic_dec(&zswap_same_filled_pages);
395         else {
396                 spin_lock(&entry->pool->lru_lock);
397                 list_del(&entry->lru);
398                 spin_unlock(&entry->pool->lru_lock);
399                 zpool_free(zswap_find_zpool(entry), entry->handle);
400                 zswap_pool_put(entry->pool);
401         }
402         zswap_entry_cache_free(entry);
403         atomic_dec(&zswap_stored_pages);
404         zswap_update_total_size();
405 }
406
407 /* caller must hold the tree lock */
408 static void zswap_entry_get(struct zswap_entry *entry)
409 {
410         entry->refcount++;
411 }
412
413 /* caller must hold the tree lock
414 * remove from the tree and free it, if nobody reference the entry
415 */
416 static void zswap_entry_put(struct zswap_tree *tree,
417                         struct zswap_entry *entry)
418 {
419         int refcount = --entry->refcount;
420
421         WARN_ON_ONCE(refcount < 0);
422         if (refcount == 0) {
423                 WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
424                 zswap_free_entry(entry);
425         }
426 }
427
428 /* caller must hold the tree lock */
429 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
430                                 pgoff_t offset)
431 {
432         struct zswap_entry *entry;
433
434         entry = zswap_rb_search(root, offset);
435         if (entry)
436                 zswap_entry_get(entry);
437
438         return entry;
439 }
440
441 /*********************************
442 * per-cpu code
443 **********************************/
/* Per-CPU buffer of 2 * PAGE_SIZE bytes, allocated in zswap_dstmem_prepare() */
static DEFINE_PER_CPU(u8 *, zswap_dstmem);
/*
 * If users dynamically change the zpool type and compressor at runtime, i.e.
 * while zswap is running, zswap can have more than one pool on one cpu, but
 * they all share dstmem. So we need this mutex to be per-cpu.
 */
static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
451
452 static int zswap_dstmem_prepare(unsigned int cpu)
453 {
454         struct mutex *mutex;
455         u8 *dst;
456
457         dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
458         if (!dst)
459                 return -ENOMEM;
460
461         mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
462         if (!mutex) {
463                 kfree(dst);
464                 return -ENOMEM;
465         }
466
467         mutex_init(mutex);
468         per_cpu(zswap_dstmem, cpu) = dst;
469         per_cpu(zswap_mutex, cpu) = mutex;
470         return 0;
471 }
472
473 static int zswap_dstmem_dead(unsigned int cpu)
474 {
475         struct mutex *mutex;
476         u8 *dst;
477
478         mutex = per_cpu(zswap_mutex, cpu);
479         kfree(mutex);
480         per_cpu(zswap_mutex, cpu) = NULL;
481
482         dst = per_cpu(zswap_dstmem, cpu);
483         kfree(dst);
484         per_cpu(zswap_dstmem, cpu) = NULL;
485
486         return 0;
487 }
488
489 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
490 {
491         struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
492         struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
493         struct crypto_acomp *acomp;
494         struct acomp_req *req;
495
496         acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
497         if (IS_ERR(acomp)) {
498                 pr_err("could not alloc crypto acomp %s : %ld\n",
499                                 pool->tfm_name, PTR_ERR(acomp));
500                 return PTR_ERR(acomp);
501         }
502         acomp_ctx->acomp = acomp;
503
504         req = acomp_request_alloc(acomp_ctx->acomp);
505         if (!req) {
506                 pr_err("could not alloc crypto acomp_request %s\n",
507                        pool->tfm_name);
508                 crypto_free_acomp(acomp_ctx->acomp);
509                 return -ENOMEM;
510         }
511         acomp_ctx->req = req;
512
513         crypto_init_wait(&acomp_ctx->wait);
514         /*
515          * if the backend of acomp is async zip, crypto_req_done() will wakeup
516          * crypto_wait_req(); if the backend of acomp is scomp, the callback
517          * won't be called, crypto_wait_req() will return without blocking.
518          */
519         acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
520                                    crypto_req_done, &acomp_ctx->wait);
521
522         acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
523         acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
524
525         return 0;
526 }
527
528 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
529 {
530         struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
531         struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
532
533         if (!IS_ERR_OR_NULL(acomp_ctx)) {
534                 if (!IS_ERR_OR_NULL(acomp_ctx->req))
535                         acomp_request_free(acomp_ctx->req);
536                 if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
537                         crypto_free_acomp(acomp_ctx->acomp);
538         }
539
540         return 0;
541 }
542
543 /*********************************
544 * pool functions
545 **********************************/
546
547 static struct zswap_pool *__zswap_pool_current(void)
548 {
549         struct zswap_pool *pool;
550
551         pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
552         WARN_ONCE(!pool && zswap_has_pool,
553                   "%s: no page storage pool!\n", __func__);
554
555         return pool;
556 }
557
558 static struct zswap_pool *zswap_pool_current(void)
559 {
560         assert_spin_locked(&zswap_pools_lock);
561
562         return __zswap_pool_current();
563 }
564
565 static struct zswap_pool *zswap_pool_current_get(void)
566 {
567         struct zswap_pool *pool;
568
569         rcu_read_lock();
570
571         pool = __zswap_pool_current();
572         if (!zswap_pool_get(pool))
573                 pool = NULL;
574
575         rcu_read_unlock();
576
577         return pool;
578 }
579
580 static struct zswap_pool *zswap_pool_last_get(void)
581 {
582         struct zswap_pool *pool, *last = NULL;
583
584         rcu_read_lock();
585
586         list_for_each_entry_rcu(pool, &zswap_pools, list)
587                 last = pool;
588         WARN_ONCE(!last && zswap_has_pool,
589                   "%s: no page storage pool!\n", __func__);
590         if (!zswap_pool_get(last))
591                 last = NULL;
592
593         rcu_read_unlock();
594
595         return last;
596 }
597
598 /* type and compressor must be null-terminated */
599 static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
600 {
601         struct zswap_pool *pool;
602
603         assert_spin_locked(&zswap_pools_lock);
604
605         list_for_each_entry_rcu(pool, &zswap_pools, list) {
606                 if (strcmp(pool->tfm_name, compressor))
607                         continue;
608                 /* all zpools share the same type */
609                 if (strcmp(zpool_get_type(pool->zpools[0]), type))
610                         continue;
611                 /* if we can't get it, it's about to be destroyed */
612                 if (!zswap_pool_get(pool))
613                         continue;
614                 return pool;
615         }
616
617         return NULL;
618 }
619
620 /*
621  * If the entry is still valid in the tree, drop the initial ref and remove it
622  * from the tree. This function must be called with an additional ref held,
623  * otherwise it may race with another invalidation freeing the entry.
624  */
625 static void zswap_invalidate_entry(struct zswap_tree *tree,
626                                    struct zswap_entry *entry)
627 {
628         if (zswap_rb_erase(&tree->rbroot, entry))
629                 zswap_entry_put(tree, entry);
630 }
631
632 static int zswap_reclaim_entry(struct zswap_pool *pool)
633 {
634         struct zswap_entry *entry;
635         struct zswap_tree *tree;
636         pgoff_t swpoffset;
637         int ret;
638
639         /* Get an entry off the LRU */
640         spin_lock(&pool->lru_lock);
641         if (list_empty(&pool->lru)) {
642                 spin_unlock(&pool->lru_lock);
643                 return -EINVAL;
644         }
645         entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
646         list_del_init(&entry->lru);
647         /*
648          * Once the lru lock is dropped, the entry might get freed. The
649          * swpoffset is copied to the stack, and entry isn't deref'd again
650          * until the entry is verified to still be alive in the tree.
651          */
652         swpoffset = swp_offset(entry->swpentry);
653         tree = zswap_trees[swp_type(entry->swpentry)];
654         spin_unlock(&pool->lru_lock);
655
656         /* Check for invalidate() race */
657         spin_lock(&tree->lock);
658         if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
659                 ret = -EAGAIN;
660                 goto unlock;
661         }
662         /* Hold a reference to prevent a free during writeback */
663         zswap_entry_get(entry);
664         spin_unlock(&tree->lock);
665
666         ret = zswap_writeback_entry(entry, tree);
667
668         spin_lock(&tree->lock);
669         if (ret) {
670                 /* Writeback failed, put entry back on LRU */
671                 spin_lock(&pool->lru_lock);
672                 list_move(&entry->lru, &pool->lru);
673                 spin_unlock(&pool->lru_lock);
674                 goto put_unlock;
675         }
676
677         /*
678          * Writeback started successfully, the page now belongs to the
679          * swapcache. Drop the entry from zswap - unless invalidate already
680          * took it out while we had the tree->lock released for IO.
681          */
682         zswap_invalidate_entry(tree, entry);
683
684 put_unlock:
685         /* Drop local reference */
686         zswap_entry_put(tree, entry);
687 unlock:
688         spin_unlock(&tree->lock);
689         return ret ? -EAGAIN : 0;
690 }
691
692 static void shrink_worker(struct work_struct *w)
693 {
694         struct zswap_pool *pool = container_of(w, typeof(*pool),
695                                                 shrink_work);
696         int ret, failures = 0;
697
698         do {
699                 ret = zswap_reclaim_entry(pool);
700                 if (ret) {
701                         zswap_reject_reclaim_fail++;
702                         if (ret != -EAGAIN)
703                                 break;
704                         if (++failures == MAX_RECLAIM_RETRIES)
705                                 break;
706                 }
707                 cond_resched();
708         } while (!zswap_can_accept());
709         zswap_pool_put(pool);
710 }
711
712 static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
713 {
714         int i;
715         struct zswap_pool *pool;
716         char name[38]; /* 'zswap' + 32 char (max) num + \0 */
717         gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
718         int ret;
719
720         if (!zswap_has_pool) {
721                 /* if either are unset, pool initialization failed, and we
722                  * need both params to be set correctly before trying to
723                  * create a pool.
724                  */
725                 if (!strcmp(type, ZSWAP_PARAM_UNSET))
726                         return NULL;
727                 if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
728                         return NULL;
729         }
730
731         pool = kzalloc(sizeof(*pool), GFP_KERNEL);
732         if (!pool)
733                 return NULL;
734
735         for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
736                 /* unique name for each pool specifically required by zsmalloc */
737                 snprintf(name, 38, "zswap%x",
738                          atomic_inc_return(&zswap_pools_count));
739
740                 pool->zpools[i] = zpool_create_pool(type, name, gfp);
741                 if (!pool->zpools[i]) {
742                         pr_err("%s zpool not available\n", type);
743                         goto error;
744                 }
745         }
746         pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
747
748         strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
749
750         pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
751         if (!pool->acomp_ctx) {
752                 pr_err("percpu alloc failed\n");
753                 goto error;
754         }
755
756         ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
757                                        &pool->node);
758         if (ret)
759                 goto error;
760         pr_debug("using %s compressor\n", pool->tfm_name);
761
762         /* being the current pool takes 1 ref; this func expects the
763          * caller to always add the new pool as the current pool
764          */
765         kref_init(&pool->kref);
766         INIT_LIST_HEAD(&pool->list);
767         INIT_LIST_HEAD(&pool->lru);
768         spin_lock_init(&pool->lru_lock);
769         INIT_WORK(&pool->shrink_work, shrink_worker);
770
771         zswap_pool_debug("created", pool);
772
773         return pool;
774
775 error:
776         if (pool->acomp_ctx)
777                 free_percpu(pool->acomp_ctx);
778         while (i--)
779                 zpool_destroy_pool(pool->zpools[i]);
780         kfree(pool);
781         return NULL;
782 }
783
784 static struct zswap_pool *__zswap_pool_create_fallback(void)
785 {
786         bool has_comp, has_zpool;
787
788         has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
789         if (!has_comp && strcmp(zswap_compressor,
790                                 CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
791                 pr_err("compressor %s not available, using default %s\n",
792                        zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
793                 param_free_charp(&zswap_compressor);
794                 zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
795                 has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
796         }
797         if (!has_comp) {
798                 pr_err("default compressor %s not available\n",
799                        zswap_compressor);
800                 param_free_charp(&zswap_compressor);
801                 zswap_compressor = ZSWAP_PARAM_UNSET;
802         }
803
804         has_zpool = zpool_has_pool(zswap_zpool_type);
805         if (!has_zpool && strcmp(zswap_zpool_type,
806                                  CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
807                 pr_err("zpool %s not available, using default %s\n",
808                        zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
809                 param_free_charp(&zswap_zpool_type);
810                 zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
811                 has_zpool = zpool_has_pool(zswap_zpool_type);
812         }
813         if (!has_zpool) {
814                 pr_err("default zpool %s not available\n",
815                        zswap_zpool_type);
816                 param_free_charp(&zswap_zpool_type);
817                 zswap_zpool_type = ZSWAP_PARAM_UNSET;
818         }
819
820         if (!has_comp || !has_zpool)
821                 return NULL;
822
823         return zswap_pool_create(zswap_zpool_type, zswap_compressor);
824 }
825
826 static void zswap_pool_destroy(struct zswap_pool *pool)
827 {
828         int i;
829
830         zswap_pool_debug("destroying", pool);
831
832         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
833         free_percpu(pool->acomp_ctx);
834         for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
835                 zpool_destroy_pool(pool->zpools[i]);
836         kfree(pool);
837 }
838
839 static int __must_check zswap_pool_get(struct zswap_pool *pool)
840 {
841         if (!pool)
842                 return 0;
843
844         return kref_get_unless_zero(&pool->kref);
845 }
846
847 static void __zswap_pool_release(struct work_struct *work)
848 {
849         struct zswap_pool *pool = container_of(work, typeof(*pool),
850                                                 release_work);
851
852         synchronize_rcu();
853
854         /* nobody should have been able to get a kref... */
855         WARN_ON(kref_get_unless_zero(&pool->kref));
856
857         /* pool is now off zswap_pools list and has no references. */
858         zswap_pool_destroy(pool);
859 }
860
861 static void __zswap_pool_empty(struct kref *kref)
862 {
863         struct zswap_pool *pool;
864
865         pool = container_of(kref, typeof(*pool), kref);
866
867         spin_lock(&zswap_pools_lock);
868
869         WARN_ON(pool == zswap_pool_current());
870
871         list_del_rcu(&pool->list);
872
873         INIT_WORK(&pool->release_work, __zswap_pool_release);
874         schedule_work(&pool->release_work);
875
876         spin_unlock(&zswap_pools_lock);
877 }
878
879 static void zswap_pool_put(struct zswap_pool *pool)
880 {
881         kref_put(&pool->kref, __zswap_pool_empty);
882 }
883
884 /*********************************
885 * param callbacks
886 **********************************/
887
888 static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
889 {
890         /* no change required */
891         if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
892                 return false;
893         return true;
894 }
895
896 /* val must be a null-terminated string */
897 static int __zswap_param_set(const char *val, const struct kernel_param *kp,
898                              char *type, char *compressor)
899 {
900         struct zswap_pool *pool, *put_pool = NULL;
901         char *s = strstrip((char *)val);
902         int ret = 0;
903         bool new_pool = false;
904
905         mutex_lock(&zswap_init_lock);
906         switch (zswap_init_state) {
907         case ZSWAP_UNINIT:
908                 /* if this is load-time (pre-init) param setting,
909                  * don't create a pool; that's done during init.
910                  */
911                 ret = param_set_charp(s, kp);
912                 break;
913         case ZSWAP_INIT_SUCCEED:
914                 new_pool = zswap_pool_changed(s, kp);
915                 break;
916         case ZSWAP_INIT_FAILED:
917                 pr_err("can't set param, initialization failed\n");
918                 ret = -ENODEV;
919         }
920         mutex_unlock(&zswap_init_lock);
921
922         /* no need to create a new pool, return directly */
923         if (!new_pool)
924                 return ret;
925
926         if (!type) {
927                 if (!zpool_has_pool(s)) {
928                         pr_err("zpool %s not available\n", s);
929                         return -ENOENT;
930                 }
931                 type = s;
932         } else if (!compressor) {
933                 if (!crypto_has_acomp(s, 0, 0)) {
934                         pr_err("compressor %s not available\n", s);
935                         return -ENOENT;
936                 }
937                 compressor = s;
938         } else {
939                 WARN_ON(1);
940                 return -EINVAL;
941         }
942
943         spin_lock(&zswap_pools_lock);
944
945         pool = zswap_pool_find_get(type, compressor);
946         if (pool) {
947                 zswap_pool_debug("using existing", pool);
948                 WARN_ON(pool == zswap_pool_current());
949                 list_del_rcu(&pool->list);
950         }
951
952         spin_unlock(&zswap_pools_lock);
953
954         if (!pool)
955                 pool = zswap_pool_create(type, compressor);
956
957         if (pool)
958                 ret = param_set_charp(s, kp);
959         else
960                 ret = -EINVAL;
961
962         spin_lock(&zswap_pools_lock);
963
964         if (!ret) {
965                 put_pool = zswap_pool_current();
966                 list_add_rcu(&pool->list, &zswap_pools);
967                 zswap_has_pool = true;
968         } else if (pool) {
969                 /* add the possibly pre-existing pool to the end of the pools
970                  * list; if it's new (and empty) then it'll be removed and
971                  * destroyed by the put after we drop the lock
972                  */
973                 list_add_tail_rcu(&pool->list, &zswap_pools);
974                 put_pool = pool;
975         }
976
977         spin_unlock(&zswap_pools_lock);
978
979         if (!zswap_has_pool && !pool) {
980                 /* if initial pool creation failed, and this pool creation also
981                  * failed, maybe both compressor and zpool params were bad.
982                  * Allow changing this param, so pool creation will succeed
983                  * when the other param is changed. We already verified this
984                  * param is ok in the zpool_has_pool() or crypto_has_acomp()
985                  * checks above.
986                  */
987                 ret = param_set_charp(s, kp);
988         }
989
990         /* drop the ref from either the old current pool,
991          * or the new pool we failed to add
992          */
993         if (put_pool)
994                 zswap_pool_put(put_pool);
995
996         return ret;
997 }
998
999 static int zswap_compressor_param_set(const char *val,
1000                                       const struct kernel_param *kp)
1001 {
1002         return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
1003 }
1004
1005 static int zswap_zpool_param_set(const char *val,
1006                                  const struct kernel_param *kp)
1007 {
1008         return __zswap_param_set(val, kp, NULL, zswap_compressor);
1009 }
1010
1011 static int zswap_enabled_param_set(const char *val,
1012                                    const struct kernel_param *kp)
1013 {
1014         int ret = -ENODEV;
1015
1016         /* if this is load-time (pre-init) param setting, only set param. */
1017         if (system_state != SYSTEM_RUNNING)
1018                 return param_set_bool(val, kp);
1019
1020         mutex_lock(&zswap_init_lock);
1021         switch (zswap_init_state) {
1022         case ZSWAP_UNINIT:
1023                 if (zswap_setup())
1024                         break;
1025                 fallthrough;
1026         case ZSWAP_INIT_SUCCEED:
1027                 if (!zswap_has_pool)
1028                         pr_err("can't enable, no pool configured\n");
1029                 else
1030                         ret = param_set_bool(val, kp);
1031                 break;
1032         case ZSWAP_INIT_FAILED:
1033                 pr_err("can't enable, initialization failed\n");
1034         }
1035         mutex_unlock(&zswap_init_lock);
1036
1037         return ret;
1038 }
1039
1040 /*********************************
1041 * writeback code
1042 **********************************/
1043 /*
1044  * Attempts to free an entry by adding a page to the swap cache,
1045  * decompressing the entry data into the page, and issuing a
1046  * bio write to write the page back to the swap device.
1047  *
1048  * This can be thought of as a "resumed writeback" of the page
1049  * to the swap device.  We are basically resuming the same swap
1050  * writeback path that was intercepted with the zswap_store()
1051  * in the first place.  After the page has been decompressed into
1052  * the swap cache, the compressed version stored by zswap can be
1053  * freed.
1054  */
1055 static int zswap_writeback_entry(struct zswap_entry *entry,
1056                                  struct zswap_tree *tree)
1057 {
1058         swp_entry_t swpentry = entry->swpentry;
1059         struct page *page;
1060         struct scatterlist input, output;
1061         struct crypto_acomp_ctx *acomp_ctx;
1062         struct zpool *pool = zswap_find_zpool(entry);
1063         bool page_was_allocated;
1064         u8 *src, *tmp = NULL;
1065         unsigned int dlen;
1066         int ret;
1067         struct writeback_control wbc = {
1068                 .sync_mode = WB_SYNC_NONE,
1069         };
1070
1071         if (!zpool_can_sleep_mapped(pool)) {
1072                 tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
1073                 if (!tmp)
1074                         return -ENOMEM;
1075         }
1076
1077         /* try to allocate swap cache page */
1078         page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
1079                                        &page_was_allocated);
1080         if (!page) {
1081                 ret = -ENOMEM;
1082                 goto fail;
1083         }
1084
1085         /* Found an existing page, we raced with load/swapin */
1086         if (!page_was_allocated) {
1087                 put_page(page);
1088                 ret = -EEXIST;
1089                 goto fail;
1090         }
1091
1092         /*
1093          * Page is locked, and the swapcache is now secured against
1094          * concurrent swapping to and from the slot. Verify that the
1095          * swap entry hasn't been invalidated and recycled behind our
1096          * backs (our zswap_entry reference doesn't prevent that), to
1097          * avoid overwriting a new swap page with old compressed data.
1098          */
1099         spin_lock(&tree->lock);
1100         if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
1101                 spin_unlock(&tree->lock);
1102                 delete_from_swap_cache(page_folio(page));
1103                 ret = -ENOMEM;
1104                 goto fail;
1105         }
1106         spin_unlock(&tree->lock);
1107
1108         /* decompress */
1109         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1110         dlen = PAGE_SIZE;
1111
1112         src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
1113         if (!zpool_can_sleep_mapped(pool)) {
1114                 memcpy(tmp, src, entry->length);
1115                 src = tmp;
1116                 zpool_unmap_handle(pool, entry->handle);
1117         }
1118
1119         mutex_lock(acomp_ctx->mutex);
1120         sg_init_one(&input, src, entry->length);
1121         sg_init_table(&output, 1);
1122         sg_set_page(&output, page, PAGE_SIZE, 0);
1123         acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
1124         ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
1125         dlen = acomp_ctx->req->dlen;
1126         mutex_unlock(acomp_ctx->mutex);
1127
1128         if (!zpool_can_sleep_mapped(pool))
1129                 kfree(tmp);
1130         else
1131                 zpool_unmap_handle(pool, entry->handle);
1132
1133         BUG_ON(ret);
1134         BUG_ON(dlen != PAGE_SIZE);
1135
1136         /* page is up to date */
1137         SetPageUptodate(page);
1138
1139         /* move it to the tail of the inactive list after end_writeback */
1140         SetPageReclaim(page);
1141
1142         /* start writeback */
1143         __swap_writepage(page, &wbc);
1144         put_page(page);
1145         zswap_written_back_pages++;
1146
1147         return ret;
1148
1149 fail:
1150         if (!zpool_can_sleep_mapped(pool))
1151                 kfree(tmp);
1152
1153         /*
1154          * If we get here because the page is already in swapcache, a
1155          * load may be happening concurrently. It is safe and okay to
1156          * not free the entry. It is also okay to return !0.
1157          */
1158         return ret;
1159 }
1160
1161 static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
1162 {
1163         unsigned long *page;
1164         unsigned long val;
1165         unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
1166
1167         page = (unsigned long *)ptr;
1168         val = page[0];
1169
1170         if (val != page[last_pos])
1171                 return 0;
1172
1173         for (pos = 1; pos < last_pos; pos++) {
1174                 if (val != page[pos])
1175                         return 0;
1176         }
1177
1178         *value = val;
1179
1180         return 1;
1181 }
1182
1183 static void zswap_fill_page(void *ptr, unsigned long value)
1184 {
1185         unsigned long *page;
1186
1187         page = (unsigned long *)ptr;
1188         memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
1189 }
1190
1191 bool zswap_store(struct folio *folio)
1192 {
1193         swp_entry_t swp = folio->swap;
1194         int type = swp_type(swp);
1195         pgoff_t offset = swp_offset(swp);
1196         struct page *page = &folio->page;
1197         struct zswap_tree *tree = zswap_trees[type];
1198         struct zswap_entry *entry, *dupentry;
1199         struct scatterlist input, output;
1200         struct crypto_acomp_ctx *acomp_ctx;
1201         struct obj_cgroup *objcg = NULL;
1202         struct zswap_pool *pool;
1203         struct zpool *zpool;
1204         unsigned int dlen = PAGE_SIZE;
1205         unsigned long handle, value;
1206         char *buf;
1207         u8 *src, *dst;
1208         gfp_t gfp;
1209         int ret;
1210
1211         VM_WARN_ON_ONCE(!folio_test_locked(folio));
1212         VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
1213
1214         /* Large folios aren't supported */
1215         if (folio_test_large(folio))
1216                 return false;
1217
1218         if (!zswap_enabled || !tree)
1219                 return false;
1220
1221         /*
1222          * XXX: zswap reclaim does not work with cgroups yet. Without a
1223          * cgroup-aware entry LRU, we will push out entries system-wide based on
1224          * local cgroup limits.
1225          */
1226         objcg = get_obj_cgroup_from_folio(folio);
1227         if (objcg && !obj_cgroup_may_zswap(objcg))
1228                 goto reject;
1229
1230         /* reclaim space if needed */
1231         if (zswap_is_full()) {
1232                 zswap_pool_limit_hit++;
1233                 zswap_pool_reached_full = true;
1234                 goto shrink;
1235         }
1236
1237         if (zswap_pool_reached_full) {
1238                if (!zswap_can_accept())
1239                         goto shrink;
1240                 else
1241                         zswap_pool_reached_full = false;
1242         }
1243
1244         /* allocate entry */
1245         entry = zswap_entry_cache_alloc(GFP_KERNEL);
1246         if (!entry) {
1247                 zswap_reject_kmemcache_fail++;
1248                 goto reject;
1249         }
1250
1251         if (zswap_same_filled_pages_enabled) {
1252                 src = kmap_atomic(page);
1253                 if (zswap_is_page_same_filled(src, &value)) {
1254                         kunmap_atomic(src);
1255                         entry->swpentry = swp_entry(type, offset);
1256                         entry->length = 0;
1257                         entry->value = value;
1258                         atomic_inc(&zswap_same_filled_pages);
1259                         goto insert_entry;
1260                 }
1261                 kunmap_atomic(src);
1262         }
1263
1264         if (!zswap_non_same_filled_pages_enabled)
1265                 goto freepage;
1266
1267         /* if entry is successfully added, it keeps the reference */
1268         entry->pool = zswap_pool_current_get();
1269         if (!entry->pool)
1270                 goto freepage;
1271
1272         /* compress */
1273         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1274
1275         mutex_lock(acomp_ctx->mutex);
1276
1277         dst = acomp_ctx->dstmem;
1278         sg_init_table(&input, 1);
1279         sg_set_page(&input, page, PAGE_SIZE, 0);
1280
1281         /* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */
1282         sg_init_one(&output, dst, PAGE_SIZE * 2);
1283         acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
1284         /*
1285          * it maybe looks a little bit silly that we send an asynchronous request,
1286          * then wait for its completion synchronously. This makes the process look
1287          * synchronous in fact.
1288          * Theoretically, acomp supports users send multiple acomp requests in one
1289          * acomp instance, then get those requests done simultaneously. but in this
1290          * case, zswap actually does store and load page by page, there is no
1291          * existing method to send the second page before the first page is done
1292          * in one thread doing zwap.
1293          * but in different threads running on different cpu, we have different
1294          * acomp instance, so multiple threads can do (de)compression in parallel.
1295          */
1296         ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
1297         dlen = acomp_ctx->req->dlen;
1298
1299         if (ret)
1300                 goto put_dstmem;
1301
1302         /* store */
1303         zpool = zswap_find_zpool(entry);
1304         gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
1305         if (zpool_malloc_support_movable(zpool))
1306                 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
1307         ret = zpool_malloc(zpool, dlen, gfp, &handle);
1308         if (ret == -ENOSPC) {
1309                 zswap_reject_compress_poor++;
1310                 goto put_dstmem;
1311         }
1312         if (ret) {
1313                 zswap_reject_alloc_fail++;
1314                 goto put_dstmem;
1315         }
1316         buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
1317         memcpy(buf, dst, dlen);
1318         zpool_unmap_handle(zpool, handle);
1319         mutex_unlock(acomp_ctx->mutex);
1320
1321         /* populate entry */
1322         entry->swpentry = swp_entry(type, offset);
1323         entry->handle = handle;
1324         entry->length = dlen;
1325
1326 insert_entry:
1327         entry->objcg = objcg;
1328         if (objcg) {
1329                 obj_cgroup_charge_zswap(objcg, entry->length);
1330                 /* Account before objcg ref is moved to tree */
1331                 count_objcg_event(objcg, ZSWPOUT);
1332         }
1333
1334         /* map */
1335         spin_lock(&tree->lock);
1336         while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
1337                 zswap_duplicate_entry++;
1338                 zswap_invalidate_entry(tree, dupentry);
1339         }
1340         if (entry->length) {
1341                 spin_lock(&entry->pool->lru_lock);
1342                 list_add(&entry->lru, &entry->pool->lru);
1343                 spin_unlock(&entry->pool->lru_lock);
1344         }
1345         spin_unlock(&tree->lock);
1346
1347         /* update stats */
1348         atomic_inc(&zswap_stored_pages);
1349         zswap_update_total_size();
1350         count_vm_event(ZSWPOUT);
1351
1352         return true;
1353
1354 put_dstmem:
1355         mutex_unlock(acomp_ctx->mutex);
1356         zswap_pool_put(entry->pool);
1357 freepage:
1358         zswap_entry_cache_free(entry);
1359 reject:
1360         if (objcg)
1361                 obj_cgroup_put(objcg);
1362         return false;
1363
1364 shrink:
1365         pool = zswap_pool_last_get();
1366         if (pool)
1367                 queue_work(shrink_wq, &pool->shrink_work);
1368         goto reject;
1369 }
1370
1371 bool zswap_load(struct folio *folio)
1372 {
1373         swp_entry_t swp = folio->swap;
1374         int type = swp_type(swp);
1375         pgoff_t offset = swp_offset(swp);
1376         struct page *page = &folio->page;
1377         struct zswap_tree *tree = zswap_trees[type];
1378         struct zswap_entry *entry;
1379         struct scatterlist input, output;
1380         struct crypto_acomp_ctx *acomp_ctx;
1381         u8 *src, *dst, *tmp;
1382         struct zpool *zpool;
1383         unsigned int dlen;
1384         bool ret;
1385
1386         VM_WARN_ON_ONCE(!folio_test_locked(folio));
1387
1388         /* find */
1389         spin_lock(&tree->lock);
1390         entry = zswap_entry_find_get(&tree->rbroot, offset);
1391         if (!entry) {
1392                 spin_unlock(&tree->lock);
1393                 return false;
1394         }
1395         spin_unlock(&tree->lock);
1396
1397         if (!entry->length) {
1398                 dst = kmap_atomic(page);
1399                 zswap_fill_page(dst, entry->value);
1400                 kunmap_atomic(dst);
1401                 ret = true;
1402                 goto stats;
1403         }
1404
1405         zpool = zswap_find_zpool(entry);
1406         if (!zpool_can_sleep_mapped(zpool)) {
1407                 tmp = kmalloc(entry->length, GFP_KERNEL);
1408                 if (!tmp) {
1409                         ret = false;
1410                         goto freeentry;
1411                 }
1412         }
1413
1414         /* decompress */
1415         dlen = PAGE_SIZE;
1416         src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
1417
1418         if (!zpool_can_sleep_mapped(zpool)) {
1419                 memcpy(tmp, src, entry->length);
1420                 src = tmp;
1421                 zpool_unmap_handle(zpool, entry->handle);
1422         }
1423
1424         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1425         mutex_lock(acomp_ctx->mutex);
1426         sg_init_one(&input, src, entry->length);
1427         sg_init_table(&output, 1);
1428         sg_set_page(&output, page, PAGE_SIZE, 0);
1429         acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
1430         if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait))
1431                 WARN_ON(1);
1432         mutex_unlock(acomp_ctx->mutex);
1433
1434         if (zpool_can_sleep_mapped(zpool))
1435                 zpool_unmap_handle(zpool, entry->handle);
1436         else
1437                 kfree(tmp);
1438
1439         ret = true;
1440 stats:
1441         count_vm_event(ZSWPIN);
1442         if (entry->objcg)
1443                 count_objcg_event(entry->objcg, ZSWPIN);
1444 freeentry:
1445         spin_lock(&tree->lock);
1446         if (ret && zswap_exclusive_loads_enabled) {
1447                 zswap_invalidate_entry(tree, entry);
1448                 folio_mark_dirty(folio);
1449         } else if (entry->length) {
1450                 spin_lock(&entry->pool->lru_lock);
1451                 list_move(&entry->lru, &entry->pool->lru);
1452                 spin_unlock(&entry->pool->lru_lock);
1453         }
1454         zswap_entry_put(tree, entry);
1455         spin_unlock(&tree->lock);
1456
1457         return ret;
1458 }
1459
1460 void zswap_invalidate(int type, pgoff_t offset)
1461 {
1462         struct zswap_tree *tree = zswap_trees[type];
1463         struct zswap_entry *entry;
1464
1465         /* find */
1466         spin_lock(&tree->lock);
1467         entry = zswap_rb_search(&tree->rbroot, offset);
1468         if (!entry) {
1469                 /* entry was written back */
1470                 spin_unlock(&tree->lock);
1471                 return;
1472         }
1473         zswap_invalidate_entry(tree, entry);
1474         spin_unlock(&tree->lock);
1475 }
1476
1477 void zswap_swapon(int type)
1478 {
1479         struct zswap_tree *tree;
1480
1481         tree = kzalloc(sizeof(*tree), GFP_KERNEL);
1482         if (!tree) {
1483                 pr_err("alloc failed, zswap disabled for swap type %d\n", type);
1484                 return;
1485         }
1486
1487         tree->rbroot = RB_ROOT;
1488         spin_lock_init(&tree->lock);
1489         zswap_trees[type] = tree;
1490 }
1491
1492 void zswap_swapoff(int type)
1493 {
1494         struct zswap_tree *tree = zswap_trees[type];
1495         struct zswap_entry *entry, *n;
1496
1497         if (!tree)
1498                 return;
1499
1500         /* walk the tree and free everything */
1501         spin_lock(&tree->lock);
1502         rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
1503                 zswap_free_entry(entry);
1504         tree->rbroot = RB_ROOT;
1505         spin_unlock(&tree->lock);
1506         kfree(tree);
1507         zswap_trees[type] = NULL;
1508 }
1509
1510 /*********************************
1511 * debugfs functions
1512 **********************************/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

/*
 * Create the "zswap" debugfs directory and expose the statistics counters in
 * it as read-only (0444) files.
 *
 * Returns -ENODEV when debugfs is not initialized, 0 otherwise.
 */
static int zswap_debugfs_init(void)
{
	struct dentry *root;

	if (!debugfs_initialized())
		return -ENODEV;

	root = debugfs_create_dir("zswap", NULL);
	zswap_debugfs_root = root;

	/* u64 counters */
	debugfs_create_u64("pool_limit_hit", 0444, root,
			   &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", 0444, root,
			   &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", 0444, root,
			   &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", 0444, root,
			   &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_poor", 0444, root,
			   &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", 0444, root,
			   &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", 0444, root,
			   &zswap_duplicate_entry);
	debugfs_create_u64("pool_total_size", 0444, root,
			   &zswap_pool_total_size);

	/* atomic_t counters */
	debugfs_create_atomic_t("stored_pages", 0444, root,
				&zswap_stored_pages);
	debugfs_create_atomic_t("same_filled_pages", 0444, root,
				&zswap_same_filled_pages);

	return 0;
}
#else
/* Without CONFIG_DEBUG_FS there is nothing to create. */
static int zswap_debugfs_init(void)
{
	return 0;
}
#endif
1554
1555 /*********************************
1556 * module init and exit
1557 **********************************/
1558 static int zswap_setup(void)
1559 {
1560         struct zswap_pool *pool;
1561         int ret;
1562
1563         zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
1564         if (!zswap_entry_cache) {
1565                 pr_err("entry cache creation failed\n");
1566                 goto cache_fail;
1567         }
1568
1569         ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
1570                                 zswap_dstmem_prepare, zswap_dstmem_dead);
1571         if (ret) {
1572                 pr_err("dstmem alloc failed\n");
1573                 goto dstmem_fail;
1574         }
1575
1576         ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
1577                                       "mm/zswap_pool:prepare",
1578                                       zswap_cpu_comp_prepare,
1579                                       zswap_cpu_comp_dead);
1580         if (ret)
1581                 goto hp_fail;
1582
1583         pool = __zswap_pool_create_fallback();
1584         if (pool) {
1585                 pr_info("loaded using pool %s/%s\n", pool->tfm_name,
1586                         zpool_get_type(pool->zpools[0]));
1587                 list_add(&pool->list, &zswap_pools);
1588                 zswap_has_pool = true;
1589         } else {
1590                 pr_err("pool creation failed\n");
1591                 zswap_enabled = false;
1592         }
1593
1594         shrink_wq = create_workqueue("zswap-shrink");
1595         if (!shrink_wq)
1596                 goto fallback_fail;
1597
1598         if (zswap_debugfs_init())
1599                 pr_warn("debugfs initialization failed\n");
1600         zswap_init_state = ZSWAP_INIT_SUCCEED;
1601         return 0;
1602
1603 fallback_fail:
1604         if (pool)
1605                 zswap_pool_destroy(pool);
1606 hp_fail:
1607         cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
1608 dstmem_fail:
1609         kmem_cache_destroy(zswap_entry_cache);
1610 cache_fail:
1611         /* if built-in, we aren't unloaded on failure; don't allow use */
1612         zswap_init_state = ZSWAP_INIT_FAILED;
1613         zswap_enabled = false;
1614         return -ENOMEM;
1615 }
1616
1617 static int __init zswap_init(void)
1618 {
1619         if (!zswap_enabled)
1620                 return 0;
1621         return zswap_setup();
1622 }
1623 /* must be late so crypto has time to come up */
1624 late_initcall(zswap_init);
1625
1626 MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
1627 MODULE_DESCRIPTION("Compressed cache for swap pages");