/*
 * zcache.c
 *
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 * Copyright (c) 2010,2011, Nitin Gupta
 *
 * Zcache provides an in-kernel "host implementation" for transcendent memory
 * ("tmem") and, thus indirectly, for cleancache and frontswap.  Zcache uses
 * lzo1x compression to improve density and an embedded allocator called
 * "zbud" which "buddies" two compressed pages semi-optimally in each physical
 * pageframe.  Zbud is integrally tied into tmem to allow pageframes to
 * be "reclaimed" efficiently.
 */
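
/*
 * Illustrative sketch (not part of the driver): with PAGE_SIZE == 4096,
 * zbud might pair a zpage that compressed to 1000 bytes with one that
 * compressed to 2800 bytes in a single pageframe, filling the two
 * "buddies" in from opposite ends of the frame.  The numbers here are
 * hypothetical; see zbud.c for the real layout rules.
 */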

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/atomic.h>
#include <linux/math64.h>
#include <linux/crypto.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>

#include <linux/cleancache.h>
#include <linux/frontswap.h>
#include "tmem.h"
#include "zcache.h"
#include "zbud.h"
#include "ramster.h"
#include "debug.h"
#ifdef CONFIG_RAMSTER
static bool ramster_enabled __read_mostly;
#else
#define ramster_enabled false
#endif

#ifndef __PG_WAS_ACTIVE
static inline bool PageWasActive(struct page *page)
{
	return true;
}

static inline void SetPageWasActive(struct page *page)
{
}
#endif

#ifdef FRONTSWAP_HAS_EXCLUSIVE_GETS
static bool frontswap_has_exclusive_gets __read_mostly = true;
#else
static bool frontswap_has_exclusive_gets __read_mostly;
static inline void frontswap_tmem_exclusive_gets(bool b)
{
}
#endif

/*
 * Mark the pampd with a special value so that a later retrieval can
 * identify zero-filled pages.
 */
#define ZERO_FILLED 0x2

/* enable (or fix code) when Seth's patches are accepted upstream */
#define zcache_writeback_enabled 0

static bool zcache_enabled __read_mostly;
static bool disable_cleancache __read_mostly;
static bool disable_frontswap __read_mostly;
static bool disable_frontswap_ignore_nonactive __read_mostly;
static bool disable_cleancache_ignore_nonactive __read_mostly;
static char *namestr __read_mostly = "zcache";

#define ZCACHE_GFP_MASK \
	(__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
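/*
 * Note: this mask lets an allocation enter filesystem reclaim (__GFP_FS)
 * but fail fast and quietly: no hard retries (__GFP_NORETRY), no
 * allocation-failure warnings (__GFP_NOWARN), and no dipping into the
 * emergency reserves (__GFP_NOMEMALLOC).
 */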

/* crypto API for zcache */
#define ZCACHE_COMP_NAME_SZ CRYPTO_MAX_ALG_NAME
static char zcache_comp_name[ZCACHE_COMP_NAME_SZ] __read_mostly;
static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms __read_mostly;

enum comp_op {
	ZCACHE_COMPOP_COMPRESS,
	ZCACHE_COMPOP_DECOMPRESS
};

static inline int zcache_comp_op(enum comp_op op,
				const u8 *src, unsigned int slen,
				u8 *dst, unsigned int *dlen)
{
	struct crypto_comp *tfm;
	int ret = -1;

	BUG_ON(!zcache_comp_pcpu_tfms);
	tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu());
	BUG_ON(!tfm);
	switch (op) {
	case ZCACHE_COMPOP_COMPRESS:
		ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
		break;
	case ZCACHE_COMPOP_DECOMPRESS:
		ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
		break;
	default:
		ret = -EINVAL;
	}
	put_cpu();
	return ret;
}
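
/*
 * Illustrative usage sketch (hypothetical caller; zcache_compress()
 * below is the real one):
 *
 *	unsigned int dlen = PAGE_SIZE << 1;
 *	int err = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, src_va, PAGE_SIZE,
 *				 dst_buf, &dlen);
 *
 * On success, dlen holds the compressed length; src_va and dst_buf are
 * assumed to be valid kernel-mapped buffers.  The per-cpu tfm is pinned
 * internally via get_cpu()/put_cpu().
 */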

/*
 * policy parameters
 */

/*
 * byte count defining poor compression; pages with greater zsize will be
 * rejected
 */
static unsigned int zbud_max_zsize __read_mostly = (PAGE_SIZE / 8) * 7;
/*
 * byte count defining poor *mean* compression; pages with greater zsize
 * will be rejected until sufficient better-compressed pages are accepted
 * driving the mean below this threshold
 */
static unsigned int zbud_max_mean_zsize __read_mostly = (PAGE_SIZE / 8) * 5;
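/*
 * Worked example (assuming PAGE_SIZE == 4096): zbud_max_zsize is
 * (4096/8)*7 == 3584, so any zpage larger than 3584 bytes is rejected
 * outright; zbud_max_mean_zsize is (4096/8)*5 == 2560, so while the
 * running mean zsize exceeds 2560 bytes, further poorly-compressed pages
 * are rejected until better-compressed pages pull the mean back down.
 */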

/*
 * For now, use named slabs so we can easily track usage; later we can
 * either just use kmalloc, or perhaps add a slab-like allocator to
 * more carefully manage total memory utilization.
 */
static struct kmem_cache *zcache_objnode_cache;
static struct kmem_cache *zcache_obj_cache;

static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };

/* Used by debug.c */
ssize_t zcache_pers_zpages;
u64 zcache_pers_zbytes;
ssize_t zcache_eph_pageframes;
ssize_t zcache_pers_pageframes;

/* Used by this code. */
ssize_t zcache_last_active_file_pageframes;
ssize_t zcache_last_inactive_file_pageframes;
ssize_t zcache_last_active_anon_pageframes;
ssize_t zcache_last_inactive_anon_pageframes;
#ifdef CONFIG_ZCACHE_WRITEBACK
ssize_t zcache_writtenback_pages;
ssize_t zcache_outstanding_writeback_pages;
#endif
/*
 * zcache core code starts here
 */

static struct zcache_client zcache_host;
static struct zcache_client zcache_clients[MAX_CLIENTS];

static inline bool is_local_client(struct zcache_client *cli)
{
	return cli == &zcache_host;
}

static struct zcache_client *zcache_get_client_by_id(uint16_t cli_id)
{
	struct zcache_client *cli = &zcache_host;

	if (cli_id != LOCAL_CLIENT) {
		if (cli_id >= MAX_CLIENTS)
			goto out;
		cli = &zcache_clients[cli_id];
	}
out:
	return cli;
}

/*
 * Tmem operations assume the poolid implies the invoking client.
 * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
 * RAMster has each client numbered by cluster node, and a KVM version
 * of zcache would have one client per guest and each client might
 * have a poolid==N.
 */
struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid)
{
	struct tmem_pool *pool = NULL;
	struct zcache_client *cli = NULL;

	cli = zcache_get_client_by_id(cli_id);
	if (cli == NULL)
		goto out;
	if (!is_local_client(cli))
		atomic_inc(&cli->refcount);
	if (poolid < MAX_POOLS_PER_CLIENT) {
		pool = cli->tmem_pools[poolid];
		if (pool != NULL)
			atomic_inc(&pool->refcount);
	}
out:
	return pool;
}

void zcache_put_pool(struct tmem_pool *pool)
{
	struct zcache_client *cli = NULL;

	if (pool == NULL)
		BUG();
	cli = pool->client;
	atomic_dec(&pool->refcount);
	if (!is_local_client(cli))
		atomic_dec(&cli->refcount);
}

int zcache_new_client(uint16_t cli_id)
{
	struct zcache_client *cli;
	int ret = -1;

	cli = zcache_get_client_by_id(cli_id);
	if (cli == NULL)
		goto out;
	if (cli->allocated)
		goto out;
	cli->allocated = 1;
	ret = 0;
out:
	return ret;
}

/*
 * zcache implementation for tmem host ops
 */

static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
{
	struct tmem_objnode *objnode = NULL;
	struct zcache_preload *kp;
	int i;

	kp = &__get_cpu_var(zcache_preloads);
	for (i = 0; i < ARRAY_SIZE(kp->objnodes); i++) {
		objnode = kp->objnodes[i];
		if (objnode != NULL) {
			kp->objnodes[i] = NULL;
			break;
		}
	}
	BUG_ON(objnode == NULL);
	inc_zcache_objnode_count();
	return objnode;
}

static void zcache_objnode_free(struct tmem_objnode *objnode,
					struct tmem_pool *pool)
{
	dec_zcache_objnode_count();
	kmem_cache_free(zcache_objnode_cache, objnode);
}

static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
{
	struct tmem_obj *obj = NULL;
	struct zcache_preload *kp;

	kp = &__get_cpu_var(zcache_preloads);
	obj = kp->obj;
	BUG_ON(obj == NULL);
	kp->obj = NULL;
	inc_zcache_obj_count();
	return obj;
}

static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
{
	dec_zcache_obj_count();
	kmem_cache_free(zcache_obj_cache, obj);
}

/*
 * Compressing a zero-filled page would waste memory and introduce
 * serious fragmentation, so skip it to avoid that overhead.
 */
static bool page_is_zero_filled(struct page *p)
{
	unsigned int pos;
	char *page;

	page = kmap_atomic(p);
	for (pos = 0; pos < PAGE_SIZE / sizeof(*page); pos++) {
		if (page[pos]) {
			kunmap_atomic(page);
			return false;
		}
	}
	kunmap_atomic(page);

	return true;
}

static void handle_zero_filled_page(void *p)
{
	void *user_mem;
	struct page *page = (struct page *)p;

	user_mem = kmap_atomic(page);
	memset(user_mem, 0, PAGE_SIZE);
	kunmap_atomic(user_mem);

	flush_dcache_page(page);
}

static struct tmem_hostops zcache_hostops = {
	.obj_alloc = zcache_obj_alloc,
	.obj_free = zcache_obj_free,
	.objnode_alloc = zcache_objnode_alloc,
	.objnode_free = zcache_objnode_free,
};

static struct page *zcache_alloc_page(void)
{
	struct page *page = alloc_page(ZCACHE_GFP_MASK);

	if (page != NULL)
		inc_zcache_pageframes_alloced();
	return page;
}

static void zcache_free_page(struct page *page)
{
	long curr_pageframes;
	static long max_pageframes, min_pageframes;

	if (page == NULL)
		BUG();
	__free_page(page);
	inc_zcache_pageframes_freed();
	curr_pageframes = curr_pageframes_count();
	if (curr_pageframes > max_pageframes)
		max_pageframes = curr_pageframes;
	if (curr_pageframes < min_pageframes)
		min_pageframes = curr_pageframes;
#ifdef CONFIG_ZCACHE_DEBUG
	if (curr_pageframes > 2L || curr_pageframes < -2L) {
		/* pr_info here */
	}
#endif
}

/*
 * zcache implementations for PAM page descriptor ops
 */

/* forward reference */
static void zcache_compress(struct page *from,
				void **out_va, unsigned *out_len);

static struct page *zcache_evict_eph_pageframe(void);

static void *zcache_pampd_eph_create(char *data, size_t size, bool raw,
					struct tmem_handle *th)
{
	void *pampd = NULL, *cdata = data;
	unsigned clen = size;
	bool zero_filled = false;
	struct page *page = (struct page *)(data), *newpage;

	if (page_is_zero_filled(page)) {
		clen = 0;
		zero_filled = true;
		inc_zcache_zero_filled_pages();
		goto got_pampd;
	}

	if (!raw) {
		zcache_compress(page, &cdata, &clen);
		if (clen > zbud_max_buddy_size()) {
			inc_zcache_compress_poor();
			goto out;
		}
	} else {
		BUG_ON(clen > zbud_max_buddy_size());
	}

	/* look for space via an existing match first */
	pampd = (void *)zbud_match_prep(th, true, cdata, clen);
	if (pampd != NULL)
		goto got_pampd;

	/* no match, now we need to find (or free up) a full page */
	newpage = zcache_alloc_page();
	if (newpage != NULL)
		goto create_in_new_page;

	inc_zcache_failed_getfreepages();
	/* can't allocate a page, evict an ephemeral page via LRU */
	newpage = zcache_evict_eph_pageframe();
	if (newpage == NULL) {
		inc_zcache_eph_ate_tail_failed();
		goto out;
	}
	inc_zcache_eph_ate_tail();

create_in_new_page:
	pampd = (void *)zbud_create_prep(th, true, cdata, clen, newpage);
	BUG_ON(pampd == NULL);
	inc_zcache_eph_pageframes();

got_pampd:
	inc_zcache_eph_zbytes(clen);
	inc_zcache_eph_zpages();
	if (ramster_enabled && raw && !zero_filled)
		ramster_count_foreign_pages(true, 1);
	if (zero_filled)
		pampd = (void *)ZERO_FILLED;
out:
	return pampd;
}

static void *zcache_pampd_pers_create(char *data, size_t size, bool raw,
					struct tmem_handle *th)
{
	void *pampd = NULL, *cdata = data;
	unsigned clen = size;
	bool zero_filled = false;
	struct page *page = (struct page *)(data), *newpage;
	unsigned long zbud_mean_zsize;
	unsigned long curr_pers_zpages, total_zsize;

	if (data == NULL) {
		BUG_ON(!ramster_enabled);
		goto create_pampd;
	}

	if (page_is_zero_filled(page)) {
		clen = 0;
		zero_filled = true;
		inc_zcache_zero_filled_pages();
		goto got_pampd;
	}

	curr_pers_zpages = zcache_pers_zpages;
/* FIXME CONFIG_RAMSTER... subtract atomic remote_pers_pages here? */
	if (!raw)
		zcache_compress(page, &cdata, &clen);
	/* reject if compression is too poor */
	if (clen > zbud_max_zsize) {
		inc_zcache_compress_poor();
		goto out;
	}
	/* reject if mean compression is too poor */
	if ((clen > zbud_max_mean_zsize) && (curr_pers_zpages > 0)) {
		total_zsize = zcache_pers_zbytes;
		if ((long)total_zsize < 0)
			total_zsize = 0;
		zbud_mean_zsize = div_u64(total_zsize,
					curr_pers_zpages);
		if (zbud_mean_zsize > zbud_max_mean_zsize) {
			inc_zcache_mean_compress_poor();
			goto out;
		}
	}

create_pampd:
	/* look for space via an existing match first */
	pampd = (void *)zbud_match_prep(th, false, cdata, clen);
	if (pampd != NULL)
		goto got_pampd;

	/* no match, now we need to find (or free up) a full page */
	newpage = zcache_alloc_page();
	if (newpage != NULL)
		goto create_in_new_page;
	/*
	 * FIXME do the following only if eph is oversized?
	 * if (zcache_eph_pageframes >
	 * (global_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE) +
	 * global_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE)))
	 */
	inc_zcache_failed_getfreepages();
	/* can't allocate a page, evict an ephemeral page via LRU */
	newpage = zcache_evict_eph_pageframe();
	if (newpage == NULL) {
		inc_zcache_pers_ate_eph_failed();
		goto out;
	}
	inc_zcache_pers_ate_eph();

create_in_new_page:
	pampd = (void *)zbud_create_prep(th, false, cdata, clen, newpage);
	BUG_ON(pampd == NULL);
	inc_zcache_pers_pageframes();

got_pampd:
	inc_zcache_pers_zpages();
	inc_zcache_pers_zbytes(clen);
	if (ramster_enabled && raw && !zero_filled)
		ramster_count_foreign_pages(false, 1);
	if (zero_filled)
		pampd = (void *)ZERO_FILLED;
out:
	return pampd;
}

/*
 * This is called directly from zcache_put_page to pre-allocate space
 * to store a zpage.
 */
void *zcache_pampd_create(char *data, unsigned int size, bool raw,
					int eph, struct tmem_handle *th)
{
	void *pampd = NULL;
	struct zcache_preload *kp;
	struct tmem_objnode *objnode;
	struct tmem_obj *obj;
	int i;

	BUG_ON(!irqs_disabled());
	/* pre-allocate per-cpu metadata */
	BUG_ON(zcache_objnode_cache == NULL);
	BUG_ON(zcache_obj_cache == NULL);
	kp = &__get_cpu_var(zcache_preloads);
	for (i = 0; i < ARRAY_SIZE(kp->objnodes); i++) {
		objnode = kp->objnodes[i];
		if (objnode == NULL) {
			objnode = kmem_cache_alloc(zcache_objnode_cache,
							ZCACHE_GFP_MASK);
			if (unlikely(objnode == NULL)) {
				inc_zcache_failed_alloc();
				goto out;
			}
			kp->objnodes[i] = objnode;
		}
	}
	if (kp->obj == NULL) {
		obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
		kp->obj = obj;
	}
	if (unlikely(kp->obj == NULL)) {
		inc_zcache_failed_alloc();
		goto out;
	}
	/*
	 * ok, have all the metadata pre-allocated, now do the data
	 * but since how we allocate the data is dependent on ephemeral
	 * or persistent, we split the call here to different sub-functions
	 */
	if (eph)
		pampd = zcache_pampd_eph_create(data, size, raw, th);
	else
		pampd = zcache_pampd_pers_create(data, size, raw, th);
out:
	return pampd;
}

/*
 * This is a pamops called via tmem_put and is necessary to "finish"
 * a pampd creation.
 */
void zcache_pampd_create_finish(void *pampd, bool eph)
{
	if (pampd != (void *)ZERO_FILLED)
		zbud_create_finish((struct zbudref *)pampd, eph);
}

/*
 * This is passed as a function parameter to zbud_decompress so that
 * zbud need not be familiar with the details of crypto.  It assumes
 * that the bytes from from_va through from_va+size-1 and from to_va
 * through to_va+size-1 are kmapped.  It must succeed, else there is
 * a logic bug somewhere.
 */
static void zcache_decompress(char *from_va, unsigned int size, char *to_va)
{
	int ret;
	unsigned int outlen = PAGE_SIZE;

	ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, from_va, size,
				to_va, &outlen);
	BUG_ON(ret);
	BUG_ON(outlen != PAGE_SIZE);
}

/*
 * Decompress from the kernel va to a pageframe
 */
void zcache_decompress_to_page(char *from_va, unsigned int size,
					struct page *to_page)
{
	char *to_va = kmap_atomic(to_page);
	zcache_decompress(from_va, size, to_va);
	kunmap_atomic(to_va);
}

/*
 * fill the pageframe corresponding to the struct page with the data
 * from the passed pampd
 */
static int zcache_pampd_get_data(char *data, size_t *sizep, bool raw,
					void *pampd, struct tmem_pool *pool,
					struct tmem_oid *oid, uint32_t index)
{
	int ret;
	bool eph = !is_persistent(pool);

	BUG_ON(preemptible());
	BUG_ON(eph);	/* fix later if shared pools get implemented */
	BUG_ON(pampd_is_remote(pampd));

	if (pampd == (void *)ZERO_FILLED) {
		handle_zero_filled_page(data);
		if (!raw)
			*sizep = PAGE_SIZE;
		return 0;
	}

	if (raw)
		ret = zbud_copy_from_zbud(data, (struct zbudref *)pampd,
						sizep, eph);
	else {
		ret = zbud_decompress((struct page *)(data),
					(struct zbudref *)pampd, false,
					zcache_decompress);
		*sizep = PAGE_SIZE;
	}
	return ret;
}

/*
 * fill the pageframe corresponding to the struct page with the data
 * from the passed pampd, then free the pampd (and the underlying
 * pageframe, if this empties it)
 */
static int zcache_pampd_get_data_and_free(char *data, size_t *sizep, bool raw,
					void *pampd, struct tmem_pool *pool,
					struct tmem_oid *oid, uint32_t index)
{
	int ret = 0;
	bool eph = !is_persistent(pool), zero_filled = false;
	struct page *page = NULL;
	unsigned int zsize, zpages;

	BUG_ON(preemptible());
	BUG_ON(pampd_is_remote(pampd));

	if (pampd == (void *)ZERO_FILLED) {
		handle_zero_filled_page(data);
		zero_filled = true;
		zsize = 0;
		zpages = 1;
		if (!raw)
			*sizep = PAGE_SIZE;
		dec_zcache_zero_filled_pages();
		goto zero_fill;
	}

	if (raw)
		ret = zbud_copy_from_zbud(data, (struct zbudref *)pampd,
						sizep, eph);
	else {
		ret = zbud_decompress((struct page *)(data),
					(struct zbudref *)pampd, eph,
					zcache_decompress);
		*sizep = PAGE_SIZE;
	}
	page = zbud_free_and_delist((struct zbudref *)pampd, eph,
					&zsize, &zpages);
zero_fill:
	if (eph) {
		if (page)
			dec_zcache_eph_pageframes();
		dec_zcache_eph_zpages(zpages);
		dec_zcache_eph_zbytes(zsize);
	} else {
		if (page)
			dec_zcache_pers_pageframes();
		dec_zcache_pers_zpages(zpages);
		dec_zcache_pers_zbytes(zsize);
	}
	if (!is_local_client(pool->client) && !zero_filled)
		ramster_count_foreign_pages(eph, -1);
	if (page && !zero_filled)
		zcache_free_page(page);
	return ret;
}

/*
 * Free the pampd and remove it from any zcache lists; the pampd must
 * no longer be pointed to from any tmem data structures!
 */
static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,
			      struct tmem_oid *oid, uint32_t index, bool acct)
{
	struct page *page = NULL;
	unsigned int zsize, zpages;
	bool zero_filled = false;

	BUG_ON(preemptible());

	if (pampd == (void *)ZERO_FILLED) {
		zero_filled = true;
		zsize = 0;
		zpages = 1;
		dec_zcache_zero_filled_pages();
	}

	if (pampd_is_remote(pampd) && !zero_filled) {
		BUG_ON(!ramster_enabled);
		pampd = ramster_pampd_free(pampd, pool, oid, index, acct);
		if (pampd == NULL)
			return;
	}
	if (is_ephemeral(pool)) {
		if (!zero_filled)
			page = zbud_free_and_delist((struct zbudref *)pampd,
						true, &zsize, &zpages);
		if (page)
			dec_zcache_eph_pageframes();
		dec_zcache_eph_zpages(zpages);
		dec_zcache_eph_zbytes(zsize);
		/* FIXME CONFIG_RAMSTER... check acct parameter? */
	} else {
		if (!zero_filled)
			page = zbud_free_and_delist((struct zbudref *)pampd,
						false, &zsize, &zpages);
		if (page)
			dec_zcache_pers_pageframes();
		dec_zcache_pers_zpages(zpages);
		dec_zcache_pers_zbytes(zsize);
	}
	if (!is_local_client(pool->client) && !zero_filled)
		ramster_count_foreign_pages(is_ephemeral(pool), -1);
	if (page && !zero_filled)
		zcache_free_page(page);
}

static struct tmem_pamops zcache_pamops = {
	.create_finish = zcache_pampd_create_finish,
	.get_data = zcache_pampd_get_data,
	.get_data_and_free = zcache_pampd_get_data_and_free,
	.free = zcache_pampd_free,
};

/*
 * zcache compression/decompression and related per-cpu stuff
 */

static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
#define ZCACHE_DSTMEM_ORDER 1

static void zcache_compress(struct page *from, void **out_va, unsigned *out_len)
{
	int ret;
	unsigned char *dmem = __get_cpu_var(zcache_dstmem);
	char *from_va;

	BUG_ON(!irqs_disabled());
	/* no buffer or no compressor so can't compress */
	BUG_ON(dmem == NULL);
	*out_len = PAGE_SIZE << ZCACHE_DSTMEM_ORDER;
	from_va = kmap_atomic(from);
	mb();
	ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, from_va, PAGE_SIZE, dmem,
				out_len);
	BUG_ON(ret);
	*out_va = dmem;
	kunmap_atomic(from_va);
}

static int zcache_comp_cpu_up(int cpu)
{
	struct crypto_comp *tfm;

	tfm = crypto_alloc_comp(zcache_comp_name, 0, 0);
	if (IS_ERR(tfm))
		return NOTIFY_BAD;
	*per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm;
	return NOTIFY_OK;
}

static void zcache_comp_cpu_down(int cpu)
{
	struct crypto_comp *tfm;

	tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu);
	crypto_free_comp(tfm);
	*per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL;
}

static int zcache_cpu_notifier(struct notifier_block *nb,
				unsigned long action, void *pcpu)
{
	int ret, i, cpu = (long)pcpu;
	struct zcache_preload *kp;

	switch (action) {
	case CPU_UP_PREPARE:
		ret = zcache_comp_cpu_up(cpu);
		if (ret != NOTIFY_OK) {
			pr_err("%s: can't allocate compressor xform\n",
				namestr);
			return ret;
		}
		per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
			GFP_KERNEL | __GFP_REPEAT, ZCACHE_DSTMEM_ORDER);
		if (ramster_enabled)
			ramster_cpu_up(cpu);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		zcache_comp_cpu_down(cpu);
		free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
			ZCACHE_DSTMEM_ORDER);
		per_cpu(zcache_dstmem, cpu) = NULL;
		kp = &per_cpu(zcache_preloads, cpu);
		for (i = 0; i < ARRAY_SIZE(kp->objnodes); i++) {
			if (kp->objnodes[i])
				kmem_cache_free(zcache_objnode_cache,
						kp->objnodes[i]);
		}
		if (kp->obj) {
			kmem_cache_free(zcache_obj_cache, kp->obj);
			kp->obj = NULL;
		}
		if (ramster_enabled)
			ramster_cpu_down(cpu);
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block zcache_cpu_notifier_block = {
	.notifier_call = zcache_cpu_notifier
};

/*
 * The following code interacts with the zbud eviction and zbud
 * zombify code to access LRU pages
 */

static struct page *zcache_evict_eph_pageframe(void)
{
	struct page *page;
	unsigned int zsize = 0, zpages = 0;

	page = zbud_evict_pageframe_lru(&zsize, &zpages);
	if (page == NULL)
		goto out;
	dec_zcache_eph_zbytes(zsize);
	dec_zcache_eph_zpages(zpages);
	inc_zcache_evicted_eph_zpages(zpages);
	dec_zcache_eph_pageframes();
	inc_zcache_evicted_eph_pageframes();
out:
	return page;
}

#ifdef CONFIG_ZCACHE_WRITEBACK

static atomic_t zcache_outstanding_writeback_pages_atomic = ATOMIC_INIT(0);

static inline void inc_zcache_outstanding_writeback_pages(void)
{
	zcache_outstanding_writeback_pages =
	    atomic_inc_return(&zcache_outstanding_writeback_pages_atomic);
}
static inline void dec_zcache_outstanding_writeback_pages(void)
{
	zcache_outstanding_writeback_pages =
	    atomic_dec_return(&zcache_outstanding_writeback_pages_atomic);
}
static void unswiz(struct tmem_oid oid, u32 index,
				unsigned *type, pgoff_t *offset);

/*
 * Choose an LRU persistent pageframe and attempt to write it back to
 * the backing swap disk by calling the writeback_zpage helper below on
 * each of its (up to two) zpages.
 *
 * This is work-in-progress.
 */

static void zcache_end_swap_write(struct bio *bio, int err)
{
	end_swap_bio_write(bio, err);
	dec_zcache_outstanding_writeback_pages();
	zcache_writtenback_pages++;
}

/*
 * zcache_get_swap_cache_page
 *
 * This is an adaptation of read_swap_cache_async().
 *
 * On success, the page is returned via new_page.
 * Returns 0 if the page was already in the swap cache; the page is not locked.
 * Returns 1 if the new page needs to be populated; the page is locked.
 */
static int zcache_get_swap_cache_page(int type, pgoff_t offset,
				struct page *new_page)
{
	struct page *found_page;
	swp_entry_t entry = swp_entry(type, offset);
	int err;

	BUG_ON(new_page == NULL);
	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(&swapper_space, entry.val);
		if (found_page)
			return 0;

		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_preload(GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) { /* seems racy */
			radix_tree_preload_end();
			continue;
		}
		if (err) { /* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__set_page_locked(new_page);
		SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			lru_cache_add_anon(new_page);
			return 1;
		}
		radix_tree_preload_end();
		ClearPageSwapBacked(new_page);
		__clear_page_locked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		swapcache_free(entry, NULL);
		/* FIXME: is it possible to get here without err==-ENOMEM?
		 * If not, we can dispense with the do loop, use goto retry */
	} while (err != -ENOMEM);

	return -ENOMEM;
}

/*
 * Given a frontswap zpage in zcache (identified by type/offset) and
 * an empty page, put the page into the swap cache, use frontswap
 * to get the page from zcache into the empty page, then give it
 * to the swap subsystem to send to disk (carefully avoiding the
 * possibility that frontswap might snatch it back).
 * Returns < 0 on error, 0 if successful, and 1 if successful but
 * the newpage passed in is not needed and should be freed.
 */
static int zcache_frontswap_writeback_zpage(int type, pgoff_t offset,
					struct page *newpage)
{
	struct page *page = newpage;
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	ret = zcache_get_swap_cache_page(type, offset, page);
	if (ret < 0)
		return ret;
	else if (ret == 0) {
		/* more uptodate page is already in swapcache */
		__frontswap_invalidate_page(type, offset);
		return 1;
	}

	BUG_ON(!frontswap_has_exclusive_gets); /* load must also invalidate */
	/* FIXME: how is it possible to get here when page is unlocked? */
	__frontswap_load(page);
	SetPageUptodate(page);  /* above does SetPageDirty, is that enough? */

	/* start writeback */
	SetPageReclaim(page);
	/*
	 * Return value is ignored here because it doesn't change anything
	 * for us.  Page is returned unlocked.
	 */
	(void)__swap_writepage(page, &wbc, zcache_end_swap_write);
	page_cache_release(page);
	inc_zcache_outstanding_writeback_pages();

	return 0;
}

/*
 * The following is still a magic number... we want to allow forward progress
 * for writeback because it clears out needed RAM when under pressure, but
 * we don't want to allow writeback to absorb and queue too many GFP_KERNEL
 * pages if the swap device is very slow.
 */
#define ZCACHE_MAX_OUTSTANDING_WRITEBACK_PAGES 6400

/*
 * Try to allocate two free pages, first using a non-aggressive alloc,
 * then by evicting zcache ephemeral (clean pagecache) pages, and last
 * by aggressive GFP_KERNEL alloc.  We allow zbud to choose a pageframe
 * consisting of 1-2 zbuds/zpages, then call the writeback_zpage helper
 * function above for each.
 */
static int zcache_frontswap_writeback(void)
{
	struct tmem_handle th[2];
	int ret = 0;
	int nzbuds, writeback_ret;
	unsigned type;
	struct page *znewpage1 = NULL, *znewpage2 = NULL;
	struct page *evictpage1 = NULL, *evictpage2 = NULL;
	struct page *newpage1 = NULL, *newpage2 = NULL;
	struct page *page1 = NULL, *page2 = NULL;
	pgoff_t offset;

	znewpage1 = alloc_page(ZCACHE_GFP_MASK);
	znewpage2 = alloc_page(ZCACHE_GFP_MASK);
	if (znewpage1 == NULL)
		evictpage1 = zcache_evict_eph_pageframe();
	if (znewpage2 == NULL)
		evictpage2 = zcache_evict_eph_pageframe();

	if ((evictpage1 == NULL || evictpage2 == NULL) &&
	    atomic_read(&zcache_outstanding_writeback_pages_atomic) >
				ZCACHE_MAX_OUTSTANDING_WRITEBACK_PAGES) {
		goto free_and_out;
	}
	if (znewpage1 == NULL && evictpage1 == NULL)
		newpage1 = alloc_page(GFP_KERNEL);
	if (znewpage2 == NULL && evictpage2 == NULL)
		newpage2 = alloc_page(GFP_KERNEL);
	if (newpage1 == NULL || newpage2 == NULL)
		goto free_and_out;

	/* ok, we have two pageframes pre-allocated, get a pair of zbuds */
	nzbuds = zbud_make_zombie_lru(&th[0], NULL, NULL, false);
	if (nzbuds == 0) {
		ret = -ENOENT;
		goto free_and_out;
	}

	/* process the first zbud */
	unswiz(th[0].oid, th[0].index, &type, &offset);
	page1 = (znewpage1 != NULL) ? znewpage1 :
			((newpage1 != NULL) ? newpage1 : evictpage1);
	writeback_ret = zcache_frontswap_writeback_zpage(type, offset, page1);
	if (writeback_ret < 0) {
		ret = -ENOMEM;
		goto free_and_out;
	}
	if (evictpage1 != NULL)
		zcache_pageframes_freed =
			atomic_inc_return(&zcache_pageframes_freed_atomic);
	if (writeback_ret == 0) {
		/* zcache_get_swap_cache_page will free, don't double free */
		znewpage1 = NULL;
		newpage1 = NULL;
		evictpage1 = NULL;
	}
	if (nzbuds < 2)
		goto free_and_out;

	/* if there is a second zbud, process it */
	unswiz(th[1].oid, th[1].index, &type, &offset);
	page2 = (znewpage2 != NULL) ? znewpage2 :
			((newpage2 != NULL) ? newpage2 : evictpage2);
	writeback_ret = zcache_frontswap_writeback_zpage(type, offset, page2);
	if (writeback_ret < 0) {
		ret = -ENOMEM;
		goto free_and_out;
	}
	if (evictpage2 != NULL)
		zcache_pageframes_freed =
			atomic_inc_return(&zcache_pageframes_freed_atomic);
	if (writeback_ret == 0) {
		znewpage2 = NULL;
		newpage2 = NULL;
		evictpage2 = NULL;
	}

free_and_out:
	if (znewpage1 != NULL)
		page_cache_release(znewpage1);
	if (znewpage2 != NULL)
		page_cache_release(znewpage2);
	if (newpage1 != NULL)
		page_cache_release(newpage1);
	if (newpage2 != NULL)
		page_cache_release(newpage2);
	if (evictpage1 != NULL)
		zcache_free_page(evictpage1);
	if (evictpage2 != NULL)
		zcache_free_page(evictpage2);
	return ret;
}
#endif /* CONFIG_ZCACHE_WRITEBACK */

/*
 * When zcache is disabled ("frozen"), pools can be created and destroyed,
 * but all puts (and thus all other operations that require memory
 * allocation) must fail.  If zcache is unfrozen, accepts puts, and is
 * then frozen again, data consistency requires that any put arriving
 * while frozen be converted into a flush.
 */
static bool zcache_freeze;

/*
 * This zcache shrinker interface reduces the number of ephemeral pageframes
 * used by zcache to approximately the same as the total number of LRU_FILE
 * pageframes in use, and now also reduces the number of persistent pageframes
 * used by zcache to approximately the same as the total number of LRU_ANON
 * pageframes in use.  FIXME POLICY: Probably the writeback should only occur
 * if the eviction doesn't free enough pages.
 */
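/*
 * For example (hypothetical numbers): if zcache holds 1200 ephemeral
 * pageframes while the file LRU lists total 1000 pageframes, the next
 * shrink pass tries to evict 1200 - 1000 = 200 ephemeral pageframes;
 * the analogous arithmetic drives persistent-page writeback against
 * the anon LRU totals.
 */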
static int shrink_zcache_memory(struct shrinker *shrink,
				struct shrink_control *sc)
{
	static bool in_progress;
	int ret = -1;
	int nr = sc->nr_to_scan;
	int nr_evict = 0;
	int nr_writeback = 0;
	struct page *page;
	int file_pageframes_inuse, anon_pageframes_inuse;

	if (nr <= 0)
		goto skip_evict;

	/* don't allow more than one eviction thread at a time */
	if (in_progress)
		goto skip_evict;

	in_progress = true;

	/* we are going to ignore nr, and target a different value */
	zcache_last_active_file_pageframes =
		global_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE);
	zcache_last_inactive_file_pageframes =
		global_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE);
	file_pageframes_inuse = zcache_last_active_file_pageframes +
				zcache_last_inactive_file_pageframes;
	if (zcache_eph_pageframes > file_pageframes_inuse)
		nr_evict = zcache_eph_pageframes - file_pageframes_inuse;
	else
		nr_evict = 0;
	while (nr_evict-- > 0) {
		page = zcache_evict_eph_pageframe();
		if (page == NULL)
			break;
		zcache_free_page(page);
	}

	zcache_last_active_anon_pageframes =
		global_page_state(NR_LRU_BASE + LRU_ACTIVE_ANON);
	zcache_last_inactive_anon_pageframes =
		global_page_state(NR_LRU_BASE + LRU_INACTIVE_ANON);
	anon_pageframes_inuse = zcache_last_active_anon_pageframes +
				zcache_last_inactive_anon_pageframes;
	if (zcache_pers_pageframes > anon_pageframes_inuse)
		nr_writeback = zcache_pers_pageframes - anon_pageframes_inuse;
	else
		nr_writeback = 0;
	while (nr_writeback-- > 0) {
#ifdef CONFIG_ZCACHE_WRITEBACK
		int writeback_ret;
		writeback_ret = zcache_frontswap_writeback();
		if (writeback_ret == -ENOMEM)
#endif
			break;
	}
	in_progress = false;

skip_evict:
	/* resample: the counts have changed, but maybe not all the way yet */
	zcache_last_active_file_pageframes =
		global_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE);
	zcache_last_inactive_file_pageframes =
		global_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE);
	ret = zcache_eph_pageframes - (zcache_last_active_file_pageframes +
		zcache_last_inactive_file_pageframes);
	if (ret < 0)
		ret = 0;
	return ret;
}

static struct shrinker zcache_shrinker = {
	.shrink = shrink_zcache_memory,
	.seeks = DEFAULT_SEEKS,
};
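
/*
 * (The shrinker is registered from the driver's init path, which is not
 * part of this excerpt, presumably via register_shrinker(&zcache_shrinker).)
 */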

/*
 * zcache shims between cleancache/frontswap ops and tmem
 */

/* FIXME rename these core routines to zcache_tmemput etc? */
int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp,
				uint32_t index, void *page,
				unsigned int size, bool raw, int ephemeral)
{
	struct tmem_pool *pool;
	struct tmem_handle th;
	int ret = -1;
	void *pampd = NULL;

	BUG_ON(!irqs_disabled());
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (unlikely(pool == NULL))
		goto out;
	if (!zcache_freeze) {
		ret = 0;
		th.client_id = cli_id;
		th.pool_id = pool_id;
		th.oid = *oidp;
		th.index = index;
		pampd = zcache_pampd_create((char *)page, size, raw,
				ephemeral, &th);
		if (pampd == NULL) {
			ret = -ENOMEM;
			if (ephemeral)
				inc_zcache_failed_eph_puts();
			else
				inc_zcache_failed_pers_puts();
		} else {
			if (ramster_enabled)
				ramster_do_preload_flnode(pool);
			ret = tmem_put(pool, oidp, index, 0, pampd);
			if (ret < 0)
				BUG();
		}
		zcache_put_pool(pool);
	} else {
		inc_zcache_put_to_flush();
		if (ramster_enabled)
			ramster_do_preload_flnode(pool);
		if (atomic_read(&pool->obj_count) > 0)
			/* the put fails whether the flush succeeds or not */
			(void)tmem_flush_page(pool, oidp, index);
		zcache_put_pool(pool);
	}
out:
	return ret;
}

int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp,
				uint32_t index, void *page,
				size_t *sizep, bool raw, int get_and_free)
{
	struct tmem_pool *pool;
	int ret = -1;

	if (!raw) {
		BUG_ON(irqs_disabled());
		BUG_ON(in_softirq());
	}
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_get(pool, oidp, index, (char *)(page),
					sizep, raw, get_and_free);
		zcache_put_pool(pool);
	}
	WARN_ONCE((pool != NULL && !is_ephemeral(pool) && (ret != 0)),
			"zcache_get fails on persistent pool, "
			"bad things are very likely to happen soon\n");
#ifdef RAMSTER_TESTING
	if (ret != 0 && ret != -1 && !(ret == -EINVAL && is_ephemeral(pool)))
		pr_err("TESTING zcache_get tmem_get returns ret=%d\n", ret);
#endif
	return ret;
}

int zcache_flush_page(int cli_id, int pool_id,
				struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_pool *pool;
	int ret = -1;
	unsigned long flags;

	local_irq_save(flags);
	inc_zcache_flush_total();
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (ramster_enabled)
		ramster_do_preload_flnode(pool);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_flush_page(pool, oidp, index);
		zcache_put_pool(pool);
	}
	if (ret >= 0)
		inc_zcache_flush_found();
	local_irq_restore(flags);
	return ret;
}

int zcache_flush_object(int cli_id, int pool_id,
				struct tmem_oid *oidp)
{
	struct tmem_pool *pool;
	int ret = -1;
	unsigned long flags;

	local_irq_save(flags);
	inc_zcache_flobj_total();
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (ramster_enabled)
		ramster_do_preload_flnode(pool);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_flush_object(pool, oidp);
		zcache_put_pool(pool);
	}
	if (ret >= 0)
		inc_zcache_flobj_found();
	local_irq_restore(flags);
	return ret;
}

static int zcache_client_destroy_pool(int cli_id, int pool_id)
{
	struct tmem_pool *pool = NULL;
	struct zcache_client *cli = NULL;
	int ret = -1;

	if (pool_id < 0)
		goto out;
	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else if ((unsigned int)cli_id < MAX_CLIENTS)
		cli = &zcache_clients[cli_id];
	if (cli == NULL)
		goto out;
	atomic_inc(&cli->refcount);
	pool = cli->tmem_pools[pool_id];
	if (pool == NULL)
		goto out;
	cli->tmem_pools[pool_id] = NULL;
	/* wait for pool activity on other cpus to quiesce */
	while (atomic_read(&pool->refcount) != 0)
		;
	atomic_dec(&cli->refcount);
	local_bh_disable();
	ret = tmem_destroy_pool(pool);
	local_bh_enable();
	kfree(pool);
	if (cli_id == LOCAL_CLIENT)
		pr_info("%s: destroyed local pool id=%d\n", namestr, pool_id);
	else
		pr_info("%s: destroyed pool id=%d, client=%d\n",
				namestr, pool_id, cli_id);
out:
	return ret;
}

int zcache_new_pool(uint16_t cli_id, uint32_t flags)
{
	int poolid = -1;
	struct tmem_pool *pool;
	struct zcache_client *cli = NULL;

	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else if ((unsigned int)cli_id < MAX_CLIENTS)
		cli = &zcache_clients[cli_id];
	if (cli == NULL)
		goto out;
	atomic_inc(&cli->refcount);
	pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC);
	if (pool == NULL)
		goto out;

	for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
		if (cli->tmem_pools[poolid] == NULL)
			break;
	if (poolid >= MAX_POOLS_PER_CLIENT) {
		pr_info("%s: pool creation failed: max exceeded\n", namestr);
		kfree(pool);
		poolid = -1;
		goto out;
	}
	atomic_set(&pool->refcount, 0);
	pool->client = cli;
	pool->pool_id = poolid;
	tmem_new_pool(pool, flags);
	cli->tmem_pools[poolid] = pool;
	if (cli_id == LOCAL_CLIENT)
		pr_info("%s: created %s local tmem pool, id=%d\n", namestr,
			flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
			poolid);
	else
		pr_info("%s: created %s tmem pool, id=%d, client=%d\n", namestr,
			flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
			poolid, cli_id);
out:
	if (cli != NULL)
		atomic_dec(&cli->refcount);
	return poolid;
}

static int zcache_local_new_pool(uint32_t flags)
{
	return zcache_new_pool(LOCAL_CLIENT, flags);
}

int zcache_autocreate_pool(unsigned int cli_id, unsigned int pool_id, bool eph)
{
	struct tmem_pool *pool;
	struct zcache_client *cli = NULL;
	uint32_t flags = eph ? 0 : TMEM_POOL_PERSIST;
	int ret = -1;

	BUG_ON(!ramster_enabled);
	if (cli_id == LOCAL_CLIENT)
		goto out;
	if (pool_id >= MAX_POOLS_PER_CLIENT)
		goto out;
	if (cli_id >= MAX_CLIENTS)
		goto out;

	cli = &zcache_clients[cli_id];
	if ((eph && disable_cleancache) || (!eph && disable_frontswap)) {
		pr_err("zcache_autocreate_pool: pool type disabled\n");
		goto out;
	}
	if (!cli->allocated) {
		if (zcache_new_client(cli_id)) {
			pr_err("zcache_autocreate_pool: can't create client\n");
			goto out;
		}
		cli = &zcache_clients[cli_id];
	}
	atomic_inc(&cli->refcount);
	pool = cli->tmem_pools[pool_id];
	if (pool != NULL) {
		if (pool->persistent && eph) {
			pr_err("zcache_autocreate_pool: type mismatch\n");
			goto out;
		}
		ret = 0;
		goto out;
	}
	pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
	if (pool == NULL)
		goto out;

	atomic_set(&pool->refcount, 0);
	pool->client = cli;
	pool->pool_id = pool_id;
	tmem_new_pool(pool, flags);
	cli->tmem_pools[pool_id] = pool;
	pr_info("%s: AUTOcreated %s tmem poolid=%d, for remote client=%d\n",
		namestr, flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
		pool_id, cli_id);
	ret = 0;
out:
	if (cli != NULL)
		atomic_dec(&cli->refcount);
	return ret;
}

/**********
 * Two kernel subsystems can currently be layered on top of tmem.
 * These are "cleancache", which is used as a second-chance cache for clean
 * page cache pages, and "frontswap", which is used for swap pages
 * to avoid writes to disk.  A generic "shim" is provided here for each
 * to translate in-kernel semantics to zcache semantics.
 */
1491
1492 static void zcache_cleancache_put_page(int pool_id,
1493                                         struct cleancache_filekey key,
1494                                         pgoff_t index, struct page *page)
1495 {
1496         u32 ind = (u32) index;
1497         struct tmem_oid oid = *(struct tmem_oid *)&key;
1498
1499         if (!disable_cleancache_ignore_nonactive && !PageWasActive(page)) {
1500                 inc_zcache_eph_nonactive_puts_ignored();
1501                 return;
1502         }
1503         if (likely(ind == index))
1504                 (void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index,
1505                                         page, PAGE_SIZE, false, 1);
1506 }
1507
1508 static int zcache_cleancache_get_page(int pool_id,
1509                                         struct cleancache_filekey key,
1510                                         pgoff_t index, struct page *page)
1511 {
1512         u32 ind = (u32) index;
1513         struct tmem_oid oid = *(struct tmem_oid *)&key;
1514         size_t size;
1515         int ret = -1;
1516
1517         if (likely(ind == index)) {
1518                 ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index,
1519                                         page, &size, false, 0);
1520                 BUG_ON(ret >= 0 && size != PAGE_SIZE);
1521                 if (ret == 0)
1522                         SetPageWasActive(page);
1523         }
1524         return ret;
1525 }
1526
1527 static void zcache_cleancache_flush_page(int pool_id,
1528                                         struct cleancache_filekey key,
1529                                         pgoff_t index)
1530 {
1531         u32 ind = (u32) index;
1532         struct tmem_oid oid = *(struct tmem_oid *)&key;
1533
1534         if (likely(ind == index))
1535                 (void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind);
1536 }
1537
1538 static void zcache_cleancache_flush_inode(int pool_id,
1539                                         struct cleancache_filekey key)
1540 {
1541         struct tmem_oid oid = *(struct tmem_oid *)&key;
1542
1543         (void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid);
1544 }
1545
1546 static void zcache_cleancache_flush_fs(int pool_id)
1547 {
1548         if (pool_id >= 0)
1549                 (void)zcache_client_destroy_pool(LOCAL_CLIENT, pool_id);
1550 }
1551
1552 static int zcache_cleancache_init_fs(size_t pagesize)
1553 {
1554         BUG_ON(sizeof(struct cleancache_filekey) !=
1555                                 sizeof(struct tmem_oid));
1556         BUG_ON(pagesize != PAGE_SIZE);
1557         return zcache_local_new_pool(0);
1558 }
1559
1560 static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
1561 {
1562         /* shared pools are unsupported and map to private */
1563         BUG_ON(sizeof(struct cleancache_filekey) !=
1564                                 sizeof(struct tmem_oid));
1565         BUG_ON(pagesize != PAGE_SIZE);
1566         return zcache_local_new_pool(0);
1567 }
1568
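/*
 * Note that the cleancache API uses "invalidate" terminology while the
 * shims above retain tmem's older "flush" terminology; the ops table
 * below simply maps one onto the other.
 */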
1569 static struct cleancache_ops zcache_cleancache_ops = {
1570         .put_page = zcache_cleancache_put_page,
1571         .get_page = zcache_cleancache_get_page,
1572         .invalidate_page = zcache_cleancache_flush_page,
1573         .invalidate_inode = zcache_cleancache_flush_inode,
1574         .invalidate_fs = zcache_cleancache_flush_fs,
1575         .init_shared_fs = zcache_cleancache_init_shared_fs,
1576         .init_fs = zcache_cleancache_init_fs
1577 };
1578
1579 struct cleancache_ops zcache_cleancache_register_ops(void)
1580 {
1581         struct cleancache_ops old_ops =
1582                 cleancache_register_ops(&zcache_cleancache_ops);
1583
1584         return old_ops;
1585 }
1586
1587 /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1588 static int zcache_frontswap_poolid __read_mostly = -1;
1589
1590 /*
1591  * Swizzling increases objects per swaptype, increasing tmem concurrency
1592  * for heavy swaploads.  Later, a larger nr_cpus may warrant a larger
1593  * SWIZ_BITS.  Setting SWIZ_BITS to 27 would basically reconstruct the
1594  * swap entry in frontswap_get_page() but has side-effects; hence 8 is used.
1595  */
1596 #define SWIZ_BITS               8
1597 #define SWIZ_MASK               ((1 << SWIZ_BITS) - 1)
1598 #define _oswiz(_type, _ind)     (((_type) << SWIZ_BITS) | ((_ind) & SWIZ_MASK))
1599 #define iswiz(_ind)             ((_ind) >> SWIZ_BITS)
1600
1601 static inline struct tmem_oid oswiz(unsigned type, u32 ind)
1602 {
1603         struct tmem_oid oid = { .oid = { 0 } };
1604         oid.oid[0] = _oswiz(type, ind);
1605         return oid;
1606 }
1607
1608 #ifdef CONFIG_ZCACHE_WRITEBACK
1609 static void unswiz(struct tmem_oid oid, u32 index,
1610                                 unsigned *type, pgoff_t *offset)
1611 {
1612         *type = (unsigned)(oid.oid[0] >> SWIZ_BITS);
1613         *offset = (pgoff_t)((index << SWIZ_BITS) |
1614                         (oid.oid[0] & SWIZ_MASK));
1615 }
1616 #endif
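
/*
 * A worked example (illustrative only), with SWIZ_BITS == 8: a put of
 * swap type 1 at offset 0x12345 has ind == 0x12345, so
 *
 *   oswiz(1, 0x12345).oid[0] == (1 << 8) | 0x45 == 0x145
 *   iswiz(0x12345)           == 0x12345 >> 8    == 0x123
 *
 * and unswiz() recombines them: (0x123 << 8) | (0x145 & SWIZ_MASK) ==
 * 0x12345, with type == 0x145 >> 8 == 1.  At most SWIZ_MASK + 1 (256)
 * tmem objects thus exist per swap type, which is why
 * zcache_frontswap_flush_area() below iterates ind from SWIZ_MASK to 0.
 */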
1617
1618 static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
1619                                         struct page *page)
1620 {
1621         u64 ind64 = (u64)offset;
1622         u32 ind = (u32)offset;
1623         struct tmem_oid oid = oswiz(type, ind);
1624         int ret = -1;
1625         unsigned long flags;
1626
1627         BUG_ON(!PageLocked(page));
1628         if (!disable_frontswap_ignore_nonactive && !PageWasActive(page)) {
1629                 inc_zcache_pers_nonactive_puts_ignored();
1630                 ret = -ERANGE;
1631                 goto out;
1632         }
1633         if (likely(ind64 == ind)) {
1634                 local_irq_save(flags);
1635                 ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid,
1636                                         &oid, iswiz(ind),
1637                                         page, PAGE_SIZE, false, 0);
1638                 local_irq_restore(flags);
1639         }
1640 out:
1641         return ret;
1642 }
1643
1644 /* returns 0 if the page was successfully gotten from frontswap, -1 if
1645  * it was not present (should never happen!) */
1646 static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
1647                                         struct page *page)
1648 {
1649         u64 ind64 = (u64)offset;
1650         u32 ind = (u32)offset;
1651         struct tmem_oid oid = oswiz(type, ind);
1652         size_t size;
1653         int ret = -1, get_and_free;
1654
1655         if (frontswap_has_exclusive_gets)
1656                 get_and_free = 1;
1657         else
1658                 get_and_free = -1;
1659         BUG_ON(!PageLocked(page));
1660         if (likely(ind64 == ind)) {
1661                 ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid,
1662                                         &oid, iswiz(ind),
1663                                         page, &size, false, get_and_free);
1664                 BUG_ON(ret >= 0 && size != PAGE_SIZE);
1665         }
1666         return ret;
1667 }
1668
1669 /* flush a single page from frontswap */
1670 static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
1671 {
1672         u64 ind64 = (u64)offset;
1673         u32 ind = (u32)offset;
1674         struct tmem_oid oid = oswiz(type, ind);
1675
1676         if (likely(ind64 == ind))
1677                 (void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid,
1678                                         &oid, iswiz(ind));
1679 }
1680
1681 /* flush all pages from the passed swaptype */
1682 static void zcache_frontswap_flush_area(unsigned type)
1683 {
1684         struct tmem_oid oid;
1685         int ind;
1686
1687         for (ind = SWIZ_MASK; ind >= 0; ind--) {
1688                 oid = oswiz(type, ind);
1689                 (void)zcache_flush_object(LOCAL_CLIENT,
1690                                                 zcache_frontswap_poolid, &oid);
1691         }
1692 }
1693
1694 static void zcache_frontswap_init(unsigned ignored)
1695 {
1696         /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1697         if (zcache_frontswap_poolid < 0)
1698                 zcache_frontswap_poolid =
1699                         zcache_local_new_pool(TMEM_POOL_PERSIST);
1700 }
1701
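/*
 * As with cleancache above, the frontswap API's "store"/"load"/
 * "invalidate" hooks simply map onto zcache's put/get/flush shims.
 */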
1702 static struct frontswap_ops zcache_frontswap_ops = {
1703         .store = zcache_frontswap_put_page,
1704         .load = zcache_frontswap_get_page,
1705         .invalidate_page = zcache_frontswap_flush_page,
1706         .invalidate_area = zcache_frontswap_flush_area,
1707         .init = zcache_frontswap_init
1708 };
1709
1710 struct frontswap_ops zcache_frontswap_register_ops(void)
1711 {
1712         struct frontswap_ops old_ops =
1713                 frontswap_register_ops(&zcache_frontswap_ops);
1714
1715         return old_ops;
1716 }
1717
1718 /*
1719  * zcache initialization
1720  * NOTE: for now, "zcache" or "ramster" MUST BE PROVIDED AS A KERNEL BOOT
1721  * PARAMETER OR NOTHING HAPPENS!
1722  */
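
/*
 * Example kernel command lines (illustrative; see the __setup() handlers
 * below):
 *
 *   zcache                  enable zcache with the default (lzo) compressor
 *   zcache=deflate          enable zcache with the named crypto compressor,
 *                           falling back to lzo if it is unavailable
 *   ramster                 enable ramster (which also enables zcache)
 *   zcache nofrontswap      enable zcache for cleancache only
 */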
1723
1724 static int __init enable_zcache(char *s)
1725 {
1726         zcache_enabled = true;
1727         return 1;
1728 }
1729 __setup("zcache", enable_zcache);
1730
1731 static int __init enable_ramster(char *s)
1732 {
1733         zcache_enabled = true;
1734 #ifdef CONFIG_RAMSTER
1735         ramster_enabled = true;
1736 #endif
1737         return 1;
1738 }
1739 __setup("ramster", enable_ramster);
1740
1741 /* allow independent dynamic disabling of cleancache and frontswap */
1742
1743 static int __init no_cleancache(char *s)
1744 {
1745         disable_cleancache = true;
1746         return 1;
1747 }
1748
1749 __setup("nocleancache", no_cleancache);
1750
1751 static int __init no_frontswap(char *s)
1752 {
1753         disable_frontswap = true;
1754         return 1;
1755 }
1756
1757 __setup("nofrontswap", no_frontswap);
1758
1759 static int __init no_frontswap_exclusive_gets(char *s)
1760 {
1761         frontswap_has_exclusive_gets = false;
1762         return 1;
1763 }
1764
1765 __setup("nofrontswapexclusivegets", no_frontswap_exclusive_gets);
1766
1767 static int __init no_frontswap_ignore_nonactive(char *s)
1768 {
1769         disable_frontswap_ignore_nonactive = true;
1770         return 1;
1771 }
1772
1773 __setup("nofrontswapignorenonactive", no_frontswap_ignore_nonactive);
1774
1775 static int __init no_cleancache_ignore_nonactive(char *s)
1776 {
1777         disable_cleancache_ignore_nonactive = true;
1778         return 1;
1779 }
1780
1781 __setup("nocleancacheignorenonactive", no_cleancache_ignore_nonactive);
1782
1783 static int __init enable_zcache_compressor(char *s)
1784 {
1785         strlcpy(zcache_comp_name, s, sizeof(zcache_comp_name));
1786         zcache_enabled = true;
1787         return 1;
1788 }
1789 __setup("zcache=", enable_zcache_compressor);
1790
1791
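/*
 * Pick the compressor (the boot-selected one if available, else "lzo")
 * and allocate the percpu transform pointers.  The transforms themselves
 * are expected to be created per cpu via the cpu notifier (note the
 * CPU_UP_PREPARE calls in zcache_init() below); this only reserves slots.
 */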
1792 static int __init zcache_comp_init(void)
1793 {
1794         int ret = 0;
1795
1796         /* check crypto algorithm */
1797         if (*zcache_comp_name != '\0') {
1798                 ret = crypto_has_comp(zcache_comp_name, 0, 0);
1799                 if (!ret)
1800                         pr_info("zcache: %s not supported\n",
1801                                         zcache_comp_name);
1802         }
1803         if (!ret)
1804                 strcpy(zcache_comp_name, "lzo");
1805         ret = crypto_has_comp(zcache_comp_name, 0, 0);
1806         if (!ret) {
1807                 ret = 1;
1808                 goto out;
1809         }
1810         pr_info("zcache: using %s compressor\n", zcache_comp_name);
1811
1812         /* alloc percpu transforms */
1813         ret = 0;
1814         zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
1815         if (!zcache_comp_pcpu_tfms)
1816                 ret = 1;
1817 out:
1818         return ret;
1819 }
1820
1821 static int __init zcache_init(void)
1822 {
1823         int ret = 0;
1824
1825         if (ramster_enabled) {
1826                 namestr = "ramster";
1827                 ramster_register_pamops(&zcache_pamops);
1828         }
1829         zcache_debugfs_init();
1830         if (zcache_enabled) {
1831                 unsigned int cpu;
1832
1833                 tmem_register_hostops(&zcache_hostops);
1834                 tmem_register_pamops(&zcache_pamops);
1835                 ret = register_cpu_notifier(&zcache_cpu_notifier_block);
1836                 if (ret) {
1837                         pr_err("%s: can't register cpu notifier\n", namestr);
1838                         goto out;
1839                 }
1840                 ret = zcache_comp_init();
1841                 if (ret) {
1842                         pr_err("%s: compressor initialization failed\n",
1843                                 namestr);
1844                         goto out;
1845                 }
1846                 for_each_online_cpu(cpu) {
1847                         void *pcpu = (void *)(long)cpu;
1848                         zcache_cpu_notifier(&zcache_cpu_notifier_block,
1849                                 CPU_UP_PREPARE, pcpu);
1850                 }
1851         }
1852         zcache_objnode_cache = kmem_cache_create("zcache_objnode",
1853                                 sizeof(struct tmem_objnode), 0, 0, NULL);
1854         zcache_obj_cache = kmem_cache_create("zcache_obj",
1855                                 sizeof(struct tmem_obj), 0, 0, NULL);
1856         ret = zcache_new_client(LOCAL_CLIENT);
1857         if (ret) {
1858                 pr_err("%s: can't create client\n", namestr);
1859                 goto out;
1860         }
1861         zbud_init();
1862         if (zcache_enabled && !disable_cleancache) {
1863                 struct cleancache_ops old_ops;
1864
1865                 register_shrinker(&zcache_shrinker);
1866                 old_ops = zcache_cleancache_register_ops();
1867                 pr_info("%s: cleancache enabled using kernel transcendent "
1868                         "memory and compression buddies\n", namestr);
1869 #ifdef CONFIG_ZCACHE_DEBUG
1870                 pr_info("%s: cleancache: ignorenonactive = %d\n",
1871                         namestr, !disable_cleancache_ignore_nonactive);
1872 #endif
1873                 if (old_ops.init_fs != NULL)
1874                         pr_warn("%s: cleancache_ops overridden\n", namestr);
1875         }
1876         if (zcache_enabled && !disable_frontswap) {
1877                 struct frontswap_ops old_ops;
1878
1879                 old_ops = zcache_frontswap_register_ops();
1880                 if (frontswap_has_exclusive_gets)
1881                         frontswap_tmem_exclusive_gets(true);
1882                 pr_info("%s: frontswap enabled using kernel transcendent "
1883                         "memory and compression buddies\n", namestr);
1884 #ifdef CONFIG_ZCACHE_DEBUG
1885                 pr_info("%s: frontswap: excl gets = %d active only = %d\n",
1886                         namestr, frontswap_has_exclusive_gets,
1887                         !disable_frontswap_ignore_nonactive);
1888 #endif
1889                 if (old_ops.init != NULL)
1890                         pr_warn("%s: frontswap_ops overridden\n", namestr);
1891         }
1892         if (ramster_enabled)
1893                 ramster_init(!disable_cleancache, !disable_frontswap,
1894                                 frontswap_has_exclusive_gets);
1895 out:
1896         return ret;
1897 }
1898
1899 late_initcall(zcache_init);