drivers/staging/zcache/ramster/ramster.c
/*
 * ramster.c
 *
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 *
 * RAMster implements peer-to-peer transcendent memory, allowing a "cluster" of
 * kernels to dynamically pool their RAM so that a RAM-hungry workload on one
 * machine can temporarily and transparently utilize RAM on another machine
 * which is presumably idle or running a non-RAM-hungry workload.
 *
 * RAMster combines a clustering and messaging foundation based on the ocfs2
 * cluster layer with the in-kernel compression implementation of zcache, and
 * adds code to glue them together.  When a page is "put" to RAMster, it is
 * compressed and stored locally.  Periodically, a thread will "remotify" these
 * pages by sending them via messages to a remote machine.  When the page is
 * later needed as indicated by a page fault, a "get" is issued.  If the data
 * is local, it is uncompressed and the fault is resolved.  If the data is
 * remote, a message is sent to fetch the data and the faulting thread sleeps;
 * when the data arrives, the thread awakens, the data is decompressed and
 * the fault is resolved.
 *
 * As of V5, clusters of up to eight nodes are supported; each node can
 * remotify pages to one specified node, so clusters can be configured as
 * clients to a "memory server".  Some simple policy is in place that will
 * need to be refined over time.  Larger clusters and fault-resistant
 * protocols can also be added over time.
 */
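/*
 * Illustrative only: cluster membership and the remotification policy are
 * driven from userspace through the sysfs knobs defined later in this file
 * (attribute group "ramster" on mm_kobj, i.e. /sys/kernel/mm/ramster/).
 * A minimal client-to-"memory server" setup might look like the following
 * shell commands; the node number 1 is just an example value:
 *
 *   echo 1 > /sys/kernel/mm/ramster/manual_node_up
 *   echo 1 > /sys/kernel/mm/ramster/remote_target_nodenum
 *   echo 1 > /sys/kernel/mm/ramster/eph_remotify_enable
 *   echo 1 > /sys/kernel/mm/ramster/pers_remotify_enable
 */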

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/lzo.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
#include "../tmem.h"
#include "../zcache.h"
#include "../zbud.h"
#include "ramster.h"
#include "ramster_nodemanager.h"
#include "tcp.h"
#include "debug.h"

#define RAMSTER_TESTING

#ifndef CONFIG_SYSFS
#error "ramster needs sysfs to define cluster nodes to use"
#endif

static bool use_cleancache __read_mostly;
static bool use_frontswap __read_mostly;
static bool use_frontswap_exclusive_gets __read_mostly;

/* These must be sysfs not debugfs as they are checked/used by userland!! */
static unsigned long ramster_interface_revision __read_mostly =
        R2NM_API_VERSION; /* interface revision must match userspace! */
static unsigned long ramster_pers_remotify_enable __read_mostly;
static unsigned long ramster_eph_remotify_enable __read_mostly;
static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0);
#define MANUAL_NODES 8
static bool ramster_nodes_manual_up[MANUAL_NODES] __read_mostly;
static int ramster_remote_target_nodenum __read_mostly = -1;

/* Used by this code. */
long ramster_flnodes;
ssize_t ramster_foreign_eph_pages;
ssize_t ramster_foreign_pers_pages;
/* FIXME frontswap selfshrinking knobs in debugfs? */

static LIST_HEAD(ramster_rem_op_list);
static DEFINE_SPINLOCK(ramster_rem_op_list_lock);
static DEFINE_PER_CPU(struct ramster_preload, ramster_preloads);

static DEFINE_PER_CPU(unsigned char *, ramster_remoteputmem1);
static DEFINE_PER_CPU(unsigned char *, ramster_remoteputmem2);

static struct kmem_cache *ramster_flnode_cache __read_mostly;

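/* Consume the per-cpu flushlist_node preloaded by ramster_do_preload_flnode() */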
static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool)
{
        struct flushlist_node *flnode = NULL;
        struct ramster_preload *kp;

        kp = &__get_cpu_var(ramster_preloads);
        flnode = kp->flnode;
        BUG_ON(flnode == NULL);
        kp->flnode = NULL;
        inc_ramster_flnodes();
        return flnode;
}

/* the "flush list" asynchronously collects pages to remotely flush */
#define FLUSH_ENTIRE_OBJECT ((uint32_t)-1)
static void ramster_flnode_free(struct flushlist_node *flnode,
                                struct tmem_pool *pool)
{
        dec_ramster_flnodes();
        BUG_ON(ramster_flnodes < 0);
        kmem_cache_free(ramster_flnode_cache, flnode);
}

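/*
 * Preload a flushlist_node for this cpu so that a later ramster_flnode_alloc()
 * can "allocate" without sleeping from atomic context.  Called with interrupts
 * disabled; the node is allocated with GFP_ATOMIC and stashed in the per-cpu
 * ramster_preloads slot if that slot is empty, otherwise freed again.
 */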
int ramster_do_preload_flnode(struct tmem_pool *pool)
{
        struct ramster_preload *kp;
        struct flushlist_node *flnode;
        int ret = -ENOMEM;

        BUG_ON(!irqs_disabled());
        if (unlikely(ramster_flnode_cache == NULL))
                BUG();
        kp = &__get_cpu_var(ramster_preloads);
        flnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC);
        if (unlikely(flnode == NULL) && kp->flnode == NULL)
                BUG();  /* FIXME handle more gracefully, but how??? */
        else if (kp->flnode == NULL)
                kp->flnode = flnode;
        else
                kmem_cache_free(ramster_flnode_cache, flnode);
        return ret;
}

/*
 * Called by the message handler after a (still compressed) page has been
 * fetched from the remote machine in response to an "is_remote" tmem_get
 * or persistent tmem_localify.  For a tmem_get, "extra" is the address of
 * the page that is to be filled to successfully resolve the tmem_get; for
 * a (persistent) tmem_localify, "extra" is NULL (as the data is placed only
 * in the local zcache).  "data" points to "size" bytes of (compressed) data
 * passed in the message.  In the case of a persistent remote get, if
 * pre-allocation was successful (see ramster_repatriate_preload), the page
 * is placed into both local zcache and at "extra".
 */
int ramster_localify(int pool_id, struct tmem_oid *oidp, uint32_t index,
                        char *data, unsigned int size, void *extra)
{
        int ret = -ENOENT;
        unsigned long flags;
        struct tmem_pool *pool;
        bool eph, delete = false;
        void *pampd, *saved_hb;
        struct tmem_obj *obj;

        pool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id);
        if (unlikely(pool == NULL))
                /* pool doesn't exist anymore */
                goto out;
        eph = is_ephemeral(pool);
        local_irq_save(flags);  /* FIXME: maybe only disable softirqs? */
        pampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb);
        if (pampd == NULL) {
                /* hmmm... must have been a flush while waiting */
#ifdef RAMSTER_TESTING
                pr_err("UNTESTED pampd==NULL in ramster_localify\n");
#endif
                if (eph)
                        inc_ramster_remote_eph_pages_unsucc_get();
                else
                        inc_ramster_remote_pers_pages_unsucc_get();
                obj = NULL;
                goto finish;
        } else if (unlikely(!pampd_is_remote(pampd))) {
                /* hmmm... must have been a dup put while waiting */
#ifdef RAMSTER_TESTING
                pr_err("UNTESTED dup while waiting in ramster_localify\n");
#endif
                if (eph)
                        inc_ramster_remote_eph_pages_unsucc_get();
                else
                        inc_ramster_remote_pers_pages_unsucc_get();
                obj = NULL;
                pampd = NULL;
                ret = -EEXIST;
                goto finish;
        } else if (size == 0) {
                /* no remote data, delete the local is_remote pampd */
                pampd = NULL;
                if (eph)
                        inc_ramster_remote_eph_pages_unsucc_get();
                else
                        BUG();
                delete = true;
                goto finish;
        }
        if (pampd_is_intransit(pampd)) {
                /*
                 * a pampd is marked intransit if it is remote and space has
                 * been allocated for it locally (note, only happens for
                 * persistent pages, in which case the remote copy is freed)
                 */
                BUG_ON(eph);
                pampd = pampd_mask_intransit_and_remote(pampd);
                zbud_copy_to_zbud(pampd, data, size);
        } else {
                /*
                 * setting pampd to NULL tells tmem_localify_finish to leave
                 * pampd alone... meaning it is left pointing to the
                 * remote copy
                 */
                pampd = NULL;
                obj = NULL;
        }
        /*
         * but in all cases, we decompress direct-to-memory to complete
         * the remotify and return success
         */
        BUG_ON(extra == NULL);
        zcache_decompress_to_page(data, size, (struct page *)extra);
        if (eph)
                inc_ramster_remote_eph_pages_succ_get();
        else
                inc_ramster_remote_pers_pages_succ_get();
        ret = 0;
finish:
        tmem_localify_finish(obj, index, pampd, saved_hb, delete);
        zcache_put_pool(pool);
        local_irq_restore(flags);
out:
        return ret;
}

void ramster_pampd_new_obj(struct tmem_obj *obj)
{
        obj->extra = NULL;
}

void ramster_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj,
                                bool pool_destroy)
{
        struct flushlist_node *flnode;

        BUG_ON(preemptible());
        if (obj->extra == NULL)
                return;
        if (pool_destroy && is_ephemeral(pool))
                /* FIXME don't bother with remote eph data for now */
                return;
        BUG_ON(!pampd_is_remote(obj->extra));
        flnode = ramster_flnode_alloc(pool);
        flnode->xh.client_id = pampd_remote_node(obj->extra);
        flnode->xh.pool_id = pool->pool_id;
        flnode->xh.oid = obj->oid;
        flnode->xh.index = FLUSH_ENTIRE_OBJECT;
        flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ;
        spin_lock(&ramster_rem_op_list_lock);
        list_add(&flnode->rem_op.list, &ramster_rem_op_list);
        spin_unlock(&ramster_rem_op_list_lock);
}

/*
 * Called on a remote persistent tmem_get to attempt to preallocate
 * local storage for the data contained in the remote persistent page.
 * If successfully preallocated, returns the pampd, marked as remote and
 * in_transit.  Else returns NULL.  Note that the appropriate tmem data
 * structure must be locked.
 */
void *ramster_pampd_repatriate_preload(void *pampd, struct tmem_pool *pool,
                                        struct tmem_oid *oidp, uint32_t index,
                                        bool *intransit)
{
        int clen = pampd_remote_size(pampd), c;
        void *ret_pampd = NULL;
        unsigned long flags;
        struct tmem_handle th;

        BUG_ON(!pampd_is_remote(pampd));
        BUG_ON(is_ephemeral(pool));
        if (use_frontswap_exclusive_gets)
                /* don't need local storage */
                goto out;
        if (pampd_is_intransit(pampd)) {
                /*
                 * to avoid multiple allocations (and maybe a memory leak)
                 * don't preallocate if already in the process of being
                 * repatriated
                 */
                *intransit = true;
                goto out;
        }
        *intransit = false;
        local_irq_save(flags);
        th.client_id = pampd_remote_node(pampd);
        th.pool_id = pool->pool_id;
        th.oid = *oidp;
        th.index = index;
        ret_pampd = zcache_pampd_create(NULL, clen, true, false, &th);
        if (ret_pampd != NULL) {
                /*
                 * a pampd is marked intransit if it is remote and space has
                 * been allocated for it locally (note, only happens for
                 * persistent pages, in which case the remote copy is freed)
                 */
                ret_pampd = pampd_mark_intransit(ret_pampd);
                c = atomic_dec_return(&ramster_remote_pers_pages);
                WARN_ON_ONCE(c < 0);
        } else {
                inc_ramster_pers_pages_remote_nomem();
        }
        local_irq_restore(flags);
out:
        return ret_pampd;
}

/*
 * Called on a remote tmem_get to invoke a message to fetch the page.
 * Might sleep so no tmem locks can be held.  "extra" is passed
 * all the way through the round-trip messaging to ramster_localify.
 */
int ramster_pampd_repatriate(void *fake_pampd, void *real_pampd,
                                struct tmem_pool *pool,
                                struct tmem_oid *oid, uint32_t index,
                                bool free, void *extra)
{
        struct tmem_xhandle xh;
        int ret;

        if (pampd_is_intransit(real_pampd))
                /* have local space pre-reserved, so free remote copy */
                free = true;
        xh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index);
        /* unreliable request/response for now */
        ret = r2net_remote_async_get(&xh, free,
                                        pampd_remote_node(fake_pampd),
                                        pampd_remote_size(fake_pampd),
                                        pampd_remote_cksum(fake_pampd),
                                        extra);
        return ret;
}

bool ramster_pampd_is_remote(void *pampd)
{
        return pampd_is_remote(pampd);
}

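/*
 * Called when a new pampd is installed in place of an existing one in an
 * object (see tmem_replace()).  Remember the remote node in obj->extra and
 * enforce that all remote pages belonging to one object reside on the same
 * node.
 */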
int ramster_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj)
{
        int ret = -1;

        if (new_pampd != NULL) {
                if (obj->extra == NULL)
                        obj->extra = new_pampd;
                /* enforce that all remote pages in an object reside
                 * in the same node! */
                else if (pampd_remote_node(new_pampd) !=
                                pampd_remote_node((void *)(obj->extra)))
                        BUG();
                ret = 0;
        }
        return ret;
}

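/*
 * Free callback for a remote pampd.  Unless the free is being handled
 * elsewhere (oid == NULL), a persistent pampd either has its local
 * in-transit copy returned or has a remote page flush queued on the flush
 * list; remote ephemeral data is currently not flushed (see FIXME below).
 */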
void *ramster_pampd_free(void *pampd, struct tmem_pool *pool,
                              struct tmem_oid *oid, uint32_t index, bool acct)
{
        bool eph = is_ephemeral(pool);
        void *local_pampd = NULL;
        int c;

        BUG_ON(preemptible());
        BUG_ON(!pampd_is_remote(pampd));
        WARN_ON(acct == false);
        if (oid == NULL) {
                /*
                 * a NULL oid means to ignore this pampd free
                 * as the remote freeing will be handled elsewhere
                 */
        } else if (eph) {
                /* FIXME remote flush optional but probably good idea */
        } else if (pampd_is_intransit(pampd)) {
                /* did a pers remote get_and_free, so just free local */
                local_pampd = pampd_mask_intransit_and_remote(pampd);
        } else {
                struct flushlist_node *flnode =
                        ramster_flnode_alloc(pool);

                flnode->xh.client_id = pampd_remote_node(pampd);
                flnode->xh.pool_id = pool->pool_id;
                flnode->xh.oid = *oid;
                flnode->xh.index = index;
                flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE;
                spin_lock(&ramster_rem_op_list_lock);
                list_add(&flnode->rem_op.list, &ramster_rem_op_list);
                spin_unlock(&ramster_rem_op_list_lock);
                c = atomic_dec_return(&ramster_remote_pers_pages);
                WARN_ON_ONCE(c < 0);
        }
        return local_pampd;
}

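/*
 * Track pages stored locally on behalf of other ("foreign") nodes; count
 * must be +1 or -1 and is split into ephemeral and persistent counters.
 */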
void ramster_count_foreign_pages(bool eph, int count)
{
        BUG_ON(count != 1 && count != -1);
        if (eph) {
                if (count > 0) {
                        inc_ramster_foreign_eph_pages();
                } else {
                        dec_ramster_foreign_eph_pages();
                        WARN_ON_ONCE(ramster_foreign_eph_pages < 0);
                }
        } else {
                if (count > 0) {
                        inc_ramster_foreign_pers_pages();
                } else {
                        dec_ramster_foreign_pers_pages();
                        WARN_ON_ONCE(ramster_foreign_pers_pages < 0);
                }
        }
}

/*
 * For now, just push over a few pages every few seconds to
 * ensure that it basically works
 */
static struct workqueue_struct *ramster_remotify_workqueue;
static void ramster_remotify_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(ramster_remotify_worker,
                ramster_remotify_process);

static void ramster_remotify_queue_delayed_work(unsigned long delay)
{
        if (!queue_delayed_work(ramster_remotify_workqueue,
                                &ramster_remotify_worker, delay))
                pr_err("ramster_remotify: bad workqueue\n");
}

static void ramster_remote_flush_page(struct flushlist_node *flnode)
{
        struct tmem_xhandle *xh;
        int remotenode, ret;

        preempt_disable();
        xh = &flnode->xh;
        remotenode = flnode->xh.client_id;
        ret = r2net_remote_flush(xh, remotenode);
        if (ret >= 0)
                inc_ramster_remote_pages_flushed();
        else
                inc_ramster_remote_page_flushes_failed();
        preempt_enable_no_resched();
        ramster_flnode_free(flnode, NULL);
}

static void ramster_remote_flush_object(struct flushlist_node *flnode)
{
        struct tmem_xhandle *xh;
        int remotenode, ret;

        preempt_disable();
        xh = &flnode->xh;
        remotenode = flnode->xh.client_id;
        ret = r2net_remote_flush_object(xh, remotenode);
        if (ret >= 0)
                inc_ramster_remote_objects_flushed();
        else
                inc_ramster_remote_object_flushes_failed();
        preempt_enable_no_resched();
        ramster_flnode_free(flnode, NULL);
}

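/*
 * Pull up to two "zombie" zbuds off the local LRU, compute a simple additive
 * checksum over each, push the (still compressed) data to the remote target
 * node with r2net_remote_put(), and on success replace the local pampd with
 * a "remote" marker recording the node, size and checksum.  Returns the
 * number of zbuds processed.
 */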
int ramster_remotify_pageframe(bool eph)
{
        struct tmem_xhandle xh;
        unsigned int size;
        int remotenode, ret, zbuds;
        struct tmem_pool *pool;
        unsigned long flags;
        unsigned char cksum;
        char *p;
        int i, j;
        unsigned char *tmpmem[2];
        struct tmem_handle th[2];
        unsigned int zsize[2];

        tmpmem[0] = __get_cpu_var(ramster_remoteputmem1);
        tmpmem[1] = __get_cpu_var(ramster_remoteputmem2);
        local_bh_disable();
        zbuds = zbud_make_zombie_lru(&th[0], &tmpmem[0], &zsize[0], eph);
        /* now OK to release lock set in caller */
        local_bh_enable();
        if (zbuds == 0)
                goto out;
        BUG_ON(zbuds > 2);
        for (i = 0; i < zbuds; i++) {
                xh.client_id = th[i].client_id;
                xh.pool_id = th[i].pool_id;
                xh.oid = th[i].oid;
                xh.index = th[i].index;
                size = zsize[i];
                BUG_ON(size == 0 || size > zbud_max_buddy_size());
                for (p = tmpmem[i], cksum = 0, j = 0; j < size; j++)
                        cksum += *p++;
                ret = r2net_remote_put(&xh, tmpmem[i], size, eph, &remotenode);
                if (ret != 0) {
                /*
                 * This is some form of a memory leak... if the remote put
                 * fails, there will never be another attempt to remotify
                 * this page.  But since we've dropped the zv pointer,
                 * the page may have been freed or the data replaced
                 * so we can't just "put it back" in the remote op list.
                 * Even if we could, not sure where to put it in the list
                 * because there may be flushes that must be strictly
                 * ordered vs the put.  So leave this as a FIXME for now.
                 * But count them so we know if it becomes a problem.
                 */
                        if (eph)
                                inc_ramster_eph_pages_remote_failed();
                        else
                                inc_ramster_pers_pages_remote_failed();
                        break;
                } else {
                        if (!eph)
                                atomic_inc(&ramster_remote_pers_pages);
                }
                if (eph)
                        inc_ramster_eph_pages_remoted();
                else
                        inc_ramster_pers_pages_remoted();
                /*
                 * data was successfully remoted so change the local version to
                 * point to the remote node where it landed
                 */
                local_bh_disable();
                pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id);
                local_irq_save(flags);
                (void)tmem_replace(pool, &xh.oid, xh.index,
                                pampd_make_remote(remotenode, size, cksum));
                local_irq_restore(flags);
                zcache_put_pool(pool);
                local_bh_enable();
        }
out:
        return zbuds;
}

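/*
 * Drain the queued remote operations: take entries off ramster_rem_op_list
 * one at a time (dropping the lock for each) and dispatch the corresponding
 * remote page or object flush.
 */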
static void zcache_do_remotify_flushes(void)
{
        struct ramster_remotify_hdr *rem_op;
        union remotify_list_node *u;

        while (1) {
                spin_lock(&ramster_rem_op_list_lock);
                if (list_empty(&ramster_rem_op_list)) {
                        spin_unlock(&ramster_rem_op_list_lock);
                        goto out;
                }
                rem_op = list_first_entry(&ramster_rem_op_list,
                                struct ramster_remotify_hdr, list);
                list_del_init(&rem_op->list);
                spin_unlock(&ramster_rem_op_list_lock);
                u = (union remotify_list_node *)rem_op;
                switch (rem_op->op) {
                case RAMSTER_REMOTIFY_FLUSH_PAGE:
                        ramster_remote_flush_page((struct flushlist_node *)u);
                        break;
                case RAMSTER_REMOTIFY_FLUSH_OBJ:
                        ramster_remote_flush_object((struct flushlist_node *)u);
                        break;
                default:
                        BUG();
                }
        }
out:
        return;
}

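/*
 * Delayed-work handler: once a remotification target is set, alternate
 * draining the flush list with remotifying up to 100 ephemeral and 100
 * persistent pageframes per pass, then requeue itself one HZ later.  The
 * static remotify_in_progress flag keeps passes from overlapping.
 */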
static void ramster_remotify_process(struct work_struct *work)
{
        static bool remotify_in_progress;
        int i;

        BUG_ON(irqs_disabled());
        if (remotify_in_progress)
                goto requeue;
        if (ramster_remote_target_nodenum == -1)
                goto requeue;
        remotify_in_progress = true;
        if (use_cleancache && ramster_eph_remotify_enable) {
                for (i = 0; i < 100; i++) {
                        zcache_do_remotify_flushes();
                        (void)ramster_remotify_pageframe(true);
                }
        }
        if (use_frontswap && ramster_pers_remotify_enable) {
                for (i = 0; i < 100; i++) {
                        zcache_do_remotify_flushes();
                        (void)ramster_remotify_pageframe(false);
                }
        }
        remotify_in_progress = false;
requeue:
        ramster_remotify_queue_delayed_work(HZ);
}

void __init ramster_remotify_init(void)
{
        unsigned long n = 60UL;

        ramster_remotify_workqueue =
                create_singlethread_workqueue("ramster_remotify");
        ramster_remotify_queue_delayed_work(n * HZ);
}

static ssize_t ramster_manual_node_up_show(struct kobject *kobj,
                                struct kobj_attribute *attr, char *buf)
{
        int i;
        char *p = buf;

        for (i = 0; i < MANUAL_NODES; i++)
                if (ramster_nodes_manual_up[i])
                        p += sprintf(p, "%d ", i);
        p += sprintf(p, "\n");
        return p - buf;
}

static ssize_t ramster_manual_node_up_store(struct kobject *kobj,
                struct kobj_attribute *attr, const char *buf, size_t count)
{
        int err;
        unsigned long node_num;

        err = kstrtoul(buf, 10, &node_num);
        if (err) {
                pr_err("ramster: bad strtoul?\n");
                return -EINVAL;
        }
        if (node_num >= MANUAL_NODES) {
                pr_err("ramster: bad node_num=%lu?\n", node_num);
                return -EINVAL;
        }
        if (ramster_nodes_manual_up[node_num]) {
                pr_err("ramster: node %d already up, ignoring\n",
                                                        (int)node_num);
        } else {
                ramster_nodes_manual_up[node_num] = true;
                r2net_hb_node_up_manual((int)node_num);
        }
        return count;
}

static struct kobj_attribute ramster_manual_node_up_attr = {
        .attr = { .name = "manual_node_up", .mode = 0644 },
        .show = ramster_manual_node_up_show,
        .store = ramster_manual_node_up_store,
};

static ssize_t ramster_remote_target_nodenum_show(struct kobject *kobj,
                                struct kobj_attribute *attr, char *buf)
{
        if (ramster_remote_target_nodenum == -1UL)
                return sprintf(buf, "unset\n");
        else
                return sprintf(buf, "%d\n", ramster_remote_target_nodenum);
}

static ssize_t ramster_remote_target_nodenum_store(struct kobject *kobj,
                struct kobj_attribute *attr, const char *buf, size_t count)
{
        int err;
        unsigned long node_num;

        err = kstrtoul(buf, 10, &node_num);
        if (err) {
                pr_err("ramster: bad strtoul?\n");
                return -EINVAL;
        } else if (node_num == -1UL) {
                pr_err("ramster: disabling all remotification, "
                        "data may still reside on remote nodes however\n");
                return -EINVAL;
        } else if (node_num >= MANUAL_NODES) {
                pr_err("ramster: bad node_num=%lu?\n", node_num);
                return -EINVAL;
        } else if (!ramster_nodes_manual_up[node_num]) {
                pr_err("ramster: node %d not up, ignoring setting "
                        "of remotification target\n", (int)node_num);
        } else if (r2net_remote_target_node_set((int)node_num) >= 0) {
                pr_info("ramster: node %d set as remotification target\n",
                                (int)node_num);
                ramster_remote_target_nodenum = (int)node_num;
        } else {
                pr_err("ramster: bad num to node node_num=%d?\n",
                                (int)node_num);
                return -EINVAL;
        }
        return count;
}

static struct kobj_attribute ramster_remote_target_nodenum_attr = {
        .attr = { .name = "remote_target_nodenum", .mode = 0644 },
        .show = ramster_remote_target_nodenum_show,
        .store = ramster_remote_target_nodenum_store,
};

#define RAMSTER_SYSFS_RO(_name) \
        static ssize_t ramster_##_name##_show(struct kobject *kobj, \
                                struct kobj_attribute *attr, char *buf) \
        { \
                return sprintf(buf, "%lu\n", ramster_##_name); \
        } \
        static struct kobj_attribute ramster_##_name##_attr = { \
                .attr = { .name = __stringify(_name), .mode = 0444 }, \
                .show = ramster_##_name##_show, \
        }

#define RAMSTER_SYSFS_RW(_name) \
        static ssize_t ramster_##_name##_show(struct kobject *kobj, \
                                struct kobj_attribute *attr, char *buf) \
        { \
                return sprintf(buf, "%lu\n", ramster_##_name); \
        } \
        static ssize_t ramster_##_name##_store(struct kobject *kobj, \
                struct kobj_attribute *attr, const char *buf, size_t count) \
        { \
                int err; \
                unsigned long enable; \
                err = kstrtoul(buf, 10, &enable); \
                if (err) \
                        return -EINVAL; \
                ramster_##_name = enable; \
                return count; \
        } \
        static struct kobj_attribute ramster_##_name##_attr = { \
                .attr = { .name = __stringify(_name), .mode = 0644 }, \
                .show = ramster_##_name##_show, \
                .store = ramster_##_name##_store, \
        }

#define RAMSTER_SYSFS_RO_ATOMIC(_name) \
        static ssize_t ramster_##_name##_show(struct kobject *kobj, \
                                struct kobj_attribute *attr, char *buf) \
        { \
                return sprintf(buf, "%d\n", atomic_read(&ramster_##_name)); \
        } \
        static struct kobj_attribute ramster_##_name##_attr = { \
                .attr = { .name = __stringify(_name), .mode = 0444 }, \
                .show = ramster_##_name##_show, \
        }

RAMSTER_SYSFS_RO(interface_revision);
RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages);
RAMSTER_SYSFS_RW(pers_remotify_enable);
RAMSTER_SYSFS_RW(eph_remotify_enable);

static struct attribute *ramster_attrs[] = {
        &ramster_interface_revision_attr.attr,
        &ramster_remote_pers_pages_attr.attr,
        &ramster_manual_node_up_attr.attr,
        &ramster_remote_target_nodenum_attr.attr,
        &ramster_pers_remotify_enable_attr.attr,
        &ramster_eph_remotify_enable_attr.attr,
        NULL,
};

static struct attribute_group ramster_attr_group = {
        .attrs = ramster_attrs,
        .name = "ramster",
};

/*
 * frontswap selfshrinking
 */

/* In seconds (multiplied by HZ when scheduling); controls worker frequency. */
static unsigned int selfshrink_interval __read_mostly = 5;
/* Enable/disable with sysfs. */
static bool frontswap_selfshrinking __read_mostly;

static void selfshrink_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process);

/* Enable/disable with kernel boot option. */
static bool use_frontswap_selfshrink __initdata = true;

/*
 * The default values for the following parameters were deemed reasonable
 * by experimentation, may be workload-dependent, and can all be
 * adjusted via sysfs.
 */

/* Control rate for frontswap shrinking. Higher hysteresis is slower. */
static unsigned int frontswap_hysteresis __read_mostly = 20;

/*
 * Number of selfshrink worker invocations to wait before observing that
 * frontswap selfshrinking should commence. Note that selfshrinking does
 * not use a separate worker thread.
 */
static unsigned int frontswap_inertia __read_mostly = 3;

/* Countdown to next invocation of frontswap_shrink() */
static unsigned long frontswap_inertia_counter;

/*
 * Invoked by the selfshrink worker thread, uses current number of pages
 * in frontswap (frontswap_curr_pages()), previous status, and control
 * values (hysteresis and inertia) to determine if frontswap should be
 * shrunk and what the new frontswap size should be.  Note that
 * frontswap_shrink is essentially a partial swapoff that immediately
 * transfers pages from the "swap device" (frontswap) back into kernel
 * RAM; despite the name, frontswap "shrinking" is very different from
 * the "shrinker" interface used by the kernel MM subsystem to reclaim
 * memory.
 */
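/*
 * Worked example (illustrative numbers only): with frontswap_hysteresis at
 * its default of 20 and 10000 pages currently in frontswap, a shrink pass
 * targets 10000 - (10000 / 20) = 9500 pages; once at or below 20 pages the
 * target drops straight to zero.
 */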
static void frontswap_selfshrink(void)
{
        static unsigned long cur_frontswap_pages;
        static unsigned long last_frontswap_pages;
        static unsigned long tgt_frontswap_pages;

        last_frontswap_pages = cur_frontswap_pages;
        cur_frontswap_pages = frontswap_curr_pages();
        if (!cur_frontswap_pages ||
                        (cur_frontswap_pages > last_frontswap_pages)) {
                frontswap_inertia_counter = frontswap_inertia;
                return;
        }
        if (frontswap_inertia_counter && --frontswap_inertia_counter)
                return;
        if (cur_frontswap_pages <= frontswap_hysteresis)
                tgt_frontswap_pages = 0;
        else
                tgt_frontswap_pages = cur_frontswap_pages -
                        (cur_frontswap_pages / frontswap_hysteresis);
        frontswap_shrink(tgt_frontswap_pages);
}

static int __init ramster_nofrontswap_selfshrink_setup(char *s)
{
        use_frontswap_selfshrink = false;
        return 1;
}

__setup("noselfshrink", ramster_nofrontswap_selfshrink_setup);
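/* i.e. boot with "noselfshrink" on the kernel command line to disable it */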

static void selfshrink_process(struct work_struct *work)
{
        if (frontswap_selfshrinking && frontswap_enabled) {
                frontswap_selfshrink();
                schedule_delayed_work(&selfshrink_worker,
                        selfshrink_interval * HZ);
        }
}

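/*
 * Per-cpu setup/teardown: allocate or free the two page-sized staging
 * buffers used by ramster_remotify_pageframe() and, on teardown, drop any
 * preloaded flushlist_node for that cpu.
 */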
void ramster_cpu_up(int cpu)
{
        unsigned char *p1 = kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);
        unsigned char *p2 = kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);

        BUG_ON(!p1 || !p2);
        per_cpu(ramster_remoteputmem1, cpu) = p1;
        per_cpu(ramster_remoteputmem2, cpu) = p2;
}

void ramster_cpu_down(int cpu)
{
        struct ramster_preload *kp;

        kfree(per_cpu(ramster_remoteputmem1, cpu));
        per_cpu(ramster_remoteputmem1, cpu) = NULL;
        kfree(per_cpu(ramster_remoteputmem2, cpu));
        per_cpu(ramster_remoteputmem2, cpu) = NULL;
        kp = &per_cpu(ramster_preloads, cpu);
        if (kp->flnode) {
                kmem_cache_free(ramster_flnode_cache, kp->flnode);
                kp->flnode = NULL;
        }
}

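/*
 * Fill in the remote-aware pam ops so that the tmem layer (via zcache) calls
 * back into ramster for object creation/teardown, replacement, and the
 * is_remote/repatriate paths implemented above.
 */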
void ramster_register_pamops(struct tmem_pamops *pamops)
{
        pamops->free_obj = ramster_pampd_free_obj;
        pamops->new_obj = ramster_pampd_new_obj;
        pamops->replace_in_obj = ramster_pampd_replace_in_obj;
        pamops->is_remote = ramster_pampd_is_remote;
        pamops->repatriate = ramster_pampd_repatriate;
        pamops->repatriate_preload = ramster_pampd_repatriate_preload;
}

void __init ramster_init(bool cleancache, bool frontswap,
                                bool frontswap_exclusive_gets)
{
        int ret = 0;

        if (cleancache)
                use_cleancache = true;
        if (frontswap)
                use_frontswap = true;
        if (frontswap_exclusive_gets)
                use_frontswap_exclusive_gets = true;
        ramster_debugfs_init();
        ret = sysfs_create_group(mm_kobj, &ramster_attr_group);
        if (ret)
                pr_err("ramster: can't create sysfs for ramster\n");
        (void)r2net_register_handlers();
        INIT_LIST_HEAD(&ramster_rem_op_list);
        ramster_flnode_cache = kmem_cache_create("ramster_flnode",
                                sizeof(struct flushlist_node), 0, 0, NULL);
        frontswap_selfshrinking = use_frontswap_selfshrink;
        if (frontswap_selfshrinking) {
                pr_info("ramster: Initializing frontswap selfshrink driver.\n");
                schedule_delayed_work(&selfshrink_worker,
                                        selfshrink_interval * HZ);
        }
        ramster_remotify_init();
}
904 }