1 // SPDX-License-Identifier: GPL-2.0-only
5 * This code provides a lightweight version of KSM.
7 * Copyright (C) 2020 Samsung Electronics Co., Ltd.
8 * Author: Sung-hun Kim (sfoon.kim@samsung.com)
12 * Memory merging support.
14 * This code enables dynamic sharing of identical pages found in different
15 * memory areas, even if they are not shared by fork()
17 * Copyright (C) 2008-2009 Red Hat, Inc.
25 #include <linux/errno.h>
28 #include <linux/mman.h>
29 #include <linux/sched.h>
30 #include <linux/sched/mm.h>
31 #include <linux/sched/coredump.h>
32 #include <linux/rwsem.h>
33 #include <linux/pagemap.h>
34 #include <linux/rmap.h>
35 #include <linux/spinlock.h>
36 #include <linux/xxhash.h>
37 #include <linux/delay.h>
38 #include <linux/kthread.h>
39 #include <linux/wait.h>
40 #include <linux/slab.h>
41 #include <linux/rbtree.h>
42 #include <linux/memory.h>
43 #include <linux/mmu_notifier.h>
44 #include <linux/swap.h>
45 #include <linux/ksm.h>
46 #include <linux/hashtable.h>
47 #include <linux/freezer.h>
48 #include <linux/oom.h>
49 #include <linux/numa.h>
51 #include <asm/tlbflush.h>
56 #define DO_NUMA(x) do { (x); } while (0)
59 #define DO_NUMA(x) do { } while (0)
62 #define ksm_debug(fmt, ...) \
63 printk(KERN_DEBUG "[ksm:%s:%d] " fmt "\n", __func__, __LINE__, ##__VA_ARGS__)
64 #define ksm_err(fmt, ...) \
65 printk(KERN_ERR "[ksm:%s:%d] " fmt "\n", __func__, __LINE__, ##__VA_ARGS__)
70 * A few notes about the KSM scanning process,
71 * to make it easier to understand the data structures below:
73 * In order to reduce excessive scanning, KSM sorts the memory pages by their
74 * contents into a data structure that holds pointers to the pages' locations.
76 * Since the contents of the pages may change at any moment, KSM cannot just
77 * insert the pages into a normal sorted tree and expect it to find anything.
78 * Therefore KSM uses two data structures - the stable and the unstable tree.
80 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
81 * by their contents. Because each such page is write-protected, searching on
82 * this tree is fully assured to be working (except when pages are unmapped),
83 * and therefore this tree is called the stable tree.
85 * The stable tree node includes information required for reverse
86 * mapping from a KSM page to virtual addresses that map this page.
88 * In order to avoid large latencies of the rmap walks on KSM pages,
89 * KSM maintains two types of nodes in the stable tree:
91 * * the regular nodes that keep the reverse mapping structures in a
93 * * the "chains" that link nodes ("dups") that represent the same
94 * write protected memory content, but each "dup" corresponds to a
95 * different KSM page copy of that content
97 * Internally, the regular nodes, "dups" and "chains" are represented
98 * using the same :c:type:`struct stable_node` structure.
100 * In addition to the stable tree, KSM uses a second data structure called the
101 * unstable tree: this tree holds pointers to pages which have been found to
102 * be "unchanged for a period of time". The unstable tree sorts these pages
103 * by their contents, but since they are not write-protected, KSM cannot rely
104 * upon the unstable tree to work correctly - the unstable tree is liable to
105 * be corrupted as its contents are modified, and so it is called unstable.
107 * KSM solves this problem by several techniques:
109 * 1) The unstable tree is flushed every time KSM completes scanning all
110 * memory areas, and then the tree is rebuilt again from the beginning.
111 * 2) KSM will only insert into the unstable tree, pages whose hash value
112 * has not changed since the previous scan of all memory areas.
113 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
114 * colors of the nodes and not on their contents, assuring that even when
115 * the tree gets "corrupted" it won't get out of balance, so scanning time
116 * remains the same (also, searching and inserting nodes in an rbtree uses
117 * the same algorithm, so we have no overhead when we flush and rebuild).
118 * 4) KSM never flushes the stable tree, which means that even if it were to
119 * take 10 attempts to find a page in the unstable tree, once it is found,
120 * it is secured in the stable tree. (When we scan a new page, we first
121 * compare it against the stable tree, and then against the unstable tree.)
123 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
124 * stable trees and multiple unstable trees: one of each for each NUMA node.
128 * A few notes about lightweight KSM.
130 * A smart crawler leverages the semantics of tasks in Tizen.
131 * When an application goes to the background, it is attached to the freezer
132 * task group. The LKSM crawler hooks this event and adds the "frozen task"
133 * to the candidate list for scanning.
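 *
 * (Editor's note, not in the original source.) For orientation, one scan
 * step per page roughly follows the two-tree design described above. A
 * rough sketch, assuming the usual KSM helpers in this file
 * (unstable_tree_search_insert() is the upstream name and is not shown in
 * this excerpt):
 *
 *	checksum = calc_checksum(page);
 *	kpage = stable_tree_search(page);
 *	if (kpage)
 *		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
 *	else if (checksum == rmap_item->oldchecksum)
 *		tree_rmap_item = unstable_tree_search_insert(rmap_item,
 *							page, &tree_page);
 *	on a match in the unstable tree, try_to_merge_two_pages() then
 *	creates the shared ksm page and inserts it into the stable tree.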
137 /* merge window size */
141 * struct mm_slot - ksm information per mm that is being scanned
142 * @link: link to the mm_slots hash list
143 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
144 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
145 * @mm: the mm that this information is valid for
147 * extension - added for LKSM
148 * @state: state of mm_slot (frozen, listed, scanned, newcomer)
149 * @merge_idx: merge window index to store the number of currently merged pages
150 * @nr_merged_win: merge window keeping the three most recent merge counts
151 * @nr_merged: sum of nr_merged_win, used to maintain vips_list (ordered list)
152 * @ordered_list: list ordered by nr_merged
153 * @scanning_size: number of anonymous pages in mm_struct
154 * @fault_cnt: last read count of page fault (minor + major)
155 * @elapsed: elapsed scanning time
156 * @nr_scans: number of pages actually scanned (can differ from scanning_size)
159 struct hlist_node link;
160 struct list_head mm_list;
161 struct list_head scan_list;
162 struct rmap_item *rmap_list;
163 struct mm_struct *mm;
168 int nr_merged_win[MERGE_WIN];
170 struct rb_node ordered_list;
172 unsigned long scanning_size; /* in number of pages */
173 unsigned long fault_cnt;
174 unsigned long elapsed;
177 #ifdef CONFIG_LKSM_FILTER
178 /* used for releasing lksm_region */
179 struct list_head ref_list;
186 * scanning mode of LKSM:
187 * LKSM_SCAN_PARTIAL: perform deduplication on a subset of processes
188 * LKSM_SCAN_FULL: perform deduplication on the full set of processes
190 enum lksm_scan_mode {
197 * struct ksm_scan - cursor for scanning
198 * @address: the next address to be scanned inside the current mm_slot
199 * @rmap_list: link to the next rmap to be scanned in the rmap_list
200 * @mm_slot: the current mm_slot we are scanning
201 * @remove_mm_list: temporary list for batching flush of removed slots
202 * @nr_scannable: the number of remaining unscanned scannable slots
203 * @nr_frozen: the number of remaining unscanned frozen slots
204 * @scan_round: scanning round (partial + full)
205 * @nr_full_scan: the number of full scanning
206 * @scan_mode: coverage of current scanning
208 * There is only the one ksm_scan instance of this cursor structure.
211 unsigned long address;
212 struct rmap_item **rmap_list;
214 struct mm_slot *mm_slot;
215 struct list_head remove_mm_list;
217 /* statistics of scanning targets */
218 atomic_t nr_scannable;
221 unsigned long scan_round;
222 unsigned long nr_full_scan;
224 enum lksm_scan_mode scan_mode;
226 #ifdef CONFIG_LKSM_FILTER
227 struct lksm_region *region;
228 unsigned long vma_base_addr;
229 struct vm_area_struct *cached_vma;
230 #endif /* CONFIG_LKSM_FILTER */
234 * struct stable_node - node of the stable rbtree
235 * @node: rb node of this ksm page in the stable tree
236 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
237 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
238 * @list: linked into migrate_nodes, pending placement in the proper node tree
239 * @hlist: hlist head of rmap_items using this ksm page
240 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
241 * @chain_prune_time: time of the last full garbage collection
242 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
243 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
247 struct rb_node node; /* when node of stable tree */
248 struct { /* when listed for migration */
249 struct list_head *head;
251 struct hlist_node hlist_dup;
252 struct list_head list;
256 struct hlist_head hlist;
259 unsigned long chain_prune_time;
262 * STABLE_NODE_CHAIN can be any negative number in
263 * rmap_hlist_len negative range, but better not -1 to be able
264 * to reliably detect underflows.
266 #define STABLE_NODE_CHAIN -1024
274 * struct rmap_item - reverse mapping item for virtual addresses
275 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
276 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
277 * @nid: NUMA node id of unstable tree in which linked (may not match page)
278 * @region: pointer to the mapped region (LKSM feature)
279 * @mm: the memory structure this rmap_item is pointing into
280 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
281 * @oldchecksum: previous checksum of the page at that virtual address
282 * @node: rb node of this rmap_item in the unstable tree
283 * @head: pointer to stable_node heading this list in the stable tree
284 * @base_addr: used for calculating offset of the address (LKSM feature)
285 * @hlist: link into hlist of rmap_items hanging off that stable_node
288 struct rmap_item *rmap_list;
290 struct anon_vma *anon_vma; /* when stable */
292 int nid; /* when node of unstable tree */
294 #ifdef CONFIG_LKSM_FILTER
295 struct lksm_region *region; /* when unstable */
298 struct mm_struct *mm;
299 unsigned long address; /* + low bits used for flags below */
300 unsigned int oldchecksum; /* when unstable (LSB is a frozen bit) */
302 struct rb_node node; /* when node of unstable tree */
303 struct { /* when listed from stable tree */
304 #ifdef CONFIG_LKSM_FILTER
306 struct stable_node *head;
307 unsigned long base_addr; /* temporary storage for merge */
310 struct stable_node *head;
311 #endif /* CONFIG_LKSM_FILTER */
312 struct hlist_node hlist;
317 #define SEQNR_MASK 0x0ff /* low bits of unstable tree scan_round */
318 #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
319 #define STABLE_FLAG 0x200 /* is listed from the stable tree */
320 #define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
321 /* to mask all the flags */
323 /* The stable and unstable tree heads */
324 static struct rb_root one_stable_tree[1] = { RB_ROOT };
325 static struct rb_root one_unstable_tree[1] = { RB_ROOT };
326 static struct rb_root *root_stable_tree = one_stable_tree;
327 static struct rb_root *root_unstable_tree = one_unstable_tree;
329 #define LKSM_NODE_ID 0
331 /* Recently migrated nodes of stable tree, pending proper placement */
332 static LIST_HEAD(migrate_nodes);
333 #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
335 /* list of VIP processes (kept as an rb-tree, ordered by nr_merged) */
336 static struct rb_root vips_list = RB_ROOT;
337 static int lksm_max_vips = 20;
339 #define MM_SLOTS_HASH_BITS 10
340 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
341 static DEFINE_HASHTABLE(task_slots_hash, MM_SLOTS_HASH_BITS);
344 * two list heads in LKSM:
345 * - ksm_mm_head: a head for traversing the whole list of processes,
346 *                (not used for scanning itself)
347 * - ksm_scan_head: a head for the list of processes currently being scanned
349 static struct mm_slot ksm_mm_head = {
350 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
353 static struct mm_slot ksm_scan_head = {
354 .scan_list = LIST_HEAD_INIT(ksm_scan_head.scan_list),
357 static struct ksm_scan ksm_scan = {
358 .mm_slot = &ksm_scan_head,
361 static struct kmem_cache *rmap_item_cache;
362 static struct kmem_cache *stable_node_cache;
363 static struct kmem_cache *mm_slot_cache;
364 static struct kmem_cache *task_slot_cache;
366 /* The number of nodes in the stable tree */
367 static unsigned long ksm_pages_shared;
369 /* The number of page slots additionally sharing those nodes */
370 static unsigned long ksm_pages_sharing;
372 /* The number of nodes in the unstable tree */
373 static unsigned long ksm_pages_unshared;
375 /* The number of rmap_items in use: to calculate pages_volatile */
376 static unsigned long ksm_rmap_items;
378 /* The number of stable_node chains */
379 static unsigned long ksm_stable_node_chains;
381 /* The number of stable_node dups linked to the stable_node chains */
382 static unsigned long ksm_stable_node_dups;
384 /* Delay in pruning stale stable_node_dups in the stable_node_chains */
385 static int ksm_stable_node_chains_prune_millisecs = 2000;
387 /* Maximum number of page slots sharing a stable node */
388 static int ksm_max_page_sharing = 256;
390 /* Number of pages ksmd should scan in one batch */
391 static unsigned int ksm_thread_pages_to_scan = 100;
393 /* Milliseconds ksmd should sleep between batches */
394 static unsigned int ksm_thread_sleep_millisecs = 20;
396 /* Checksum of an empty (zeroed) page */
397 static unsigned int zero_checksum __read_mostly;
399 /* Processes tracked by KSM thread */
400 static unsigned int ksm_nr_added_process;
402 /* Whether to merge empty (zeroed) pages with actual zero pages */
403 static bool ksm_use_zero_pages __read_mostly;
405 /* An indicator for KSM scanning */
406 static atomic_t ksm_one_shot_scanning;
408 /* Boost the number of pages to scan when the scanner performs a partial scan */
409 static unsigned int lksm_boosted_pages_to_scan = 100;
410 static unsigned int lksm_default_pages_to_scan = 100;
413 /* Zeroed when merging across nodes is not allowed */
414 static unsigned int ksm_merge_across_nodes = 1;
415 static int ksm_nr_node_ids = 1;
417 #define ksm_merge_across_nodes 1U
418 #define ksm_nr_node_ids 1
422 * Default policy for KSM_RUN_ONESHOT:
423 * KSM performs both scans only when the user requests it.
424 * When scanning ends, both the crawler and the scanner threads are blocked
425 * until the next request arrives.
427 #define KSM_RUN_STOP 0
428 #define KSM_RUN_MERGE 1
429 #define KSM_RUN_UNMERGE 2
430 #define KSM_RUN_OFFLINE 4
431 #define KSM_RUN_ONESHOT 8
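/*
 * Editor's note (assumption, not from the original source): if LKSM reuses
 * the standard KSM sysfs "run" knob, a one-shot scan would be requested by
 * writing KSM_RUN_ONESHOT (8) to it, e.g.:
 *
 *	echo 8 > /sys/kernel/mm/ksm/run
 */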
433 static unsigned long ksm_run = KSM_RUN_STOP;
434 static atomic_t ksm_state; /* 0: in crawling 1: in scanning */
436 #define lksm_check_scan_state(ksm_state) (atomic_read(&ksm_state) == 1)
437 #define lksm_set_scan_state(ksm_state) (atomic_set(&ksm_state, 1))
438 #define lksm_clear_scan_state(ksm_state) (atomic_set(&ksm_state, 0))
441 struct task_struct *task;
443 unsigned long inserted;
444 struct list_head list;
445 struct hlist_node hlist;
450 * When a process stops running in the foreground (e.g., it goes to background),
451 * the system daemon (e.g., resourced) puts it into cgroup_freezer.
452 * Once a process joins the freezer cgroup, the kernel no longer counts it
453 * as a runnable process, so it cannot be scheduled on a CPU.
454 * We therefore regard processes in the freezer cgroup as frozen, and they
455 * make good candidates for memory deduplication.
457 * LKSM provides a hook to catch the moment a process is frozen.
458 * With the hook, the ksm crawler can build a candidate list for memory deduplication.
459 * (see kernel/cgroup_freezer.c)
461 #define FROZEN_BIT 0x01
462 #define LISTED_BIT 0x02
464 #define lksm_test_rmap_frozen(rmap_item) (rmap_item->oldchecksum & FROZEN_BIT)
465 #define lksm_set_rmap_frozen(rmap_item) (rmap_item->oldchecksum |= FROZEN_BIT)
466 #define lksm_clear_rmap_frozen(rmap_item) (rmap_item->oldchecksum &= ~FROZEN_BIT)
467 #define lksm_clear_checksum_frozen(checksum) (checksum &= ~FROZEN_BIT)
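/*
 * Editor's illustration (not in the original source): the low bit of
 * @oldchecksum doubles as the "frozen" flag, so the stored checksum always
 * has its LSB cleared; calc_checksum() below applies
 * lksm_clear_checksum_frozen() for exactly this reason. A sketch of how a
 * frozen rmap_item might be recorded, where slot_is_frozen is a
 * hypothetical condition used only for illustration:
 *
 *	rmap_item->oldchecksum = calc_checksum(page);	// LSB already clear
 *	if (slot_is_frozen)				// hypothetical flag
 *		lksm_set_rmap_frozen(rmap_item);
 */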
469 #define KSM_MM_FROZEN 0x01
470 #define KSM_MM_LISTED 0x02
471 #define KSM_MM_NEWCOMER 0x04
472 #define KSM_MM_SCANNED 0x08
473 #ifdef CONFIG_LKSM_FILTER
474 #define KSM_MM_PREPARED 0x10
477 #define lksm_test_mm_state(mm_slot, bit) (mm_slot->state & bit)
478 #define lksm_set_mm_state(mm_slot, bit) (mm_slot->state |= bit)
479 #define lksm_clear_mm_state(mm_slot, bit) (mm_slot->state &= ~bit)
481 #ifdef CONFIG_LKSM_FILTER
482 #define LKSM_REGION_HASH_BITS 10
483 static DEFINE_HASHTABLE(lksm_region_hash, LKSM_REGION_HASH_BITS);
484 spinlock_t lksm_region_lock;
487 * LKSM uses the filter once the region has been scanned for more than
488 * LKSM_REGION_MATURE rounds
490 #define LKSM_REGION_MATURE 5
491 #define lksm_region_mature(round, region) \
492 (((round) - (region)->scan_round) > LKSM_REGION_MATURE)
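/*
 * Editor's example (not in the original source): with LKSM_REGION_MATURE
 * set to 5, a region first seen at scan round 10 satisfies
 * lksm_region_mature() from round 16 onwards (16 - 10 > 5), at which point
 * its filter starts being used to skip unsharable pages.
 */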
494 enum lksm_region_type {
497 LKSM_REGION_FILE1, /* file mapped region: data section */
498 LKSM_REGION_FILE2, /* file mapped region: bss section */
499 LKSM_REGION_CONFLICT, /* conflicted regions: do not filter */
503 static const char * const region_type_str[] = {
512 /* sharing statistics for each region type */
513 static int region_share[LKSM_REGION_UNKNOWN + 1];
516 * lksm_region: A region represents a physically mapped area.
517 * Each process can have its own instance of a region, namely a vma.
518 * Regions for areas that are not file-mapped, such as the heap and stack,
519 * have only abstract representations as symbols.
521 * LKSM leverages the region for offset-based filtering.
522 * Each region has a filter which records the offsets of the addresses of
523 * shared pages in the region.
524 * Once a region has matured, LKSM uses the filter to skip scanning of
527 * @type: type of region, see the enumeration above
528 * @len: length of filter (in the number of 64-bit variables)
529 * @ino: inode number if the region is mapped to file
530 * @merge_cnt: the number of merged pages in the region
531 * @filter_cnt: the number of set bits in filter
532 * @scan_round: the birth scan round of this region
533 * @conflict: the number of size changes, a clue for conflicts
534 * @refcount: if it reaches zero, the region will be freed
535 * @hnode: hash node for finding region by ino
536 * @next: data region can have a next (bss) region
537 * @prev: reverse pointer to data region
539 * A few notes about the bitmap filter variable:
540 * LKSM uses a bitmap filter to skip scanning of unsharable pages.
541 * If a region is no larger than 256KB (<= 64 pages),
542 * it can be covered by a bitmap stored in a single 64-bit variable.
543 * LKSM only allocates a bitmap array as a filter when the region is
544 * larger than 256KB; otherwise it uses a 64-bit variable as the filter.
546 * @filter: when the region is bigger than 64 pages
547 * @single_filter: when the region is smaller than or equal to 64 pages
549 #define SINGLE_FILTER_LEN 1 /* a region can be covered by a single variable */
552 enum lksm_region_type type;
560 struct hlist_node hnode;
561 struct lksm_region *next;
562 struct lksm_region *prev;
564 unsigned long *filter;
565 unsigned long single_filter;
571 * Contains references from processes to regions
574 struct lksm_region_ref {
575 struct list_head list; /* listed by mm_slot */
576 struct lksm_region *region;
579 /* the number of registered lksm_regions */
580 static unsigned int lksm_nr_regions;
582 /* the upper limit for region lookup */
583 #define LKSM_REGION_ITER_MAX 8
585 #define lksm_region_size(start, end) ((int)((end) - (start)) >> PAGE_SHIFT)
586 #define lksm_bitmap_size(size) (((size) >> 6) + (((size) % BITS_PER_LONG) ? 1 : 0))
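/*
 * Editor's example (not in the original source), assuming 4KB pages: a
 * 128KB vma spans 32 pages, so lksm_bitmap_size(32) == 1 and the bitmap
 * fits in @single_filter; a 1MB vma spans 256 pages, so
 * lksm_bitmap_size(256) == 4 and a four-long @filter array is allocated.
 */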
588 /* all processes share one lksm_region for their heaps */
589 static struct lksm_region heap_region, unknown_region;
591 static void lksm_register_file_anon_region(struct mm_slot *slot,
592 struct vm_area_struct *vma);
593 static struct lksm_region *lksm_find_region(struct vm_area_struct *vma);
594 #endif /* CONFIG_LKSM_FILTER */
596 static int initial_round = 3;
597 static unsigned long ksm_crawl_round;
598 static unsigned long crawler_sleep;
600 /* statistical information */
601 static int lksm_nr_merged; /* global merge count */
602 static int lksm_nr_broken; /* global broken count */
603 static int lksm_nr_scanned_slot; /* global scanned slot count */
604 static int lksm_slot_nr_merged; /* per-slot merge count */
605 static int lksm_slot_nr_broken; /* per-slot broken count */
607 /* initially, KSM uses a small full scan interval */
608 #define DEFAULT_FULL_SCAN_INTERVAL 60000 /* 60 seconds */
609 static unsigned long full_scan_interval = 100;
611 /* statistical information about scanning time */
612 static unsigned long lksm_last_scan_time;
613 static unsigned long lksm_proc_scan_time;
615 /* helpers for pruning short-lived tasks */
616 #define KSM_SHORT_TASK_TIME 100
617 static unsigned long short_lived_thresh = KSM_SHORT_TASK_TIME;
619 #define get_task_runtime(task) (task->se.sum_exec_runtime)
620 #define ms_to_ns(ms) ((ms) * 1000 * 1000)
621 #define check_short_task(task) \
622 (get_task_runtime(task) < ms_to_ns(short_lived_thresh))
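/*
 * Editor's example (not in the original source): with the default
 * short_lived_thresh of 100 (KSM_SHORT_TASK_TIME), ms_to_ns(100) is
 * 100,000,000 ns, so a task whose se.sum_exec_runtime is below 100ms of
 * accumulated CPU time makes check_short_task() return true and is
 * treated as a short-lived task for pruning.
 */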
624 static void wait_while_offlining(void);
625 static struct mm_slot *__ksm_enter_alloc_slot(struct mm_struct *mm, int frozen);
627 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
628 static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
629 static DEFINE_MUTEX(ksm_thread_mutex);
630 static DEFINE_SPINLOCK(ksm_mmlist_lock);
631 static DECLARE_WAIT_QUEUE_HEAD(ksm_crawl_wait);
633 #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
634 sizeof(struct __struct), __alignof__(struct __struct),\
637 static int __init ksm_slab_init(void)
639 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
640 if (!rmap_item_cache)
643 stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
644 if (!stable_node_cache)
647 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
650 task_slot_cache = KSM_KMEM_CACHE(task_slot, 0);
651 if (!task_slot_cache)
657 kmem_cache_destroy(mm_slot_cache);
659 kmem_cache_destroy(stable_node_cache);
661 kmem_cache_destroy(rmap_item_cache);
666 static void __init ksm_slab_free(void)
668 kmem_cache_destroy(mm_slot_cache);
669 kmem_cache_destroy(stable_node_cache);
670 kmem_cache_destroy(rmap_item_cache);
671 mm_slot_cache = NULL;
674 static __always_inline bool is_stable_node_chain(struct stable_node *chain)
676 return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
679 static __always_inline bool is_stable_node_dup(struct stable_node *dup)
681 return dup->head == STABLE_NODE_DUP_HEAD;
684 static inline void stable_node_chain_add_dup(struct stable_node *dup,
685 struct stable_node *chain)
687 VM_BUG_ON(is_stable_node_dup(dup));
688 dup->head = STABLE_NODE_DUP_HEAD;
689 VM_BUG_ON(!is_stable_node_chain(chain));
690 hlist_add_head(&dup->hlist_dup, &chain->hlist);
691 ksm_stable_node_dups++;
694 static inline void __stable_node_dup_del(struct stable_node *dup)
696 VM_BUG_ON(!is_stable_node_dup(dup));
697 hlist_del(&dup->hlist_dup);
698 ksm_stable_node_dups--;
701 static inline void stable_node_dup_del(struct stable_node *dup)
703 VM_BUG_ON(is_stable_node_chain(dup));
704 if (is_stable_node_dup(dup))
705 __stable_node_dup_del(dup);
707 rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
708 #ifdef CONFIG_DEBUG_VM
713 static inline struct rmap_item *alloc_rmap_item(void)
715 struct rmap_item *rmap_item;
717 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
718 __GFP_NORETRY | __GFP_NOWARN);
724 static inline void free_rmap_item(struct rmap_item *rmap_item)
727 rmap_item->mm = NULL; /* debug safety */
728 kmem_cache_free(rmap_item_cache, rmap_item);
731 static inline struct stable_node *alloc_stable_node(void)
734 * The allocation can take too long with GFP_KERNEL when memory is under
735 * pressure, which may lead to hung task warnings. Adding __GFP_HIGH
736 * grants access to memory reserves, helping to avoid this problem.
738 return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
741 static inline void free_stable_node(struct stable_node *stable_node)
743 VM_BUG_ON(stable_node->rmap_hlist_len &&
744 !is_stable_node_chain(stable_node));
745 kmem_cache_free(stable_node_cache, stable_node);
748 static inline struct mm_slot *alloc_mm_slot(void)
750 if (!mm_slot_cache) /* initialization failed */
752 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
755 static inline void free_mm_slot(struct mm_slot *mm_slot)
757 kmem_cache_free(mm_slot_cache, mm_slot);
760 static struct mm_slot *get_mm_slot(struct mm_struct *mm)
762 struct mm_slot *slot;
764 hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
771 static void insert_to_mm_slots_hash(struct mm_struct *mm,
772 struct mm_slot *mm_slot)
775 hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
778 static inline struct task_slot *alloc_task_slot(void)
780 if (!task_slot_cache)
782 return kmem_cache_zalloc(task_slot_cache, GFP_NOWAIT);
785 static inline void free_task_slot(struct task_slot *task_slot)
787 kmem_cache_free(task_slot_cache, task_slot);
790 static struct task_slot *get_task_slot(struct task_struct *task)
792 struct task_slot *slot;
794 hash_for_each_possible(task_slots_hash, slot, hlist,
796 if (slot->task == task)
801 static inline void insert_to_task_slots_hash(struct task_slot *slot)
803 hash_add(task_slots_hash, &slot->hlist, (unsigned long)slot->task);
807 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
808 * page tables after it has passed through ksm_exit() - which, if necessary,
809 * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
810 * a special flag: they can just back out as soon as mm_users goes to zero.
811 * ksm_test_exit() is used throughout to make this test for exit: in some
812 * places for correctness, in some places just to avoid unnecessary work.
814 static inline bool ksm_test_exit(struct mm_struct *mm)
816 return atomic_read(&mm->mm_users) == 0;
820 * We use break_ksm to break COW on a ksm page: it's a stripped down
822 * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
825 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
826 * in case the application has unmapped and remapped mm,addr meanwhile.
827 * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
828 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
830 * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
831 * of the process that owns 'vma'. We also do not want to enforce
832 * protection keys here anyway.
834 static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
841 page = follow_page(vma, addr,
842 FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
843 if (IS_ERR_OR_NULL(page))
846 ret = handle_mm_fault(vma, addr,
847 FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
849 ret = VM_FAULT_WRITE;
851 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
853 * We must loop because handle_mm_fault() may back out if there's
854 * any difficulty e.g. if pte accessed bit gets updated concurrently.
856 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
857 * COW has been broken, even if the vma does not permit VM_WRITE;
858 * but note that a concurrent fault might break PageKsm for us.
860 * VM_FAULT_SIGBUS could occur if we race with truncation of the
861 * backing file, which also invalidates anonymous pages: that's
862 * okay, that truncation will have unmapped the PageKsm for us.
864 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
865 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
866 * current task has TIF_MEMDIE set, and will be OOM killed on return
867 * to user; and ksmd, having no mm, would never be chosen for that.
869 * But if the mm is in a limited mem_cgroup, then the fault may fail
870 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
871 * even ksmd can fail in this way - though it's usually breaking ksm
872 * just to undo a merge it made a moment before, so unlikely to oom.
874 * That's a pity: we might therefore have more kernel pages allocated
875 * than we're counting as nodes in the stable tree; but ksm_do_scan
876 * will retry to break_cow on each pass, so should recover the page
877 * in due course. The important thing is to not let VM_MERGEABLE
878 * be cleared while any such pages might remain in the area.
880 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
883 static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
886 struct vm_area_struct *vma;
887 if (ksm_test_exit(mm))
889 vma = find_vma(mm, addr);
890 if (!vma || vma->vm_start > addr)
892 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
897 static void break_cow(struct rmap_item *rmap_item)
899 struct mm_struct *mm = rmap_item->mm;
900 unsigned long addr = rmap_item->address;
901 struct vm_area_struct *vma;
904 * It is not an accident that whenever we want to break COW
905 * to undo, we also need to drop a reference to the anon_vma.
907 put_anon_vma(rmap_item->anon_vma);
909 down_read(&mm->mmap_sem);
910 vma = find_mergeable_vma(mm, addr);
912 break_ksm(vma, addr);
913 up_read(&mm->mmap_sem);
916 static struct page *get_mergeable_page(struct rmap_item *rmap_item)
918 struct mm_struct *mm = rmap_item->mm;
919 unsigned long addr = rmap_item->address;
920 struct vm_area_struct *vma;
923 down_read(&mm->mmap_sem);
924 vma = find_mergeable_vma(mm, addr);
928 page = follow_page(vma, addr, FOLL_GET);
929 if (IS_ERR_OR_NULL(page))
931 if (PageAnon(page)) {
932 flush_anon_page(vma, page, addr);
933 flush_dcache_page(page);
939 up_read(&mm->mmap_sem);
943 #ifdef CONFIG_LKSM_FILTER
944 static inline int is_heap(struct vm_area_struct *vma)
946 return vma->vm_start <= vma->vm_mm->brk &&
947 vma->vm_end >= vma->vm_mm->start_brk;
950 /* the code below is copied from fs/proc/task_mmu.c */
952 static int is_stack(struct vm_area_struct *vma)
954 return vma->vm_start <= vma->vm_mm->start_stack &&
955 vma->vm_end >= vma->vm_mm->start_stack;
958 static int is_exec(struct vm_area_struct *vma)
960 return (vma->vm_flags & VM_EXEC);
962 #endif /* CONFIG_LKSM_FILTER */
965 * ksm_join: a wrapper function of ksm_enter.
966 * The function sets the VM_MERGEABLE flag on the vmas of the given mm_struct.
968 static int ksm_join(struct mm_struct *mm, int frozen)
970 struct vm_area_struct *vma;
971 struct mm_slot *slot;
972 int newly_allocated = 0;
974 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
975 slot = __ksm_enter_alloc_slot(mm, frozen);
980 slot = get_mm_slot(mm);
982 ksm_err("there is no mm_slot for %p", mm);
987 for (vma = mm->mmap; vma; vma = vma->vm_next) {
988 if (vma->vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
989 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
990 VM_HUGETLB | VM_MIXEDMAP))
992 vma->vm_flags |= VM_MERGEABLE;
993 #ifdef CONFIG_LKSM_FILTER
995 * Many page sharings come from library pages because processes
996 * share the runtime framework of the OS.
997 * Thus, anonymous pages related to file-mapped areas can show
998 * sharing patterns that LKSM can exploit, while other
999 * anonymous regions (e.g., the heap) do not.
1000 * LKSM only tracks file-related regions to build filters.
1002 if (!is_heap(vma) && !is_stack(vma) &&
1003 !is_exec(vma) && vma->anon_vma)
1004 lksm_register_file_anon_region(slot, vma);
1008 return newly_allocated;
1011 #define ksm_join_write_lock(mm, frozen, ret) do {\
1012 down_write(&mm->mmap_sem); \
1013 ret = ksm_join(mm, frozen); \
1014 up_write(&mm->mmap_sem); \
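/*
 * Editor's note (not in the original source): a minimal usage sketch of the
 * wrapper macro above; ret receives ksm_join()'s return value while
 * mm->mmap_sem is taken for write and released around the call:
 *
 *	int ret;
 *
 *	ksm_join_write_lock(mm, frozen, ret);
 */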
1017 #ifdef CONFIG_LKSM_FILTER
1018 static void lksm_region_ref_append
1019 (struct mm_slot *slot, struct lksm_region *region)
1021 struct lksm_region_ref *ref;
1024 ref = kzalloc(sizeof(struct lksm_region_ref), GFP_KERNEL);
1027 ref->region = region;
1028 list_add_tail(&ref->list, &slot->ref_list);
1030 atomic_inc(®ion->refcount);
1033 static void lksm_region_free(struct lksm_region *region)
1035 unsigned long flags;
1037 ksm_debug("lets free region(%p) prev(%p)", region, region->prev);
1038 spin_lock_irqsave(&lksm_region_lock, flags);
1039 if (!region->next) {
1041 if (atomic_read(®ion->prev->refcount) == 0) {
1042 hash_del(®ion->prev->hnode);
1043 if (region->prev->len > SINGLE_FILTER_LEN)
1044 kfree(region->prev->filter);
1045 kfree(region->prev);
1047 ksm_debug("prev region(%p) has ref count(%d)",
1049 atomic_read(®ion->prev->refcount));
1050 region->prev->next = NULL;
1053 hash_del(®ion->hnode);
1054 if (region->len > SINGLE_FILTER_LEN)
1055 kfree(region->filter);
1058 spin_unlock_irqrestore(&lksm_region_lock, flags);
1061 static void lksm_region_ref_remove(struct lksm_region_ref *ref)
1063 list_del_init(&ref->list);
1064 if (atomic_dec_and_test(&ref->region->refcount))
1065 lksm_region_free(ref->region);
1069 static void lksm_region_ref_list_release(struct mm_slot *slot)
1071 struct lksm_region_ref *ref, *next;
1073 ksm_debug("release %p ref list", slot);
1074 list_for_each_entry_safe(ref, next, &slot->ref_list, list) {
1075 lksm_region_ref_remove(ref);
1078 #endif /* CONFIG_LKSM_FILTER */
1081 * This helper is used for getting the right index into the array of tree roots.
1082 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
1083 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
1084 * every node has its own stable and unstable tree.
1086 static inline int get_kpfn_nid(unsigned long kpfn)
1088 return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
1091 static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
1092 struct rb_root *root)
1094 struct stable_node *chain = alloc_stable_node();
1095 VM_BUG_ON(is_stable_node_chain(dup));
1096 if (likely(chain)) {
1097 INIT_HLIST_HEAD(&chain->hlist);
1098 chain->chain_prune_time = jiffies;
1099 chain->rmap_hlist_len = STABLE_NODE_CHAIN;
1100 #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
1101 chain->nid = NUMA_NO_NODE; /* debug */
1103 ksm_stable_node_chains++;
1106 * Put the stable node chain in the first dimension of
1107 * the stable tree and at the same time remove the old
1110 rb_replace_node(&dup->node, &chain->node, root);
1113 * Move the old stable node to the second dimension
1114 * queued in the hlist_dup. The invariant is that all
1115 * dup stable_nodes in the chain->hlist point to pages
1116 * that are wrprotected and have the exact same
1119 stable_node_chain_add_dup(dup, chain);
1124 static inline void free_stable_node_chain(struct stable_node *chain,
1125 struct rb_root *root)
1127 rb_erase(&chain->node, root);
1128 free_stable_node(chain);
1129 ksm_stable_node_chains--;
1132 static void remove_node_from_stable_tree(struct stable_node *stable_node)
1134 struct rmap_item *rmap_item;
1136 /* check it's not STABLE_NODE_CHAIN or negative */
1137 BUG_ON(stable_node->rmap_hlist_len < 0);
1139 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1140 if (rmap_item->hlist.next) {
1141 ksm_pages_sharing--;
1142 lksm_slot_nr_broken++;
1146 VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
1147 stable_node->rmap_hlist_len--;
1148 put_anon_vma(rmap_item->anon_vma);
1149 rmap_item->address &= PAGE_MASK;
1154 * We need the second aligned pointer of the migrate_nodes
1155 * list_head to stay clear from the rb_parent_color union
1156 * (aligned and different than any node) and also different
1157 * from &migrate_nodes. This will verify that future list.h changes
1158 * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
1160 #if defined(GCC_VERSION) && GCC_VERSION >= 40903
1161 BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
1162 BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
1165 if (stable_node->head == &migrate_nodes)
1166 list_del(&stable_node->list);
1168 stable_node_dup_del(stable_node);
1169 free_stable_node(stable_node);
1172 enum get_ksm_page_flags {
1173 GET_KSM_PAGE_NOLOCK,
1175 GET_KSM_PAGE_TRYLOCK
1179 * get_ksm_page: checks if the page indicated by the stable node
1180 * is still its ksm page, despite having held no reference to it.
1181 * In which case we can trust the content of the page, and it
1182 * returns the gotten page; but if the page has now been zapped,
1183 * remove the stale node from the stable tree and return NULL.
1184 * But beware, the stable node's page might be being migrated.
1186 * You would expect the stable_node to hold a reference to the ksm page.
1187 * But if it increments the page's count, swapping out has to wait for
1188 * ksmd to come around again before it can free the page, which may take
1189 * seconds or even minutes: much too unresponsive. So instead we use a
1190 * "keyhole reference": access to the ksm page from the stable node peeps
1191 * out through its keyhole to see if that page still holds the right key,
1192 * pointing back to this stable node. This relies on freeing a PageAnon
1193 * page to reset its page->mapping to NULL, and relies on no other use of
1194 * a page to put something that might look like our key in page->mapping.
1195 * is on its way to being freed; but it is an anomaly to bear in mind.
1197 static struct page *get_ksm_page(struct stable_node *stable_node,
1198 enum get_ksm_page_flags flags)
1201 void *expected_mapping;
1204 expected_mapping = (void *)((unsigned long)stable_node |
1207 kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
1208 page = pfn_to_page(kpfn);
1209 if (READ_ONCE(page->mapping) != expected_mapping)
1213 * We cannot do anything with the page while its refcount is 0.
1214 * Usually 0 means free, or tail of a higher-order page: in which
1215 * case this node is no longer referenced, and should be freed;
1216 * however, it might mean that the page is under page_ref_freeze().
1217 * The __remove_mapping() case is easy, again the node is now stale;
1218 * the same is in reuse_ksm_page() case; but if page is swapcache
1219 * in migrate_page_move_mapping(), it might still be our page,
1220 * in which case it's essential to keep the node.
1222 while (!get_page_unless_zero(page)) {
1224 * Another check for page->mapping != expected_mapping would
1225 * work here too. We have chosen the !PageSwapCache test to
1226 * optimize the common case, when the page is or is about to
1227 * be freed: PageSwapCache is cleared (under spin_lock_irq)
1228 * in the ref_freeze section of __remove_mapping(); but Anon
1229 * page->mapping reset to NULL later, in free_pages_prepare().
1231 if (!PageSwapCache(page))
1236 if (READ_ONCE(page->mapping) != expected_mapping) {
1241 if (flags == GET_KSM_PAGE_TRYLOCK) {
1242 if (!trylock_page(page)) {
1244 return ERR_PTR(-EBUSY);
1246 } else if (flags == GET_KSM_PAGE_LOCK)
1249 if (flags != GET_KSM_PAGE_NOLOCK) {
1250 if (READ_ONCE(page->mapping) != expected_mapping) {
1260 * We come here from above when page->mapping or !PageSwapCache
1261 * suggests that the node is stale; but it might be under migration.
1262 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
1263 * before checking whether node->kpfn has been changed.
1266 if (READ_ONCE(stable_node->kpfn) != kpfn)
1268 remove_node_from_stable_tree(stable_node);
1273 * Removing rmap_item from stable or unstable tree.
1274 * This function will clean the information from the stable/unstable tree.
1276 static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
1278 if (rmap_item->address & STABLE_FLAG) {
1279 struct stable_node *stable_node;
1282 stable_node = rmap_item->head;
1283 page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
1287 hlist_del(&rmap_item->hlist);
1291 if (!hlist_empty(&stable_node->hlist)) {
1292 ksm_pages_sharing--;
1293 lksm_slot_nr_broken++;
1297 VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
1298 stable_node->rmap_hlist_len--;
1300 put_anon_vma(rmap_item->anon_vma);
1301 rmap_item->address &= PAGE_MASK;
1303 } else if (rmap_item->address & UNSTABLE_FLAG) {
1306 * Usually ksmd can and must skip the rb_erase, because
1307 * root_unstable_tree was already reset to RB_ROOT.
1308 * But be careful when an mm is exiting: do the rb_erase
1309 * if this rmap_item was inserted by this scan, rather
1310 * than left over from before.
1312 age = (unsigned char)(ksm_scan.scan_round - rmap_item->address);
1314 rb_erase(&rmap_item->node,
1315 root_unstable_tree + NUMA(rmap_item->nid));
1317 RB_CLEAR_NODE(&rmap_item->node);
1319 ksm_pages_unshared--;
1320 rmap_item->address &= PAGE_MASK;
1323 cond_resched(); /* we're called from many long loops */
1326 static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
1327 struct rmap_item **rmap_list)
1329 while (*rmap_list) {
1330 struct rmap_item *rmap_item = *rmap_list;
1331 *rmap_list = rmap_item->rmap_list;
1332 remove_rmap_item_from_tree(rmap_item);
1333 free_rmap_item(rmap_item);
1338 * Though it's very tempting to unmerge rmap_items from stable tree rather
1339 * than check every pte of a given vma, the locking doesn't quite work for
1340 * that - an rmap_item is assigned to the stable tree after inserting ksm
1341 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
1342 * rmap_items from parent to child at fork time (so as not to waste time
1343 * if exit comes before the next scan reaches it).
1345 * Similarly, although we'd like to remove rmap_items (so updating counts
1346 * and freeing memory) when unmerging an area, it's easier to leave that
1347 * to the next pass of ksmd - consider, for example, how ksmd might be
1348 * in cmp_and_merge_page on one of the rmap_items we would be removing.
1350 static int unmerge_ksm_pages(struct vm_area_struct *vma,
1351 unsigned long start, unsigned long end)
1356 for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
1357 if (ksm_test_exit(vma->vm_mm))
1359 if (signal_pending(current))
1362 err = break_ksm(vma, addr);
1367 static inline struct stable_node *page_stable_node(struct page *page)
1369 return PageKsm(page) ? page_rmapping(page) : NULL;
1372 static inline void set_page_stable_node(struct page *page,
1373 struct stable_node *stable_node)
1375 page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
1380 * Only called through the sysfs control interface:
1382 static int remove_stable_node(struct stable_node *stable_node)
1387 page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
1390 * get_ksm_page did remove_node_from_stable_tree itself.
1396 * Page could be still mapped if this races with __mmput() running in
1397 * between ksm_exit() and exit_mmap(). Just refuse to let
1398 * merge_across_nodes/max_page_sharing be switched.
1401 if (!page_mapped(page)) {
1403 * The stable node did not yet appear stale to get_ksm_page(),
1404 * since that allows for an unmapped ksm page to be recognized
1405 * right up until it is freed; but the node is safe to remove.
1406 * This page might be in a pagevec waiting to be freed,
1407 * or it might be PageSwapCache (perhaps under writeback),
1408 * or it might have been removed from swapcache a moment ago.
1410 set_page_stable_node(page, NULL);
1411 remove_node_from_stable_tree(stable_node);
1420 static int remove_stable_node_chain(struct stable_node *stable_node,
1421 struct rb_root *root)
1423 struct stable_node *dup;
1424 struct hlist_node *hlist_safe;
1426 if (!is_stable_node_chain(stable_node)) {
1427 VM_BUG_ON(is_stable_node_dup(stable_node));
1428 if (remove_stable_node(stable_node))
1434 hlist_for_each_entry_safe(dup, hlist_safe,
1435 &stable_node->hlist, hlist_dup) {
1436 VM_BUG_ON(!is_stable_node_dup(dup));
1437 if (remove_stable_node(dup))
1440 BUG_ON(!hlist_empty(&stable_node->hlist));
1441 free_stable_node_chain(stable_node, root);
1445 static int remove_all_stable_nodes(void)
1447 struct stable_node *stable_node, *next;
1451 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
1452 while (root_stable_tree[nid].rb_node) {
1453 stable_node = rb_entry(root_stable_tree[nid].rb_node,
1454 struct stable_node, node);
1455 if (remove_stable_node_chain(stable_node,
1456 root_stable_tree + nid)) {
1458 break; /* proceed to next nid */
1463 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
1464 if (remove_stable_node(stable_node))
1471 static int unmerge_and_remove_all_rmap_items(void)
1473 struct mm_slot *mm_slot;
1474 struct mm_struct *mm;
1475 struct vm_area_struct *vma;
1478 spin_lock(&ksm_mmlist_lock);
1479 ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
1480 struct mm_slot, mm_list);
1481 spin_unlock(&ksm_mmlist_lock);
1483 for (mm_slot = ksm_scan.mm_slot;
1484 mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
1486 down_read(&mm->mmap_sem);
1487 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1488 if (ksm_test_exit(mm))
1490 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1492 err = unmerge_ksm_pages(vma,
1493 vma->vm_start, vma->vm_end);
1498 remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
1499 up_read(&mm->mmap_sem);
1501 spin_lock(&ksm_mmlist_lock);
1502 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
1503 struct mm_slot, mm_list);
1504 if (ksm_test_exit(mm)) {
1505 hash_del(&mm_slot->link);
1506 list_del(&mm_slot->mm_list);
1507 spin_unlock(&ksm_mmlist_lock);
1509 free_mm_slot(mm_slot);
1510 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1513 spin_unlock(&ksm_mmlist_lock);
1516 /* Clean up stable nodes, but don't worry if some are still busy */
1517 remove_all_stable_nodes();
1518 ksm_scan.scan_round = 0;
1522 up_read(&mm->mmap_sem);
1523 spin_lock(&ksm_mmlist_lock);
1524 ksm_scan.mm_slot = &ksm_mm_head;
1525 spin_unlock(&ksm_mmlist_lock);
1528 #endif /* CONFIG_SYSFS */
1530 static u32 calc_checksum(struct page *page)
1533 void *addr = kmap_atomic(page);
1534 checksum = xxhash(addr, PAGE_SIZE, 0);
1535 kunmap_atomic(addr);
1536 return lksm_clear_checksum_frozen(checksum);
1539 static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1542 struct mm_struct *mm = vma->vm_mm;
1543 struct page_vma_mapped_walk pvmw = {
1549 struct mmu_notifier_range range;
1551 pvmw.address = page_address_in_vma(page, vma);
1552 if (pvmw.address == -EFAULT)
1555 BUG_ON(PageTransCompound(page));
1557 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
1559 pvmw.address + PAGE_SIZE);
1560 mmu_notifier_invalidate_range_start(&range);
1562 if (!page_vma_mapped_walk(&pvmw))
1564 if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
1567 if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
1568 (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
1569 mm_tlb_flush_pending(mm)) {
1572 swapped = PageSwapCache(page);
1573 flush_cache_page(vma, pvmw.address, page_to_pfn(page));
1575 * Ok this is tricky: when get_user_pages_fast() runs it doesn't
1576 * take any lock, therefore the check that we are going to make
1577 * with the pagecount against the mapcount is racy and
1578 * O_DIRECT can happen right after the check.
1579 * So we clear the pte and flush the tlb before the check;
1580 * this assures us that no O_DIRECT can happen after the check
1581 * or in the middle of the check.
1583 * No need to notify as we are downgrading page table to read
1584 * only not changing it to point to a new page.
1586 * See Documentation/vm/mmu_notifier.rst
1588 entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
1590 * Check that no O_DIRECT or similar I/O is in progress on the
1593 if (page_mapcount(page) + 1 + swapped != page_count(page)) {
1594 set_pte_at(mm, pvmw.address, pvmw.pte, entry);
1597 if (pte_dirty(entry))
1598 set_page_dirty(page);
1600 if (pte_protnone(entry))
1601 entry = pte_mkclean(pte_clear_savedwrite(entry));
1603 entry = pte_mkclean(pte_wrprotect(entry));
1604 set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
1606 *orig_pte = *pvmw.pte;
1610 page_vma_mapped_walk_done(&pvmw);
1612 mmu_notifier_invalidate_range_end(&range);
1618 * replace_page - replace page in vma by new ksm page
1619 * @vma: vma that holds the pte pointing to page
1620 * @page: the page we are replacing by kpage
1621 * @kpage: the ksm page we replace page by
1622 * @orig_pte: the original value of the pte
1624 * Returns 0 on success, -EFAULT on failure.
1626 static int replace_page(struct vm_area_struct *vma, struct page *page,
1627 struct page *kpage, pte_t orig_pte)
1629 struct mm_struct *mm = vma->vm_mm;
1636 struct mmu_notifier_range range;
1638 addr = page_address_in_vma(page, vma);
1639 if (addr == -EFAULT)
1642 pmd = mm_find_pmd(mm, addr);
1646 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
1648 mmu_notifier_invalidate_range_start(&range);
1650 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
1651 if (!pte_same(*ptep, orig_pte)) {
1652 pte_unmap_unlock(ptep, ptl);
1657 * No need to check ksm_use_zero_pages here: we can only have a
1658 * zero_page here if ksm_use_zero_pages was already enabled.
1660 if (!is_zero_pfn(page_to_pfn(kpage))) {
1662 page_add_anon_rmap(kpage, vma, addr, false);
1663 newpte = mk_pte(kpage, vma->vm_page_prot);
1665 newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
1666 vma->vm_page_prot));
1668 * We're replacing an anonymous page with a zero page, which is
1669 * not anonymous. We need to do proper accounting otherwise we
1670 * will get wrong values in /proc, and a BUG message in dmesg
1671 * when tearing down the mm.
1673 dec_mm_counter(mm, MM_ANONPAGES);
1676 flush_cache_page(vma, addr, pte_pfn(*ptep));
1678 * No need to notify as we are replacing a read only page with another
1679 * read only page with the same content.
1681 * See Documentation/vm/mmu_notifier.rst
1683 ptep_clear_flush(vma, addr, ptep);
1684 set_pte_at_notify(mm, addr, ptep, newpte);
1686 page_remove_rmap(page, false);
1687 if (!page_mapped(page))
1688 try_to_free_swap(page);
1691 pte_unmap_unlock(ptep, ptl);
1694 mmu_notifier_invalidate_range_end(&range);
1700 * try_to_merge_one_page - take two pages and merge them into one
1701 * @vma: the vma that holds the pte pointing to page
1702 * @page: the PageAnon page that we want to replace with kpage
1703 * @kpage: the PageKsm page that we want to map instead of page,
1704 * or NULL the first time when we want to use page as kpage.
1706 * This function returns 0 if the pages were merged, -EFAULT otherwise.
1708 static int try_to_merge_one_page(struct vm_area_struct *vma,
1709 struct page *page, struct page *kpage)
1711 pte_t orig_pte = __pte(0);
1714 if (page == kpage) /* ksm page forked */
1717 if (!PageAnon(page))
1721 * We need the page lock to read a stable PageSwapCache in
1722 * write_protect_page(). We use trylock_page() instead of
1723 * lock_page() because we don't want to wait here - we
1724 * prefer to continue scanning and merging different pages,
1725 * then come back to this page when it is unlocked.
1727 if (!trylock_page(page))
1730 if (PageTransCompound(page)) {
1731 if (split_huge_page(page))
1736 * If this anonymous page is mapped only here, its pte may need
1737 * to be write-protected. If it's mapped elsewhere, all of its
1738 * ptes are necessarily already write-protected. But in either
1739 * case, we need to lock and check page_count is not raised.
1741 if (write_protect_page(vma, page, &orig_pte) == 0) {
1744 * While we hold page lock, upgrade page from
1745 * PageAnon+anon_vma to PageKsm+NULL stable_node:
1746 * stable_tree_insert() will update stable_node.
1748 set_page_stable_node(page, NULL);
1749 mark_page_accessed(page);
1751 * Page reclaim just frees a clean page with no dirty
1752 * ptes: make sure that the ksm page would be swapped.
1754 if (!PageDirty(page))
1757 } else if (pages_identical(page, kpage))
1758 err = replace_page(vma, page, kpage, orig_pte);
1761 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
1762 munlock_vma_page(page);
1763 if (!PageMlocked(kpage)) {
1766 mlock_vma_page(kpage);
1767 page = kpage; /* for final unlock */
1778 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
1779 * but no new kernel page is allocated: kpage must already be a ksm page.
1781 * This function returns 0 if the pages were merged, -EFAULT otherwise.
1783 static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
1784 struct page *page, struct page *kpage)
1786 struct mm_struct *mm = rmap_item->mm;
1787 struct vm_area_struct *vma;
1790 down_read(&mm->mmap_sem);
1791 vma = find_mergeable_vma(mm, rmap_item->address);
1795 err = try_to_merge_one_page(vma, page, kpage);
1799 /* Unstable nid is in union with stable anon_vma: remove first */
1800 remove_rmap_item_from_tree(rmap_item);
1802 #ifdef CONFIG_LKSM_FILTER
1803 /* node is removed from tree, base_addr can be safely used */
1804 rmap_item->base_addr = vma->vm_start;
1806 /* Must get reference to anon_vma while still holding mmap_sem */
1807 rmap_item->anon_vma = vma->anon_vma;
1808 get_anon_vma(vma->anon_vma);
1810 up_read(&mm->mmap_sem);
1815 * try_to_merge_two_pages - take two identical pages and prepare them
1816 * to be merged into one page.
1818 * This function returns the kpage if we successfully merged two identical
1819 * pages into one ksm page, NULL otherwise.
1821 * Note that this function upgrades page to ksm page: if one of the pages
1822 * is already a ksm page, try_to_merge_with_ksm_page should be used.
1824 static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
1826 struct rmap_item *tree_rmap_item,
1827 struct page *tree_page)
1831 err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
1833 err = try_to_merge_with_ksm_page(tree_rmap_item,
1836 * If that fails, we have a ksm page with only one pte
1837 * pointing to it: so break it.
1840 break_cow(rmap_item);
1842 return err ? NULL : page;
1845 static __always_inline
1846 bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
1848 VM_BUG_ON(stable_node->rmap_hlist_len < 0);
1850 * Check that at least one mapping still exists, otherwise
1851 * there's not much point in merging and sharing with this
1852 * stable_node, as the underlying tree_page of the other
1853 * sharer is going to be freed soon.
1855 return stable_node->rmap_hlist_len &&
1856 stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
1859 static __always_inline
1860 bool is_page_sharing_candidate(struct stable_node *stable_node)
1862 return __is_page_sharing_candidate(stable_node, 0);
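/*
 * Editor's example (not in the original source): with the default
 * ksm_max_page_sharing of 256, a stable_node dup whose rmap_hlist_len is
 * 255 still qualifies via __is_page_sharing_candidate(dup, 0), but fails
 * the stricter __is_page_sharing_candidate(dup, 1) check used below when
 * deciding whether to move a dup to the head of its chain.
 */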
1865 static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
1866 struct stable_node **_stable_node,
1867 struct rb_root *root,
1868 bool prune_stale_stable_nodes)
1870 struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
1871 struct hlist_node *hlist_safe;
1872 struct page *_tree_page, *tree_page = NULL;
1874 int found_rmap_hlist_len;
1876 if (!prune_stale_stable_nodes ||
1877 time_before(jiffies, stable_node->chain_prune_time +
1879 ksm_stable_node_chains_prune_millisecs)))
1880 prune_stale_stable_nodes = false;
1882 stable_node->chain_prune_time = jiffies;
1884 hlist_for_each_entry_safe(dup, hlist_safe,
1885 &stable_node->hlist, hlist_dup) {
1888 * We must walk all stable_node_dup to prune the stale
1889 * stable nodes during lookup.
1891 * get_ksm_page can drop the nodes from the
1892 * stable_node->hlist if they point to freed pages
1893 * (that's why we do a _safe walk). The "dup"
1894 * stable_node parameter itself will be freed from
1895 * under us if it returns NULL.
1897 _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
1901 if (is_page_sharing_candidate(dup)) {
1903 dup->rmap_hlist_len > found_rmap_hlist_len) {
1905 put_page(tree_page);
1907 found_rmap_hlist_len = found->rmap_hlist_len;
1908 tree_page = _tree_page;
1910 /* skip put_page for found dup */
1911 if (!prune_stale_stable_nodes)
1916 put_page(_tree_page);
1921 * nr is counting all dups in the chain only if
1922 * prune_stale_stable_nodes is true, otherwise we may
1923 * break the loop at nr == 1 even if there are
1926 if (prune_stale_stable_nodes && nr == 1) {
1928 * If there's not just one entry it would
1929 * corrupt memory, better BUG_ON. In KSM
1930 * context with no lock held it's not even
1933 BUG_ON(stable_node->hlist.first->next);
1936 * There's just one entry and it is below the
1937 * deduplication limit so drop the chain.
1939 rb_replace_node(&stable_node->node, &found->node,
1941 free_stable_node(stable_node);
1942 ksm_stable_node_chains--;
1943 ksm_stable_node_dups--;
1945 * NOTE: the caller depends on the stable_node
1946 * to be equal to stable_node_dup if the chain
1949 *_stable_node = found;
1951 * Just for robustness, as stable_node is
1952 * otherwise left as a stable pointer, the
1953 * compiler shall optimize it away at build
1957 } else if (stable_node->hlist.first != &found->hlist_dup &&
1958 __is_page_sharing_candidate(found, 1)) {
1960 * If the found stable_node dup can accept one
1961 * more future merge (in addition to the one
1962 * that is underway) and is not at the head of
1963 * the chain, put it there so next search will
1964 * be quicker in the !prune_stale_stable_nodes
1967 * NOTE: it would be inaccurate to use nr > 1
1968 * instead of checking the hlist.first pointer
1969 * directly, because in the
1970 * prune_stale_stable_nodes case "nr" isn't
1971 * the position of the found dup in the chain,
1972 * but the total number of dups in the chain.
1974 hlist_del(&found->hlist_dup);
1975 hlist_add_head(&found->hlist_dup,
1976 &stable_node->hlist);
1980 *_stable_node_dup = found;
1984 static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
1985 struct rb_root *root)
1987 if (!is_stable_node_chain(stable_node))
1989 if (hlist_empty(&stable_node->hlist)) {
1990 free_stable_node_chain(stable_node, root);
1993 return hlist_entry(stable_node->hlist.first,
1994 typeof(*stable_node), hlist_dup);
1998 * Like for get_ksm_page, this function can free the *_stable_node and
1999 * *_stable_node_dup if the returned tree_page is NULL.
2001 * It can also free and overwrite *_stable_node with the found
2002 * stable_node_dup if the chain is collapsed (in which case
2003 * *_stable_node will be equal to *_stable_node_dup like if the chain
2004 * never existed). It's up to the caller to verify tree_page is not
2005 * NULL before dereferencing *_stable_node or *_stable_node_dup.
2007 * *_stable_node_dup is really a second output parameter of this
2008 * function and will be overwritten in all cases, the caller doesn't
2009 * need to initialize it.
2011 static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
2012 struct stable_node **_stable_node,
2013 struct rb_root *root,
2014 bool prune_stale_stable_nodes)
2016 struct stable_node *stable_node = *_stable_node;
2017 if (!is_stable_node_chain(stable_node)) {
2018 if (is_page_sharing_candidate(stable_node)) {
2019 *_stable_node_dup = stable_node;
2020 return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
2023 * _stable_node_dup set to NULL means the stable_node
2024 * reached the ksm_max_page_sharing limit.
2026 *_stable_node_dup = NULL;
2029 return stable_node_dup(_stable_node_dup, _stable_node, root,
2030 prune_stale_stable_nodes);
2033 static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
2034 struct stable_node **s_n,
2035 struct rb_root *root)
2037 return __stable_node_chain(s_n_d, s_n, root, true);
2040 static __always_inline struct page *chain(struct stable_node **s_n_d,
2041 struct stable_node *s_n,
2042 struct rb_root *root)
2044 struct stable_node *old_stable_node = s_n;
2045 struct page *tree_page;
2047 tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
2048 /* not pruning dups so s_n cannot have changed */
2049 VM_BUG_ON(s_n != old_stable_node);
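/*
 * Usage sketch (illustrative only): given the contract documented above
 * __stable_node_chain(), a caller must check the returned page before
 * dereferencing either output pointer, which is what the tree walks
 * below do:
 *
 *	tree_page = chain_prune(&stable_node_dup, &stable_node, root);
 *	if (!tree_page)
 *		restart the walk: the nodes may already have been freed
 */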
2054 * stable_tree_search - search for page inside the stable tree
2056 * This function checks if there is a page inside the stable tree
2057 * with identical content to the page that we are scanning right now.
2059 * This function returns the stable tree node of identical content if found,
2062 static struct page *stable_tree_search(struct page *page)
2065 struct rb_root *root;
2066 struct rb_node **new;
2067 struct rb_node *parent;
2068 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
2069 struct stable_node *page_node;
2071 page_node = page_stable_node(page);
2072 if (page_node && page_node->head != &migrate_nodes) {
2073 /* ksm page forked */
2078 nid = get_kpfn_nid(page_to_pfn(page));
2079 root = root_stable_tree + nid;
2081 new = &root->rb_node;
2085 struct page *tree_page;
2089 stable_node = rb_entry(*new, struct stable_node, node);
2090 stable_node_any = NULL;
2091 tree_page = chain_prune(&stable_node_dup, &stable_node, root);
2093 * NOTE: stable_node may have been freed by
2094 * chain_prune() if the returned stable_node_dup is
2095 * not NULL. stable_node_dup may have been inserted in
2096 * the rbtree instead as a regular stable_node (in
2097 * order to collapse the stable_node chain if a single
2098 * stable_node dup was found in it). In such case the
2099 * stable_node is overwritten by the callee to point
2100 * to the stable_node_dup that was collapsed in the
2101 * stable rbtree and stable_node will be equal to
2102 * stable_node_dup like if the chain never existed.
2104 if (!stable_node_dup) {
2106 * Either all stable_node dups were full in
2107 * this stable_node chain, or this chain was
2108 * empty and should be rb_erased.
2110 stable_node_any = stable_node_dup_any(stable_node,
2112 if (!stable_node_any) {
2113 /* rb_erase just run */
2117 * Take any of the stable_node dups page of
2118 * this stable_node chain to let the tree walk
2119 * continue. All KSM pages belonging to the
2120 * stable_node dups in a stable_node chain
2121 * have the same content and they're
2122 * wrprotected at all times. Any will work
2123 * fine to continue the walk.
2125 tree_page = get_ksm_page(stable_node_any,
2126 GET_KSM_PAGE_NOLOCK);
2128 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
2131 * If we walked over a stale stable_node,
2132 * get_ksm_page() will call rb_erase() and it
2133 * may rebalance the tree from under us. So
2134 * restart the search from scratch. Returning
2135 * NULL would be safe too, but we'd generate
2136 * false negative insertions just because some
2137 * stable_node was stale.
2142 ret = memcmp_pages(page, tree_page);
2143 put_page(tree_page);
2147 new = &parent->rb_left;
2149 new = &parent->rb_right;
2152 VM_BUG_ON(page_node->head != &migrate_nodes);
2154 * Test if the migrated page should be merged
2155 * into a stable node dup. If the mapcount is
2156 * 1 we can migrate it with another KSM page
2157 * without adding it to the chain.
2159 if (page_mapcount(page) > 1)
2163 if (!stable_node_dup) {
2165 * If the stable_node is a chain and
2166 * we got a payload match in memcmp
2167 * but we cannot merge the scanned
2168 * page in any of the existing
2169 * stable_node dups because they're
2170 * all full, we need to wait for the
2171 * scanned page to find itself a match
2172 * in the unstable tree to create a
2173 * brand new KSM page to add later to
2174 * the dups of this stable_node.
2180 * Lock and unlock the stable_node's page (which
2181 * might already have been migrated) so that page
2182 * migration is sure to notice its raised count.
2183 * It would be more elegant to return stable_node
2184 * than kpage, but that involves more changes.
2186 tree_page = get_ksm_page(stable_node_dup,
2187 GET_KSM_PAGE_TRYLOCK);
2189 if (PTR_ERR(tree_page) == -EBUSY)
2190 return ERR_PTR(-EBUSY);
2192 if (unlikely(!tree_page))
2194 * The tree may have been rebalanced,
2195 * so re-evaluate parent and new.
2198 unlock_page(tree_page);
2200 if (get_kpfn_nid(stable_node_dup->kpfn) !=
2201 NUMA(stable_node_dup->nid)) {
2202 put_page(tree_page);
2212 list_del(&page_node->list);
2213 DO_NUMA(page_node->nid = nid);
2214 rb_link_node(&page_node->node, parent, new);
2215 rb_insert_color(&page_node->node, root);
2217 if (is_page_sharing_candidate(page_node)) {
2225 * If stable_node was a chain and chain_prune collapsed it,
2226 * stable_node has been updated to be the new regular
2227 * stable_node. A collapse of the chain is indistinguishable
2228 * from the case there was no chain in the stable
2229 * rbtree. Otherwise stable_node is the chain and
2230 * stable_node_dup is the dup to replace.
2232 if (stable_node_dup == stable_node) {
2233 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
2234 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
2235 /* there is no chain */
2237 VM_BUG_ON(page_node->head != &migrate_nodes);
2238 list_del(&page_node->list);
2239 DO_NUMA(page_node->nid = nid);
2240 rb_replace_node(&stable_node_dup->node,
2243 if (is_page_sharing_candidate(page_node))
2248 rb_erase(&stable_node_dup->node, root);
2252 VM_BUG_ON(!is_stable_node_chain(stable_node));
2253 __stable_node_dup_del(stable_node_dup);
2255 VM_BUG_ON(page_node->head != &migrate_nodes);
2256 list_del(&page_node->list);
2257 DO_NUMA(page_node->nid = nid);
2258 stable_node_chain_add_dup(page_node, stable_node);
2259 if (is_page_sharing_candidate(page_node))
2267 stable_node_dup->head = &migrate_nodes;
2268 list_add(&stable_node_dup->list, stable_node_dup->head);
2272 /* stable_node_dup could be null if it reached the limit */
2273 if (!stable_node_dup)
2274 stable_node_dup = stable_node_any;
2276 * If stable_node was a chain and chain_prune collapsed it,
2277 * stable_node has been updated to be the new regular
2278 * stable_node. A collapse of the chain is indistinguishable
2279 * from the case there was no chain in the stable
2280 * rbtree. Otherwise stable_node is the chain and
2281 * stable_node_dup is the dup to replace.
2283 if (stable_node_dup == stable_node) {
2284 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
2285 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
2286 /* chain is missing so create it */
2287 stable_node = alloc_stable_node_chain(stable_node_dup,
2293 * Add this stable_node dup that was
2294 * migrated to the stable_node chain
2295 * of the current nid for this page
2298 VM_BUG_ON(!is_stable_node_chain(stable_node));
2299 VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
2300 VM_BUG_ON(page_node->head != &migrate_nodes);
2301 list_del(&page_node->list);
2302 DO_NUMA(page_node->nid = nid);
2303 stable_node_chain_add_dup(page_node, stable_node);
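/*
 * Both the stable and the unstable tree are keyed purely by page
 * contents; the descent above (and the ones in stable_tree_insert() and
 * unstable_tree_search_insert() below) follows the usual rbtree pattern:
 *
 *	ret = memcmp_pages(page, tree_page);
 *	if (ret < 0)
 *		new = &parent->rb_left;
 *	else if (ret > 0)
 *		new = &parent->rb_right;
 *	else
 *		identical content was found: try to merge with it
 */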
2308 * stable_tree_insert - insert stable tree node pointing to new ksm page
2309 * into the stable tree.
2311 * This function returns the stable tree node just allocated on success,
2314 static struct stable_node *stable_tree_insert(struct page *kpage)
2318 struct rb_root *root;
2319 struct rb_node **new;
2320 struct rb_node *parent;
2321 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
2322 bool need_chain = false;
2324 kpfn = page_to_pfn(kpage);
2325 nid = get_kpfn_nid(kpfn);
2326 root = root_stable_tree + nid;
2329 new = &root->rb_node;
2332 struct page *tree_page;
2336 stable_node = rb_entry(*new, struct stable_node, node);
2337 stable_node_any = NULL;
2338 tree_page = chain(&stable_node_dup, stable_node, root);
2339 if (!stable_node_dup) {
2341 * Either all stable_node dups were full in
2342 * this stable_node chain, or this chain was
2343 * empty and should be rb_erased.
2345 stable_node_any = stable_node_dup_any(stable_node,
2347 if (!stable_node_any) {
2348 /* rb_erase just run */
2352 * Take any of the stable_node dups page of
2353 * this stable_node chain to let the tree walk
2354 * continue. All KSM pages belonging to the
2355 * stable_node dups in a stable_node chain
2356 * have the same content and they're
2357 * wrprotected at all times. Any will work
2358 * fine to continue the walk.
2360 tree_page = get_ksm_page(stable_node_any,
2361 GET_KSM_PAGE_NOLOCK);
2363 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
2366 * If we walked over a stale stable_node,
2367 * get_ksm_page() will call rb_erase() and it
2368 * may rebalance the tree from under us. So
2369 * restart the search from scratch. Returning
2370 * NULL would be safe too, but we'd generate
2371 * false negative insertions just because some
2372 * stable_node was stale.
2377 ret = memcmp_pages(kpage, tree_page);
2378 put_page(tree_page);
2382 new = &parent->rb_left;
2384 new = &parent->rb_right;
2391 stable_node_dup = alloc_stable_node();
2392 if (!stable_node_dup)
2395 INIT_HLIST_HEAD(&stable_node_dup->hlist);
2396 stable_node_dup->kpfn = kpfn;
2397 set_page_stable_node(kpage, stable_node_dup);
2398 stable_node_dup->rmap_hlist_len = 0;
2399 DO_NUMA(stable_node_dup->nid = nid);
2401 rb_link_node(&stable_node_dup->node, parent, new);
2402 rb_insert_color(&stable_node_dup->node, root);
2404 if (!is_stable_node_chain(stable_node)) {
2405 struct stable_node *orig = stable_node;
2406 /* chain is missing so create it */
2407 stable_node = alloc_stable_node_chain(orig, root);
2409 free_stable_node(stable_node_dup);
2413 stable_node_chain_add_dup(stable_node_dup, stable_node);
2416 return stable_node_dup;
2420 * unstable_tree_search_insert - search for identical page,
2421 * else insert rmap_item into the unstable tree.
2423 * This function searches for a page in the unstable tree identical to the
2424 * page currently being scanned; and if no identical page is found in the
2425 * tree, we insert rmap_item as a new object into the unstable tree.
2427 * This function returns pointer to rmap_item found to be identical
2428 * to the currently scanned page, NULL otherwise.
2430 * This function does both searching and inserting, because they share
2431 * the same walking algorithm in an rbtree.
2434 struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
2436 struct page **tree_pagep)
2438 struct rb_node **new;
2439 struct rb_root *root;
2440 struct rb_node *parent = NULL;
2443 nid = get_kpfn_nid(page_to_pfn(page));
2444 root = root_unstable_tree + nid;
2445 new = &root->rb_node;
2448 struct rmap_item *tree_rmap_item;
2449 struct page *tree_page;
2453 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
2454 tree_page = get_mergeable_page(tree_rmap_item);
2459 * Don't substitute a ksm page for a forked page.
2461 if (page == tree_page) {
2462 put_page(tree_page);
2466 ret = memcmp_pages(page, tree_page);
2470 put_page(tree_page);
2471 new = &parent->rb_left;
2472 } else if (ret > 0) {
2473 put_page(tree_page);
2474 new = &parent->rb_right;
2475 } else if (!ksm_merge_across_nodes &&
2476 page_to_nid(tree_page) != nid) {
2478 * If tree_page has been migrated to another NUMA node,
2479 * it will be flushed out and put in the right unstable
2480 * tree next time: only merge with it when across_nodes.
2482 put_page(tree_page);
2485 *tree_pagep = tree_page;
2486 return tree_rmap_item;
2490 rmap_item->address |= UNSTABLE_FLAG;
2491 rmap_item->address |= (ksm_scan.scan_round & SEQNR_MASK);
2492 DO_NUMA(rmap_item->nid = nid);
2493 rb_link_node(&rmap_item->node, parent, new);
2494 rb_insert_color(&rmap_item->node, root);
2496 #ifdef CONFIG_LKSM_FILTER
2497 rmap_item->region = ksm_scan.region;
2499 ksm_pages_unshared++;
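/*
 * rmap_item->address doubles as a small flag word: the page-aligned bits
 * hold the virtual address while the low bits carry UNSTABLE_FLAG or
 * STABLE_FLAG plus the scan-round sequence number.  Decomposing it is
 * therefore (illustrative sketch):
 *
 *	unsigned long addr	= rmap_item->address & ~KSM_FLAG_MASK;
 *	bool on_unstable_tree	= rmap_item->address & UNSTABLE_FLAG;
 *	unsigned long seqnr	= rmap_item->address & SEQNR_MASK;
 */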
2504 * stable_tree_append - add another rmap_item to the linked list of
2505 * rmap_items hanging off a given node of the stable tree, all sharing
2506 * the same ksm page.
2508 static void stable_tree_append(struct rmap_item *rmap_item,
2509 struct stable_node *stable_node,
2510 bool max_page_sharing_bypass)
2513 * rmap won't find this mapping if we don't insert the
2514 * rmap_item in the right stable_node
2515 * duplicate. page_migration could break later if rmap breaks,
2516 * so we can as well crash here. We really need to check for
2517 * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
2518 * for other negative values as an underflow if detected here
2519 * for the first time (and not when decreasing rmap_hlist_len)
2520 * would be sign of memory corruption in the stable_node.
2522 BUG_ON(stable_node->rmap_hlist_len < 0);
2524 stable_node->rmap_hlist_len++;
2525 if (!max_page_sharing_bypass)
2526 /* possibly non fatal but unexpected overflow, only warn */
2527 WARN_ON_ONCE(stable_node->rmap_hlist_len >
2528 ksm_max_page_sharing);
2530 rmap_item->head = stable_node;
2531 rmap_item->address |= STABLE_FLAG;
2532 hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
2534 if (rmap_item->hlist.next) {
2535 ksm_pages_sharing++;
2536 lksm_slot_nr_merged++;
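/*
 * Sketch of the invariant kept above: each appended rmap_item bumps
 * rmap_hlist_len, and unless max_page_sharing_bypass is set (KSM forks)
 * the count is expected to stay within ksm_max_page_sharing, the same
 * bound __is_page_sharing_candidate() applies when picking a dup:
 *
 *	will_fit = stable_node->rmap_hlist_len + offset
 *			< ksm_max_page_sharing;
 */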
2542 #ifdef CONFIG_LKSM_FILTER
2543 static inline void stable_tree_append_region(struct rmap_item *rmap_item,
2544 struct stable_node *stable_node,
2545 struct lksm_region *region,
2546 bool max_page_sharing_bypass)
2548 if (region->type == LKSM_REGION_FILE1
2549 || region->type == LKSM_REGION_FILE2) {
2551 unsigned long offset =
2552 (rmap_item->address - rmap_item->base_addr) >> PAGE_SHIFT;
2553 if (unlikely(region->filter_cnt == 0)
2554 && region->len > SINGLE_FILTER_LEN
2555 && !region->filter) {
2556 region->filter = kcalloc(region->len, sizeof(long), GFP_KERNEL);
2557 if (!region->filter) {
2558 ksm_err("failed to allocate memory for filter");
2562 if (region->len > SINGLE_FILTER_LEN)
2563 ret = test_and_set_bit(offset, region->filter);
2565 ret = test_and_set_bit(offset, &region->single_filter);
2567 region->filter_cnt++;
2569 region->merge_cnt++;
2570 region_share[region->type]++;
2572 stable_tree_append(rmap_item, stable_node, max_page_sharing_bypass);
2574 #endif /* CONFIG_LKSM_FILTER */
2577 * cmp_and_merge_page - first see if page can be merged into the stable tree;
2578 * if not, compare checksum to previous and if it's the same, see if page can
2579 * be inserted into the unstable tree, or merged with a page already there and
2580 * both transferred to the stable tree.
2582 * @page: the page that we are searching identical page to.
2583 * @rmap_item: the reverse mapping into the virtual address of this page
2585 static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
2587 struct mm_struct *mm = rmap_item->mm;
2588 struct rmap_item *tree_rmap_item;
2589 struct page *tree_page = NULL;
2590 struct stable_node *stable_node;
2592 unsigned int checksum;
2594 bool max_page_sharing_bypass = false;
2596 stable_node = page_stable_node(page);
2598 if (stable_node->head != &migrate_nodes &&
2599 get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
2600 NUMA(stable_node->nid)) {
2601 stable_node_dup_del(stable_node);
2602 stable_node->head = &migrate_nodes;
2603 list_add(&stable_node->list, stable_node->head);
2605 if (stable_node->head != &migrate_nodes &&
2606 rmap_item->head == stable_node)
2609 * If it's a KSM fork, allow it to go over the sharing limit
2612 if (!is_page_sharing_candidate(stable_node))
2613 max_page_sharing_bypass = true;
2616 /* We first start with searching the page inside the stable tree */
2617 kpage = stable_tree_search(page);
2618 if (kpage == page && rmap_item->head == stable_node) {
2623 remove_rmap_item_from_tree(rmap_item);
2626 if (PTR_ERR(kpage) == -EBUSY)
2629 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
2632 * The page was successfully merged:
2633 * add its rmap_item to the stable tree.
2636 #ifdef CONFIG_LKSM_FILTER
2637 stable_tree_append_region(rmap_item, page_stable_node(kpage),
2638 ksm_scan.region, max_page_sharing_bypass);
2640 stable_tree_append(rmap_item, page_stable_node(kpage),
2641 max_page_sharing_bypass);
2650 * LKSM: In LKSM, KSM runs in an event-triggered manner.
2651 * Because of that, scanning is performed much less frequently.
2652 * We just skip calculation of the checksum for LKSM to catch scanning
2655 if (ksm_scan.scan_round < initial_round
2656 && !lksm_test_rmap_frozen(rmap_item)) {
2657 checksum = calc_checksum(page);
2658 if (rmap_item->oldchecksum != checksum) {
2659 rmap_item->oldchecksum = checksum;
2665 * Same checksum as an empty page. We attempt to merge it with the
2666 * appropriate zero page if the user enabled this via sysfs.
2668 if (ksm_use_zero_pages && (checksum == zero_checksum)) {
2669 struct vm_area_struct *vma;
2671 down_read(&mm->mmap_sem);
2672 vma = find_mergeable_vma(mm, rmap_item->address);
2674 err = try_to_merge_one_page(vma, page,
2675 ZERO_PAGE(rmap_item->address));
2678 * If the vma is out of date, we do not need to
2683 up_read(&mm->mmap_sem);
2685 * In case of failure, the page was not really empty, so we
2686 * need to continue. Otherwise we're done.
2692 unstable_tree_search_insert(rmap_item, page, &tree_page);
2693 if (tree_rmap_item) {
2695 #ifdef CONFIG_LKSM_FILTER
2696 struct lksm_region *tree_region = tree_rmap_item->region;
2698 kpage = try_to_merge_two_pages(rmap_item, page,
2699 tree_rmap_item, tree_page);
2701 * If both pages we tried to merge belong to the same compound
2702 * page, then we actually ended up increasing the reference
2703 * count of the same compound page twice, and split_huge_page
2705 * Here we set a flag if that happened, and we use it later to
2706 * try split_huge_page again. Since we call put_page right
2707 * afterwards, the reference count will be correct and
2708 * split_huge_page should succeed.
2710 split = PageTransCompound(page)
2711 && compound_head(page) == compound_head(tree_page);
2712 put_page(tree_page);
2715 * The pages were successfully merged: insert new
2716 * node in the stable tree and add both rmap_items.
2719 stable_node = stable_tree_insert(kpage);
2721 #ifdef CONFIG_LKSM_FILTER
2722 stable_tree_append_region(tree_rmap_item, stable_node,
2723 tree_region, false);
2724 stable_tree_append_region(rmap_item, stable_node,
2725 ksm_scan.region, false);
2727 stable_tree_append(tree_rmap_item, stable_node,
2729 stable_tree_append(rmap_item, stable_node,
2736 * If we fail to insert the page into the stable tree,
2737 * we will have 2 virtual addresses that are pointing
2738 * to a ksm page left outside the stable tree,
2739 * in which case we need to break_cow on both.
2742 break_cow(tree_rmap_item);
2743 break_cow(rmap_item);
2744 #ifdef CONFIG_LKSM_FILTER
2745 tree_rmap_item->region = tree_region;
2746 rmap_item->region = ksm_scan.region;
2751 * We are here if we tried to merge two pages and
2752 * failed because they both belonged to the same
2753 * compound page. We will split the page now, but no
2754 * merging will take place.
2755 * We do not want to add the cost of a full lock; if
2756 * the page is locked, it is better to skip it and
2757 * perhaps try again later.
2759 if (!trylock_page(page))
2761 split_huge_page(page);
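/*
 * The zero-page short cut above only runs when the administrator has
 * enabled it; assuming the usual KSM sysfs layout, userspace can toggle
 * it with something like (illustrative):
 *
 *	int fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "1", 1);
 *		close(fd);
 *	}
 */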
2767 static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
2768 struct rmap_item **rmap_list,
2771 struct rmap_item *rmap_item;
2773 while (*rmap_list) {
2774 rmap_item = *rmap_list;
2775 if ((rmap_item->address & PAGE_MASK) == addr) {
2776 if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN)
2777 && rmap_item->address & UNSTABLE_FLAG)
2778 lksm_set_rmap_frozen(rmap_item);
2780 lksm_clear_rmap_frozen(rmap_item);
2783 if (rmap_item->address > addr)
2785 *rmap_list = rmap_item->rmap_list;
2786 remove_rmap_item_from_tree(rmap_item);
2787 free_rmap_item(rmap_item);
2790 rmap_item = alloc_rmap_item();
2792 /* It has already been zeroed */
2793 rmap_item->mm = mm_slot->mm;
2794 rmap_item->address = addr;
2795 rmap_item->rmap_list = *rmap_list;
2796 #ifdef CONFIG_LKSM_FILTER
2797 rmap_item->region = ksm_scan.region;
2799 *rmap_list = rmap_item;
2800 if (lksm_test_mm_state(mm_slot, FROZEN_BIT))
2801 lksm_set_rmap_frozen(rmap_item);
2803 lksm_clear_rmap_frozen(rmap_item);
2809 * lksm_flush_removed_mm_list:
2810 * flush out, in a batch, the mm_slots removed by lksm_remove_mm_slot
2812 static void lksm_flush_removed_mm_list(void)
2814 struct mm_slot *head, *next, *slot;
2816 spin_lock(&ksm_mmlist_lock);
2817 head = list_first_entry_or_null(&ksm_scan.remove_mm_list,
2818 struct mm_slot, mm_list);
2820 spin_unlock(&ksm_mmlist_lock);
2824 list_del_init(&ksm_scan.remove_mm_list);
2825 spin_unlock(&ksm_mmlist_lock);
2827 if (!list_empty(&head->mm_list)) {
2828 list_for_each_entry_safe(slot, next, &head->mm_list, mm_list) {
2829 ksm_debug("slot(%p) will be freed", slot);
2830 list_del(&slot->mm_list);
2834 remove_trailing_rmap_items(slot, &slot->rmap_list);
2835 #ifdef CONFIG_LKSM_FILTER
2836 lksm_region_ref_list_release(slot);
2838 clear_bit(MMF_VM_MERGEABLE, &slot->mm->flags);
2844 ksm_debug("slot(%p) will be freed", head);
2847 remove_trailing_rmap_items(head, &head->rmap_list);
2848 clear_bit(MMF_VM_MERGEABLE, &head->mm->flags);
2854 * remove mm_slot from lists
2855 * LKSM defers releasing their resources until the end of scanning
2857 static inline void lksm_remove_mm_slot(struct mm_slot *slot)
2859 hash_del(&slot->link);
2860 list_del_init(&slot->scan_list);
2861 list_move(&slot->mm_list, &ksm_scan.remove_mm_list);
2862 if (!RB_EMPTY_NODE(&slot->ordered_list)) {
2863 rb_erase(&slot->ordered_list, &vips_list);
2864 RB_CLEAR_NODE(&slot->ordered_list);
2868 /* caller must hold ksm_mmlist_lock */
2869 static struct mm_slot *lksm_get_unscanned_mm_slot(struct mm_slot *slot)
2871 struct mm_slot *next;
2873 list_for_each_entry_safe_continue(slot, next, &ksm_scan_head.scan_list,
2875 if (ksm_test_exit(slot->mm)) {
2876 ksm_debug("slot:%p %p is moved to remove list", slot, slot->mm);
2877 if (lksm_test_mm_state(slot, KSM_MM_FROZEN))
2878 atomic_dec(&ksm_scan.nr_frozen);
2880 atomic_dec(&ksm_scan.nr_scannable);
2881 lksm_remove_mm_slot(slot);
2885 lksm_nr_scanned_slot++;
2892 /* caller must hold ksm_mmlist_lock */
2893 static void lksm_insert_mm_slot_ordered(struct mm_slot *slot)
2895 struct rb_root *root;
2896 struct rb_node **new;
2897 struct rb_node *parent;
2898 struct mm_slot *temp_slot;
2902 new = &root->rb_node;
2905 temp_slot = rb_entry(*new, struct mm_slot, ordered_list);
2908 if (slot->nr_merged > temp_slot->nr_merged)
2909 new = &parent->rb_left;
2911 new = &parent->rb_right;
2914 rb_link_node(&slot->ordered_list, parent, new);
2915 rb_insert_color(&slot->ordered_list, root);
2918 #ifdef CONFIG_LKSM_FILTER
2920 * Most vmas grow upward, except the stack.
2921 * The given value of size must be the same as orig's.
2924 static inline void __lksm_copy_filter
2925 (unsigned long *orig, unsigned long *newer, int size)
2928 *(newer++) = *(orig++);
2931 static inline void lksm_copy_filter
2932 (struct lksm_region *region, unsigned long *filter)
2934 if (region->len > SINGLE_FILTER_LEN) {
2936 __lksm_copy_filter(region->filter, filter, region->len);
2938 __lksm_copy_filter(&region->single_filter, filter, region->len);
2941 static struct vm_area_struct *lksm_find_next_vma
2942 (struct mm_struct *mm, struct mm_slot *slot)
2944 struct vm_area_struct *vma;
2945 struct lksm_region *region;
2947 if (ksm_test_exit(mm))
2950 vma = find_vma(mm, ksm_scan.address);
2951 for (; vma; vma = vma->vm_next) {
2952 if (!(vma->vm_flags & VM_MERGEABLE))
2954 if (ksm_scan.address < vma->vm_start)
2955 ksm_scan.address = vma->vm_start;
2956 if (!vma->anon_vma) {
2957 ksm_scan.address = vma->vm_end;
2961 if (ksm_scan.cached_vma == vma)
2962 region = ksm_scan.region;
2964 region = lksm_find_region(vma);
2965 ksm_scan.cached_vma = vma;
2966 ksm_scan.vma_base_addr = vma->vm_start;
2969 if (!region || region->type == LKSM_REGION_CONFLICT)
2970 region = &unknown_region;
2971 else if (region->type != LKSM_REGION_HEAP
2972 && region->type != LKSM_REGION_CONFLICT
2973 && region->type != LKSM_REGION_UNKNOWN) {
2974 int size = lksm_region_size(vma->vm_start, vma->vm_end);
2975 int len = (size > BITS_PER_LONG) ? lksm_bitmap_size(size)
2976 : SINGLE_FILTER_LEN;
2978 if (len > SINGLE_FILTER_LEN && unlikely(region->len != len)) {
2980 if (region->conflict > 1) {
2981 region->type = LKSM_REGION_CONFLICT;
2982 if (region->len > SINGLE_FILTER_LEN)
2983 kfree(region->filter);
2984 region->filter = NULL;
2985 region->len = SINGLE_FILTER_LEN;
2986 /* conflicted regions will be unfiltered */
2987 region = &unknown_region;
2988 ksm_debug("the region is frequently conflicted. break.");
2991 if (region->len < len) {
2992 unsigned long *filter;
2993 ksm_debug("size of region(%p) is changed: %d -> %d (size: %d)",
2994 region, region->len, len, size);
2995 ksm_debug("region-%d type: %d vma:%p", region->ino, region->type, vma);
2996 filter = kcalloc(len, sizeof(long), GFP_KERNEL);
2998 ksm_err("failed to allocate memory for filter");
3001 if (region->filter_cnt > 0)
3002 lksm_copy_filter(region, filter);
3003 if (region->len > SINGLE_FILTER_LEN)
3004 kfree(region->filter);
3005 region->filter = filter;
3011 if (ksm_scan.region != region)
3012 ksm_scan.region = region;
3018 static inline unsigned long lksm_get_next_filtered_address
3019 (struct lksm_region *region, unsigned long addr, unsigned long base)
3021 unsigned long next_offset, curr_offset, nbits;
3023 curr_offset = (addr - base) >> PAGE_SHIFT;
3024 nbits = (region->len == 0) ? BITS_PER_LONG :
3025 (region->len << (6 + PAGE_SHIFT));
3026 if (region->len > SINGLE_FILTER_LEN)
3027 next_offset = find_next_bit(region->filter, nbits, curr_offset);
3029 next_offset = find_next_bit(&region->single_filter,
3030 nbits, curr_offset);
3032 return (next_offset << PAGE_SHIFT) + base;
3035 #define lksm_region_skipped(region) \
3036 (region->len > 0 && !region->filter)
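/*
 * The per-region filter is a bitmap with one bit per page offset inside
 * the region: offsets that produced a merge are set in
 * stable_tree_append_region(), and later scans only revisit set bits via
 * lksm_get_next_filtered_address().  The offset math, as used above, is
 * simply:
 *
 *	offset = (address - vma_base_address) >> PAGE_SHIFT;
 *
 * Small regions keep the bits in the inline single_filter word, larger
 * ones in the kcalloc()ed region->filter array.
 */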
3038 static struct rmap_item *__scan_next_rmap_item(struct page **page,
3039 struct mm_struct *mm, struct mm_slot *slot)
3041 struct vm_area_struct *vma;
3042 struct rmap_item *rmap_item;
3047 vma = lksm_find_next_vma(mm, slot);
3049 while (vma && ksm_scan.address < vma->vm_end) {
3050 if (ksm_test_exit(mm)) {
3054 if (!lksm_test_mm_state(slot, KSM_MM_NEWCOMER)
3055 && !lksm_test_mm_state(slot, KSM_MM_FROZEN)
3056 && ksm_scan.region->type != LKSM_REGION_HEAP
3057 && ksm_scan.region->type != LKSM_REGION_UNKNOWN
3058 && lksm_region_mature(ksm_scan.scan_round, ksm_scan.region)
3059 && !lksm_region_skipped(ksm_scan.region)) {
3060 if (ksm_scan.region->filter_cnt > 0) {
3061 addr = lksm_get_next_filtered_address(ksm_scan.region,
3062 ksm_scan.address, ksm_scan.vma_base_addr);
3063 ksm_scan.address = addr;
3064 if (ksm_scan.address >= vma->vm_end)
3066 if (ksm_scan.address < vma->vm_start) {
3067 ksm_debug("address(%lu) is less than vm_start(%lu)",
3068 ksm_scan.address, vma->vm_start);
3072 ksm_scan.address = vma->vm_end;
3076 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
3077 if (IS_ERR_OR_NULL(*page)) {
3078 ksm_scan.address += PAGE_SIZE;
3082 if (PageAnon(*page)) {
3083 flush_anon_page(vma, *page, ksm_scan.address);
3084 flush_dcache_page(*page);
3085 rmap_item = get_next_rmap_item(slot,
3086 ksm_scan.rmap_list, ksm_scan.address);
3088 ksm_scan.rmap_list =
3089 &rmap_item->rmap_list;
3090 ksm_scan.address += PAGE_SIZE;
3096 ksm_scan.address += PAGE_SIZE;
3101 /* clean-up a scanned region */
3102 ksm_scan.region = NULL;
3103 ksm_scan.cached_vma = NULL;
3104 ksm_scan.vma_base_addr = 0;
3106 return NULL; /* no scannable rmap item */
3109 #else /* CONFIG_LKSM_FILTER */
3111 static struct rmap_item *__scan_next_rmap_item(struct page **page,
3112 struct mm_struct *mm, struct mm_slot *slot)
3114 struct vm_area_struct *vma;
3115 struct rmap_item *rmap_item;
3117 if (ksm_test_exit(mm))
3120 vma = find_vma(mm, ksm_scan.address);
3122 for (; vma; vma = vma->vm_next) {
3123 if (!(vma->vm_flags & VM_MERGEABLE))
3125 if (ksm_scan.address < vma->vm_start)
3126 ksm_scan.address = vma->vm_start;
3128 ksm_scan.address = vma->vm_end;
3130 while (ksm_scan.address < vma->vm_end) {
3131 if (ksm_test_exit(mm))
3133 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
3134 if (IS_ERR_OR_NULL(*page)) {
3135 ksm_scan.address += PAGE_SIZE;
3139 if (PageAnon(*page)) {
3140 flush_anon_page(vma, *page, ksm_scan.address);
3141 flush_dcache_page(*page);
3142 rmap_item = get_next_rmap_item(slot,
3143 ksm_scan.rmap_list, ksm_scan.address);
3145 ksm_scan.rmap_list =
3146 &rmap_item->rmap_list;
3147 ksm_scan.address += PAGE_SIZE;
3153 ksm_scan.address += PAGE_SIZE;
3161 #endif /* CONFIG_LKSM_FILTER */
3163 static inline int sum_merge_win(int merge_win[], int len)
3167 for (i = 0; i < len; i++)
3168 sum += merge_win[i];
3172 static inline int lksm_account_mm_slot_nr_merge(struct mm_slot *slot, int nr_merged)
3174 slot->nr_merged_win[slot->merge_idx++] = nr_merged;
3175 if (slot->merge_idx == MERGE_WIN)
3176 slot->merge_idx = 0;
3177 slot->nr_merged = sum_merge_win(slot->nr_merged_win, MERGE_WIN);
3178 return slot->nr_merged;
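/*
 * nr_merged_win[] is a small ring buffer of MERGE_WIN per-scan merge
 * counts, so slot->nr_merged only ever reflects the most recent
 * MERGE_WIN scans.  For example (assuming MERGE_WIN == 3 purely for
 * illustration): scans merging 10, 0 and 4 pages yield nr_merged = 14;
 * a fourth scan merging 2 pages overwrites the oldest sample and yields
 * nr_merged = 6.
 */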
3181 static struct rmap_item *scan_get_next_rmap_item(struct page **page)
3183 struct mm_struct *mm;
3184 struct mm_slot *slot;
3185 struct rmap_item *rmap_item;
3187 if (list_empty(&ksm_scan_head.scan_list))
3190 slot = ksm_scan.mm_slot;
3191 if (slot == &ksm_scan_head) {
3193 * A number of pages can hang around indefinitely on per-cpu
3194 * pagevecs, raised page count preventing write_protect_page
3195 * from merging them. Though it doesn't really matter much,
3196 * it is puzzling to see some stuck in pages_volatile until
3197 * other activity jostles them out, and they also prevented
3198 * LTP's KSM test from succeeding deterministically; so drain
3199 * them here (here rather than on entry to ksm_do_scan(),
3200 * so we don't IPI too often when pages_to_scan is set low).
3202 lru_add_drain_all();
3204 if (ksm_scan.scan_round < ksm_crawl_round) {
3205 ksm_scan.scan_round = ksm_crawl_round;
3206 root_unstable_tree[LKSM_NODE_ID] = RB_ROOT;
3209 spin_lock(&ksm_mmlist_lock);
3210 slot = lksm_get_unscanned_mm_slot(slot);
3211 ksm_scan.mm_slot = slot;
3212 spin_unlock(&ksm_mmlist_lock);
3215 * Although we tested list_empty() above, a racing __ksm_exit
3216 * of the last mm on the list may have removed it since then.
3218 if (slot == &ksm_scan_head)
3221 slot->elapsed = get_jiffies_64();
3223 ksm_scan.address = 0;
3224 ksm_scan.rmap_list = &slot->rmap_list;
3227 if (unlikely(!ksm_scan.rmap_list))
3228 ksm_scan.rmap_list = &slot->rmap_list;
3232 down_read(&mm->mmap_sem);
3233 rmap_item = __scan_next_rmap_item(page, mm, slot);
3237 up_read(&mm->mmap_sem);
3241 if (ksm_test_exit(mm)) {
3242 ksm_scan.address = 0;
3243 ksm_scan.rmap_list = &slot->rmap_list;
3246 * Nuke all the rmap_items that are above this current rmap:
3247 * because there were no VM_MERGEABLE vmas with such addresses.
3249 remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
3251 spin_lock(&ksm_mmlist_lock);
3252 ksm_scan.mm_slot = lksm_get_unscanned_mm_slot(slot);
3254 if (ksm_scan.address == 0) {
3256 * We've completed a full scan of all vmas, holding mmap_sem
3257 * throughout, and found no VM_MERGEABLE: so do the same as
3258 * __ksm_exit does to remove this mm from all our lists now.
3259 * This applies either when cleaning up after __ksm_exit
3260 * (but beware: we can reach here even before __ksm_exit),
3261 * or when all VM_MERGEABLE areas have been unmapped (and
3262 * mmap_sem then protects against race with MADV_MERGEABLE).
3264 up_read(&mm->mmap_sem);
3265 if (lksm_test_mm_state(slot, KSM_MM_FROZEN))
3266 atomic_dec(&ksm_scan.nr_frozen);
3268 atomic_dec(&ksm_scan.nr_scannable);
3269 lksm_remove_mm_slot(slot);
3270 spin_unlock(&ksm_mmlist_lock);
3272 lksm_slot_nr_merged = 0;
3273 lksm_slot_nr_broken = 0;
3275 int newcomer = 0, frozen = 0;
3277 up_read(&mm->mmap_sem);
3279 if (lksm_test_mm_state(slot, KSM_MM_NEWCOMER)) {
3280 lksm_clear_mm_state(slot, KSM_MM_NEWCOMER);
3283 if (lksm_test_mm_state(slot, KSM_MM_FROZEN)) {
3284 lksm_clear_mm_state(slot, KSM_MM_FROZEN);
3286 atomic_dec(&ksm_scan.nr_frozen);
3288 atomic_dec(&ksm_scan.nr_scannable);
3289 lksm_set_mm_state(slot, KSM_MM_SCANNED);
3291 list_del_init(&slot->scan_list);
3292 if (!RB_EMPTY_NODE(&slot->ordered_list)) {
3293 rb_erase(&slot->ordered_list, &vips_list);
3294 RB_CLEAR_NODE(&slot->ordered_list);
3296 if (lksm_account_mm_slot_nr_merge(slot, lksm_slot_nr_merged))
3297 lksm_insert_mm_slot_ordered(slot);
3299 slot->elapsed = get_jiffies_64() - slot->elapsed;
3300 spin_unlock(&ksm_mmlist_lock);
3302 if (ksm_test_exit(slot->mm))
3303 ksm_debug("slot(%p:%p) is exited", slot, slot->mm);
3305 ksm_debug("slot-%d(%s) %d merged %d scanned %lu pages "
3306 "(sum: %d) - (%s, %s) takes %u msecs (nr_scannable: %d)",
3307 task_pid_nr(slot->mm->owner), slot->mm->owner->comm,
3308 lksm_slot_nr_merged - lksm_slot_nr_broken, slot->nr_scans,
3309 slot->scanning_size, slot->nr_merged,
3310 newcomer ? "new" : "old",
3311 frozen ? "frozen" : "normal",
3312 jiffies_to_msecs(slot->elapsed),
3313 atomic_read(&ksm_scan.nr_scannable));
3315 lksm_slot_nr_merged = 0;
3316 lksm_slot_nr_broken = 0;
3319 /* Repeat until we've completed scanning the whole list */
3320 slot = ksm_scan.mm_slot;
3321 if (slot != &ksm_scan_head) {
3322 slot->elapsed = get_jiffies_64();
3330 * ksm_do_scan - the ksm scanner main worker function.
3331 * @scan_npages: number of pages we want to scan before we return.
3333 static int ksm_do_scan(unsigned int scan_npages)
3335 struct rmap_item *rmap_item;
3336 struct page *uninitialized_var(page);
3338 while (scan_npages-- && likely(!freezing(current))) {
3340 rmap_item = scan_get_next_rmap_item(&page);
3342 return 1; /* need sleep */
3343 cmp_and_merge_page(page, rmap_item);
3349 static int ksmd_should_run(void)
3351 return (ksm_run & KSM_RUN_MERGE) &&
3352 !list_empty(&ksm_scan_head.scan_list);
3355 static void lksm_scan_wrapup_wait(void)
3357 if (ksm_scan.scan_mode == LKSM_SCAN_PARTIAL) {
3358 if (ksm_thread_pages_to_scan != lksm_default_pages_to_scan)
3359 ksm_thread_pages_to_scan = lksm_default_pages_to_scan;
3360 } else if (ksm_scan.scan_mode == LKSM_SCAN_FULL)
3361 ksm_scan.nr_full_scan++;
3365 lksm_nr_scanned_slot = 0;
3367 ksm_scan.scan_mode = LKSM_SCAN_NONE;
3368 if (ksm_run & KSM_RUN_ONESHOT)
3369 atomic_set(&ksm_one_shot_scanning, LKSM_SCAN_NONE);
3371 lksm_clear_scan_state(ksm_state);
3373 wait_event_freezable(ksm_thread_wait,
3374 (lksm_check_scan_state(ksm_state) && ksmd_should_run())
3375 || kthread_should_stop());
3378 static int lksm_scan_thread(void *nothing)
3380 unsigned long begin, elapsed;
3381 unsigned int sleep_ms;
3382 int need_to_sleep = 0;
3385 set_user_nice(current, 5);
3387 ksm_debug("KSM_SCAND pid: %d", task_pid_nr(current));
3388 while (!kthread_should_stop()) {
3389 mutex_lock(&ksm_thread_mutex);
3390 wait_while_offlining();
3391 if (ksmd_should_run())
3392 need_to_sleep = ksm_do_scan(ksm_thread_pages_to_scan);
3393 mutex_unlock(&ksm_thread_mutex);
3397 if (need_to_sleep) {
3398 if (!ksmd_should_run()) {
3399 /* if no one is left in the scanning list, go to sleep for a while */
3400 lksm_flush_removed_mm_list();
3402 elapsed = get_jiffies_64() - begin;
3403 lksm_last_scan_time = elapsed;
3404 lksm_proc_scan_time = elapsed / lksm_nr_scanned_slot;
3406 ksm_debug("Scanning(%d) takes %u ms, %d/%d-pages "
3407 "are merged/broken (nr_scannable: %d, nr_frozen: %d)",
3408 lksm_nr_scanned_slot,
3409 jiffies_to_msecs(lksm_last_scan_time),
3410 lksm_nr_merged, lksm_nr_broken,
3411 atomic_read(&ksm_scan.nr_scannable),
3412 atomic_read(&ksm_scan.nr_frozen));
3414 lksm_scan_wrapup_wait();
3416 ksm_debug("Start %lu-th scanning: nr_scannable(%d) "
3418 ksm_scan.scan_round,
3419 atomic_read(&ksm_scan.nr_scannable),
3420 atomic_read(&ksm_scan.nr_frozen));
3422 if (ksm_scan.scan_mode == LKSM_SCAN_PARTIAL) {
3423 if (lksm_boosted_pages_to_scan !=
3424 ksm_thread_pages_to_scan) {
3425 ksm_thread_pages_to_scan = lksm_boosted_pages_to_scan;
3426 ksm_debug("set pages_to_scan to %u",
3427 lksm_boosted_pages_to_scan);
3430 begin = get_jiffies_64();
3432 /* new scanning targets are coming */
3433 sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
3434 wait_event_interruptible_timeout(ksm_iter_wait,
3435 sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
3436 msecs_to_jiffies(sleep_ms));
3439 } else if (ksmd_should_run()) {
3441 sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
3442 wait_event_interruptible_timeout(ksm_iter_wait,
3443 sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
3444 msecs_to_jiffies(sleep_ms));
3446 /* wait for activating ksm */
3447 if (likely(ksm_scan.scan_round > 0)) {
3448 lksm_flush_removed_mm_list();
3450 elapsed = get_jiffies_64() - begin;
3451 lksm_last_scan_time = elapsed;
3452 lksm_proc_scan_time = elapsed / lksm_nr_scanned_slot;
3454 ksm_debug("Scanning(%d) takes %u ms, %d/%d-pages are merged/broken",
3455 lksm_nr_scanned_slot, jiffies_to_msecs(lksm_last_scan_time),
3456 lksm_nr_merged, lksm_nr_broken);
3458 lksm_scan_wrapup_wait();
3460 wait_event_freezable(ksm_thread_wait,
3461 (lksm_check_scan_state(ksm_state) && ksmd_should_run())
3462 || kthread_should_stop());
3464 ksm_debug("Start %lu-th scanning: nr_scannable(%d) nr_frozen(%d)",
3465 ksm_scan.scan_round,
3466 atomic_read(&ksm_scan.nr_scannable),
3467 atomic_read(&ksm_scan.nr_frozen));
3469 if (ksm_scan.scan_mode == LKSM_SCAN_PARTIAL) {
3470 ksm_thread_pages_to_scan = lksm_boosted_pages_to_scan;
3471 ksm_debug("set pages_to_scan to %u",
3472 lksm_boosted_pages_to_scan);
3474 begin = get_jiffies_64();
3481 * lksm crawler declaration & definition part
3483 static struct task_struct *ksm_crawld;
3485 LIST_HEAD(frozen_task_list);
3486 DEFINE_SPINLOCK(frozen_task_lock);
3492 static atomic_t crawl_state;
3495 LKSM_TASK_SLOT_NONE = 0,
3496 LKSM_TASK_SLOT_REMOVED,
3499 static inline int lksm_count_and_clear_mm_slots
3500 (struct mm_slot *head, unsigned long *delay)
3503 struct mm_slot *slot;
3505 spin_lock(&ksm_mmlist_lock);
3506 list_for_each_entry(slot, &head->mm_list, mm_list) {
3507 if (list_empty(&slot->scan_list)) {
3508 lksm_clear_mm_state(slot, KSM_MM_SCANNED);
3510 slot->scanning_size = get_mm_counter(slot->mm, MM_ANONPAGES);
3511 list_add_tail(&slot->scan_list, &ksm_scan_head.scan_list);
3512 *delay += slot->elapsed;
3516 spin_unlock(&ksm_mmlist_lock);
3520 static int lksm_prepare_frozen_scan(void)
3522 int nr_frozen = 0, nr_added = 0, err;
3523 struct task_struct *task;
3524 struct task_slot *task_slot;
3525 struct mm_struct *mm;
3527 spin_lock(&frozen_task_lock);
3528 nr_frozen = atomic_read(&ksm_scan.nr_frozen);
3529 if (list_empty(&frozen_task_list)) {
3530 spin_unlock(&frozen_task_lock);
3534 ksm_debug("prepare frozen scan: round(%lu)", ksm_crawl_round);
3535 task_slot = list_first_entry_or_null(&frozen_task_list,
3536 struct task_slot, list);
3538 list_del(&task_slot->list);
3539 hash_del(&task_slot->hlist);
3540 spin_unlock(&frozen_task_lock);
3542 task = task_slot->task;
3543 if (ksm_run & KSM_RUN_UNMERGE) {
3544 put_task_struct(task);
3545 free_task_slot(task_slot);
3546 goto clean_up_abort;
3549 mm = get_task_mm(task);
3551 if (!mm || ksm_test_exit(mm))
3555 ksm_join_write_lock(mm, task_slot->frozen, err);
3561 free_task_slot(task_slot);
3562 put_task_struct(task);
3568 spin_lock(&frozen_task_lock);
3569 task_slot = list_first_entry_or_null(&frozen_task_list,
3570 struct task_slot, list);
3572 spin_unlock(&frozen_task_lock);
3573 atomic_add(nr_added, &ksm_scan.nr_frozen);
3575 return nr_added + nr_frozen;
3578 spin_lock(&frozen_task_lock);
3579 task_slot = list_first_entry_or_null(&frozen_task_list,
3580 struct task_slot, list);
3582 list_del(&task_slot->list);
3583 hash_del(&task_slot->hlist);
3584 spin_unlock(&frozen_task_lock);
3586 task = task_slot->task;
3587 put_task_struct(task);
3588 free_task_slot(task_slot);
3590 spin_lock(&frozen_task_lock);
3591 task_slot = list_first_entry_or_null(&frozen_task_list,
3592 struct task_slot, list);
3594 spin_unlock(&frozen_task_lock);
3599 /* this function makes a list of new processes and vip processes */
3600 static int lksm_prepare_partial_scan(void)
3602 int ret, nr_frozen = 0, nr_added = 0, nr_scannable = 0;
3603 unsigned long delay = 0;
3604 unsigned long fault_cnt = 0;
3605 struct task_struct *task;
3606 struct mm_struct *mm;
3607 struct mm_slot *mm_slot;
3608 struct list_head recheck_list;
3609 struct rb_node *node;
3611 ksm_debug("prepare partial scan: round(%lu)", ksm_crawl_round);
3612 INIT_LIST_HEAD(&recheck_list);
3614 nr_frozen = lksm_prepare_frozen_scan();
3617 for_each_process(task) {
3618 if (task == current || task_pid_nr(task) == 0
3619 || check_short_task(task))
3621 if (ksm_run & KSM_RUN_UNMERGE) {
3626 mm = get_task_mm(task);
3629 ksm_join_write_lock(mm, KSM_TASK_UNFROZEN, ret);
3636 if (nr_added + nr_frozen >= lksm_max_vips) {
3637 ksm_debug("nr_scannable(%d) already fulfilled skip vips",
3638 nr_added + nr_frozen);
3642 spin_lock(&ksm_mmlist_lock);
3643 node = rb_first(&vips_list);
3645 ksm_debug("empty vip list");
3646 spin_unlock(&ksm_mmlist_lock);
3649 mm_slot = rb_entry(node, struct mm_slot, ordered_list);
3650 while (nr_scannable + nr_added + nr_frozen < lksm_max_vips) {
3651 if (ksm_run & KSM_RUN_UNMERGE) {
3652 spin_unlock(&ksm_mmlist_lock);
3658 if (ksm_test_exit(mm_slot->mm)) {
3659 if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED))
3660 atomic_dec(&ksm_scan.nr_scannable);
3661 lksm_remove_mm_slot(mm_slot);
3664 if (!lksm_test_mm_state(mm_slot, KSM_MM_LISTED))
3667 /* pruning by fault count */
3668 fault_cnt = mm_slot->mm->owner->maj_flt + mm_slot->mm->owner->min_flt;
3669 if (mm_slot->fault_cnt == fault_cnt)
3672 mm_slot->fault_cnt = fault_cnt;
3673 mm_slot->scanning_size = get_mm_counter(mm_slot->mm, MM_ANONPAGES);
3674 mm_slot->nr_scans = 0;
3675 delay += mm_slot->elapsed;
3676 ksm_debug("slot(nr_merged: %d, scanning_size: %lu) task(%s)",
3677 mm_slot->nr_merged, mm_slot->scanning_size,
3678 mm_slot->mm->owner->comm);
3679 list_move_tail(&mm_slot->scan_list, &recheck_list);
3680 lksm_clear_mm_state(mm_slot, KSM_MM_SCANNED);
3681 #ifdef CONFIG_LKSM_FILTER
3682 /* to prevent mm_slot termination on __ksm_exit */
3683 lksm_set_mm_state(mm_slot, KSM_MM_PREPARED);
3688 node = rb_next(node);
3691 mm_slot = rb_entry(node, struct mm_slot, ordered_list);
3693 spin_unlock(&ksm_mmlist_lock);
3694 #ifdef CONFIG_LKSM_FILTER
3695 list_for_each_entry(mm_slot, &recheck_list, scan_list) {
3696 if (ksm_test_exit(mm_slot->mm))
3698 mm_slot->nr_scans = 0;
3699 /* check new maps */
3700 down_read(&mm_slot->mm->mmap_sem);
3701 ksm_join(mm_slot->mm, KSM_TASK_UNFROZEN);
3702 up_read(&mm_slot->mm->mmap_sem);
3706 spin_lock(&ksm_mmlist_lock);
3707 if (!list_empty(&recheck_list)) {
3708 #ifdef CONFIG_LKSM_FILTER
3709 list_for_each_entry(mm_slot, &recheck_list, scan_list)
3710 lksm_clear_mm_state(mm_slot, KSM_MM_PREPARED);
3712 list_splice(&recheck_list, &ksm_scan_head.scan_list);
3714 spin_unlock(&ksm_mmlist_lock);
3716 ksm_scan.scan_mode = LKSM_SCAN_PARTIAL;
3719 atomic_add(nr_scannable + nr_added, &ksm_scan.nr_scannable);
3720 ksm_debug("nr_frozen: %d nr_added: %d nr_scannable: %d - %d",
3721 nr_frozen, nr_added, nr_scannable, atomic_read(&ksm_scan.nr_scannable));
3723 return nr_frozen + nr_added + nr_scannable;
3726 static int lksm_prepare_full_scan(unsigned long *next_fullscan)
3728 int ret, nr_frozen = 0, nr_added = 0, nr_scannable = 0, nr_target;
3729 unsigned long delay = 0;
3730 struct task_struct *task;
3731 struct mm_struct *mm;
3733 ksm_debug("prepare full scan: round(%lu)", ksm_crawl_round);
3735 nr_frozen = lksm_prepare_frozen_scan();
3737 for_each_process(task) {
3738 if (task == current || task_pid_nr(task) == 0
3739 || check_short_task(task))
3741 if (ksm_run & KSM_RUN_UNMERGE) {
3746 mm = get_task_mm(task);
3749 ksm_join_write_lock(mm, KSM_TASK_UNFROZEN, ret);
3755 nr_scannable = lksm_count_and_clear_mm_slots(&ksm_mm_head, &delay);
3756 nr_target = nr_scannable + nr_added + nr_frozen;
3758 /* calculate crawler's sleep time */
3759 delay += msecs_to_jiffies((nr_frozen + nr_added) * lksm_proc_scan_time);
3760 *next_fullscan = jiffies + delay + msecs_to_jiffies(full_scan_interval);
3762 ksm_scan.scan_mode = LKSM_SCAN_FULL;
3765 atomic_add(nr_scannable + nr_added, &ksm_scan.nr_scannable);
3766 ksm_debug("nr_frozen: %d nr_added: %d nr_scannable: %d - %d",
3767 nr_frozen, nr_added, nr_scannable,
3768 atomic_read(&ksm_scan.nr_scannable));
3773 static int lksm_do_wait_userspace_event(unsigned long sleep_time)
3775 wait_event_freezable(ksm_crawl_wait,
3776 kthread_should_stop() ||
3777 (atomic_read(&ksm_one_shot_scanning) > 0));
3778 return atomic_read(&ksm_one_shot_scanning);
3781 static int lksm_do_wait_frozen_event(unsigned long sleep_time)
3785 spin_lock_irq(&frozen_task_lock);
3786 if (list_empty(&frozen_task_list))
3787 /* wait until candidate list is filled */
3788 wait_event_interruptible_lock_irq_timeout(
3790 kthread_should_stop()
3791 || !list_empty(&frozen_task_list)
3792 || !list_empty(&ksm_scan_head.scan_list),
3793 frozen_task_lock, sleep_time);
3795 if (!list_empty(&frozen_task_list) ||
3796 !list_empty(&ksm_scan_head.scan_list))
3798 spin_unlock_irq(&frozen_task_lock);
3803 static inline void lksm_wake_up_scan_thread(void)
3805 ksm_debug("wake up lksm_scan_thread");
3806 lksm_set_scan_state(ksm_state);
3807 wake_up(&ksm_thread_wait);
3810 #define LKSM_CRAWL_FROZEN_EVENT_WAIT 100 /* 100ms */
3812 static void lksm_do_crawl_once
3813 (unsigned long *next_fscan, unsigned long sleep_time)
3818 /* crawler thread waits for a trigger event from userspace */
3819 scan_mode = lksm_do_wait_userspace_event(sleep_time);
3821 if (scan_mode == LKSM_SCAN_PARTIAL) {
3822 atomic_set(&crawl_state, KSM_CRAWL_RUN);
3823 msleep(LKSM_CRAWL_FROZEN_EVENT_WAIT);
3824 nr_added = lksm_prepare_partial_scan();
3825 } else if (scan_mode == LKSM_SCAN_FULL) {
3826 atomic_set(&crawl_state, KSM_CRAWL_RUN);
3827 nr_added = lksm_prepare_full_scan(next_fscan);
3833 lksm_wake_up_scan_thread();
3835 ksm_debug("No one can be scanned!");
3836 atomic_set(&ksm_one_shot_scanning, LKSM_SCAN_NONE);
3838 atomic_set(&crawl_state, KSM_CRAWL_SLEEP);
3841 static void lksm_do_crawl_periodic
3842 (unsigned long *next_fscan, unsigned long sleep_time)
3846 if (time_is_before_eq_jiffies(*next_fscan)) {
3847 atomic_set(&crawl_state, KSM_CRAWL_RUN);
3848 nr_added = lksm_prepare_full_scan(next_fscan);
3849 } else if (lksm_do_wait_frozen_event(sleep_time)) {
3850 atomic_set(&crawl_state, KSM_CRAWL_RUN);
3851 msleep(LKSM_CRAWL_FROZEN_EVENT_WAIT);
3852 nr_added = lksm_prepare_partial_scan();
3858 lksm_wake_up_scan_thread();
3859 atomic_set(&crawl_state, KSM_CRAWL_SLEEP);
3862 static int lksm_crawl_thread(void *data)
3865 unsigned long next_fscan = jiffies; /* next full scan */
3866 unsigned long sleep_time = crawler_sleep;
3869 set_user_nice(current, 5);
3871 ksm_debug("KSM_CRAWLD pid: %d", task_pid_nr(current));
3872 wait_event_freezable(ksm_crawl_wait,
3873 kthread_should_stop() || ksm_run & KSM_RUN_MERGE);
3875 while (!kthread_should_stop() && ksm_crawl_round < initial_round) {
3879 if ((ksm_run & KSM_RUN_MERGE) &&
3880 !lksm_check_scan_state(ksm_state) &&
3881 time_is_before_eq_jiffies(next_fscan)) {
3882 nr_added = lksm_prepare_full_scan(&next_fscan);
3884 lksm_wake_up_scan_thread();
3887 next_fscan = jiffies + sleep_time;
3890 wait_event_interruptible_timeout(ksm_crawl_wait,
3891 kthread_should_stop() || !lksm_check_scan_state(ksm_state),
3895 /* initialization loop done */
3896 full_scan_interval = DEFAULT_FULL_SCAN_INTERVAL;
3897 next_fscan = jiffies + msecs_to_jiffies(full_scan_interval);
3898 atomic_set(&crawl_state, KSM_CRAWL_SLEEP);
3900 /* normal operation loop */
3901 while (!kthread_should_stop()) {
3902 if (ksm_run & KSM_RUN_ONESHOT) {
3903 if (!lksm_check_scan_state(ksm_state))
3904 lksm_do_crawl_once(&next_fscan, sleep_time);
3906 /* wait until scanning done */
3907 wait_event_freezable(ksm_crawl_wait,
3908 !lksm_check_scan_state(ksm_state)
3909 || kthread_should_stop());
3910 } else if (ksm_run & KSM_RUN_MERGE) {
3911 if (!lksm_check_scan_state(ksm_state))
3912 lksm_do_crawl_periodic(&next_fscan, sleep_time);
3914 /* wait until scanning done */
3915 wait_event_interruptible_timeout(ksm_crawl_wait,
3916 !lksm_check_scan_state(ksm_state)
3917 || kthread_should_stop(),
3921 ksm_debug("ksm is not activated");
3922 wait_event_freezable(ksm_crawl_wait,
3923 kthread_should_stop() || (ksm_run & KSM_RUN_MERGE));
3930 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
3931 unsigned long end, int advice, unsigned long *vm_flags)
3933 struct mm_struct *mm = vma->vm_mm;
3937 case MADV_MERGEABLE:
3939 * Be somewhat over-protective for now!
3941 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
3942 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
3943 VM_HUGETLB | VM_MIXEDMAP))
3944 return 0; /* just ignore the advice */
3946 if (vma_is_dax(vma))
3950 if (*vm_flags & VM_SAO)
3954 if (*vm_flags & VM_SPARC_ADI)
3958 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
3959 err = __ksm_enter(mm, KSM_TASK_UNFROZEN);
3964 *vm_flags |= VM_MERGEABLE;
3967 case MADV_UNMERGEABLE:
3968 if (!(*vm_flags & VM_MERGEABLE))
3969 return 0; /* just ignore the advice */
3971 if (vma->anon_vma) {
3972 err = unmerge_ksm_pages(vma, start, end);
3977 *vm_flags &= ~VM_MERGEABLE;
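/*
 * Userspace opt-in sketch (illustrative): an anonymous mapping becomes
 * eligible for merging with MADV_MERGEABLE and can be withdrawn again
 * with MADV_UNMERGEABLE, which is what the two cases above implement:
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(buf, len, MADV_MERGEABLE);
 *	...
 *	madvise(buf, len, MADV_UNMERGEABLE);
 */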
3984 static struct mm_slot *__ksm_enter_alloc_slot(struct mm_struct *mm, int frozen)
3986 struct mm_slot *mm_slot;
3988 mm_slot = alloc_mm_slot();
3992 if (frozen == KSM_TASK_FROZEN)
3993 lksm_set_mm_state(mm_slot, KSM_MM_FROZEN | KSM_MM_NEWCOMER);
3995 lksm_set_mm_state(mm_slot, KSM_MM_LISTED | KSM_MM_NEWCOMER);
3997 lksm_clear_mm_state(mm_slot, KSM_MM_SCANNED);
3998 RB_CLEAR_NODE(&mm_slot->ordered_list);
3999 mm_slot->fault_cnt = mm->owner->maj_flt + mm->owner->min_flt;
4000 mm_slot->scanning_size = get_mm_counter(mm, MM_ANONPAGES);
4002 spin_lock(&ksm_mmlist_lock);
4003 insert_to_mm_slots_hash(mm, mm_slot);
4005 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
4006 * insert just behind the scanning cursor, to let the area settle
4007 * down a little; when fork is followed by immediate exec, we don't
4008 * want ksmd to waste time setting up and tearing down an rmap_list.
4010 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
4011 * scanning cursor, otherwise KSM pages in newly forked mms will be
4012 * missed: then we might as well insert at the end of the list.
4014 if (ksm_run & KSM_RUN_UNMERGE)
4015 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
4017 list_add_tail(&mm_slot->scan_list, &ksm_scan_head.scan_list);
4018 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
4020 ksm_nr_added_process++;
4021 spin_unlock(&ksm_mmlist_lock);
4022 #ifdef CONFIG_LKSM_FILTER
4023 INIT_LIST_HEAD(&mm_slot->ref_list);
4025 set_bit(MMF_VM_MERGEABLE, &mm->flags);
4026 atomic_inc(&mm->mm_count);
4031 int __ksm_enter(struct mm_struct *mm, int frozen)
4033 if (!__ksm_enter_alloc_slot(mm, frozen))
4038 void __ksm_exit(struct mm_struct *mm)
4040 struct mm_slot *mm_slot;
4041 int easy_to_free = 0;
4044 * This process is exiting: if it's straightforward (as is the
4045 * case when ksmd was never running), free mm_slot immediately.
4046 * But if it's at the cursor or has rmap_items linked to it, use
4047 * mmap_sem to synchronize with any break_cows before pagetables
4048 * are freed, and leave the mm_slot on the list for ksmd to free.
4049 * Beware: ksm may already have noticed it exiting and freed the slot.
4052 spin_lock(&ksm_mmlist_lock);
4053 mm_slot = get_mm_slot(mm);
4055 spin_unlock(&ksm_mmlist_lock);
4059 if (ksm_scan.mm_slot != mm_slot) {
4060 #ifdef CONFIG_LKSM_FILTER
4061 if (lksm_test_mm_state(mm_slot, KSM_MM_PREPARED))
4062 goto deferring_free;
4064 if (!mm_slot->rmap_list) {
4065 hash_del(&mm_slot->link);
4066 list_del(&mm_slot->mm_list);
4067 list_del(&mm_slot->scan_list);
4068 if (!RB_EMPTY_NODE(&mm_slot->ordered_list)) {
4069 rb_erase(&mm_slot->ordered_list, &vips_list);
4070 RB_CLEAR_NODE(&mm_slot->ordered_list);
4074 lksm_remove_mm_slot(mm_slot);
4075 if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN)) {
4076 atomic_dec(&ksm_scan.nr_frozen);
4077 ksm_debug("nr_frozen: %d", atomic_read(&ksm_scan.nr_frozen));
4078 } else if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED)) {
4079 atomic_dec(&ksm_scan.nr_scannable);
4080 ksm_debug("nr_scannable: %d", atomic_read(&ksm_scan.nr_scannable));
4083 #ifdef CONFIG_LKSM_FILTER
4086 ksm_nr_added_process--;
4087 spin_unlock(&ksm_mmlist_lock);
4090 #ifdef CONFIG_LKSM_FILTER
4091 lksm_region_ref_list_release(mm_slot);
4093 free_mm_slot(mm_slot);
4094 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
4096 } else if (mm_slot) {
4097 down_write(&mm->mmap_sem);
4098 up_write(&mm->mmap_sem);
4102 struct page *ksm_might_need_to_copy(struct page *page,
4103 struct vm_area_struct *vma, unsigned long address)
4105 struct anon_vma *anon_vma = page_anon_vma(page);
4106 struct page *new_page;
4108 if (PageKsm(page)) {
4109 if (page_stable_node(page) &&
4110 !(ksm_run & KSM_RUN_UNMERGE))
4111 return page; /* no need to copy it */
4112 } else if (!anon_vma) {
4113 return page; /* no need to copy it */
4114 } else if (anon_vma->root == vma->anon_vma->root &&
4115 page->index == linear_page_index(vma, address)) {
4116 return page; /* still no need to copy it */
4118 if (!PageUptodate(page))
4119 return page; /* let do_swap_page report the error */
4121 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
4123 copy_user_highpage(new_page, page, address, vma);
4125 SetPageDirty(new_page);
4126 __SetPageUptodate(new_page);
4127 __SetPageLocked(new_page);
4133 void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
4135 struct stable_node *stable_node;
4136 struct rmap_item *rmap_item;
4137 int search_new_forks = 0;
4139 VM_BUG_ON_PAGE(!PageKsm(page), page);
4142 * Rely on the page lock to protect against concurrent modifications
4143 * to that page's node of the stable tree.
4145 VM_BUG_ON_PAGE(!PageLocked(page), page);
4147 stable_node = page_stable_node(page);
4151 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
4152 struct anon_vma *anon_vma = rmap_item->anon_vma;
4153 struct anon_vma_chain *vmac;
4154 struct vm_area_struct *vma;
4157 anon_vma_lock_read(anon_vma);
4158 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
4165 /* Ignore the stable/unstable/sqnr flags */
4166 addr = rmap_item->address & ~KSM_FLAG_MASK;
4168 if (addr < vma->vm_start || addr >= vma->vm_end)
4171 * Initially we examine only the vma which covers this
4172 * rmap_item; but later, if there is still work to do,
4173 * we examine covering vmas in other mms: in case they
4174 * were forked from the original since ksmd passed.
4176 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
4179 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
4182 if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
4183 anon_vma_unlock_read(anon_vma);
4186 if (rwc->done && rwc->done(page)) {
4187 anon_vma_unlock_read(anon_vma);
4191 anon_vma_unlock_read(anon_vma);
4193 if (!search_new_forks++)
4197 bool reuse_ksm_page(struct page *page,
4198 struct vm_area_struct *vma,
4199 unsigned long address)
4201 #ifdef CONFIG_DEBUG_VM
4202 if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
4203 WARN_ON(!page_mapped(page)) ||
4204 WARN_ON(!PageLocked(page))) {
4205 dump_page(page, "reuse_ksm_page");
4210 if (PageSwapCache(page) || !page_stable_node(page))
4212 /* Prohibit parallel get_ksm_page() */
4213 if (!page_ref_freeze(page, 1))
4216 page_move_anon_rmap(page, vma);
4217 page->index = linear_page_index(vma, address);
4218 page_ref_unfreeze(page, 1);
4222 #ifdef CONFIG_MIGRATION
4223 void ksm_migrate_page(struct page *newpage, struct page *oldpage)
4225 struct stable_node *stable_node;
4227 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
4228 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
4229 VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
4231 stable_node = page_stable_node(newpage);
4233 VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
4234 stable_node->kpfn = page_to_pfn(newpage);
4236 * newpage->mapping was set in advance; now we need smp_wmb()
4237 * to make sure that the new stable_node->kpfn is visible
4238 * to get_ksm_page() before it can see that oldpage->mapping
4239 * has gone stale (or that PageSwapCache has been cleared).
4242 set_page_stable_node(oldpage, NULL);
4245 #endif /* CONFIG_MIGRATION */
4247 #ifdef CONFIG_MEMORY_HOTREMOVE
4248 static void wait_while_offlining(void)
4250 while (ksm_run & KSM_RUN_OFFLINE) {
4251 mutex_unlock(&ksm_thread_mutex);
4252 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
4253 TASK_UNINTERRUPTIBLE);
4254 mutex_lock(&ksm_thread_mutex);
4258 static bool stable_node_dup_remove_range(struct stable_node *stable_node,
4259 unsigned long start_pfn,
4260 unsigned long end_pfn)
4262 if (stable_node->kpfn >= start_pfn &&
4263 stable_node->kpfn < end_pfn) {
4265 * Don't get_ksm_page, page has already gone:
4266 * which is why we keep kpfn instead of page*
4268 remove_node_from_stable_tree(stable_node);
4274 static bool stable_node_chain_remove_range(struct stable_node *stable_node,
4275 unsigned long start_pfn,
4276 unsigned long end_pfn,
4277 struct rb_root *root)
4279 struct stable_node *dup;
4280 struct hlist_node *hlist_safe;
4282 if (!is_stable_node_chain(stable_node)) {
4283 VM_BUG_ON(is_stable_node_dup(stable_node));
4284 return stable_node_dup_remove_range(stable_node, start_pfn,
4288 hlist_for_each_entry_safe(dup, hlist_safe,
4289 &stable_node->hlist, hlist_dup) {
4290 VM_BUG_ON(!is_stable_node_dup(dup));
4291 stable_node_dup_remove_range(dup, start_pfn, end_pfn);
4293 if (hlist_empty(&stable_node->hlist)) {
4294 free_stable_node_chain(stable_node, root);
4295 return true; /* notify caller that tree was rebalanced */
4300 static void ksm_check_stable_tree(unsigned long start_pfn,
4301 unsigned long end_pfn)
4303 struct stable_node *stable_node, *next;
4304 struct rb_node *node;
4307 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
4308 node = rb_first(root_stable_tree + nid);
4310 stable_node = rb_entry(node, struct stable_node, node);
4311 if (stable_node_chain_remove_range(stable_node,
4315 node = rb_first(root_stable_tree + nid);
4317 node = rb_next(node);
4321 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
4322 if (stable_node->kpfn >= start_pfn &&
4323 stable_node->kpfn < end_pfn)
4324 remove_node_from_stable_tree(stable_node);
4329 static int ksm_memory_callback(struct notifier_block *self,
4330 unsigned long action, void *arg)
4332 struct memory_notify *mn = arg;
4335 case MEM_GOING_OFFLINE:
4337 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
4338 * and remove_all_stable_nodes() while memory is going offline:
4339 * it is unsafe for them to touch the stable tree at this time.
4340 * But unmerge_ksm_pages(), rmap lookups and other entry points
4341 * which do not need the ksm_thread_mutex are all safe.
4343 mutex_lock(&ksm_thread_mutex);
4344 ksm_run |= KSM_RUN_OFFLINE;
4345 mutex_unlock(&ksm_thread_mutex);
4350 * Most of the work is done by page migration; but there might
4351 * be a few stable_nodes left over, still pointing to struct
4352 * pages which have been offlined: prune those from the tree,
4353 * otherwise get_ksm_page() might later try to access a
4354 * non-existent struct page.
4356 ksm_check_stable_tree(mn->start_pfn,
4357 mn->start_pfn + mn->nr_pages);
4360 case MEM_CANCEL_OFFLINE:
4361 mutex_lock(&ksm_thread_mutex);
4362 ksm_run &= ~KSM_RUN_OFFLINE;
4363 mutex_unlock(&ksm_thread_mutex);
4365 smp_mb(); /* wake_up_bit advises this */
4366 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
4372 static void wait_while_offlining(void)
4375 #endif /* CONFIG_MEMORY_HOTREMOVE */
4379 * This all compiles without CONFIG_SYSFS, but is a waste of space.
4382 #define KSM_ATTR_RO(_name) \
4383 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
4384 #define KSM_ATTR(_name) \
4385 static struct kobj_attribute _name##_attr = \
4386 __ATTR(_name, 0644, _name##_show, _name##_store)
4388 static ssize_t sleep_millisecs_show(struct kobject *kobj,
4389 struct kobj_attribute *attr, char *buf)
4391 return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
4394 static ssize_t sleep_millisecs_store(struct kobject *kobj,
4395 struct kobj_attribute *attr,
4396 const char *buf, size_t count)
4398 unsigned long msecs;
4401 err = kstrtoul(buf, 10, &msecs);
4402 if (err || msecs > UINT_MAX)
4405 ksm_thread_sleep_millisecs = msecs;
4406 wake_up_interruptible(&ksm_iter_wait);
4410 KSM_ATTR(sleep_millisecs);
4412 static ssize_t pages_to_scan_show(struct kobject *kobj,
4413 struct kobj_attribute *attr, char *buf)
4415 return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
4418 static ssize_t pages_to_scan_store(struct kobject *kobj,
4419 struct kobj_attribute *attr,
4420 const char *buf, size_t count)
4423 unsigned long nr_pages;
4425 err = kstrtoul(buf, 10, &nr_pages);
4426 if (err || nr_pages > UINT_MAX)
4429 ksm_thread_pages_to_scan = nr_pages;
4433 KSM_ATTR(pages_to_scan);
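/*
 * A minimal usage sketch, assuming the attribute group ends up under the
 * usual /sys/kernel/mm/ksm/ directory (see the sysfs registration in
 * ksm_init() below):
 *
 *   echo 20  > /sys/kernel/mm/ksm/sleep_millisecs   # pause between scan batches
 *   echo 100 > /sys/kernel/mm/ksm/pages_to_scan     # pages scanned per batch
 */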
4435 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
4438 if (ksm_run & KSM_RUN_ONESHOT)
4439 return sprintf(buf, "%u\n", KSM_RUN_ONESHOT);
4441 return sprintf(buf, "%lu\n", ksm_run);
4444 static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
4445 const char *buf, size_t count)
4448 unsigned long flags;
4450 err = kstrtoul(buf, 10, &flags);
4451 if (err || flags > UINT_MAX)
4453 if (flags > KSM_RUN_ONESHOT)
4457 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
4458 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
4459 * breaking COW to free the pages_shared (but leaves mm_slots
4460 * on the list for when ksmd may be set running again).
4463 mutex_lock(&ksm_thread_mutex);
4464 wait_while_offlining();
4465 if (ksm_run != flags) {
4466 if (flags == KSM_RUN_ONESHOT)
4467 ksm_run = KSM_RUN_MERGE | KSM_RUN_ONESHOT;
4470 if (flags & KSM_RUN_UNMERGE) {
4471 set_current_oom_origin();
4472 err = unmerge_and_remove_all_rmap_items();
4473 clear_current_oom_origin();
4475 ksm_run = KSM_RUN_STOP;
4480 mutex_unlock(&ksm_thread_mutex);
4482 if (ksm_run & KSM_RUN_MERGE) {
4483 ksm_debug("activate KSM");
4484 wake_up(&ksm_crawl_wait);
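/*
 * An illustrative example of driving the run knob, assuming the same
 * /sys/kernel/mm/ksm/ path as above:
 *
 *   echo 1 > /sys/kernel/mm/ksm/run   # KSM_RUN_MERGE: start ksmd
 *   echo 0 > /sys/kernel/mm/ksm/run   # KSM_RUN_STOP: stop, keep merged pages
 *   echo 2 > /sys/kernel/mm/ksm/run   # KSM_RUN_UNMERGE: stop and break COW
 */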
4492 static ssize_t merge_across_nodes_show(struct kobject *kobj,
4493 struct kobj_attribute *attr, char *buf)
4495 return sprintf(buf, "%u\n", ksm_merge_across_nodes);
4498 static ssize_t merge_across_nodes_store(struct kobject *kobj,
4499 struct kobj_attribute *attr,
4500 const char *buf, size_t count)
4505 err = kstrtoul(buf, 10, &knob);
4511 mutex_lock(&ksm_thread_mutex);
4512 wait_while_offlining();
4513 if (ksm_merge_across_nodes != knob) {
4514 if (ksm_pages_shared || remove_all_stable_nodes())
4516 else if (root_stable_tree == one_stable_tree) {
4517 struct rb_root *buf;
4519 * This is the first time that we switch away from the
4520 * default of merging across nodes: must now allocate
4521 * a buffer to hold as many roots as may be needed.
4522 * Allocate stable and unstable together:
4523 * MAXSMP NODES_SHIFT 10 will use 16kB.
4525 buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
4527 /* Assume an empty RB_ROOT is all zeroes, so the zeroed kcalloc() buffer already holds valid empty trees */
4531 root_stable_tree = buf;
4532 root_unstable_tree = buf + nr_node_ids;
4533 /* Stable tree is empty but not the unstable */
4534 root_unstable_tree[0] = one_unstable_tree[0];
4538 ksm_merge_across_nodes = knob;
4539 ksm_nr_node_ids = knob ? 1 : nr_node_ids;
4542 mutex_unlock(&ksm_thread_mutex);
4544 return err ? err : count;
4546 KSM_ATTR(merge_across_nodes);
4549 static ssize_t use_zero_pages_show(struct kobject *kobj,
4550 struct kobj_attribute *attr, char *buf)
4552 return sprintf(buf, "%u\n", ksm_use_zero_pages);
4554 static ssize_t use_zero_pages_store(struct kobject *kobj,
4555 struct kobj_attribute *attr,
4556 const char *buf, size_t count)
4561 err = kstrtobool(buf, &value);
4565 ksm_use_zero_pages = value;
4569 KSM_ATTR(use_zero_pages);
4571 static ssize_t max_page_sharing_show(struct kobject *kobj,
4572 struct kobj_attribute *attr, char *buf)
4574 return sprintf(buf, "%u\n", ksm_max_page_sharing);
4577 static ssize_t max_page_sharing_store(struct kobject *kobj,
4578 struct kobj_attribute *attr,
4579 const char *buf, size_t count)
4584 err = kstrtoint(buf, 10, &knob);
4588 * When a KSM page is created it is shared by 2 mappings. This
4589 * being a signed comparison, it implicitly verifies it's not
4595 if (READ_ONCE(ksm_max_page_sharing) == knob)
4598 mutex_lock(&ksm_thread_mutex);
4599 wait_while_offlining();
4600 if (ksm_max_page_sharing != knob) {
4601 if (ksm_pages_shared || remove_all_stable_nodes())
4604 ksm_max_page_sharing = knob;
4606 mutex_unlock(&ksm_thread_mutex);
4608 return err ? err : count;
4610 KSM_ATTR(max_page_sharing);
4612 static ssize_t pages_shared_show(struct kobject *kobj,
4613 struct kobj_attribute *attr, char *buf)
4615 return sprintf(buf, "%lu\n", ksm_pages_shared);
4617 KSM_ATTR_RO(pages_shared);
4619 static ssize_t pages_sharing_show(struct kobject *kobj,
4620 struct kobj_attribute *attr, char *buf)
4622 return sprintf(buf, "%lu\n", ksm_pages_sharing);
4624 KSM_ATTR_RO(pages_sharing);
4626 static ssize_t pages_unshared_show(struct kobject *kobj,
4627 struct kobj_attribute *attr, char *buf)
4629 return sprintf(buf, "%lu\n", ksm_pages_unshared);
4631 KSM_ATTR_RO(pages_unshared);
4633 static ssize_t pages_volatile_show(struct kobject *kobj,
4634 struct kobj_attribute *attr, char *buf)
4636 long ksm_pages_volatile;
4638 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
4639 - ksm_pages_sharing - ksm_pages_unshared;
4641 * It was not worth any locking to calculate that statistic,
4642 * but it might therefore sometimes be negative: conceal that.
4644 if (ksm_pages_volatile < 0)
4645 ksm_pages_volatile = 0;
4646 return sprintf(buf, "%ld\n", ksm_pages_volatile);
4648 KSM_ATTR_RO(pages_volatile);
4650 static ssize_t stable_node_dups_show(struct kobject *kobj,
4651 struct kobj_attribute *attr, char *buf)
4653 return sprintf(buf, "%lu\n", ksm_stable_node_dups);
4655 KSM_ATTR_RO(stable_node_dups);
4657 static ssize_t stable_node_chains_show(struct kobject *kobj,
4658 struct kobj_attribute *attr, char *buf)
4660 return sprintf(buf, "%lu\n", ksm_stable_node_chains);
4662 KSM_ATTR_RO(stable_node_chains);
4665 stable_node_chains_prune_millisecs_show(struct kobject *kobj,
4666 struct kobj_attribute *attr,
4669 return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
4673 stable_node_chains_prune_millisecs_store(struct kobject *kobj,
4674 struct kobj_attribute *attr,
4675 const char *buf, size_t count)
4677 unsigned long msecs;
4680 err = kstrtoul(buf, 10, &msecs);
4681 if (err || msecs > UINT_MAX)
4684 ksm_stable_node_chains_prune_millisecs = msecs;
4688 KSM_ATTR(stable_node_chains_prune_millisecs);
4690 static ssize_t full_scans_show(struct kobject *kobj,
4691 struct kobj_attribute *attr, char *buf)
4693 return sprintf(buf, "%lu\n", ksm_scan.nr_full_scan);
4695 KSM_ATTR_RO(full_scans);
4697 static ssize_t scanning_process_show(struct kobject *kobj,
4698 struct kobj_attribute *attr, char *buf)
4700 return sprintf(buf, "%u\n", ksm_nr_added_process);
4702 KSM_ATTR_RO(scanning_process);
4704 static ssize_t full_scan_interval_show(struct kobject *kobj,
4705 struct kobj_attribute *attr, char *buf)
4707 return sprintf(buf, "%lu\n", full_scan_interval);
4710 static ssize_t full_scan_interval_store(struct kobject *kobj,
4711 struct kobj_attribute *attr, const char *buf, size_t count)
4714 unsigned long interval;
4716 err = kstrtoul(buf, 10, &interval);
4717 if (err || interval > UINT_MAX)
4720 full_scan_interval = interval;
4723 KSM_ATTR(full_scan_interval);
4725 static ssize_t one_shot_scanning_show(struct kobject *kobj,
4726 struct kobj_attribute *attr, char *buf)
4728 return sprintf(buf, "%d\n", atomic_read(&ksm_one_shot_scanning));
4731 static ssize_t one_shot_scanning_store(struct kobject *kobj,
4732 struct kobj_attribute *attr, const char *buf, size_t count)
4736 err = kstrtoint(buf, 10, &val);
4737 if (err || (val != LKSM_SCAN_PARTIAL && val != LKSM_SCAN_FULL)) {
4738 ksm_err("wrong value: %d", val);
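/*
 * atomic_cmpxchg() returns the previous value, so a result equal to
 * LKSM_SCAN_NONE (presumably zero, given the ! test below) means no
 * one-shot scan was pending and this request won the race; only then is
 * the crawler woken.
 */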
4742 if (!atomic_cmpxchg(&ksm_one_shot_scanning, LKSM_SCAN_NONE, val)) {
4743 wake_up(&ksm_crawl_wait);
4746 ksm_debug("ksm is still scanning");
4749 KSM_ATTR(one_shot_scanning);
4751 static ssize_t scan_boost_show(struct kobject *kobj,
4752 struct kobj_attribute *attr, char *buf)
4754 return sprintf(buf, "%u\n", lksm_boosted_pages_to_scan);
4757 static ssize_t scan_boost_store(struct kobject *kobj,
4758 struct kobj_attribute *attr, const char *buf, size_t count)
4762 err = kstrtoint(buf, 10, &val);
4763 /* lksm_boosted_pages_to_scan must be in the range 100 to 10000 */
4764 if (err || val < 100 || val > 10000) {
4765 ksm_err("wrong value: %d", val);
4769 lksm_boosted_pages_to_scan = (unsigned int) val;
4773 KSM_ATTR(scan_boost);
4775 #ifdef CONFIG_LKSM_FILTER
4776 static ssize_t nr_regions_show(struct kobject *kobj,
4777 struct kobj_attribute *attr, char *buf)
4779 return sprintf(buf, "%u\n", lksm_nr_regions);
4781 KSM_ATTR_RO(nr_regions);
4783 static ssize_t region_share_show(struct kobject *obj,
4784 struct kobj_attribute *attr, char *buf)
4786 return sprintf(buf, "%s:%d %s:%d %s:%d %s:%d %s:%d\n",
4787 region_type_str[0], region_share[0], region_type_str[1], region_share[1],
4788 region_type_str[2], region_share[2], region_type_str[3], region_share[3],
4789 region_type_str[4], region_share[4]);
4791 KSM_ATTR_RO(region_share);
4792 #endif /* CONFIG_LKSM_FILTER */
4794 static struct attribute *ksm_attrs[] = {
4795 &sleep_millisecs_attr.attr,
4796 &pages_to_scan_attr.attr,
4798 &pages_shared_attr.attr,
4799 &pages_sharing_attr.attr,
4800 &pages_unshared_attr.attr,
4801 &pages_volatile_attr.attr,
4802 &full_scans_attr.attr,
4804 &merge_across_nodes_attr.attr,
4806 &max_page_sharing_attr.attr,
4807 &stable_node_chains_attr.attr,
4808 &stable_node_dups_attr.attr,
4809 &stable_node_chains_prune_millisecs_attr.attr,
4810 &use_zero_pages_attr.attr,
4811 &scanning_process_attr.attr,
4812 &full_scan_interval_attr.attr,
4813 &one_shot_scanning_attr.attr,
4814 &scan_boost_attr.attr,
4815 #ifdef CONFIG_LKSM_FILTER
4816 &nr_regions_attr.attr,
4817 &region_share_attr.attr,
4822 static const struct attribute_group ksm_attr_group = {
4826 #endif /* CONFIG_SYSFS */
4828 #ifdef CONFIG_LKSM_FILTER
4829 static inline void init_lksm_region
4830 (struct lksm_region *region, unsigned long ino, int type, int len)
4833 region->type = type;
4837 /* allocate the region (and a FILE2 companion when needed) and add it to the region hash if not already present */
4838 static void lksm_insert_region
4839 (struct lksm_region **region, unsigned long ino,
4840 struct vm_area_struct *vma, int type)
4842 int size, len, need_hash_add = 0;
4843 struct lksm_region *next = NULL;
4844 unsigned long flags;
4846 size = lksm_region_size(vma->vm_start, vma->vm_end);
4848 len = (size > BITS_PER_LONG) ? lksm_bitmap_size(size) : SINGLE_FILTER_LEN;
4851 *region = kzalloc(sizeof(struct lksm_region), GFP_KERNEL);
4853 ksm_err("region allocation failed");
4856 init_lksm_region(*region, ino, LKSM_REGION_FILE1, len);
4857 (*region)->scan_round = ksm_crawl_round;
4858 atomic_set(&(*region)->refcount, 0);
4863 if (!(*region)->next && type == LKSM_REGION_FILE2) {
4864 next = kzalloc(sizeof(struct lksm_region), GFP_KERNEL);
4869 ksm_err("region allocation failed");
4872 init_lksm_region(next, ino, LKSM_REGION_FILE2, len);
4873 atomic_set(&next->refcount, 0);
4874 next->scan_round = ksm_crawl_round;
4878 if (need_hash_add || next) {
4879 spin_lock_irqsave(&lksm_region_lock, flags);
4881 hash_add(lksm_region_hash, &(*region)->hnode, ino);
4883 (*region)->next = next;
4884 next->prev = *region;
4886 spin_unlock_irqrestore(&lksm_region_lock, flags);
4890 static inline struct lksm_region *lksm_hash_find_region(unsigned long ino)
4892 struct lksm_region *region;
4894 hash_for_each_possible(lksm_region_hash, region, hnode, ino)
4895 if (region->ino == ino)
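/*
 * Region typing used below and in lksm_find_region(): a vma with its own
 * vm_file becomes LKSM_REGION_FILE1, while an anonymous vma whose
 * preceding mergeable vma is file-backed (either the directly adjacent
 * mapping, or one found within LKSM_REGION_ITER_MAX steps back for the
 * "DLL map" layout) is registered against that file as LKSM_REGION_FILE2.
 */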
4900 static void lksm_register_file_anon_region
4901 (struct mm_slot *slot, struct vm_area_struct *vma)
4903 struct lksm_region *region;
4904 struct file *file = NULL;
4905 struct inode *inode;
4906 unsigned long flags;
4910 file = vma->vm_file;
4911 type = LKSM_REGION_FILE1;
4912 } else if (vma->vm_prev) {
4913 /* LKSM should deal with .NET libraries */
4914 struct vm_area_struct *prev = vma->vm_prev;
4915 if (prev->vm_flags & VM_MERGEABLE && prev->vm_file) {
4916 /* Linux standard map structure */
4917 file = prev->vm_file;
4918 type = LKSM_REGION_FILE2;
4920 /* DLL map structure */
4923 while (i <= LKSM_REGION_ITER_MAX && prev) {
4925 file = prev->vm_file;
4926 else if (prev->vm_file && file != prev->vm_file)
4929 if (prev->vm_flags & VM_MERGEABLE && file) {
4933 prev = prev->vm_prev;
4937 type = LKSM_REGION_FILE2;
4944 inode = file_inode(file);
4947 spin_lock_irqsave(&lksm_region_lock, flags);
4948 region = lksm_hash_find_region(inode->i_ino);
4949 spin_unlock_irqrestore(&lksm_region_lock, flags);
4951 lksm_insert_region(&region, inode->i_ino, vma, type);
4953 if (type == LKSM_REGION_FILE1)
4954 lksm_region_ref_append(slot, region);
4956 lksm_region_ref_append(slot, region->next);
4961 static struct lksm_region *lksm_find_region(struct vm_area_struct *vma)
4963 struct lksm_region *region = NULL;
4964 struct file *file = NULL;
4965 struct inode *inode;
4966 unsigned long ino = 0, flags;
4970 return &heap_region;
4971 else if (is_stack(vma))
4973 else if (!vma->anon_vma)
4975 else if (is_exec(vma))
4979 /* check thread stack */
4980 file = vma->vm_file;
4981 type = LKSM_REGION_FILE1;
4982 } else if (vma->vm_prev) {
4983 struct vm_area_struct *prev = vma->vm_prev;
4984 if (prev->vm_flags & VM_MERGEABLE && prev->vm_file) {
4985 /* Linux standard map structure */
4986 file = prev->vm_file;
4987 type = LKSM_REGION_FILE2;
4989 /* DLL map structure */
4992 while (i <= LKSM_REGION_ITER_MAX && prev) {
4994 file = prev->vm_file;
4995 else if (prev->vm_file && file != prev->vm_file)
4998 if (prev->vm_flags & VM_MERGEABLE && file) {
5002 prev = prev->vm_prev;
5006 type = LKSM_REGION_FILE2;
5013 inode = file_inode(file);
5017 if (ksm_scan.region && ksm_scan.region->ino == ino) {
5018 if (ksm_scan.region->type == type)
5019 return ksm_scan.region;
5020 else if (ksm_scan.region->type == LKSM_REGION_FILE1)
5021 region = ksm_scan.region;
5023 spin_lock_irqsave(&lksm_region_lock, flags);
5024 region = lksm_hash_find_region(ino);
5025 spin_unlock_irqrestore(&lksm_region_lock, flags);
5029 if (region && type == LKSM_REGION_FILE2) {
5030 if (!region->next) {
5031 ksm_debug("region(%p:%lu:%s)-vma(%p) doesn't have next area (file: %p)",
5032 region, ino, region_type_str[region->type], vma, file);
5033 lksm_insert_region(&region, ino, vma, type);
5034 BUG_ON(!region->next);
5036 return region->next;
5040 #endif /* CONFIG_LKSM_FILTER */
5042 static inline int __lksm_remove_candidate(struct task_struct *task)
5044 int ret = LKSM_TASK_SLOT_NONE;
5045 struct task_slot *slot = get_task_slot(task);
5048 list_del(&slot->list);
5049 hash_del(&slot->hlist);
5050 free_task_slot(slot);
5051 ret = LKSM_TASK_SLOT_REMOVED;
5056 /* called by ksm_exit */
5057 void lksm_remove_candidate(struct mm_struct *mm)
5062 struct mm_slot *mm_slot;
5064 spin_lock(&ksm_mmlist_lock);
5065 mm_slot = get_mm_slot(mm);
5066 if (mm_slot && mm_slot != ksm_scan.mm_slot) {
5067 list_move(&mm_slot->mm_list, &ksm_scan.remove_mm_list);
5068 if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN))
5069 atomic_dec(&ksm_scan.nr_frozen);
5070 else if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED))
5071 atomic_dec(&ksm_scan.nr_scannable);
5072 ksm_debug("mm_slot: %p will be exited", mm_slot);
5074 spin_unlock(&ksm_mmlist_lock);
5078 if (!ksm_test_exit(mm))
5079 ksm_debug("proc-%d(%s) will be removed",
5080 task_pid_nr(mm->owner), mm->owner->comm);
5082 ksm_debug("proc-%d(%s) has exited", task_pid_nr(mm->owner), mm->owner->comm);
5083 spin_lock(&frozen_task_lock);
5084 ret = __lksm_remove_candidate(mm->owner);
5085 spin_unlock(&frozen_task_lock);
5086 if (ret == LKSM_TASK_SLOT_REMOVED)
5087 put_task_struct(mm->owner);
5090 static int lksm_task_frozen(struct task_struct *task)
5092 int need_wakeup = 0;
5093 struct mm_struct *mm = task->mm;
5094 struct mm_slot *mm_slot;
5095 struct task_slot *task_slot;
5097 if (mm && test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
5098 /* a mergeable task becoming frozen */
5099 spin_lock(&ksm_mmlist_lock);
5100 mm_slot = get_mm_slot(mm);
5103 if (mm_slot != ksm_scan.mm_slot
5104 && lksm_test_mm_state(mm_slot, KSM_MM_LISTED)) {
5105 if (list_empty(&mm_slot->scan_list))
5106 list_add_tail(&mm_slot->scan_list, &ksm_scan_head.scan_list);
5107 if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED))
5108 atomic_dec(&ksm_scan.nr_scannable);
5109 lksm_clear_mm_state(mm_slot, KSM_MM_LISTED);
5110 lksm_set_mm_state(mm_slot, KSM_MM_FROZEN);
5111 atomic_inc(&ksm_scan.nr_frozen);
5113 need_wakeup = (ksm_run == KSM_RUN_MERGE);
5114 ksm_debug("lksm_task_frozen called for task(%s): %p (nr_frozen: %d)",
5115 task->comm, task, atomic_read(&ksm_scan.nr_frozen));
5117 spin_unlock(&ksm_mmlist_lock);
5119 task_slot = alloc_task_slot();
5121 ksm_err("[ksm_tizen] Cannot allocate memory for task_slot");
5125 task_slot->task = task;
5126 task_slot->frozen = KSM_TASK_FROZEN;
5127 task_slot->inserted = jiffies;
5129 get_task_struct(task);
5131 spin_lock(&frozen_task_lock);
5132 list_add(&task_slot->list, &frozen_task_list);
5133 insert_to_task_slots_hash(task_slot);
5134 spin_unlock(&frozen_task_lock);
5136 need_wakeup = (ksm_run == KSM_RUN_MERGE);
5137 ksm_debug("task-%d(%s) is added to frozen task list",
5138 task_pid_nr(task), task->comm);
5141 if (need_wakeup && atomic_read(&crawl_state) == KSM_CRAWL_SLEEP)
5142 wake_up(&ksm_crawl_wait);
5147 static int lksm_task_thawed(struct task_struct *task)
5149 struct mm_struct *mm = task->mm;
5150 struct mm_slot *mm_slot;
5151 struct task_slot *task_slot;
5153 if (mm && test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
5154 /* a frozen task becoming thawed */
5155 spin_lock(&ksm_mmlist_lock);
5156 mm_slot = get_mm_slot(mm);
5159 if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN)
5160 && ksm_scan.mm_slot != mm_slot) {
5161 if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED))
5162 atomic_inc(&ksm_scan.nr_scannable);
5164 list_del_init(&mm_slot->scan_list);
5165 lksm_clear_mm_state(mm_slot, KSM_MM_FROZEN);
5166 lksm_set_mm_state(mm_slot, KSM_MM_LISTED);
5167 atomic_dec(&ksm_scan.nr_frozen);
5168 ksm_debug("nr_frozen: %d nr_scannable: %d",
5169 atomic_read(&ksm_scan.nr_frozen),
5170 atomic_read(&ksm_scan.nr_scannable));
5172 spin_unlock(&ksm_mmlist_lock);
5174 /* just remove the task slot; it will be taken care of by the full scan */
5175 spin_lock(&frozen_task_lock);
5176 task_slot = get_task_slot(task);
5178 list_del(&task_slot->list);
5179 hash_del(&task_slot->hlist);
5181 spin_unlock(&frozen_task_lock);
5183 free_task_slot(task_slot);
5184 put_task_struct(task);
5185 ksm_debug("task-%d(%s) is removed from frozen task list",
5186 task_pid_nr(task), task->comm);
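/*
 * Taken together, lksm_task_frozen() and lksm_task_thawed() maintain a
 * small per-mm state machine: a mergeable mm moves from KSM_MM_LISTED to
 * KSM_MM_FROZEN when its task freezes (queued on the scan list, counted
 * in nr_frozen) and back again when it thaws (counted in nr_scannable).
 * Tasks without a mergeable mm are parked on frozen_task_list, presumably
 * until the crawler thread picks them up.
 */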
5194 * lksm_hint: a hook for constructing the candidate list.
5195 * This function cannot sleep.
5197 int lksm_hint(struct task_struct *task, int frozen)
5200 * If lksm_hint is called by ksm_fork, the task does not yet have its own
5201 * mm_struct because mm_struct initialization has not completed.
5202 * Thus, we skip this check and put the task into the candidate list.
5204 if (frozen == KSM_TASK_FROZEN)
5205 return lksm_task_frozen(task);
5206 else if (frozen == KSM_TASK_THAWED)
5207 return lksm_task_thawed(task);
5212 static void __init lksm_init(void)
5214 ksm_crawld = kthread_create(lksm_crawl_thread, NULL, "ksm_crawld");
5216 if (IS_ERR(ksm_crawld)) {
5217 printk(KERN_ALERT "failed to create ksm crawler daemon\n");
5221 atomic_set(&ksm_scan.nr_frozen, 0);
5222 atomic_set(&ksm_scan.nr_scannable, 0);
5223 atomic_set(&ksm_state, 0);
5224 INIT_LIST_HEAD(&ksm_scan.remove_mm_list);
5226 crawler_sleep = msecs_to_jiffies(1000);
5227 #ifdef CONFIG_LKSM_FILTER
5228 init_lksm_region(&heap_region, 0, LKSM_REGION_HEAP, 0);
5229 heap_region.merge_cnt = 0;
5230 heap_region.filter_cnt = 0;
5231 heap_region.filter = NULL;
5233 init_lksm_region(&unknown_region, 0, LKSM_REGION_UNKNOWN, 0);
5234 unknown_region.merge_cnt = 0;
5235 unknown_region.filter_cnt = 0;
5236 unknown_region.filter = NULL;
5238 spin_lock_init(&lksm_region_lock);
5239 #endif /* CONFIG_LKSM_FILTER */
5240 wake_up_process(ksm_crawld);
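/*
 * LKSM therefore runs two kernel threads: "ksm_crawld" (created above,
 * running lksm_crawl_thread), which builds the candidate and scan lists,
 * and "ksmd" (created in ksm_init() below, running lksm_scan_thread),
 * which performs the actual page scanning and merging.
 */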
5243 static int __init ksm_init(void)
5245 struct task_struct *ksm_thread;
5248 /* The correct value depends on page size and endianness */
5249 zero_checksum = calc_checksum(ZERO_PAGE(0));
5250 /* Default to false for backwards compatibility */
5251 ksm_use_zero_pages = false;
5253 err = ksm_slab_init();
5257 ksm_thread = kthread_run(lksm_scan_thread, NULL, "ksmd");
5258 if (IS_ERR(ksm_thread)) {
5259 pr_err("ksm: creating kthread failed\n");
5260 err = PTR_ERR(ksm_thread);
5265 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
5267 pr_err("ksm: register sysfs failed\n");
5268 kthread_stop(ksm_thread);
5272 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
5274 #endif /* CONFIG_SYSFS */
5276 #ifdef CONFIG_MEMORY_HOTREMOVE
5277 /* There is no significance to this priority 100 */
5278 hotplug_memory_notifier(ksm_memory_callback, 100);
5287 subsys_initcall(ksm_init);