From f7096b03eee0deb8121e9952b6c33cc56b0992d8 Mon Sep 17 00:00:00 2001
From: Sung-hun Kim
Date: Thu, 16 Jul 2020 09:05:12 +0900
Subject: [PATCH] mm: LKSM: lightweight memory deduplication for embedded
 devices

Lightweight KSM (LKSM for short) is a variant of KSM that is
implemented in a fully event-triggered manner. LKSM provides a memory
deduplication facility like vanilla KSM while greatly reducing its
overhead.

Normally, LKSM is blocked and waits for an incoming deduplication
request from a user-level process (e.g., resourced). When a user-level
process submits a request via sysfs, LKSM performs the requested level
of deduplication (partial or full). After that, LKSM goes back to
sleep and waits for the next request. By doing so, LKSM mitigates the
side effects that continuous scanning has on system performance, such
as occupying CPU time and polluting caches.

Change-Id: I551feb45d8c59ccddea60ced96225780d50e1c43
Signed-off-by: Sung-hun Kim
---
 include/linux/ksm.h |   21 +-
 kernel/freezer.c    |   12 +
 mm/lksm.c           | 1546 ++++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 1445 insertions(+), 134 deletions(-)

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 161e816..97e9bf3 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -21,13 +21,28 @@ struct mem_cgroup;
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
-int __ksm_enter(struct mm_struct *mm);
 void __ksm_exit(struct mm_struct *mm);
 
+#ifdef CONFIG_LKSM
+int __ksm_enter(struct mm_struct *mm, int frozen);
+void lksm_remove_candidate(struct mm_struct *mm);
+int lksm_hint(struct task_struct *task, int frozen);
+
+#define KSM_TASK_UNFROZEN 0
+#define KSM_TASK_FROZEN 1
+#define KSM_TASK_THAWED 2
+#else /* CONFIG_LKSM */
+int __ksm_enter(struct mm_struct *mm);
+#endif /* CONFIG_LKSM */
+
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
 	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
+#ifdef CONFIG_LKSM
+		return lksm_hint(mm->owner, KSM_TASK_UNFROZEN);
+#else /* CONFIG_LKSM */
 		return __ksm_enter(mm);
+#endif /* CONFIG_LKSM */
 	return 0;
 }
 
@@ -35,6 +50,10 @@ static inline void ksm_exit(struct mm_struct *mm)
 {
 	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
 		__ksm_exit(mm);
+#ifdef CONFIG_LKSM
+	else
+		lksm_remove_candidate(mm);
+#endif
 }
 
 /*
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 45ab36f..365ad8d 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -12,6 +12,10 @@
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 
+#ifdef CONFIG_LKSM
+#include <linux/ksm.h>
+#endif
+
 /* total number of freezing conditions in effect */
 atomic_t system_freezing_cnt = ATOMIC_INIT(0);
 EXPORT_SYMBOL(system_freezing_cnt);
@@ -140,6 +144,10 @@ bool freeze_task(struct task_struct *p)
 		wake_up_state(p, TASK_INTERRUPTIBLE);
 
 	spin_unlock_irqrestore(&freezer_lock, flags);
+#ifdef CONFIG_LKSM
+	if (!(p->flags & PF_KTHREAD))
+		lksm_hint(p, KSM_TASK_FROZEN);
+#endif
 	return true;
 }
 
@@ -151,6 +159,10 @@ void __thaw_task(struct task_struct *p)
 	if (frozen(p))
 		wake_up_process(p);
 	spin_unlock_irqrestore(&freezer_lock, flags);
+#ifdef CONFIG_LKSM
+	if (!(p->flags & PF_KTHREAD))
+		lksm_hint(p, KSM_TASK_THAWED);
+#endif
 }
 
 /**
diff --git a/mm/lksm.c b/mm/lksm.c
index e486c54..fcf5a55 100644
--- a/mm/lksm.c
+++ b/mm/lksm.c
@@ -1,5 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
+ * Lightweight KSM.
+ *
+ * This code provides a lightweight version of KSM.
+ *
+ * Copyright (C) 2020 Samsung Electronics Co., Ltd.
+ * Author: Sung-hun Kim (sfoon.kim@samsung.com) + */ + +/* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different @@ -50,6 +59,11 @@ #define DO_NUMA(x) do { } while (0) #endif +#define ksm_debug(fmt, ...) \ + printk(KERN_DEBUG "[ksm:%s:%d] " fmt "\n", __func__, __LINE__, ##__VA_ARGS__) +#define ksm_err(fmt, ...) \ + printk(KERN_ERR "[ksm:%s:%d] " fmt "\n", __func__, __LINE__, ##__VA_ARGS__) + /** * DOC: Overview * @@ -110,34 +124,97 @@ * stable trees and multiple unstable trees: one of each for each NUMA node. */ +/* + * A few notes about lightweight KSM. + * + * A smart crawler leverages semantics of tasks in Tizen. + * When the application goes to background, it is attached to freezer + * task group. LKSM crawler hooks this event and adds a "frozen task" + * to candidate list for scanning. + * + */ + +/* merge window size */ +#define MERGE_WIN 3 + /** * struct mm_slot - ksm information per mm that is being scanned * @link: link to the mm_slots hash list * @mm_list: link into the mm_slots list, rooted in ksm_mm_head * @rmap_list: head for this mm_slot's singly-linked list of rmap_items * @mm: the mm that this information is valid for + * + * extension - added for LKSM + * @state: state of mm_slot (frozen, listed, scanned, newcomer) + * @merge_idx: merge window index to store the number of currently merged pages + * @nr_merged_win: merge window to keep recent three numbers + * @nr_merged: sum of nr_merged_win, used to maintain vips_list (ordered list) + * @ordered_list: list ordered by nr_merged + * @scanning_size: number of anonymous pages in mm_struct + * @fault_cnt: last read count of page fault (minor + major) + * @elapsed: elapsed scanning time + * @nr_scans: number of scanning pages (can be different with scanning_size) */ struct mm_slot { struct hlist_node link; struct list_head mm_list; + struct list_head scan_list; struct rmap_item *rmap_list; struct mm_struct *mm; + + short state; + + short merge_idx; + int nr_merged_win[MERGE_WIN]; + int nr_merged; + struct rb_node ordered_list; + + unsigned long scanning_size; /* in number of pages */ + unsigned long fault_cnt; + unsigned long elapsed; + int nr_scans; +}; + +/* + * scanning mode of LKSM: + * LKSM_SCAN_PARTIAL: perform deduplication on subset of processes + * LKSM_SCAN_FULL: perform deduplication on full set of processes + */ +enum lksm_scan_mode { + LKSM_SCAN_NONE, + LKSM_SCAN_PARTIAL, + LKSM_SCAN_FULL, }; /** * struct ksm_scan - cursor for scanning - * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * @rmap_list: link to the next rmap to be scanned in the rmap_list - * @seqnr: count of completed full scans (needed when removing unstable node) + * @mm_slot: the current mm_slot we are scanning + * @remove_mm_list: temporary list for batching flush of removed slots + * @nr_scannable: the number of remaining unscanned scannable slots + * @nr_frozen: the number of remaining unscanned frozen slots + * @scan_round: scanning round (partial + full) + * @nr_full_scan: the number of full scanning + * @scan_mode: coverage of current scanning * * There is only the one ksm_scan instance of this cursor structure. 
*/ struct ksm_scan { - struct mm_slot *mm_slot; unsigned long address; struct rmap_item **rmap_list; - unsigned long seqnr; + + struct mm_slot *mm_slot; + struct list_head remove_mm_list; + + /* statistics of scanning targets */ + atomic_t nr_scannable; + atomic_t nr_frozen; + + unsigned long scan_round; + unsigned long nr_full_scan; + + enum lksm_scan_mode scan_mode; }; /** @@ -202,7 +279,7 @@ struct rmap_item { }; struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ - unsigned int oldchecksum; /* when unstable */ + unsigned int oldchecksum; /* when unstable (LSB is a frozen bit) */ union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ @@ -212,7 +289,7 @@ struct rmap_item { }; }; -#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ +#define SEQNR_MASK 0x0ff /* low bits of unstable tree scan_round */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ #define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG) @@ -224,23 +301,42 @@ static struct rb_root one_unstable_tree[1] = { RB_ROOT }; static struct rb_root *root_stable_tree = one_stable_tree; static struct rb_root *root_unstable_tree = one_unstable_tree; +#define LKSM_NODE_ID 0 + /* Recently migrated nodes of stable tree, pending proper placement */ static LIST_HEAD(migrate_nodes); #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) +/* list for VIP processes */ +static struct rb_root vips_list = RB_ROOT; +static int lksm_max_vips = 20; + #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); +static DEFINE_HASHTABLE(task_slots_hash, MM_SLOTS_HASH_BITS); +/* + * two list heads in LKSM: + * - ksm_mm_head: a head for traversing whole list of processes, + not used for scanning itself + * - ksm_scan_head: a head for a list of currently scanning processes + */ static struct mm_slot ksm_mm_head = { .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), }; + +static struct mm_slot ksm_scan_head = { + .scan_list = LIST_HEAD_INIT(ksm_scan_head.scan_list), +}; + static struct ksm_scan ksm_scan = { - .mm_slot = &ksm_mm_head, + .mm_slot = &ksm_scan_head, }; static struct kmem_cache *rmap_item_cache; static struct kmem_cache *stable_node_cache; static struct kmem_cache *mm_slot_cache; +static struct kmem_cache *task_slot_cache; /* The number of nodes in the stable tree */ static unsigned long ksm_pages_shared; @@ -275,9 +371,19 @@ static unsigned int ksm_thread_sleep_millisecs = 20; /* Checksum of an empty (zeroed) page */ static unsigned int zero_checksum __read_mostly; +/* Processes tracked by KSM thread */ +static unsigned int ksm_nr_added_process; + /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; +/* An indicator for KSM scanning */ +static atomic_t ksm_one_shot_scanning; + +/* Boosting when the scanner performs partial scan */ +static unsigned int lksm_boosted_pages_to_scan = 100; +static unsigned int lksm_default_pages_to_scan = 100; + #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; @@ -287,17 +393,99 @@ static int ksm_nr_node_ids = 1; #define ksm_nr_node_ids 1 #endif +/* + * Default policy for KSM_RUN_ONESHOT: + * KSM performs both scannings only when the user requests it. + * If scanning is ended, both crawler and scanner threads are blocked until + * the next request is coming. 
+ */ #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 #define KSM_RUN_OFFLINE 4 +#define KSM_RUN_ONESHOT 8 + static unsigned long ksm_run = KSM_RUN_STOP; +static atomic_t ksm_state; /* 0: in crawling 1: in scanning */ + +#define lksm_check_scan_state(ksm_state) (atomic_read(&ksm_state) == 1) +#define lksm_set_scan_state(ksm_state) (atomic_set(&ksm_state, 1)) +#define lksm_clear_scan_state(ksm_state) (atomic_set(&ksm_state, 0)) + +struct task_slot { + struct task_struct *task; + int frozen; + unsigned long inserted; + struct list_head list; + struct hlist_node hlist; +}; + +/* + * Frozen state: + * When a process stops running on forground (e.g., going to background), + * the system daemon (e.g., resourced) puts it to cgroup_freezer. + * Once a process joins into freezer cgroup, the system kernel does not count + * it as a runnable process, and thus it cannot be scheduled on CPU. + * So, I regard processes in freezer cgroup as a frozen state and that can be + * good candidates of memory deduplication. + * + * LKSM provides a hook to catch the moment that the process is being frozen. + * With the hook, ksm crawler can get candidate list for memory deduplication. + * (see kernel/cgroup_freezer.c) + */ +#define FROZEN_BIT 0x01 +#define LISTED_BIT 0x02 + +#define lksm_test_rmap_frozen(rmap_item) (rmap_item->oldchecksum & FROZEN_BIT) +#define lksm_set_rmap_frozen(rmap_item) (rmap_item->oldchecksum |= FROZEN_BIT) +#define lksm_clear_rmap_frozen(rmap_item) (rmap_item->oldchecksum &= ~FROZEN_BIT) +#define lksm_clear_checksum_frozen(checksum) (checksum &= ~FROZEN_BIT) + +#define KSM_MM_FROZEN 0x01 +#define KSM_MM_LISTED 0x02 +#define KSM_MM_NEWCOMER 0x04 +#define KSM_MM_SCANNED 0x08 + +#define lksm_test_mm_state(mm_slot, bit) (mm_slot->state & bit) +#define lksm_set_mm_state(mm_slot, bit) (mm_slot->state |= bit) +#define lksm_clear_mm_state(mm_slot, bit) (mm_slot->state &= ~bit) + +static int initial_round = 3; +static unsigned long ksm_crawl_round; +static unsigned long crawler_sleep; + +/* statistical information */ +static int lksm_nr_merged; /* global merge count */ +static int lksm_nr_broken; /* global broken count */ +static int lksm_nr_scanned_slot; /* global scanned slot count */ +static int lksm_slot_nr_merged; /* per-slot merge count */ +static int lksm_slot_nr_broken; /* per-slot broken count */ + +/* initially, KSM takes small full scan interval */ +#define DEFAULT_FULL_SCAN_INTERVAL 60000 /* 60 seconds */ +static unsigned long full_scan_interval = 100; + +/* statistical information about scanning time */ +static unsigned long lksm_last_scan_time; +static unsigned long lksm_proc_scan_time; + +/* stuffs for pruning short-lived task */ +#define KSM_SHORT_TASK_TIME 100 +static unsigned long short_lived_thresh = KSM_SHORT_TASK_TIME; + +#define get_task_runtime(task) (task->se.sum_exec_runtime) +#define ms_to_ns(ms) (ms * 1000 * 1000) +#define check_short_task(task) \ + (get_task_runtime(task) < ms_to_ns(short_lived_thresh)) + static void wait_while_offlining(void); +static struct mm_slot *__ksm_enter_alloc_slot(struct mm_struct *mm, int frozen); static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); +static DECLARE_WAIT_QUEUE_HEAD(ksm_crawl_wait); #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ sizeof(struct __struct), __alignof__(struct __struct),\ @@ -316,9 +504,14 @@ static int __init ksm_slab_init(void) 
mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); if (!mm_slot_cache) goto out_free2; + task_slot_cache = KSM_KMEM_CACHE(task_slot, 0); + if (!task_slot_cache) + goto out_free3; return 0; +out_free3: + kmem_cache_destroy(mm_slot_cache); out_free2: kmem_cache_destroy(stable_node_cache); out_free1: @@ -439,6 +632,34 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); } +static inline struct task_slot *alloc_task_slot(void) +{ + if (!task_slot_cache) + return NULL; + return kmem_cache_zalloc(task_slot_cache, GFP_NOWAIT); +} + +static inline void free_task_slot(struct task_slot *task_slot) +{ + kmem_cache_free(task_slot_cache, task_slot); +} + +static struct task_slot *get_task_slot(struct task_struct *task) +{ + struct task_slot *slot; + + hash_for_each_possible(task_slots_hash, slot, hlist, + (unsigned long)task) + if (slot->task == task) + return slot; + return NULL; +} + +static inline void insert_to_task_slots_hash(struct task_slot *slot) +{ + hash_add(task_slots_hash, &slot->hlist, (unsigned long)slot->task); +} + /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, @@ -577,6 +798,46 @@ out: } /* + * ksm_join: a wrapper function of ksm_enter. + * The function sets VM_MERGEABLE flag of vmas in the given mm_struct. + */ +static int ksm_join(struct mm_struct *mm, int frozen) +{ + struct vm_area_struct *vma; + struct mm_slot *slot; + int newly_allocated = 0; + + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + slot = __ksm_enter_alloc_slot(mm, frozen); + if (!slot) + return -ENOMEM; + newly_allocated = 1; + } else { + slot = get_mm_slot(mm); + if (!slot) { + ksm_err("there is no mm_slot for %p", mm); + return -EINVAL; + } + } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_HUGETLB | VM_MIXEDMAP)) + continue; + vma->vm_flags |= VM_MERGEABLE; + } + + return newly_allocated; +} + +#define ksm_join_write_lock(mm, frozen, ret) do {\ + down_write(&mm->mmap_sem); \ + ret = ksm_join(mm, frozen); \ + up_write(&mm->mmap_sem); \ +} while (0) + +/* * This helper is used for getting right index into array of tree roots. * When merge_across_nodes knob is set to 1, there are only two rb-trees for * stable and unstable pages from all nodes with roots in index 0. Otherwise, @@ -636,9 +897,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) BUG_ON(stable_node->rmap_hlist_len < 0); hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { - if (rmap_item->hlist.next) + if (rmap_item->hlist.next) { ksm_pages_sharing--; - else + lksm_slot_nr_broken++; + lksm_nr_broken++; + } else ksm_pages_shared--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; @@ -785,9 +1048,11 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) unlock_page(page); put_page(page); - if (!hlist_empty(&stable_node->hlist)) + if (!hlist_empty(&stable_node->hlist)) { ksm_pages_sharing--; - else + lksm_slot_nr_broken++; + lksm_nr_broken++; + } else ksm_pages_shared--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; @@ -804,11 +1069,13 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) * if this rmap_item was inserted by this scan, rather * than left over from before. 
*/ - age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); - BUG_ON(age > 1); + age = (unsigned char)(ksm_scan.scan_round - rmap_item->address); if (!age) rb_erase(&rmap_item->node, root_unstable_tree + NUMA(rmap_item->nid)); + else + RB_CLEAR_NODE(&rmap_item->node); + ksm_pages_unshared--; rmap_item->address &= PAGE_MASK; } @@ -1008,7 +1275,7 @@ static int unmerge_and_remove_all_rmap_items(void) /* Clean up stable nodes, but don't worry if some are still busy */ remove_all_stable_nodes(); - ksm_scan.seqnr = 0; + ksm_scan.scan_round = 0; return 0; error: @@ -1026,7 +1293,7 @@ static u32 calc_checksum(struct page *page) void *addr = kmap_atomic(page); checksum = xxhash(addr, PAGE_SIZE, 0); kunmap_atomic(addr); - return checksum; + return lksm_clear_checksum_frozen(checksum); } static int write_protect_page(struct vm_area_struct *vma, struct page *page, @@ -1977,7 +2244,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, } rmap_item->address |= UNSTABLE_FLAG; - rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); + rmap_item->address |= (ksm_scan.scan_round & SEQNR_MASK); DO_NUMA(rmap_item->nid = nid); rb_link_node(&rmap_item->node, parent, new); rb_insert_color(&rmap_item->node, root); @@ -2017,9 +2284,11 @@ static void stable_tree_append(struct rmap_item *rmap_item, rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); - if (rmap_item->hlist.next) + if (rmap_item->hlist.next) { ksm_pages_sharing++; - else + lksm_slot_nr_merged++; + lksm_nr_merged++; + } else ksm_pages_shared++; } @@ -2092,15 +2361,18 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) } /* - * If the hash value of the page has changed from the last time - * we calculated it, this page is changing frequently: therefore we - * don't want to insert it in the unstable tree, and we don't want - * to waste our time searching for something identical to it there. + * LKSM: In LKSM, KSM is running in a event-triggered manner. + * Because of that scanning is much infrequently performed. + * We just skip caculation of checksum for LKSM to catch scanning + * chances more. 
*/ - checksum = calc_checksum(page); - if (rmap_item->oldchecksum != checksum) { - rmap_item->oldchecksum = checksum; - return; + if (ksm_scan.scan_round < initial_round + && !lksm_test_rmap_frozen(rmap_item)) { + checksum = calc_checksum(page); + if (rmap_item->oldchecksum != checksum) { + rmap_item->oldchecksum = checksum; + return; + } } /* @@ -2201,8 +2473,14 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, while (*rmap_list) { rmap_item = *rmap_list; - if ((rmap_item->address & PAGE_MASK) == addr) + if ((rmap_item->address & PAGE_MASK) == addr) { + if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN) + && rmap_item->address & UNSTABLE_FLAG) + lksm_set_rmap_frozen(rmap_item); + else + lksm_clear_rmap_frozen(rmap_item); return rmap_item; + } if (rmap_item->address > addr) break; *rmap_list = rmap_item->rmap_list; @@ -2217,75 +2495,127 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; *rmap_list = rmap_item; + if (lksm_test_mm_state(mm_slot, FROZEN_BIT)) + lksm_set_rmap_frozen(rmap_item); + else + lksm_clear_rmap_frozen(rmap_item); } return rmap_item; } -static struct rmap_item *scan_get_next_rmap_item(struct page **page) +/* + * lksm_flush_removed_mm_list: + * batched flushing out removed mm_slots by lksm_remove_mm_slot + */ +static void lksm_flush_removed_mm_list(void) { - struct mm_struct *mm; - struct mm_slot *slot; - struct vm_area_struct *vma; - struct rmap_item *rmap_item; - int nid; + struct mm_slot *head, *next, *slot; - if (list_empty(&ksm_mm_head.mm_list)) - return NULL; + spin_lock(&ksm_mmlist_lock); + head = list_first_entry_or_null(&ksm_scan.remove_mm_list, + struct mm_slot, mm_list); + if (!head) { + spin_unlock(&ksm_mmlist_lock); + return; + } - slot = ksm_scan.mm_slot; - if (slot == &ksm_mm_head) { - /* - * A number of pages can hang around indefinitely on per-cpu - * pagevecs, raised page count preventing write_protect_page - * from merging them. Though it doesn't really matter much, - * it is puzzling to see some stuck in pages_volatile until - * other activity jostles them out, and they also prevented - * LTP's KSM test from succeeding deterministically; so drain - * them here (here rather than on entry to ksm_do_scan(), - * so we don't IPI too often when pages_to_scan is set low). - */ - lru_add_drain_all(); + list_del_init(&ksm_scan.remove_mm_list); + spin_unlock(&ksm_mmlist_lock); - /* - * Whereas stale stable_nodes on the stable_tree itself - * get pruned in the regular course of stable_tree_search(), - * those moved out to the migrate_nodes list can accumulate: - * so prune them once before each full scan. 
- */ - if (!ksm_merge_across_nodes) { - struct stable_node *stable_node, *next; - struct page *page; - - list_for_each_entry_safe(stable_node, next, - &migrate_nodes, list) { - page = get_ksm_page(stable_node, - GET_KSM_PAGE_NOLOCK); - if (page) - put_page(page); - cond_resched(); - } + if (!list_empty(&head->mm_list)) { + list_for_each_entry_safe(slot, next, &head->mm_list, mm_list) { + ksm_debug("slot(%p) will be freed", slot); + list_del(&slot->mm_list); + + cond_resched(); + + remove_trailing_rmap_items(slot, &slot->rmap_list); + clear_bit(MMF_VM_MERGEABLE, &slot->mm->flags); + mmdrop(slot->mm); + free_mm_slot(slot); + } + } + + ksm_debug("slot(%p) will be freed", head); + + cond_resched(); + remove_trailing_rmap_items(head, &head->rmap_list); + clear_bit(MMF_VM_MERGEABLE, &head->mm->flags); + mmdrop(head->mm); + free_mm_slot(head); +} + +/* + * remove mm_slot from lists + * LKSM defers releasing stuffs at the end of scanning + */ +static inline void lksm_remove_mm_slot(struct mm_slot *slot) +{ + hash_del(&slot->link); + list_del_init(&slot->scan_list); + list_move(&slot->mm_list, &ksm_scan.remove_mm_list); + if (!RB_EMPTY_NODE(&slot->ordered_list)) { + rb_erase(&slot->ordered_list, &vips_list); + RB_CLEAR_NODE(&slot->ordered_list); + } +} + +/* caller must hold ksm_mmlist_lock */ +static struct mm_slot *lksm_get_unscanned_mm_slot(struct mm_slot *slot) +{ + struct mm_slot *next; + + list_for_each_entry_safe_continue(slot, next, &ksm_scan_head.scan_list, + scan_list) { + if (ksm_test_exit(slot->mm)) { + ksm_debug("slot:%p %p is moved to remove list", slot, slot->mm); + if (lksm_test_mm_state(slot, KSM_MM_FROZEN)) + atomic_dec(&ksm_scan.nr_frozen); + else + atomic_dec(&ksm_scan.nr_scannable); + lksm_remove_mm_slot(slot); + continue; } - for (nid = 0; nid < ksm_nr_node_ids; nid++) - root_unstable_tree[nid] = RB_ROOT; + lksm_nr_scanned_slot++; + break; + } + + return slot; +} - spin_lock(&ksm_mmlist_lock); - slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); - ksm_scan.mm_slot = slot; - spin_unlock(&ksm_mmlist_lock); - /* - * Although we tested list_empty() above, a racing __ksm_exit - * of the last mm on the list may have removed it since then. 
- */ - if (slot == &ksm_mm_head) - return NULL; -next_mm: - ksm_scan.address = 0; - ksm_scan.rmap_list = &slot->rmap_list; +/* caller must hold ksm_mmlist_lock */ +static void lksm_insert_mm_slot_ordered(struct mm_slot *slot) +{ + struct rb_root *root; + struct rb_node **new; + struct rb_node *parent; + struct mm_slot *temp_slot; + + root = &vips_list; + parent = NULL; + new = &root->rb_node; + + while (*new) { + temp_slot = rb_entry(*new, struct mm_slot, ordered_list); + + parent = *new; + if (slot->nr_merged > temp_slot->nr_merged) + new = &parent->rb_left; + else + new = &parent->rb_right; } - mm = slot->mm; - down_read(&mm->mmap_sem); + rb_link_node(&slot->ordered_list, parent, new); + rb_insert_color(&slot->ordered_list, root); +} + +static struct rmap_item *__scan_next_rmap_item(struct page **page, + struct mm_struct *mm, struct mm_slot *slot) +{ + struct vm_area_struct *vma; + struct rmap_item *rmap_item; + if (ksm_test_exit(mm)) vma = NULL; else @@ -2319,7 +2649,6 @@ next_mm: ksm_scan.address += PAGE_SIZE; } else put_page(*page); - up_read(&mm->mmap_sem); return rmap_item; } put_page(*page); @@ -2328,6 +2657,87 @@ next_mm: } } + return NULL; +} + +static inline int sum_merge_win(int merge_win[], int len) +{ + int i, sum = 0; + + for (i = 0; i < len; i++) + sum += merge_win[i]; + return sum; +} + +static inline int lksm_account_mm_slot_nr_merge(struct mm_slot *slot, int nr_merged) +{ + slot->nr_merged_win[slot->merge_idx++] = nr_merged; + if (slot->merge_idx == MERGE_WIN) + slot->merge_idx = 0; + slot->nr_merged = sum_merge_win(slot->nr_merged_win, MERGE_WIN); + return slot->nr_merged; +} + +static struct rmap_item *scan_get_next_rmap_item(struct page **page) +{ + struct mm_struct *mm; + struct mm_slot *slot; + struct rmap_item *rmap_item; + + if (list_empty(&ksm_scan_head.scan_list)) + return NULL; + + slot = ksm_scan.mm_slot; + if (slot == &ksm_scan_head) { + /* + * A number of pages can hang around indefinitely on per-cpu + * pagevecs, raised page count preventing write_protect_page + * from merging them. Though it doesn't really matter much, + * it is puzzling to see some stuck in pages_volatile until + * other activity jostles them out, and they also prevented + * LTP's KSM test from succeeding deterministically; so drain + * them here (here rather than on entry to ksm_do_scan(), + * so we don't IPI too often when pages_to_scan is set low). + */ + lru_add_drain_all(); + + if (ksm_scan.scan_round < ksm_crawl_round) { + ksm_scan.scan_round = ksm_crawl_round; + root_unstable_tree[LKSM_NODE_ID] = RB_ROOT; + } + + spin_lock(&ksm_mmlist_lock); + slot = lksm_get_unscanned_mm_slot(slot); + ksm_scan.mm_slot = slot; + spin_unlock(&ksm_mmlist_lock); + + /* + * Although we tested list_empty() above, a racing __ksm_exit + * of the last mm on the list may have removed it since then. 
+ */ + if (slot == &ksm_scan_head) + return NULL; + + slot->elapsed = get_jiffies_64(); +next_mm: + ksm_scan.address = 0; + ksm_scan.rmap_list = &slot->rmap_list; + } + + if (unlikely(!ksm_scan.rmap_list)) + ksm_scan.rmap_list = &slot->rmap_list; + + mm = slot->mm; + BUG_ON(!mm); + down_read(&mm->mmap_sem); + rmap_item = __scan_next_rmap_item(page, mm, slot); + + if (rmap_item) { + slot->nr_scans++; + up_read(&mm->mmap_sem); + return rmap_item; + } + if (ksm_test_exit(mm)) { ksm_scan.address = 0; ksm_scan.rmap_list = &slot->rmap_list; @@ -2339,8 +2749,8 @@ next_mm: remove_trailing_rmap_items(slot, ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(slot->mm_list.next, - struct mm_slot, mm_list); + ksm_scan.mm_slot = lksm_get_unscanned_mm_slot(slot); + if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_sem @@ -2351,32 +2761,68 @@ next_mm: * or when all VM_MERGEABLE areas have been unmapped (and * mmap_sem then protects against race with MADV_MERGEABLE). */ - hash_del(&slot->link); - list_del(&slot->mm_list); + up_read(&mm->mmap_sem); + if (lksm_test_mm_state(slot, KSM_MM_FROZEN)) + atomic_dec(&ksm_scan.nr_frozen); + else + atomic_dec(&ksm_scan.nr_scannable); + lksm_remove_mm_slot(slot); spin_unlock(&ksm_mmlist_lock); - free_mm_slot(slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); - up_read(&mm->mmap_sem); - mmdrop(mm); + lksm_slot_nr_merged = 0; + lksm_slot_nr_broken = 0; } else { + int newcomer = 0, frozen = 0; + up_read(&mm->mmap_sem); - /* - * up_read(&mm->mmap_sem) first because after - * spin_unlock(&ksm_mmlist_lock) run, the "mm" may - * already have been freed under us by __ksm_exit() - * because the "mm_slot" is still hashed and - * ksm_scan.mm_slot doesn't point to it anymore. - */ + + if (lksm_test_mm_state(slot, KSM_MM_NEWCOMER)) { + lksm_clear_mm_state(slot, KSM_MM_NEWCOMER); + newcomer = 1; + } + if (lksm_test_mm_state(slot, KSM_MM_FROZEN)) { + lksm_clear_mm_state(slot, KSM_MM_FROZEN); + frozen = 1; + atomic_dec(&ksm_scan.nr_frozen); + } else + atomic_dec(&ksm_scan.nr_scannable); + lksm_set_mm_state(slot, KSM_MM_SCANNED); + + list_del_init(&slot->scan_list); + if (!RB_EMPTY_NODE(&slot->ordered_list)) { + rb_erase(&slot->ordered_list, &vips_list); + RB_CLEAR_NODE(&slot->ordered_list); + } + if (lksm_account_mm_slot_nr_merge(slot, lksm_slot_nr_merged)) + lksm_insert_mm_slot_ordered(slot); + + slot->elapsed = get_jiffies_64() - slot->elapsed; spin_unlock(&ksm_mmlist_lock); + + if (ksm_test_exit(slot->mm)) + ksm_debug("slot(%p:%p) is exited", slot, slot->mm); + else + ksm_debug("slot-%d(%s) %d merged %d scanned %lu pages " + "(sum: %d) - (%s, %s) takes %u msecs (nr_scannable: %d)", + task_pid_nr(slot->mm->owner), slot->mm->owner->comm, + lksm_slot_nr_merged - lksm_slot_nr_broken, slot->nr_scans, + slot->scanning_size, slot->nr_merged, + newcomer ? "new" : "old", + frozen ? "frozen" : "normal", + jiffies_to_msecs(slot->elapsed), + atomic_read(&ksm_scan.nr_scannable)); + + lksm_slot_nr_merged = 0; + lksm_slot_nr_broken = 0; } /* Repeat until we've completed scanning the whole list */ slot = ksm_scan.mm_slot; - if (slot != &ksm_mm_head) + if (slot != &ksm_scan_head) { + slot->elapsed = get_jiffies_64(); goto next_mm; + } - ksm_scan.seqnr++; return NULL; } @@ -2384,7 +2830,7 @@ next_mm: * ksm_do_scan - the ksm scanner main worker function. * @scan_npages: number of pages we want to scan before we return. 
*/ -static void ksm_do_scan(unsigned int scan_npages) +static int ksm_do_scan(unsigned int scan_npages) { struct rmap_item *rmap_item; struct page *uninitialized_var(page); @@ -2393,46 +2839,575 @@ static void ksm_do_scan(unsigned int scan_npages) cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) - return; + return 1; /* need sleep */ cmp_and_merge_page(page, rmap_item); put_page(page); } + return 0; } static int ksmd_should_run(void) { - return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); + return (ksm_run & KSM_RUN_MERGE) && + !list_empty(&ksm_scan_head.scan_list); } -static int ksm_scan_thread(void *nothing) +static void lksm_scan_wrapup_wait(void) { + if (ksm_scan.scan_mode == LKSM_SCAN_PARTIAL) { + if (ksm_thread_pages_to_scan != lksm_default_pages_to_scan) + ksm_thread_pages_to_scan = lksm_default_pages_to_scan; + } else if (ksm_scan.scan_mode == LKSM_SCAN_FULL) + ksm_scan.nr_full_scan++; + + lksm_nr_merged = 0; + lksm_nr_broken = 0; + lksm_nr_scanned_slot = 0; + + ksm_scan.scan_mode = LKSM_SCAN_NONE; + if (ksm_run & KSM_RUN_ONESHOT) + atomic_set(&ksm_one_shot_scanning, LKSM_SCAN_NONE); + + lksm_clear_scan_state(ksm_state); + + wait_event_freezable(ksm_thread_wait, + (lksm_check_scan_state(ksm_state) && ksmd_should_run()) + || kthread_should_stop()); +} + +static int lksm_scan_thread(void *nothing) +{ + unsigned long begin, elapsed; unsigned int sleep_ms; + int need_to_sleep = 0; set_freezable(); set_user_nice(current, 5); + ksm_debug("KSM_SCAND pid: %d", task_pid_nr(current)); while (!kthread_should_stop()) { mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksmd_should_run()) - ksm_do_scan(ksm_thread_pages_to_scan); + need_to_sleep = ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); try_to_freeze(); - if (ksmd_should_run()) { + if (need_to_sleep) { + if (!ksmd_should_run()) { + /* if no one left in scanning list, go to sleep for a while */ + lksm_flush_removed_mm_list(); + + elapsed = get_jiffies_64() - begin; + lksm_last_scan_time = elapsed; + lksm_proc_scan_time = elapsed / lksm_nr_scanned_slot; + + ksm_debug("Scanning(%d) takes %u ms, %d/%d-pages " + "are merged/broken (nr_scannable: %d, nr_frozen: %d)", + lksm_nr_scanned_slot, + jiffies_to_msecs(lksm_last_scan_time), + lksm_nr_merged, lksm_nr_broken, + atomic_read(&ksm_scan.nr_scannable), + atomic_read(&ksm_scan.nr_frozen)); + + lksm_scan_wrapup_wait(); + + ksm_debug("Start %lu-th scanning: nr_scannable(%d) " + "nr_frozen(%d)", + ksm_scan.scan_round, + atomic_read(&ksm_scan.nr_scannable), + atomic_read(&ksm_scan.nr_frozen)); + + if (ksm_scan.scan_mode == LKSM_SCAN_PARTIAL) { + if (lksm_boosted_pages_to_scan != + ksm_thread_pages_to_scan) { + ksm_thread_pages_to_scan = lksm_boosted_pages_to_scan; + ksm_debug("set pages_to_scan to %u", + lksm_boosted_pages_to_scan); + } + } + begin = get_jiffies_64(); + } else { + /* new scanning targets are coming */ + sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); + wait_event_interruptible_timeout(ksm_iter_wait, + sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), + msecs_to_jiffies(sleep_ms)); + } + need_to_sleep = 0; + } else if (ksmd_should_run()) { + /* normal sleep */ sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); wait_event_interruptible_timeout(ksm_iter_wait, sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), msecs_to_jiffies(sleep_ms)); } else { - wait_event_freezable(ksm_thread_wait, - ksmd_should_run() || kthread_should_stop()); + /* wait for activating ksm */ + if (likely(ksm_scan.scan_round > 0)) { + 
lksm_flush_removed_mm_list(); + + elapsed = get_jiffies_64() - begin; + lksm_last_scan_time = elapsed; + lksm_proc_scan_time = elapsed / lksm_nr_scanned_slot; + + ksm_debug("Scanning(%d) takes %u ms, %d/%d-pages are merged/broken", + lksm_nr_scanned_slot, jiffies_to_msecs(lksm_last_scan_time), + lksm_nr_merged, lksm_nr_broken); + + lksm_scan_wrapup_wait(); + } else + wait_event_freezable(ksm_thread_wait, + (lksm_check_scan_state(ksm_state) && ksmd_should_run()) + || kthread_should_stop()); + + ksm_debug("Start %lu-th scanning: nr_scannable(%d) nr_frozen(%d)", + ksm_scan.scan_round, + atomic_read(&ksm_scan.nr_scannable), + atomic_read(&ksm_scan.nr_frozen)); + + if (ksm_scan.scan_mode == LKSM_SCAN_PARTIAL) { + ksm_thread_pages_to_scan = lksm_boosted_pages_to_scan; + ksm_debug("set pages_to_scan to %u", + lksm_boosted_pages_to_scan); + } + begin = get_jiffies_64(); } } return 0; } +/* + * lksm crawler declaration & definition part + */ +static struct task_struct *ksm_crawld; + +LIST_HEAD(frozen_task_list); +DEFINE_SPINLOCK(frozen_task_lock); + +enum { + KSM_CRAWL_SLEEP, + KSM_CRAWL_RUN, +} ksm_crawl_state; +static atomic_t crawl_state; + +enum { + LKSM_TASK_SLOT_NONE = 0, + LKSM_TASK_SLOT_REMOVED, +}; + +static inline int lksm_count_and_clear_mm_slots +(struct mm_slot *head, unsigned long *delay) +{ + int count = 0; + struct mm_slot *slot; + + spin_lock(&ksm_mmlist_lock); + list_for_each_entry(slot, &head->mm_list, mm_list) { + if (list_empty(&slot->scan_list)) { + lksm_clear_mm_state(slot, KSM_MM_SCANNED); + slot->nr_scans = 0; + slot->scanning_size = get_mm_counter(slot->mm, MM_ANONPAGES); + list_add_tail(&slot->scan_list, &ksm_scan_head.scan_list); + *delay += slot->elapsed; + count++; + } + } + spin_unlock(&ksm_mmlist_lock); + return count; +} + +static int lksm_prepare_frozen_scan(void) +{ + int nr_frozen = 0, nr_added = 0, err; + struct task_struct *task; + struct task_slot *task_slot; + struct mm_struct *mm; + + spin_lock(&frozen_task_lock); + nr_frozen = atomic_read(&ksm_scan.nr_frozen); + if (list_empty(&frozen_task_list)) { + spin_unlock(&frozen_task_lock); + return nr_frozen; + } + + ksm_debug("prepare frozen scan: round(%lu)", ksm_crawl_round); + task_slot = list_first_entry_or_null(&frozen_task_list, + struct task_slot, list); + while (task_slot) { + list_del(&task_slot->list); + hash_del(&task_slot->hlist); + spin_unlock(&frozen_task_lock); + + task = task_slot->task; + if (ksm_run & KSM_RUN_UNMERGE) { + put_task_struct(task); + free_task_slot(task_slot); + goto clean_up_abort; + } + + mm = get_task_mm(task); + + if (!mm || ksm_test_exit(mm)) + goto mm_exit_out; + + if (mm) { + ksm_join_write_lock(mm, task_slot->frozen, err); + if (!err) + nr_added++; + } + +mm_exit_out: + free_task_slot(task_slot); + put_task_struct(task); + if (mm) + mmput(mm); + + cond_resched(); + + spin_lock(&frozen_task_lock); + task_slot = list_first_entry_or_null(&frozen_task_list, + struct task_slot, list); + } + spin_unlock(&frozen_task_lock); + atomic_add(nr_added, &ksm_scan.nr_frozen); + + return nr_added + nr_frozen; + +clean_up_abort: + spin_lock(&frozen_task_lock); + task_slot = list_first_entry_or_null(&frozen_task_list, + struct task_slot, list); + while (task_slot) { + list_del(&task_slot->list); + hash_del(&task_slot->hlist); + spin_unlock(&frozen_task_lock); + + task = task_slot->task; + put_task_struct(task); + free_task_slot(task_slot); + + spin_lock(&frozen_task_lock); + task_slot = list_first_entry_or_null(&frozen_task_list, + struct task_slot, list); + } + spin_unlock(&frozen_task_lock); 
+ + return 0; +} + +/* this function make a list of new processes and vip processes */ +static int lksm_prepare_partial_scan(void) +{ + int ret, nr_frozen = 0, nr_added = 0, nr_scannable = 0; + unsigned long delay = 0; + unsigned long fault_cnt = 0; + struct task_struct *task; + struct mm_struct *mm; + struct mm_slot *mm_slot; + struct list_head recheck_list; + struct rb_node *node; + + ksm_debug("prepare partial scan: round(%lu)", ksm_crawl_round); + INIT_LIST_HEAD(&recheck_list); + + nr_frozen = lksm_prepare_frozen_scan(); + + /* get newbies */ + for_each_process(task) { + if (task == current || task_pid_nr(task) == 0 + || check_short_task(task)) + continue; + if (ksm_run & KSM_RUN_UNMERGE) { + nr_frozen = 0; + nr_added = 0; + goto abort; + } + mm = get_task_mm(task); + if (!mm) + continue; + ksm_join_write_lock(mm, KSM_TASK_UNFROZEN, ret); + if (ret > 0) + nr_added++; + mmput(mm); + } + + /* get vips */ + if (nr_added + nr_frozen >= lksm_max_vips) { + ksm_debug("nr_scannable(%d) already fulfilled skip vips", + nr_added + nr_frozen); + goto skip_vips; + } + + spin_lock(&ksm_mmlist_lock); + node = rb_first(&vips_list); + if (!node) { + ksm_debug("empty vip list"); + spin_unlock(&ksm_mmlist_lock); + goto skip_vips; + } + mm_slot = rb_entry(node, struct mm_slot, ordered_list); + while (nr_scannable + nr_added + nr_frozen < lksm_max_vips) { + if (ksm_run & KSM_RUN_UNMERGE) { + spin_unlock(&ksm_mmlist_lock); + nr_scannable = 0; + nr_frozen = 0; + nr_added = 0; + goto abort; + } + if (ksm_test_exit(mm_slot->mm)) { + if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED)) + atomic_dec(&ksm_scan.nr_scannable); + lksm_remove_mm_slot(mm_slot); + goto next_node; + } + if (!lksm_test_mm_state(mm_slot, KSM_MM_LISTED)) + goto next_node; + + /* prunning by fault count */ + fault_cnt = mm_slot->mm->owner->maj_flt + mm_slot->mm->owner->min_flt; + if (mm_slot->fault_cnt == fault_cnt) + goto next_node; + + mm_slot->fault_cnt = fault_cnt; + mm_slot->scanning_size = get_mm_counter(mm_slot->mm, MM_ANONPAGES); + mm_slot->nr_scans = 0; + delay += mm_slot->elapsed; + ksm_debug("slot(nr_merged: %d, scanning_size: %lu) task(%s)", + mm_slot->nr_merged, mm_slot->scanning_size, + mm_slot->mm->owner->comm); + list_move_tail(&mm_slot->scan_list, &recheck_list); + lksm_clear_mm_state(mm_slot, KSM_MM_SCANNED); + nr_scannable++; + +next_node: + node = rb_next(node); + if (!node) + break; + mm_slot = rb_entry(node, struct mm_slot, ordered_list); + } + spin_unlock(&ksm_mmlist_lock); + +skip_vips: + spin_lock(&ksm_mmlist_lock); + if (!list_empty(&recheck_list)) + list_splice(&recheck_list, &ksm_scan_head.scan_list); + spin_unlock(&ksm_mmlist_lock); + + ksm_scan.scan_mode = LKSM_SCAN_PARTIAL; + ksm_crawl_round++; + + atomic_add(nr_scannable + nr_added, &ksm_scan.nr_scannable); + ksm_debug("nr_frozen: %d nr_added: %d nr_scannable: %d - %d", + nr_frozen, nr_added, nr_scannable, atomic_read(&ksm_scan.nr_scannable)); +abort: + return nr_frozen + nr_added + nr_scannable; +} + +static int lksm_prepare_full_scan(unsigned long *next_fullscan) +{ + int ret, nr_frozen = 0, nr_added = 0, nr_scannable = 0, nr_target; + unsigned long delay = 0; + struct task_struct *task; + struct mm_struct *mm; + + ksm_debug("prepare full scan: round(%lu)", ksm_crawl_round); + + nr_frozen = lksm_prepare_frozen_scan(); + + for_each_process(task) { + if (task == current || task_pid_nr(task) == 0 + || check_short_task(task)) + continue; + if (ksm_run & KSM_RUN_UNMERGE) { + nr_target = 0; + goto abort; + } + + mm = get_task_mm(task); + if (!mm) + continue; + 
ksm_join_write_lock(mm, KSM_TASK_UNFROZEN, ret); + if (ret > 0) + nr_added++; + mmput(mm); + } + + nr_scannable = lksm_count_and_clear_mm_slots(&ksm_mm_head, &delay); + nr_target = nr_scannable + nr_added + nr_frozen; + + /* calculate crawler's sleep time */ + delay += msecs_to_jiffies((nr_frozen + nr_added) * lksm_proc_scan_time); + *next_fullscan = jiffies + delay + msecs_to_jiffies(full_scan_interval); + + ksm_scan.scan_mode = LKSM_SCAN_FULL; + ksm_crawl_round++; + + atomic_add(nr_scannable + nr_added, &ksm_scan.nr_scannable); + ksm_debug("nr_frozen: %d nr_added: %d nr_scannable: %d - %d", + nr_frozen, nr_added, nr_scannable, + atomic_read(&ksm_scan.nr_scannable)); +abort: + return nr_target; +} + +static int lksm_do_wait_userspace_event(unsigned long sleep_time) +{ + wait_event_freezable(ksm_crawl_wait, + kthread_should_stop() || + (atomic_read(&ksm_one_shot_scanning) > 0)); + return atomic_read(&ksm_one_shot_scanning); +} + +static int lksm_do_wait_frozen_event(unsigned long sleep_time) +{ + int need_scan = 0; + + spin_lock_irq(&frozen_task_lock); + if (list_empty(&frozen_task_list)) + /* wait until candidate list is filled */ + wait_event_interruptible_lock_irq_timeout( + ksm_crawl_wait, + kthread_should_stop() + || !list_empty(&frozen_task_list) + || !list_empty(&ksm_scan_head.scan_list), + frozen_task_lock, sleep_time); + + if (!list_empty(&frozen_task_list) || + !list_empty(&ksm_scan_head.scan_list)) + need_scan = 1; + spin_unlock_irq(&frozen_task_lock); + + return need_scan; +} + +static inline void lksm_wake_up_scan_thread(void) +{ + ksm_debug("wake up lksm_scan_thread"); + lksm_set_scan_state(ksm_state); + wake_up(&ksm_thread_wait); +} + +#define LKSM_CRAWL_FROZEN_EVENT_WAIT 100 /* 100ms */ + +static void lksm_do_crawl_once +(unsigned long *next_fscan, unsigned long sleep_time) +{ + int nr_added = 0; + int scan_mode; + + /* cralwer thread waits for trigger event from userspace */ + scan_mode = lksm_do_wait_userspace_event(sleep_time); + + if (scan_mode == LKSM_SCAN_PARTIAL) { + atomic_set(&crawl_state, KSM_CRAWL_RUN); + msleep(LKSM_CRAWL_FROZEN_EVENT_WAIT); + nr_added = lksm_prepare_partial_scan(); + } else if (scan_mode == LKSM_SCAN_FULL) { + atomic_set(&crawl_state, KSM_CRAWL_RUN); + nr_added = lksm_prepare_full_scan(next_fscan); + } + + try_to_freeze(); + + if (nr_added > 0) + lksm_wake_up_scan_thread(); + else { + ksm_debug("No one can be scanned!"); + atomic_set(&ksm_one_shot_scanning, LKSM_SCAN_NONE); + } + atomic_set(&crawl_state, KSM_CRAWL_SLEEP); +} + +static void lksm_do_crawl_periodic +(unsigned long *next_fscan, unsigned long sleep_time) +{ + int nr_added = 0; + + if (time_is_before_eq_jiffies(*next_fscan)) { + atomic_set(&crawl_state, KSM_CRAWL_RUN); + nr_added = lksm_prepare_full_scan(next_fscan); + } else if (lksm_do_wait_frozen_event(sleep_time)) { + atomic_set(&crawl_state, KSM_CRAWL_RUN); + msleep(LKSM_CRAWL_FROZEN_EVENT_WAIT); + nr_added = lksm_prepare_partial_scan(); + } + + try_to_freeze(); + + if (nr_added > 0) + lksm_wake_up_scan_thread(); + atomic_set(&crawl_state, KSM_CRAWL_SLEEP); +} + +static int lksm_crawl_thread(void *data) +{ + int nr_added = 0; + unsigned long next_fscan = jiffies; /* next full scan */ + unsigned long sleep_time = crawler_sleep; + + set_freezable(); + set_user_nice(current, 5); + + ksm_debug("KSM_CRAWLD pid: %d", task_pid_nr(current)); + wait_event_freezable(ksm_crawl_wait, + kthread_should_stop() || ksm_run & KSM_RUN_MERGE); + /* initial loop */ + while (!kthread_should_stop() && ksm_crawl_round < initial_round) { + + 
try_to_freeze(); + + if ((ksm_run & KSM_RUN_MERGE) && + !lksm_check_scan_state(ksm_state) && + time_is_before_eq_jiffies(next_fscan)) { + nr_added = lksm_prepare_full_scan(&next_fscan); + if (nr_added) { + lksm_wake_up_scan_thread(); + nr_added = 0; + } + next_fscan = jiffies + sleep_time; + } + + wait_event_interruptible_timeout(ksm_crawl_wait, + kthread_should_stop() || !lksm_check_scan_state(ksm_state), + sleep_time); + } + + /* initialization loop done */ + full_scan_interval = DEFAULT_FULL_SCAN_INTERVAL; + next_fscan = jiffies + msecs_to_jiffies(full_scan_interval); + atomic_set(&crawl_state, KSM_CRAWL_SLEEP); + + /* normal operation loop */ + while (!kthread_should_stop()) { + if (ksm_run & KSM_RUN_ONESHOT) { + if (!lksm_check_scan_state(ksm_state)) + lksm_do_crawl_once(&next_fscan, sleep_time); + else + /* wait until scanning done */ + wait_event_freezable(ksm_crawl_wait, + !lksm_check_scan_state(ksm_state) + || kthread_should_stop()); + } else if (ksm_run & KSM_RUN_MERGE) { + if (!lksm_check_scan_state(ksm_state)) + lksm_do_crawl_periodic(&next_fscan, sleep_time); + else + /* wait until scanning done */ + wait_event_interruptible_timeout(ksm_crawl_wait, + !lksm_check_scan_state(ksm_state) + || kthread_should_stop(), + sleep_time); + try_to_freeze(); + } else { + ksm_debug("ksm is not activated"); + wait_event_freezable(ksm_crawl_wait, + kthread_should_stop() || (ksm_run & KSM_RUN_MERGE)); + } + } + + return 0; +} + int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { @@ -2462,7 +3437,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, #endif if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { - err = __ksm_enter(mm); + err = __ksm_enter(mm, KSM_TASK_UNFROZEN); if (err) return err; } @@ -2487,17 +3462,23 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, return 0; } -int __ksm_enter(struct mm_struct *mm) +static struct mm_slot *__ksm_enter_alloc_slot(struct mm_struct *mm, int frozen) { struct mm_slot *mm_slot; - int needs_wakeup; mm_slot = alloc_mm_slot(); if (!mm_slot) - return -ENOMEM; + return NULL; - /* Check ksm_run too? 
Would need tighter locking */ - needs_wakeup = list_empty(&ksm_mm_head.mm_list); + if (frozen == KSM_TASK_FROZEN) + lksm_set_mm_state(mm_slot, KSM_MM_FROZEN | KSM_MM_NEWCOMER); + else + lksm_set_mm_state(mm_slot, KSM_MM_LISTED | KSM_MM_NEWCOMER); + + lksm_clear_mm_state(mm_slot, KSM_MM_SCANNED); + RB_CLEAR_NODE(&mm_slot->ordered_list); + mm_slot->fault_cnt = mm->owner->maj_flt + mm->owner->min_flt; + mm_slot->scanning_size = get_mm_counter(mm, MM_ANONPAGES); spin_lock(&ksm_mmlist_lock); insert_to_mm_slots_hash(mm, mm_slot); @@ -2513,16 +3494,22 @@ int __ksm_enter(struct mm_struct *mm) */ if (ksm_run & KSM_RUN_UNMERGE) list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); - else - list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); + else { + list_add_tail(&mm_slot->scan_list, &ksm_scan_head.scan_list); + list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); + } + ksm_nr_added_process++; spin_unlock(&ksm_mmlist_lock); - set_bit(MMF_VM_MERGEABLE, &mm->flags); - mmgrab(mm); + atomic_inc(&mm->mm_count); - if (needs_wakeup) - wake_up_interruptible(&ksm_thread_wait); + return mm_slot; +} +int __ksm_enter(struct mm_struct *mm, int frozen) +{ + if (!__ksm_enter_alloc_slot(mm, frozen)) + return -ENOMEM; return 0; } @@ -2542,16 +3529,32 @@ void __ksm_exit(struct mm_struct *mm) spin_lock(&ksm_mmlist_lock); mm_slot = get_mm_slot(mm); - if (mm_slot && ksm_scan.mm_slot != mm_slot) { + if (!mm_slot) { + spin_unlock(&ksm_mmlist_lock); + return; + } + + if (ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { hash_del(&mm_slot->link); list_del(&mm_slot->mm_list); + list_del(&mm_slot->scan_list); + if (!RB_EMPTY_NODE(&mm_slot->ordered_list)) { + rb_erase(&mm_slot->ordered_list, &vips_list); + RB_CLEAR_NODE(&mm_slot->ordered_list); + } easy_to_free = 1; - } else { - list_move(&mm_slot->mm_list, - &ksm_scan.mm_slot->mm_list); + } else + lksm_remove_mm_slot(mm_slot); + if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN)) { + atomic_dec(&ksm_scan.nr_frozen); + ksm_debug("nr_frozen: %d", atomic_read(&ksm_scan.nr_frozen)); + } else if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED)) { + atomic_dec(&ksm_scan.nr_scannable); + ksm_debug("nr_scannable: %d", atomic_read(&ksm_scan.nr_scannable)); } } + ksm_nr_added_process--; spin_unlock(&ksm_mmlist_lock); if (easy_to_free) { @@ -2900,7 +3903,10 @@ KSM_ATTR(pages_to_scan); static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_run); + if (ksm_run & KSM_RUN_ONESHOT) + return sprintf(buf, "%u\n", KSM_RUN_ONESHOT); + else + return sprintf(buf, "%lu\n", ksm_run); } static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -2912,7 +3918,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, err = kstrtoul(buf, 10, &flags); if (err || flags > UINT_MAX) return -EINVAL; - if (flags > KSM_RUN_UNMERGE) + if (flags > KSM_RUN_ONESHOT) return -EINVAL; /* @@ -2925,7 +3931,10 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_run != flags) { - ksm_run = flags; + if (flags == KSM_RUN_ONESHOT) + ksm_run = KSM_RUN_MERGE | KSM_RUN_ONESHOT; + else + ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { set_current_oom_origin(); err = unmerge_and_remove_all_rmap_items(); @@ -2938,8 +3947,10 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, } mutex_unlock(&ksm_thread_mutex); - if (flags & KSM_RUN_MERGE) - wake_up_interruptible(&ksm_thread_wait); + if 
(ksm_run & KSM_RUN_MERGE) { + ksm_debug("activate KSM"); + wake_up(&ksm_crawl_wait); + } return count; } @@ -3147,10 +4158,88 @@ KSM_ATTR(stable_node_chains_prune_millisecs); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_scan.seqnr); + return sprintf(buf, "%lu\n", ksm_scan.nr_full_scan); } KSM_ATTR_RO(full_scans); +static ssize_t scanning_process_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_nr_added_process); +} +KSM_ATTR_RO(scanning_process); + +static ssize_t full_scan_interval_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", full_scan_interval); +} + +static ssize_t full_scan_interval_store(struct kobject *kbj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long interval; + + err = kstrtoul(buf, 10, &interval); + if (err || interval > UINT_MAX) + return -EINVAL; + + full_scan_interval = interval; + return count; +} +KSM_ATTR(full_scan_interval); + +static ssize_t one_shot_scanning_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", atomic_read(&ksm_one_shot_scanning)); +} + +static ssize_t one_shot_scanning_store(struct kobject *kbj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err, val; + + err = kstrtoint(buf, 10, &val); + if (err || (val != LKSM_SCAN_PARTIAL && val != LKSM_SCAN_FULL)) { + ksm_err("wrong value: %d", val); + return -EINVAL; + } + + if (!atomic_cmpxchg(&ksm_one_shot_scanning, LKSM_SCAN_NONE, val)) { + wake_up(&ksm_crawl_wait); + return count; + } + ksm_debug("ksm is still scanning"); + return -EINVAL; +} +KSM_ATTR(one_shot_scanning); + +static ssize_t scan_boost_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", lksm_boosted_pages_to_scan); +} + +static ssize_t scan_boost_store(struct kobject *kbj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err, val; + + err = kstrtoint(buf, 10, &val); + /* lksm_boosted_pages_to_scan must presence in from 100 to 10000 */ + if (err || val < 100 || val > 10000) { + ksm_err("wrong value: %d", val); + return -EINVAL; + } + + lksm_boosted_pages_to_scan = (unsigned int) val; + + return count; +} +KSM_ATTR(scan_boost); + static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, @@ -3168,6 +4257,10 @@ static struct attribute *ksm_attrs[] = { &stable_node_dups_attr.attr, &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, + &scanning_process_attr.attr, + &full_scan_interval_attr.attr, + &one_shot_scanning_attr.attr, + &scan_boost_attr.attr, NULL, }; @@ -3177,6 +4270,193 @@ static const struct attribute_group ksm_attr_group = { }; #endif /* CONFIG_SYSFS */ +static inline int __lksm_remove_candidate(struct task_struct *task) +{ + int ret = LKSM_TASK_SLOT_NONE; + struct task_slot *slot = get_task_slot(task); + + if (slot) { + list_del(&slot->list); + hash_del(&slot->hlist); + free_task_slot(slot); + ret = LKSM_TASK_SLOT_REMOVED; + } + return ret; +} + +/* called by ksm_exit */ +void lksm_remove_candidate(struct mm_struct *mm) +{ + int ret; + + if (!mm->owner) { + struct mm_slot *mm_slot; + + spin_lock(&ksm_mmlist_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && mm_slot != ksm_scan.mm_slot) { + list_move(&mm_slot->mm_list, &ksm_scan.remove_mm_list); + if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN)) + 
atomic_dec(&ksm_scan.nr_frozen); + else if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED)) + atomic_dec(&ksm_scan.nr_scannable); + ksm_debug("mm_slot: %p will be exited", mm_slot); + } + spin_unlock(&ksm_mmlist_lock); + return; + } + + if (!ksm_test_exit(mm)) + ksm_debug("proc-%d(%s) will be removed", + task_pid_nr(mm->owner), mm->owner->comm); + + spin_lock(&frozen_task_lock); + ret = __lksm_remove_candidate(mm->owner); + spin_unlock(&frozen_task_lock); + if (ret == LKSM_TASK_SLOT_REMOVED) + put_task_struct(mm->owner); +} + +static int lksm_task_frozen(struct task_struct *task) +{ + int need_wakeup = 0; + struct mm_struct *mm = task->mm; + struct mm_slot *mm_slot; + struct task_slot *task_slot; + + if (mm && test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + /* a mergeable task becoming frozen */ + spin_lock(&ksm_mmlist_lock); + mm_slot = get_mm_slot(mm); + BUG_ON(!mm_slot); + + if (mm_slot != ksm_scan.mm_slot + && lksm_test_mm_state(mm_slot, KSM_MM_LISTED)) { + if (list_empty(&mm_slot->scan_list)) + list_add_tail(&mm_slot->scan_list, &ksm_scan_head.scan_list); + if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED)) + atomic_dec(&ksm_scan.nr_scannable); + lksm_clear_mm_state(mm_slot, KSM_MM_LISTED); + lksm_set_mm_state(mm_slot, KSM_MM_FROZEN); + atomic_inc(&ksm_scan.nr_frozen); + + need_wakeup = (ksm_run == KSM_RUN_MERGE); + ksm_debug("lksm_task_frozen called for task(%s): %p (nr_frozen: %d)", + task->comm, task, atomic_read(&ksm_scan.nr_frozen)); + } + spin_unlock(&ksm_mmlist_lock); + } else { + task_slot = alloc_task_slot(); + if (!task_slot) { + ksm_err("[ksm_tizen] Cannot allocate memory for task_slot\n"); + return -ENOMEM; + } + + task_slot->task = task; + task_slot->frozen = KSM_TASK_FROZEN; + task_slot->inserted = jiffies; + + get_task_struct(task); + + spin_lock(&frozen_task_lock); + list_add(&task_slot->list, &frozen_task_list); + insert_to_task_slots_hash(task_slot); + spin_unlock(&frozen_task_lock); + + need_wakeup = (ksm_run == KSM_RUN_MERGE); + ksm_debug("task-%d(%s) is added to frozen task list", + task_pid_nr(task), task->comm); + } + + if (need_wakeup && atomic_read(&crawl_state) == KSM_CRAWL_SLEEP) + wake_up(&ksm_crawl_wait); + + return 0; +} + +static int lksm_task_thawed(struct task_struct *task) +{ + struct mm_struct *mm = task->mm; + struct mm_slot *mm_slot; + struct task_slot *task_slot; + + if (mm && test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + /* a frozen task becoming thawed */ + spin_lock(&ksm_mmlist_lock); + mm_slot = get_mm_slot(mm); + BUG_ON(!mm_slot); + + if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN) + && ksm_scan.mm_slot != mm_slot) { + if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED)) + atomic_inc(&ksm_scan.nr_scannable); + else + list_del_init(&mm_slot->scan_list); + lksm_clear_mm_state(mm_slot, KSM_MM_FROZEN); + lksm_set_mm_state(mm_slot, KSM_MM_LISTED); + atomic_dec(&ksm_scan.nr_frozen); + ksm_debug("nr_frozen: %d nr_scannable: %d", + atomic_read(&ksm_scan.nr_frozen), + atomic_read(&ksm_scan.nr_scannable)); + } + spin_unlock(&ksm_mmlist_lock); + } else { + /* just remove task slot, it will be cared by full_scan */ + spin_lock(&frozen_task_lock); + task_slot = get_task_slot(task); + if (task_slot) { + list_del(&task_slot->list); + hash_del(&task_slot->hlist); + } + spin_unlock(&frozen_task_lock); + if (task_slot) { + free_task_slot(task_slot); + put_task_struct(task); + ksm_debug("task-%d(%s) is removed from frozen task list", + task_pid_nr(task), task->comm); + } + } + + return 0; +} + +/* + * lksm_hint: a hook for construct candidate list + * this function cannot 
sleep + */ +int lksm_hint(struct task_struct *task, int frozen) +{ + /* + * If lksm_hint is called by ksm_fork, the task yet has its own + * mm_struct because it does not completes mm_struct initialization. + * Thus, we skip this check and put the task into candidate list. + */ + if (frozen == KSM_TASK_FROZEN) + return lksm_task_frozen(task); + else if (frozen == KSM_TASK_THAWED) + return lksm_task_thawed(task); + else + return 0; +} + +static void __init lksm_init(void) +{ + ksm_crawld = kthread_create(lksm_crawl_thread, NULL, "ksm_crawld"); + + if (ksm_crawld == NULL) { + printk(KERN_ALERT "fail to create ksm crawler daemon\n"); + return; + } + + atomic_set(&ksm_scan.nr_frozen, 0); + atomic_set(&ksm_scan.nr_scannable, 0); + atomic_set(&ksm_state, 0); + INIT_LIST_HEAD(&ksm_scan.remove_mm_list); + + crawler_sleep = msecs_to_jiffies(1000); + wake_up_process(ksm_crawld); +} + static int __init ksm_init(void) { struct task_struct *ksm_thread; @@ -3191,7 +4471,7 @@ static int __init ksm_init(void) if (err) goto out; - ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); + ksm_thread = kthread_run(lksm_scan_thread, NULL, "ksmd"); if (IS_ERR(ksm_thread)) { pr_err("ksm: creating kthread failed\n"); err = PTR_ERR(ksm_thread); @@ -3209,7 +4489,7 @@ static int __init ksm_init(void) ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ #endif /* CONFIG_SYSFS */ - + lksm_init(); #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no significance to this priority 100 */ hotplug_memory_notifier(ksm_memory_callback, 100); -- 2.7.4
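
For reference, the sketch below shows how a user-level policy daemon
(such as resourced) is expected to drive the one-shot interface added
by this patch. It is a minimal user-space example, not part of the
patch itself; it assumes the usual KSM sysfs directory
/sys/kernel/mm/ksm/ and uses the values defined above: run = 8
(KSM_RUN_ONESHOT), and one_shot_scanning = 1 (LKSM_SCAN_PARTIAL) or
2 (LKSM_SCAN_FULL).

/*
 * Minimal user-space sketch (not part of the patch): triggering LKSM's
 * one-shot scanning from a policy daemon.  Values come from the patch:
 * writing 8 to "run" selects KSM_RUN_ONESHOT, and "one_shot_scanning"
 * accepts 1 (LKSM_SCAN_PARTIAL) or 2 (LKSM_SCAN_FULL).  The sysfs
 * directory is assumed to be the usual /sys/kernel/mm/ksm/.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_ksm_attr(const char *attr, const char *val)
{
	char path[128];
	int fd, ret = 0;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", attr);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0)
		ret = -1;	/* fails with EINVAL if a scan is in flight */
	close(fd);
	return ret;
}

int main(void)
{
	/* 8 == KSM_RUN_ONESHOT: crawler and scanner block until a request */
	if (write_ksm_attr("run", "8"))
		return 1;

	/* 1 == LKSM_SCAN_PARTIAL, 2 == LKSM_SCAN_FULL */
	if (write_ksm_attr("one_shot_scanning", "1"))
		fprintf(stderr, "scan request rejected (scan in progress?)\n");

	return 0;
}

If a scan is already running, the write to one_shot_scanning is
rejected with -EINVAL (the cmpxchg on ksm_one_shot_scanning fails), so
the daemon should retry after the current scan completes.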
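
The ordering of the VIP list relies on a small sliding window of
per-slot merge results: after each scan of an mm_slot, the latest merge
count replaces the oldest of the last MERGE_WIN (three) entries and the
window sum becomes the slot's key in vips_list. The stand-alone sketch
below illustrates that accounting with hypothetical numbers; the names
mirror the fields of struct mm_slot, but the program is an
illustration, not kernel code.

/*
 * Illustration only: LKSM's sliding merge window.  Each completed scan
 * stores its merge count in a 3-entry ring; the sum over the window is
 * the key used to keep productive processes in vips_list.
 */
#include <stdio.h>

#define MERGE_WIN 3

struct slot_stats {
	int merge_idx;
	int nr_merged_win[MERGE_WIN];
	int nr_merged;
};

static int account_nr_merge(struct slot_stats *s, int nr_merged)
{
	int i, sum = 0;

	s->nr_merged_win[s->merge_idx++] = nr_merged;
	if (s->merge_idx == MERGE_WIN)
		s->merge_idx = 0;
	for (i = 0; i < MERGE_WIN; i++)
		sum += s->nr_merged_win[i];
	s->nr_merged = sum;
	return s->nr_merged;	/* zero means the slot leaves vips_list */
}

int main(void)
{
	struct slot_stats s = { 0 };
	int scans[] = { 40, 10, 0, 0, 0 };	/* merge counts per scan */
	unsigned int i;

	for (i = 0; i < sizeof(scans) / sizeof(scans[0]); i++)
		printf("scan %u: window sum = %d\n",
		       i, account_nr_merge(&s, scans[i]));
	return 0;
}

A slot whose window sum drops to zero is not re-inserted into
vips_list, so a process that stops producing merges ages out of the
partial-scan candidate set after three scans.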