// SPDX-License-Identifier: GPL-2.0-only
/*
+ * Lightweight KSM.
+ *
+ * This code provides a lightweight version of KSM.
+ *
+ * Copyright (C) 2020 Samsung Electronics Co., Ltd.
+ * Author: Sung-hun Kim (sfoon.kim@samsung.com)
+ */
+
+/*
* Memory merging support.
*
* This code enables dynamic sharing of identical pages found in different
#define DO_NUMA(x) do { } while (0)
#endif
+#define ksm_debug(fmt, ...) \
+ printk(KERN_DEBUG "[ksm:%s:%d] " fmt "\n", __func__, __LINE__, ##__VA_ARGS__)
+#define ksm_err(fmt, ...) \
+ printk(KERN_ERR "[ksm:%s:%d] " fmt "\n", __func__, __LINE__, ##__VA_ARGS__)
+
/**
* DOC: Overview
*
* stable trees and multiple unstable trees: one of each for each NUMA node.
*/
+/*
+ * A few notes about lightweight KSM.
+ *
+ * The smart crawler leverages the semantics of tasks in Tizen:
+ * when an application goes to the background, it is attached to the
+ * freezer task group. The LKSM crawler hooks this event and adds the
+ * "frozen task" to the candidate list for scanning.
+ */
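+
+/*
+ * A rough sketch of the overall flow (the names below are all from this
+ * file; the freezer-side hook lives in kernel/cgroup_freezer.c):
+ *
+ *   task enters the freezer cgroup
+ *     -> lksm_hint(task, KSM_TASK_FROZEN)	(hook, cannot sleep)
+ *        -> task_slot queued on frozen_task_list
+ *   crawler (ksm_crawld) wakes up
+ *     -> lksm_prepare_frozen_scan()		(mm joins via ksm_join)
+ *   scanner (ksmd) wakes up
+ *     -> scans slots on ksm_scan_head.scan_list
+ */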
+
+/* merge window size */
+#define MERGE_WIN 3
+
/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
+ *
+ * extension - added for LKSM
+ * @scan_list: link into the scan list, rooted in ksm_scan_head
+ * @state: state of mm_slot (frozen, listed, scanned, newcomer)
+ * @merge_idx: index into the merge window at which to store the next merge count
+ * @nr_merged_win: merge window keeping the three most recent merge counts
+ * @nr_merged: sum of nr_merged_win, used to maintain vips_list (an ordered tree)
+ * @ordered_list: node in the tree of slots ordered by nr_merged
+ * @scanning_size: number of anonymous pages in the mm_struct
+ * @fault_cnt: last read count of page faults (minor + major)
+ * @elapsed: elapsed scanning time
+ * @nr_scans: number of scanned pages (can differ from scanning_size)
 */
struct mm_slot {
struct hlist_node link;
struct list_head mm_list;
+ struct list_head scan_list;
struct rmap_item *rmap_list;
struct mm_struct *mm;
+
+ short state;
+
+ short merge_idx;
+ int nr_merged_win[MERGE_WIN];
+ int nr_merged;
+ struct rb_node ordered_list;
+
+ unsigned long scanning_size; /* in number of pages */
+ unsigned long fault_cnt;
+ unsigned long elapsed;
+ int nr_scans;
+};
+
+/*
+ * scanning mode of LKSM:
+ * LKSM_SCAN_PARTIAL: perform deduplication on subset of processes
+ * LKSM_SCAN_FULL: perform deduplication on full set of processes
+ */
+enum lksm_scan_mode {
+ LKSM_SCAN_NONE,
+ LKSM_SCAN_PARTIAL,
+ LKSM_SCAN_FULL,
};
/**
* struct ksm_scan - cursor for scanning
- * @mm_slot: the current mm_slot we are scanning
* @address: the next address inside that to be scanned
* @rmap_list: link to the next rmap to be scanned in the rmap_list
- * @seqnr: count of completed full scans (needed when removing unstable node)
+ * @mm_slot: the current mm_slot we are scanning
+ * @remove_mm_list: temporary list for batching flush of removed slots
+ * @nr_scannable: the number of remaining unscanned scannable slots
+ * @nr_frozen: the number of remaining unscanned frozen slots
+ * @scan_round: scanning round (partial + full)
+ * @nr_full_scan: the number of completed full scans
+ * @scan_mode: coverage of current scanning
*
* There is only the one ksm_scan instance of this cursor structure.
*/
struct ksm_scan {
- struct mm_slot *mm_slot;
unsigned long address;
struct rmap_item **rmap_list;
- unsigned long seqnr;
+
+ struct mm_slot *mm_slot;
+ struct list_head remove_mm_list;
+
+ /* statistics of scanning targets */
+ atomic_t nr_scannable;
+ atomic_t nr_frozen;
+
+ unsigned long scan_round;
+ unsigned long nr_full_scan;
+
+ enum lksm_scan_mode scan_mode;
};
/**
};
struct mm_struct *mm;
unsigned long address; /* + low bits used for flags below */
- unsigned int oldchecksum; /* when unstable */
+ unsigned int oldchecksum; /* when unstable (LSB is a frozen bit) */
union {
struct rb_node node; /* when node of unstable tree */
struct { /* when listed from stable tree */
};
};
-#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
+#define SEQNR_MASK 0x0ff /* low bits of unstable tree scan_round */
#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
#define STABLE_FLAG 0x200 /* is listed from the stable tree */
#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;
+#define LKSM_NODE_ID 0
+
/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
+/* rbtree of VIP processes, ordered by nr_merged */
+static struct rb_root vips_list = RB_ROOT;
+static int lksm_max_vips = 20;
+
#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+static DEFINE_HASHTABLE(task_slots_hash, MM_SLOTS_HASH_BITS);
+/*
+ * two list heads in LKSM:
+ * - ksm_mm_head: a head for traversing the whole list of processes,
+ *		not used for scanning itself
+ * - ksm_scan_head: a head for the list of currently scanning processes
+ */
static struct mm_slot ksm_mm_head = {
.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
+
+static struct mm_slot ksm_scan_head = {
+ .scan_list = LIST_HEAD_INIT(ksm_scan_head.scan_list),
+};
+
static struct ksm_scan ksm_scan = {
- .mm_slot = &ksm_mm_head,
+ .mm_slot = &ksm_scan_head,
};
static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;
+static struct kmem_cache *task_slot_cache;
/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;
/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;
+/* Processes tracked by KSM thread */
+static unsigned int ksm_nr_added_process;
+
/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;
+/* An indicator for KSM scanning */
+static atomic_t ksm_one_shot_scanning;
+
+/* pages_to_scan boost applied while the scanner performs a partial scan */
+static unsigned int lksm_boosted_pages_to_scan = 100;
+static unsigned int lksm_default_pages_to_scan = 100;
+
#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
#define ksm_nr_node_ids 1
#endif
+/*
+ * Default policy for KSM_RUN_ONESHOT:
+ * KSM performs scanning (partial or full) only when the user requests it.
+ * Once a scan has finished, both the crawler and the scanner threads are
+ * blocked until the next request arrives.
+ */
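+/*
+ * For example (a sketch, assuming the standard KSM sysfs location):
+ *   echo 8 > /sys/kernel/mm/ksm/run		      (enable one-shot mode)
+ *   echo 1 > /sys/kernel/mm/ksm/one_shot_scanning   (1: partial, 2: full)
+ */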
#define KSM_RUN_STOP 0
#define KSM_RUN_MERGE 1
#define KSM_RUN_UNMERGE 2
#define KSM_RUN_OFFLINE 4
+#define KSM_RUN_ONESHOT 8
+
static unsigned long ksm_run = KSM_RUN_STOP;
+static atomic_t ksm_state; /* 0: in crawling 1: in scanning */
+
+#define lksm_check_scan_state(ksm_state) (atomic_read(&ksm_state) == 1)
+#define lksm_set_scan_state(ksm_state) (atomic_set(&ksm_state, 1))
+#define lksm_clear_scan_state(ksm_state) (atomic_set(&ksm_state, 0))
+
+struct task_slot {
+ struct task_struct *task;
+ int frozen;
+ unsigned long inserted;
+ struct list_head list;
+ struct hlist_node hlist;
+};
+
+/*
+ * Frozen state:
+ * When a process stops running in the foreground (e.g., it goes to the
+ * background), the system daemon (e.g., resourced) puts it into the
+ * freezer cgroup. Once a process joins the freezer cgroup, the kernel
+ * no longer counts it as runnable, and thus it cannot be scheduled on a
+ * CPU. We therefore regard processes in the freezer cgroup as frozen,
+ * which makes them good candidates for memory deduplication.
+ *
+ * LKSM provides a hook to catch the moment a process is being frozen.
+ * With the hook, the ksm crawler can build the candidate list for memory
+ * deduplication. (see kernel/cgroup_freezer.c)
+ */
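+/*
+ * A minimal sketch of the expected hook site; the function name below is
+ * hypothetical, the actual wiring is in kernel/cgroup_freezer.c:
+ *
+ *	static void freezer_state_changed(struct task_struct *task, bool frozen)
+ *	{
+ *		lksm_hint(task, frozen ? KSM_TASK_FROZEN : KSM_TASK_THAWED);
+ *	}
+ */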
+#define FROZEN_BIT 0x01
+#define LISTED_BIT 0x02
+
+#define lksm_test_rmap_frozen(rmap_item) (rmap_item->oldchecksum & FROZEN_BIT)
+#define lksm_set_rmap_frozen(rmap_item) (rmap_item->oldchecksum |= FROZEN_BIT)
+#define lksm_clear_rmap_frozen(rmap_item) (rmap_item->oldchecksum &= ~FROZEN_BIT)
+#define lksm_clear_checksum_frozen(checksum) (checksum &= ~FROZEN_BIT)
+
+#define KSM_MM_FROZEN 0x01
+#define KSM_MM_LISTED 0x02
+#define KSM_MM_NEWCOMER 0x04
+#define KSM_MM_SCANNED 0x08
+
+#define lksm_test_mm_state(mm_slot, bit) (mm_slot->state & bit)
+#define lksm_set_mm_state(mm_slot, bit) (mm_slot->state |= bit)
+#define lksm_clear_mm_state(mm_slot, bit) (mm_slot->state &= ~bit)
+
+static int initial_round = 3;
+static unsigned long ksm_crawl_round;
+static unsigned long crawler_sleep;
+
+/* statistical information */
+static int lksm_nr_merged; /* global merge count */
+static int lksm_nr_broken; /* global broken count */
+static int lksm_nr_scanned_slot; /* global scanned slot count */
+static int lksm_slot_nr_merged; /* per-slot merge count */
+static int lksm_slot_nr_broken; /* per-slot broken count */
+
+/* initially, KSM uses a short full scan interval */
+#define DEFAULT_FULL_SCAN_INTERVAL 60000 /* 60 seconds */
+static unsigned long full_scan_interval = 100;
+
+/* statistical information about scanning time */
+static unsigned long lksm_last_scan_time;
+static unsigned long lksm_proc_scan_time;
+
+/* helpers for pruning short-lived tasks */
+#define KSM_SHORT_TASK_TIME 100
+static unsigned long short_lived_thresh = KSM_SHORT_TASK_TIME;
+
+#define get_task_runtime(task) (task->se.sum_exec_runtime)
+#define ms_to_ns(ms) ((ms) * 1000 * 1000)
+#define check_short_task(task) \
+ (get_task_runtime(task) < ms_to_ns(short_lived_thresh))
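+
+/*
+ * e.g., with the default threshold of 100 ms, a task whose
+ * se.sum_exec_runtime is below 100 * 1000 * 1000 ns is treated as
+ * short-lived and is skipped by the crawler.
+ */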
+
static void wait_while_offlining(void);
+static struct mm_slot *__ksm_enter_alloc_slot(struct mm_struct *mm, int frozen);
static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);
+static DECLARE_WAIT_QUEUE_HEAD(ksm_crawl_wait);
#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
sizeof(struct __struct), __alignof__(struct __struct),\
mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
if (!mm_slot_cache)
goto out_free2;
+ task_slot_cache = KSM_KMEM_CACHE(task_slot, 0);
+ if (!task_slot_cache)
+ goto out_free3;
return 0;
+out_free3:
+ kmem_cache_destroy(mm_slot_cache);
out_free2:
kmem_cache_destroy(stable_node_cache);
out_free1:
hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}
+static inline struct task_slot *alloc_task_slot(void)
+{
+ if (!task_slot_cache)
+ return NULL;
+ return kmem_cache_zalloc(task_slot_cache, GFP_NOWAIT);
+}
+
+static inline void free_task_slot(struct task_slot *task_slot)
+{
+ kmem_cache_free(task_slot_cache, task_slot);
+}
+
+static struct task_slot *get_task_slot(struct task_struct *task)
+{
+ struct task_slot *slot;
+
+ hash_for_each_possible(task_slots_hash, slot, hlist,
+ (unsigned long)task)
+ if (slot->task == task)
+ return slot;
+ return NULL;
+}
+
+static inline void insert_to_task_slots_hash(struct task_slot *slot)
+{
+ hash_add(task_slots_hash, &slot->hlist, (unsigned long)slot->task);
+}
+
/*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary,
}
/*
+ * ksm_join: a wrapper around __ksm_enter_alloc_slot.
+ * It sets the VM_MERGEABLE flag on all eligible VMAs of the given mm_struct.
+ */
+static int ksm_join(struct mm_struct *mm, int frozen)
+{
+ struct vm_area_struct *vma;
+ struct mm_slot *slot;
+ int newly_allocated = 0;
+
+ if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ slot = __ksm_enter_alloc_slot(mm, frozen);
+ if (!slot)
+ return -ENOMEM;
+ newly_allocated = 1;
+ } else {
+ slot = get_mm_slot(mm);
+ if (!slot) {
+ ksm_err("there is no mm_slot for %p", mm);
+ return -EINVAL;
+ }
+ }
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
+ VM_PFNMAP | VM_IO | VM_DONTEXPAND |
+ VM_HUGETLB | VM_MIXEDMAP))
+ continue;
+ vma->vm_flags |= VM_MERGEABLE;
+ }
+
+ return newly_allocated;
+}
+
+#define ksm_join_write_lock(mm, frozen, ret) do {\
+	down_write(&(mm)->mmap_sem); \
+	ret = ksm_join(mm, frozen); \
+	up_write(&(mm)->mmap_sem); \
+} while (0)
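+
+/*
+ * Usage (matching the crawler paths below):
+ *
+ *	int err;
+ *
+ *	ksm_join_write_lock(mm, KSM_TASK_UNFROZEN, err);
+ *	if (err > 0)
+ *		nr_added++;	(a new mm_slot was allocated and listed)
+ */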
+
+/*
* This helper is used for getting right index into array of tree roots.
* When merge_across_nodes knob is set to 1, there are only two rb-trees for
* stable and unstable pages from all nodes with roots in index 0. Otherwise,
BUG_ON(stable_node->rmap_hlist_len < 0);
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
- if (rmap_item->hlist.next)
+ if (rmap_item->hlist.next) {
ksm_pages_sharing--;
- else
+ lksm_slot_nr_broken++;
+ lksm_nr_broken++;
+ } else
ksm_pages_shared--;
VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
stable_node->rmap_hlist_len--;
unlock_page(page);
put_page(page);
- if (!hlist_empty(&stable_node->hlist))
+ if (!hlist_empty(&stable_node->hlist)) {
ksm_pages_sharing--;
- else
+ lksm_slot_nr_broken++;
+ lksm_nr_broken++;
+ } else
ksm_pages_shared--;
VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
stable_node->rmap_hlist_len--;
* if this rmap_item was inserted by this scan, rather
* than left over from before.
*/
- age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
- BUG_ON(age > 1);
+ age = (unsigned char)(ksm_scan.scan_round - rmap_item->address);
if (!age)
rb_erase(&rmap_item->node,
root_unstable_tree + NUMA(rmap_item->nid));
+ else
+ RB_CLEAR_NODE(&rmap_item->node);
+
ksm_pages_unshared--;
rmap_item->address &= PAGE_MASK;
}
/* Clean up stable nodes, but don't worry if some are still busy */
remove_all_stable_nodes();
- ksm_scan.seqnr = 0;
+ ksm_scan.scan_round = 0;
return 0;
error:
void *addr = kmap_atomic(page);
checksum = xxhash(addr, PAGE_SIZE, 0);
kunmap_atomic(addr);
- return checksum;
+ return lksm_clear_checksum_frozen(checksum);
}
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
}
rmap_item->address |= UNSTABLE_FLAG;
- rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
+ rmap_item->address |= (ksm_scan.scan_round & SEQNR_MASK);
DO_NUMA(rmap_item->nid = nid);
rb_link_node(&rmap_item->node, parent, new);
rb_insert_color(&rmap_item->node, root);
rmap_item->address |= STABLE_FLAG;
hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
- if (rmap_item->hlist.next)
+ if (rmap_item->hlist.next) {
ksm_pages_sharing++;
- else
+ lksm_slot_nr_merged++;
+ lksm_nr_merged++;
+ } else
ksm_pages_shared++;
}
}
/*
- * If the hash value of the page has changed from the last time
- * we calculated it, this page is changing frequently: therefore we
- * don't want to insert it in the unstable tree, and we don't want
- * to waste our time searching for something identical to it there.
+	 * LKSM: KSM runs in an event-triggered manner, so scanning happens
+	 * much less frequently. To give pages more chances to be scanned,
+	 * LKSM skips the checksum-stability test after the initial rounds.
*/
- checksum = calc_checksum(page);
- if (rmap_item->oldchecksum != checksum) {
- rmap_item->oldchecksum = checksum;
- return;
+ if (ksm_scan.scan_round < initial_round
+ && !lksm_test_rmap_frozen(rmap_item)) {
+ checksum = calc_checksum(page);
+ if (rmap_item->oldchecksum != checksum) {
+ rmap_item->oldchecksum = checksum;
+ return;
+ }
}
/*
while (*rmap_list) {
rmap_item = *rmap_list;
- if ((rmap_item->address & PAGE_MASK) == addr)
+ if ((rmap_item->address & PAGE_MASK) == addr) {
+ if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN)
+ && rmap_item->address & UNSTABLE_FLAG)
+ lksm_set_rmap_frozen(rmap_item);
+ else
+ lksm_clear_rmap_frozen(rmap_item);
return rmap_item;
+ }
if (rmap_item->address > addr)
break;
*rmap_list = rmap_item->rmap_list;
rmap_item->address = addr;
rmap_item->rmap_list = *rmap_list;
*rmap_list = rmap_item;
+		if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN))
+ lksm_set_rmap_frozen(rmap_item);
+ else
+ lksm_clear_rmap_frozen(rmap_item);
}
return rmap_item;
}
-static struct rmap_item *scan_get_next_rmap_item(struct page **page)
+/*
+ * lksm_flush_removed_mm_list:
+ * batched flushing of mm_slots removed by lksm_remove_mm_slot
+ */
+static void lksm_flush_removed_mm_list(void)
{
- struct mm_struct *mm;
- struct mm_slot *slot;
- struct vm_area_struct *vma;
- struct rmap_item *rmap_item;
- int nid;
+ struct mm_slot *head, *next, *slot;
- if (list_empty(&ksm_mm_head.mm_list))
- return NULL;
+ spin_lock(&ksm_mmlist_lock);
+ head = list_first_entry_or_null(&ksm_scan.remove_mm_list,
+ struct mm_slot, mm_list);
+ if (!head) {
+ spin_unlock(&ksm_mmlist_lock);
+ return;
+ }
- slot = ksm_scan.mm_slot;
- if (slot == &ksm_mm_head) {
- /*
- * A number of pages can hang around indefinitely on per-cpu
- * pagevecs, raised page count preventing write_protect_page
- * from merging them. Though it doesn't really matter much,
- * it is puzzling to see some stuck in pages_volatile until
- * other activity jostles them out, and they also prevented
- * LTP's KSM test from succeeding deterministically; so drain
- * them here (here rather than on entry to ksm_do_scan(),
- * so we don't IPI too often when pages_to_scan is set low).
- */
- lru_add_drain_all();
+ list_del_init(&ksm_scan.remove_mm_list);
+ spin_unlock(&ksm_mmlist_lock);
- /*
- * Whereas stale stable_nodes on the stable_tree itself
- * get pruned in the regular course of stable_tree_search(),
- * those moved out to the migrate_nodes list can accumulate:
- * so prune them once before each full scan.
- */
- if (!ksm_merge_across_nodes) {
- struct stable_node *stable_node, *next;
- struct page *page;
-
- list_for_each_entry_safe(stable_node, next,
- &migrate_nodes, list) {
- page = get_ksm_page(stable_node,
- GET_KSM_PAGE_NOLOCK);
- if (page)
- put_page(page);
- cond_resched();
- }
+ if (!list_empty(&head->mm_list)) {
+ list_for_each_entry_safe(slot, next, &head->mm_list, mm_list) {
+ ksm_debug("slot(%p) will be freed", slot);
+ list_del(&slot->mm_list);
+
+ cond_resched();
+
+ remove_trailing_rmap_items(slot, &slot->rmap_list);
+ clear_bit(MMF_VM_MERGEABLE, &slot->mm->flags);
+ mmdrop(slot->mm);
+ free_mm_slot(slot);
+ }
+ }
+
+ ksm_debug("slot(%p) will be freed", head);
+
+ cond_resched();
+ remove_trailing_rmap_items(head, &head->rmap_list);
+ clear_bit(MMF_VM_MERGEABLE, &head->mm->flags);
+ mmdrop(head->mm);
+ free_mm_slot(head);
+}
+
+/*
+ * remove mm_slot from lists
+ * LKSM defers releasing the slot's resources until the end of scanning
+ */
+static inline void lksm_remove_mm_slot(struct mm_slot *slot)
+{
+ hash_del(&slot->link);
+ list_del_init(&slot->scan_list);
+ list_move(&slot->mm_list, &ksm_scan.remove_mm_list);
+ if (!RB_EMPTY_NODE(&slot->ordered_list)) {
+ rb_erase(&slot->ordered_list, &vips_list);
+ RB_CLEAR_NODE(&slot->ordered_list);
+ }
+}
+
+/* caller must hold ksm_mmlist_lock */
+static struct mm_slot *lksm_get_unscanned_mm_slot(struct mm_slot *slot)
+{
+ struct mm_slot *next;
+
+ list_for_each_entry_safe_continue(slot, next, &ksm_scan_head.scan_list,
+ scan_list) {
+ if (ksm_test_exit(slot->mm)) {
+ ksm_debug("slot:%p %p is moved to remove list", slot, slot->mm);
+ if (lksm_test_mm_state(slot, KSM_MM_FROZEN))
+ atomic_dec(&ksm_scan.nr_frozen);
+ else
+ atomic_dec(&ksm_scan.nr_scannable);
+ lksm_remove_mm_slot(slot);
+ continue;
}
- for (nid = 0; nid < ksm_nr_node_ids; nid++)
- root_unstable_tree[nid] = RB_ROOT;
+ lksm_nr_scanned_slot++;
+ break;
+ }
+
+ return slot;
+}
- spin_lock(&ksm_mmlist_lock);
- slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
- ksm_scan.mm_slot = slot;
- spin_unlock(&ksm_mmlist_lock);
- /*
- * Although we tested list_empty() above, a racing __ksm_exit
- * of the last mm on the list may have removed it since then.
- */
- if (slot == &ksm_mm_head)
- return NULL;
-next_mm:
- ksm_scan.address = 0;
- ksm_scan.rmap_list = &slot->rmap_list;
+/* caller must hold ksm_mmlist_lock */
+static void lksm_insert_mm_slot_ordered(struct mm_slot *slot)
+{
+ struct rb_root *root;
+ struct rb_node **new;
+ struct rb_node *parent;
+ struct mm_slot *temp_slot;
+
+ root = &vips_list;
+ parent = NULL;
+ new = &root->rb_node;
+
+ while (*new) {
+ temp_slot = rb_entry(*new, struct mm_slot, ordered_list);
+
+ parent = *new;
+ if (slot->nr_merged > temp_slot->nr_merged)
+ new = &parent->rb_left;
+ else
+ new = &parent->rb_right;
}
- mm = slot->mm;
- down_read(&mm->mmap_sem);
+ rb_link_node(&slot->ordered_list, parent, new);
+ rb_insert_color(&slot->ordered_list, root);
+}
+
+static struct rmap_item *__scan_next_rmap_item(struct page **page,
+ struct mm_struct *mm, struct mm_slot *slot)
+{
+ struct vm_area_struct *vma;
+ struct rmap_item *rmap_item;
+
if (ksm_test_exit(mm))
vma = NULL;
else
ksm_scan.address += PAGE_SIZE;
} else
put_page(*page);
- up_read(&mm->mmap_sem);
return rmap_item;
}
put_page(*page);
}
}
+ return NULL;
+}
+
+static inline int sum_merge_win(int merge_win[], int len)
+{
+ int i, sum = 0;
+
+ for (i = 0; i < len; i++)
+ sum += merge_win[i];
+ return sum;
+}
+
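+/*
+ * lksm_account_mm_slot_nr_merge below keeps a sliding window of per-round
+ * merge counts: e.g., with MERGE_WIN == 3, after rounds that merged 5, 3
+ * and 7 pages, nr_merged == 15; a fourth round overwrites the oldest
+ * entry (5), so only the three most recent rounds are accounted.
+ */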
+static inline int lksm_account_mm_slot_nr_merge(struct mm_slot *slot, int nr_merged)
+{
+ slot->nr_merged_win[slot->merge_idx++] = nr_merged;
+ if (slot->merge_idx == MERGE_WIN)
+ slot->merge_idx = 0;
+ slot->nr_merged = sum_merge_win(slot->nr_merged_win, MERGE_WIN);
+ return slot->nr_merged;
+}
+
+static struct rmap_item *scan_get_next_rmap_item(struct page **page)
+{
+ struct mm_struct *mm;
+ struct mm_slot *slot;
+ struct rmap_item *rmap_item;
+
+ if (list_empty(&ksm_scan_head.scan_list))
+ return NULL;
+
+ slot = ksm_scan.mm_slot;
+ if (slot == &ksm_scan_head) {
+ /*
+ * A number of pages can hang around indefinitely on per-cpu
+ * pagevecs, raised page count preventing write_protect_page
+ * from merging them. Though it doesn't really matter much,
+ * it is puzzling to see some stuck in pages_volatile until
+ * other activity jostles them out, and they also prevented
+ * LTP's KSM test from succeeding deterministically; so drain
+ * them here (here rather than on entry to ksm_do_scan(),
+ * so we don't IPI too often when pages_to_scan is set low).
+ */
+ lru_add_drain_all();
+
+ if (ksm_scan.scan_round < ksm_crawl_round) {
+ ksm_scan.scan_round = ksm_crawl_round;
+ root_unstable_tree[LKSM_NODE_ID] = RB_ROOT;
+ }
+
+ spin_lock(&ksm_mmlist_lock);
+ slot = lksm_get_unscanned_mm_slot(slot);
+ ksm_scan.mm_slot = slot;
+ spin_unlock(&ksm_mmlist_lock);
+
+ /*
+ * Although we tested list_empty() above, a racing __ksm_exit
+ * of the last mm on the list may have removed it since then.
+ */
+ if (slot == &ksm_scan_head)
+ return NULL;
+
+ slot->elapsed = get_jiffies_64();
+next_mm:
+ ksm_scan.address = 0;
+ ksm_scan.rmap_list = &slot->rmap_list;
+ }
+
+ if (unlikely(!ksm_scan.rmap_list))
+ ksm_scan.rmap_list = &slot->rmap_list;
+
+ mm = slot->mm;
+ BUG_ON(!mm);
+ down_read(&mm->mmap_sem);
+ rmap_item = __scan_next_rmap_item(page, mm, slot);
+
+ if (rmap_item) {
+ slot->nr_scans++;
+ up_read(&mm->mmap_sem);
+ return rmap_item;
+ }
+
if (ksm_test_exit(mm)) {
ksm_scan.address = 0;
ksm_scan.rmap_list = &slot->rmap_list;
remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
spin_lock(&ksm_mmlist_lock);
- ksm_scan.mm_slot = list_entry(slot->mm_list.next,
- struct mm_slot, mm_list);
+ ksm_scan.mm_slot = lksm_get_unscanned_mm_slot(slot);
+
if (ksm_scan.address == 0) {
/*
* We've completed a full scan of all vmas, holding mmap_sem
* or when all VM_MERGEABLE areas have been unmapped (and
* mmap_sem then protects against race with MADV_MERGEABLE).
*/
- hash_del(&slot->link);
- list_del(&slot->mm_list);
+ up_read(&mm->mmap_sem);
+ if (lksm_test_mm_state(slot, KSM_MM_FROZEN))
+ atomic_dec(&ksm_scan.nr_frozen);
+ else
+ atomic_dec(&ksm_scan.nr_scannable);
+ lksm_remove_mm_slot(slot);
spin_unlock(&ksm_mmlist_lock);
- free_mm_slot(slot);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- up_read(&mm->mmap_sem);
- mmdrop(mm);
+ lksm_slot_nr_merged = 0;
+ lksm_slot_nr_broken = 0;
} else {
+ int newcomer = 0, frozen = 0;
+
up_read(&mm->mmap_sem);
- /*
- * up_read(&mm->mmap_sem) first because after
- * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
- * already have been freed under us by __ksm_exit()
- * because the "mm_slot" is still hashed and
- * ksm_scan.mm_slot doesn't point to it anymore.
- */
+
+ if (lksm_test_mm_state(slot, KSM_MM_NEWCOMER)) {
+ lksm_clear_mm_state(slot, KSM_MM_NEWCOMER);
+ newcomer = 1;
+ }
+ if (lksm_test_mm_state(slot, KSM_MM_FROZEN)) {
+ lksm_clear_mm_state(slot, KSM_MM_FROZEN);
+ frozen = 1;
+ atomic_dec(&ksm_scan.nr_frozen);
+ } else
+ atomic_dec(&ksm_scan.nr_scannable);
+ lksm_set_mm_state(slot, KSM_MM_SCANNED);
+
+ list_del_init(&slot->scan_list);
+ if (!RB_EMPTY_NODE(&slot->ordered_list)) {
+ rb_erase(&slot->ordered_list, &vips_list);
+ RB_CLEAR_NODE(&slot->ordered_list);
+ }
+ if (lksm_account_mm_slot_nr_merge(slot, lksm_slot_nr_merged))
+ lksm_insert_mm_slot_ordered(slot);
+
+ slot->elapsed = get_jiffies_64() - slot->elapsed;
spin_unlock(&ksm_mmlist_lock);
+
+ if (ksm_test_exit(slot->mm))
+			ksm_debug("slot(%p:%p) has exited", slot, slot->mm);
+ else
+ ksm_debug("slot-%d(%s) %d merged %d scanned %lu pages "
+ "(sum: %d) - (%s, %s) takes %u msecs (nr_scannable: %d)",
+ task_pid_nr(slot->mm->owner), slot->mm->owner->comm,
+ lksm_slot_nr_merged - lksm_slot_nr_broken, slot->nr_scans,
+ slot->scanning_size, slot->nr_merged,
+ newcomer ? "new" : "old",
+ frozen ? "frozen" : "normal",
+ jiffies_to_msecs(slot->elapsed),
+ atomic_read(&ksm_scan.nr_scannable));
+
+ lksm_slot_nr_merged = 0;
+ lksm_slot_nr_broken = 0;
}
/* Repeat until we've completed scanning the whole list */
slot = ksm_scan.mm_slot;
- if (slot != &ksm_mm_head)
+ if (slot != &ksm_scan_head) {
+ slot->elapsed = get_jiffies_64();
goto next_mm;
+ }
- ksm_scan.seqnr++;
return NULL;
}
* ksm_do_scan - the ksm scanner main worker function.
* @scan_npages: number of pages we want to scan before we return.
*/
-static void ksm_do_scan(unsigned int scan_npages)
+static int ksm_do_scan(unsigned int scan_npages)
{
struct rmap_item *rmap_item;
struct page *uninitialized_var(page);
cond_resched();
rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item)
- return;
+ return 1; /* need sleep */
cmp_and_merge_page(page, rmap_item);
put_page(page);
}
+ return 0;
}
static int ksmd_should_run(void)
{
- return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
+ return (ksm_run & KSM_RUN_MERGE) &&
+ !list_empty(&ksm_scan_head.scan_list);
}
-static int ksm_scan_thread(void *nothing)
+static void lksm_scan_wrapup_wait(void)
{
+ if (ksm_scan.scan_mode == LKSM_SCAN_PARTIAL) {
+ if (ksm_thread_pages_to_scan != lksm_default_pages_to_scan)
+ ksm_thread_pages_to_scan = lksm_default_pages_to_scan;
+ } else if (ksm_scan.scan_mode == LKSM_SCAN_FULL)
+ ksm_scan.nr_full_scan++;
+
+ lksm_nr_merged = 0;
+ lksm_nr_broken = 0;
+ lksm_nr_scanned_slot = 0;
+
+ ksm_scan.scan_mode = LKSM_SCAN_NONE;
+ if (ksm_run & KSM_RUN_ONESHOT)
+ atomic_set(&ksm_one_shot_scanning, LKSM_SCAN_NONE);
+
+ lksm_clear_scan_state(ksm_state);
+
+ wait_event_freezable(ksm_thread_wait,
+ (lksm_check_scan_state(ksm_state) && ksmd_should_run())
+ || kthread_should_stop());
+}
+
+static int lksm_scan_thread(void *nothing)
+{
+	unsigned long begin = get_jiffies_64(), elapsed;
unsigned int sleep_ms;
+ int need_to_sleep = 0;
set_freezable();
set_user_nice(current, 5);
+ ksm_debug("KSM_SCAND pid: %d", task_pid_nr(current));
while (!kthread_should_stop()) {
mutex_lock(&ksm_thread_mutex);
wait_while_offlining();
if (ksmd_should_run())
- ksm_do_scan(ksm_thread_pages_to_scan);
+ need_to_sleep = ksm_do_scan(ksm_thread_pages_to_scan);
mutex_unlock(&ksm_thread_mutex);
try_to_freeze();
- if (ksmd_should_run()) {
+ if (need_to_sleep) {
+ if (!ksmd_should_run()) {
+ /* if no one left in scanning list, go to sleep for a while */
+ lksm_flush_removed_mm_list();
+
+ elapsed = get_jiffies_64() - begin;
+ lksm_last_scan_time = elapsed;
+				if (lksm_nr_scanned_slot)
+					lksm_proc_scan_time = elapsed /
+							lksm_nr_scanned_slot;
+
+ ksm_debug("Scanning(%d) takes %u ms, %d/%d-pages "
+ "are merged/broken (nr_scannable: %d, nr_frozen: %d)",
+ lksm_nr_scanned_slot,
+ jiffies_to_msecs(lksm_last_scan_time),
+ lksm_nr_merged, lksm_nr_broken,
+ atomic_read(&ksm_scan.nr_scannable),
+ atomic_read(&ksm_scan.nr_frozen));
+
+ lksm_scan_wrapup_wait();
+
+ ksm_debug("Start %lu-th scanning: nr_scannable(%d) "
+ "nr_frozen(%d)",
+ ksm_scan.scan_round,
+ atomic_read(&ksm_scan.nr_scannable),
+ atomic_read(&ksm_scan.nr_frozen));
+
+ if (ksm_scan.scan_mode == LKSM_SCAN_PARTIAL) {
+ if (lksm_boosted_pages_to_scan !=
+ ksm_thread_pages_to_scan) {
+ ksm_thread_pages_to_scan = lksm_boosted_pages_to_scan;
+ ksm_debug("set pages_to_scan to %u",
+ lksm_boosted_pages_to_scan);
+ }
+ }
+ begin = get_jiffies_64();
+ } else {
+ /* new scanning targets are coming */
+ sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
+ wait_event_interruptible_timeout(ksm_iter_wait,
+ sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
+ msecs_to_jiffies(sleep_ms));
+ }
+ need_to_sleep = 0;
+ } else if (ksmd_should_run()) {
+ /* normal sleep */
sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
wait_event_interruptible_timeout(ksm_iter_wait,
sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
msecs_to_jiffies(sleep_ms));
} else {
- wait_event_freezable(ksm_thread_wait,
- ksmd_should_run() || kthread_should_stop());
+ /* wait for activating ksm */
+ if (likely(ksm_scan.scan_round > 0)) {
+ lksm_flush_removed_mm_list();
+
+ elapsed = get_jiffies_64() - begin;
+ lksm_last_scan_time = elapsed;
+				if (lksm_nr_scanned_slot)
+					lksm_proc_scan_time = elapsed /
+							lksm_nr_scanned_slot;
+
+ ksm_debug("Scanning(%d) takes %u ms, %d/%d-pages are merged/broken",
+ lksm_nr_scanned_slot, jiffies_to_msecs(lksm_last_scan_time),
+ lksm_nr_merged, lksm_nr_broken);
+
+ lksm_scan_wrapup_wait();
+ } else
+ wait_event_freezable(ksm_thread_wait,
+ (lksm_check_scan_state(ksm_state) && ksmd_should_run())
+ || kthread_should_stop());
+
+ ksm_debug("Start %lu-th scanning: nr_scannable(%d) nr_frozen(%d)",
+ ksm_scan.scan_round,
+ atomic_read(&ksm_scan.nr_scannable),
+ atomic_read(&ksm_scan.nr_frozen));
+
+ if (ksm_scan.scan_mode == LKSM_SCAN_PARTIAL) {
+ ksm_thread_pages_to_scan = lksm_boosted_pages_to_scan;
+ ksm_debug("set pages_to_scan to %u",
+ lksm_boosted_pages_to_scan);
+ }
+ begin = get_jiffies_64();
}
}
return 0;
}
+/*
+ * lksm crawler declarations & definitions
+ */
+static struct task_struct *ksm_crawld;
+
+LIST_HEAD(frozen_task_list);
+DEFINE_SPINLOCK(frozen_task_lock);
+
+enum {
+ KSM_CRAWL_SLEEP,
+ KSM_CRAWL_RUN,
+};
+static atomic_t crawl_state;
+
+enum {
+ LKSM_TASK_SLOT_NONE = 0,
+ LKSM_TASK_SLOT_REMOVED,
+};
+
+static inline int lksm_count_and_clear_mm_slots
+(struct mm_slot *head, unsigned long *delay)
+{
+ int count = 0;
+ struct mm_slot *slot;
+
+ spin_lock(&ksm_mmlist_lock);
+ list_for_each_entry(slot, &head->mm_list, mm_list) {
+ if (list_empty(&slot->scan_list)) {
+ lksm_clear_mm_state(slot, KSM_MM_SCANNED);
+ slot->nr_scans = 0;
+ slot->scanning_size = get_mm_counter(slot->mm, MM_ANONPAGES);
+ list_add_tail(&slot->scan_list, &ksm_scan_head.scan_list);
+ *delay += slot->elapsed;
+ count++;
+ }
+ }
+ spin_unlock(&ksm_mmlist_lock);
+ return count;
+}
+
+static int lksm_prepare_frozen_scan(void)
+{
+ int nr_frozen = 0, nr_added = 0, err;
+ struct task_struct *task;
+ struct task_slot *task_slot;
+ struct mm_struct *mm;
+
+ spin_lock(&frozen_task_lock);
+ nr_frozen = atomic_read(&ksm_scan.nr_frozen);
+ if (list_empty(&frozen_task_list)) {
+ spin_unlock(&frozen_task_lock);
+ return nr_frozen;
+ }
+
+ ksm_debug("prepare frozen scan: round(%lu)", ksm_crawl_round);
+ task_slot = list_first_entry_or_null(&frozen_task_list,
+ struct task_slot, list);
+ while (task_slot) {
+ list_del(&task_slot->list);
+ hash_del(&task_slot->hlist);
+ spin_unlock(&frozen_task_lock);
+
+ task = task_slot->task;
+ if (ksm_run & KSM_RUN_UNMERGE) {
+ put_task_struct(task);
+ free_task_slot(task_slot);
+ goto clean_up_abort;
+ }
+
+ mm = get_task_mm(task);
+
+ if (!mm || ksm_test_exit(mm))
+ goto mm_exit_out;
+
+		ksm_join_write_lock(mm, task_slot->frozen, err);
+		if (!err)
+			nr_added++;
+
+mm_exit_out:
+ free_task_slot(task_slot);
+ put_task_struct(task);
+ if (mm)
+ mmput(mm);
+
+ cond_resched();
+
+ spin_lock(&frozen_task_lock);
+ task_slot = list_first_entry_or_null(&frozen_task_list,
+ struct task_slot, list);
+ }
+ spin_unlock(&frozen_task_lock);
+ atomic_add(nr_added, &ksm_scan.nr_frozen);
+
+ return nr_added + nr_frozen;
+
+clean_up_abort:
+ spin_lock(&frozen_task_lock);
+ task_slot = list_first_entry_or_null(&frozen_task_list,
+ struct task_slot, list);
+ while (task_slot) {
+ list_del(&task_slot->list);
+ hash_del(&task_slot->hlist);
+ spin_unlock(&frozen_task_lock);
+
+ task = task_slot->task;
+ put_task_struct(task);
+ free_task_slot(task_slot);
+
+ spin_lock(&frozen_task_lock);
+ task_slot = list_first_entry_or_null(&frozen_task_list,
+ struct task_slot, list);
+ }
+ spin_unlock(&frozen_task_lock);
+
+ return 0;
+}
+
+/* this function builds a list of new processes and VIP processes */
+static int lksm_prepare_partial_scan(void)
+{
+ int ret, nr_frozen = 0, nr_added = 0, nr_scannable = 0;
+ unsigned long delay = 0;
+ unsigned long fault_cnt = 0;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ struct mm_slot *mm_slot;
+ struct list_head recheck_list;
+ struct rb_node *node;
+
+ ksm_debug("prepare partial scan: round(%lu)", ksm_crawl_round);
+ INIT_LIST_HEAD(&recheck_list);
+
+ nr_frozen = lksm_prepare_frozen_scan();
+
+ /* get newbies */
+ for_each_process(task) {
+ if (task == current || task_pid_nr(task) == 0
+ || check_short_task(task))
+ continue;
+ if (ksm_run & KSM_RUN_UNMERGE) {
+ nr_frozen = 0;
+ nr_added = 0;
+ goto abort;
+ }
+ mm = get_task_mm(task);
+ if (!mm)
+ continue;
+ ksm_join_write_lock(mm, KSM_TASK_UNFROZEN, ret);
+ if (ret > 0)
+ nr_added++;
+ mmput(mm);
+ }
+
+ /* get vips */
+ if (nr_added + nr_frozen >= lksm_max_vips) {
+		ksm_debug("nr_scannable(%d) already fulfilled, skipping vips",
+ nr_added + nr_frozen);
+ goto skip_vips;
+ }
+
+ spin_lock(&ksm_mmlist_lock);
+ node = rb_first(&vips_list);
+ if (!node) {
+ ksm_debug("empty vip list");
+ spin_unlock(&ksm_mmlist_lock);
+ goto skip_vips;
+ }
+ mm_slot = rb_entry(node, struct mm_slot, ordered_list);
+ while (nr_scannable + nr_added + nr_frozen < lksm_max_vips) {
+ if (ksm_run & KSM_RUN_UNMERGE) {
+ spin_unlock(&ksm_mmlist_lock);
+ nr_scannable = 0;
+ nr_frozen = 0;
+ nr_added = 0;
+ goto abort;
+ }
+ if (ksm_test_exit(mm_slot->mm)) {
+ if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED))
+ atomic_dec(&ksm_scan.nr_scannable);
+ lksm_remove_mm_slot(mm_slot);
+ goto next_node;
+ }
+ if (!lksm_test_mm_state(mm_slot, KSM_MM_LISTED))
+ goto next_node;
+
+		/* pruning by fault count */
+ fault_cnt = mm_slot->mm->owner->maj_flt + mm_slot->mm->owner->min_flt;
+ if (mm_slot->fault_cnt == fault_cnt)
+ goto next_node;
+
+ mm_slot->fault_cnt = fault_cnt;
+ mm_slot->scanning_size = get_mm_counter(mm_slot->mm, MM_ANONPAGES);
+ mm_slot->nr_scans = 0;
+ delay += mm_slot->elapsed;
+ ksm_debug("slot(nr_merged: %d, scanning_size: %lu) task(%s)",
+ mm_slot->nr_merged, mm_slot->scanning_size,
+ mm_slot->mm->owner->comm);
+ list_move_tail(&mm_slot->scan_list, &recheck_list);
+ lksm_clear_mm_state(mm_slot, KSM_MM_SCANNED);
+ nr_scannable++;
+
+next_node:
+ node = rb_next(node);
+ if (!node)
+ break;
+ mm_slot = rb_entry(node, struct mm_slot, ordered_list);
+ }
+ spin_unlock(&ksm_mmlist_lock);
+
+skip_vips:
+ spin_lock(&ksm_mmlist_lock);
+ if (!list_empty(&recheck_list))
+ list_splice(&recheck_list, &ksm_scan_head.scan_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ ksm_scan.scan_mode = LKSM_SCAN_PARTIAL;
+ ksm_crawl_round++;
+
+ atomic_add(nr_scannable + nr_added, &ksm_scan.nr_scannable);
+ ksm_debug("nr_frozen: %d nr_added: %d nr_scannable: %d - %d",
+ nr_frozen, nr_added, nr_scannable, atomic_read(&ksm_scan.nr_scannable));
+abort:
+ return nr_frozen + nr_added + nr_scannable;
+}
+
+static int lksm_prepare_full_scan(unsigned long *next_fullscan)
+{
+ int ret, nr_frozen = 0, nr_added = 0, nr_scannable = 0, nr_target;
+ unsigned long delay = 0;
+ struct task_struct *task;
+ struct mm_struct *mm;
+
+ ksm_debug("prepare full scan: round(%lu)", ksm_crawl_round);
+
+ nr_frozen = lksm_prepare_frozen_scan();
+
+ for_each_process(task) {
+ if (task == current || task_pid_nr(task) == 0
+ || check_short_task(task))
+ continue;
+ if (ksm_run & KSM_RUN_UNMERGE) {
+ nr_target = 0;
+ goto abort;
+ }
+
+ mm = get_task_mm(task);
+ if (!mm)
+ continue;
+ ksm_join_write_lock(mm, KSM_TASK_UNFROZEN, ret);
+ if (ret > 0)
+ nr_added++;
+ mmput(mm);
+ }
+
+ nr_scannable = lksm_count_and_clear_mm_slots(&ksm_mm_head, &delay);
+ nr_target = nr_scannable + nr_added + nr_frozen;
+
+ /* calculate crawler's sleep time */
+ delay += msecs_to_jiffies((nr_frozen + nr_added) * lksm_proc_scan_time);
+ *next_fullscan = jiffies + delay + msecs_to_jiffies(full_scan_interval);
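+	/*
+	 * e.g., if ten tasks newly joined (nr_frozen + nr_added == 10) and
+	 * lksm_proc_scan_time is 50 ms, roughly 500 ms is added on top of
+	 * full_scan_interval before the next full scan is scheduled.
+	 */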
+
+ ksm_scan.scan_mode = LKSM_SCAN_FULL;
+ ksm_crawl_round++;
+
+ atomic_add(nr_scannable + nr_added, &ksm_scan.nr_scannable);
+ ksm_debug("nr_frozen: %d nr_added: %d nr_scannable: %d - %d",
+ nr_frozen, nr_added, nr_scannable,
+ atomic_read(&ksm_scan.nr_scannable));
+abort:
+ return nr_target;
+}
+
+static int lksm_do_wait_userspace_event(unsigned long sleep_time)
+{
+ wait_event_freezable(ksm_crawl_wait,
+ kthread_should_stop() ||
+ (atomic_read(&ksm_one_shot_scanning) > 0));
+ return atomic_read(&ksm_one_shot_scanning);
+}
+
+static int lksm_do_wait_frozen_event(unsigned long sleep_time)
+{
+ int need_scan = 0;
+
+ spin_lock_irq(&frozen_task_lock);
+ if (list_empty(&frozen_task_list))
+ /* wait until candidate list is filled */
+ wait_event_interruptible_lock_irq_timeout(
+ ksm_crawl_wait,
+ kthread_should_stop()
+ || !list_empty(&frozen_task_list)
+ || !list_empty(&ksm_scan_head.scan_list),
+ frozen_task_lock, sleep_time);
+
+ if (!list_empty(&frozen_task_list) ||
+ !list_empty(&ksm_scan_head.scan_list))
+ need_scan = 1;
+ spin_unlock_irq(&frozen_task_lock);
+
+ return need_scan;
+}
+
+static inline void lksm_wake_up_scan_thread(void)
+{
+ ksm_debug("wake up lksm_scan_thread");
+ lksm_set_scan_state(ksm_state);
+ wake_up(&ksm_thread_wait);
+}
+
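+/*
+ * Presumably the crawler naps briefly after a frozen event so that tasks
+ * frozen in the same batch can all reach frozen_task_list before the
+ * partial scan is prepared.
+ */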
+#define LKSM_CRAWL_FROZEN_EVENT_WAIT 100 /* 100ms */
+
+static void lksm_do_crawl_once
+(unsigned long *next_fscan, unsigned long sleep_time)
+{
+ int nr_added = 0;
+ int scan_mode;
+
+	/* the crawler thread waits for a trigger event from userspace */
+ scan_mode = lksm_do_wait_userspace_event(sleep_time);
+
+ if (scan_mode == LKSM_SCAN_PARTIAL) {
+ atomic_set(&crawl_state, KSM_CRAWL_RUN);
+ msleep(LKSM_CRAWL_FROZEN_EVENT_WAIT);
+ nr_added = lksm_prepare_partial_scan();
+ } else if (scan_mode == LKSM_SCAN_FULL) {
+ atomic_set(&crawl_state, KSM_CRAWL_RUN);
+ nr_added = lksm_prepare_full_scan(next_fscan);
+ }
+
+ try_to_freeze();
+
+ if (nr_added > 0)
+ lksm_wake_up_scan_thread();
+ else {
+ ksm_debug("No one can be scanned!");
+ atomic_set(&ksm_one_shot_scanning, LKSM_SCAN_NONE);
+ }
+ atomic_set(&crawl_state, KSM_CRAWL_SLEEP);
+}
+
+static void lksm_do_crawl_periodic
+(unsigned long *next_fscan, unsigned long sleep_time)
+{
+ int nr_added = 0;
+
+ if (time_is_before_eq_jiffies(*next_fscan)) {
+ atomic_set(&crawl_state, KSM_CRAWL_RUN);
+ nr_added = lksm_prepare_full_scan(next_fscan);
+ } else if (lksm_do_wait_frozen_event(sleep_time)) {
+ atomic_set(&crawl_state, KSM_CRAWL_RUN);
+ msleep(LKSM_CRAWL_FROZEN_EVENT_WAIT);
+ nr_added = lksm_prepare_partial_scan();
+ }
+
+ try_to_freeze();
+
+ if (nr_added > 0)
+ lksm_wake_up_scan_thread();
+ atomic_set(&crawl_state, KSM_CRAWL_SLEEP);
+}
+
+static int lksm_crawl_thread(void *data)
+{
+ int nr_added = 0;
+ unsigned long next_fscan = jiffies; /* next full scan */
+ unsigned long sleep_time = crawler_sleep;
+
+ set_freezable();
+ set_user_nice(current, 5);
+
+ ksm_debug("KSM_CRAWLD pid: %d", task_pid_nr(current));
+ wait_event_freezable(ksm_crawl_wait,
+ kthread_should_stop() || ksm_run & KSM_RUN_MERGE);
+ /* initial loop */
+ while (!kthread_should_stop() && ksm_crawl_round < initial_round) {
+
+ try_to_freeze();
+
+ if ((ksm_run & KSM_RUN_MERGE) &&
+ !lksm_check_scan_state(ksm_state) &&
+ time_is_before_eq_jiffies(next_fscan)) {
+ nr_added = lksm_prepare_full_scan(&next_fscan);
+ if (nr_added) {
+ lksm_wake_up_scan_thread();
+ nr_added = 0;
+ }
+ next_fscan = jiffies + sleep_time;
+ }
+
+ wait_event_interruptible_timeout(ksm_crawl_wait,
+ kthread_should_stop() || !lksm_check_scan_state(ksm_state),
+ sleep_time);
+ }
+
+ /* initialization loop done */
+ full_scan_interval = DEFAULT_FULL_SCAN_INTERVAL;
+ next_fscan = jiffies + msecs_to_jiffies(full_scan_interval);
+ atomic_set(&crawl_state, KSM_CRAWL_SLEEP);
+
+ /* normal operation loop */
+ while (!kthread_should_stop()) {
+ if (ksm_run & KSM_RUN_ONESHOT) {
+ if (!lksm_check_scan_state(ksm_state))
+ lksm_do_crawl_once(&next_fscan, sleep_time);
+ else
+ /* wait until scanning done */
+ wait_event_freezable(ksm_crawl_wait,
+ !lksm_check_scan_state(ksm_state)
+ || kthread_should_stop());
+ } else if (ksm_run & KSM_RUN_MERGE) {
+ if (!lksm_check_scan_state(ksm_state))
+ lksm_do_crawl_periodic(&next_fscan, sleep_time);
+ else
+ /* wait until scanning done */
+ wait_event_interruptible_timeout(ksm_crawl_wait,
+ !lksm_check_scan_state(ksm_state)
+ || kthread_should_stop(),
+ sleep_time);
+ try_to_freeze();
+ } else {
+ ksm_debug("ksm is not activated");
+ wait_event_freezable(ksm_crawl_wait,
+ kthread_should_stop() || (ksm_run & KSM_RUN_MERGE));
+ }
+ }
+
+ return 0;
+}
+
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags)
{
#endif
if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
- err = __ksm_enter(mm);
+ err = __ksm_enter(mm, KSM_TASK_UNFROZEN);
if (err)
return err;
}
return 0;
}
-int __ksm_enter(struct mm_struct *mm)
+static struct mm_slot *__ksm_enter_alloc_slot(struct mm_struct *mm, int frozen)
{
struct mm_slot *mm_slot;
- int needs_wakeup;
mm_slot = alloc_mm_slot();
if (!mm_slot)
- return -ENOMEM;
+ return NULL;
- /* Check ksm_run too? Would need tighter locking */
- needs_wakeup = list_empty(&ksm_mm_head.mm_list);
+ if (frozen == KSM_TASK_FROZEN)
+ lksm_set_mm_state(mm_slot, KSM_MM_FROZEN | KSM_MM_NEWCOMER);
+ else
+ lksm_set_mm_state(mm_slot, KSM_MM_LISTED | KSM_MM_NEWCOMER);
+
+ lksm_clear_mm_state(mm_slot, KSM_MM_SCANNED);
+ RB_CLEAR_NODE(&mm_slot->ordered_list);
+ mm_slot->fault_cnt = mm->owner->maj_flt + mm->owner->min_flt;
+ mm_slot->scanning_size = get_mm_counter(mm, MM_ANONPAGES);
spin_lock(&ksm_mmlist_lock);
insert_to_mm_slots_hash(mm, mm_slot);
*/
if (ksm_run & KSM_RUN_UNMERGE)
list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
- else
- list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+ else {
+ list_add_tail(&mm_slot->scan_list, &ksm_scan_head.scan_list);
+ list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
+ }
+ ksm_nr_added_process++;
spin_unlock(&ksm_mmlist_lock);
-
set_bit(MMF_VM_MERGEABLE, &mm->flags);
	mmgrab(mm);
- if (needs_wakeup)
- wake_up_interruptible(&ksm_thread_wait);
+ return mm_slot;
+}
+int __ksm_enter(struct mm_struct *mm, int frozen)
+{
+ if (!__ksm_enter_alloc_slot(mm, frozen))
+ return -ENOMEM;
return 0;
}
spin_lock(&ksm_mmlist_lock);
mm_slot = get_mm_slot(mm);
- if (mm_slot && ksm_scan.mm_slot != mm_slot) {
+ if (!mm_slot) {
+ spin_unlock(&ksm_mmlist_lock);
+ return;
+ }
+
+ if (ksm_scan.mm_slot != mm_slot) {
if (!mm_slot->rmap_list) {
hash_del(&mm_slot->link);
list_del(&mm_slot->mm_list);
+ list_del(&mm_slot->scan_list);
+ if (!RB_EMPTY_NODE(&mm_slot->ordered_list)) {
+ rb_erase(&mm_slot->ordered_list, &vips_list);
+ RB_CLEAR_NODE(&mm_slot->ordered_list);
+ }
easy_to_free = 1;
- } else {
- list_move(&mm_slot->mm_list,
- &ksm_scan.mm_slot->mm_list);
+ } else
+ lksm_remove_mm_slot(mm_slot);
+ if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN)) {
+ atomic_dec(&ksm_scan.nr_frozen);
+ ksm_debug("nr_frozen: %d", atomic_read(&ksm_scan.nr_frozen));
+ } else if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED)) {
+ atomic_dec(&ksm_scan.nr_scannable);
+ ksm_debug("nr_scannable: %d", atomic_read(&ksm_scan.nr_scannable));
}
}
+ ksm_nr_added_process--;
spin_unlock(&ksm_mmlist_lock);
if (easy_to_free) {
static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%lu\n", ksm_run);
+ if (ksm_run & KSM_RUN_ONESHOT)
+ return sprintf(buf, "%u\n", KSM_RUN_ONESHOT);
+ else
+ return sprintf(buf, "%lu\n", ksm_run);
}
static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
err = kstrtoul(buf, 10, &flags);
if (err || flags > UINT_MAX)
return -EINVAL;
- if (flags > KSM_RUN_UNMERGE)
+ if (flags > KSM_RUN_ONESHOT)
return -EINVAL;
/*
mutex_lock(&ksm_thread_mutex);
wait_while_offlining();
if (ksm_run != flags) {
- ksm_run = flags;
+ if (flags == KSM_RUN_ONESHOT)
+ ksm_run = KSM_RUN_MERGE | KSM_RUN_ONESHOT;
+ else
+ ksm_run = flags;
if (flags & KSM_RUN_UNMERGE) {
set_current_oom_origin();
err = unmerge_and_remove_all_rmap_items();
}
mutex_unlock(&ksm_thread_mutex);
- if (flags & KSM_RUN_MERGE)
- wake_up_interruptible(&ksm_thread_wait);
+ if (ksm_run & KSM_RUN_MERGE) {
+ ksm_debug("activate KSM");
+ wake_up(&ksm_crawl_wait);
+ }
return count;
}
static ssize_t full_scans_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", ksm_scan.seqnr);
+ return sprintf(buf, "%lu\n", ksm_scan.nr_full_scan);
}
KSM_ATTR_RO(full_scans);
+static ssize_t scanning_process_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_nr_added_process);
+}
+KSM_ATTR_RO(scanning_process);
+
+static ssize_t full_scan_interval_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", full_scan_interval);
+}
+
+static ssize_t full_scan_interval_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int err;
+ unsigned long interval;
+
+ err = kstrtoul(buf, 10, &interval);
+ if (err || interval > UINT_MAX)
+ return -EINVAL;
+
+ full_scan_interval = interval;
+ return count;
+}
+KSM_ATTR(full_scan_interval);
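+
+/*
+ * Note: full_scan_interval is in milliseconds; e.g. (assuming the
+ * standard KSM sysfs location):
+ *   echo 60000 > /sys/kernel/mm/ksm/full_scan_interval   (60 seconds)
+ */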
+
+static ssize_t one_shot_scanning_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", atomic_read(&ksm_one_shot_scanning));
+}
+
+static ssize_t one_shot_scanning_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int err, val;
+
+ err = kstrtoint(buf, 10, &val);
+ if (err || (val != LKSM_SCAN_PARTIAL && val != LKSM_SCAN_FULL)) {
+ ksm_err("wrong value: %d", val);
+ return -EINVAL;
+ }
+
+ if (!atomic_cmpxchg(&ksm_one_shot_scanning, LKSM_SCAN_NONE, val)) {
+ wake_up(&ksm_crawl_wait);
+ return count;
+ }
+ ksm_debug("ksm is still scanning");
+ return -EINVAL;
+}
+KSM_ATTR(one_shot_scanning);
+
+static ssize_t scan_boost_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", lksm_boosted_pages_to_scan);
+}
+
+static ssize_t scan_boost_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int err, val;
+
+ err = kstrtoint(buf, 10, &val);
+	/* lksm_boosted_pages_to_scan must be in the range 100 to 10000 */
+ if (err || val < 100 || val > 10000) {
+ ksm_err("wrong value: %d", val);
+ return -EINVAL;
+ }
+
+ lksm_boosted_pages_to_scan = (unsigned int) val;
+
+ return count;
+}
+KSM_ATTR(scan_boost);
+
static struct attribute *ksm_attrs[] = {
&sleep_millisecs_attr.attr,
&pages_to_scan_attr.attr,
&stable_node_dups_attr.attr,
&stable_node_chains_prune_millisecs_attr.attr,
&use_zero_pages_attr.attr,
+ &scanning_process_attr.attr,
+ &full_scan_interval_attr.attr,
+ &one_shot_scanning_attr.attr,
+ &scan_boost_attr.attr,
NULL,
};
};
#endif /* CONFIG_SYSFS */
+static inline int __lksm_remove_candidate(struct task_struct *task)
+{
+ int ret = LKSM_TASK_SLOT_NONE;
+ struct task_slot *slot = get_task_slot(task);
+
+ if (slot) {
+ list_del(&slot->list);
+ hash_del(&slot->hlist);
+ free_task_slot(slot);
+ ret = LKSM_TASK_SLOT_REMOVED;
+ }
+ return ret;
+}
+
+/* called by ksm_exit */
+void lksm_remove_candidate(struct mm_struct *mm)
+{
+ int ret;
+
+ if (!mm->owner) {
+ struct mm_slot *mm_slot;
+
+ spin_lock(&ksm_mmlist_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot && mm_slot != ksm_scan.mm_slot) {
+ list_move(&mm_slot->mm_list, &ksm_scan.remove_mm_list);
+ if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN))
+ atomic_dec(&ksm_scan.nr_frozen);
+ else if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED))
+ atomic_dec(&ksm_scan.nr_scannable);
+ ksm_debug("mm_slot: %p will be exited", mm_slot);
+ }
+ spin_unlock(&ksm_mmlist_lock);
+ return;
+ }
+
+ if (!ksm_test_exit(mm))
+ ksm_debug("proc-%d(%s) will be removed",
+ task_pid_nr(mm->owner), mm->owner->comm);
+
+ spin_lock(&frozen_task_lock);
+ ret = __lksm_remove_candidate(mm->owner);
+ spin_unlock(&frozen_task_lock);
+ if (ret == LKSM_TASK_SLOT_REMOVED)
+ put_task_struct(mm->owner);
+}
+
+static int lksm_task_frozen(struct task_struct *task)
+{
+ int need_wakeup = 0;
+ struct mm_struct *mm = task->mm;
+ struct mm_slot *mm_slot;
+ struct task_slot *task_slot;
+
+ if (mm && test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ /* a mergeable task becoming frozen */
+ spin_lock(&ksm_mmlist_lock);
+ mm_slot = get_mm_slot(mm);
+ BUG_ON(!mm_slot);
+
+ if (mm_slot != ksm_scan.mm_slot
+ && lksm_test_mm_state(mm_slot, KSM_MM_LISTED)) {
+ if (list_empty(&mm_slot->scan_list))
+ list_add_tail(&mm_slot->scan_list, &ksm_scan_head.scan_list);
+ if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED))
+ atomic_dec(&ksm_scan.nr_scannable);
+ lksm_clear_mm_state(mm_slot, KSM_MM_LISTED);
+ lksm_set_mm_state(mm_slot, KSM_MM_FROZEN);
+ atomic_inc(&ksm_scan.nr_frozen);
+
+ need_wakeup = (ksm_run == KSM_RUN_MERGE);
+ ksm_debug("lksm_task_frozen called for task(%s): %p (nr_frozen: %d)",
+ task->comm, task, atomic_read(&ksm_scan.nr_frozen));
+ }
+ spin_unlock(&ksm_mmlist_lock);
+ } else {
+ task_slot = alloc_task_slot();
+ if (!task_slot) {
+			ksm_err("[ksm_tizen] Cannot allocate memory for task_slot");
+ return -ENOMEM;
+ }
+
+ task_slot->task = task;
+ task_slot->frozen = KSM_TASK_FROZEN;
+ task_slot->inserted = jiffies;
+
+ get_task_struct(task);
+
+ spin_lock(&frozen_task_lock);
+ list_add(&task_slot->list, &frozen_task_list);
+ insert_to_task_slots_hash(task_slot);
+ spin_unlock(&frozen_task_lock);
+
+ need_wakeup = (ksm_run == KSM_RUN_MERGE);
+ ksm_debug("task-%d(%s) is added to frozen task list",
+ task_pid_nr(task), task->comm);
+ }
+
+ if (need_wakeup && atomic_read(&crawl_state) == KSM_CRAWL_SLEEP)
+ wake_up(&ksm_crawl_wait);
+
+ return 0;
+}
+
+static int lksm_task_thawed(struct task_struct *task)
+{
+ struct mm_struct *mm = task->mm;
+ struct mm_slot *mm_slot;
+ struct task_slot *task_slot;
+
+ if (mm && test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ /* a frozen task becoming thawed */
+ spin_lock(&ksm_mmlist_lock);
+ mm_slot = get_mm_slot(mm);
+ BUG_ON(!mm_slot);
+
+ if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN)
+ && ksm_scan.mm_slot != mm_slot) {
+ if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED))
+ atomic_inc(&ksm_scan.nr_scannable);
+ else
+ list_del_init(&mm_slot->scan_list);
+ lksm_clear_mm_state(mm_slot, KSM_MM_FROZEN);
+ lksm_set_mm_state(mm_slot, KSM_MM_LISTED);
+ atomic_dec(&ksm_scan.nr_frozen);
+ ksm_debug("nr_frozen: %d nr_scannable: %d",
+ atomic_read(&ksm_scan.nr_frozen),
+ atomic_read(&ksm_scan.nr_scannable));
+ }
+ spin_unlock(&ksm_mmlist_lock);
+ } else {
+		/* just remove the task slot; it will be handled by the full scan */
+ spin_lock(&frozen_task_lock);
+ task_slot = get_task_slot(task);
+ if (task_slot) {
+ list_del(&task_slot->list);
+ hash_del(&task_slot->hlist);
+ }
+ spin_unlock(&frozen_task_lock);
+ if (task_slot) {
+ free_task_slot(task_slot);
+ put_task_struct(task);
+ ksm_debug("task-%d(%s) is removed from frozen task list",
+ task_pid_nr(task), task->comm);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * lksm_hint: a hook for constructing the candidate list.
+ * This function cannot sleep.
+ */
+int lksm_hint(struct task_struct *task, int frozen)
+{
+	/*
+	 * If lksm_hint is called by ksm_fork, the task does not yet have its
+	 * own mm_struct because mm_struct initialization has not completed.
+	 * Thus, we skip the mm check here and simply put the task into the
+	 * candidate list.
+	 */
+ if (frozen == KSM_TASK_FROZEN)
+ return lksm_task_frozen(task);
+ else if (frozen == KSM_TASK_THAWED)
+ return lksm_task_thawed(task);
+ else
+ return 0;
+}
+
+static void __init lksm_init(void)
+{
+ ksm_crawld = kthread_create(lksm_crawl_thread, NULL, "ksm_crawld");
+
+ if (ksm_crawld == NULL) {
+		printk(KERN_ALERT "failed to create the ksm crawler daemon\n");
+ return;
+ }
+
+ atomic_set(&ksm_scan.nr_frozen, 0);
+ atomic_set(&ksm_scan.nr_scannable, 0);
+ atomic_set(&ksm_state, 0);
+ INIT_LIST_HEAD(&ksm_scan.remove_mm_list);
+
+ crawler_sleep = msecs_to_jiffies(1000);
+ wake_up_process(ksm_crawld);
+}
+
static int __init ksm_init(void)
{
struct task_struct *ksm_thread;
if (err)
goto out;
- ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
+ ksm_thread = kthread_run(lksm_scan_thread, NULL, "ksmd");
if (IS_ERR(ksm_thread)) {
pr_err("ksm: creating kthread failed\n");
err = PTR_ERR(ksm_thread);
ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
#endif /* CONFIG_SYSFS */
-
+ lksm_init();
#ifdef CONFIG_MEMORY_HOTREMOVE
/* There is no significance to this priority 100 */
hotplug_memory_notifier(ksm_memory_callback, 100);