1 // SPDX-License-Identifier: GPL-2.0-only
5 * This code provides a lightweight version of KSM.
7 * Copyright (C) 2020 Samsung Electronics Co., Ltd.
8 * Author: Sung-hun Kim (sfoon.kim@samsung.com)
12 * Memory merging support.
14 * This code enables dynamic sharing of identical pages found in different
15 * memory areas, even if they are not shared by fork()
17 * Copyright (C) 2008-2009 Red Hat, Inc.
25 #include <linux/errno.h>
28 #include <linux/mman.h>
29 #include <linux/sched.h>
30 #include <linux/sched/mm.h>
31 #include <linux/sched/coredump.h>
32 #include <linux/rwsem.h>
33 #include <linux/pagemap.h>
34 #include <linux/rmap.h>
35 #include <linux/spinlock.h>
36 #include <linux/xxhash.h>
37 #include <linux/delay.h>
38 #include <linux/kthread.h>
39 #include <linux/wait.h>
40 #include <linux/slab.h>
41 #include <linux/rbtree.h>
42 #include <linux/memory.h>
43 #include <linux/mmu_notifier.h>
44 #include <linux/swap.h>
45 #include <linux/ksm.h>
46 #include <linux/hashtable.h>
47 #include <linux/freezer.h>
48 #include <linux/oom.h>
49 #include <linux/numa.h>
51 #include <asm/tlbflush.h>
56 #define DO_NUMA(x) do { (x); } while (0)
59 #define DO_NUMA(x) do { } while (0)
62 #define ksm_debug(fmt, ...) \
63 printk(KERN_DEBUG "[ksm:%s:%d] " fmt "\n", __func__, __LINE__, ##__VA_ARGS__)
64 #define ksm_err(fmt, ...) \
65 printk(KERN_ERR "[ksm:%s:%d] " fmt "\n", __func__, __LINE__, ##__VA_ARGS__)
70 * A few notes about the KSM scanning process,
71 * to make it easier to understand the data structures below:
73 * In order to reduce excessive scanning, KSM sorts the memory pages by their
74 * contents into a data structure that holds pointers to the pages' locations.
76 * Since the contents of the pages may change at any moment, KSM cannot just
77 * insert the pages into a normal sorted tree and expect it to find anything.
78 * Therefore KSM uses two data structures - the stable and the unstable tree.
80 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
81 * by their contents. Because each such page is write-protected, searching on
82 * this tree is fully assured to be working (except when pages are unmapped),
83 * and therefore this tree is called the stable tree.
85 * The stable tree node includes information required for reverse
86 * mapping from a KSM page to virtual addresses that map this page.
88 * In order to avoid large latencies of the rmap walks on KSM pages,
89 * KSM maintains two types of nodes in the stable tree:
91 * * the regular nodes that keep the reverse mapping structures in a
93 * * the "chains" that link nodes ("dups") that represent the same
94 * write protected memory content, but each "dup" corresponds to a
95 * different KSM page copy of that content
97 * Internally, the regular nodes, "dups" and "chains" are represented
98 * using the same :c:type:`struct stable_node` structure.
100 * In addition to the stable tree, KSM uses a second data structure called the
101 * unstable tree: this tree holds pointers to pages which have been found to
102 * be "unchanged for a period of time". The unstable tree sorts these pages
103 * by their contents, but since they are not write-protected, KSM cannot rely
104 * upon the unstable tree to work correctly - the unstable tree is liable to
105 * be corrupted as its contents are modified, and so it is called unstable.
107 * KSM solves this problem by several techniques:
109 * 1) The unstable tree is flushed every time KSM completes scanning all
110 * memory areas, and then the tree is rebuilt again from the beginning.
111 * 2) KSM will only insert into the unstable tree, pages whose hash value
112 * has not changed since the previous scan of all memory areas.
113 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
114 * colors of the nodes and not on their contents, assuring that even when
115 * the tree gets "corrupted" it won't get out of balance, so scanning time
116 * remains the same (also, searching and inserting nodes in an rbtree uses
117 * the same algorithm, so we have no overhead when we flush and rebuild).
118 * 4) KSM never flushes the stable tree, which means that even if it were to
119 * take 10 attempts to find a page in the unstable tree, once it is found,
120 * it is secured in the stable tree. (When we scan a new page, we first
121 * compare it against the stable tree, and then against the unstable tree.)
123 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
124 * stable trees and multiple unstable trees: one of each for each NUMA node.
128 * A few notes about lightweight KSM.
130 * A smart crawler leverages the semantics of tasks in Tizen.
131 * When an application goes to the background, it is attached to the freezer
132 * task group. The LKSM crawler hooks this event and adds the "frozen task"
133 * to the candidate list for scanning.
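 *
 * (Editor's note, not in the original source.) For orientation, one scan
 * step per page roughly follows the two-tree design described above. A
 * rough sketch, assuming the usual KSM helpers in this file
 * (unstable_tree_search_insert() is the upstream name and is not shown in
 * this excerpt):
 *
 *	checksum = calc_checksum(page);
 *	kpage = stable_tree_search(page);
 *	if (kpage)
 *		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
 *	else if (checksum == rmap_item->oldchecksum)
 *		tree_rmap_item = unstable_tree_search_insert(rmap_item,
 *							page, &tree_page);
 *	on a match in the unstable tree, try_to_merge_two_pages() then
 *	creates the shared ksm page and inserts it into the stable tree.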
137 /* merge window size */
141 * struct mm_slot - ksm information per mm that is being scanned
142 * @link: link to the mm_slots hash list
143 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
144 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
145 * @mm: the mm that this information is valid for
147 * extension - added for LKSM
148 * @state: state of mm_slot (frozen, listed, scanned, newcomer)
149 * @merge_idx: merge window index to store the number of currently merged pages
150 * @nr_merged_win: merge window keeping the three most recent merge counts
151 * @nr_merged: sum of nr_merged_win, used to maintain vips_list (ordered list)
152 * @ordered_list: list ordered by nr_merged
153 * @scanning_size: number of anonymous pages in mm_struct
154 * @fault_cnt: last read count of page fault (minor + major)
155 * @elapsed: elapsed scanning time
156 * @nr_scans: number of pages actually scanned (can differ from scanning_size)
159 struct hlist_node link;
160 struct list_head mm_list;
161 struct list_head scan_list;
162 struct rmap_item *rmap_list;
163 struct mm_struct *mm;
168 int nr_merged_win[MERGE_WIN];
170 struct rb_node ordered_list;
172 unsigned long scanning_size; /* in number of pages */
173 unsigned long fault_cnt;
174 unsigned long elapsed;
177 #ifdef CONFIG_LKSM_FILTER
178 /* used for releasing lksm_region */
179 struct list_head ref_list;
186 * scanning mode of LKSM:
187 * LKSM_SCAN_PARTIAL: perform deduplication on a subset of processes
188 * LKSM_SCAN_FULL: perform deduplication on the full set of processes
190 enum lksm_scan_mode {
197 * struct ksm_scan - cursor for scanning
198 * @address: the next address to be scanned inside the current mm_slot
199 * @rmap_list: link to the next rmap to be scanned in the rmap_list
200 * @mm_slot: the current mm_slot we are scanning
201 * @remove_mm_list: temporary list for batching flush of removed slots
202 * @nr_scannable: the number of remaining unscanned scannable slots
203 * @nr_frozen: the number of remaining unscanned frozen slots
204 * @scan_round: scanning round (partial + full)
205 * @nr_full_scan: the number of full scanning
206 * @scan_mode: coverage of current scanning
208 * There is only the one ksm_scan instance of this cursor structure.
211 unsigned long address;
212 struct rmap_item **rmap_list;
214 struct mm_slot *mm_slot;
215 struct list_head remove_mm_list;
217 /* statistics of scanning targets */
218 atomic_t nr_scannable;
221 unsigned long scan_round;
222 unsigned long nr_full_scan;
224 enum lksm_scan_mode scan_mode;
226 #ifdef CONFIG_LKSM_FILTER
227 struct lksm_region *region;
228 unsigned long vma_base_addr;
229 struct vm_area_struct *cached_vma;
230 #endif /* CONFIG_LKSM_FILTER */
234 * struct stable_node - node of the stable rbtree
235 * @node: rb node of this ksm page in the stable tree
236 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
237 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
238 * @list: linked into migrate_nodes, pending placement in the proper node tree
239 * @hlist: hlist head of rmap_items using this ksm page
240 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
241 * @chain_prune_time: time of the last full garbage collection
242 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
243 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
247 struct rb_node node; /* when node of stable tree */
248 struct { /* when listed for migration */
249 struct list_head *head;
251 struct hlist_node hlist_dup;
252 struct list_head list;
256 struct hlist_head hlist;
259 unsigned long chain_prune_time;
262 * STABLE_NODE_CHAIN can be any negative number in
263 * rmap_hlist_len negative range, but better not -1 to be able
264 * to reliably detect underflows.
266 #define STABLE_NODE_CHAIN -1024
274 * struct rmap_item - reverse mapping item for virtual addresses
275 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
276 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
277 * @nid: NUMA node id of unstable tree in which linked (may not match page)
278 * @region: pointer to the mapped region (LKSM feature)
279 * @mm: the memory structure this rmap_item is pointing into
280 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
281 * @oldchecksum: previous checksum of the page at that virtual address
282 * @node: rb node of this rmap_item in the unstable tree
283 * @head: pointer to stable_node heading this list in the stable tree
284 * @base_addr: used for calculating offset of the address (LKSM feature)
285 * @hlist: link into hlist of rmap_items hanging off that stable_node
288 struct rmap_item *rmap_list;
290 struct anon_vma *anon_vma; /* when stable */
292 int nid; /* when node of unstable tree */
294 #ifdef CONFIG_LKSM_FILTER
295 struct lksm_region *region; /* when unstable */
298 struct mm_struct *mm;
299 unsigned long address; /* + low bits used for flags below */
300 unsigned int oldchecksum; /* when unstable (LSB is a frozen bit) */
302 struct rb_node node; /* when node of unstable tree */
303 struct { /* when listed from stable tree */
304 #ifdef CONFIG_LKSM_FILTER
306 struct stable_node *head;
307 unsigned long base_addr; /* temporary storage for merge */
310 struct stable_node *head;
311 #endif /* CONFIG_LKSM_FILTER */
312 struct hlist_node hlist;
317 #define SEQNR_MASK 0x0ff /* low bits of unstable tree scan_round */
318 #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
319 #define STABLE_FLAG 0x200 /* is listed from the stable tree */
320 #define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
321 /* to mask all the flags */
323 /* The stable and unstable tree heads */
324 static struct rb_root one_stable_tree[1] = { RB_ROOT };
325 static struct rb_root one_unstable_tree[1] = { RB_ROOT };
326 static struct rb_root *root_stable_tree = one_stable_tree;
327 static struct rb_root *root_unstable_tree = one_unstable_tree;
329 #define LKSM_NODE_ID 0
331 /* Recently migrated nodes of stable tree, pending proper placement */
332 static LIST_HEAD(migrate_nodes);
333 #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
335 /* list of VIP processes (kept as an rb-tree, ordered by nr_merged) */
336 static struct rb_root vips_list = RB_ROOT;
337 static int lksm_max_vips = 20;
339 #define MM_SLOTS_HASH_BITS 10
340 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
341 static DEFINE_HASHTABLE(task_slots_hash, MM_SLOTS_HASH_BITS);
344 * two list heads in LKSM:
345 * - ksm_mm_head: a head for traversing the whole list of processes,
346 *                (not used for scanning itself)
347 * - ksm_scan_head: a head for the list of processes currently being scanned
349 static struct mm_slot ksm_mm_head = {
350 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
353 static struct mm_slot ksm_scan_head = {
354 .scan_list = LIST_HEAD_INIT(ksm_scan_head.scan_list),
357 static struct ksm_scan ksm_scan = {
358 .mm_slot = &ksm_scan_head,
361 static struct kmem_cache *rmap_item_cache;
362 static struct kmem_cache *stable_node_cache;
363 static struct kmem_cache *mm_slot_cache;
364 static struct kmem_cache *task_slot_cache;
366 /* The number of nodes in the stable tree */
367 static unsigned long ksm_pages_shared;
369 /* The number of page slots additionally sharing those nodes */
370 static unsigned long ksm_pages_sharing;
372 /* The number of nodes in the unstable tree */
373 static unsigned long ksm_pages_unshared;
375 /* The number of rmap_items in use: to calculate pages_volatile */
376 static unsigned long ksm_rmap_items;
378 /* The number of stable_node chains */
379 static unsigned long ksm_stable_node_chains;
381 /* The number of stable_node dups linked to the stable_node chains */
382 static unsigned long ksm_stable_node_dups;
384 /* Delay in pruning stale stable_node_dups in the stable_node_chains */
385 static int ksm_stable_node_chains_prune_millisecs = 2000;
387 /* Maximum number of page slots sharing a stable node */
388 static int ksm_max_page_sharing = 256;
390 /* Number of pages ksmd should scan in one batch */
391 static unsigned int ksm_thread_pages_to_scan = 100;
393 /* Milliseconds ksmd should sleep between batches */
394 static unsigned int ksm_thread_sleep_millisecs = 20;
396 /* Checksum of an empty (zeroed) page */
397 static unsigned int zero_checksum __read_mostly;
399 /* Processes tracked by KSM thread */
400 static unsigned int ksm_nr_added_process;
402 /* Whether to merge empty (zeroed) pages with actual zero pages */
403 static bool ksm_use_zero_pages __read_mostly;
405 /* An indicator for KSM scanning */
406 static atomic_t ksm_one_shot_scanning;
408 /* Boost the number of pages to scan when the scanner performs a partial scan */
409 static unsigned int lksm_boosted_pages_to_scan = 100;
410 static unsigned int lksm_default_pages_to_scan = 100;
413 /* Zeroed when merging across nodes is not allowed */
414 static unsigned int ksm_merge_across_nodes = 1;
415 static int ksm_nr_node_ids = 1;
417 #define ksm_merge_across_nodes 1U
418 #define ksm_nr_node_ids 1
422 * Default policy for KSM_RUN_ONESHOT:
423 * KSM performs both scans only when the user requests it.
424 * When scanning ends, both the crawler and the scanner threads are blocked
425 * until the next request arrives.
427 #define KSM_RUN_STOP 0
428 #define KSM_RUN_MERGE 1
429 #define KSM_RUN_UNMERGE 2
430 #define KSM_RUN_OFFLINE 4
431 #define KSM_RUN_ONESHOT 8
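/*
 * Editor's note (assumption, not from the original source): if LKSM reuses
 * the standard KSM sysfs "run" knob, a one-shot scan would be requested by
 * writing KSM_RUN_ONESHOT (8) to it, e.g.:
 *
 *	echo 8 > /sys/kernel/mm/ksm/run
 */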
433 static unsigned long ksm_run = KSM_RUN_STOP;
434 static atomic_t ksm_state; /* 0: in crawling 1: in scanning */
436 #define lksm_check_scan_state(ksm_state) (atomic_read(&ksm_state) == 1)
437 #define lksm_set_scan_state(ksm_state) (atomic_set(&ksm_state, 1))
438 #define lksm_clear_scan_state(ksm_state) (atomic_set(&ksm_state, 0))
441 struct task_struct *task;
443 unsigned long inserted;
444 struct list_head list;
445 struct hlist_node hlist;
450 * When a process stops running in the foreground (e.g., it goes to background),
451 * the system daemon (e.g., resourced) puts it into cgroup_freezer.
452 * Once a process joins the freezer cgroup, the kernel no longer counts it
453 * as a runnable process, so it cannot be scheduled on a CPU.
454 * We therefore regard processes in the freezer cgroup as frozen, and they
455 * make good candidates for memory deduplication.
457 * LKSM provides a hook to catch the moment a process is frozen.
458 * With the hook, the ksm crawler can build a candidate list for memory deduplication.
459 * (see kernel/cgroup_freezer.c)
461 #define FROZEN_BIT 0x01
462 #define LISTED_BIT 0x02
464 #define lksm_test_rmap_frozen(rmap_item) (rmap_item->oldchecksum & FROZEN_BIT)
465 #define lksm_set_rmap_frozen(rmap_item) (rmap_item->oldchecksum |= FROZEN_BIT)
466 #define lksm_clear_rmap_frozen(rmap_item) (rmap_item->oldchecksum &= ~FROZEN_BIT)
467 #define lksm_clear_checksum_frozen(checksum) (checksum &= ~FROZEN_BIT)
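/*
 * Editor's illustration (not in the original source): the low bit of
 * @oldchecksum doubles as the "frozen" flag, so the stored checksum always
 * has its LSB cleared; calc_checksum() below applies
 * lksm_clear_checksum_frozen() for exactly this reason. A sketch of how a
 * frozen rmap_item might be recorded, where slot_is_frozen is a
 * hypothetical condition used only for illustration:
 *
 *	rmap_item->oldchecksum = calc_checksum(page);	// LSB already clear
 *	if (slot_is_frozen)				// hypothetical flag
 *		lksm_set_rmap_frozen(rmap_item);
 */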
469 #define KSM_MM_FROZEN 0x01
470 #define KSM_MM_LISTED 0x02
471 #define KSM_MM_NEWCOMER 0x04
472 #define KSM_MM_SCANNED 0x08
473 #ifdef CONFIG_LKSM_FILTER
474 #define KSM_MM_PREPARED 0x10
477 #define lksm_test_mm_state(mm_slot, bit) (mm_slot->state & bit)
478 #define lksm_set_mm_state(mm_slot, bit) (mm_slot->state |= bit)
479 #define lksm_clear_mm_state(mm_slot, bit) (mm_slot->state &= ~bit)
481 #ifdef CONFIG_LKSM_FILTER
482 #define LKSM_REGION_HASH_BITS 10
483 static DEFINE_HASHTABLE(lksm_region_hash, LKSM_REGION_HASH_BITS);
484 spinlock_t lksm_region_lock;
487 * LKSM uses the filter once the region has been scanned for more than
488 * LKSM_REGION_MATURE rounds
490 #define LKSM_REGION_MATURE 5
491 #define lksm_region_mature(round, region) \
492 (((round) - (region)->scan_round) > LKSM_REGION_MATURE)
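/*
 * Editor's example (not in the original source): with LKSM_REGION_MATURE
 * set to 5, a region first seen at scan round 10 satisfies
 * lksm_region_mature() from round 16 onwards (16 - 10 > 5), at which point
 * its filter starts being used to skip unsharable pages.
 */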
494 enum lksm_region_type {
497 LKSM_REGION_FILE1, /* file mapped region: data section */
498 LKSM_REGION_FILE2, /* file mapped region: bss section */
499 LKSM_REGION_CONFLICT, /* conflicted regions: do not filter */
503 static const char * const region_type_str[] = {
512 /* sharing statistics for each region type */
513 static int region_share[LKSM_REGION_UNKNOWN + 1];
516 * lksm_region: A region represents a physically mapped area.
517 * Each process can have its own instance of a region, namely a vma.
518 * Regions for areas that are not file-mapped, such as the heap and stack,
519 * have only abstract representations as symbols.
521 * LKSM leverages the region for offset-based filtering.
522 * Each region has a filter which records the offsets of the addresses of
523 * shared pages in the region.
524 * Once a region has matured, LKSM uses the filter to skip scanning of
527 * @type: type of region, see the enumeration above
528 * @len: length of filter (in the number of 64-bit variables)
529 * @ino: inode number if the region is mapped to file
530 * @merge_cnt: the number of merged pages in the region
531 * @filter_cnt: the number of set bits in filter
532 * @scan_round: the birth scan round of this region
533 * @conflict: the number of size changes, a clue for conflicts
534 * @refcount: if it reaches zero, the region will be freed
535 * @hnode: hash node for finding region by ino
536 * @next: data region can have a next (bss) region
537 * @prev: reverse pointer to data region
539 * A few notes about the bitmap filter variable:
540 * LKSM uses a bitmap filter to skip scanning of unsharable pages.
541 * If a region is no larger than 256KB (<= 64 pages),
542 * it can be covered by a bitmap stored in a single 64-bit variable.
543 * LKSM only allocates a bitmap array as a filter when the region is
544 * larger than 256KB; otherwise it uses a 64-bit variable as the filter.
546 * @filter: when the region is bigger than 64 pages
547 * @single_filter: when the region is smaller than or equal to 64 pages
549 #define SINGLE_FILTER_LEN 1 /* a region can be covered by a single variable */
552 enum lksm_region_type type;
560 struct hlist_node hnode;
561 struct lksm_region *next;
562 struct lksm_region *prev;
564 unsigned long *filter;
565 unsigned long single_filter;
571 * Contains references from processes to regions
574 struct lksm_region_ref {
575 struct list_head list; /* listed by mm_slot */
576 struct lksm_region *region;
579 /* the number of registered lksm_regions */
580 static unsigned int lksm_nr_regions;
582 /* the upper limit for region lookup */
583 #define LKSM_REGION_ITER_MAX 8
585 #define lksm_region_size(start, end) ((int)((end) - (start)) >> PAGE_SHIFT)
586 #define lksm_bitmap_size(size) (((size) >> 6) + (((size) % BITS_PER_LONG) ? 1 : 0))
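/*
 * Editor's example (not in the original source), assuming 4KB pages: a
 * 128KB vma spans 32 pages, so lksm_bitmap_size(32) == 1 and the bitmap
 * fits in @single_filter; a 1MB vma spans 256 pages, so
 * lksm_bitmap_size(256) == 4 and a four-long @filter array is allocated.
 */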
588 /* all processes share one lksm_region for their heaps */
589 static struct lksm_region heap_region, unknown_region;
591 static void lksm_register_file_anon_region(struct mm_slot *slot,
592 struct vm_area_struct *vma);
593 static struct lksm_region *lksm_find_region(struct vm_area_struct *vma);
594 #endif /* CONFIG_LKSM_FILTER */
596 static int initial_round = 3;
597 static unsigned long ksm_crawl_round;
598 static unsigned long crawler_sleep;
600 /* statistical information */
601 static int lksm_nr_merged; /* global merge count */
602 static int lksm_nr_broken; /* global broken count */
603 static int lksm_nr_scanned_slot; /* global scanned slot count */
604 static int lksm_slot_nr_merged; /* per-slot merge count */
605 static int lksm_slot_nr_broken; /* per-slot broken count */
607 /* initially, KSM uses a small full scan interval */
608 #define DEFAULT_FULL_SCAN_INTERVAL 60000 /* 60 seconds */
609 static unsigned long full_scan_interval = 100;
611 /* statistical information about scanning time */
612 static unsigned long lksm_last_scan_time;
613 static unsigned long lksm_proc_scan_time;
615 /* helpers for pruning short-lived tasks */
616 #define KSM_SHORT_TASK_TIME 100
617 static unsigned long short_lived_thresh = KSM_SHORT_TASK_TIME;
619 #define get_task_runtime(task) (task->se.sum_exec_runtime)
620 #define ms_to_ns(ms) ((ms) * 1000 * 1000)
621 #define check_short_task(task) \
622 (get_task_runtime(task) < ms_to_ns(short_lived_thresh))
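/*
 * Editor's example (not in the original source): with the default
 * short_lived_thresh of 100 (KSM_SHORT_TASK_TIME), ms_to_ns(100) is
 * 100,000,000 ns, so a task whose se.sum_exec_runtime is below 100ms of
 * accumulated CPU time makes check_short_task() return true and is
 * treated as a short-lived task for pruning.
 */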
624 static void wait_while_offlining(void);
625 static struct mm_slot *__ksm_enter_alloc_slot(struct mm_struct *mm, int frozen);
627 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
628 static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
629 static DEFINE_MUTEX(ksm_thread_mutex);
630 static DEFINE_SPINLOCK(ksm_mmlist_lock);
631 static DECLARE_WAIT_QUEUE_HEAD(ksm_crawl_wait);
633 #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
634 sizeof(struct __struct), __alignof__(struct __struct),\
637 static int __init ksm_slab_init(void)
639 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
640 if (!rmap_item_cache)
643 stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
644 if (!stable_node_cache)
647 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
650 task_slot_cache = KSM_KMEM_CACHE(task_slot, 0);
651 if (!task_slot_cache)
657 kmem_cache_destroy(mm_slot_cache);
659 kmem_cache_destroy(stable_node_cache);
661 kmem_cache_destroy(rmap_item_cache);
666 static void __init ksm_slab_free(void)
668 kmem_cache_destroy(mm_slot_cache);
669 kmem_cache_destroy(stable_node_cache);
670 kmem_cache_destroy(rmap_item_cache);
671 mm_slot_cache = NULL;
674 static __always_inline bool is_stable_node_chain(struct stable_node *chain)
676 return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
679 static __always_inline bool is_stable_node_dup(struct stable_node *dup)
681 return dup->head == STABLE_NODE_DUP_HEAD;
684 static inline void stable_node_chain_add_dup(struct stable_node *dup,
685 struct stable_node *chain)
687 VM_BUG_ON(is_stable_node_dup(dup));
688 dup->head = STABLE_NODE_DUP_HEAD;
689 VM_BUG_ON(!is_stable_node_chain(chain));
690 hlist_add_head(&dup->hlist_dup, &chain->hlist);
691 ksm_stable_node_dups++;
694 static inline void __stable_node_dup_del(struct stable_node *dup)
696 VM_BUG_ON(!is_stable_node_dup(dup));
697 hlist_del(&dup->hlist_dup);
698 ksm_stable_node_dups--;
701 static inline void stable_node_dup_del(struct stable_node *dup)
703 VM_BUG_ON(is_stable_node_chain(dup));
704 if (is_stable_node_dup(dup))
705 __stable_node_dup_del(dup);
707 rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
708 #ifdef CONFIG_DEBUG_VM
713 static inline struct rmap_item *alloc_rmap_item(void)
715 struct rmap_item *rmap_item;
717 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
718 __GFP_NORETRY | __GFP_NOWARN);
724 static inline void free_rmap_item(struct rmap_item *rmap_item)
727 rmap_item->mm = NULL; /* debug safety */
728 kmem_cache_free(rmap_item_cache, rmap_item);
731 static inline struct stable_node *alloc_stable_node(void)
734 * The allocation can take too long with GFP_KERNEL when memory is under
735 * pressure, which may lead to hung task warnings. Adding __GFP_HIGH
736 * grants access to memory reserves, helping to avoid this problem.
738 return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
741 static inline void free_stable_node(struct stable_node *stable_node)
743 VM_BUG_ON(stable_node->rmap_hlist_len &&
744 !is_stable_node_chain(stable_node));
745 kmem_cache_free(stable_node_cache, stable_node);
748 static inline struct mm_slot *alloc_mm_slot(void)
750 if (!mm_slot_cache) /* initialization failed */
752 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
755 static inline void free_mm_slot(struct mm_slot *mm_slot)
757 kmem_cache_free(mm_slot_cache, mm_slot);
760 static struct mm_slot *get_mm_slot(struct mm_struct *mm)
762 struct mm_slot *slot;
764 hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
771 static void insert_to_mm_slots_hash(struct mm_struct *mm,
772 struct mm_slot *mm_slot)
775 hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
778 static inline struct task_slot *alloc_task_slot(void)
780 if (!task_slot_cache)
782 return kmem_cache_zalloc(task_slot_cache, GFP_NOWAIT);
785 static inline void free_task_slot(struct task_slot *task_slot)
787 kmem_cache_free(task_slot_cache, task_slot);
790 static struct task_slot *get_task_slot(struct task_struct *task)
792 struct task_slot *slot;
794 hash_for_each_possible(task_slots_hash, slot, hlist,
796 if (slot->task == task)
801 static inline void insert_to_task_slots_hash(struct task_slot *slot)
803 hash_add(task_slots_hash, &slot->hlist, (unsigned long)slot->task);
807 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
808 * page tables after it has passed through ksm_exit() - which, if necessary,
809 * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
810 * a special flag: they can just back out as soon as mm_users goes to zero.
811 * ksm_test_exit() is used throughout to make this test for exit: in some
812 * places for correctness, in some places just to avoid unnecessary work.
814 static inline bool ksm_test_exit(struct mm_struct *mm)
816 return atomic_read(&mm->mm_users) == 0;
820 * We use break_ksm to break COW on a ksm page: it's a stripped down
822 * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
825 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
826 * in case the application has unmapped and remapped mm,addr meanwhile.
827 * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
828 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
830 * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
831 * of the process that owns 'vma'. We also do not want to enforce
832 * protection keys here anyway.
834 static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
841 page = follow_page(vma, addr,
842 FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
843 if (IS_ERR_OR_NULL(page))
846 ret = handle_mm_fault(vma, addr,
847 FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
849 ret = VM_FAULT_WRITE;
851 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
853 * We must loop because handle_mm_fault() may back out if there's
854 * any difficulty e.g. if pte accessed bit gets updated concurrently.
856 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
857 * COW has been broken, even if the vma does not permit VM_WRITE;
858 * but note that a concurrent fault might break PageKsm for us.
860 * VM_FAULT_SIGBUS could occur if we race with truncation of the
861 * backing file, which also invalidates anonymous pages: that's
862 * okay, that truncation will have unmapped the PageKsm for us.
864 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
865 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
866 * current task has TIF_MEMDIE set, and will be OOM killed on return
867 * to user; and ksmd, having no mm, would never be chosen for that.
869 * But if the mm is in a limited mem_cgroup, then the fault may fail
870 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
871 * even ksmd can fail in this way - though it's usually breaking ksm
872 * just to undo a merge it made a moment before, so unlikely to oom.
874 * That's a pity: we might therefore have more kernel pages allocated
875 * than we're counting as nodes in the stable tree; but ksm_do_scan
876 * will retry to break_cow on each pass, so should recover the page
877 * in due course. The important thing is to not let VM_MERGEABLE
878 * be cleared while any such pages might remain in the area.
880 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
883 static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
886 struct vm_area_struct *vma;
887 if (ksm_test_exit(mm))
889 vma = find_vma(mm, addr);
890 if (!vma || vma->vm_start > addr)
892 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
897 static void break_cow(struct rmap_item *rmap_item)
899 struct mm_struct *mm = rmap_item->mm;
900 unsigned long addr = rmap_item->address;
901 struct vm_area_struct *vma;
904 * It is not an accident that whenever we want to break COW
905 * to undo, we also need to drop a reference to the anon_vma.
907 put_anon_vma(rmap_item->anon_vma);
909 down_read(&mm->mmap_sem);
910 vma = find_mergeable_vma(mm, addr);
912 break_ksm(vma, addr);
913 up_read(&mm->mmap_sem);
916 static struct page *get_mergeable_page(struct rmap_item *rmap_item)
918 struct mm_struct *mm = rmap_item->mm;
919 unsigned long addr = rmap_item->address;
920 struct vm_area_struct *vma;
923 down_read(&mm->mmap_sem);
924 vma = find_mergeable_vma(mm, addr);
928 page = follow_page(vma, addr, FOLL_GET);
929 if (IS_ERR_OR_NULL(page))
931 if (PageAnon(page)) {
932 flush_anon_page(vma, page, addr);
933 flush_dcache_page(page);
939 up_read(&mm->mmap_sem);
943 #ifdef CONFIG_LKSM_FILTER
944 static inline int is_heap(struct vm_area_struct *vma)
946 return vma->vm_start <= vma->vm_mm->brk &&
947 vma->vm_end >= vma->vm_mm->start_brk;
950 /* the code below is copied from fs/proc/task_mmu.c */
952 static int is_stack(struct vm_area_struct *vma)
954 return vma->vm_start <= vma->vm_mm->start_stack &&
955 vma->vm_end >= vma->vm_mm->start_stack;
958 static int is_exec(struct vm_area_struct *vma)
960 return (vma->vm_flags & VM_EXEC);
962 #endif /* CONFIG_LKSM_FILTER */
965 * ksm_join: a wrapper function of ksm_enter.
966 * The function sets the VM_MERGEABLE flag on the vmas of the given mm_struct.
968 static int ksm_join(struct mm_struct *mm, int frozen)
970 struct vm_area_struct *vma;
971 struct mm_slot *slot;
972 int newly_allocated = 0;
974 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
975 slot = __ksm_enter_alloc_slot(mm, frozen);
980 slot = get_mm_slot(mm);
982 ksm_err("there is no mm_slot for %p", mm);
987 for (vma = mm->mmap; vma; vma = vma->vm_next) {
988 if (vma->vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
989 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
990 VM_HUGETLB | VM_MIXEDMAP))
992 vma->vm_flags |= VM_MERGEABLE;
993 #ifdef CONFIG_LKSM_FILTER
995 * Many page sharings come from library pages because processes
996 * share the runtime framework of the OS.
997 * Thus, anonymous pages related to file-mapped areas can show
998 * sharing patterns that LKSM can exploit, while other
999 * anonymous regions (e.g., the heap) do not.
1000 * LKSM only tracks file-related regions to build filters.
1002 if (!is_heap(vma) && !is_stack(vma) &&
1003 !is_exec(vma) && vma->anon_vma)
1004 lksm_register_file_anon_region(slot, vma);
1008 return newly_allocated;
1011 #define ksm_join_write_lock(mm, frozen, ret) do {\
1012 down_write(&mm->mmap_sem); \
1013 ret = ksm_join(mm, frozen); \
1014 up_write(&mm->mmap_sem); \
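/*
 * Editor's note (not in the original source): a minimal usage sketch of the
 * wrapper macro above; ret receives ksm_join()'s return value while
 * mm->mmap_sem is taken for write and released around the call:
 *
 *	int ret;
 *
 *	ksm_join_write_lock(mm, frozen, ret);
 */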
1017 #ifdef CONFIG_LKSM_FILTER
1018 static void lksm_region_ref_append
1019 (struct mm_slot *slot, struct lksm_region *region)
1021 struct lksm_region_ref *ref;
1024 ref = kzalloc(sizeof(struct lksm_region_ref), GFP_KERNEL);
1027 ref->region = region;
1028 list_add_tail(&ref->list, &slot->ref_list);
1030 atomic_inc(®ion->refcount);
1033 static void lksm_region_free(struct lksm_region *region)
1035 unsigned long flags;
1037 ksm_debug("lets free region(%p) prev(%p)", region, region->prev);
1038 spin_lock_irqsave(&lksm_region_lock, flags);
1039 if (!region->next) {
1041 if (atomic_read(®ion->prev->refcount) == 0) {
1042 hash_del(®ion->prev->hnode);
1043 if (region->prev->len > SINGLE_FILTER_LEN)
1044 kfree(region->prev->filter);
1045 kfree(region->prev);
1047 ksm_debug("prev region(%p) has ref count(%d)",
1049 atomic_read(®ion->prev->refcount));
1050 region->prev->next = NULL;
1053 hash_del(®ion->hnode);
1054 if (region->len > SINGLE_FILTER_LEN)
1055 kfree(region->filter);
1058 spin_unlock_irqrestore(&lksm_region_lock, flags);
1061 static void lksm_region_ref_remove(struct lksm_region_ref *ref)
1063 list_del_init(&ref->list);
1064 if (atomic_dec_and_test(&ref->region->refcount))
1065 lksm_region_free(ref->region);
1069 static void lksm_region_ref_list_release(struct mm_slot *slot)
1071 struct lksm_region_ref *ref, *next;
1073 ksm_debug("release %p ref list", slot);
1074 list_for_each_entry_safe(ref, next, &slot->ref_list, list) {
1075 lksm_region_ref_remove(ref);
1078 #endif /* CONFIG_LKSM_FILTER */
1081 * This helper is used for getting the right index into the array of tree roots.
1082 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
1083 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
1084 * every node has its own stable and unstable tree.
1086 static inline int get_kpfn_nid(unsigned long kpfn)
1088 return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
1091 static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
1092 struct rb_root *root)
1094 struct stable_node *chain = alloc_stable_node();
1095 VM_BUG_ON(is_stable_node_chain(dup));
1096 if (likely(chain)) {
1097 INIT_HLIST_HEAD(&chain->hlist);
1098 chain->chain_prune_time = jiffies;
1099 chain->rmap_hlist_len = STABLE_NODE_CHAIN;
1100 #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
1101 chain->nid = NUMA_NO_NODE; /* debug */
1103 ksm_stable_node_chains++;
1106 * Put the stable node chain in the first dimension of
1107 * the stable tree and at the same time remove the old
1110 rb_replace_node(&dup->node, &chain->node, root);
1113 * Move the old stable node to the second dimension
1114 * queued in the hlist_dup. The invariant is that all
1115 * dup stable_nodes in the chain->hlist point to pages
1116 * that are wrprotected and have the exact same
1119 stable_node_chain_add_dup(dup, chain);
1124 static inline void free_stable_node_chain(struct stable_node *chain,
1125 struct rb_root *root)
1127 rb_erase(&chain->node, root);
1128 free_stable_node(chain);
1129 ksm_stable_node_chains--;
1132 static void remove_node_from_stable_tree(struct stable_node *stable_node)
1134 struct rmap_item *rmap_item;
1136 /* check it's not STABLE_NODE_CHAIN or negative */
1137 BUG_ON(stable_node->rmap_hlist_len < 0);
1139 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1140 if (rmap_item->hlist.next) {
1141 ksm_pages_sharing--;
1142 lksm_slot_nr_broken++;
1146 VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
1147 stable_node->rmap_hlist_len--;
1148 put_anon_vma(rmap_item->anon_vma);
1149 rmap_item->address &= PAGE_MASK;
1154 * We need the second aligned pointer of the migrate_nodes
1155 * list_head to stay clear from the rb_parent_color union
1156 * (aligned and different than any node) and also different
1157 * from &migrate_nodes. This will verify that future list.h changes
1158 * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
1160 #if defined(GCC_VERSION) && GCC_VERSION >= 40903
1161 BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
1162 BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
1165 if (stable_node->head == &migrate_nodes)
1166 list_del(&stable_node->list);
1168 stable_node_dup_del(stable_node);
1169 free_stable_node(stable_node);
1172 enum get_ksm_page_flags {
1173 GET_KSM_PAGE_NOLOCK,
1175 GET_KSM_PAGE_TRYLOCK
1179 * get_ksm_page: checks if the page indicated by the stable node
1180 * is still its ksm page, despite having held no reference to it.
1181 * In which case we can trust the content of the page, and it
1182 * returns the gotten page; but if the page has now been zapped,
1183 * remove the stale node from the stable tree and return NULL.
1184 * But beware, the stable node's page might be being migrated.
1186 * You would expect the stable_node to hold a reference to the ksm page.
1187 * But if it increments the page's count, swapping out has to wait for
1188 * ksmd to come around again before it can free the page, which may take
1189 * seconds or even minutes: much too unresponsive. So instead we use a
1190 * "keyhole reference": access to the ksm page from the stable node peeps
1191 * out through its keyhole to see if that page still holds the right key,
1192 * pointing back to this stable node. This relies on freeing a PageAnon
1193 * page to reset its page->mapping to NULL, and relies on no other use of
1194 * a page to put something that might look like our key in page->mapping.
1195 * is on its way to being freed; but it is an anomaly to bear in mind.
1197 static struct page *get_ksm_page(struct stable_node *stable_node,
1198 enum get_ksm_page_flags flags)
1201 void *expected_mapping;
1204 expected_mapping = (void *)((unsigned long)stable_node |
1207 kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
1208 page = pfn_to_page(kpfn);
1209 if (READ_ONCE(page->mapping) != expected_mapping)
1213 * We cannot do anything with the page while its refcount is 0.
1214 * Usually 0 means free, or tail of a higher-order page: in which
1215 * case this node is no longer referenced, and should be freed;
1216 * however, it might mean that the page is under page_ref_freeze().
1217 * The __remove_mapping() case is easy, again the node is now stale;
1218 * the same is in reuse_ksm_page() case; but if page is swapcache
1219 * in migrate_page_move_mapping(), it might still be our page,
1220 * in which case it's essential to keep the node.
1222 while (!get_page_unless_zero(page)) {
1224 * Another check for page->mapping != expected_mapping would
1225 * work here too. We have chosen the !PageSwapCache test to
1226 * optimize the common case, when the page is or is about to
1227 * be freed: PageSwapCache is cleared (under spin_lock_irq)
1228 * in the ref_freeze section of __remove_mapping(); but Anon
1229 * page->mapping reset to NULL later, in free_pages_prepare().
1231 if (!PageSwapCache(page))
1236 if (READ_ONCE(page->mapping) != expected_mapping) {
1241 if (flags == GET_KSM_PAGE_TRYLOCK) {
1242 if (!trylock_page(page)) {
1244 return ERR_PTR(-EBUSY);
1246 } else if (flags == GET_KSM_PAGE_LOCK)
1249 if (flags != GET_KSM_PAGE_NOLOCK) {
1250 if (READ_ONCE(page->mapping) != expected_mapping) {
1260 * We come here from above when page->mapping or !PageSwapCache
1261 * suggests that the node is stale; but it might be under migration.
1262 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
1263 * before checking whether node->kpfn has been changed.
1266 if (READ_ONCE(stable_node->kpfn) != kpfn)
1268 remove_node_from_stable_tree(stable_node);
1273 * Removing rmap_item from stable or unstable tree.
1274 * This function will clean the information from the stable/unstable tree.
1276 static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
1278 if (rmap_item->address & STABLE_FLAG) {
1279 struct stable_node *stable_node;
1282 stable_node = rmap_item->head;
1283 page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
1287 hlist_del(&rmap_item->hlist);
1291 if (!hlist_empty(&stable_node->hlist)) {
1292 ksm_pages_sharing--;
1293 lksm_slot_nr_broken++;
1297 VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
1298 stable_node->rmap_hlist_len--;
1300 put_anon_vma(rmap_item->anon_vma);
1301 rmap_item->address &= PAGE_MASK;
1303 } else if (rmap_item->address & UNSTABLE_FLAG) {
1306 * Usually ksmd can and must skip the rb_erase, because
1307 * root_unstable_tree was already reset to RB_ROOT.
1308 * But be careful when an mm is exiting: do the rb_erase
1309 * if this rmap_item was inserted by this scan, rather
1310 * than left over from before.
1312 age = (unsigned char)(ksm_scan.scan_round - rmap_item->address);
1314 rb_erase(&rmap_item->node,
1315 root_unstable_tree + NUMA(rmap_item->nid));
1317 RB_CLEAR_NODE(&rmap_item->node);
1319 ksm_pages_unshared--;
1320 rmap_item->address &= PAGE_MASK;
1323 cond_resched(); /* we're called from many long loops */
1326 static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
1327 struct rmap_item **rmap_list)
1329 while (*rmap_list) {
1330 struct rmap_item *rmap_item = *rmap_list;
1331 *rmap_list = rmap_item->rmap_list;
1332 remove_rmap_item_from_tree(rmap_item);
1333 free_rmap_item(rmap_item);
1338 * Though it's very tempting to unmerge rmap_items from stable tree rather
1339 * than check every pte of a given vma, the locking doesn't quite work for
1340 * that - an rmap_item is assigned to the stable tree after inserting ksm
1341 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
1342 * rmap_items from parent to child at fork time (so as not to waste time
1343 * if exit comes before the next scan reaches it).
1345 * Similarly, although we'd like to remove rmap_items (so updating counts
1346 * and freeing memory) when unmerging an area, it's easier to leave that
1347 * to the next pass of ksmd - consider, for example, how ksmd might be
1348 * in cmp_and_merge_page on one of the rmap_items we would be removing.
1350 static int unmerge_ksm_pages(struct vm_area_struct *vma,
1351 unsigned long start, unsigned long end)
1356 for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
1357 if (ksm_test_exit(vma->vm_mm))
1359 if (signal_pending(current))
1362 err = break_ksm(vma, addr);
1367 static inline struct stable_node *page_stable_node(struct page *page)
1369 return PageKsm(page) ? page_rmapping(page) : NULL;
1372 static inline void set_page_stable_node(struct page *page,
1373 struct stable_node *stable_node)
1375 page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
1380 * Only called through the sysfs control interface:
1382 static int remove_stable_node(struct stable_node *stable_node)
1387 page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
1390 * get_ksm_page did remove_node_from_stable_tree itself.
1396 * Page could be still mapped if this races with __mmput() running in
1397 * between ksm_exit() and exit_mmap(). Just refuse to let
1398 * merge_across_nodes/max_page_sharing be switched.
1401 if (!page_mapped(page)) {
1403 * The stable node did not yet appear stale to get_ksm_page(),
1404 * since that allows for an unmapped ksm page to be recognized
1405 * right up until it is freed; but the node is safe to remove.
1406 * This page might be in a pagevec waiting to be freed,
1407 * or it might be PageSwapCache (perhaps under writeback),
1408 * or it might have been removed from swapcache a moment ago.
1410 set_page_stable_node(page, NULL);
1411 remove_node_from_stable_tree(stable_node);
1420 static int remove_stable_node_chain(struct stable_node *stable_node,
1421 struct rb_root *root)
1423 struct stable_node *dup;
1424 struct hlist_node *hlist_safe;
1426 if (!is_stable_node_chain(stable_node)) {
1427 VM_BUG_ON(is_stable_node_dup(stable_node));
1428 if (remove_stable_node(stable_node))
1434 hlist_for_each_entry_safe(dup, hlist_safe,
1435 &stable_node->hlist, hlist_dup) {
1436 VM_BUG_ON(!is_stable_node_dup(dup));
1437 if (remove_stable_node(dup))
1440 BUG_ON(!hlist_empty(&stable_node->hlist));
1441 free_stable_node_chain(stable_node, root);
1445 static int remove_all_stable_nodes(void)
1447 struct stable_node *stable_node, *next;
1451 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
1452 while (root_stable_tree[nid].rb_node) {
1453 stable_node = rb_entry(root_stable_tree[nid].rb_node,
1454 struct stable_node, node);
1455 if (remove_stable_node_chain(stable_node,
1456 root_stable_tree + nid)) {
1458 break; /* proceed to next nid */
1463 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
1464 if (remove_stable_node(stable_node))
1471 static int unmerge_and_remove_all_rmap_items(void)
1473 struct mm_slot *mm_slot;
1474 struct mm_struct *mm;
1475 struct vm_area_struct *vma;
1478 spin_lock(&ksm_mmlist_lock);
1479 ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
1480 struct mm_slot, mm_list);
1481 spin_unlock(&ksm_mmlist_lock);
1483 for (mm_slot = ksm_scan.mm_slot;
1484 mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
1486 down_read(&mm->mmap_sem);
1487 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1488 if (ksm_test_exit(mm))
1490 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1492 err = unmerge_ksm_pages(vma,
1493 vma->vm_start, vma->vm_end);
1498 remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
1499 up_read(&mm->mmap_sem);
1501 spin_lock(&ksm_mmlist_lock);
1502 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
1503 struct mm_slot, mm_list);
1504 if (ksm_test_exit(mm)) {
1505 hash_del(&mm_slot->link);
1506 list_del(&mm_slot->mm_list);
1507 spin_unlock(&ksm_mmlist_lock);
1509 free_mm_slot(mm_slot);
1510 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1513 spin_unlock(&ksm_mmlist_lock);
1516 /* Clean up stable nodes, but don't worry if some are still busy */
1517 remove_all_stable_nodes();
1518 ksm_scan.scan_round = 0;
1522 up_read(&mm->mmap_sem);
1523 spin_lock(&ksm_mmlist_lock);
1524 ksm_scan.mm_slot = &ksm_mm_head;
1525 spin_unlock(&ksm_mmlist_lock);
1528 #endif /* CONFIG_SYSFS */
1530 static u32 calc_checksum(struct page *page)
1533 void *addr = kmap_atomic(page);
1534 checksum = xxhash(addr, PAGE_SIZE, 0);
1535 kunmap_atomic(addr);
1536 return lksm_clear_checksum_frozen(checksum);
1539 static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1542 struct mm_struct *mm = vma->vm_mm;
1543 struct page_vma_mapped_walk pvmw = {
1549 struct mmu_notifier_range range;
1551 pvmw.address = page_address_in_vma(page, vma);
1552 if (pvmw.address == -EFAULT)
1555 BUG_ON(PageTransCompound(page));
1557 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
1559 pvmw.address + PAGE_SIZE);
1560 mmu_notifier_invalidate_range_start(&range);
1562 if (!page_vma_mapped_walk(&pvmw))
1564 if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
1567 if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
1568 (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
1569 mm_tlb_flush_pending(mm)) {
1572 swapped = PageSwapCache(page);
1573 flush_cache_page(vma, pvmw.address, page_to_pfn(page));
1575 * Ok this is tricky: when get_user_pages_fast() runs it doesn't
1576 * take any lock, therefore the check that we are going to make
1577 * with the pagecount against the mapcount is racy and
1578 * O_DIRECT can happen right after the check.
1579 * So we clear the pte and flush the tlb before the check;
1580 * this assures us that no O_DIRECT can happen after the check
1581 * or in the middle of the check.
1583 * No need to notify as we are downgrading page table to read
1584 * only not changing it to point to a new page.
1586 * See Documentation/vm/mmu_notifier.rst
1588 entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
1590 * Check that no O_DIRECT or similar I/O is in progress on the
1593 if (page_mapcount(page) + 1 + swapped != page_count(page)) {
1594 set_pte_at(mm, pvmw.address, pvmw.pte, entry);
1597 if (pte_dirty(entry))
1598 set_page_dirty(page);
1600 if (pte_protnone(entry))
1601 entry = pte_mkclean(pte_clear_savedwrite(entry));
1603 entry = pte_mkclean(pte_wrprotect(entry));
1604 set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
1606 *orig_pte = *pvmw.pte;
1610 page_vma_mapped_walk_done(&pvmw);
1612 mmu_notifier_invalidate_range_end(&range);
1618 * replace_page - replace page in vma by new ksm page
1619 * @vma: vma that holds the pte pointing to page
1620 * @page: the page we are replacing by kpage
1621 * @kpage: the ksm page we replace page by
1622 * @orig_pte: the original value of the pte
1624 * Returns 0 on success, -EFAULT on failure.
1626 static int replace_page(struct vm_area_struct *vma, struct page *page,
1627 struct page *kpage, pte_t orig_pte)
1629 struct mm_struct *mm = vma->vm_mm;
1636 struct mmu_notifier_range range;
1638 addr = page_address_in_vma(page, vma);
1639 if (addr == -EFAULT)
1642 pmd = mm_find_pmd(mm, addr);
1646 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
1648 mmu_notifier_invalidate_range_start(&range);
1650 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
1651 if (!pte_same(*ptep, orig_pte)) {
1652 pte_unmap_unlock(ptep, ptl);
1657 * No need to check ksm_use_zero_pages here: we can only have a
1658 * zero_page here if ksm_use_zero_pages was already enabled.
1660 if (!is_zero_pfn(page_to_pfn(kpage))) {
1662 page_add_anon_rmap(kpage, vma, addr, false);
1663 newpte = mk_pte(kpage, vma->vm_page_prot);
1665 newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
1666 vma->vm_page_prot));
1668 * We're replacing an anonymous page with a zero page, which is
1669 * not anonymous. We need to do proper accounting otherwise we
1670 * will get wrong values in /proc, and a BUG message in dmesg
1671 * when tearing down the mm.
1673 dec_mm_counter(mm, MM_ANONPAGES);
1676 flush_cache_page(vma, addr, pte_pfn(*ptep));
1678 * No need to notify as we are replacing a read only page with another
1679 * read only page with the same content.
1681 * See Documentation/vm/mmu_notifier.rst
1683 ptep_clear_flush(vma, addr, ptep);
1684 set_pte_at_notify(mm, addr, ptep, newpte);
1686 page_remove_rmap(page, false);
1687 if (!page_mapped(page))
1688 try_to_free_swap(page);
1691 pte_unmap_unlock(ptep, ptl);
1694 mmu_notifier_invalidate_range_end(&range);
1700 * try_to_merge_one_page - take two pages and merge them into one
1701 * @vma: the vma that holds the pte pointing to page
1702 * @page: the PageAnon page that we want to replace with kpage
1703 * @kpage: the PageKsm page that we want to map instead of page,
1704 * or NULL the first time when we want to use page as kpage.
1706 * This function returns 0 if the pages were merged, -EFAULT otherwise.
1708 static int try_to_merge_one_page(struct vm_area_struct *vma,
1709 struct page *page, struct page *kpage)
1711 pte_t orig_pte = __pte(0);
1714 if (page == kpage) /* ksm page forked */
1717 if (!PageAnon(page))
1721 * We need the page lock to read a stable PageSwapCache in
1722 * write_protect_page(). We use trylock_page() instead of
1723 * lock_page() because we don't want to wait here - we
1724 * prefer to continue scanning and merging different pages,
1725 * then come back to this page when it is unlocked.
1727 if (!trylock_page(page))
1730 if (PageTransCompound(page)) {
1731 if (split_huge_page(page))
1736 * If this anonymous page is mapped only here, its pte may need
1737 * to be write-protected. If it's mapped elsewhere, all of its
1738 * ptes are necessarily already write-protected. But in either
1739 * case, we need to lock and check page_count is not raised.
1741 if (write_protect_page(vma, page, &orig_pte) == 0) {
1744 * While we hold page lock, upgrade page from
1745 * PageAnon+anon_vma to PageKsm+NULL stable_node:
1746 * stable_tree_insert() will update stable_node.
1748 set_page_stable_node(page, NULL);
1749 mark_page_accessed(page);
1751 * Page reclaim just frees a clean page with no dirty
1752 * ptes: make sure that the ksm page would be swapped.
1754 if (!PageDirty(page))
1757 } else if (pages_identical(page, kpage))
1758 err = replace_page(vma, page, kpage, orig_pte);
1761 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
1762 munlock_vma_page(page);
1763 if (!PageMlocked(kpage)) {
1766 mlock_vma_page(kpage);
1767 page = kpage; /* for final unlock */
1778 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
1779 * but no new kernel page is allocated: kpage must already be a ksm page.
1781 * This function returns 0 if the pages were merged, -EFAULT otherwise.
1783 static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
1784 struct page *page, struct page *kpage)
1786 struct mm_struct *mm = rmap_item->mm;
1787 struct vm_area_struct *vma;
1790 down_read(&mm->mmap_sem);
1791 vma = find_mergeable_vma(mm, rmap_item->address);
1795 err = try_to_merge_one_page(vma, page, kpage);
1799 /* Unstable nid is in union with stable anon_vma: remove first */
1800 remove_rmap_item_from_tree(rmap_item);
1802 #ifdef CONFIG_LKSM_FILTER
1803 /* node is removed from tree, base_addr can be safely used */
1804 rmap_item->base_addr = vma->vm_start;
1806 /* Must get reference to anon_vma while still holding mmap_sem */
1807 rmap_item->anon_vma = vma->anon_vma;
1808 get_anon_vma(vma->anon_vma);
1810 up_read(&mm->mmap_sem);
1815 * try_to_merge_two_pages - take two identical pages and prepare them
1816 * to be merged into one page.
1818 * This function returns the kpage if we successfully merged two identical
1819 * pages into one ksm page, NULL otherwise.
1821 * Note that this function upgrades page to ksm page: if one of the pages
1822 * is already a ksm page, try_to_merge_with_ksm_page should be used.
1824 static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
1826 struct rmap_item *tree_rmap_item,
1827 struct page *tree_page)
1831 err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
1833 err = try_to_merge_with_ksm_page(tree_rmap_item,
1836 * If that fails, we have a ksm page with only one pte
1837 * pointing to it: so break it.
1840 break_cow(rmap_item);
1842 return err ? NULL : page;
1845 static __always_inline
1846 bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
1848 VM_BUG_ON(stable_node->rmap_hlist_len < 0);
1850 * Check that at least one mapping still exists, otherwise
1851 * there's not much point in merging and sharing with this
1852 * stable_node, as the underlying tree_page of the other
1853 * sharer is going to be freed soon.
1855 return stable_node->rmap_hlist_len &&
1856 stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
1859 static __always_inline
1860 bool is_page_sharing_candidate(struct stable_node *stable_node)
1862 return __is_page_sharing_candidate(stable_node, 0);
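/*
 * Editor's example (not in the original source): with the default
 * ksm_max_page_sharing of 256, a stable_node dup whose rmap_hlist_len is
 * 255 still qualifies via __is_page_sharing_candidate(dup, 0), but fails
 * the stricter __is_page_sharing_candidate(dup, 1) check used below when
 * deciding whether to move a dup to the head of its chain.
 */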
1865 static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
1866 struct stable_node **_stable_node,
1867 struct rb_root *root,
1868 bool prune_stale_stable_nodes)
1870 struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
1871 struct hlist_node *hlist_safe;
1872 struct page *_tree_page, *tree_page = NULL;
1874 int found_rmap_hlist_len;
1876 if (!prune_stale_stable_nodes ||
1877 time_before(jiffies, stable_node->chain_prune_time +
1879 ksm_stable_node_chains_prune_millisecs)))
1880 prune_stale_stable_nodes = false;
1882 stable_node->chain_prune_time = jiffies;
1884 hlist_for_each_entry_safe(dup, hlist_safe,
1885 &stable_node->hlist, hlist_dup) {
1888 * We must walk all stable_node_dup to prune the stale
1889 * stable nodes during lookup.
1891 * get_ksm_page can drop the nodes from the
1892 * stable_node->hlist if they point to freed pages
1893 * (that's why we do a _safe walk). The "dup"
1894 * stable_node parameter itself will be freed from
1895 * under us if it returns NULL.
1897 _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
1901 if (is_page_sharing_candidate(dup)) {
1903 dup->rmap_hlist_len > found_rmap_hlist_len) {
1905 put_page(tree_page);
1907 found_rmap_hlist_len = found->rmap_hlist_len;
1908 tree_page = _tree_page;
1910 /* skip put_page for found dup */
1911 if (!prune_stale_stable_nodes)
1916 put_page(_tree_page);
1921 * nr is counting all dups in the chain only if
1922 * prune_stale_stable_nodes is true, otherwise we may
1923 * break the loop at nr == 1 even if there are
1926 if (prune_stale_stable_nodes && nr == 1) {
1928 * If there's not just one entry it would
1929 * corrupt memory, better BUG_ON. In KSM
1930 * context with no lock held it's not even
1933 BUG_ON(stable_node->hlist.first->next);
1936 * There's just one entry and it is below the
1937 * deduplication limit so drop the chain.
1939 rb_replace_node(&stable_node->node, &found->node,
1941 free_stable_node(stable_node);
1942 ksm_stable_node_chains--;
1943 ksm_stable_node_dups--;
1945 * NOTE: the caller depends on the stable_node
1946 * to be equal to stable_node_dup if the chain
1949 *_stable_node = found;
1951 * Just for robustness, as stable_node is
1952 * otherwise left as a stable pointer, the
1953 * compiler shall optimize it away at build
1957 } else if (stable_node->hlist.first != &found->hlist_dup &&
1958 __is_page_sharing_candidate(found, 1)) {
1960 * If the found stable_node dup can accept one
1961 * more future merge (in addition to the one
1962 * that is underway) and is not at the head of
1963 * the chain, put it there so next search will
1964 * be quicker in the !prune_stale_stable_nodes
1967 * NOTE: it would be inaccurate to use nr > 1
1968 * instead of checking the hlist.first pointer
1969 * directly, because in the
1970 * prune_stale_stable_nodes case "nr" isn't
1971 * the position of the found dup in the chain,
1972 * but the total number of dups in the chain.
1974 hlist_del(&found->hlist_dup);
1975 hlist_add_head(&found->hlist_dup,
1976 &stable_node->hlist);
1980 *_stable_node_dup = found;
1984 static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
1985 struct rb_root *root)
1987 if (!is_stable_node_chain(stable_node))
1989 if (hlist_empty(&stable_node->hlist)) {
1990 free_stable_node_chain(stable_node, root);
1993 return hlist_entry(stable_node->hlist.first,
1994 typeof(*stable_node), hlist_dup);
1998 * Like for get_ksm_page, this function can free the *_stable_node and
1999 * *_stable_node_dup if the returned tree_page is NULL.
2001 * It can also free and overwrite *_stable_node with the found
2002 * stable_node_dup if the chain is collapsed (in which case
2003 * *_stable_node will be equal to *_stable_node_dup like if the chain
2004 * never existed). It's up to the caller to verify tree_page is not
2005 * NULL before dereferencing *_stable_node or *_stable_node_dup.
2007 * *_stable_node_dup is really a second output parameter of this
2008 * function and will be overwritten in all cases, the caller doesn't
2009 * need to initialize it.
2011 static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
2012 struct stable_node **_stable_node,
2013 struct rb_root *root,
2014 bool prune_stale_stable_nodes)
2016 struct stable_node *stable_node = *_stable_node;
2017 if (!is_stable_node_chain(stable_node)) {
2018 if (is_page_sharing_candidate(stable_node)) {
2019 *_stable_node_dup = stable_node;
2020 return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
2023 * _stable_node_dup set to NULL means the stable_node
2024 * reached the ksm_max_page_sharing limit.
2026 *_stable_node_dup = NULL;
2029 return stable_node_dup(_stable_node_dup, _stable_node, root,
2030 prune_stale_stable_nodes);
2033 static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
2034 struct stable_node **s_n,
2035 struct rb_root *root)
2037 return __stable_node_chain(s_n_d, s_n, root, true);
2040 static __always_inline struct page *chain(struct stable_node **s_n_d,
2041 struct stable_node *s_n,
2042 struct rb_root *root)
2044 struct stable_node *old_stable_node = s_n;
2045 struct page *tree_page;
2047 tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
2048 /* not pruning dups so s_n cannot have changed */
2049 VM_BUG_ON(s_n != old_stable_node);
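/*
 * Usage sketch (illustrative only): given the contract documented above
 * __stable_node_chain(), a caller must check the returned page before
 * dereferencing either output pointer, which is what the tree walks
 * below do:
 *
 *	tree_page = chain_prune(&stable_node_dup, &stable_node, root);
 *	if (!tree_page)
 *		restart the walk: the nodes may already have been freed
 */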
2054 * stable_tree_search - search for page inside the stable tree
2056 * This function checks if there is a page inside the stable tree
2057 * with identical content to the page that we are scanning right now.
2059 * This function returns the stable tree node of identical content if found,
2062 static struct page *stable_tree_search(struct page *page)
2065 struct rb_root *root;
2066 struct rb_node **new;
2067 struct rb_node *parent;
2068 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
2069 struct stable_node *page_node;
2071 page_node = page_stable_node(page);
2072 if (page_node && page_node->head != &migrate_nodes) {
2073 /* ksm page forked */
2078 nid = get_kpfn_nid(page_to_pfn(page));
2079 root = root_stable_tree + nid;
2081 new = &root->rb_node;
2085 struct page *tree_page;
2089 stable_node = rb_entry(*new, struct stable_node, node);
2090 stable_node_any = NULL;
2091 tree_page = chain_prune(&stable_node_dup, &stable_node, root);
2093 * NOTE: stable_node may have been freed by
2094 * chain_prune() if the returned stable_node_dup is
2095 * not NULL. stable_node_dup may have been inserted in
2096 * the rbtree instead as a regular stable_node (in
2097 * order to collapse the stable_node chain if a single
2098 * stable_node dup was found in it). In such case the
2099 * stable_node is overwritten by the callee to point
2100 * to the stable_node_dup that was collapsed in the
2101 * stable rbtree and stable_node will be equal to
2102 * stable_node_dup like if the chain never existed.
2104 if (!stable_node_dup) {
2106 * Either all stable_node dups were full in
2107 * this stable_node chain, or this chain was
2108 * empty and should be rb_erased.
2110 stable_node_any = stable_node_dup_any(stable_node,
2112 if (!stable_node_any) {
2113 /* rb_erase just run */
2117 * Take any of the stable_node dups page of
2118 * this stable_node chain to let the tree walk
2119 * continue. All KSM pages belonging to the
2120 * stable_node dups in a stable_node chain
2121 * have the same content and they're
2122 * wrprotected at all times. Any will work
2123 * fine to continue the walk.
2125 tree_page = get_ksm_page(stable_node_any,
2126 GET_KSM_PAGE_NOLOCK);
2128 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
2131 * If we walked over a stale stable_node,
2132 * get_ksm_page() will call rb_erase() and it
2133 * may rebalance the tree from under us. So
2134 * restart the search from scratch. Returning
2135 * NULL would be safe too, but we'd generate
2136 * false negative insertions just because some
2137 * stable_node was stale.
2142 ret = memcmp_pages(page, tree_page);
2143 put_page(tree_page);
2147 new = &parent->rb_left;
2149 new = &parent->rb_right;
2152 VM_BUG_ON(page_node->head != &migrate_nodes);
2154 * Test if the migrated page should be merged
2155 * into a stable node dup. If the mapcount is
2156 * 1 we can migrate it with another KSM page
2157 * without adding it to the chain.
2159 if (page_mapcount(page) > 1)
2163 if (!stable_node_dup) {
2165 * If the stable_node is a chain and
2166 * we got a payload match in memcmp
2167 * but we cannot merge the scanned
2168 * page in any of the existing
2169 * stable_node dups because they're
2170 * all full, we need to wait for the
2171 * scanned page to find itself a match
2172 * in the unstable tree to create a
2173 * brand new KSM page to add later to
2174 * the dups of this stable_node.
2180 * Lock and unlock the stable_node's page (which
2181 * might already have been migrated) so that page
2182 * migration is sure to notice its raised count.
2183 * It would be more elegant to return stable_node
2184 * than kpage, but that involves more changes.
2186 tree_page = get_ksm_page(stable_node_dup,
2187 GET_KSM_PAGE_TRYLOCK);
2189 if (PTR_ERR(tree_page) == -EBUSY)
2190 return ERR_PTR(-EBUSY);
2192 if (unlikely(!tree_page))
2194 * The tree may have been rebalanced,
2195 * so re-evaluate parent and new.
2198 unlock_page(tree_page);
2200 if (get_kpfn_nid(stable_node_dup->kpfn) !=
2201 NUMA(stable_node_dup->nid)) {
2202 put_page(tree_page);
2212 list_del(&page_node->list);
2213 DO_NUMA(page_node->nid = nid);
2214 rb_link_node(&page_node->node, parent, new);
2215 rb_insert_color(&page_node->node, root);
2217 if (is_page_sharing_candidate(page_node)) {
2225 * If stable_node was a chain and chain_prune collapsed it,
2226 * stable_node has been updated to be the new regular
2227 * stable_node. A collapse of the chain is indistinguishable
2228 * from the case there was no chain in the stable
2229 * rbtree. Otherwise stable_node is the chain and
2230 * stable_node_dup is the dup to replace.
2232 if (stable_node_dup == stable_node) {
2233 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
2234 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
2235 /* there is no chain */
2237 VM_BUG_ON(page_node->head != &migrate_nodes);
2238 list_del(&page_node->list);
2239 DO_NUMA(page_node->nid = nid);
2240 rb_replace_node(&stable_node_dup->node,
2243 if (is_page_sharing_candidate(page_node))
2248 rb_erase(&stable_node_dup->node, root);
2252 VM_BUG_ON(!is_stable_node_chain(stable_node));
2253 __stable_node_dup_del(stable_node_dup);
2255 VM_BUG_ON(page_node->head != &migrate_nodes);
2256 list_del(&page_node->list);
2257 DO_NUMA(page_node->nid = nid);
2258 stable_node_chain_add_dup(page_node, stable_node);
2259 if (is_page_sharing_candidate(page_node))
2267 stable_node_dup->head = &migrate_nodes;
2268 list_add(&stable_node_dup->list, stable_node_dup->head);
2272 /* stable_node_dup could be null if it reached the limit */
2273 if (!stable_node_dup)
2274 stable_node_dup = stable_node_any;
2276 * If stable_node was a chain and chain_prune collapsed it,
2277 * stable_node has been updated to be the new regular
2278 * stable_node. A collapse of the chain is indistinguishable
2279 * from the case there was no chain in the stable
2280 * rbtree. Otherwise stable_node is the chain and
2281 * stable_node_dup is the dup to replace.
2283 if (stable_node_dup == stable_node) {
2284 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
2285 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
2286 /* chain is missing so create it */
2287 stable_node = alloc_stable_node_chain(stable_node_dup,
2293 * Add this stable_node dup that was
2294 * migrated to the stable_node chain
2295 * of the current nid for this page
2298 VM_BUG_ON(!is_stable_node_chain(stable_node));
2299 VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
2300 VM_BUG_ON(page_node->head != &migrate_nodes);
2301 list_del(&page_node->list);
2302 DO_NUMA(page_node->nid = nid);
2303 stable_node_chain_add_dup(page_node, stable_node);
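/*
 * Both the stable and the unstable tree are keyed purely by page
 * contents; the descent above (and the ones in stable_tree_insert() and
 * unstable_tree_search_insert() below) follows the usual rbtree pattern:
 *
 *	ret = memcmp_pages(page, tree_page);
 *	if (ret < 0)
 *		new = &parent->rb_left;
 *	else if (ret > 0)
 *		new = &parent->rb_right;
 *	else
 *		identical content was found: try to merge with it
 */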
2308 * stable_tree_insert - insert stable tree node pointing to new ksm page
2309 * into the stable tree.
2311 * This function returns the stable tree node just allocated on success,
2314 static struct stable_node *stable_tree_insert(struct page *kpage)
2318 struct rb_root *root;
2319 struct rb_node **new;
2320 struct rb_node *parent;
2321 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
2322 bool need_chain = false;
2324 kpfn = page_to_pfn(kpage);
2325 nid = get_kpfn_nid(kpfn);
2326 root = root_stable_tree + nid;
2329 new = &root->rb_node;
2332 struct page *tree_page;
2336 stable_node = rb_entry(*new, struct stable_node, node);
2337 stable_node_any = NULL;
2338 tree_page = chain(&stable_node_dup, stable_node, root);
2339 if (!stable_node_dup) {
2341 * Either all stable_node dups were full in
2342 * this stable_node chain, or this chain was
2343 * empty and should be rb_erased.
2345 stable_node_any = stable_node_dup_any(stable_node,
2347 if (!stable_node_any) {
2348 /* rb_erase just run */
2352 * Take any of the stable_node dups page of
2353 * this stable_node chain to let the tree walk
2354 * continue. All KSM pages belonging to the
2355 * stable_node dups in a stable_node chain
2356 * have the same content and they're
2357 * wrprotected at all times. Any will work
2358 * fine to continue the walk.
2360 tree_page = get_ksm_page(stable_node_any,
2361 GET_KSM_PAGE_NOLOCK);
2363 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
2366 * If we walked over a stale stable_node,
2367 * get_ksm_page() will call rb_erase() and it
2368 * may rebalance the tree from under us. So
2369 * restart the search from scratch. Returning
2370 * NULL would be safe too, but we'd generate
2371 * false negative insertions just because some
2372 * stable_node was stale.
2377 ret = memcmp_pages(kpage, tree_page);
2378 put_page(tree_page);
2382 new = &parent->rb_left;
2384 new = &parent->rb_right;
2391 stable_node_dup = alloc_stable_node();
2392 if (!stable_node_dup)
2395 INIT_HLIST_HEAD(&stable_node_dup->hlist);
2396 stable_node_dup->kpfn = kpfn;
2397 set_page_stable_node(kpage, stable_node_dup);
2398 stable_node_dup->rmap_hlist_len = 0;
2399 DO_NUMA(stable_node_dup->nid = nid);
2401 rb_link_node(&stable_node_dup->node, parent, new);
2402 rb_insert_color(&stable_node_dup->node, root);
2404 if (!is_stable_node_chain(stable_node)) {
2405 struct stable_node *orig = stable_node;
2406 /* chain is missing so create it */
2407 stable_node = alloc_stable_node_chain(orig, root);
2409 free_stable_node(stable_node_dup);
2413 stable_node_chain_add_dup(stable_node_dup, stable_node);
2416 return stable_node_dup;
2420 * unstable_tree_search_insert - search for identical page,
2421 * else insert rmap_item into the unstable tree.
2423 * This function searches for a page in the unstable tree identical to the
2424 * page currently being scanned; and if no identical page is found in the
2425 * tree, we insert rmap_item as a new object into the unstable tree.
2427 * This function returns pointer to rmap_item found to be identical
2428 * to the currently scanned page, NULL otherwise.
2430 * This function does both searching and inserting, because they share
2431 * the same walking algorithm in an rbtree.
2434 struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
2436 struct page **tree_pagep)
2438 struct rb_node **new;
2439 struct rb_root *root;
2440 struct rb_node *parent = NULL;
2443 nid = get_kpfn_nid(page_to_pfn(page));
2444 root = root_unstable_tree + nid;
2445 new = &root->rb_node;
2448 struct rmap_item *tree_rmap_item;
2449 struct page *tree_page;
2453 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
2454 tree_page = get_mergeable_page(tree_rmap_item);
2459 * Don't substitute a ksm page for a forked page.
2461 if (page == tree_page) {
2462 put_page(tree_page);
2466 ret = memcmp_pages(page, tree_page);
2470 put_page(tree_page);
2471 new = &parent->rb_left;
2472 } else if (ret > 0) {
2473 put_page(tree_page);
2474 new = &parent->rb_right;
2475 } else if (!ksm_merge_across_nodes &&
2476 page_to_nid(tree_page) != nid) {
2478 * If tree_page has been migrated to another NUMA node,
2479 * it will be flushed out and put in the right unstable
2480 * tree next time: only merge with it when across_nodes.
2482 put_page(tree_page);
2485 *tree_pagep = tree_page;
2486 return tree_rmap_item;
2490 rmap_item->address |= UNSTABLE_FLAG;
2491 rmap_item->address |= (ksm_scan.scan_round & SEQNR_MASK);
2492 DO_NUMA(rmap_item->nid = nid);
2493 rb_link_node(&rmap_item->node, parent, new);
2494 rb_insert_color(&rmap_item->node, root);
2496 #ifdef CONFIG_LKSM_FILTER
2497 rmap_item->region = ksm_scan.region;
2499 ksm_pages_unshared++;
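/*
 * rmap_item->address doubles as a small flag word: the page-aligned bits
 * hold the virtual address while the low bits carry UNSTABLE_FLAG or
 * STABLE_FLAG plus the scan-round sequence number.  Decomposing it is
 * therefore (illustrative sketch):
 *
 *	unsigned long addr	= rmap_item->address & ~KSM_FLAG_MASK;
 *	bool on_unstable_tree	= rmap_item->address & UNSTABLE_FLAG;
 *	unsigned long seqnr	= rmap_item->address & SEQNR_MASK;
 */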
2504 * stable_tree_append - add another rmap_item to the linked list of
2505 * rmap_items hanging off a given node of the stable tree, all sharing
2506 * the same ksm page.
2508 static void stable_tree_append(struct rmap_item *rmap_item,
2509 struct stable_node *stable_node,
2510 bool max_page_sharing_bypass)
2513 * rmap won't find this mapping if we don't insert the
2514 * rmap_item in the right stable_node
2515 * duplicate. page_migration could break later if rmap breaks,
2516 * so we can as well crash here. We really need to check for
2517 * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
2518 * for other negative values as an underflow if detected here
2519 * for the first time (and not when decreasing rmap_hlist_len)
2520 * would be sign of memory corruption in the stable_node.
2522 BUG_ON(stable_node->rmap_hlist_len < 0);
2524 stable_node->rmap_hlist_len++;
2525 if (!max_page_sharing_bypass)
2526 /* possibly non fatal but unexpected overflow, only warn */
2527 WARN_ON_ONCE(stable_node->rmap_hlist_len >
2528 ksm_max_page_sharing);
2530 rmap_item->head = stable_node;
2531 rmap_item->address |= STABLE_FLAG;
2532 hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
2534 if (rmap_item->hlist.next) {
2535 ksm_pages_sharing++;
2536 lksm_slot_nr_merged++;
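/*
 * Sketch of the invariant kept above: each appended rmap_item bumps
 * rmap_hlist_len, and unless max_page_sharing_bypass is set (KSM forks)
 * the count is expected to stay within ksm_max_page_sharing, the same
 * bound __is_page_sharing_candidate() applies when picking a dup:
 *
 *	will_fit = stable_node->rmap_hlist_len + offset
 *			< ksm_max_page_sharing;
 */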
2542 #ifdef CONFIG_LKSM_FILTER
2543 static inline void stable_tree_append_region(struct rmap_item *rmap_item,
2544 struct stable_node *stable_node,
2545 struct lksm_region *region,
2546 bool max_page_sharing_bypass)
2548 if (region->type == LKSM_REGION_FILE1
2549 || region->type == LKSM_REGION_FILE2) {
2551 unsigned long offset =
2552 (rmap_item->address - rmap_item->base_addr) >> PAGE_SHIFT;
2553 if (unlikely(region->filter_cnt == 0)
2554 && region->len > SINGLE_FILTER_LEN
2555 && !region->filter) {
2556 region->filter = kcalloc(region->len, sizeof(long), GFP_KERNEL);
2557 if (!region->filter) {
2558 ksm_err("failed to allocate memory for filter");
2562 if (region->len > SINGLE_FILTER_LEN)
2563 ret = test_and_set_bit(offset, region->filter);
2565 ret = test_and_set_bit(offset, &region->single_filter);
2567 region->filter_cnt++;
2569 region->merge_cnt++;
2570 region_share[region->type]++;
2572 stable_tree_append(rmap_item, stable_node, max_page_sharing_bypass);
2574 #endif /* CONFIG_LKSM_FILTER */
2577 * cmp_and_merge_page - first see if page can be merged into the stable tree;
2578 * if not, compare checksum to previous and if it's the same, see if page can
2579 * be inserted into the unstable tree, or merged with a page already there and
2580 * both transferred to the stable tree.
2582 * @page: the page that we are searching identical page to.
2583 * @rmap_item: the reverse mapping into the virtual address of this page
2585 static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
2587 struct mm_struct *mm = rmap_item->mm;
2588 struct rmap_item *tree_rmap_item;
2589 struct page *tree_page = NULL;
2590 struct stable_node *stable_node;
2592 unsigned int checksum;
2594 bool max_page_sharing_bypass = false;
2596 stable_node = page_stable_node(page);
2598 if (stable_node->head != &migrate_nodes &&
2599 get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
2600 NUMA(stable_node->nid)) {
2601 stable_node_dup_del(stable_node);
2602 stable_node->head = &migrate_nodes;
2603 list_add(&stable_node->list, stable_node->head);
2605 if (stable_node->head != &migrate_nodes &&
2606 rmap_item->head == stable_node)
2609 * If it's a KSM fork, allow it to go over the sharing limit
2612 if (!is_page_sharing_candidate(stable_node))
2613 max_page_sharing_bypass = true;
2616 /* We first start with searching the page inside the stable tree */
2617 kpage = stable_tree_search(page);
2618 if (kpage == page && rmap_item->head == stable_node) {
2623 remove_rmap_item_from_tree(rmap_item);
2626 if (PTR_ERR(kpage) == -EBUSY)
2629 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
2632 * The page was successfully merged:
2633 * add its rmap_item to the stable tree.
2636 #ifdef CONFIG_LKSM_FILTER
2637 stable_tree_append_region(rmap_item, page_stable_node(kpage),
2638 ksm_scan.region, max_page_sharing_bypass);
2640 stable_tree_append(rmap_item, page_stable_node(kpage),
2641 max_page_sharing_bypass);
2650 * LKSM: In LKSM, KSM runs in an event-triggered manner.
2651 * Because of that, scanning is performed much less frequently.
2652 * We just skip calculation of the checksum for LKSM to catch scanning
2655 if (ksm_scan.scan_round < initial_round
2656 && !lksm_test_rmap_frozen(rmap_item)) {
2657 checksum = calc_checksum(page);
2658 if (rmap_item->oldchecksum != checksum) {
2659 rmap_item->oldchecksum = checksum;
2665 * Same checksum as an empty page. We attempt to merge it with the
2666 * appropriate zero page if the user enabled this via sysfs.
2668 if (ksm_use_zero_pages && (checksum == zero_checksum)) {
2669 struct vm_area_struct *vma;
2671 down_read(&mm->mmap_sem);
2672 vma = find_mergeable_vma(mm, rmap_item->address);
2674 err = try_to_merge_one_page(vma, page,
2675 ZERO_PAGE(rmap_item->address));
2678 * If the vma is out of date, we do not need to
2683 up_read(&mm->mmap_sem);
2685 * In case of failure, the page was not really empty, so we
2686 * need to continue. Otherwise we're done.
2692 unstable_tree_search_insert(rmap_item, page, &tree_page);
2693 if (tree_rmap_item) {
2695 #ifdef CONFIG_LKSM_FILTER
2696 struct lksm_region *tree_region = tree_rmap_item->region;
2698 kpage = try_to_merge_two_pages(rmap_item, page,
2699 tree_rmap_item, tree_page);
2701 * If both pages we tried to merge belong to the same compound
2702 * page, then we actually ended up increasing the reference
2703 * count of the same compound page twice, and split_huge_page
2705 * Here we set a flag if that happened, and we use it later to
2706 * try split_huge_page again. Since we call put_page right
2707 * afterwards, the reference count will be correct and
2708 * split_huge_page should succeed.
2710 split = PageTransCompound(page)
2711 && compound_head(page) == compound_head(tree_page);
2712 put_page(tree_page);
2715 * The pages were successfully merged: insert new
2716 * node in the stable tree and add both rmap_items.
2719 stable_node = stable_tree_insert(kpage);
2721 #ifdef CONFIG_LKSM_FILTER
2722 stable_tree_append_region(tree_rmap_item, stable_node,
2723 tree_region, false);
2724 stable_tree_append_region(rmap_item, stable_node,
2725 ksm_scan.region, false);
2727 stable_tree_append(tree_rmap_item, stable_node,
2729 stable_tree_append(rmap_item, stable_node,
2736 * If we fail to insert the page into the stable tree,
2737 * we will have 2 virtual addresses that are pointing
2738 * to a ksm page left outside the stable tree,
2739 * in which case we need to break_cow on both.
2742 break_cow(tree_rmap_item);
2743 break_cow(rmap_item);
2744 #ifdef CONFIG_LKSM_FILTER
2745 tree_rmap_item->region = tree_region;
2746 rmap_item->region = ksm_scan.region;
2751 * We are here if we tried to merge two pages and
2752 * failed because they both belonged to the same
2753 * compound page. We will split the page now, but no
2754 * merging will take place.
2755 * We do not want to add the cost of a full lock; if
2756 * the page is locked, it is better to skip it and
2757 * perhaps try again later.
2759 if (!trylock_page(page))
2761 split_huge_page(page);
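/*
 * The zero-page short cut above only runs when the administrator has
 * enabled it; assuming the usual KSM sysfs layout, userspace can toggle
 * it with something like (illustrative):
 *
 *	int fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "1", 1);
 *		close(fd);
 *	}
 */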
2767 static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
2768 struct rmap_item **rmap_list,
2771 struct rmap_item *rmap_item;
2773 while (*rmap_list) {
2774 rmap_item = *rmap_list;
2775 if ((rmap_item->address & PAGE_MASK) == addr) {
2776 if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN)
2777 && rmap_item->address & UNSTABLE_FLAG)
2778 lksm_set_rmap_frozen(rmap_item);
2780 lksm_clear_rmap_frozen(rmap_item);
2783 if (rmap_item->address > addr)
2785 *rmap_list = rmap_item->rmap_list;
2786 remove_rmap_item_from_tree(rmap_item);
2787 free_rmap_item(rmap_item);
2790 rmap_item = alloc_rmap_item();
2792 /* It has already been zeroed */
2793 rmap_item->mm = mm_slot->mm;
2794 rmap_item->address = addr;
2795 rmap_item->rmap_list = *rmap_list;
2796 #ifdef CONFIG_LKSM_FILTER
2797 rmap_item->region = ksm_scan.region;
2799 *rmap_list = rmap_item;
2800 if (lksm_test_mm_state(mm_slot, FROZEN_BIT))
2801 lksm_set_rmap_frozen(rmap_item);
2803 lksm_clear_rmap_frozen(rmap_item);
2809 * lksm_flush_removed_mm_list:
2810 * flush out, in a batch, the mm_slots removed by lksm_remove_mm_slot
2812 static void lksm_flush_removed_mm_list(void)
2814 struct mm_slot *head, *next, *slot;
2816 spin_lock(&ksm_mmlist_lock);
2817 head = list_first_entry_or_null(&ksm_scan.remove_mm_list,
2818 struct mm_slot, mm_list);
2820 spin_unlock(&ksm_mmlist_lock);
2824 list_del_init(&ksm_scan.remove_mm_list);
2825 spin_unlock(&ksm_mmlist_lock);
2827 if (!list_empty(&head->mm_list)) {
2828 list_for_each_entry_safe(slot, next, &head->mm_list, mm_list) {
2829 ksm_debug("slot(%p) will be freed", slot);
2830 list_del(&slot->mm_list);
2834 remove_trailing_rmap_items(slot, &slot->rmap_list);
2835 #ifdef CONFIG_LKSM_FILTER
2836 lksm_region_ref_list_release(slot);
2838 clear_bit(MMF_VM_MERGEABLE, &slot->mm->flags);
2844 ksm_debug("slot(%p) will be freed", head);
2847 remove_trailing_rmap_items(head, &head->rmap_list);
2848 clear_bit(MMF_VM_MERGEABLE, &head->mm->flags);
2854 * remove mm_slot from lists
2855 * LKSM defers releasing their resources until the end of scanning
2857 static inline void lksm_remove_mm_slot(struct mm_slot *slot)
2859 hash_del(&slot->link);
2860 list_del_init(&slot->scan_list);
2861 list_move(&slot->mm_list, &ksm_scan.remove_mm_list);
2862 if (!RB_EMPTY_NODE(&slot->ordered_list)) {
2863 rb_erase(&slot->ordered_list, &vips_list);
2864 RB_CLEAR_NODE(&slot->ordered_list);
2868 /* caller must hold ksm_mmlist_lock */
2869 static struct mm_slot *lksm_get_unscanned_mm_slot(struct mm_slot *slot)
2871 struct mm_slot *next;
2873 list_for_each_entry_safe_continue(slot, next, &ksm_scan_head.scan_list,
2875 if (ksm_test_exit(slot->mm)) {
2876 ksm_debug("slot:%p %p is moved to remove list", slot, slot->mm);
2877 if (lksm_test_mm_state(slot, KSM_MM_FROZEN))
2878 atomic_dec(&ksm_scan.nr_frozen);
2880 atomic_dec(&ksm_scan.nr_scannable);
2881 lksm_remove_mm_slot(slot);
2885 lksm_nr_scanned_slot++;
2892 /* caller must hold ksm_mmlist_lock */
2893 static void lksm_insert_mm_slot_ordered(struct mm_slot *slot)
2895 struct rb_root *root;
2896 struct rb_node **new;
2897 struct rb_node *parent;
2898 struct mm_slot *temp_slot;
2902 new = &root->rb_node;
2905 temp_slot = rb_entry(*new, struct mm_slot, ordered_list);
2908 if (slot->nr_merged > temp_slot->nr_merged)
2909 new = &parent->rb_left;
2911 new = &parent->rb_right;
2914 rb_link_node(&slot->ordered_list, parent, new);
2915 rb_insert_color(&slot->ordered_list, root);
2918 #ifdef CONFIG_LKSM_FILTER
2920 * Most vmas grow upward, except the stack.
2921 * The given value of size must be the same as orig's.
2924 static inline void __lksm_copy_filter
2925 (unsigned long *orig, unsigned long *newer, int size)
2928 *(newer++) = *(orig++);
2931 static inline void lksm_copy_filter
2932 (struct lksm_region *region, unsigned long *filter)
2934 if (region->len > SINGLE_FILTER_LEN) {
2936 __lksm_copy_filter(region->filter, filter, region->len);
2938 __lksm_copy_filter(&region->single_filter, filter, region->len);
2941 static struct vm_area_struct *lksm_find_next_vma
2942 (struct mm_struct *mm, struct mm_slot *slot)
2944 struct vm_area_struct *vma;
2945 struct lksm_region *region;
2947 if (ksm_test_exit(mm))
2950 vma = find_vma(mm, ksm_scan.address);
2951 for (; vma; vma = vma->vm_next) {
2952 if (!(vma->vm_flags & VM_MERGEABLE))
2954 if (ksm_scan.address < vma->vm_start)
2955 ksm_scan.address = vma->vm_start;
2956 if (!vma->anon_vma) {
2957 ksm_scan.address = vma->vm_end;
2961 if (ksm_scan.cached_vma == vma)
2962 region = ksm_scan.region;
2964 region = lksm_find_region(vma);
2965 ksm_scan.cached_vma = vma;
2966 ksm_scan.vma_base_addr = vma->vm_start;
2969 if (!region || region->type == LKSM_REGION_CONFLICT)
2970 region = &unknown_region;
2971 else if (region->type != LKSM_REGION_HEAP
2972 && region->type != LKSM_REGION_CONFLICT
2973 && region->type != LKSM_REGION_UNKNOWN) {
2974 int size = lksm_region_size(vma->vm_start, vma->vm_end);
2975 int len = (size > BITS_PER_LONG) ? lksm_bitmap_size(size)
2976 : SINGLE_FILTER_LEN;
2978 if (len > SINGLE_FILTER_LEN && unlikely(region->len != len)) {
2980 if (region->conflict > 1) {
2981 region->type = LKSM_REGION_CONFLICT;
2982 if (region->len > SINGLE_FILTER_LEN)
2983 kfree(region->filter);
2984 region->filter = NULL;
2985 region->len = SINGLE_FILTER_LEN;
2986 /* conflicted regions will be unfiltered */
2987 region = &unknown_region;
2988 ksm_debug("the region is frequently conflicted. break.");
2991 if (region->len < len) {
2992 unsigned long *filter;
2993 ksm_debug("size of region(%p) is changed: %d -> %d (size: %d)",
2994 region, region->len, len, size);
2995 ksm_debug("region-%d type: %d vma:%p", region->ino, region->type, vma);
2996 filter = kcalloc(len, sizeof(long), GFP_KERNEL);
2998 ksm_err("failed to allocate memory for filter");
3001 if (region->filter_cnt > 0)
3002 lksm_copy_filter(region, filter);
3003 if (region->len > SINGLE_FILTER_LEN)
3004 kfree(region->filter);
3005 region->filter = filter;
3011 if (ksm_scan.region != region)
3012 ksm_scan.region = region;
3018 static inline unsigned long lksm_get_next_filtered_address
3019 (struct lksm_region *region, unsigned long addr, unsigned long base)
3021 unsigned long next_offset, curr_offset, nbits;
3023 curr_offset = (addr - base) >> PAGE_SHIFT;
3024 nbits = (region->len == 0) ? BITS_PER_LONG :
3025 (region->len << (6 + PAGE_SHIFT));
3026 if (region->len > SINGLE_FILTER_LEN)
3027 next_offset = find_next_bit(region->filter, nbits, curr_offset);
3029 next_offset = find_next_bit(&region->single_filter,
3030 nbits, curr_offset);
3032 return (next_offset << PAGE_SHIFT) + base;
3035 #define lksm_region_skipped(region) \
3036 (region->len > 0 && !region->filter)
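/*
 * The per-region filter is a bitmap with one bit per page offset inside
 * the region: offsets that produced a merge are set in
 * stable_tree_append_region(), and later scans only revisit set bits via
 * lksm_get_next_filtered_address().  The offset math, as used above, is
 * simply:
 *
 *	offset = (address - vma_base_address) >> PAGE_SHIFT;
 *
 * Small regions keep the bits in the inline single_filter word, larger
 * ones in the kcalloc()ed region->filter array.
 */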
3038 static struct rmap_item *__scan_next_rmap_item(struct page **page,
3039 struct mm_struct *mm, struct mm_slot *slot)
3041 struct vm_area_struct *vma;
3042 struct rmap_item *rmap_item;
3047 vma = lksm_find_next_vma(mm, slot);
3049 while (vma && ksm_scan.address < vma->vm_end) {
3050 if (ksm_test_exit(mm)) {
3054 if (!lksm_test_mm_state(slot, KSM_MM_NEWCOMER)
3055 && !lksm_test_mm_state(slot, KSM_MM_FROZEN)
3056 && ksm_scan.region->type != LKSM_REGION_HEAP
3057 && ksm_scan.region->type != LKSM_REGION_UNKNOWN
3058 && lksm_region_mature(ksm_scan.scan_round, ksm_scan.region)
3059 && !lksm_region_skipped(ksm_scan.region)) {
3060 if (ksm_scan.region->filter_cnt > 0) {
3061 addr = lksm_get_next_filtered_address(ksm_scan.region,
3062 ksm_scan.address, ksm_scan.vma_base_addr);
3063 ksm_scan.address = addr;
3064 if (ksm_scan.address >= vma->vm_end)
3066 if (ksm_scan.address < vma->vm_start) {
3067 ksm_debug("address(%lu) is less than vm_start(%lu)",
3068 ksm_scan.address, vma->vm_start);
3072 ksm_scan.address = vma->vm_end;
3076 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
3077 if (IS_ERR_OR_NULL(*page)) {
3078 ksm_scan.address += PAGE_SIZE;
3082 if (PageAnon(*page)) {
3083 flush_anon_page(vma, *page, ksm_scan.address);
3084 flush_dcache_page(*page);
3085 rmap_item = get_next_rmap_item(slot,
3086 ksm_scan.rmap_list, ksm_scan.address);
3088 ksm_scan.rmap_list =
3089 &rmap_item->rmap_list;
3090 ksm_scan.address += PAGE_SIZE;
3096 ksm_scan.address += PAGE_SIZE;
3101 /* clean-up a scanned region */
3102 ksm_scan.region = NULL;
3103 ksm_scan.cached_vma = NULL;
3104 ksm_scan.vma_base_addr = 0;
3106 return NULL; /* no scannable rmap item */
3109 #else /* CONFIG_LKSM_FILTER */
3111 static struct rmap_item *__scan_next_rmap_item(struct page **page,
3112 struct mm_struct *mm, struct mm_slot *slot)
3114 struct vm_area_struct *vma;
3115 struct rmap_item *rmap_item;
3117 if (ksm_test_exit(mm))
3120 vma = find_vma(mm, ksm_scan.address);
3122 for (; vma; vma = vma->vm_next) {
3123 if (!(vma->vm_flags & VM_MERGEABLE))
3125 if (ksm_scan.address < vma->vm_start)
3126 ksm_scan.address = vma->vm_start;
3128 ksm_scan.address = vma->vm_end;
3130 while (ksm_scan.address < vma->vm_end) {
3131 if (ksm_test_exit(mm))
3133 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
3134 if (IS_ERR_OR_NULL(*page)) {
3135 ksm_scan.address += PAGE_SIZE;
3139 if (PageAnon(*page)) {
3140 flush_anon_page(vma, *page, ksm_scan.address);
3141 flush_dcache_page(*page);
3142 rmap_item = get_next_rmap_item(slot,
3143 ksm_scan.rmap_list, ksm_scan.address);
3145 ksm_scan.rmap_list =
3146 &rmap_item->rmap_list;
3147 ksm_scan.address += PAGE_SIZE;
3153 ksm_scan.address += PAGE_SIZE;
3161 #endif /* CONFIG_LKSM_FILTER */
3163 static inline int sum_merge_win(int merge_win[], int len)
3167 for (i = 0; i < len; i++)
3168 sum += merge_win[i];
3172 static inline int lksm_account_mm_slot_nr_merge(struct mm_slot *slot, int nr_merged)
3174 slot->nr_merged_win[slot->merge_idx++] = nr_merged;
3175 if (slot->merge_idx == MERGE_WIN)
3176 slot->merge_idx = 0;
3177 slot->nr_merged = sum_merge_win(slot->nr_merged_win, MERGE_WIN);
3178 return slot->nr_merged;
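/*
 * nr_merged_win[] is a small ring buffer of MERGE_WIN per-scan merge
 * counts, so slot->nr_merged only ever reflects the most recent
 * MERGE_WIN scans.  For example (assuming MERGE_WIN == 3 purely for
 * illustration): scans merging 10, 0 and 4 pages yield nr_merged = 14;
 * a fourth scan merging 2 pages overwrites the oldest sample and yields
 * nr_merged = 6.
 */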
3181 static struct rmap_item *scan_get_next_rmap_item(struct page **page)
3183 struct mm_struct *mm;
3184 struct mm_slot *slot;
3185 struct rmap_item *rmap_item;
3187 if (list_empty(&ksm_scan_head.scan_list))
3190 slot = ksm_scan.mm_slot;
3191 if (slot == &ksm_scan_head) {
3193 * A number of pages can hang around indefinitely on per-cpu
3194 * pagevecs, raised page count preventing write_protect_page
3195 * from merging them. Though it doesn't really matter much,
3196 * it is puzzling to see some stuck in pages_volatile until
3197 * other activity jostles them out, and they also prevented
3198 * LTP's KSM test from succeeding deterministically; so drain
3199 * them here (here rather than on entry to ksm_do_scan(),
3200 * so we don't IPI too often when pages_to_scan is set low).
3202 lru_add_drain_all();
3204 if (ksm_scan.scan_round < ksm_crawl_round) {
3205 ksm_scan.scan_round = ksm_crawl_round;
3206 root_unstable_tree[LKSM_NODE_ID] = RB_ROOT;
3209 spin_lock(&ksm_mmlist_lock);
3210 slot = lksm_get_unscanned_mm_slot(slot);
3211 ksm_scan.mm_slot = slot;
3212 spin_unlock(&ksm_mmlist_lock);
3215 * Although we tested list_empty() above, a racing __ksm_exit
3216 * of the last mm on the list may have removed it since then.
3218 if (slot == &ksm_scan_head)
3221 slot->elapsed = get_jiffies_64();
3223 ksm_scan.address = 0;
3224 ksm_scan.rmap_list = &slot->rmap_list;
3227 if (unlikely(!ksm_scan.rmap_list))
3228 ksm_scan.rmap_list = &slot->rmap_list;
3232 down_read(&mm->mmap_sem);
3233 rmap_item = __scan_next_rmap_item(page, mm, slot);
3237 up_read(&mm->mmap_sem);
3241 if (ksm_test_exit(mm)) {
3242 ksm_scan.address = 0;
3243 ksm_scan.rmap_list = &slot->rmap_list;
3246 * Nuke all the rmap_items that are above this current rmap:
3247 * because there were no VM_MERGEABLE vmas with such addresses.
3249 remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
3251 spin_lock(&ksm_mmlist_lock);
3252 ksm_scan.mm_slot = lksm_get_unscanned_mm_slot(slot);
3254 if (ksm_scan.address == 0) {
3256 * We've completed a full scan of all vmas, holding mmap_sem
3257 * throughout, and found no VM_MERGEABLE: so do the same as
3258 * __ksm_exit does to remove this mm from all our lists now.
3259 * This applies either when cleaning up after __ksm_exit
3260 * (but beware: we can reach here even before __ksm_exit),
3261 * or when all VM_MERGEABLE areas have been unmapped (and
3262 * mmap_sem then protects against race with MADV_MERGEABLE).
3264 up_read(&mm->mmap_sem);
3265 if (lksm_test_mm_state(slot, KSM_MM_FROZEN))
3266 atomic_dec(&ksm_scan.nr_frozen);
3268 atomic_dec(&ksm_scan.nr_scannable);
3269 lksm_remove_mm_slot(slot);
3270 spin_unlock(&ksm_mmlist_lock);
3272 lksm_slot_nr_merged = 0;
3273 lksm_slot_nr_broken = 0;
3275 int newcomer = 0, frozen = 0;
3277 up_read(&mm->mmap_sem);
3279 if (lksm_test_mm_state(slot, KSM_MM_NEWCOMER)) {
3280 lksm_clear_mm_state(slot, KSM_MM_NEWCOMER);
3283 if (lksm_test_mm_state(slot, KSM_MM_FROZEN)) {
3284 lksm_clear_mm_state(slot, KSM_MM_FROZEN);
3286 atomic_dec(&ksm_scan.nr_frozen);
3288 atomic_dec(&ksm_scan.nr_scannable);
3289 lksm_set_mm_state(slot, KSM_MM_SCANNED);
3291 list_del_init(&slot->scan_list);
3292 if (!RB_EMPTY_NODE(&slot->ordered_list)) {
3293 rb_erase(&slot->ordered_list, &vips_list);
3294 RB_CLEAR_NODE(&slot->ordered_list);
3296 if (lksm_account_mm_slot_nr_merge(slot, lksm_slot_nr_merged))
3297 lksm_insert_mm_slot_ordered(slot);
3299 slot->elapsed = get_jiffies_64() - slot->elapsed;
3300 spin_unlock(&ksm_mmlist_lock);
3302 if (ksm_test_exit(slot->mm))
3303 ksm_debug("slot(%p:%p) is exited", slot, slot->mm);
3305 ksm_debug("slot-%d(%s) %d merged %d scanned %lu pages "
3306 "(sum: %d) - (%s, %s) takes %u msecs (nr_scannable: %d)",
3307 task_pid_nr(slot->mm->owner), slot->mm->owner->comm,
3308 lksm_slot_nr_merged - lksm_slot_nr_broken, slot->nr_scans,
3309 slot->scanning_size, slot->nr_merged,
3310 newcomer ? "new" : "old",
3311 frozen ? "frozen" : "normal",
3312 jiffies_to_msecs(slot->elapsed),
3313 atomic_read(&ksm_scan.nr_scannable));
3315 lksm_slot_nr_merged = 0;
3316 lksm_slot_nr_broken = 0;
3319 /* Repeat until we've completed scanning the whole list */
3320 slot = ksm_scan.mm_slot;
3321 if (slot != &ksm_scan_head) {
3322 slot->elapsed = get_jiffies_64();
3330 * ksm_do_scan - the ksm scanner main worker function.
3331 * @scan_npages: number of pages we want to scan before we return.
3333 static int ksm_do_scan(unsigned int scan_npages)
3335 struct rmap_item *rmap_item;
3336 struct page *uninitialized_var(page);
3338 while (scan_npages-- && likely(!freezing(current))) {
3340 rmap_item = scan_get_next_rmap_item(&page);
3342 return 1; /* need sleep */
3343 cmp_and_merge_page(page, rmap_item);
3349 static int ksmd_should_run(void)
3351 return (ksm_run & KSM_RUN_MERGE) &&
3352 !list_empty(&ksm_scan_head.scan_list);
3355 static void lksm_scan_wrapup_wait(void)
3357 if (ksm_scan.scan_mode == LKSM_SCAN_PARTIAL) {
3358 if (ksm_thread_pages_to_scan != lksm_default_pages_to_scan)
3359 ksm_thread_pages_to_scan = lksm_default_pages_to_scan;
3360 } else if (ksm_scan.scan_mode == LKSM_SCAN_FULL)
3361 ksm_scan.nr_full_scan++;
3365 lksm_nr_scanned_slot = 0;
3367 ksm_scan.scan_mode = LKSM_SCAN_NONE;
3368 if (ksm_run & KSM_RUN_ONESHOT)
3369 atomic_set(&ksm_one_shot_scanning, LKSM_SCAN_NONE);
3371 lksm_clear_scan_state(ksm_state);
3373 wait_event_freezable(ksm_thread_wait,
3374 (lksm_check_scan_state(ksm_state) && ksmd_should_run())
3375 || kthread_should_stop());
3378 static int lksm_scan_thread(void *nothing)
3380 unsigned long begin, elapsed;
3381 unsigned int sleep_ms;
3382 int need_to_sleep = 0;
3385 set_user_nice(current, 5);
3387 ksm_debug("KSM_SCAND pid: %d", task_pid_nr(current));
3388 while (!kthread_should_stop()) {
3389 mutex_lock(&ksm_thread_mutex);
3390 wait_while_offlining();
3391 if (ksmd_should_run())
3392 need_to_sleep = ksm_do_scan(ksm_thread_pages_to_scan);
3393 mutex_unlock(&ksm_thread_mutex);
3397 if (need_to_sleep) {
3398 if (!ksmd_should_run()) {
3399 /* if no one is left in the scanning list, go to sleep for a while */
3400 lksm_flush_removed_mm_list();
3402 elapsed = get_jiffies_64() - begin;
3403 lksm_last_scan_time = elapsed;
3404 lksm_proc_scan_time = elapsed / lksm_nr_scanned_slot;
3406 ksm_debug("Scanning(%d) takes %u ms, %d/%d-pages "
3407 "are merged/broken (nr_scannable: %d, nr_frozen: %d)",
3408 lksm_nr_scanned_slot,
3409 jiffies_to_msecs(lksm_last_scan_time),
3410 lksm_nr_merged, lksm_nr_broken,
3411 atomic_read(&ksm_scan.nr_scannable),
3412 atomic_read(&ksm_scan.nr_frozen));
3414 lksm_scan_wrapup_wait();
3416 ksm_debug("Start %lu-th scanning: nr_scannable(%d) "
3418 ksm_scan.scan_round,
3419 atomic_read(&ksm_scan.nr_scannable),
3420 atomic_read(&ksm_scan.nr_frozen));
3422 if (ksm_scan.scan_mode == LKSM_SCAN_PARTIAL) {
3423 if (lksm_boosted_pages_to_scan !=
3424 ksm_thread_pages_to_scan) {
3425 ksm_thread_pages_to_scan = lksm_boosted_pages_to_scan;
3426 ksm_debug("set pages_to_scan to %u",
3427 lksm_boosted_pages_to_scan);
3430 begin = get_jiffies_64();
3432 /* new scanning targets are coming */
3433 sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
3434 wait_event_interruptible_timeout(ksm_iter_wait,
3435 sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
3436 msecs_to_jiffies(sleep_ms));
3439 } else if (ksmd_should_run()) {
3441 sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
3442 wait_event_interruptible_timeout(ksm_iter_wait,
3443 sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
3444 msecs_to_jiffies(sleep_ms));
3446 /* wait for activating ksm */
3447 if (likely(ksm_scan.scan_round > 0)) {
3448 lksm_flush_removed_mm_list();
3450 elapsed = get_jiffies_64() - begin;
3451 lksm_last_scan_time = elapsed;
3452 lksm_proc_scan_time = elapsed / lksm_nr_scanned_slot;
3454 ksm_debug("Scanning(%d) takes %u ms, %d/%d-pages are merged/broken",
3455 lksm_nr_scanned_slot, jiffies_to_msecs(lksm_last_scan_time),
3456 lksm_nr_merged, lksm_nr_broken);
3458 lksm_scan_wrapup_wait();
3460 wait_event_freezable(ksm_thread_wait,
3461 (lksm_check_scan_state(ksm_state) && ksmd_should_run())
3462 || kthread_should_stop());
3464 ksm_debug("Start %lu-th scanning: nr_scannable(%d) nr_frozen(%d)",
3465 ksm_scan.scan_round,
3466 atomic_read(&ksm_scan.nr_scannable),
3467 atomic_read(&ksm_scan.nr_frozen));
3469 if (ksm_scan.scan_mode == LKSM_SCAN_PARTIAL) {
3470 ksm_thread_pages_to_scan = lksm_boosted_pages_to_scan;
3471 ksm_debug("set pages_to_scan to %u",
3472 lksm_boosted_pages_to_scan);
3474 begin = get_jiffies_64();
3481 * lksm crawler declaration & definition part
3483 static struct task_struct *ksm_crawld;
3485 LIST_HEAD(frozen_task_list);
3486 DEFINE_SPINLOCK(frozen_task_lock);
3492 static atomic_t crawl_state;
3495 LKSM_TASK_SLOT_NONE = 0,
3496 LKSM_TASK_SLOT_REMOVED,
3499 static inline int lksm_count_and_clear_mm_slots
3500 (struct mm_slot *head, unsigned long *delay)
3503 struct mm_slot *slot;
3505 spin_lock(&ksm_mmlist_lock);
3506 list_for_each_entry(slot, &head->mm_list, mm_list) {
3507 if (list_empty(&slot->scan_list)) {
3508 lksm_clear_mm_state(slot, KSM_MM_SCANNED);
3510 slot->scanning_size = get_mm_counter(slot->mm, MM_ANONPAGES);
3511 list_add_tail(&slot->scan_list, &ksm_scan_head.scan_list);
3512 *delay += slot->elapsed;
3516 spin_unlock(&ksm_mmlist_lock);
3520 static int lksm_prepare_frozen_scan(void)
3522 int nr_frozen = 0, nr_added = 0, err;
3523 struct task_struct *task;
3524 struct task_slot *task_slot;
3525 struct mm_struct *mm;
3527 spin_lock(&frozen_task_lock);
3528 nr_frozen = atomic_read(&ksm_scan.nr_frozen);
3529 if (list_empty(&frozen_task_list)) {
3530 spin_unlock(&frozen_task_lock);
3534 ksm_debug("prepare frozen scan: round(%lu)", ksm_crawl_round);
3535 task_slot = list_first_entry_or_null(&frozen_task_list,
3536 struct task_slot, list);
3538 list_del(&task_slot->list);
3539 hash_del(&task_slot->hlist);
3540 spin_unlock(&frozen_task_lock);
3542 task = task_slot->task;
3543 if (ksm_run & KSM_RUN_UNMERGE) {
3544 put_task_struct(task);
3545 free_task_slot(task_slot);
3546 goto clean_up_abort;
3549 mm = get_task_mm(task);
3551 if (!mm || ksm_test_exit(mm))
3555 ksm_join_write_lock(mm, task_slot->frozen, err);
3561 free_task_slot(task_slot);
3562 put_task_struct(task);
3568 spin_lock(&frozen_task_lock);
3569 task_slot = list_first_entry_or_null(&frozen_task_list,
3570 struct task_slot, list);
3572 spin_unlock(&frozen_task_lock);
3573 atomic_add(nr_added, &ksm_scan.nr_frozen);
3575 return nr_added + nr_frozen;
3578 spin_lock(&frozen_task_lock);
3579 task_slot = list_first_entry_or_null(&frozen_task_list,
3580 struct task_slot, list);
3582 list_del(&task_slot->list);
3583 hash_del(&task_slot->hlist);
3584 spin_unlock(&frozen_task_lock);
3586 task = task_slot->task;
3587 put_task_struct(task);
3588 free_task_slot(task_slot);
3590 spin_lock(&frozen_task_lock);
3591 task_slot = list_first_entry_or_null(&frozen_task_list,
3592 struct task_slot, list);
3594 spin_unlock(&frozen_task_lock);
3599 /* this function makes a list of new processes and vip processes */
3600 static int lksm_prepare_partial_scan(void)
3602 int ret, nr_frozen = 0, nr_added = 0, nr_scannable = 0;
3603 unsigned long delay = 0;
3604 unsigned long fault_cnt = 0;
3605 struct task_struct *task;
3606 struct mm_struct *mm;
3607 struct mm_slot *mm_slot;
3608 struct list_head recheck_list;
3609 struct rb_node *node;
3611 ksm_debug("prepare partial scan: round(%lu)", ksm_crawl_round);
3612 INIT_LIST_HEAD(&recheck_list);
3614 nr_frozen = lksm_prepare_frozen_scan();
3617 for_each_process(task) {
3618 if (task == current || task_pid_nr(task) == 0
3619 || check_short_task(task))
3621 if (ksm_run & KSM_RUN_UNMERGE) {
3626 mm = get_task_mm(task);
3629 ksm_join_write_lock(mm, KSM_TASK_UNFROZEN, ret);
3636 if (nr_added + nr_frozen >= lksm_max_vips) {
3637 ksm_debug("nr_scannable(%d) already fulfilled skip vips",
3638 nr_added + nr_frozen);
3642 spin_lock(&ksm_mmlist_lock);
3643 node = rb_first(&vips_list);
3645 ksm_debug("empty vip list");
3646 spin_unlock(&ksm_mmlist_lock);
3649 mm_slot = rb_entry(node, struct mm_slot, ordered_list);
3650 while (nr_scannable + nr_added + nr_frozen < lksm_max_vips) {
3651 if (ksm_run & KSM_RUN_UNMERGE) {
3652 spin_unlock(&ksm_mmlist_lock);
3658 if (ksm_test_exit(mm_slot->mm)) {
3659 if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED))
3660 atomic_dec(&ksm_scan.nr_scannable);
3661 lksm_remove_mm_slot(mm_slot);
3664 if (!lksm_test_mm_state(mm_slot, KSM_MM_LISTED))
3667 /* pruning by fault count */
3668 fault_cnt = mm_slot->mm->owner->maj_flt + mm_slot->mm->owner->min_flt;
3669 if (mm_slot->fault_cnt == fault_cnt)
3672 mm_slot->fault_cnt = fault_cnt;
3673 mm_slot->scanning_size = get_mm_counter(mm_slot->mm, MM_ANONPAGES);
3674 mm_slot->nr_scans = 0;
3675 delay += mm_slot->elapsed;
3676 ksm_debug("slot(nr_merged: %d, scanning_size: %lu) task(%s)",
3677 mm_slot->nr_merged, mm_slot->scanning_size,
3678 mm_slot->mm->owner->comm);
3679 list_move_tail(&mm_slot->scan_list, &recheck_list);
3680 lksm_clear_mm_state(mm_slot, KSM_MM_SCANNED);
3681 #ifdef CONFIG_LKSM_FILTER
3682 /* to prevent mm_slot termination on __ksm_exit */
3683 lksm_set_mm_state(mm_slot, KSM_MM_PREPARED);
3688 node = rb_next(node);
3691 mm_slot = rb_entry(node, struct mm_slot, ordered_list);
3693 spin_unlock(&ksm_mmlist_lock);
3694 #ifdef CONFIG_LKSM_FILTER
3695 list_for_each_entry(mm_slot, &recheck_list, scan_list) {
3696 if (ksm_test_exit(mm_slot->mm))
3698 mm_slot->nr_scans = 0;
3699 /* check new maps */
3700 down_read(&mm_slot->mm->mmap_sem);
3701 ksm_join(mm_slot->mm, KSM_TASK_UNFROZEN);
3702 up_read(&mm_slot->mm->mmap_sem);
3706 spin_lock(&ksm_mmlist_lock);
3707 if (!list_empty(&recheck_list)) {
3708 #ifdef CONFIG_LKSM_FILTER
3709 list_for_each_entry(mm_slot, &recheck_list, scan_list)
3710 lksm_clear_mm_state(mm_slot, KSM_MM_PREPARED);
3712 list_splice(&recheck_list, &ksm_scan_head.scan_list);
3714 spin_unlock(&ksm_mmlist_lock);
3716 ksm_scan.scan_mode = LKSM_SCAN_PARTIAL;
3719 atomic_add(nr_scannable + nr_added, &ksm_scan.nr_scannable);
3720 ksm_debug("nr_frozen: %d nr_added: %d nr_scannable: %d - %d",
3721 nr_frozen, nr_added, nr_scannable, atomic_read(&ksm_scan.nr_scannable));
3723 return nr_frozen + nr_added + nr_scannable;
3726 static int lksm_prepare_full_scan(unsigned long *next_fullscan)
3728 int ret, nr_frozen = 0, nr_added = 0, nr_scannable = 0, nr_target;
3729 unsigned long delay = 0;
3730 struct task_struct *task;
3731 struct mm_struct *mm;
3733 ksm_debug("prepare full scan: round(%lu)", ksm_crawl_round);
3735 nr_frozen = lksm_prepare_frozen_scan();
3737 for_each_process(task) {
3738 if (task == current || task_pid_nr(task) == 0
3739 || check_short_task(task))
3741 if (ksm_run & KSM_RUN_UNMERGE) {
3746 mm = get_task_mm(task);
3749 ksm_join_write_lock(mm, KSM_TASK_UNFROZEN, ret);
3755 nr_scannable = lksm_count_and_clear_mm_slots(&ksm_mm_head, &delay);
3756 nr_target = nr_scannable + nr_added + nr_frozen;
3758 /* calculate crawler's sleep time */
3759 delay += msecs_to_jiffies((nr_frozen + nr_added) * lksm_proc_scan_time);
3760 *next_fullscan = jiffies + delay + msecs_to_jiffies(full_scan_interval);
3762 ksm_scan.scan_mode = LKSM_SCAN_FULL;
3765 atomic_add(nr_scannable + nr_added, &ksm_scan.nr_scannable);
3766 ksm_debug("nr_frozen: %d nr_added: %d nr_scannable: %d - %d",
3767 nr_frozen, nr_added, nr_scannable,
3768 atomic_read(&ksm_scan.nr_scannable));
3773 static int lksm_do_wait_userspace_event(unsigned long sleep_time)
3775 wait_event_freezable(ksm_crawl_wait,
3776 kthread_should_stop() ||
3777 (atomic_read(&ksm_one_shot_scanning) > 0));
3778 return atomic_read(&ksm_one_shot_scanning);
3781 static int lksm_do_wait_frozen_event(unsigned long sleep_time)
3785 spin_lock_irq(&frozen_task_lock);
3786 if (list_empty(&frozen_task_list))
3787 /* wait until candidate list is filled */
3788 wait_event_interruptible_lock_irq_timeout(
3790 kthread_should_stop()
3791 || !list_empty(&frozen_task_list)
3792 || !list_empty(&ksm_scan_head.scan_list),
3793 frozen_task_lock, sleep_time);
3795 if (!list_empty(&frozen_task_list) ||
3796 !list_empty(&ksm_scan_head.scan_list))
3798 spin_unlock_irq(&frozen_task_lock);
3803 static inline void lksm_wake_up_scan_thread(void)
3805 ksm_debug("wake up lksm_scan_thread");
3806 lksm_set_scan_state(ksm_state);
3807 wake_up(&ksm_thread_wait);
3810 #define LKSM_CRAWL_FROZEN_EVENT_WAIT 100 /* 100ms */
3812 static void lksm_do_crawl_once
3813 (unsigned long *next_fscan, unsigned long sleep_time)
3818 /* crawler thread waits for a trigger event from userspace */
3819 scan_mode = lksm_do_wait_userspace_event(sleep_time);
3821 if (scan_mode == LKSM_SCAN_PARTIAL) {
3822 atomic_set(&crawl_state, KSM_CRAWL_RUN);
3823 msleep(LKSM_CRAWL_FROZEN_EVENT_WAIT);
3824 nr_added = lksm_prepare_partial_scan();
3825 } else if (scan_mode == LKSM_SCAN_FULL) {
3826 atomic_set(&crawl_state, KSM_CRAWL_RUN);
3827 nr_added = lksm_prepare_full_scan(next_fscan);
3833 lksm_wake_up_scan_thread();
3835 ksm_debug("No one can be scanned!");
3836 atomic_set(&ksm_one_shot_scanning, LKSM_SCAN_NONE);
3838 atomic_set(&crawl_state, KSM_CRAWL_SLEEP);
3841 static void lksm_do_crawl_periodic
3842 (unsigned long *next_fscan, unsigned long sleep_time)
3846 if (time_is_before_eq_jiffies(*next_fscan)) {
3847 atomic_set(&crawl_state, KSM_CRAWL_RUN);
3848 nr_added = lksm_prepare_full_scan(next_fscan);
3849 } else if (lksm_do_wait_frozen_event(sleep_time)) {
3850 atomic_set(&crawl_state, KSM_CRAWL_RUN);
3851 msleep(LKSM_CRAWL_FROZEN_EVENT_WAIT);
3852 nr_added = lksm_prepare_partial_scan();
3858 lksm_wake_up_scan_thread();
3859 atomic_set(&crawl_state, KSM_CRAWL_SLEEP);
3862 static int lksm_crawl_thread(void *data)
3865 unsigned long next_fscan = jiffies; /* next full scan */
3866 unsigned long sleep_time = crawler_sleep;
3869 set_user_nice(current, 5);
3871 ksm_debug("KSM_CRAWLD pid: %d", task_pid_nr(current));
3872 wait_event_freezable(ksm_crawl_wait,
3873 kthread_should_stop() || ksm_run & KSM_RUN_MERGE);
3875 while (!kthread_should_stop() && ksm_crawl_round < initial_round) {
3879 if ((ksm_run & KSM_RUN_MERGE) &&
3880 !lksm_check_scan_state(ksm_state) &&
3881 time_is_before_eq_jiffies(next_fscan)) {
3882 nr_added = lksm_prepare_full_scan(&next_fscan);
3884 lksm_wake_up_scan_thread();
3887 next_fscan = jiffies + sleep_time;
3890 wait_event_interruptible_timeout(ksm_crawl_wait,
3891 kthread_should_stop() || !lksm_check_scan_state(ksm_state),
3895 /* initialization loop done */
3896 full_scan_interval = DEFAULT_FULL_SCAN_INTERVAL;
3897 next_fscan = jiffies + msecs_to_jiffies(full_scan_interval);
3898 atomic_set(&crawl_state, KSM_CRAWL_SLEEP);
3900 /* normal operation loop */
3901 while (!kthread_should_stop()) {
3902 if (ksm_run & KSM_RUN_ONESHOT) {
3903 if (!lksm_check_scan_state(ksm_state))
3904 lksm_do_crawl_once(&next_fscan, sleep_time);
3906 /* wait until scanning done */
3907 wait_event_freezable(ksm_crawl_wait,
3908 !lksm_check_scan_state(ksm_state)
3909 || kthread_should_stop());
3910 } else if (ksm_run & KSM_RUN_MERGE) {
3911 if (!lksm_check_scan_state(ksm_state))
3912 lksm_do_crawl_periodic(&next_fscan, sleep_time);
3914 /* wait until scanning done */
3915 wait_event_interruptible_timeout(ksm_crawl_wait,
3916 !lksm_check_scan_state(ksm_state)
3917 || kthread_should_stop(),
3921 ksm_debug("ksm is not activated");
3922 wait_event_freezable(ksm_crawl_wait,
3923 kthread_should_stop() || (ksm_run & KSM_RUN_MERGE));
3930 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
3931 unsigned long end, int advice, unsigned long *vm_flags)
3933 struct mm_struct *mm = vma->vm_mm;
3937 case MADV_MERGEABLE:
3939 * Be somewhat over-protective for now!
3941 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
3942 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
3943 VM_HUGETLB | VM_MIXEDMAP))
3944 return 0; /* just ignore the advice */
3946 if (vma_is_dax(vma))
3950 if (*vm_flags & VM_SAO)
3954 if (*vm_flags & VM_SPARC_ADI)
3958 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
3959 err = __ksm_enter(mm, KSM_TASK_UNFROZEN);
3964 *vm_flags |= VM_MERGEABLE;
3967 case MADV_UNMERGEABLE:
3968 if (!(*vm_flags & VM_MERGEABLE))
3969 return 0; /* just ignore the advice */
3971 if (vma->anon_vma) {
3972 err = unmerge_ksm_pages(vma, start, end);
3977 *vm_flags &= ~VM_MERGEABLE;
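/*
 * Userspace opt-in sketch (illustrative): an anonymous mapping becomes
 * eligible for merging with MADV_MERGEABLE and can be withdrawn again
 * with MADV_UNMERGEABLE, which is what the two cases above implement:
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(buf, len, MADV_MERGEABLE);
 *	...
 *	madvise(buf, len, MADV_UNMERGEABLE);
 */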
3984 static struct mm_slot *__ksm_enter_alloc_slot(struct mm_struct *mm, int frozen)
3986 struct mm_slot *mm_slot;
3988 mm_slot = alloc_mm_slot();
3992 if (frozen == KSM_TASK_FROZEN)
3993 lksm_set_mm_state(mm_slot, KSM_MM_FROZEN | KSM_MM_NEWCOMER);
3995 lksm_set_mm_state(mm_slot, KSM_MM_LISTED | KSM_MM_NEWCOMER);
3997 lksm_clear_mm_state(mm_slot, KSM_MM_SCANNED);
3998 RB_CLEAR_NODE(&mm_slot->ordered_list);
3999 mm_slot->fault_cnt = mm->owner->maj_flt + mm->owner->min_flt;
4000 mm_slot->scanning_size = get_mm_counter(mm, MM_ANONPAGES);
4002 spin_lock(&ksm_mmlist_lock);
4003 insert_to_mm_slots_hash(mm, mm_slot);
4005 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
4006 * insert just behind the scanning cursor, to let the area settle
4007 * down a little; when fork is followed by immediate exec, we don't
4008 * want ksmd to waste time setting up and tearing down an rmap_list.
4010 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
4011 * scanning cursor, otherwise KSM pages in newly forked mms will be
4012 * missed: then we might as well insert at the end of the list.
4014 if (ksm_run & KSM_RUN_UNMERGE)
4015 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
4017 list_add_tail(&mm_slot->scan_list, &ksm_scan_head.scan_list);
4018 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
4020 ksm_nr_added_process++;
4021 spin_unlock(&ksm_mmlist_lock);
4022 #ifdef CONFIG_LKSM_FILTER
4023 INIT_LIST_HEAD(&mm_slot->ref_list);
4025 set_bit(MMF_VM_MERGEABLE, &mm->flags);
4026 atomic_inc(&mm->mm_count);
4031 int __ksm_enter(struct mm_struct *mm, int frozen)
4033 if (!__ksm_enter_alloc_slot(mm, frozen))
4038 void __ksm_exit(struct mm_struct *mm)
4040 struct mm_slot *mm_slot;
4041 int easy_to_free = 0;
4044 * This process is exiting: if it's straightforward (as is the
4045 * case when ksmd was never running), free mm_slot immediately.
4046 * But if it's at the cursor or has rmap_items linked to it, use
4047 * mmap_sem to synchronize with any break_cows before pagetables
4048 * are freed, and leave the mm_slot on the list for ksmd to free.
4049 * Beware: ksm may already have noticed it exiting and freed the slot.
4052 spin_lock(&ksm_mmlist_lock);
4053 mm_slot = get_mm_slot(mm);
4055 spin_unlock(&ksm_mmlist_lock);
4059 if (ksm_scan.mm_slot != mm_slot) {
4060 #ifdef CONFIG_LKSM_FILTER
4061 if (lksm_test_mm_state(mm_slot, KSM_MM_PREPARED))
4062 goto deferring_free;
4064 if (!mm_slot->rmap_list) {
4065 hash_del(&mm_slot->link);
4066 list_del(&mm_slot->mm_list);
4067 list_del(&mm_slot->scan_list);
4068 if (!RB_EMPTY_NODE(&mm_slot->ordered_list)) {
4069 rb_erase(&mm_slot->ordered_list, &vips_list);
4070 RB_CLEAR_NODE(&mm_slot->ordered_list);
4074 lksm_remove_mm_slot(mm_slot);
4075 if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN)) {
4076 atomic_dec(&ksm_scan.nr_frozen);
4077 ksm_debug("nr_frozen: %d", atomic_read(&ksm_scan.nr_frozen));
4078 } else if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED)) {
4079 atomic_dec(&ksm_scan.nr_scannable);
4080 ksm_debug("nr_scannable: %d", atomic_read(&ksm_scan.nr_scannable));
4083 #ifdef CONFIG_LKSM_FILTER
4086 ksm_nr_added_process--;
4087 spin_unlock(&ksm_mmlist_lock);
4090 #ifdef CONFIG_LKSM_FILTER
4091 lksm_region_ref_list_release(mm_slot);
4093 free_mm_slot(mm_slot);
4094 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
4096 } else if (mm_slot) {
4097 down_write(&mm->mmap_sem);
4098 up_write(&mm->mmap_sem);
4102 struct page *ksm_might_need_to_copy(struct page *page,
4103 struct vm_area_struct *vma, unsigned long address)
4105 struct anon_vma *anon_vma = page_anon_vma(page);
4106 struct page *new_page;
4108 if (PageKsm(page)) {
4109 if (page_stable_node(page) &&
4110 !(ksm_run & KSM_RUN_UNMERGE))
4111 return page; /* no need to copy it */
4112 } else if (!anon_vma) {
4113 return page; /* no need to copy it */
4114 } else if (anon_vma->root == vma->anon_vma->root &&
4115 page->index == linear_page_index(vma, address)) {
4116 return page; /* still no need to copy it */
4118 if (!PageUptodate(page))
4119 return page; /* let do_swap_page report the error */
4121 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
4123 copy_user_highpage(new_page, page, address, vma);
4125 SetPageDirty(new_page);
4126 __SetPageUptodate(new_page);
4127 __SetPageLocked(new_page);
4133 void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
4135 struct stable_node *stable_node;
4136 struct rmap_item *rmap_item;
4137 int search_new_forks = 0;
4139 VM_BUG_ON_PAGE(!PageKsm(page), page);
4142 * Rely on the page lock to protect against concurrent modifications
4143 * to that page's node of the stable tree.
4145 VM_BUG_ON_PAGE(!PageLocked(page), page);
4147 stable_node = page_stable_node(page);
4151 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
4152 struct anon_vma *anon_vma = rmap_item->anon_vma;
4153 struct anon_vma_chain *vmac;
4154 struct vm_area_struct *vma;
4157 anon_vma_lock_read(anon_vma);
4158 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
4165 /* Ignore the stable/unstable/sqnr flags */
4166 addr = rmap_item->address & ~KSM_FLAG_MASK;
4168 if (addr < vma->vm_start || addr >= vma->vm_end)
4171 * Initially we examine only the vma which covers this
4172 * rmap_item; but later, if there is still work to do,
4173 * we examine covering vmas in other mms: in case they
4174 * were forked from the original since ksmd passed.
4176 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
4179 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
4182 if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
4183 anon_vma_unlock_read(anon_vma);
4186 if (rwc->done && rwc->done(page)) {
4187 anon_vma_unlock_read(anon_vma);
4191 anon_vma_unlock_read(anon_vma);
4193 if (!search_new_forks++)
4197 bool reuse_ksm_page(struct page *page,
4198 struct vm_area_struct *vma,
4199 unsigned long address)
4201 #ifdef CONFIG_DEBUG_VM
4202 if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
4203 WARN_ON(!page_mapped(page)) ||
4204 WARN_ON(!PageLocked(page))) {
4205 dump_page(page, "reuse_ksm_page");
4210 if (PageSwapCache(page) || !page_stable_node(page))
4212 /* Prohibit parallel get_ksm_page() */
4213 if (!page_ref_freeze(page, 1))
4216 page_move_anon_rmap(page, vma);
4217 page->index = linear_page_index(vma, address);
4218 page_ref_unfreeze(page, 1);
4222 #ifdef CONFIG_MIGRATION
4223 void ksm_migrate_page(struct page *newpage, struct page *oldpage)
4225 struct stable_node *stable_node;
4227 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
4228 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
4229 VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
4231 stable_node = page_stable_node(newpage);
4233 VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
4234 stable_node->kpfn = page_to_pfn(newpage);
4236 * newpage->mapping was set in advance; now we need smp_wmb()
4237 * to make sure that the new stable_node->kpfn is visible
4238 * to get_ksm_page() before it can see that oldpage->mapping
4239 * has gone stale (or that PageSwapCache has been cleared).
4242 set_page_stable_node(oldpage, NULL);
4245 #endif /* CONFIG_MIGRATION */
4247 #ifdef CONFIG_MEMORY_HOTREMOVE
4248 static void wait_while_offlining(void)
4250 while (ksm_run & KSM_RUN_OFFLINE) {
4251 mutex_unlock(&ksm_thread_mutex);
4252 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
4253 TASK_UNINTERRUPTIBLE);
4254 mutex_lock(&ksm_thread_mutex);
4258 static bool stable_node_dup_remove_range(struct stable_node *stable_node,
4259 unsigned long start_pfn,
4260 unsigned long end_pfn)
4262 if (stable_node->kpfn >= start_pfn &&
4263 stable_node->kpfn < end_pfn) {
4265 * Don't get_ksm_page, page has already gone:
4266 * which is why we keep kpfn instead of page*
4268 remove_node_from_stable_tree(stable_node);
4274 static bool stable_node_chain_remove_range(struct stable_node *stable_node,
4275 unsigned long start_pfn,
4276 unsigned long end_pfn,
4277 struct rb_root *root)
4279 struct stable_node *dup;
4280 struct hlist_node *hlist_safe;
4282 if (!is_stable_node_chain(stable_node)) {
4283 VM_BUG_ON(is_stable_node_dup(stable_node));
4284 return stable_node_dup_remove_range(stable_node, start_pfn,
4288 hlist_for_each_entry_safe(dup, hlist_safe,
4289 &stable_node->hlist, hlist_dup) {
4290 VM_BUG_ON(!is_stable_node_dup(dup));
4291 stable_node_dup_remove_range(dup, start_pfn, end_pfn);
4293 if (hlist_empty(&stable_node->hlist)) {
4294 free_stable_node_chain(stable_node, root);
4295 return true; /* notify caller that tree was rebalanced */
4300 static void ksm_check_stable_tree(unsigned long start_pfn,
4301 unsigned long end_pfn)
4303 struct stable_node *stable_node, *next;
4304 struct rb_node *node;
4307 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
4308 node = rb_first(root_stable_tree + nid);
4310 stable_node = rb_entry(node, struct stable_node, node);
4311 if (stable_node_chain_remove_range(stable_node,
4315 node = rb_first(root_stable_tree + nid);
4317 node = rb_next(node);
4321 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
4322 if (stable_node->kpfn >= start_pfn &&
4323 stable_node->kpfn < end_pfn)
4324 remove_node_from_stable_tree(stable_node);
4329 static int ksm_memory_callback(struct notifier_block *self,
4330 unsigned long action, void *arg)
4332 struct memory_notify *mn = arg;
4335 case MEM_GOING_OFFLINE:
4337 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
4338 * and remove_all_stable_nodes() while memory is going offline:
4339 * it is unsafe for them to touch the stable tree at this time.
4340 * But unmerge_ksm_pages(), rmap lookups and other entry points
4341 * which do not need the ksm_thread_mutex are all safe.
4343 mutex_lock(&ksm_thread_mutex);
4344 ksm_run |= KSM_RUN_OFFLINE;
4345 mutex_unlock(&ksm_thread_mutex);
4350 * Most of the work is done by page migration; but there might
4351 * be a few stable_nodes left over, still pointing to struct
4352 * pages which have been offlined: prune those from the tree,
4353 * otherwise get_ksm_page() might later try to access a
4354 * non-existent struct page.
4356 ksm_check_stable_tree(mn->start_pfn,
4357 mn->start_pfn + mn->nr_pages);
4360 case MEM_CANCEL_OFFLINE:
4361 mutex_lock(&ksm_thread_mutex);
4362 ksm_run &= ~KSM_RUN_OFFLINE;
4363 mutex_unlock(&ksm_thread_mutex);
4365 smp_mb(); /* wake_up_bit advises this */
4366 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
4372 static void wait_while_offlining(void)
4375 #endif /* CONFIG_MEMORY_HOTREMOVE */
4379 * This all compiles without CONFIG_SYSFS, but is a waste of space.
4382 #define KSM_ATTR_RO(_name) \
4383 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
4384 #define KSM_ATTR(_name) \
4385 static struct kobj_attribute _name##_attr = \
4386 __ATTR(_name, 0644, _name##_show, _name##_store)
4388 static ssize_t sleep_millisecs_show(struct kobject *kobj,
4389 struct kobj_attribute *attr, char *buf)
4391 return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
4394 static ssize_t sleep_millisecs_store(struct kobject *kobj,
4395 struct kobj_attribute *attr,
4396 const char *buf, size_t count)
4398 unsigned long msecs;
4401 err = kstrtoul(buf, 10, &msecs);
4402 if (err || msecs > UINT_MAX)
4405 ksm_thread_sleep_millisecs = msecs;
4406 wake_up_interruptible(&ksm_iter_wait);
4410 KSM_ATTR(sleep_millisecs);
4412 static ssize_t pages_to_scan_show(struct kobject *kobj,
4413 struct kobj_attribute *attr, char *buf)
4415 return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
4418 static ssize_t pages_to_scan_store(struct kobject *kobj,
4419 struct kobj_attribute *attr,
4420 const char *buf, size_t count)
4423 unsigned long nr_pages;
4425 err = kstrtoul(buf, 10, &nr_pages);
4426 if (err || nr_pages > UINT_MAX)
4429 ksm_thread_pages_to_scan = nr_pages;
4433 KSM_ATTR(pages_to_scan);
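/*
 * A minimal usage sketch, assuming the attribute group ends up under the
 * usual /sys/kernel/mm/ksm/ directory (see the sysfs registration in
 * ksm_init() below):
 *
 *   echo 20  > /sys/kernel/mm/ksm/sleep_millisecs   # pause between scan batches
 *   echo 100 > /sys/kernel/mm/ksm/pages_to_scan     # pages scanned per batch
 */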
4435 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
4438 if (ksm_run & KSM_RUN_ONESHOT)
4439 return sprintf(buf, "%u\n", KSM_RUN_ONESHOT);
4441 return sprintf(buf, "%lu\n", ksm_run);
4444 static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
4445 const char *buf, size_t count)
4448 unsigned long flags;
4450 err = kstrtoul(buf, 10, &flags);
4451 if (err || flags > UINT_MAX)
4453 if (flags > KSM_RUN_ONESHOT)
4457 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
4458 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
4459 * breaking COW to free the pages_shared (but leaves mm_slots
4460 * on the list for when ksmd may be set running again).
4463 mutex_lock(&ksm_thread_mutex);
4464 wait_while_offlining();
4465 if (ksm_run != flags) {
4466 if (flags == KSM_RUN_ONESHOT)
4467 ksm_run = KSM_RUN_MERGE | KSM_RUN_ONESHOT;
4470 if (flags & KSM_RUN_UNMERGE) {
4471 set_current_oom_origin();
4472 err = unmerge_and_remove_all_rmap_items();
4473 clear_current_oom_origin();
4475 ksm_run = KSM_RUN_STOP;
4480 mutex_unlock(&ksm_thread_mutex);
4482 if (ksm_run & KSM_RUN_MERGE) {
4483 ksm_debug("activate KSM");
4484 wake_up(&ksm_crawl_wait);
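/*
 * An illustrative example of driving the run knob, assuming the same
 * /sys/kernel/mm/ksm/ path as above:
 *
 *   echo 1 > /sys/kernel/mm/ksm/run   # KSM_RUN_MERGE: start ksmd
 *   echo 0 > /sys/kernel/mm/ksm/run   # KSM_RUN_STOP: stop, keep merged pages
 *   echo 2 > /sys/kernel/mm/ksm/run   # KSM_RUN_UNMERGE: stop and break COW
 */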
4492 static ssize_t merge_across_nodes_show(struct kobject *kobj,
4493 struct kobj_attribute *attr, char *buf)
4495 return sprintf(buf, "%u\n", ksm_merge_across_nodes);
4498 static ssize_t merge_across_nodes_store(struct kobject *kobj,
4499 struct kobj_attribute *attr,
4500 const char *buf, size_t count)
4505 err = kstrtoul(buf, 10, &knob);
4511 mutex_lock(&ksm_thread_mutex);
4512 wait_while_offlining();
4513 if (ksm_merge_across_nodes != knob) {
4514 if (ksm_pages_shared || remove_all_stable_nodes())
4516 else if (root_stable_tree == one_stable_tree) {
4517 struct rb_root *buf;
4519 * This is the first time that we switch away from the
4520 * default of merging across nodes: must now allocate
4521 * a buffer to hold as many roots as may be needed.
4522 * Allocate stable and unstable together:
4523 * MAXSMP NODES_SHIFT 10 will use 16kB.
4525 buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
4527 /* Assume an empty RB_ROOT is all zeroes, so the zeroed kcalloc() buffer already holds valid empty trees */
4531 root_stable_tree = buf;
4532 root_unstable_tree = buf + nr_node_ids;
4533 /* Stable tree is empty but not the unstable */
4534 root_unstable_tree[0] = one_unstable_tree[0];
4538 ksm_merge_across_nodes = knob;
4539 ksm_nr_node_ids = knob ? 1 : nr_node_ids;
4542 mutex_unlock(&ksm_thread_mutex);
4544 return err ? err : count;
4546 KSM_ATTR(merge_across_nodes);
4549 static ssize_t use_zero_pages_show(struct kobject *kobj,
4550 struct kobj_attribute *attr, char *buf)
4552 return sprintf(buf, "%u\n", ksm_use_zero_pages);
4554 static ssize_t use_zero_pages_store(struct kobject *kobj,
4555 struct kobj_attribute *attr,
4556 const char *buf, size_t count)
4561 err = kstrtobool(buf, &value);
4565 ksm_use_zero_pages = value;
4569 KSM_ATTR(use_zero_pages);
4571 static ssize_t max_page_sharing_show(struct kobject *kobj,
4572 struct kobj_attribute *attr, char *buf)
4574 return sprintf(buf, "%u\n", ksm_max_page_sharing);
4577 static ssize_t max_page_sharing_store(struct kobject *kobj,
4578 struct kobj_attribute *attr,
4579 const char *buf, size_t count)
4584 err = kstrtoint(buf, 10, &knob);
4588 * When a KSM page is created it is shared by 2 mappings. This
4589 * being a signed comparison, it implicitly verifies it's not
4595 if (READ_ONCE(ksm_max_page_sharing) == knob)
4598 mutex_lock(&ksm_thread_mutex);
4599 wait_while_offlining();
4600 if (ksm_max_page_sharing != knob) {
4601 if (ksm_pages_shared || remove_all_stable_nodes())
4604 ksm_max_page_sharing = knob;
4606 mutex_unlock(&ksm_thread_mutex);
4608 return err ? err : count;
4610 KSM_ATTR(max_page_sharing);
4612 static ssize_t pages_shared_show(struct kobject *kobj,
4613 struct kobj_attribute *attr, char *buf)
4615 return sprintf(buf, "%lu\n", ksm_pages_shared);
4617 KSM_ATTR_RO(pages_shared);
4619 static ssize_t pages_sharing_show(struct kobject *kobj,
4620 struct kobj_attribute *attr, char *buf)
4622 return sprintf(buf, "%lu\n", ksm_pages_sharing);
4624 KSM_ATTR_RO(pages_sharing);
4626 static ssize_t pages_unshared_show(struct kobject *kobj,
4627 struct kobj_attribute *attr, char *buf)
4629 return sprintf(buf, "%lu\n", ksm_pages_unshared);
4631 KSM_ATTR_RO(pages_unshared);
4633 static ssize_t pages_volatile_show(struct kobject *kobj,
4634 struct kobj_attribute *attr, char *buf)
4636 long ksm_pages_volatile;
4638 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
4639 - ksm_pages_sharing - ksm_pages_unshared;
4641 * It was not worth any locking to calculate that statistic,
4642 * but it might therefore sometimes be negative: conceal that.
4644 if (ksm_pages_volatile < 0)
4645 ksm_pages_volatile = 0;
4646 return sprintf(buf, "%ld\n", ksm_pages_volatile);
4648 KSM_ATTR_RO(pages_volatile);
4650 static ssize_t stable_node_dups_show(struct kobject *kobj,
4651 struct kobj_attribute *attr, char *buf)
4653 return sprintf(buf, "%lu\n", ksm_stable_node_dups);
4655 KSM_ATTR_RO(stable_node_dups);
4657 static ssize_t stable_node_chains_show(struct kobject *kobj,
4658 struct kobj_attribute *attr, char *buf)
4660 return sprintf(buf, "%lu\n", ksm_stable_node_chains);
4662 KSM_ATTR_RO(stable_node_chains);
4665 stable_node_chains_prune_millisecs_show(struct kobject *kobj,
4666 struct kobj_attribute *attr,
4669 return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
4673 stable_node_chains_prune_millisecs_store(struct kobject *kobj,
4674 struct kobj_attribute *attr,
4675 const char *buf, size_t count)
4677 unsigned long msecs;
4680 err = kstrtoul(buf, 10, &msecs);
4681 if (err || msecs > UINT_MAX)
4684 ksm_stable_node_chains_prune_millisecs = msecs;
4688 KSM_ATTR(stable_node_chains_prune_millisecs);
4690 static ssize_t full_scans_show(struct kobject *kobj,
4691 struct kobj_attribute *attr, char *buf)
4693 return sprintf(buf, "%lu\n", ksm_scan.nr_full_scan);
4695 KSM_ATTR_RO(full_scans);
4697 static ssize_t scanning_process_show(struct kobject *kobj,
4698 struct kobj_attribute *attr, char *buf)
4700 return sprintf(buf, "%u\n", ksm_nr_added_process);
4702 KSM_ATTR_RO(scanning_process);
4704 static ssize_t full_scan_interval_show(struct kobject *kobj,
4705 struct kobj_attribute *attr, char *buf)
4707 return sprintf(buf, "%lu\n", full_scan_interval);
4710 static ssize_t full_scan_interval_store(struct kobject *kobj,
4711 struct kobj_attribute *attr, const char *buf, size_t count)
4714 unsigned long interval;
4716 err = kstrtoul(buf, 10, &interval);
4717 if (err || interval > UINT_MAX)
4720 full_scan_interval = interval;
4723 KSM_ATTR(full_scan_interval);
4725 static ssize_t one_shot_scanning_show(struct kobject *kobj,
4726 struct kobj_attribute *attr, char *buf)
4728 return sprintf(buf, "%d\n", atomic_read(&ksm_one_shot_scanning));
4731 static ssize_t one_shot_scanning_store(struct kobject *kobj,
4732 struct kobj_attribute *attr, const char *buf, size_t count)
4736 err = kstrtoint(buf, 10, &val);
4737 if (err || (val != LKSM_SCAN_PARTIAL && val != LKSM_SCAN_FULL)) {
4738 ksm_err("wrong value: %d", val);
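/*
 * atomic_cmpxchg() returns the previous value, so a result equal to
 * LKSM_SCAN_NONE (presumably zero, given the ! test below) means no
 * one-shot scan was pending and this request won the race; only then is
 * the crawler woken.
 */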
4742 if (!atomic_cmpxchg(&ksm_one_shot_scanning, LKSM_SCAN_NONE, val)) {
4743 wake_up(&ksm_crawl_wait);
4746 ksm_debug("ksm is still scanning");
4749 KSM_ATTR(one_shot_scanning);
4751 static ssize_t scan_boost_show(struct kobject *kobj,
4752 struct kobj_attribute *attr, char *buf)
4754 return sprintf(buf, "%u\n", lksm_boosted_pages_to_scan);
4757 static ssize_t scan_boost_store(struct kobject *kobj,
4758 struct kobj_attribute *attr, const char *buf, size_t count)
4762 err = kstrtoint(buf, 10, &val);
4763 /* lksm_boosted_pages_to_scan must be in the range 100 to 10000 */
4764 if (err || val < 100 || val > 10000) {
4765 ksm_err("wrong value: %d", val);
4769 lksm_boosted_pages_to_scan = (unsigned int) val;
4773 KSM_ATTR(scan_boost);
4775 #ifdef CONFIG_LKSM_FILTER
4776 static ssize_t nr_regions_show(struct kobject *kobj,
4777 struct kobj_attribute *attr, char *buf)
4779 return sprintf(buf, "%u\n", lksm_nr_regions);
4781 KSM_ATTR_RO(nr_regions);
4783 static ssize_t region_share_show(struct kobject *obj,
4784 struct kobj_attribute *attr, char *buf)
4786 return sprintf(buf, "%s:%d %s:%d %s:%d %s:%d %s:%d\n",
4787 region_type_str[0], region_share[0], region_type_str[1], region_share[1],
4788 region_type_str[2], region_share[2], region_type_str[3], region_share[3],
4789 region_type_str[4], region_share[4]);
4791 KSM_ATTR_RO(region_share);
4792 #endif /* CONFIG_LKSM_FILTER */
4794 static struct attribute *ksm_attrs[] = {
4795 &sleep_millisecs_attr.attr,
4796 &pages_to_scan_attr.attr,
4798 &pages_shared_attr.attr,
4799 &pages_sharing_attr.attr,
4800 &pages_unshared_attr.attr,
4801 &pages_volatile_attr.attr,
4802 &full_scans_attr.attr,
4804 &merge_across_nodes_attr.attr,
4806 &max_page_sharing_attr.attr,
4807 &stable_node_chains_attr.attr,
4808 &stable_node_dups_attr.attr,
4809 &stable_node_chains_prune_millisecs_attr.attr,
4810 &use_zero_pages_attr.attr,
4811 &scanning_process_attr.attr,
4812 &full_scan_interval_attr.attr,
4813 &one_shot_scanning_attr.attr,
4814 &scan_boost_attr.attr,
4815 #ifdef CONFIG_LKSM_FILTER
4816 &nr_regions_attr.attr,
4817 &region_share_attr.attr,
4822 static const struct attribute_group ksm_attr_group = {
4826 #endif /* CONFIG_SYSFS */
4828 #ifdef CONFIG_LKSM_FILTER
4829 static inline void init_lksm_region
4830 (struct lksm_region *region, unsigned long ino, int type, int len)
4833 region->type = type;
4837 /* allocate the region (and a FILE2 companion when needed) and add it to the region hash if not already present */
4838 static void lksm_insert_region
4839 (struct lksm_region **region, unsigned long ino,
4840 struct vm_area_struct *vma, int type)
4842 int size, len, need_hash_add = 0;
4843 struct lksm_region *next = NULL;
4844 unsigned long flags;
4846 size = lksm_region_size(vma->vm_start, vma->vm_end);
4848 len = (size > BITS_PER_LONG) ? lksm_bitmap_size(size) : SINGLE_FILTER_LEN;
4851 *region = kzalloc(sizeof(struct lksm_region), GFP_KERNEL);
4853 ksm_err("region allocation failed");
4856 init_lksm_region(*region, ino, LKSM_REGION_FILE1, len);
4857 (*region)->scan_round = ksm_crawl_round;
4858 atomic_set(&(*region)->refcount, 0);
4863 if (!(*region)->next && type == LKSM_REGION_FILE2) {
4864 next = kzalloc(sizeof(struct lksm_region), GFP_KERNEL);
4869 ksm_err("region allocation failed");
4872 init_lksm_region(next, ino, LKSM_REGION_FILE2, len);
4873 atomic_set(&next->refcount, 0);
4874 next->scan_round = ksm_crawl_round;
4878 if (need_hash_add || next) {
4879 spin_lock_irqsave(&lksm_region_lock, flags);
4881 hash_add(lksm_region_hash, &(*region)->hnode, ino);
4883 (*region)->next = next;
4884 next->prev = *region;
4886 spin_unlock_irqrestore(&lksm_region_lock, flags);
4890 static inline struct lksm_region *lksm_hash_find_region(unsigned long ino)
4892 struct lksm_region *region;
4894 hash_for_each_possible(lksm_region_hash, region, hnode, ino)
4895 if (region->ino == ino)
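/*
 * Region typing used below and in lksm_find_region(): a vma with its own
 * vm_file becomes LKSM_REGION_FILE1, while an anonymous vma whose
 * preceding mergeable vma is file-backed (either the directly adjacent
 * mapping, or one found within LKSM_REGION_ITER_MAX steps back for the
 * "DLL map" layout) is registered against that file as LKSM_REGION_FILE2.
 */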
4900 static void lksm_register_file_anon_region
4901 (struct mm_slot *slot, struct vm_area_struct *vma)
4903 struct lksm_region *region;
4904 struct file *file = NULL;
4905 struct inode *inode;
4906 unsigned long flags;
4910 file = vma->vm_file;
4911 type = LKSM_REGION_FILE1;
4912 } else if (vma->vm_prev) {
4913 /* LKSM should deal with .NET libraries */
4914 struct vm_area_struct *prev = vma->vm_prev;
4915 if (prev->vm_flags & VM_MERGEABLE && prev->vm_file) {
4916 /* Linux standard map structure */
4917 file = prev->vm_file;
4918 type = LKSM_REGION_FILE2;
4920 /* DLL map structure */
4923 while (i <= LKSM_REGION_ITER_MAX && prev) {
4925 file = prev->vm_file;
4926 else if (prev->vm_file && file != prev->vm_file)
4929 if (prev->vm_flags & VM_MERGEABLE && file) {
4933 prev = prev->vm_prev;
4937 type = LKSM_REGION_FILE2;
4944 inode = file_inode(file);
4947 spin_lock_irqsave(&lksm_region_lock, flags);
4948 region = lksm_hash_find_region(inode->i_ino);
4949 spin_unlock_irqrestore(&lksm_region_lock, flags);
4951 lksm_insert_region(&region, inode->i_ino, vma, type);
4953 if (type == LKSM_REGION_FILE1)
4954 lksm_region_ref_append(slot, region);
4956 lksm_region_ref_append(slot, region->next);
4961 static struct lksm_region *lksm_find_region(struct vm_area_struct *vma)
4963 struct lksm_region *region = NULL;
4964 struct file *file = NULL;
4965 struct inode *inode;
4966 unsigned long ino = 0, flags;
4970 return &heap_region;
4971 else if (is_stack(vma))
4973 else if (!vma->anon_vma)
4975 else if (is_exec(vma))
4979 /* check thread stack */
4980 file = vma->vm_file;
4981 type = LKSM_REGION_FILE1;
4982 } else if (vma->vm_prev) {
4983 struct vm_area_struct *prev = vma->vm_prev;
4984 if (prev->vm_flags & VM_MERGEABLE && prev->vm_file) {
4985 /* Linux standard map structure */
4986 file = prev->vm_file;
4987 type = LKSM_REGION_FILE2;
4989 /* DLL map structure */
4992 while (i <= LKSM_REGION_ITER_MAX && prev) {
4994 file = prev->vm_file;
4995 else if (prev->vm_file && file != prev->vm_file)
4998 if (prev->vm_flags & VM_MERGEABLE && file) {
5002 prev = prev->vm_prev;
5006 type = LKSM_REGION_FILE2;
5013 inode = file_inode(file);
5017 if (ksm_scan.region && ksm_scan.region->ino == ino) {
5018 if (ksm_scan.region->type == type)
5019 return ksm_scan.region;
5020 else if (ksm_scan.region->type == LKSM_REGION_FILE1)
5021 region = ksm_scan.region;
5023 spin_lock_irqsave(&lksm_region_lock, flags);
5024 region = lksm_hash_find_region(ino);
5025 spin_unlock_irqrestore(&lksm_region_lock, flags);
5029 if (region && type == LKSM_REGION_FILE2) {
5030 if (!region->next) {
5031 ksm_debug("region(%p:%lu:%s)-vma(%p) doesn't have next area (file: %p)",
5032 region, ino, region_type_str[region->type], vma, file);
5033 lksm_insert_region(&region, ino, vma, type);
5034 BUG_ON(!region->next);
5036 return region->next;
5040 #endif /* CONFIG_LKSM_FILTER */
5042 static inline int __lksm_remove_candidate(struct task_struct *task)
5044 int ret = LKSM_TASK_SLOT_NONE;
5045 struct task_slot *slot = get_task_slot(task);
5048 list_del(&slot->list);
5049 hash_del(&slot->hlist);
5050 free_task_slot(slot);
5051 ret = LKSM_TASK_SLOT_REMOVED;
5056 /* called by ksm_exit */
5057 void lksm_remove_candidate(struct mm_struct *mm)
5062 struct mm_slot *mm_slot;
5064 spin_lock(&ksm_mmlist_lock);
5065 mm_slot = get_mm_slot(mm);
5066 if (mm_slot && mm_slot != ksm_scan.mm_slot) {
5067 list_move(&mm_slot->mm_list, &ksm_scan.remove_mm_list);
5068 if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN))
5069 atomic_dec(&ksm_scan.nr_frozen);
5070 else if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED))
5071 atomic_dec(&ksm_scan.nr_scannable);
5072 ksm_debug("mm_slot: %p will be exited", mm_slot);
5074 spin_unlock(&ksm_mmlist_lock);
5078 if (!ksm_test_exit(mm))
5079 ksm_debug("proc-%d(%s) will be removed",
5080 task_pid_nr(mm->owner), mm->owner->comm);
5082 ksm_debug("proc-%d(%s) has exited", task_pid_nr(mm->owner), mm->owner->comm);
5083 spin_lock(&frozen_task_lock);
5084 ret = __lksm_remove_candidate(mm->owner);
5085 spin_unlock(&frozen_task_lock);
5086 if (ret == LKSM_TASK_SLOT_REMOVED)
5087 put_task_struct(mm->owner);
5090 static int lksm_task_frozen(struct task_struct *task)
5092 int need_wakeup = 0;
5093 struct mm_struct *mm = task->mm;
5094 struct mm_slot *mm_slot;
5095 struct task_slot *task_slot;
5097 if (mm && test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
5098 /* a mergeable task becoming frozen */
5099 spin_lock(&ksm_mmlist_lock);
5100 mm_slot = get_mm_slot(mm);
5103 if (mm_slot != ksm_scan.mm_slot
5104 && lksm_test_mm_state(mm_slot, KSM_MM_LISTED)) {
5105 if (list_empty(&mm_slot->scan_list))
5106 list_add_tail(&mm_slot->scan_list, &ksm_scan_head.scan_list);
5107 if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED))
5108 atomic_dec(&ksm_scan.nr_scannable);
5109 lksm_clear_mm_state(mm_slot, KSM_MM_LISTED);
5110 lksm_set_mm_state(mm_slot, KSM_MM_FROZEN);
5111 atomic_inc(&ksm_scan.nr_frozen);
5113 need_wakeup = (ksm_run == KSM_RUN_MERGE);
5114 ksm_debug("lksm_task_frozen called for task(%s): %p (nr_frozen: %d)",
5115 task->comm, task, atomic_read(&ksm_scan.nr_frozen));
5117 spin_unlock(&ksm_mmlist_lock);
5119 task_slot = alloc_task_slot();
5121 ksm_err("[ksm_tizen] Cannot allocate memory for task_slot");
5125 task_slot->task = task;
5126 task_slot->frozen = KSM_TASK_FROZEN;
5127 task_slot->inserted = jiffies;
5129 get_task_struct(task);
5131 spin_lock(&frozen_task_lock);
5132 list_add(&task_slot->list, &frozen_task_list);
5133 insert_to_task_slots_hash(task_slot);
5134 spin_unlock(&frozen_task_lock);
5136 need_wakeup = (ksm_run == KSM_RUN_MERGE);
5137 ksm_debug("task-%d(%s) is added to frozen task list",
5138 task_pid_nr(task), task->comm);
5141 if (need_wakeup && atomic_read(&crawl_state) == KSM_CRAWL_SLEEP)
5142 wake_up(&ksm_crawl_wait);
5147 static int lksm_task_thawed(struct task_struct *task)
5149 struct mm_struct *mm = task->mm;
5150 struct mm_slot *mm_slot;
5151 struct task_slot *task_slot;
5153 if (mm && test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
5154 /* a frozen task becoming thawed */
5155 spin_lock(&ksm_mmlist_lock);
5156 mm_slot = get_mm_slot(mm);
5159 if (lksm_test_mm_state(mm_slot, KSM_MM_FROZEN)
5160 && ksm_scan.mm_slot != mm_slot) {
5161 if (!lksm_test_mm_state(mm_slot, KSM_MM_SCANNED))
5162 atomic_inc(&ksm_scan.nr_scannable);
5164 list_del_init(&mm_slot->scan_list);
5165 lksm_clear_mm_state(mm_slot, KSM_MM_FROZEN);
5166 lksm_set_mm_state(mm_slot, KSM_MM_LISTED);
5167 atomic_dec(&ksm_scan.nr_frozen);
5168 ksm_debug("nr_frozen: %d nr_scannable: %d",
5169 atomic_read(&ksm_scan.nr_frozen),
5170 atomic_read(&ksm_scan.nr_scannable));
5172 spin_unlock(&ksm_mmlist_lock);
5174 /* just remove the task slot; it will be taken care of by the full scan */
5175 spin_lock(&frozen_task_lock);
5176 task_slot = get_task_slot(task);
5178 list_del(&task_slot->list);
5179 hash_del(&task_slot->hlist);
5181 spin_unlock(&frozen_task_lock);
5183 free_task_slot(task_slot);
5184 put_task_struct(task);
5185 ksm_debug("task-%d(%s) is removed from frozen task list",
5186 task_pid_nr(task), task->comm);
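/*
 * Taken together, lksm_task_frozen() and lksm_task_thawed() maintain a
 * small per-mm state machine: a mergeable mm moves from KSM_MM_LISTED to
 * KSM_MM_FROZEN when its task freezes (queued on the scan list, counted
 * in nr_frozen) and back again when it thaws (counted in nr_scannable).
 * Tasks without a mergeable mm are parked on frozen_task_list, presumably
 * until the crawler thread picks them up.
 */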
5194 * lksm_hint: a hook for constructing the candidate list.
5195 * This function cannot sleep.
5197 int lksm_hint(struct task_struct *task, int frozen)
5200 * If lksm_hint is called by ksm_fork, the task does not yet have its own
5201 * mm_struct because mm_struct initialization has not completed.
5202 * Thus, we skip this check and put the task into the candidate list.
5204 if (frozen == KSM_TASK_FROZEN)
5205 return lksm_task_frozen(task);
5206 else if (frozen == KSM_TASK_THAWED)
5207 return lksm_task_thawed(task);
5212 static void __init lksm_init(void)
5214 ksm_crawld = kthread_create(lksm_crawl_thread, NULL, "ksm_crawld");
5216 if (IS_ERR(ksm_crawld)) {
5217 printk(KERN_ALERT "failed to create ksm crawler daemon\n");
5221 atomic_set(&ksm_scan.nr_frozen, 0);
5222 atomic_set(&ksm_scan.nr_scannable, 0);
5223 atomic_set(&ksm_state, 0);
5224 INIT_LIST_HEAD(&ksm_scan.remove_mm_list);
5226 crawler_sleep = msecs_to_jiffies(1000);
5227 #ifdef CONFIG_LKSM_FILTER
5228 init_lksm_region(&heap_region, 0, LKSM_REGION_HEAP, 0);
5229 heap_region.merge_cnt = 0;
5230 heap_region.filter_cnt = 0;
5231 heap_region.filter = NULL;
5233 init_lksm_region(&unknown_region, 0, LKSM_REGION_UNKNOWN, 0);
5234 unknown_region.merge_cnt = 0;
5235 unknown_region.filter_cnt = 0;
5236 unknown_region.filter = NULL;
5238 spin_lock_init(&lksm_region_lock);
5239 #endif /* CONFIG_LKSM_FILTER */
5240 wake_up_process(ksm_crawld);
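/*
 * LKSM therefore runs two kernel threads: "ksm_crawld" (created above,
 * running lksm_crawl_thread), which builds the candidate and scan lists,
 * and "ksmd" (created in ksm_init() below, running lksm_scan_thread),
 * which performs the actual page scanning and merging.
 */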
5243 static int __init ksm_init(void)
5245 struct task_struct *ksm_thread;
5248 /* The correct value depends on page size and endianness */
5249 zero_checksum = calc_checksum(ZERO_PAGE(0));
5250 /* Default to false for backwards compatibility */
5251 ksm_use_zero_pages = false;
5253 err = ksm_slab_init();
5257 ksm_thread = kthread_run(lksm_scan_thread, NULL, "ksmd");
5258 if (IS_ERR(ksm_thread)) {
5259 pr_err("ksm: creating kthread failed\n");
5260 err = PTR_ERR(ksm_thread);
5265 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
5267 pr_err("ksm: register sysfs failed\n");
5268 kthread_stop(ksm_thread);
5272 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
5274 #endif /* CONFIG_SYSFS */
5276 #ifdef CONFIG_MEMORY_HOTREMOVE
5277 /* There is no significance to this priority 100 */
5278 hotplug_memory_notifier(ksm_memory_callback, 100);
5287 subsys_initcall(ksm_init);