Merge branch 'akpm' (more incoming from Andrew)
authorLinus Torvalds <torvalds@linux-foundation.org>
Sun, 24 Feb 2013 01:50:35 +0000 (17:50 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Sun, 24 Feb 2013 01:50:35 +0000 (17:50 -0800)
Merge second patch-bomb from Andrew Morton:

 - A little DM fix

 - the MM queue

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (154 commits)
  ksm: allocate roots when needed
  mm: cleanup "swapcache" in do_swap_page
  mm,ksm: swapoff might need to copy
  mm,ksm: FOLL_MIGRATION do migration_entry_wait
  ksm: shrink 32-bit rmap_item back to 32 bytes
  ksm: treat unstable nid like in stable tree
  ksm: add some comments
  tmpfs: fix mempolicy object leaks
  tmpfs: fix use-after-free of mempolicy object
  mm/fadvise.c: drain all pagevecs if POSIX_FADV_DONTNEED fails to discard all pages
  mm: export mmu notifier invalidates
  mm: accelerate mm_populate() treatment of THP pages
  mm: use long type for page counts in mm_populate() and get_user_pages()
  mm: accurately document nr_free_*_pages functions with code comments
  HWPOISON: change order of error_states[]'s elements
  HWPOISON: fix misjudgement of page_action() for errors on mlocked pages
  memcg: stop warning on memcg_propagate_kmem
  net: change type of virtio_chan->p9_max_pages
  vmscan: change type of vm_total_pages to unsigned long
  fs/nfsd: change type of max_delegations, nfsd_drc_max_mem and nfsd_drc_mem_used
  ...

113 files changed:
Documentation/ABI/testing/sysfs-kernel-mm-ksm [new file with mode: 0644]
Documentation/kernel-parameters.txt
Documentation/vm/ksm.txt
arch/arm64/mm/mmu.c
arch/ia64/mm/contig.c
arch/ia64/mm/discontig.c
arch/ia64/mm/init.c
arch/powerpc/mm/init_64.c
arch/powerpc/mm/mem.c
arch/s390/mm/init.c
arch/s390/mm/vmem.c
arch/sh/mm/init.c
arch/sparc/mm/init_32.c
arch/sparc/mm/init_64.c
arch/tile/mm/elf.c
arch/tile/mm/init.c
arch/tile/mm/pgtable.c
arch/x86/include/asm/numa.h
arch/x86/include/asm/pgtable_types.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/setup.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/numa.c
arch/x86/mm/pageattr.c
arch/x86/mm/srat.c
block/genhd.c
drivers/acpi/acpi_memhotplug.c
drivers/acpi/numa.c
drivers/acpi/processor_driver.c
drivers/base/memory.c
drivers/base/power/runtime.c
drivers/firmware/memmap.c
drivers/md/persistent-data/dm-transaction-manager.c
drivers/staging/zcache/zbud.c
drivers/staging/zsmalloc/zsmalloc-main.c
drivers/usb/core/hub.c
fs/aio.c
fs/buffer.c
fs/nfsd/nfs4state.c
fs/nfsd/nfsd.h
fs/nfsd/nfssvc.c
fs/proc/meminfo.c
include/linux/acpi.h
include/linux/bootmem.h
include/linux/compaction.h
include/linux/firmware-map.h
include/linux/highmem.h
include/linux/huge_mm.h
include/linux/hugetlb.h
include/linux/ksm.h
include/linux/memblock.h
include/linux/memcontrol.h
include/linux/memory_hotplug.h
include/linux/migrate.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mman.h
include/linux/mmzone.h
include/linux/page-flags-layout.h [new file with mode: 0644]
include/linux/page-isolation.h
include/linux/pm.h
include/linux/pm_runtime.h
include/linux/rmap.h
include/linux/sched.h
include/linux/swap.h
include/linux/vm_event_item.h
include/linux/vmstat.h
ipc/shm.c
kernel/sched/core.c
kernel/sysctl.c
mm/Kconfig
mm/compaction.c
mm/fadvise.c
mm/fremap.c
mm/huge_memory.c
mm/hugetlb.c
mm/internal.h
mm/kmemleak.c
mm/ksm.c
mm/madvise.c
mm/memblock.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/migrate.c
mm/mincore.c
mm/mlock.c
mm/mm_init.c
mm/mmap.c
mm/mmu_notifier.c
mm/mmzone.c
mm/mremap.c
mm/nommu.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/rmap.c
mm/shmem.c
mm/slob.c
mm/slub.c
mm/sparse.c
mm/swap.c
mm/swap_state.c
mm/swapfile.c
mm/util.c
mm/vmalloc.c
mm/vmscan.c
mm/vmstat.c
net/9p/trans_virtio.c
net/core/net-sysfs.c

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
new file mode 100644 (file)
index 0000000..73e653e
--- /dev/null
@@ -0,0 +1,52 @@
+What:          /sys/kernel/mm/ksm
+Date:          September 2009
+KernelVersion: 2.6.32
+Contact:       Linux memory management mailing list <linux-mm@kvack.org>
+Description:   Interface for Kernel Samepage Merging (KSM)
+
+What:          /sys/kernel/mm/ksm/full_scans
+What:          /sys/kernel/mm/ksm/pages_shared
+What:          /sys/kernel/mm/ksm/pages_sharing
+What:          /sys/kernel/mm/ksm/pages_to_scan
+What:          /sys/kernel/mm/ksm/pages_unshared
+What:          /sys/kernel/mm/ksm/pages_volatile
+What:          /sys/kernel/mm/ksm/run
+What:          /sys/kernel/mm/ksm/sleep_millisecs
+Date:          September 2009
+Contact:       Linux memory management mailing list <linux-mm@kvack.org>
+Description:   Kernel Samepage Merging daemon sysfs interface
+
+               full_scans: how many times all mergeable areas have been
+               scanned.
+
+               pages_shared: how many shared pages are being used.
+
+               pages_sharing: how many more sites are sharing them i.e. how
+               much saved.
+
+               pages_to_scan: how many present pages to scan before ksmd goes
+               to sleep.
+
+               pages_unshared: how many pages unique but repeatedly checked
+               for merging.
+
+               pages_volatile: how many pages changing too fast to be placed
+               in a tree.
+
+               run: write 0 to disable ksm, read 0 while ksm is disabled.
+                       write 1 to run ksm, read 1 while ksm is running.
+                       write 2 to disable ksm and unmerge all its pages.
+
+               sleep_millisecs: how many milliseconds ksm should sleep between
+               scans.
+
+               See Documentation/vm/ksm.txt for more information.
+
+What:          /sys/kernel/mm/ksm/merge_across_nodes
+Date:          January 2013
+KernelVersion: 3.9
+Contact:       Linux memory management mailing list <linux-mm@kvack.org>
+Description:   Control merging pages across different NUMA nodes.
+
+               When it is set to 0 only pages from the same node are merged,
+               otherwise pages from all nodes can be merged together (default).
index 9aa8ff3..7660877 100644 (file)
@@ -1640,6 +1640,42 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        that the amount of memory usable for all allocations
                        is not too small.
 
+       movablemem_map=acpi
+                       [KNL,X86,IA-64,PPC] This parameter is similar to
+                       memmap except it specifies the memory map of
+                       ZONE_MOVABLE.
+                       This option informs the kernel to use Hot Pluggable bit
+                       in flags from SRAT from ACPI BIOS to determine which
+                       memory devices could be hotplugged. The corresponding
+                       memory ranges will be set as ZONE_MOVABLE.
+                       NOTE: Whatever node the kernel resides in will always
+                             be un-hotpluggable.
+
+       movablemem_map=nn[KMG]@ss[KMG]
+                       [KNL,X86,IA-64,PPC] This parameter is similar to
+                       memmap except it specifies the memory map of
+                       ZONE_MOVABLE.
+                       If user specifies memory ranges, the info in SRAT will
+                       be ignored. And it works like the following:
+                       - If more ranges are all within one node, then from
+                         lowest ss to the end of the node will be ZONE_MOVABLE.
+                       - If a range is within a node, then from ss to the end
+                         of the node will be ZONE_MOVABLE.
+                       - If a range covers two or more nodes, then from ss to
+                         the end of the 1st node will be ZONE_MOVABLE, and all
+                         the rest nodes will only have ZONE_MOVABLE.
+                       If memmap is specified at the same time, the
+                       movablemem_map will be limited within the memmap
+                       areas. If kernelcore or movablecore is also specified,
+                       movablemem_map will have higher priority to be
+                       satisfied. So the administrator should be careful that
+                       the amount of movablemem_map areas are not too large.
+                       Otherwise kernel won't have enough memory to start.
+                       NOTE: We don't stop users specifying the node the
+                             kernel resides in as hotpluggable so that this
+                             option can be used as a workaround of firmware
+                              bugs.
+
        MTD_Partition=  [MTD]
                        Format: <name>,<region-number>,<size>,<offset>
 
index b392e49..f34a8ee 100644 (file)
@@ -58,6 +58,21 @@ sleep_millisecs  - how many milliseconds ksmd should sleep before next scan
                    e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs"
                    Default: 20 (chosen for demonstration purposes)
 
+merge_across_nodes - specifies if pages from different numa nodes can be merged.
+                   When set to 0, ksm merges only pages which physically
+                   reside in the memory area of same NUMA node. That brings
+                   lower latency to access of shared pages. Systems with more
+                   nodes, at significant NUMA distances, are likely to benefit
+                   from the lower latency of setting 0. Smaller systems, which
+                   need to minimize memory usage, are likely to benefit from
+                   the greater sharing of setting 1 (default). You may wish to
+                   compare how your system performs under each setting, before
+                   deciding on which to use. merge_across_nodes setting can be
+                   changed only when there are no ksm shared pages in system:
+                   set run 2 to unmerge pages first, then to 1 after changing
+                   merge_across_nodes, to remerge according to the new setting.
+                   Default: 1 (merging across nodes as in earlier releases)
+
 run              - set 0 to stop ksmd from running but keep merged pages,
                    set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run",
                    set 2 to stop ksmd and unmerge all pages currently merged,
index f4dd585..224b44a 100644 (file)
@@ -434,4 +434,7 @@ int __meminit vmemmap_populate(struct page *start_page,
        return 0;
 }
 #endif /* CONFIG_ARM64_64K_PAGES */
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
index 1516d1d..80dab50 100644 (file)
@@ -93,7 +93,7 @@ void show_mem(unsigned int filter)
        printk(KERN_INFO "%d pages swap cached\n", total_cached);
        printk(KERN_INFO "Total of %ld pages in page table cache\n",
               quicklist_total_size());
-       printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages());
+       printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
 }
 
 
index c641333..c2e955e 100644 (file)
@@ -666,7 +666,7 @@ void show_mem(unsigned int filter)
        printk(KERN_INFO "%d pages swap cached\n", total_cached);
        printk(KERN_INFO "Total of %ld pages in page table cache\n",
               quicklist_total_size());
-       printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages());
+       printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
 }
 
 /**
@@ -822,4 +822,8 @@ int __meminit vmemmap_populate(struct page *start_page,
 {
        return vmemmap_populate_basepages(start_page, size, node);
 }
+
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
 #endif
index b755ea9..20bc967 100644 (file)
@@ -688,6 +688,24 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
        return ret;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+       unsigned long start_pfn = start >> PAGE_SHIFT;
+       unsigned long nr_pages = size >> PAGE_SHIFT;
+       struct zone *zone;
+       int ret;
+
+       zone = page_zone(pfn_to_page(start_pfn));
+       ret = __remove_pages(zone, start_pfn, nr_pages);
+       if (ret)
+               pr_warn("%s: Problem encountered in __remove_pages() as"
+                       " ret=%d\n", __func__,  ret);
+
+       return ret;
+}
+#endif
 #endif
 
 /*
index 95a4529..7e2246f 100644 (file)
@@ -297,5 +297,10 @@ int __meminit vmemmap_populate(struct page *start_page,
 
        return 0;
 }
+
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
index 40df7c8..f1f7409 100644 (file)
@@ -133,6 +133,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
        return __add_pages(nid, zone, start_pfn, nr_pages);
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+       unsigned long start_pfn = start >> PAGE_SHIFT;
+       unsigned long nr_pages = size >> PAGE_SHIFT;
+       struct zone *zone;
+
+       zone = page_zone(pfn_to_page(start_pfn));
+       return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 /*
index ae672f4..49ce6bb 100644 (file)
@@ -228,4 +228,16 @@ int arch_add_memory(int nid, u64 start, u64 size)
                vmem_remove_mapping(start, size);
        return rc;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+       /*
+        * There is no hardware or firmware interface which could trigger a
+        * hot memory remove on s390. So there is nothing that needs to be
+        * implemented.
+        */
+       return -EBUSY;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
index 79699f4..e21aaf4 100644 (file)
@@ -268,6 +268,10 @@ out:
        return ret;
 }
 
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 /*
  * Add memory segment to the segment list if it doesn't overlap with
  * an already present segment.
index 82cc576..1057940 100644 (file)
@@ -558,4 +558,21 @@ int memory_add_physaddr_to_nid(u64 addr)
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+       unsigned long start_pfn = start >> PAGE_SHIFT;
+       unsigned long nr_pages = size >> PAGE_SHIFT;
+       struct zone *zone;
+       int ret;
+
+       zone = page_zone(pfn_to_page(start_pfn));
+       ret = __remove_pages(zone, start_pfn, nr_pages);
+       if (unlikely(ret))
+               pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
+                       ret);
+
+       return ret;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
index dde85ef..48e0c03 100644 (file)
@@ -57,7 +57,7 @@ void show_mem(unsigned int filter)
        printk("Mem-info:\n");
        show_free_areas(filter);
        printk("Free swap:       %6ldkB\n",
-              nr_swap_pages << (PAGE_SHIFT-10));
+              get_nr_swap_pages() << (PAGE_SHIFT-10));
        printk("%ld pages of RAM\n", totalram_pages);
        printk("%ld free pages\n", nr_free_pages());
 }
index 5c2c6e6..1588d33 100644 (file)
@@ -2235,6 +2235,11 @@ void __meminit vmemmap_populate_print_last(void)
                node_start = 0;
        }
 }
+
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 static void prot_init_common(unsigned long page_none,
index 3cfa98b..743c951 100644 (file)
@@ -130,7 +130,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
        if (!retval) {
                unsigned long addr = MEM_USER_INTRPT;
                addr = mmap_region(NULL, addr, INTRPT_SIZE,
-                                  MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
                                   VM_READ|VM_EXEC|
                                   VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
                if (addr > (unsigned long) -PAGE_SIZE)
index ef29d6c..2749515 100644 (file)
@@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size)
 {
        return -EINVAL;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+       /* TODO */
+       return -EBUSY;
+}
+#endif
 #endif
 
 struct kmem_cache *pgd_cache;
index de0de0c..b3b4972 100644 (file)
@@ -61,7 +61,7 @@ void show_mem(unsigned int filter)
               global_page_state(NR_PAGETABLE),
               global_page_state(NR_BOUNCE),
               global_page_state(NR_FILE_PAGES),
-              nr_swap_pages);
+              get_nr_swap_pages());
 
        for_each_zone(zone) {
                unsigned long flags, order, total = 0, largest_order = -1;
index 52560a2..1b99ee5 100644 (file)
@@ -57,8 +57,8 @@ static inline int numa_cpu_node(int cpu)
 #endif
 
 #ifdef CONFIG_NUMA
-extern void __cpuinit numa_set_node(int cpu, int node);
-extern void __cpuinit numa_clear_node(int cpu);
+extern void numa_set_node(int cpu, int node);
+extern void numa_clear_node(int cpu);
 extern void __init init_cpu_to_node(void);
 extern void __cpuinit numa_add_cpu(int cpu);
 extern void __cpuinit numa_remove_cpu(int cpu);
index e642300..567b5d0 100644 (file)
@@ -351,6 +351,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
  * as a pte too.
  */
 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
 extern phys_addr_t slow_virt_to_phys(void *__address);
 
 #endif /* !__ASSEMBLY__ */
index cfc755d..230c8ea 100644 (file)
@@ -696,6 +696,10 @@ EXPORT_SYMBOL(acpi_map_lsapic);
 
 int acpi_unmap_lsapic(int cpu)
 {
+#ifdef CONFIG_ACPI_NUMA
+       set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+#endif
+
        per_cpu(x86_cpu_to_apicid, cpu) = -1;
        set_cpu_present(cpu, false);
        num_processors--;
index 915f5ef..9c857f0 100644 (file)
@@ -1056,6 +1056,15 @@ void __init setup_arch(char **cmdline_p)
        setup_bios_corruption_check();
 #endif
 
+       /*
+        * In the memory hotplug case, the kernel needs info from SRAT to
+        * determine which memory is hotpluggable before allocating memory
+        * using memblock.
+        */
+       acpi_boot_table_init();
+       early_acpi_boot_init();
+       early_parse_srat();
+
 #ifdef CONFIG_X86_32
        printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
                        (max_pfn_mapped<<PAGE_SHIFT) - 1);
@@ -1101,10 +1110,6 @@ void __init setup_arch(char **cmdline_p)
        /*
         * Parse the ACPI tables for possible boot-time SMP configuration.
         */
-       acpi_boot_table_init();
-
-       early_acpi_boot_init();
-
        initmem_init();
        memblock_find_dma_reserve();
 
index b299724..2d19001 100644 (file)
@@ -862,6 +862,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
        return __add_pages(nid, zone, start_pfn, nr_pages);
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+       unsigned long start_pfn = start >> PAGE_SHIFT;
+       unsigned long nr_pages = size >> PAGE_SHIFT;
+       struct zone *zone;
+
+       zone = page_zone(pfn_to_page(start_pfn));
+       return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
 #endif
 
 /*
index 3eba7f4..474e28f 100644 (file)
@@ -707,6 +707,343 @@ int arch_add_memory(int nid, u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);
 
+#define PAGE_INUSE 0xFD
+
+static void __meminit free_pagetable(struct page *page, int order)
+{
+       struct zone *zone;
+       bool bootmem = false;
+       unsigned long magic;
+       unsigned int nr_pages = 1 << order;
+
+       /* bootmem page has reserved flag */
+       if (PageReserved(page)) {
+               __ClearPageReserved(page);
+               bootmem = true;
+
+               magic = (unsigned long)page->lru.next;
+               if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+                       while (nr_pages--)
+                               put_page_bootmem(page++);
+               } else
+                       __free_pages_bootmem(page, order);
+       } else
+               free_pages((unsigned long)page_address(page), order);
+
+       /*
+        * SECTION_INFO pages and MIX_SECTION_INFO pages
+        * are all allocated by bootmem.
+        */
+       if (bootmem) {
+               zone = page_zone(page);
+               zone_span_writelock(zone);
+               zone->present_pages += nr_pages;
+               zone_span_writeunlock(zone);
+               totalram_pages += nr_pages;
+       }
+}
+
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+       pte_t *pte;
+       int i;
+
+       for (i = 0; i < PTRS_PER_PTE; i++) {
+               pte = pte_start + i;
+               if (pte_val(*pte))
+                       return;
+       }
+
+       /* free a pte table */
+       free_pagetable(pmd_page(*pmd), 0);
+       spin_lock(&init_mm.page_table_lock);
+       pmd_clear(pmd);
+       spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+       pmd_t *pmd;
+       int i;
+
+       for (i = 0; i < PTRS_PER_PMD; i++) {
+               pmd = pmd_start + i;
+               if (pmd_val(*pmd))
+                       return;
+       }
+
+       /* free a pmd table */
+       free_pagetable(pud_page(*pud), 0);
+       spin_lock(&init_mm.page_table_lock);
+       pud_clear(pud);
+       spin_unlock(&init_mm.page_table_lock);
+}
+
+/* Return true if pgd is changed, otherwise return false. */
+static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+       pud_t *pud;
+       int i;
+
+       for (i = 0; i < PTRS_PER_PUD; i++) {
+               pud = pud_start + i;
+               if (pud_val(*pud))
+                       return false;
+       }
+
+       /* free a pud table */
+       free_pagetable(pgd_page(*pgd), 0);
+       spin_lock(&init_mm.page_table_lock);
+       pgd_clear(pgd);
+       spin_unlock(&init_mm.page_table_lock);
+
+       return true;
+}
+
+static void __meminit
+remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
+                bool direct)
+{
+       unsigned long next, pages = 0;
+       pte_t *pte;
+       void *page_addr;
+       phys_addr_t phys_addr;
+
+       pte = pte_start + pte_index(addr);
+       for (; addr < end; addr = next, pte++) {
+               next = (addr + PAGE_SIZE) & PAGE_MASK;
+               if (next > end)
+                       next = end;
+
+               if (!pte_present(*pte))
+                       continue;
+
+               /*
+                * We mapped [0,1G) memory as identity mapping when
+                * initializing, in arch/x86/kernel/head_64.S. These
+                * pagetables cannot be removed.
+                */
+               phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
+               if (phys_addr < (phys_addr_t)0x40000000)
+                       return;
+
+               if (IS_ALIGNED(addr, PAGE_SIZE) &&
+                   IS_ALIGNED(next, PAGE_SIZE)) {
+                       /*
+                        * Do not free direct mapping pages since they were
+                        * freed when offlining, or simply not in use.
+                        */
+                       if (!direct)
+                               free_pagetable(pte_page(*pte), 0);
+
+                       spin_lock(&init_mm.page_table_lock);
+                       pte_clear(&init_mm, addr, pte);
+                       spin_unlock(&init_mm.page_table_lock);
+
+                       /* For non-direct mapping, pages means nothing. */
+                       pages++;
+               } else {
+                       /*
+                        * If we are here, we are freeing vmemmap pages since
+                        * direct mapped memory ranges to be freed are aligned.
+                        *
+                        * If we are not removing the whole page, it means
+                        * other page structs in this page are being used and
+                        * we cannot remove them. So fill the unused page_structs
+                        * with 0xFD, and remove the page when it is wholly
+                        * filled with 0xFD.
+                        */
+                       memset((void *)addr, PAGE_INUSE, next - addr);
+
+                       page_addr = page_address(pte_page(*pte));
+                       if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+                               free_pagetable(pte_page(*pte), 0);
+
+                               spin_lock(&init_mm.page_table_lock);
+                               pte_clear(&init_mm, addr, pte);
+                               spin_unlock(&init_mm.page_table_lock);
+                       }
+               }
+       }
+
+       /* Call free_pte_table() in remove_pmd_table(). */
+       flush_tlb_all();
+       if (direct)
+               update_page_count(PG_LEVEL_4K, -pages);
+}
+
+static void __meminit
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
+                bool direct)
+{
+       unsigned long next, pages = 0;
+       pte_t *pte_base;
+       pmd_t *pmd;
+       void *page_addr;
+
+       pmd = pmd_start + pmd_index(addr);
+       for (; addr < end; addr = next, pmd++) {
+               next = pmd_addr_end(addr, end);
+
+               if (!pmd_present(*pmd))
+                       continue;
+
+               if (pmd_large(*pmd)) {
+                       if (IS_ALIGNED(addr, PMD_SIZE) &&
+                           IS_ALIGNED(next, PMD_SIZE)) {
+                               if (!direct)
+                                       free_pagetable(pmd_page(*pmd),
+                                                      get_order(PMD_SIZE));
+
+                               spin_lock(&init_mm.page_table_lock);
+                               pmd_clear(pmd);
+                               spin_unlock(&init_mm.page_table_lock);
+                               pages++;
+                       } else {
+                               /* If here, we are freeing vmemmap pages. */
+                               memset((void *)addr, PAGE_INUSE, next - addr);
+
+                               page_addr = page_address(pmd_page(*pmd));
+                               if (!memchr_inv(page_addr, PAGE_INUSE,
+                                               PMD_SIZE)) {
+                                       free_pagetable(pmd_page(*pmd),
+                                                      get_order(PMD_SIZE));
+
+                                       spin_lock(&init_mm.page_table_lock);
+                                       pmd_clear(pmd);
+                                       spin_unlock(&init_mm.page_table_lock);
+                               }
+                       }
+
+                       continue;
+               }
+
+               pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+               remove_pte_table(pte_base, addr, next, direct);
+               free_pte_table(pte_base, pmd);
+       }
+
+       /* Call free_pmd_table() in remove_pud_table(). */
+       if (direct)
+               update_page_count(PG_LEVEL_2M, -pages);
+}
+
+static void __meminit
+remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
+                bool direct)
+{
+       unsigned long next, pages = 0;
+       pmd_t *pmd_base;
+       pud_t *pud;
+       void *page_addr;
+
+       pud = pud_start + pud_index(addr);
+       for (; addr < end; addr = next, pud++) {
+               next = pud_addr_end(addr, end);
+
+               if (!pud_present(*pud))
+                       continue;
+
+               if (pud_large(*pud)) {
+                       if (IS_ALIGNED(addr, PUD_SIZE) &&
+                           IS_ALIGNED(next, PUD_SIZE)) {
+                               if (!direct)
+                                       free_pagetable(pud_page(*pud),
+                                                      get_order(PUD_SIZE));
+
+                               spin_lock(&init_mm.page_table_lock);
+                               pud_clear(pud);
+                               spin_unlock(&init_mm.page_table_lock);
+                               pages++;
+                       } else {
+                               /* If here, we are freeing vmemmap pages. */
+                               memset((void *)addr, PAGE_INUSE, next - addr);
+
+                               page_addr = page_address(pud_page(*pud));
+                               if (!memchr_inv(page_addr, PAGE_INUSE,
+                                               PUD_SIZE)) {
+                                       free_pagetable(pud_page(*pud),
+                                                      get_order(PUD_SIZE));
+
+                                       spin_lock(&init_mm.page_table_lock);
+                                       pud_clear(pud);
+                                       spin_unlock(&init_mm.page_table_lock);
+                               }
+                       }
+
+                       continue;
+               }
+
+               pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+               remove_pmd_table(pmd_base, addr, next, direct);
+               free_pmd_table(pmd_base, pud);
+       }
+
+       if (direct)
+               update_page_count(PG_LEVEL_1G, -pages);
+}
+
+/* start and end are both virtual address. */
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+       unsigned long next;
+       pgd_t *pgd;
+       pud_t *pud;
+       bool pgd_changed = false;
+
+       for (; start < end; start = next) {
+               next = pgd_addr_end(start, end);
+
+               pgd = pgd_offset_k(start);
+               if (!pgd_present(*pgd))
+                       continue;
+
+               pud = (pud_t *)pgd_page_vaddr(*pgd);
+               remove_pud_table(pud, start, next, direct);
+               if (free_pud_table(pud, pgd))
+                       pgd_changed = true;
+       }
+
+       if (pgd_changed)
+               sync_global_pgds(start, end - 1);
+
+       flush_tlb_all();
+}
+
+void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+       unsigned long start = (unsigned long)memmap;
+       unsigned long end = (unsigned long)(memmap + nr_pages);
+
+       remove_pagetable(start, end, false);
+}
+
+static void __meminit
+kernel_physical_mapping_remove(unsigned long start, unsigned long end)
+{
+       start = (unsigned long)__va(start);
+       end = (unsigned long)__va(end);
+
+       remove_pagetable(start, end, true);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int __ref arch_remove_memory(u64 start, u64 size)
+{
+       unsigned long start_pfn = start >> PAGE_SHIFT;
+       unsigned long nr_pages = size >> PAGE_SHIFT;
+       struct zone *zone;
+       int ret;
+
+       zone = page_zone(pfn_to_page(start_pfn));
+       kernel_physical_mapping_remove(start, start + size);
+       ret = __remove_pages(zone, start_pfn, nr_pages);
+       WARN_ON_ONCE(ret);
+
+       return ret;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 static struct kcore_list kcore_vsyscall;
@@ -1019,6 +1356,66 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
        return 0;
 }
 
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+void register_page_bootmem_memmap(unsigned long section_nr,
+                                 struct page *start_page, unsigned long size)
+{
+       unsigned long addr = (unsigned long)start_page;
+       unsigned long end = (unsigned long)(start_page + size);
+       unsigned long next;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       unsigned int nr_pages;
+       struct page *page;
+
+       for (; addr < end; addr = next) {
+               pte_t *pte = NULL;
+
+               pgd = pgd_offset_k(addr);
+               if (pgd_none(*pgd)) {
+                       next = (addr + PAGE_SIZE) & PAGE_MASK;
+                       continue;
+               }
+               get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
+
+               pud = pud_offset(pgd, addr);
+               if (pud_none(*pud)) {
+                       next = (addr + PAGE_SIZE) & PAGE_MASK;
+                       continue;
+               }
+               get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
+
+               if (!cpu_has_pse) {
+                       next = (addr + PAGE_SIZE) & PAGE_MASK;
+                       pmd = pmd_offset(pud, addr);
+                       if (pmd_none(*pmd))
+                               continue;
+                       get_page_bootmem(section_nr, pmd_page(*pmd),
+                                        MIX_SECTION_INFO);
+
+                       pte = pte_offset_kernel(pmd, addr);
+                       if (pte_none(*pte))
+                               continue;
+                       get_page_bootmem(section_nr, pte_page(*pte),
+                                        SECTION_INFO);
+               } else {
+                       next = pmd_addr_end(addr, end);
+
+                       pmd = pmd_offset(pud, addr);
+                       if (pmd_none(*pmd))
+                               continue;
+
+                       nr_pages = 1 << (get_order(PMD_SIZE));
+                       page = pmd_page(*pmd);
+                       while (nr_pages--)
+                               get_page_bootmem(section_nr, page++,
+                                                SECTION_INFO);
+               }
+       }
+}
+#endif
+
 void __meminit vmemmap_populate_print_last(void)
 {
        if (p_start) {
index 8504f36..dfd3025 100644 (file)
@@ -56,7 +56,7 @@ early_param("numa", numa_setup);
 /*
  * apicid, cpu, node mappings
  */
-s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+s16 __apicid_to_node[MAX_LOCAL_APIC] = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map);
 DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
 
-void __cpuinit numa_set_node(int cpu, int node)
+void numa_set_node(int cpu, int node)
 {
        int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
 
@@ -101,7 +101,7 @@ void __cpuinit numa_set_node(int cpu, int node)
                set_cpu_numa_node(cpu, node);
 }
 
-void __cpuinit numa_clear_node(int cpu)
+void numa_clear_node(int cpu)
 {
        numa_set_node(cpu, NUMA_NO_NODE);
 }
@@ -213,10 +213,9 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
         * Allocate node data.  Try node-local memory and then any node.
         * Never allocate in DMA zone.
         */
-       nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+       nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
        if (!nd_pa) {
-               pr_err("Cannot find %zu bytes in node %d\n",
-                      nd_size, nid);
+               pr_err("Cannot find %zu bytes in any node\n", nd_size);
                return;
        }
        nd = __va(nd_pa);
@@ -561,10 +560,12 @@ static int __init numa_init(int (*init_func)(void))
        for (i = 0; i < MAX_LOCAL_APIC; i++)
                set_apicid_to_node(i, NUMA_NO_NODE);
 
-       nodes_clear(numa_nodes_parsed);
+       /*
+        * Do not clear numa_nodes_parsed or zero numa_meminfo here, because
+        * SRAT was parsed earlier in early_parse_srat().
+        */
        nodes_clear(node_possible_map);
        nodes_clear(node_online_map);
-       memset(&numa_meminfo, 0, sizeof(numa_meminfo));
        WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
        numa_reset_distance();
 
index a1b1c88..ca1f1c2 100644 (file)
@@ -529,21 +529,13 @@ out_unlock:
        return do_split;
 }
 
-static int split_large_page(pte_t *kpte, unsigned long address)
+int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
 {
        unsigned long pfn, pfninc = 1;
        unsigned int i, level;
-       pte_t *pbase, *tmp;
+       pte_t *tmp;
        pgprot_t ref_prot;
-       struct page *base;
-
-       if (!debug_pagealloc)
-               spin_unlock(&cpa_lock);
-       base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
-       if (!debug_pagealloc)
-               spin_lock(&cpa_lock);
-       if (!base)
-               return -ENOMEM;
+       struct page *base = virt_to_page(pbase);
 
        spin_lock(&pgd_lock);
        /*
@@ -551,10 +543,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
         * up for us already:
         */
        tmp = lookup_address(address, &level);
-       if (tmp != kpte)
-               goto out_unlock;
+       if (tmp != kpte) {
+               spin_unlock(&pgd_lock);
+               return 1;
+       }
 
-       pbase = (pte_t *)page_address(base);
        paravirt_alloc_pte(&init_mm, page_to_pfn(base));
        ref_prot = pte_pgprot(pte_clrhuge(*kpte));
        /*
@@ -601,17 +594,27 @@ static int split_large_page(pte_t *kpte, unsigned long address)
         * going on.
         */
        __flush_tlb_all();
+       spin_unlock(&pgd_lock);
 
-       base = NULL;
+       return 0;
+}
 
-out_unlock:
-       /*
-        * If we dropped out via the lookup_address check under
-        * pgd_lock then stick the page back into the pool:
-        */
-       if (base)
+static int split_large_page(pte_t *kpte, unsigned long address)
+{
+       pte_t *pbase;
+       struct page *base;
+
+       if (!debug_pagealloc)
+               spin_unlock(&cpa_lock);
+       base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+       if (!debug_pagealloc)
+               spin_lock(&cpa_lock);
+       if (!base)
+               return -ENOMEM;
+
+       pbase = (pte_t *)page_address(base);
+       if (__split_large_page(kpte, address, pbase))
                __free_page(base);
-       spin_unlock(&pgd_lock);
 
        return 0;
 }
index cdd0da9..79836d0 100644 (file)
@@ -141,11 +141,126 @@ static inline int save_add_info(void) {return 1;}
 static inline int save_add_info(void) {return 0;}
 #endif
 
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static void __init
+handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
+{
+       int overlap, i;
+       unsigned long start_pfn, end_pfn;
+
+       start_pfn = PFN_DOWN(start);
+       end_pfn = PFN_UP(end);
+
+       /*
+        * For movablemem_map=acpi:
+        *
+        * SRAT:                |_____| |_____| |_________| |_________| ......
+        * node id:                0       1         1           2
+        * hotpluggable:           n       y         y           n
+        * movablemem_map:              |_____| |_________|
+        *
+        * Using movablemem_map, we can prevent memblock from allocating memory
+        * on ZONE_MOVABLE at boot time.
+        *
+        * Before parsing SRAT, memblock has already reserved some memory
+        * ranges for other purposes, such as for the kernel image. We cannot
+        * prevent the kernel from using this memory, so we need to exclude
+        * these memory ranges even if they are hotpluggable.
+        * Furthermore, to ensure the kernel has enough memory to boot, we make
+        * all the memory on the node which the kernel resides in
+        * un-hotpluggable.
+        */
+       if (hotpluggable && movablemem_map.acpi) {
+               /* Exclude ranges reserved by memblock. */
+               struct memblock_type *rgn = &memblock.reserved;
+
+               for (i = 0; i < rgn->cnt; i++) {
+                       if (end <= rgn->regions[i].base ||
+                           start >= rgn->regions[i].base +
+                           rgn->regions[i].size)
+                               continue;
+
+                       /*
+                        * If the memory range overlaps the memory reserved by
+                        * memblock, then the kernel resides in this node.
+                        */
+                       node_set(node, movablemem_map.numa_nodes_kernel);
+
+                       goto out;
+               }
+
+               /*
+                * If the kernel resides in this node, then the whole node
+                * should not be hotpluggable.
+                */
+               if (node_isset(node, movablemem_map.numa_nodes_kernel))
+                       goto out;
+
+               insert_movablemem_map(start_pfn, end_pfn);
+
+               /*
+                * numa_nodes_hotplug nodemask represents which nodes are put
+                * into movablemem_map.map[].
+                */
+               node_set(node, movablemem_map.numa_nodes_hotplug);
+               goto out;
+       }
+
+       /*
+        * For movablemem_map=nn[KMG]@ss[KMG]:
+        *
+        * SRAT:                |_____| |_____| |_________| |_________| ......
+        * node id:                0       1         1           2
+        * user specified:                |__|                 |___|
+        * movablemem_map:                |___| |_________|    |______| ......
+        *
+        * Using movablemem_map, we can prevent memblock from allocating memory
+        * on ZONE_MOVABLE at boot time.
+        *
+        * NOTE: In this case, SRAT info will be ignored.
+        */
+       overlap = movablemem_map_overlap(start_pfn, end_pfn);
+       if (overlap >= 0) {
+               /*
+                * If part of this range is in movablemem_map, we need to
+                * add the range after it to extend the range to the end
+                * of the node, because from the min address specified to
+                * the end of the node will be ZONE_MOVABLE.
+                */
+               start_pfn = max(start_pfn,
+                           movablemem_map.map[overlap].start_pfn);
+               insert_movablemem_map(start_pfn, end_pfn);
+
+               /*
+                * Set the nodemask, so that if the address range on one node
+                * is not contiguous, we can add the subsequent ranges on the
+                * same node into movablemem_map.
+                */
+               node_set(node, movablemem_map.numa_nodes_hotplug);
+       } else {
+               if (node_isset(node, movablemem_map.numa_nodes_hotplug))
+                       /*
+                        * Insert the range if we already have movable ranges
+                        * on the same node.
+                        */
+                       insert_movablemem_map(start_pfn, end_pfn);
+       }
+out:
+       return;
+}
+#else          /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+static inline void
+handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
+{
+}
+#endif         /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
 int __init
 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
        u64 start, end;
+       u32 hotpluggable;
        int node, pxm;
 
        if (srat_disabled())
@@ -154,7 +269,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
                goto out_err_bad_srat;
        if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
                goto out_err;
-       if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
+       hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
+       if (hotpluggable && !save_add_info())
                goto out_err;
 
        start = ma->base_address;
@@ -174,9 +290,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 
        node_set(node, numa_nodes_parsed);
 
-       printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
+       printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n",
               node, pxm,
-              (unsigned long long) start, (unsigned long long) end - 1);
+              (unsigned long long) start, (unsigned long long) end - 1,
+              hotpluggable ? "Hot Pluggable": "");
+
+       handle_movablemem(node, start, end, hotpluggable);
 
        return 0;
 out_err_bad_srat:
index 3993ebf..5f73c24 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/idr.h>
 #include <linux/log2.h>
+#include <linux/pm_runtime.h>
 
 #include "blk.h"
 
@@ -534,6 +535,14 @@ static void register_disk(struct gendisk *disk)
                        return;
                }
        }
+
+       /*
+        * avoid probable deadlock caused by allocating memory with
+        * GFP_KERNEL in runtime_resume callback of its all ancestor
+        * devices
+        */
+       pm_runtime_set_memalloc_noio(ddev, true);
+
        disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
        disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 
@@ -663,6 +672,7 @@ void del_gendisk(struct gendisk *disk)
        disk->driverfs_dev = NULL;
        if (!sysfs_deprecated)
                sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
+       pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
        device_del(disk_to_dev(disk));
 }
 EXPORT_SYMBOL(del_gendisk);
index 034d3e7..da1f82b 100644 (file)
@@ -280,9 +280,11 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 
 static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 {
-       int result = 0;
+       int result = 0, nid;
        struct acpi_memory_info *info, *n;
 
+       nid = acpi_get_node(mem_device->device->handle);
+
        list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
                if (info->failed)
                        /* The kernel does not use this memory block */
@@ -295,7 +297,9 @@ static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
                         */
                        return -EBUSY;
 
-               result = remove_memory(info->start_addr, info->length);
+               if (nid < 0)
+                       nid = memory_add_physaddr_to_nid(info->start_addr);
+               result = remove_memory(nid, info->start_addr, info->length);
                if (result)
                        return result;
 
index 33e609f..59844ee 100644 (file)
@@ -282,10 +282,10 @@ acpi_table_parse_srat(enum acpi_srat_type id,
                                            handler, max_entries);
 }
 
-int __init acpi_numa_init(void)
-{
-       int cnt = 0;
+static int srat_mem_cnt;
 
+void __init early_parse_srat(void)
+{
        /*
         * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
         * SRAT cpu entries could have different order with that in MADT.
@@ -295,21 +295,24 @@ int __init acpi_numa_init(void)
        /* SRAT: Static Resource Affinity Table */
        if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
                acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
-                                    acpi_parse_x2apic_affinity, 0);
+                                     acpi_parse_x2apic_affinity, 0);
                acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
-                                    acpi_parse_processor_affinity, 0);
-               cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
-                                           acpi_parse_memory_affinity,
-                                           NR_NODE_MEMBLKS);
+                                     acpi_parse_processor_affinity, 0);
+               srat_mem_cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
+                                                    acpi_parse_memory_affinity,
+                                                    NR_NODE_MEMBLKS);
        }
+}
 
+int __init acpi_numa_init(void)
+{
        /* SLIT: System Locality Information Table */
        acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
 
        acpi_numa_arch_fixup();
 
-       if (cnt < 0)
-               return cnt;
+       if (srat_mem_cnt < 0)
+               return srat_mem_cnt;
        else if (!parsed_numa_memblks)
                return -ENOENT;
        return 0;
index cbf1f12..df34bd0 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/cpuidle.h>
 #include <linux/slab.h>
 #include <linux/acpi.h>
+#include <linux/memory_hotplug.h>
 
 #include <asm/io.h>
 #include <asm/cpu.h>
@@ -641,6 +642,7 @@ static int acpi_processor_remove(struct acpi_device *device)
 
        per_cpu(processors, pr->id) = NULL;
        per_cpu(processor_device_array, pr->id) = NULL;
+       try_offline_node(cpu_to_node(pr->id));
 
 free:
        free_cpumask_var(pr->throttling.shared_cpu_map);
index 83d0b17..a51007b 100644 (file)
@@ -693,6 +693,12 @@ int offline_memory_block(struct memory_block *mem)
        return ret;
 }
 
+/* return true if the memory block is offlined, otherwise, return false */
+bool is_memblock_offlined(struct memory_block *mem)
+{
+       return mem->state == MEM_OFFLINE;
+}
+
 /*
  * Initialize the sysfs support for memory devices...
  */
index 3148b10..1244930 100644 (file)
@@ -124,6 +124,76 @@ unsigned long pm_runtime_autosuspend_expiration(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration);
 
+static int dev_memalloc_noio(struct device *dev, void *data)
+{
+       return dev->power.memalloc_noio;
+}
+
+/*
+ * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag.
+ * @dev: Device to handle.
+ * @enable: True for setting the flag and False for clearing the flag.
+ *
+ * Set the flag for all devices in the path from the device to the
+ * root device in the device tree if @enable is true, otherwise clear
+ * the flag for devices in the path whose siblings don't set the flag.
+ *
+ * The function should only be called by block device, or network
+ * device driver for solving the deadlock problem during runtime
+ * resume/suspend:
+ *
+ *     If memory allocation with GFP_KERNEL is called inside runtime
+ *     resume/suspend callback of any one of its ancestors (or the
+ *     block device itself), the deadlock may be triggered inside the
+ *     memory allocation since it might not complete until the block
+ *     device becomes active and the involved page I/O finishes. The
+ *     situation was first pointed out by Alan Stern. Network devices
+ *     are involved in iSCSI kinds of situations.
+ *
+ * The lock of dev_hotplug_mutex is held in the function for handling
+ * hotplug race because pm_runtime_set_memalloc_noio() may be called
+ * in async probe().
+ *
+ * The function should be called between device_add() and device_del()
+ * on the affected device(block/network device).
+ */
+void pm_runtime_set_memalloc_noio(struct device *dev, bool enable)
+{
+       static DEFINE_MUTEX(dev_hotplug_mutex);
+
+       mutex_lock(&dev_hotplug_mutex);
+       for (;;) {
+               bool enabled;
+
+               /* hold power lock since bitfield is not SMP-safe. */
+               spin_lock_irq(&dev->power.lock);
+               enabled = dev->power.memalloc_noio;
+               dev->power.memalloc_noio = enable;
+               spin_unlock_irq(&dev->power.lock);
+
+               /*
+                * No need to enable ancestors any more if the device
+                * has already been enabled.
+                */
+               if (enabled && enable)
+                       break;
+
+               dev = dev->parent;
+
+               /*
+                * clear flag of the parent device only if all the
+                * children don't set the flag because ancestor's
+                * flag was set by any one of the descendants.
+                */
+               if (!dev || (!enable &&
+                            device_for_each_child(dev, NULL,
+                                                  dev_memalloc_noio)))
+                       break;
+       }
+       mutex_unlock(&dev_hotplug_mutex);
+}
+EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio);
+
 /**
  * rpm_check_suspend_allowed - Test whether a device may be suspended.
  * @dev: Device to test.
@@ -278,7 +348,24 @@ static int rpm_callback(int (*cb)(struct device *), struct device *dev)
        if (!cb)
                return -ENOSYS;
 
-       retval = __rpm_callback(cb, dev);
+       if (dev->power.memalloc_noio) {
+               unsigned int noio_flag;
+
+               /*
+                * Deadlock might be caused if memory allocation with
+                * GFP_KERNEL happens inside runtime_suspend and
+                * runtime_resume callbacks of one block device's
+                * ancestor or the block device itself. Network
+                * device might be thought as part of iSCSI block
+                * device, so network device and its ancestor should
+                * be marked as memalloc_noio too.
+                */
+               noio_flag = memalloc_noio_save();
+               retval = __rpm_callback(cb, dev);
+               memalloc_noio_restore(noio_flag);
+       } else {
+               retval = __rpm_callback(cb, dev);
+       }
 
        dev->power.runtime_error = retval;
        return retval != -EACCES ? retval : -EIO;
index 90723e6..0b5b5f6 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/types.h>
 #include <linux/bootmem.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 
 /*
  * Data types ------------------------------------------------------------------
@@ -52,6 +53,9 @@ static ssize_t start_show(struct firmware_map_entry *entry, char *buf);
 static ssize_t end_show(struct firmware_map_entry *entry, char *buf);
 static ssize_t type_show(struct firmware_map_entry *entry, char *buf);
 
+static struct firmware_map_entry * __meminit
+firmware_map_find_entry(u64 start, u64 end, const char *type);
+
 /*
  * Static data -----------------------------------------------------------------
  */
@@ -79,7 +83,52 @@ static const struct sysfs_ops memmap_attr_ops = {
        .show = memmap_attr_show,
 };
 
-static struct kobj_type memmap_ktype = {
+/* Firmware memory map entries. */
+static LIST_HEAD(map_entries);
+static DEFINE_SPINLOCK(map_entries_lock);
+
+/*
+ * For memory hotplug, there is no way to free memory map entries allocated
+ * by boot mem after the system is up. So when we hot-remove memory whose
+ * map entry is allocated by bootmem, we need to remember the storage and
+ * reuse it when the memory is hot-added again.
+ */
+static LIST_HEAD(map_entries_bootmem);
+static DEFINE_SPINLOCK(map_entries_bootmem_lock);
+
+
+static inline struct firmware_map_entry *
+to_memmap_entry(struct kobject *kobj)
+{
+       return container_of(kobj, struct firmware_map_entry, kobj);
+}
+
+static void __meminit release_firmware_map_entry(struct kobject *kobj)
+{
+       struct firmware_map_entry *entry = to_memmap_entry(kobj);
+
+       if (PageReserved(virt_to_page(entry))) {
+               /*
+                * Remember the storage allocated by bootmem, and reuse it when
+                * the memory is hot-added again. The entry will be added to
+                * map_entries_bootmem here, and deleted from &map_entries in
+                * firmware_map_remove_entry().
+                */
+               if (firmware_map_find_entry(entry->start, entry->end,
+                   entry->type)) {
+                       spin_lock(&map_entries_bootmem_lock);
+                       list_add(&entry->list, &map_entries_bootmem);
+                       spin_unlock(&map_entries_bootmem_lock);
+               }
+
+               return;
+       }
+
+       kfree(entry);
+}
+
+static struct kobj_type __refdata memmap_ktype = {
+       .release        = release_firmware_map_entry,
        .sysfs_ops      = &memmap_attr_ops,
        .default_attrs  = def_attrs,
 };
@@ -88,13 +137,6 @@ static struct kobj_type memmap_ktype = {
  * Registration functions ------------------------------------------------------
  */
 
-/*
- * Firmware memory map entries. No locking is needed because the
- * firmware_map_add() and firmware_map_add_early() functions are called
- * in firmware initialisation code in one single thread of execution.
- */
-static LIST_HEAD(map_entries);
-
 /**
  * firmware_map_add_entry() - Does the real work to add a firmware memmap entry.
  * @start: Start of the memory range.
@@ -118,11 +160,25 @@ static int firmware_map_add_entry(u64 start, u64 end,
        INIT_LIST_HEAD(&entry->list);
        kobject_init(&entry->kobj, &memmap_ktype);
 
+       spin_lock(&map_entries_lock);
        list_add_tail(&entry->list, &map_entries);
+       spin_unlock(&map_entries_lock);
 
        return 0;
 }
 
+/**
+ * firmware_map_remove_entry() - Does the real work to remove a firmware
+ * memmap entry.
+ * @entry: removed entry.
+ *
+ * The caller must hold map_entries_lock, and release it properly.
+ **/
+static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
+{
+       list_del(&entry->list);
+}
+
 /*
  * Add memmap entry on sysfs
  */
@@ -144,6 +200,78 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry)
        return 0;
 }
 
+/*
+ * Remove memmap entry on sysfs
+ */
+static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
+{
+       kobject_put(&entry->kobj);
+}
+
+/*
+ * firmware_map_find_entry_in_list() - Search memmap entry in a given list.
+ * @start: Start of the memory range.
+ * @end:   End of the memory range (exclusive).
+ * @type:  Type of the memory range.
+ * @list:  In which to find the entry.
+ *
+ * This function is to find the memmap entry of a given memory range in a
+ * given list. The caller must hold map_entries_lock, and must not release
+ * the lock until the processing of the returned entry has completed.
+ *
+ * Return: Pointer to the entry to be found on success, or NULL on failure.
+ */
+static struct firmware_map_entry * __meminit
+firmware_map_find_entry_in_list(u64 start, u64 end, const char *type,
+                               struct list_head *list)
+{
+       struct firmware_map_entry *entry;
+
+       list_for_each_entry(entry, list, list)
+               if ((entry->start == start) && (entry->end == end) &&
+                   (!strcmp(entry->type, type))) {
+                       return entry;
+               }
+
+       return NULL;
+}
+
+/*
+ * firmware_map_find_entry() - Search memmap entry in map_entries.
+ * @start: Start of the memory range.
+ * @end:   End of the memory range (exclusive).
+ * @type:  Type of the memory range.
+ *
+ * This function is to find the memmap entry of a given memory range.
+ * The caller must hold map_entries_lock, and must not release the lock
+ * until the processing of the returned entry has completed.
+ *
+ * Return: Pointer to the entry to be found on success, or NULL on failure.
+ */
+static struct firmware_map_entry * __meminit
+firmware_map_find_entry(u64 start, u64 end, const char *type)
+{
+       return firmware_map_find_entry_in_list(start, end, type, &map_entries);
+}
+
+/*
+ * firmware_map_find_entry_bootmem() - Search memmap entry in map_entries_bootmem.
+ * @start: Start of the memory range.
+ * @end:   End of the memory range (exclusive).
+ * @type:  Type of the memory range.
+ *
+ * This function is similar to firmware_map_find_entry except that it finds the
+ * given entry in map_entries_bootmem.
+ *
+ * Return: Pointer to the entry to be found on success, or NULL on failure.
+ */
+static struct firmware_map_entry * __meminit
+firmware_map_find_entry_bootmem(u64 start, u64 end, const char *type)
+{
+       return firmware_map_find_entry_in_list(start, end, type,
+                                              &map_entries_bootmem);
+}
+
 /**
  * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
  * memory hotplug.
@@ -161,9 +289,19 @@ int __meminit firmware_map_add_hotplug(u64 start, u64 end, const char *type)
 {
        struct firmware_map_entry *entry;
 
-       entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC);
-       if (!entry)
-               return -ENOMEM;
+       entry = firmware_map_find_entry_bootmem(start, end, type);
+       if (!entry) {
+               entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC);
+               if (!entry)
+                       return -ENOMEM;
+       } else {
+               /* Reuse storage allocated by bootmem. */
+               spin_lock(&map_entries_bootmem_lock);
+               list_del(&entry->list);
+               spin_unlock(&map_entries_bootmem_lock);
+
+               memset(entry, 0, sizeof(*entry));
+       }
 
        firmware_map_add_entry(start, end, type, entry);
        /* create the memmap entry */
@@ -196,6 +334,36 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
        return firmware_map_add_entry(start, end, type, entry);
 }
 
+/**
+ * firmware_map_remove() - remove a firmware mapping entry
+ * @start: Start of the memory range.
+ * @end:   End of the memory range.
+ * @type:  Type of the memory range.
+ *
+ * removes a firmware mapping entry.
+ *
+ * Returns 0 on success, or -EINVAL if no entry.
+ **/
+int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
+{
+       struct firmware_map_entry *entry;
+
+       spin_lock(&map_entries_lock);
+       entry = firmware_map_find_entry(start, end - 1, type);
+       if (!entry) {
+               spin_unlock(&map_entries_lock);
+               return -EINVAL;
+       }
+
+       firmware_map_remove_entry(entry);
+       spin_unlock(&map_entries_lock);
+
+       /* remove the memmap entry */
+       remove_sysfs_fw_map_entry(entry);
+
+       return 0;
+}
+
 /*
  * Sysfs functions -------------------------------------------------------------
  */
@@ -217,8 +385,10 @@ static ssize_t type_show(struct firmware_map_entry *entry, char *buf)
        return snprintf(buf, PAGE_SIZE, "%s\n", entry->type);
 }
 
-#define to_memmap_attr(_attr) container_of(_attr, struct memmap_attribute, attr)
-#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj)
+static inline struct memmap_attribute *to_memmap_attr(struct attribute *attr)
+{
+       return container_of(attr, struct memmap_attribute, attr);
+}
 
 static ssize_t memmap_attr_show(struct kobject *kobj,
                                struct attribute *attr, char *buf)
index d247a35..7b17a1f 100644 (file)
@@ -25,8 +25,8 @@ struct shadow_info {
 /*
  * It would be nice if we scaled with the size of transaction.
  */
-#define HASH_SIZE 256
-#define HASH_MASK (HASH_SIZE - 1)
+#define DM_HASH_SIZE 256
+#define DM_HASH_MASK (DM_HASH_SIZE - 1)
 
 struct dm_transaction_manager {
        int is_clone;
@@ -36,7 +36,7 @@ struct dm_transaction_manager {
        struct dm_space_map *sm;
 
        spinlock_t lock;
-       struct hlist_head buckets[HASH_SIZE];
+       struct hlist_head buckets[DM_HASH_SIZE];
 };
 
 /*----------------------------------------------------------------*/
@@ -44,7 +44,7 @@ struct dm_transaction_manager {
 static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
 {
        int r = 0;
-       unsigned bucket = dm_hash_block(b, HASH_MASK);
+       unsigned bucket = dm_hash_block(b, DM_HASH_MASK);
        struct shadow_info *si;
        struct hlist_node *n;
 
@@ -71,7 +71,7 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
        si = kmalloc(sizeof(*si), GFP_NOIO);
        if (si) {
                si->where = b;
-               bucket = dm_hash_block(b, HASH_MASK);
+               bucket = dm_hash_block(b, DM_HASH_MASK);
                spin_lock(&tm->lock);
                hlist_add_head(&si->hlist, tm->buckets + bucket);
                spin_unlock(&tm->lock);
@@ -86,7 +86,7 @@ static void wipe_shadow_table(struct dm_transaction_manager *tm)
        int i;
 
        spin_lock(&tm->lock);
-       for (i = 0; i < HASH_SIZE; i++) {
+       for (i = 0; i < DM_HASH_SIZE; i++) {
                bucket = tm->buckets + i;
                hlist_for_each_entry_safe(si, n, tmp, bucket, hlist)
                        kfree(si);
@@ -115,7 +115,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
        tm->sm = sm;
 
        spin_lock_init(&tm->lock);
-       for (i = 0; i < HASH_SIZE; i++)
+       for (i = 0; i < DM_HASH_SIZE; i++)
                INIT_HLIST_HEAD(tm->buckets + i);
 
        return tm;
index 328c397..fdff5c6 100644 (file)
@@ -404,7 +404,7 @@ static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
        else
                zbud_pers_pageframes--;
        zbudpage_spin_unlock(zbudpage);
-       reset_page_mapcount(page);
+       page_mapcount_reset(page);
        init_page_count(page);
        page->index = 0;
        return page;
index 06f73a9..e78d262 100644 (file)
@@ -472,7 +472,7 @@ static void reset_page(struct page *page)
        set_page_private(page, 0);
        page->mapping = NULL;
        page->freelist = NULL;
-       reset_page_mapcount(page);
+       page_mapcount_reset(page);
 }
 
 static void free_zspage(struct page *first_page)
index 1775ad4..5480352 100644 (file)
@@ -5177,6 +5177,7 @@ int usb_reset_device(struct usb_device *udev)
 {
        int ret;
        int i;
+       unsigned int noio_flag;
        struct usb_host_config *config = udev->actconfig;
 
        if (udev->state == USB_STATE_NOTATTACHED ||
@@ -5186,6 +5187,17 @@ int usb_reset_device(struct usb_device *udev)
                return -EINVAL;
        }
 
+       /*
+        * Don't allocate memory with GFP_KERNEL in current
+        * context to avoid possible deadlock if usb mass
+        * storage interface or usbnet interface(iSCSI case)
+        * is included in current configuration. The easiest
+        * approach is to do it for every device reset,
+        * because the device 'memalloc_noio' flag may have
+        * not been set before resetting the usb device.
+        */
+       noio_flag = memalloc_noio_save();
+
        /* Prevent autosuspend during the reset */
        usb_autoresume_device(udev);
 
@@ -5230,6 +5242,7 @@ int usb_reset_device(struct usb_device *udev)
        }
 
        usb_autosuspend_device(udev);
+       memalloc_noio_restore(noio_flag);
        return ret;
 }
 EXPORT_SYMBOL_GPL(usb_reset_device);
index 71f613c..064bfbe 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -101,7 +101,7 @@ static int aio_setup_ring(struct kioctx *ctx)
        struct aio_ring *ring;
        struct aio_ring_info *info = &ctx->ring_info;
        unsigned nr_events = ctx->max_reqs;
-       unsigned long size;
+       unsigned long size, populate;
        int nr_pages;
 
        /* Compensate for the ring buffer's head/tail overlap entry */
@@ -129,7 +129,8 @@ static int aio_setup_ring(struct kioctx *ctx)
        down_write(&ctx->mm->mmap_sem);
        info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, 
                                        PROT_READ|PROT_WRITE,
-                                       MAP_ANONYMOUS|MAP_PRIVATE, 0);
+                                       MAP_ANONYMOUS|MAP_PRIVATE, 0,
+                                       &populate);
        if (IS_ERR((void *)info->mmap_base)) {
                up_write(&ctx->mm->mmap_sem);
                info->mmap_size = 0;
@@ -147,6 +148,8 @@ static int aio_setup_ring(struct kioctx *ctx)
                aio_free_ring(ctx);
                return -EAGAIN;
        }
+       if (populate)
+               mm_populate(info->mmap_base, populate);
 
        ctx->user_id = info->mmap_base;
 
index 2ea9cd4..62169c1 100644 (file)
@@ -3227,7 +3227,7 @@ static struct kmem_cache *bh_cachep __read_mostly;
  * Once the number of bh's in the machine exceeds this level, we start
  * stripping them in writeback.
  */
-static int max_buffer_heads;
+static unsigned long max_buffer_heads;
 
 int buffer_heads_over_limit;
 
@@ -3343,7 +3343,7 @@ EXPORT_SYMBOL(bh_submit_read);
 
 void __init buffer_init(void)
 {
-       int nrpages;
+       unsigned long nrpages;
 
        bh_cachep = kmem_cache_create("buffer_head",
                        sizeof(struct buffer_head), 0,
index ac8ed96..499e957 100644 (file)
@@ -151,7 +151,7 @@ get_nfs4_file(struct nfs4_file *fi)
 }
 
 static int num_delegations;
-unsigned int max_delegations;
+unsigned long max_delegations;
 
 /*
  * Open owner state (share locks)
@@ -700,8 +700,8 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num)
        num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
 
        spin_lock(&nfsd_drc_lock);
-       avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
-                       nfsd_drc_max_mem - nfsd_drc_mem_used);
+       avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
+                   nfsd_drc_max_mem - nfsd_drc_mem_used);
        num = min_t(int, num, avail / slotsize);
        nfsd_drc_mem_used += num * slotsize;
        spin_unlock(&nfsd_drc_lock);
index de23db2..07a473f 100644 (file)
@@ -56,8 +56,8 @@ extern struct svc_version     nfsd_version2, nfsd_version3,
 extern u32                     nfsd_supported_minorversion;
 extern struct mutex            nfsd_mutex;
 extern spinlock_t              nfsd_drc_lock;
-extern unsigned int            nfsd_drc_max_mem;
-extern unsigned int            nfsd_drc_mem_used;
+extern unsigned long           nfsd_drc_max_mem;
+extern unsigned long           nfsd_drc_mem_used;
 
 extern const struct seq_operations nfs_exports_op;
 
@@ -106,7 +106,7 @@ static inline int nfsd_v4client(struct svc_rqst *rq)
  * NFSv4 State
  */
 #ifdef CONFIG_NFSD_V4
-extern unsigned int max_delegations;
+extern unsigned long max_delegations;
 void nfs4_state_init(void);
 int nfsd4_init_slabs(void);
 void nfsd4_free_slabs(void);
index cee62ab..be7af50 100644 (file)
@@ -59,8 +59,8 @@ DEFINE_MUTEX(nfsd_mutex);
  * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
  */
 spinlock_t     nfsd_drc_lock;
-unsigned int   nfsd_drc_max_mem;
-unsigned int   nfsd_drc_mem_used;
+unsigned long  nfsd_drc_max_mem;
+unsigned long  nfsd_drc_mem_used;
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 static struct svc_stat nfsd_acl_svcstats;
@@ -342,7 +342,7 @@ static void set_max_drc(void)
                                        >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
        nfsd_drc_mem_used = 0;
        spin_lock_init(&nfsd_drc_lock);
-       dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem);
+       dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem);
 }
 
 static int nfsd_get_default_max_blksize(void)
index 80e4645..1efaaa1 100644 (file)
@@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                * sysctl_overcommit_ratio / 100) + total_swap_pages;
 
        cached = global_page_state(NR_FILE_PAGES) -
-                       total_swapcache_pages - i.bufferram;
+                       total_swapcache_pages() - i.bufferram;
        if (cached < 0)
                cached = 0;
 
@@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                K(i.freeram),
                K(i.bufferram),
                K(cached),
-               K(total_swapcache_pages),
+               K(total_swapcache_pages()),
                K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
                K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
                K(pages[LRU_ACTIVE_ANON]),
@@ -158,7 +158,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                vmi.used >> 10,
                vmi.largest_chunk >> 10
 #ifdef CONFIG_MEMORY_FAILURE
-               ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+               ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
                ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
index bcbdd74..f46cfd7 100644 (file)
@@ -485,6 +485,14 @@ static inline bool acpi_driver_match_device(struct device *dev,
 
 #endif /* !CONFIG_ACPI */
 
+#ifdef CONFIG_ACPI_NUMA
+void __init early_parse_srat(void);
+#else
+static inline void early_parse_srat(void)
+{
+}
+#endif
+
 #ifdef CONFIG_ACPI
 void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state,
                               u32 pm1a_ctrl,  u32 pm1b_ctrl));
index 3cd16ba..cdc3bab 100644 (file)
@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
                              unsigned long size);
 extern void free_bootmem(unsigned long physaddr, unsigned long size);
 extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
+extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
  * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
index cc7bdde..091d72e 100644 (file)
@@ -23,7 +23,7 @@ extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *mask,
                        bool sync, bool *contended);
-extern int compact_pgdat(pg_data_t *pgdat, int order);
+extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
 
@@ -80,9 +80,8 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
        return COMPACT_CONTINUE;
 }
 
-static inline int compact_pgdat(pg_data_t *pgdat, int order)
+static inline void compact_pgdat(pg_data_t *pgdat, int order)
 {
-       return COMPACT_CONTINUE;
 }
 
 static inline void reset_isolation_suitable(pg_data_t *pgdat)
index 43fe52f..71d4fa7 100644 (file)
@@ -25,6 +25,7 @@
 
 int firmware_map_add_early(u64 start, u64 end, const char *type);
 int firmware_map_add_hotplug(u64 start, u64 end, const char *type);
+int firmware_map_remove(u64 start, u64 end, const char *type);
 
 #else /* CONFIG_FIRMWARE_MEMMAP */
 
@@ -38,6 +39,11 @@ static inline int firmware_map_add_hotplug(u64 start, u64 end, const char *type)
        return 0;
 }
 
+static inline int firmware_map_remove(u64 start, u64 end, const char *type)
+{
+       return 0;
+}
+
 #endif /* CONFIG_FIRMWARE_MEMMAP */
 
 #endif /* _LINUX_FIRMWARE_MAP_H */
index ef788b5..7fb31da 100644 (file)
@@ -219,12 +219,6 @@ static inline void zero_user(struct page *page,
        zero_user_segments(page, start, start + size, 0, 0);
 }
 
-static inline void __deprecated memclear_highpage_flush(struct page *page,
-                       unsigned int offset, unsigned int size)
-{
-       zero_user(page, offset, size);
-}
-
 #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE
 
 static inline void copy_user_highpage(struct page *to, struct page *from,
index 1d76f8c..ee1c244 100644 (file)
@@ -113,7 +113,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma,
        do {                                                            \
                pmd_t *____pmd = (__pmd);                               \
                anon_vma_lock_write(__anon_vma);                        \
-               anon_vma_unlock(__anon_vma);                            \
+               anon_vma_unlock_write(__anon_vma);                      \
                BUG_ON(pmd_trans_splitting(*____pmd) ||                 \
                       pmd_trans_huge(*____pmd));                       \
        } while (0)
index 0c80d3f..eedc334 100644 (file)
@@ -43,9 +43,9 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
 #endif
 
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
-int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
-                       struct page **, struct vm_area_struct **,
-                       unsigned long *, int *, int, unsigned int flags);
+long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
+                        struct page **, struct vm_area_struct **,
+                        unsigned long *, unsigned long *, long, unsigned int);
 void unmap_hugepage_range(struct vm_area_struct *,
                          unsigned long, unsigned long, struct page *);
 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
index 3319a69..45c9b6a 100644 (file)
@@ -16,9 +16,6 @@
 struct stable_node;
 struct mem_cgroup;
 
-struct page *ksm_does_need_to_copy(struct page *page,
-                       struct vm_area_struct *vma, unsigned long address);
-
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                unsigned long end, int advice, unsigned long *vm_flags);
@@ -73,15 +70,8 @@ static inline void set_page_stable_node(struct page *page,
  * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
  * but what if the vma was unmerged while the page was swapped out?
  */
-static inline int ksm_might_need_to_copy(struct page *page,
-                       struct vm_area_struct *vma, unsigned long address)
-{
-       struct anon_vma *anon_vma = page_anon_vma(page);
-
-       return anon_vma &&
-               (anon_vma->root != vma->anon_vma->root ||
-                page->index != linear_page_index(vma, address));
-}
+struct page *ksm_might_need_to_copy(struct page *page,
+                       struct vm_area_struct *vma, unsigned long address);
 
 int page_referenced_ksm(struct page *page,
                        struct mem_cgroup *memcg, unsigned long *vm_flags);
@@ -113,10 +103,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
        return 0;
 }
 
-static inline int ksm_might_need_to_copy(struct page *page,
+static inline struct page *ksm_might_need_to_copy(struct page *page,
                        struct vm_area_struct *vma, unsigned long address)
 {
-       return 0;
+       return page;
 }
 
 static inline int page_referenced_ksm(struct page *page,
index f388203..3e5ecb2 100644 (file)
@@ -42,6 +42,7 @@ struct memblock {
 
 extern struct memblock memblock;
 extern int memblock_debug;
+extern struct movablemem_map movablemem_map;
 
 #define memblock_dbg(fmt, ...) \
        if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
@@ -60,6 +61,7 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
                          unsigned long *out_end_pfn, int *out_nid);
 
index 28bd5fa..d6183f0 100644 (file)
@@ -116,7 +116,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
  * For memory reclaim.
  */
 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
-int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
@@ -321,12 +320,6 @@ mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
        return 1;
 }
 
-static inline int
-mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
-{
-       return 1;
-}
-
 static inline unsigned long
 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
index 4a45c4e..b6a3be7 100644 (file)
@@ -96,6 +96,7 @@ extern void __online_page_free(struct page *page);
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern bool is_pageblock_removable_nolock(struct page *page);
+extern int arch_remove_memory(u64 start, u64 size);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 /* reasonably generic interface to expand the physical pages in a zone  */
@@ -173,17 +174,16 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
 #endif /* CONFIG_NUMA */
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
+#else
 static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
 }
-static inline void put_page_bootmem(struct page *page)
-{
-}
-#else
-extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
-extern void put_page_bootmem(struct page *page);
 #endif
+extern void put_page_bootmem(struct page *page);
+extern void get_page_bootmem(unsigned long ingo, struct page *page,
+                            unsigned long type);
 
 /*
  * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug
@@ -233,6 +233,7 @@ static inline void unlock_memory_hotplug(void) {}
 #ifdef CONFIG_MEMORY_HOTREMOVE
 
 extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
+extern void try_offline_node(int nid);
 
 #else
 static inline int is_mem_section_removable(unsigned long pfn,
@@ -240,6 +241,8 @@ static inline int is_mem_section_removable(unsigned long pfn,
 {
        return 0;
 }
+
+static inline void try_offline_node(int nid) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 extern int mem_online_node(int nid);
@@ -247,7 +250,8 @@ extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern int offline_memory_block(struct memory_block *mem);
-extern int remove_memory(u64 start, u64 size);
+extern bool is_memblock_offlined(struct memory_block *mem);
+extern int remove_memory(int nid, u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
                                                                int nr_pages);
 extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms);
index 1e9f627..a405d3d 100644 (file)
@@ -40,11 +40,9 @@ extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
                        struct page *, struct page *, enum migrate_mode);
 extern int migrate_pages(struct list_head *l, new_page_t x,
-                       unsigned long private, bool offlining,
-                       enum migrate_mode mode, int reason);
+               unsigned long private, enum migrate_mode mode, int reason);
 extern int migrate_huge_page(struct page *, new_page_t x,
-                       unsigned long private, bool offlining,
-                       enum migrate_mode mode);
+               unsigned long private, enum migrate_mode mode);
 
 extern int fail_migrate_page(struct address_space *,
                        struct page *, struct page *);
@@ -62,11 +60,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 static inline void putback_lru_pages(struct list_head *l) {}
 static inline void putback_movable_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
-               unsigned long private, bool offlining,
-               enum migrate_mode mode, int reason) { return -ENOSYS; }
+               unsigned long private, enum migrate_mode mode, int reason)
+       { return -ENOSYS; }
 static inline int migrate_huge_page(struct page *page, new_page_t x,
-               unsigned long private, bool offlining,
-               enum migrate_mode mode) { return -ENOSYS; }
+               unsigned long private, enum migrate_mode mode)
+       { return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
 static inline int migrate_prep_local(void) { return -ENOSYS; }
index 9d9dcc3..e7c3f9a 100644 (file)
@@ -87,6 +87,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_PFNMAP      0x00000400      /* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE   0x00000800      /* ETXTBSY on write attempts.. */
 
+#define VM_POPULATE     0x00001000
 #define VM_LOCKED      0x00002000
 #define VM_IO           0x00004000     /* Memory mapped I/O or similar */
 
@@ -366,7 +367,7 @@ static inline struct page *compound_head(struct page *page)
  * both from it and to it can be tracked, using atomic_inc_and_test
  * and atomic_add_negative(-1).
  */
-static inline void reset_page_mapcount(struct page *page)
+static inline void page_mapcount_reset(struct page *page)
 {
        atomic_set(&(page)->_mapcount, -1);
 }
@@ -580,50 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
  * sets it, so none of the operations on it need to be atomic.
  */
 
-
-/*
- * page->flags layout:
- *
- * There are three possibilities for how page->flags get
- * laid out.  The first is for the normal case, without
- * sparsemem.  The second is for sparsemem when there is
- * plenty of space for node and section.  The last is when
- * we have run out of space and have to fall back to an
- * alternate (slower) way of determining the node.
- *
- * No sparsemem or sparsemem vmemmap: |       NODE     | ZONE | ... | FLAGS |
- * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
- * classic sparse no space for node:  | SECTION |     ZONE    | ... | FLAGS |
- */
-#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
-#define SECTIONS_WIDTH         SECTIONS_SHIFT
-#else
-#define SECTIONS_WIDTH         0
-#endif
-
-#define ZONES_WIDTH            ZONES_SHIFT
-
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
-#define NODES_WIDTH            NODES_SHIFT
-#else
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-#error "Vmemmap: No space for nodes field in page flags"
-#endif
-#define NODES_WIDTH            0
-#endif
-
-/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
+/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */
 #define SECTIONS_PGOFF         ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
 #define NODES_PGOFF            (SECTIONS_PGOFF - NODES_WIDTH)
 #define ZONES_PGOFF            (NODES_PGOFF - ZONES_WIDTH)
-
-/*
- * We are going to use the flags for the page to node mapping if its in
- * there.  This includes the case where there is no node, so it is implicit.
- */
-#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
-#define NODE_NOT_IN_PAGE_FLAGS
-#endif
+#define LAST_NID_PGOFF         (ZONES_PGOFF - LAST_NID_WIDTH)
 
 /*
  * Define the bit shifts to access each section.  For non-existent
@@ -633,6 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 #define SECTIONS_PGSHIFT       (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
 #define NODES_PGSHIFT          (NODES_PGOFF * (NODES_WIDTH != 0))
 #define ZONES_PGSHIFT          (ZONES_PGOFF * (ZONES_WIDTH != 0))
+#define LAST_NID_PGSHIFT       (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0))
 
 /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
 #ifdef NODE_NOT_IN_PAGE_FLAGS
@@ -654,6 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 #define ZONES_MASK             ((1UL << ZONES_WIDTH) - 1)
 #define NODES_MASK             ((1UL << NODES_WIDTH) - 1)
 #define SECTIONS_MASK          ((1UL << SECTIONS_WIDTH) - 1)
+#define LAST_NID_MASK          ((1UL << LAST_NID_WIDTH) - 1)
 #define ZONEID_MASK            ((1UL << ZONEID_SHIFT) - 1)
 
 static inline enum zone_type page_zonenum(const struct page *page)
@@ -661,6 +625,10 @@ static inline enum zone_type page_zonenum(const struct page *page)
        return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
 
+#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+#define SECTION_IN_PAGE_FLAGS
+#endif
+
 /*
  * The identification function is only used by the buddy allocator for
  * determining if two pages could be buddies. We are not really
@@ -693,31 +661,48 @@ static inline int page_to_nid(const struct page *page)
 #endif
 
 #ifdef CONFIG_NUMA_BALANCING
-static inline int page_xchg_last_nid(struct page *page, int nid)
+#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+static inline int page_nid_xchg_last(struct page *page, int nid)
 {
        return xchg(&page->_last_nid, nid);
 }
 
-static inline int page_last_nid(struct page *page)
+static inline int page_nid_last(struct page *page)
 {
        return page->_last_nid;
 }
-static inline void reset_page_last_nid(struct page *page)
+static inline void page_nid_reset_last(struct page *page)
 {
        page->_last_nid = -1;
 }
 #else
-static inline int page_xchg_last_nid(struct page *page, int nid)
+static inline int page_nid_last(struct page *page)
+{
+       return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK;
+}
+
+extern int page_nid_xchg_last(struct page *page, int nid);
+
+static inline void page_nid_reset_last(struct page *page)
+{
+       int nid = (1 << LAST_NID_SHIFT) - 1;
+
+       page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
+       page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
+}
+#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */
+#else
+static inline int page_nid_xchg_last(struct page *page, int nid)
 {
        return page_to_nid(page);
 }
 
-static inline int page_last_nid(struct page *page)
+static inline int page_nid_last(struct page *page)
 {
        return page_to_nid(page);
 }
 
-static inline void reset_page_last_nid(struct page *page)
+static inline void page_nid_reset_last(struct page *page)
 {
 }
 #endif
@@ -727,7 +712,7 @@ static inline struct zone *page_zone(const struct page *page)
        return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
 }
 
-#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+#ifdef SECTION_IN_PAGE_FLAGS
 static inline void set_page_section(struct page *page, unsigned long section)
 {
        page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
@@ -757,7 +742,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
 {
        set_page_zone(page, zone);
        set_page_node(page, node);
-#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+#ifdef SECTION_IN_PAGE_FLAGS
        set_page_section(page, pfn_to_section_nr(pfn));
 #endif
 }
@@ -817,18 +802,7 @@ void page_address_init(void);
 #define PAGE_MAPPING_KSM       2
 #define PAGE_MAPPING_FLAGS     (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
 
-extern struct address_space swapper_space;
-static inline struct address_space *page_mapping(struct page *page)
-{
-       struct address_space *mapping = page->mapping;
-
-       VM_BUG_ON(PageSlab(page));
-       if (unlikely(PageSwapCache(page)))
-               mapping = &swapper_space;
-       else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
-               mapping = NULL;
-       return mapping;
-}
+extern struct address_space *page_mapping(struct page *page);
 
 /* Neutral page->mapping pointer to address_space or anon_vma or other */
 static inline void *page_rmapping(struct page *page)
@@ -1035,18 +1009,18 @@ static inline int fixup_user_fault(struct task_struct *tsk,
 }
 #endif
 
-extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, int write);
 
-int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-                    unsigned long start, int len, unsigned int foll_flags,
-                    struct page **pages, struct vm_area_struct **vmas,
-                    int *nonblocking);
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-                       unsigned long start, int nr_pages, int write, int force,
-                       struct page **pages, struct vm_area_struct **vmas);
+long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+                     unsigned long start, unsigned long nr_pages,
+                     unsigned int foll_flags, struct page **pages,
+                     struct vm_area_struct **vmas, int *nonblocking);
+long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+                   unsigned long start, unsigned long nr_pages,
+                   int write, int force, struct page **pages,
+                   struct vm_area_struct **vmas);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages);
 struct kvec;
@@ -1359,6 +1333,24 @@ extern void free_bootmem_with_active_regions(int nid,
                                                unsigned long max_low_pfn);
 extern void sparse_memory_present_with_active_regions(int nid);
 
+#define MOVABLEMEM_MAP_MAX MAX_NUMNODES
+struct movablemem_entry {
+       unsigned long start_pfn;    /* start pfn of memory segment */
+       unsigned long end_pfn;      /* end pfn of memory segment (exclusive) */
+};
+
+struct movablemem_map {
+       bool acpi;      /* true if using SRAT info */
+       int nr_map;
+       struct movablemem_entry map[MOVABLEMEM_MAP_MAX];
+       nodemask_t numa_nodes_hotplug;  /* on which nodes we specify memory */
+       nodemask_t numa_nodes_kernel;   /* on which nodes kernel resides in */
+};
+
+extern void __init insert_movablemem_map(unsigned long start_pfn,
+                                        unsigned long end_pfn);
+extern int __init movablemem_map_overlap(unsigned long start_pfn,
+                                        unsigned long end_pfn);
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
@@ -1395,6 +1387,9 @@ extern void setup_per_cpu_pageset(void);
 extern void zone_pcp_update(struct zone *zone);
 extern void zone_pcp_reset(struct zone *zone);
 
+/* page_alloc.c */
+extern int min_free_kbytes;
+
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
@@ -1472,13 +1467,24 @@ extern int install_special_mapping(struct mm_struct *mm,
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
-       unsigned long len, unsigned long flags,
-       vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap_pgoff(struct file *, unsigned long,
-        unsigned long, unsigned long,
-        unsigned long, unsigned long);
+       unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);
+extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+       unsigned long len, unsigned long prot, unsigned long flags,
+       unsigned long pgoff, unsigned long *populate);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
+#ifdef CONFIG_MMU
+extern int __mm_populate(unsigned long addr, unsigned long len,
+                        int ignore_errors);
+static inline void mm_populate(unsigned long addr, unsigned long len)
+{
+       /* Ignore errors */
+       (void) __mm_populate(addr, len, 1);
+}
+#else
+static inline void mm_populate(unsigned long addr, unsigned long len) {}
+#endif
+
 /* These take the mm semaphore themselves */
 extern unsigned long vm_brk(unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
@@ -1623,8 +1629,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn);
 
-struct page *follow_page(struct vm_area_struct *, unsigned long address,
-                       unsigned int foll_flags);
+struct page *follow_page_mask(struct vm_area_struct *vma,
+                             unsigned long address, unsigned int foll_flags,
+                             unsigned int *page_mask);
+
+static inline struct page *follow_page(struct vm_area_struct *vma,
+               unsigned long address, unsigned int foll_flags)
+{
+       unsigned int unused_page_mask;
+       return follow_page_mask(vma, address, foll_flags, &unused_page_mask);
+}
+
 #define FOLL_WRITE     0x01    /* check pte is writable */
 #define FOLL_TOUCH     0x02    /* mark page accessed */
 #define FOLL_GET       0x04    /* do get_page on page */
@@ -1636,6 +1651,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_SPLIT     0x80    /* don't return transhuge pages, split them */
 #define FOLL_HWPOISON  0x100   /* check page is hwpoisoned */
 #define FOLL_NUMA      0x200   /* force NUMA hinting page fault */
+#define FOLL_MIGRATION 0x400   /* wait for page to replace migration entry */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
                        void *data);
@@ -1707,7 +1723,11 @@ int vmemmap_populate_basepages(struct page *start_page,
                                                unsigned long pages, int node);
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
-
+#ifdef CONFIG_MEMORY_HOTPLUG
+void vmemmap_free(struct page *memmap, unsigned long nr_pages);
+#endif
+void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
+                                 unsigned long size);
 
 enum mf_flags {
        MF_COUNT_INCREASED = 1 << 0,
@@ -1720,7 +1740,7 @@ extern int unpoison_memory(unsigned long pfn);
 extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
-extern atomic_long_t mce_bad_pages;
+extern atomic_long_t num_poisoned_pages;
 extern int soft_offline_page(struct page *page, int flags);
 
 extern void dump_page(struct page *page);
index f8f5162..ace9a5f 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/cpumask.h>
 #include <linux/page-debug-flags.h>
 #include <linux/uprobes.h>
+#include <linux/page-flags-layout.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -173,7 +174,7 @@ struct page {
        void *shadow;
 #endif
 
-#ifdef CONFIG_NUMA_BALANCING
+#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
        int _last_nid;
 #endif
 }
@@ -414,9 +415,9 @@ struct mm_struct {
 #endif
 #ifdef CONFIG_NUMA_BALANCING
        /*
-        * numa_next_scan is the next time when the PTEs will me marked
-        * pte_numa to gather statistics and migrate pages to new nodes
-        * if necessary
+        * numa_next_scan is the next time that the PTEs will be marked
+        * pte_numa. NUMA hinting faults will gather statistics and migrate
+        * pages to new nodes if necessary.
         */
        unsigned long numa_next_scan;
 
index 9aa863d..61c7a87 100644 (file)
@@ -79,6 +79,8 @@ calc_vm_flag_bits(unsigned long flags)
 {
        return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
               _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
-              _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
+              ((flags & MAP_LOCKED) ? (VM_LOCKED | VM_POPULATE) : 0) |
+              (((flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) ?
+                                                       VM_POPULATE : 0);
 }
 #endif /* _LINUX_MMAN_H */
index 73b64a3..ede2749 100644 (file)
@@ -15,7 +15,7 @@
 #include <linux/seqlock.h>
 #include <linux/nodemask.h>
 #include <linux/pageblock-flags.h>
-#include <generated/bounds.h>
+#include <linux/page-flags-layout.h>
 #include <linux/atomic.h>
 #include <asm/page.h>
 
@@ -57,7 +57,9 @@ enum {
         */
        MIGRATE_CMA,
 #endif
+#ifdef CONFIG_MEMORY_ISOLATION
        MIGRATE_ISOLATE,        /* can't allocate from here */
+#endif
        MIGRATE_TYPES
 };
 
@@ -308,24 +310,6 @@ enum zone_type {
 
 #ifndef __GENERATING_BOUNDS_H
 
-/*
- * When a memory allocation must conform to specific limitations (such
- * as being suitable for DMA) the caller will pass in hints to the
- * allocator in the gfp_mask, in the zone modifier bits.  These bits
- * are used to select a priority ordered list of memory zones which
- * match the requested limits. See gfp_zone() in include/linux/gfp.h
- */
-
-#if MAX_NR_ZONES < 2
-#define ZONES_SHIFT 0
-#elif MAX_NR_ZONES <= 2
-#define ZONES_SHIFT 1
-#elif MAX_NR_ZONES <= 4
-#define ZONES_SHIFT 2
-#else
-#error ZONES_SHIFT -- too many zones configured adjust calculation
-#endif
-
 struct zone {
        /* Fields commonly accessed by the page allocator */
 
@@ -543,6 +527,26 @@ static inline int zone_is_oom_locked(const struct zone *zone)
        return test_bit(ZONE_OOM_LOCKED, &zone->flags);
 }
 
+static inline unsigned zone_end_pfn(const struct zone *zone)
+{
+       return zone->zone_start_pfn + zone->spanned_pages;
+}
+
+static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
+{
+       return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
+}
+
+static inline bool zone_is_initialized(struct zone *zone)
+{
+       return !!zone->wait_table;
+}
+
+static inline bool zone_is_empty(struct zone *zone)
+{
+       return zone->spanned_pages == 0;
+}
+
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -752,11 +756,17 @@ typedef struct pglist_data {
 #define nid_page_nr(nid, pagenr)       pgdat_page_nr(NODE_DATA(nid),(pagenr))
 
 #define node_start_pfn(nid)    (NODE_DATA(nid)->node_start_pfn)
+#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
 
-#define node_end_pfn(nid) ({\
-       pg_data_t *__pgdat = NODE_DATA(nid);\
-       __pgdat->node_start_pfn + __pgdat->node_spanned_pages;\
-})
+static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
+{
+       return pgdat->node_start_pfn + pgdat->node_spanned_pages;
+}
+
+static inline bool pgdat_is_empty(pg_data_t *pgdat)
+{
+       return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
+}
 
 #include <linux/memory_hotplug.h>
 
@@ -1053,8 +1063,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
  * PA_SECTION_SHIFT            physical address to/from section number
  * PFN_SECTION_SHIFT           pfn to/from section number
  */
-#define SECTIONS_SHIFT         (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
-
 #define PA_SECTION_SHIFT       (SECTION_SIZE_BITS)
 #define PFN_SECTION_SHIFT      (SECTION_SIZE_BITS - PAGE_SHIFT)
 
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
new file mode 100644 (file)
index 0000000..93506a1
--- /dev/null
@@ -0,0 +1,88 @@
+#ifndef PAGE_FLAGS_LAYOUT_H
+#define PAGE_FLAGS_LAYOUT_H
+
+#include <linux/numa.h>
+#include <generated/bounds.h>
+
+/*
+ * When a memory allocation must conform to specific limitations (such
+ * as being suitable for DMA) the caller will pass in hints to the
+ * allocator in the gfp_mask, in the zone modifier bits.  These bits
+ * are used to select a priority ordered list of memory zones which
+ * match the requested limits. See gfp_zone() in include/linux/gfp.h
+ */
+#if MAX_NR_ZONES < 2
+#define ZONES_SHIFT 0
+#elif MAX_NR_ZONES <= 2
+#define ZONES_SHIFT 1
+#elif MAX_NR_ZONES <= 4
+#define ZONES_SHIFT 2
+#else
+#error ZONES_SHIFT -- too many zones configured adjust calculation
+#endif
+
+#ifdef CONFIG_SPARSEMEM
+#include <asm/sparsemem.h>
+
+/* SECTION_SHIFT       #bits space required to store a section # */
+#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
+
+#endif /* CONFIG_SPARSEMEM */
+
+/*
+ * page->flags layout:
+ *
+ * There are five possibilities for how page->flags get laid out.  The first
+ * pair is for the normal case without sparsemem. The second pair is for
+ * sparsemem when there is plenty of space for node and section information.
+ * The last is when there is insufficient space in page->flags and a separate
+ * lookup is necessary.
+ *
+ * No sparsemem or sparsemem vmemmap: |       NODE     | ZONE |          ... | FLAGS |
+ *         " plus space for last_nid: |       NODE     | ZONE | LAST_NID ... | FLAGS |
+ * classic sparse with space for node:| SECTION | NODE | ZONE |          ... | FLAGS |
+ *         " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS |
+ * classic sparse no space for node:  | SECTION |     ZONE    | ... | FLAGS |
+ */
+#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+#define SECTIONS_WIDTH         SECTIONS_SHIFT
+#else
+#define SECTIONS_WIDTH         0
+#endif
+
+#define ZONES_WIDTH            ZONES_SHIFT
+
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#define NODES_WIDTH            NODES_SHIFT
+#else
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#error "Vmemmap: No space for nodes field in page flags"
+#endif
+#define NODES_WIDTH            0
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+#define LAST_NID_SHIFT NODES_SHIFT
+#else
+#define LAST_NID_SHIFT 0
+#endif
+
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#define LAST_NID_WIDTH LAST_NID_SHIFT
+#else
+#define LAST_NID_WIDTH 0
+#endif
+
+/*
+ * We are going to use the flags for the page to node mapping if its in
+ * there.  This includes the case where there is no node, so it is implicit.
+ */
+#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
+#define NODE_NOT_IN_PAGE_FLAGS
+#endif
+
+#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0
+#define LAST_NID_NOT_IN_PAGE_FLAGS
+#endif
+
+#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
index a92061e..3fff8e7 100644 (file)
@@ -1,6 +1,25 @@
 #ifndef __LINUX_PAGEISOLATION_H
 #define __LINUX_PAGEISOLATION_H
 
+#ifdef CONFIG_MEMORY_ISOLATION
+static inline bool is_migrate_isolate_page(struct page *page)
+{
+       return get_pageblock_migratetype(page) == MIGRATE_ISOLATE;
+}
+static inline bool is_migrate_isolate(int migratetype)
+{
+       return migratetype == MIGRATE_ISOLATE;
+}
+#else
+static inline bool is_migrate_isolate_page(struct page *page)
+{
+       return false;
+}
+static inline bool is_migrate_isolate(int migratetype)
+{
+       return false;
+}
+#endif
 
 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
                         bool skip_hwpoisoned_pages);
index 97bcf23..e5d7230 100644 (file)
@@ -537,6 +537,7 @@ struct dev_pm_info {
        unsigned int            irq_safe:1;
        unsigned int            use_autosuspend:1;
        unsigned int            timer_autosuspends:1;
+       unsigned int            memalloc_noio:1;
        enum rpm_request        request;
        enum rpm_status         runtime_status;
        int                     runtime_error;
index c785c21..7d7e09e 100644 (file)
@@ -47,6 +47,7 @@ extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
 extern unsigned long pm_runtime_autosuspend_expiration(struct device *dev);
 extern void pm_runtime_update_max_time_suspended(struct device *dev,
                                                 s64 delta_ns);
+extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable);
 
 static inline bool pm_children_suspended(struct device *dev)
 {
@@ -156,6 +157,8 @@ static inline void pm_runtime_set_autosuspend_delay(struct device *dev,
                                                int delay) {}
 static inline unsigned long pm_runtime_autosuspend_expiration(
                                struct device *dev) { return 0; }
+static inline void pm_runtime_set_memalloc_noio(struct device *dev,
+                                               bool enable){}
 
 #endif /* !CONFIG_PM_RUNTIME */
 
index c20635c..6dacb93 100644 (file)
@@ -123,7 +123,7 @@ static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
        down_write(&anon_vma->root->rwsem);
 }
 
-static inline void anon_vma_unlock(struct anon_vma *anon_vma)
+static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
 {
        up_write(&anon_vma->root->rwsem);
 }
index e4112aa..c2182b5 100644 (file)
@@ -51,6 +51,7 @@ struct sched_param {
 #include <linux/cred.h>
 #include <linux/llist.h>
 #include <linux/uidgid.h>
+#include <linux/gfp.h>
 
 #include <asm/processor.h>
 
@@ -1791,6 +1792,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define PF_FROZEN      0x00010000      /* frozen for system suspend */
 #define PF_FSTRANS     0x00020000      /* inside a filesystem transaction */
 #define PF_KSWAPD      0x00040000      /* I am kswapd */
+#define PF_MEMALLOC_NOIO 0x00080000    /* Allocating memory without IO involved */
 #define PF_LESS_THROTTLE 0x00100000    /* Throttle me less: I clean memory */
 #define PF_KTHREAD     0x00200000      /* I am a kernel thread */
 #define PF_RANDOMIZE   0x00400000      /* randomize virtual address space */
@@ -1828,6 +1830,26 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
 #define used_math() tsk_used_math(current)
 
+/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */
+static inline gfp_t memalloc_noio_flags(gfp_t flags)
+{
+       if (unlikely(current->flags & PF_MEMALLOC_NOIO))
+               flags &= ~__GFP_IO;
+       return flags;
+}
+
+static inline unsigned int memalloc_noio_save(void)
+{
+       unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
+       current->flags |= PF_MEMALLOC_NOIO;
+       return flags;
+}
+
+static inline void memalloc_noio_restore(unsigned int flags)
+{
+       current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
+}
+
 /*
  * task->jobctl flags
  */
index 68df9c1..2818a12 100644 (file)
@@ -8,7 +8,7 @@
 #include <linux/memcontrol.h>
 #include <linux/sched.h>
 #include <linux/node.h>
-
+#include <linux/fs.h>
 #include <linux/atomic.h>
 #include <asm/page.h>
 
@@ -156,7 +156,7 @@ enum {
        SWP_SCANNING    = (1 << 8),     /* refcount in scan_swap_map */
 };
 
-#define SWAP_CLUSTER_MAX 32
+#define SWAP_CLUSTER_MAX 32UL
 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
 
 /*
@@ -202,6 +202,18 @@ struct swap_info_struct {
        unsigned long *frontswap_map;   /* frontswap in-use, one bit per page */
        atomic_t frontswap_pages;       /* frontswap pages in-use counter */
 #endif
+       spinlock_t lock;                /*
+                                        * protect map scan related fields like
+                                        * swap_map, lowest_bit, highest_bit,
+                                        * inuse_pages, cluster_next,
+                                        * cluster_nr, lowest_alloc and
+                                        * highest_alloc. other fields are only
+                                        * changed at swapon/swapoff, so are
+                                        * protected by swap_lock. changing
+                                        * flags need hold this lock and
+                                        * swap_lock. If both locks need hold,
+                                        * hold swap_lock first.
+                                        */
 };
 
 struct swap_list_t {
@@ -209,15 +221,12 @@ struct swap_list_t {
        int next;       /* swapfile to be used next */
 };
 
-/* Swap 50% full? Release swapcache more aggressively.. */
-#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
-
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
 extern unsigned long dirty_balance_reserve;
-extern unsigned int nr_free_buffer_pages(void);
-extern unsigned int nr_free_pagecache_pages(void);
+extern unsigned long nr_free_buffer_pages(void);
+extern unsigned long nr_free_pagecache_pages(void);
 
 /* Definition of global_page_state not available yet */
 #define nr_free_pages() global_page_state(NR_FREE_PAGES)
@@ -266,7 +275,7 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
-extern long vm_total_pages;
+extern unsigned long vm_total_pages;
 
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
@@ -330,8 +339,9 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *,
                sector_t *);
 
 /* linux/mm/swap_state.c */
-extern struct address_space swapper_space;
-#define total_swapcache_pages  swapper_space.nrpages
+extern struct address_space swapper_spaces[];
+#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
+extern unsigned long total_swapcache_pages(void);
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *);
 extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
@@ -346,8 +356,20 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
                        struct vm_area_struct *vma, unsigned long addr);
 
 /* linux/mm/swapfile.c */
-extern long nr_swap_pages;
+extern atomic_long_t nr_swap_pages;
 extern long total_swap_pages;
+
+/* Swap 50% full? Release swapcache more aggressively.. */
+static inline bool vm_swap_full(void)
+{
+       return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages;
+}
+
+static inline long get_nr_swap_pages(void)
+{
+       return atomic_long_read(&nr_swap_pages);
+}
+
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
 extern swp_entry_t get_swap_page_of_type(int);
@@ -380,9 +402,10 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 #else /* CONFIG_SWAP */
 
-#define nr_swap_pages                          0L
+#define get_nr_swap_pages()                    0L
 #define total_swap_pages                       0L
-#define total_swapcache_pages                  0UL
+#define total_swapcache_pages()                        0UL
+#define vm_swap_full()                         0
 
 #define si_swapinfo(val) \
        do { (val)->freeswap = (val)->totalswap = 0; } while (0)
index fce0a27..bd6cf61 100644 (file)
@@ -36,7 +36,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #endif
                PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL,
                KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
-               KSWAPD_SKIP_CONGESTION_WAIT,
                PAGEOUTRUN, ALLOCSTALL, PGROTATED,
 #ifdef CONFIG_NUMA_BALANCING
                NUMA_PTE_UPDATES,
index a13291f..5fd71a7 100644 (file)
@@ -85,7 +85,7 @@ static inline void vm_events_fold_cpu(int cpu)
 #define count_vm_numa_events(x, y) count_vm_events(x, y)
 #else
 #define count_vm_numa_event(x) do {} while (0)
-#define count_vm_numa_events(x, y) do {} while (0)
+#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
 #endif /* CONFIG_NUMA_BALANCING */
 
 #define __count_zone_vm_events(item, zone, delta) \
index 4fa6d8f..be3ec9a 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -967,11 +967,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
        unsigned long flags;
        unsigned long prot;
        int acc_mode;
-       unsigned long user_addr;
        struct ipc_namespace *ns;
        struct shm_file_data *sfd;
        struct path path;
        fmode_t f_mode;
+       unsigned long populate = 0;
 
        err = -EINVAL;
        if (shmid < 0)
@@ -1070,13 +1070,15 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
                        goto invalid;
        }
                
-       user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0);
-       *raddr = user_addr;
+       addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
+       *raddr = addr;
        err = 0;
-       if (IS_ERR_VALUE(user_addr))
-               err = (long)user_addr;
+       if (IS_ERR_VALUE(addr))
+               err = (long)addr;
 invalid:
        up_write(&current->mm->mmap_sem);
+       if (populate)
+               mm_populate(addr, populate);
 
 out_fput:
        fput(file);
index 3a673a3..053dfd7 100644 (file)
@@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
  */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
-       const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+       int nid = cpu_to_node(cpu);
+       const struct cpumask *nodemask = NULL;
        enum { cpuset, possible, fail } state = cpuset;
        int dest_cpu;
 
-       /* Look for allowed, online CPU in same node. */
-       for_each_cpu(dest_cpu, nodemask) {
-               if (!cpu_online(dest_cpu))
-                       continue;
-               if (!cpu_active(dest_cpu))
-                       continue;
-               if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-                       return dest_cpu;
+       /*
+        * If the node that the cpu is on has been offlined, cpu_to_node()
+        * will return -1. There is no cpu on the node, and we should
+        * select the cpu on the other node.
+        */
+       if (nid != -1) {
+               nodemask = cpumask_of_node(nid);
+
+               /* Look for allowed, online CPU in same node. */
+               for_each_cpu(dest_cpu, nodemask) {
+                       if (!cpu_online(dest_cpu))
+                               continue;
+                       if (!cpu_active(dest_cpu))
+                               continue;
+                       if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+                               return dest_cpu;
+               }
        }
 
        for (;;) {
index 467d8b9..95e9e55 100644 (file)
@@ -105,7 +105,6 @@ extern char core_pattern[];
 extern unsigned int core_pipe_limit;
 #endif
 extern int pid_max;
-extern int min_free_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
index 0b23db9..2c7aea7 100644 (file)
@@ -162,10 +162,16 @@ config MOVABLE_NODE
          Say Y here if you want to hotplug a whole node.
          Say N here if you want kernel to use memory on all nodes evenly.
 
+#
+# Only be set on architectures that have completely implemented memory hotplug
+# feature. If you are not sure, don't touch it.
+#
+config HAVE_BOOTMEM_INFO_NODE
+       def_bool n
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
        bool "Allow for memory hot-add"
-       select MEMORY_ISOLATION
        depends on SPARSEMEM || X86_64_ACPI_NUMA
        depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
        depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -176,6 +182,8 @@ config MEMORY_HOTPLUG_SPARSE
 
 config MEMORY_HOTREMOVE
        bool "Allow for memory hot remove"
+       select MEMORY_ISOLATION
+       select HAVE_BOOTMEM_INFO_NODE if X86_64
        depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
        depends on MIGRATION
 
index c62bd06..05ccb4c 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/sysctl.h>
 #include <linux/sysfs.h>
 #include <linux/balloon_compaction.h>
+#include <linux/page-isolation.h>
 #include "internal.h"
 
 #ifdef CONFIG_COMPACTION
@@ -85,7 +86,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
 static void __reset_isolation_suitable(struct zone *zone)
 {
        unsigned long start_pfn = zone->zone_start_pfn;
-       unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+       unsigned long end_pfn = zone_end_pfn(zone);
        unsigned long pfn;
 
        zone->compact_cached_migrate_pfn = start_pfn;
@@ -215,7 +216,10 @@ static bool suitable_migration_target(struct page *page)
        int migratetype = get_pageblock_migratetype(page);
 
        /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
-       if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
+       if (migratetype == MIGRATE_RESERVE)
+               return false;
+
+       if (is_migrate_isolate(migratetype))
                return false;
 
        /* If the page is a large free page, then allow migration */
@@ -611,8 +615,7 @@ check_compact_cluster:
                continue;
 
 next_pageblock:
-               low_pfn += pageblock_nr_pages;
-               low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+               low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
                last_pageblock_nr = pageblock_nr;
        }
 
@@ -644,7 +647,7 @@ static void isolate_freepages(struct zone *zone,
                                struct compact_control *cc)
 {
        struct page *page;
-       unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
+       unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn;
        int nr_freepages = cc->nr_freepages;
        struct list_head *freelist = &cc->freepages;
 
@@ -663,7 +666,7 @@ static void isolate_freepages(struct zone *zone,
         */
        high_pfn = min(low_pfn, pfn);
 
-       zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+       z_end_pfn = zone_end_pfn(zone);
 
        /*
         * Isolate free pages until enough are available to migrate the
@@ -706,7 +709,7 @@ static void isolate_freepages(struct zone *zone,
                 * only scans within a pageblock
                 */
                end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
-               end_pfn = min(end_pfn, zone_end_pfn);
+               end_pfn = min(end_pfn, z_end_pfn);
                isolated = isolate_freepages_block(cc, pfn, end_pfn,
                                                   freelist, false);
                nr_freepages += isolated;
@@ -795,7 +798,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
        low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
 
        /* Only scan within a pageblock boundary */
-       end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
+       end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
 
        /* Do not cross the free scanner or scan within a memory hole */
        if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
@@ -920,7 +923,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
        int ret;
        unsigned long start_pfn = zone->zone_start_pfn;
-       unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+       unsigned long end_pfn = zone_end_pfn(zone);
 
        ret = compaction_suitable(zone, cc->order);
        switch (ret) {
@@ -977,7 +980,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
                nr_migrate = cc->nr_migratepages;
                err = migrate_pages(&cc->migratepages, compaction_alloc,
-                               (unsigned long)cc, false,
+                               (unsigned long)cc,
                                cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
                                MR_COMPACTION);
                update_nr_listpages(cc);
@@ -1086,7 +1089,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 
 
 /* Compact all zones within a node */
-static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
+static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 {
        int zoneid;
        struct zone *zone;
@@ -1119,28 +1122,26 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
                VM_BUG_ON(!list_empty(&cc->freepages));
                VM_BUG_ON(!list_empty(&cc->migratepages));
        }
-
-       return 0;
 }
 
-int compact_pgdat(pg_data_t *pgdat, int order)
+void compact_pgdat(pg_data_t *pgdat, int order)
 {
        struct compact_control cc = {
                .order = order,
                .sync = false,
        };
 
-       return __compact_pgdat(pgdat, &cc);
+       __compact_pgdat(pgdat, &cc);
 }
 
-static int compact_node(int nid)
+static void compact_node(int nid)
 {
        struct compact_control cc = {
                .order = -1,
                .sync = true,
        };
 
-       return __compact_pgdat(NODE_DATA(nid), &cc);
+       __compact_pgdat(NODE_DATA(nid), &cc);
 }
 
 /* Compact all nodes in the system */
index a47f0f5..909ec55 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/fadvise.h>
 #include <linux/writeback.h>
 #include <linux/syscalls.h>
+#include <linux/swap.h>
 
 #include <asm/unistd.h>
 
@@ -120,9 +121,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
                start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
                end_index = (endbyte >> PAGE_CACHE_SHIFT);
 
-               if (end_index >= start_index)
-                       invalidate_mapping_pages(mapping, start_index,
+               if (end_index >= start_index) {
+                       unsigned long count = invalidate_mapping_pages(mapping,
+                                               start_index, end_index);
+
+                       /*
+                        * If fewer pages were invalidated than expected then
+                        * it is possible that some of the pages were on
+                        * a per-cpu pagevec for a remote CPU. Drain all
+                        * pagevecs and try again.
+                        */
+                       if (count < (end_index - start_index + 1)) {
+                               lru_add_drain_all();
+                               invalidate_mapping_pages(mapping, start_index,
                                                end_index);
+                       }
+               }
                break;
        default:
                ret = -EINVAL;
index a0aaf0e..0cd4c11 100644 (file)
@@ -129,6 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
        struct vm_area_struct *vma;
        int err = -EINVAL;
        int has_write_lock = 0;
+       vm_flags_t vm_flags;
 
        if (prot)
                return err;
@@ -160,15 +161,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
        /*
         * Make sure the vma is shared, that it supports prefaulting,
         * and that the remapped range is valid and fully within
-        * the single existing vma.  vm_private_data is used as a
-        * swapout cursor in a VM_NONLINEAR vma.
+        * the single existing vma.
         */
        if (!vma || !(vma->vm_flags & VM_SHARED))
                goto out;
 
-       if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
-               goto out;
-
        if (!vma->vm_ops || !vma->vm_ops->remap_pages)
                goto out;
 
@@ -177,6 +174,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 
        /* Must set VM_NONLINEAR before any pages are populated. */
        if (!(vma->vm_flags & VM_NONLINEAR)) {
+               /*
+                * vm_private_data is used as a swapout cursor
+                * in a VM_NONLINEAR vma.
+                */
+               if (vma->vm_private_data)
+                       goto out;
+
                /* Don't need a nonlinear mapping, exit success */
                if (pgoff == linear_page_index(vma, start)) {
                        err = 0;
@@ -184,6 +188,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                }
 
                if (!has_write_lock) {
+get_write_lock:
                        up_read(&mm->mmap_sem);
                        down_write(&mm->mmap_sem);
                        has_write_lock = 1;
@@ -199,9 +204,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                        unsigned long addr;
                        struct file *file = get_file(vma->vm_file);
 
-                       flags &= MAP_NONBLOCK;
-                       addr = mmap_region(file, start, size,
-                                       flags, vma->vm_flags, pgoff);
+                       vm_flags = vma->vm_flags;
+                       if (!(flags & MAP_NONBLOCK))
+                               vm_flags |= VM_POPULATE;
+                       addr = mmap_region(file, start, size, vm_flags, pgoff);
                        fput(file);
                        if (IS_ERR_VALUE(addr)) {
                                err = addr;
@@ -220,32 +226,26 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                mutex_unlock(&mapping->i_mmap_mutex);
        }
 
+       if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) {
+               if (!has_write_lock)
+                       goto get_write_lock;
+               vma->vm_flags |= VM_POPULATE;
+       }
+
        if (vma->vm_flags & VM_LOCKED) {
                /*
                 * drop PG_Mlocked flag for over-mapped range
                 */
-               vm_flags_t saved_flags = vma->vm_flags;
+               if (!has_write_lock)
+                       goto get_write_lock;
+               vm_flags = vma->vm_flags;
                munlock_vma_pages_range(vma, start, start + size);
-               vma->vm_flags = saved_flags;
+               vma->vm_flags = vm_flags;
        }
 
        mmu_notifier_invalidate_range_start(mm, start, start + size);
        err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
        mmu_notifier_invalidate_range_end(mm, start, start + size);
-       if (!err && !(flags & MAP_NONBLOCK)) {
-               if (vma->vm_flags & VM_LOCKED) {
-                       /*
-                        * might be mapping previously unmapped range of file
-                        */
-                       mlock_vma_pages_range(vma, start, start + size);
-               } else {
-                       if (unlikely(has_write_lock)) {
-                               downgrade_write(&mm->mmap_sem);
-                               has_write_lock = 0;
-                       }
-                       make_pages_present(start, start+size);
-               }
-       }
 
        /*
         * We can't clear VM_NONLINEAR because we'd have to do
@@ -254,10 +254,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
         */
 
 out:
+       vm_flags = vma->vm_flags;
        if (likely(!has_write_lock))
                up_read(&mm->mmap_sem);
        else
                up_write(&mm->mmap_sem);
+       if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
+               mm_populate(start, size);
 
        return err;
 }
index b5783d8..bfa142e 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/migrate.h>
+#include <linux/hashtable.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -62,12 +63,11 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
 static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
 
 static int khugepaged(void *none);
-static int mm_slots_hash_init(void);
 static int khugepaged_slab_init(void);
-static void khugepaged_slab_free(void);
 
-#define MM_SLOTS_HASH_HEADS 1024
-static struct hlist_head *mm_slots_hash __read_mostly;
+#define MM_SLOTS_HASH_BITS 10
+static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+
 static struct kmem_cache *mm_slot_cache __read_mostly;
 
 /**
@@ -105,7 +105,6 @@ static int set_recommended_min_free_kbytes(void)
        struct zone *zone;
        int nr_zones = 0;
        unsigned long recommended_min;
-       extern int min_free_kbytes;
 
        if (!khugepaged_enabled())
                return 0;
@@ -634,12 +633,6 @@ static int __init hugepage_init(void)
        if (err)
                goto out;
 
-       err = mm_slots_hash_init();
-       if (err) {
-               khugepaged_slab_free();
-               goto out;
-       }
-
        register_shrinker(&huge_zero_page_shrinker);
 
        /*
@@ -1302,7 +1295,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        int target_nid;
        int current_nid = -1;
        bool migrated;
-       bool page_locked = false;
 
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(pmd, *pmdp)))
@@ -1324,7 +1316,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        /* Acquire the page lock to serialise THP migrations */
        spin_unlock(&mm->page_table_lock);
        lock_page(page);
-       page_locked = true;
 
        /* Confirm the PTE did not while locked */
        spin_lock(&mm->page_table_lock);
@@ -1337,34 +1328,26 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        /* Migrate the THP to the requested node */
        migrated = migrate_misplaced_transhuge_page(mm, vma,
-                               pmdp, pmd, addr,
-                               page, target_nid);
-       if (migrated)
-               current_nid = target_nid;
-       else {
-               spin_lock(&mm->page_table_lock);
-               if (unlikely(!pmd_same(pmd, *pmdp))) {
-                       unlock_page(page);
-                       goto out_unlock;
-               }
-               goto clear_pmdnuma;
-       }
+                               pmdp, pmd, addr, page, target_nid);
+       if (!migrated)
+               goto check_same;
 
-       task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+       task_numa_fault(target_nid, HPAGE_PMD_NR, true);
        return 0;
 
+check_same:
+       spin_lock(&mm->page_table_lock);
+       if (unlikely(!pmd_same(pmd, *pmdp)))
+               goto out_unlock;
 clear_pmdnuma:
        pmd = pmd_mknonnuma(pmd);
        set_pmd_at(mm, haddr, pmdp, pmd);
        VM_BUG_ON(pmd_numa(*pmdp));
        update_mmu_cache_pmd(vma, addr, pmdp);
-       if (page_locked)
-               unlock_page(page);
-
 out_unlock:
        spin_unlock(&mm->page_table_lock);
        if (current_nid != -1)
-               task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+               task_numa_fault(current_nid, HPAGE_PMD_NR, false);
        return 0;
 }
 
@@ -1656,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page)
                page_tail->mapping = page->mapping;
 
                page_tail->index = page->index + i;
-               page_xchg_last_nid(page_tail, page_last_nid(page));
+               page_nid_xchg_last(page_tail, page_nid_last(page));
 
                BUG_ON(!PageAnon(page_tail));
                BUG_ON(!PageUptodate(page_tail));
@@ -1846,7 +1829,7 @@ int split_huge_page(struct page *page)
 
        BUG_ON(PageCompound(page));
 out_unlock:
-       anon_vma_unlock(anon_vma);
+       anon_vma_unlock_write(anon_vma);
        put_anon_vma(anon_vma);
 out:
        return ret;
@@ -1908,12 +1891,6 @@ static int __init khugepaged_slab_init(void)
        return 0;
 }
 
-static void __init khugepaged_slab_free(void)
-{
-       kmem_cache_destroy(mm_slot_cache);
-       mm_slot_cache = NULL;
-}
-
 static inline struct mm_slot *alloc_mm_slot(void)
 {
        if (!mm_slot_cache)     /* initialization failed */
@@ -1926,47 +1903,23 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
        kmem_cache_free(mm_slot_cache, mm_slot);
 }
 
-static int __init mm_slots_hash_init(void)
-{
-       mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
-                               GFP_KERNEL);
-       if (!mm_slots_hash)
-               return -ENOMEM;
-       return 0;
-}
-
-#if 0
-static void __init mm_slots_hash_free(void)
-{
-       kfree(mm_slots_hash);
-       mm_slots_hash = NULL;
-}
-#endif
-
 static struct mm_slot *get_mm_slot(struct mm_struct *mm)
 {
        struct mm_slot *mm_slot;
-       struct hlist_head *bucket;
        struct hlist_node *node;
 
-       bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
-                               % MM_SLOTS_HASH_HEADS];
-       hlist_for_each_entry(mm_slot, node, bucket, hash) {
+       hash_for_each_possible(mm_slots_hash, mm_slot, node, hash, (unsigned long)mm)
                if (mm == mm_slot->mm)
                        return mm_slot;
-       }
+
        return NULL;
 }
 
 static void insert_to_mm_slots_hash(struct mm_struct *mm,
                                    struct mm_slot *mm_slot)
 {
-       struct hlist_head *bucket;
-
-       bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
-                               % MM_SLOTS_HASH_HEADS];
        mm_slot->mm = mm;
-       hlist_add_head(&mm_slot->hash, bucket);
+       hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
 }
 
 static inline int khugepaged_test_exit(struct mm_struct *mm)
@@ -2035,7 +1988,7 @@ void __khugepaged_exit(struct mm_struct *mm)
        spin_lock(&khugepaged_mm_lock);
        mm_slot = get_mm_slot(mm);
        if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
-               hlist_del(&mm_slot->hash);
+               hash_del(&mm_slot->hash);
                list_del(&mm_slot->mm_node);
                free = 1;
        }
@@ -2368,7 +2321,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                BUG_ON(!pmd_none(*pmd));
                set_pmd_at(mm, address, pmd, _pmd);
                spin_unlock(&mm->page_table_lock);
-               anon_vma_unlock(vma->anon_vma);
+               anon_vma_unlock_write(vma->anon_vma);
                goto out;
        }
 
@@ -2376,7 +2329,7 @@ static void collapse_huge_page(struct mm_struct *mm,
         * All pages are isolated and locked so anon_vma rmap
         * can't run anymore.
         */
-       anon_vma_unlock(vma->anon_vma);
+       anon_vma_unlock_write(vma->anon_vma);
 
        __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
        pte_unmap(pte);
@@ -2423,7 +2376,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
        struct page *page;
        unsigned long _address;
        spinlock_t *ptl;
-       int node = -1;
+       int node = NUMA_NO_NODE;
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -2453,7 +2406,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                 * be more sophisticated and look at more pages,
                 * but isn't for now.
                 */
-               if (node == -1)
+               if (node == NUMA_NO_NODE)
                        node = page_to_nid(page);
                VM_BUG_ON(PageCompound(page));
                if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
@@ -2484,7 +2437,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
 
        if (khugepaged_test_exit(mm)) {
                /* free mm_slot */
-               hlist_del(&mm_slot->hash);
+               hash_del(&mm_slot->hash);
                list_del(&mm_slot->mm_node);
 
                /*
index 546db81..cdb64e4 100644 (file)
@@ -1293,8 +1293,7 @@ static void __init report_hugepages(void)
 
        for_each_hstate(h) {
                char buf[32];
-               printk(KERN_INFO "HugeTLB registered %s page size, "
-                                "pre-allocated %ld pages\n",
+               pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
                        memfmt(buf, huge_page_size(h)),
                        h->free_huge_pages);
        }
@@ -1702,8 +1701,7 @@ static void __init hugetlb_sysfs_init(void)
                err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
                                         hstate_kobjs, &hstate_attr_group);
                if (err)
-                       printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
-                                                               h->name);
+                       pr_err("Hugetlb: Unable to add hstate %s", h->name);
        }
 }
 
@@ -1826,9 +1824,8 @@ void hugetlb_register_node(struct node *node)
                                                nhs->hstate_kobjs,
                                                &per_node_hstate_attr_group);
                if (err) {
-                       printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
-                                       " for node %d\n",
-                                               h->name, node->dev.id);
+                       pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
+                               h->name, node->dev.id);
                        hugetlb_unregister_node(node);
                        break;
                }
@@ -1924,7 +1921,7 @@ void __init hugetlb_add_hstate(unsigned order)
        unsigned long i;
 
        if (size_to_hstate(PAGE_SIZE << order)) {
-               printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
+               pr_warning("hugepagesz= specified twice, ignoring\n");
                return;
        }
        BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -1960,8 +1957,8 @@ static int __init hugetlb_nrpages_setup(char *s)
                mhp = &parsed_hstate->max_huge_pages;
 
        if (mhp == last_mhp) {
-               printk(KERN_WARNING "hugepages= specified twice without "
-                       "interleaving hugepagesz=, ignoring\n");
+               pr_warning("hugepages= specified twice without "
+                          "interleaving hugepagesz=, ignoring\n");
                return 1;
        }
 
@@ -2692,9 +2689,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
         * COW. Warn that such a situation has occurred as it may not be obvious
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
-               printk(KERN_WARNING
-                       "PID %d killed due to inadequate hugepage pool\n",
-                       current->pid);
+               pr_warning("PID %d killed due to inadequate hugepage pool\n",
+                          current->pid);
                return ret;
        }
 
@@ -2924,14 +2920,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
        return NULL;
 }
 
-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                       struct page **pages, struct vm_area_struct **vmas,
-                       unsigned long *position, int *length, int i,
-                       unsigned int flags)
+long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                        struct page **pages, struct vm_area_struct **vmas,
+                        unsigned long *position, unsigned long *nr_pages,
+                        long i, unsigned int flags)
 {
        unsigned long pfn_offset;
        unsigned long vaddr = *position;
-       int remainder = *length;
+       unsigned long remainder = *nr_pages;
        struct hstate *h = hstate_vma(vma);
 
        spin_lock(&mm->page_table_lock);
@@ -3001,7 +2997,7 @@ same_page:
                }
        }
        spin_unlock(&mm->page_table_lock);
-       *length = remainder;
+       *nr_pages = remainder;
        *position = vaddr;
 
        return i ? i : -EFAULT;
index 9ba2110..1c0c4cc 100644 (file)
@@ -162,8 +162,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
                struct vm_area_struct *prev, struct rb_node *rb_parent);
 
 #ifdef CONFIG_MMU
-extern long mlock_vma_pages_range(struct vm_area_struct *vma,
-                       unsigned long start, unsigned long end);
+extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
+               unsigned long start, unsigned long end, int *nonblocking);
 extern void munlock_vma_pages_range(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end);
 static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
index 752a705..83dd5fb 100644 (file)
@@ -1300,9 +1300,8 @@ static void kmemleak_scan(void)
         */
        lock_memory_hotplug();
        for_each_online_node(i) {
-               pg_data_t *pgdat = NODE_DATA(i);
-               unsigned long start_pfn = pgdat->node_start_pfn;
-               unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
+               unsigned long start_pfn = node_start_pfn(i);
+               unsigned long end_pfn = node_end_pfn(i);
                unsigned long pfn;
 
                for (pfn = start_pfn; pfn < end_pfn; pfn++) {
index 5157385..ab2ba9a 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
 #include <linux/mmu_notifier.h>
 #include <linux/swap.h>
 #include <linux/ksm.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
 #include <linux/freezer.h>
 #include <linux/oom.h>
+#include <linux/numa.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
 
+#ifdef CONFIG_NUMA
+#define NUMA(x)                (x)
+#define DO_NUMA(x)     do { (x); } while (0)
+#else
+#define NUMA(x)                (0)
+#define DO_NUMA(x)     do { } while (0)
+#endif
+
 /*
  * A few notes about the KSM scanning process,
  * to make it easier to understand the data structures below:
@@ -78,6 +87,9 @@
  *    take 10 attempts to find a page in the unstable tree, once it is found,
  *    it is secured in the stable tree.  (When we scan a new page, we first
  *    compare it against the stable tree, and then against the unstable tree.)
+ *
+ * If the merge_across_nodes tunable is unset, then KSM maintains multiple
+ * stable trees and multiple unstable trees: one of each for each NUMA node.
  */
 
 /**
@@ -113,19 +125,32 @@ struct ksm_scan {
 /**
  * struct stable_node - node of the stable rbtree
  * @node: rb node of this ksm page in the stable tree
+ * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
+ * @list: linked into migrate_nodes, pending placement in the proper node tree
  * @hlist: hlist head of rmap_items using this ksm page
- * @kpfn: page frame number of this ksm page
+ * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
+ * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
  */
 struct stable_node {
-       struct rb_node node;
+       union {
+               struct rb_node node;    /* when node of stable tree */
+               struct {                /* when listed for migration */
+                       struct list_head *head;
+                       struct list_head list;
+               };
+       };
        struct hlist_head hlist;
        unsigned long kpfn;
+#ifdef CONFIG_NUMA
+       int nid;
+#endif
 };
 
 /**
  * struct rmap_item - reverse mapping item for virtual addresses
  * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
  * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
+ * @nid: NUMA node id of unstable tree in which linked (may not match page)
  * @mm: the memory structure this rmap_item is pointing into
  * @address: the virtual address this rmap_item tracks (+ flags in low bits)
  * @oldchecksum: previous checksum of the page at that virtual address
@@ -135,7 +160,12 @@ struct stable_node {
  */
 struct rmap_item {
        struct rmap_item *rmap_list;
-       struct anon_vma *anon_vma;      /* when stable */
+       union {
+               struct anon_vma *anon_vma;      /* when stable */
+#ifdef CONFIG_NUMA
+               int nid;                /* when node of unstable tree */
+#endif
+       };
        struct mm_struct *mm;
        unsigned long address;          /* + low bits used for flags below */
        unsigned int oldchecksum;       /* when unstable */
@@ -153,12 +183,16 @@ struct rmap_item {
 #define STABLE_FLAG    0x200   /* is listed from the stable tree */
 
 /* The stable and unstable tree heads */
-static struct rb_root root_stable_tree = RB_ROOT;
-static struct rb_root root_unstable_tree = RB_ROOT;
+static struct rb_root one_stable_tree[1] = { RB_ROOT };
+static struct rb_root one_unstable_tree[1] = { RB_ROOT };
+static struct rb_root *root_stable_tree = one_stable_tree;
+static struct rb_root *root_unstable_tree = one_unstable_tree;
 
-#define MM_SLOTS_HASH_SHIFT 10
-#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
-static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
+/* Recently migrated nodes of stable tree, pending proper placement */
+static LIST_HEAD(migrate_nodes);
+
+#define MM_SLOTS_HASH_BITS 10
+static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
 
 static struct mm_slot ksm_mm_head = {
        .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -189,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100;
 /* Milliseconds ksmd should sleep between batches */
 static unsigned int ksm_thread_sleep_millisecs = 20;
 
+#ifdef CONFIG_NUMA
+/* Zeroed when merging across nodes is not allowed */
+static unsigned int ksm_merge_across_nodes = 1;
+static int ksm_nr_node_ids = 1;
+#else
+#define ksm_merge_across_nodes 1U
+#define ksm_nr_node_ids                1
+#endif
+
 #define KSM_RUN_STOP   0
 #define KSM_RUN_MERGE  1
 #define KSM_RUN_UNMERGE        2
-static unsigned int ksm_run = KSM_RUN_STOP;
+#define KSM_RUN_OFFLINE        4
+static unsigned long ksm_run = KSM_RUN_STOP;
+static void wait_while_offlining(void);
 
 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
 static DEFINE_MUTEX(ksm_thread_mutex);
@@ -275,31 +320,21 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
 
 static struct mm_slot *get_mm_slot(struct mm_struct *mm)
 {
-       struct mm_slot *mm_slot;
-       struct hlist_head *bucket;
        struct hlist_node *node;
+       struct mm_slot *slot;
+
+       hash_for_each_possible(mm_slots_hash, slot, node, link, (unsigned long)mm)
+               if (slot->mm == mm)
+                       return slot;
 
-       bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
-       hlist_for_each_entry(mm_slot, node, bucket, link) {
-               if (mm == mm_slot->mm)
-                       return mm_slot;
-       }
        return NULL;
 }
 
 static void insert_to_mm_slots_hash(struct mm_struct *mm,
                                    struct mm_slot *mm_slot)
 {
-       struct hlist_head *bucket;
-
-       bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
        mm_slot->mm = mm;
-       hlist_add_head(&mm_slot->link, bucket);
-}
-
-static inline int in_stable_tree(struct rmap_item *rmap_item)
-{
-       return rmap_item->address & STABLE_FLAG;
+       hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
 }
 
 /*
@@ -333,7 +368,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 
        do {
                cond_resched();
-               page = follow_page(vma, addr, FOLL_GET);
+               page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
                if (IS_ERR_OR_NULL(page))
                        break;
                if (PageKsm(page))
@@ -447,6 +482,17 @@ out:               page = NULL;
        return page;
 }
 
+/*
+ * This helper is used for getting right index into array of tree roots.
+ * When merge_across_nodes knob is set to 1, there are only two rb-trees for
+ * stable and unstable pages from all nodes with roots in index 0. Otherwise,
+ * every node has its own stable and unstable tree.
+ */
+static inline int get_kpfn_nid(unsigned long kpfn)
+{
+       return ksm_merge_across_nodes ? 0 : pfn_to_nid(kpfn);
+}
+
 static void remove_node_from_stable_tree(struct stable_node *stable_node)
 {
        struct rmap_item *rmap_item;
@@ -462,7 +508,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
                cond_resched();
        }
 
-       rb_erase(&stable_node->node, &root_stable_tree);
+       if (stable_node->head == &migrate_nodes)
+               list_del(&stable_node->list);
+       else
+               rb_erase(&stable_node->node,
+                        root_stable_tree + NUMA(stable_node->nid));
        free_stable_node(stable_node);
 }
 
@@ -472,6 +522,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
  * In which case we can trust the content of the page, and it
  * returns the gotten page; but if the page has now been zapped,
  * remove the stale node from the stable tree and return NULL.
+ * But beware, the stable node's page might be being migrated.
  *
  * You would expect the stable_node to hold a reference to the ksm page.
  * But if it increments the page's count, swapping out has to wait for
@@ -482,40 +533,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
  * pointing back to this stable node.  This relies on freeing a PageAnon
  * page to reset its page->mapping to NULL, and relies on no other use of
  * a page to put something that might look like our key in page->mapping.
- *
- * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
- * but this is different - made simpler by ksm_thread_mutex being held, but
- * interesting for assuming that no other use of the struct page could ever
- * put our expected_mapping into page->mapping (or a field of the union which
- * coincides with page->mapping).  The RCU calls are not for KSM at all, but
- * to keep the page_count protocol described with page_cache_get_speculative.
- *
- * Note: it is possible that get_ksm_page() will return NULL one moment,
- * then page the next, if the page is in between page_freeze_refs() and
- * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
  * is on its way to being freed; but it is an anomaly to bear in mind.
  */
-static struct page *get_ksm_page(struct stable_node *stable_node)
+static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
 {
        struct page *page;
        void *expected_mapping;
+       unsigned long kpfn;
 
-       page = pfn_to_page(stable_node->kpfn);
        expected_mapping = (void *)stable_node +
                                (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
-       rcu_read_lock();
-       if (page->mapping != expected_mapping)
-               goto stale;
-       if (!get_page_unless_zero(page))
+again:
+       kpfn = ACCESS_ONCE(stable_node->kpfn);
+       page = pfn_to_page(kpfn);
+
+       /*
+        * page is computed from kpfn, so on most architectures reading
+        * page->mapping is naturally ordered after reading node->kpfn,
+        * but on Alpha we need to be more careful.
+        */
+       smp_read_barrier_depends();
+       if (ACCESS_ONCE(page->mapping) != expected_mapping)
                goto stale;
-       if (page->mapping != expected_mapping) {
+
+       /*
+        * We cannot do anything with the page while its refcount is 0.
+        * Usually 0 means free, or tail of a higher-order page: in which
+        * case this node is no longer referenced, and should be freed;
+        * however, it might mean that the page is under page_freeze_refs().
+        * The __remove_mapping() case is easy, again the node is now stale;
+        * but if page is swapcache in migrate_page_move_mapping(), it might
+        * still be our page, in which case it's essential to keep the node.
+        */
+       while (!get_page_unless_zero(page)) {
+               /*
+                * Another check for page->mapping != expected_mapping would
+                * work here too.  We have chosen the !PageSwapCache test to
+                * optimize the common case, when the page is or is about to
+                * be freed: PageSwapCache is cleared (under spin_lock_irq)
+                * in the freeze_refs section of __remove_mapping(); but Anon
+                * page->mapping reset to NULL later, in free_pages_prepare().
+                */
+               if (!PageSwapCache(page))
+                       goto stale;
+               cpu_relax();
+       }
+
+       if (ACCESS_ONCE(page->mapping) != expected_mapping) {
                put_page(page);
                goto stale;
        }
-       rcu_read_unlock();
+
+       if (lock_it) {
+               lock_page(page);
+               if (ACCESS_ONCE(page->mapping) != expected_mapping) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto stale;
+               }
+       }
        return page;
+
 stale:
-       rcu_read_unlock();
+       /*
+        * We come here from above when page->mapping or !PageSwapCache
+        * suggests that the node is stale; but it might be under migration.
+        * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
+        * before checking whether node->kpfn has been changed.
+        */
+       smp_rmb();
+       if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
+               goto again;
        remove_node_from_stable_tree(stable_node);
        return NULL;
 }
@@ -531,11 +619,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
                struct page *page;
 
                stable_node = rmap_item->head;
-               page = get_ksm_page(stable_node);
+               page = get_ksm_page(stable_node, true);
                if (!page)
                        goto out;
 
-               lock_page(page);
                hlist_del(&rmap_item->hlist);
                unlock_page(page);
                put_page(page);
@@ -560,8 +647,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
                age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
                BUG_ON(age > 1);
                if (!age)
-                       rb_erase(&rmap_item->node, &root_unstable_tree);
-
+                       rb_erase(&rmap_item->node,
+                                root_unstable_tree + NUMA(rmap_item->nid));
                ksm_pages_unshared--;
                rmap_item->address &= PAGE_MASK;
        }
@@ -581,7 +668,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
 }
 
 /*
- * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
+ * Though it's very tempting to unmerge rmap_items from stable tree rather
  * than check every pte of a given vma, the locking doesn't quite work for
  * that - an rmap_item is assigned to the stable tree after inserting ksm
  * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
@@ -614,6 +701,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
 /*
  * Only called through the sysfs control interface:
  */
+static int remove_stable_node(struct stable_node *stable_node)
+{
+       struct page *page;
+       int err;
+
+       page = get_ksm_page(stable_node, true);
+       if (!page) {
+               /*
+                * get_ksm_page did remove_node_from_stable_tree itself.
+                */
+               return 0;
+       }
+
+       if (WARN_ON_ONCE(page_mapped(page))) {
+               /*
+                * This should not happen: but if it does, just refuse to let
+                * merge_across_nodes be switched - there is no need to panic.
+                */
+               err = -EBUSY;
+       } else {
+               /*
+                * The stable node did not yet appear stale to get_ksm_page(),
+                * since that allows for an unmapped ksm page to be recognized
+                * right up until it is freed; but the node is safe to remove.
+                * This page might be in a pagevec waiting to be freed,
+                * or it might be PageSwapCache (perhaps under writeback),
+                * or it might have been removed from swapcache a moment ago.
+                */
+               set_page_stable_node(page, NULL);
+               remove_node_from_stable_tree(stable_node);
+               err = 0;
+       }
+
+       unlock_page(page);
+       put_page(page);
+       return err;
+}
+
+static int remove_all_stable_nodes(void)
+{
+       struct stable_node *stable_node;
+       struct list_head *this, *next;
+       int nid;
+       int err = 0;
+
+       for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+               while (root_stable_tree[nid].rb_node) {
+                       stable_node = rb_entry(root_stable_tree[nid].rb_node,
+                                               struct stable_node, node);
+                       if (remove_stable_node(stable_node)) {
+                               err = -EBUSY;
+                               break;  /* proceed to next nid */
+                       }
+                       cond_resched();
+               }
+       }
+       list_for_each_safe(this, next, &migrate_nodes) {
+               stable_node = list_entry(this, struct stable_node, list);
+               if (remove_stable_node(stable_node))
+                       err = -EBUSY;
+               cond_resched();
+       }
+       return err;
+}
+
 static int unmerge_and_remove_all_rmap_items(void)
 {
        struct mm_slot *mm_slot;
@@ -647,7 +799,7 @@ static int unmerge_and_remove_all_rmap_items(void)
                ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
                                                struct mm_slot, mm_list);
                if (ksm_test_exit(mm)) {
-                       hlist_del(&mm_slot->link);
+                       hash_del(&mm_slot->link);
                        list_del(&mm_slot->mm_list);
                        spin_unlock(&ksm_mmlist_lock);
 
@@ -661,6 +813,8 @@ static int unmerge_and_remove_all_rmap_items(void)
                }
        }
 
+       /* Clean up stable nodes, but don't worry if some are still busy */
+       remove_all_stable_nodes();
        ksm_scan.seqnr = 0;
        return 0;
 
@@ -946,6 +1100,9 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
        if (err)
                goto out;
 
+       /* Unstable nid is in union with stable anon_vma: remove first */
+       remove_rmap_item_from_tree(rmap_item);
+
        /* Must get reference to anon_vma while still holding mmap_sem */
        rmap_item->anon_vma = vma->anon_vma;
        get_anon_vma(vma->anon_vma);
@@ -996,42 +1153,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
  */
 static struct page *stable_tree_search(struct page *page)
 {
-       struct rb_node *node = root_stable_tree.rb_node;
+       int nid;
+       struct rb_root *root;
+       struct rb_node **new;
+       struct rb_node *parent;
        struct stable_node *stable_node;
+       struct stable_node *page_node;
 
-       stable_node = page_stable_node(page);
-       if (stable_node) {                      /* ksm page forked */
+       page_node = page_stable_node(page);
+       if (page_node && page_node->head != &migrate_nodes) {
+               /* ksm page forked */
                get_page(page);
                return page;
        }
 
-       while (node) {
+       nid = get_kpfn_nid(page_to_pfn(page));
+       root = root_stable_tree + nid;
+again:
+       new = &root->rb_node;
+       parent = NULL;
+
+       while (*new) {
                struct page *tree_page;
                int ret;
 
                cond_resched();
-               stable_node = rb_entry(node, struct stable_node, node);
-               tree_page = get_ksm_page(stable_node);
+               stable_node = rb_entry(*new, struct stable_node, node);
+               tree_page = get_ksm_page(stable_node, false);
                if (!tree_page)
                        return NULL;
 
                ret = memcmp_pages(page, tree_page);
+               put_page(tree_page);
 
-               if (ret < 0) {
-                       put_page(tree_page);
-                       node = node->rb_left;
-               } else if (ret > 0) {
-                       put_page(tree_page);
-                       node = node->rb_right;
-               } else
-                       return tree_page;
+               parent = *new;
+               if (ret < 0)
+                       new = &parent->rb_left;
+               else if (ret > 0)
+                       new = &parent->rb_right;
+               else {
+                       /*
+                        * Lock and unlock the stable_node's page (which
+                        * might already have been migrated) so that page
+                        * migration is sure to notice its raised count.
+                        * It would be more elegant to return stable_node
+                        * than kpage, but that involves more changes.
+                        */
+                       tree_page = get_ksm_page(stable_node, true);
+                       if (tree_page) {
+                               unlock_page(tree_page);
+                               if (get_kpfn_nid(stable_node->kpfn) !=
+                                               NUMA(stable_node->nid)) {
+                                       put_page(tree_page);
+                                       goto replace;
+                               }
+                               return tree_page;
+                       }
+                       /*
+                        * There is now a place for page_node, but the tree may
+                        * have been rebalanced, so re-evaluate parent and new.
+                        */
+                       if (page_node)
+                               goto again;
+                       return NULL;
+               }
        }
 
-       return NULL;
+       if (!page_node)
+               return NULL;
+
+       list_del(&page_node->list);
+       DO_NUMA(page_node->nid = nid);
+       rb_link_node(&page_node->node, parent, new);
+       rb_insert_color(&page_node->node, root);
+       get_page(page);
+       return page;
+
+replace:
+       if (page_node) {
+               list_del(&page_node->list);
+               DO_NUMA(page_node->nid = nid);
+               rb_replace_node(&stable_node->node, &page_node->node, root);
+               get_page(page);
+       } else {
+               rb_erase(&stable_node->node, root);
+               page = NULL;
+       }
+       stable_node->head = &migrate_nodes;
+       list_add(&stable_node->list, stable_node->head);
+       return page;
 }
 
 /*
- * stable_tree_insert - insert rmap_item pointing to new ksm page
+ * stable_tree_insert - insert stable tree node pointing to new ksm page
  * into the stable tree.
  *
  * This function returns the stable tree node just allocated on success,
@@ -1039,17 +1253,25 @@ static struct page *stable_tree_search(struct page *page)
  */
 static struct stable_node *stable_tree_insert(struct page *kpage)
 {
-       struct rb_node **new = &root_stable_tree.rb_node;
+       int nid;
+       unsigned long kpfn;
+       struct rb_root *root;
+       struct rb_node **new;
        struct rb_node *parent = NULL;
        struct stable_node *stable_node;
 
+       kpfn = page_to_pfn(kpage);
+       nid = get_kpfn_nid(kpfn);
+       root = root_stable_tree + nid;
+       new = &root->rb_node;
+
        while (*new) {
                struct page *tree_page;
                int ret;
 
                cond_resched();
                stable_node = rb_entry(*new, struct stable_node, node);
-               tree_page = get_ksm_page(stable_node);
+               tree_page = get_ksm_page(stable_node, false);
                if (!tree_page)
                        return NULL;
 
@@ -1075,13 +1297,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
        if (!stable_node)
                return NULL;
 
-       rb_link_node(&stable_node->node, parent, new);
-       rb_insert_color(&stable_node->node, &root_stable_tree);
-
        INIT_HLIST_HEAD(&stable_node->hlist);
-
-       stable_node->kpfn = page_to_pfn(kpage);
+       stable_node->kpfn = kpfn;
        set_page_stable_node(kpage, stable_node);
+       DO_NUMA(stable_node->nid = nid);
+       rb_link_node(&stable_node->node, parent, new);
+       rb_insert_color(&stable_node->node, root);
 
        return stable_node;
 }
@@ -1104,10 +1325,15 @@ static
 struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
                                              struct page *page,
                                              struct page **tree_pagep)
-
 {
-       struct rb_node **new = &root_unstable_tree.rb_node;
+       struct rb_node **new;
+       struct rb_root *root;
        struct rb_node *parent = NULL;
+       int nid;
+
+       nid = get_kpfn_nid(page_to_pfn(page));
+       root = root_unstable_tree + nid;
+       new = &root->rb_node;
 
        while (*new) {
                struct rmap_item *tree_rmap_item;
@@ -1137,6 +1363,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
                } else if (ret > 0) {
                        put_page(tree_page);
                        new = &parent->rb_right;
+               } else if (!ksm_merge_across_nodes &&
+                          page_to_nid(tree_page) != nid) {
+                       /*
+                        * If tree_page has been migrated to another NUMA node,
+                        * it will be flushed out and put in the right unstable
+                        * tree next time: only merge with it when across_nodes.
+                        */
+                       put_page(tree_page);
+                       return NULL;
                } else {
                        *tree_pagep = tree_page;
                        return tree_rmap_item;
@@ -1145,8 +1380,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
 
        rmap_item->address |= UNSTABLE_FLAG;
        rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
+       DO_NUMA(rmap_item->nid = nid);
        rb_link_node(&rmap_item->node, parent, new);
-       rb_insert_color(&rmap_item->node, &root_unstable_tree);
+       rb_insert_color(&rmap_item->node, root);
 
        ksm_pages_unshared++;
        return NULL;
@@ -1188,10 +1424,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
        unsigned int checksum;
        int err;
 
-       remove_rmap_item_from_tree(rmap_item);
+       stable_node = page_stable_node(page);
+       if (stable_node) {
+               if (stable_node->head != &migrate_nodes &&
+                   get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
+                       rb_erase(&stable_node->node,
+                                root_stable_tree + NUMA(stable_node->nid));
+                       stable_node->head = &migrate_nodes;
+                       list_add(&stable_node->list, stable_node->head);
+               }
+               if (stable_node->head != &migrate_nodes &&
+                   rmap_item->head == stable_node)
+                       return;
+       }
 
        /* We first start with searching the page inside the stable tree */
        kpage = stable_tree_search(page);
+       if (kpage == page && rmap_item->head == stable_node) {
+               put_page(kpage);
+               return;
+       }
+
+       remove_rmap_item_from_tree(rmap_item);
+
        if (kpage) {
                err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
                if (!err) {
@@ -1225,14 +1480,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
                kpage = try_to_merge_two_pages(rmap_item, page,
                                                tree_rmap_item, tree_page);
                put_page(tree_page);
-               /*
-                * As soon as we merge this page, we want to remove the
-                * rmap_item of the page we have merged with from the unstable
-                * tree, and insert it instead as new node in the stable tree.
-                */
                if (kpage) {
-                       remove_rmap_item_from_tree(tree_rmap_item);
-
+                       /*
+                        * The pages were successfully merged: insert new
+                        * node in the stable tree and add both rmap_items.
+                        */
                        lock_page(kpage);
                        stable_node = stable_tree_insert(kpage);
                        if (stable_node) {
@@ -1289,6 +1541,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
        struct mm_slot *slot;
        struct vm_area_struct *vma;
        struct rmap_item *rmap_item;
+       int nid;
 
        if (list_empty(&ksm_mm_head.mm_list))
                return NULL;
@@ -1307,7 +1560,29 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
                 */
                lru_add_drain_all();
 
-               root_unstable_tree = RB_ROOT;
+               /*
+                * Whereas stale stable_nodes on the stable_tree itself
+                * get pruned in the regular course of stable_tree_search(),
+                * those moved out to the migrate_nodes list can accumulate:
+                * so prune them once before each full scan.
+                */
+               if (!ksm_merge_across_nodes) {
+                       struct stable_node *stable_node;
+                       struct list_head *this, *next;
+                       struct page *page;
+
+                       list_for_each_safe(this, next, &migrate_nodes) {
+                               stable_node = list_entry(this,
+                                               struct stable_node, list);
+                               page = get_ksm_page(stable_node, false);
+                               if (page)
+                                       put_page(page);
+                               cond_resched();
+                       }
+               }
+
+               for (nid = 0; nid < ksm_nr_node_ids; nid++)
+                       root_unstable_tree[nid] = RB_ROOT;
 
                spin_lock(&ksm_mmlist_lock);
                slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
@@ -1392,7 +1667,7 @@ next_mm:
                 * or when all VM_MERGEABLE areas have been unmapped (and
                 * mmap_sem then protects against race with MADV_MERGEABLE).
                 */
-               hlist_del(&slot->link);
+               hash_del(&slot->link);
                list_del(&slot->mm_list);
                spin_unlock(&ksm_mmlist_lock);
 
@@ -1428,8 +1703,7 @@ static void ksm_do_scan(unsigned int scan_npages)
                rmap_item = scan_get_next_rmap_item(&page);
                if (!rmap_item)
                        return;
-               if (!PageKsm(page) || !in_stable_tree(rmap_item))
-                       cmp_and_merge_page(page, rmap_item);
+               cmp_and_merge_page(page, rmap_item);
                put_page(page);
        }
 }
@@ -1446,6 +1720,7 @@ static int ksm_scan_thread(void *nothing)
 
        while (!kthread_should_stop()) {
                mutex_lock(&ksm_thread_mutex);
+               wait_while_offlining();
                if (ksmd_should_run())
                        ksm_do_scan(ksm_thread_pages_to_scan);
                mutex_unlock(&ksm_thread_mutex);
@@ -1525,11 +1800,19 @@ int __ksm_enter(struct mm_struct *mm)
        spin_lock(&ksm_mmlist_lock);
        insert_to_mm_slots_hash(mm, mm_slot);
        /*
-        * Insert just behind the scanning cursor, to let the area settle
+        * When KSM_RUN_MERGE (or KSM_RUN_STOP),
+        * insert just behind the scanning cursor, to let the area settle
         * down a little; when fork is followed by immediate exec, we don't
         * want ksmd to waste time setting up and tearing down an rmap_list.
+        *
+        * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
+        * scanning cursor, otherwise KSM pages in newly forked mms will be
+        * missed: then we might as well insert at the end of the list.
         */
-       list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+       if (ksm_run & KSM_RUN_UNMERGE)
+               list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
+       else
+               list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
        spin_unlock(&ksm_mmlist_lock);
 
        set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -1559,7 +1842,7 @@ void __ksm_exit(struct mm_struct *mm)
        mm_slot = get_mm_slot(mm);
        if (mm_slot && ksm_scan.mm_slot != mm_slot) {
                if (!mm_slot->rmap_list) {
-                       hlist_del(&mm_slot->link);
+                       hash_del(&mm_slot->link);
                        list_del(&mm_slot->mm_list);
                        easy_to_free = 1;
                } else {
@@ -1579,24 +1862,32 @@ void __ksm_exit(struct mm_struct *mm)
        }
 }
 
-struct page *ksm_does_need_to_copy(struct page *page,
+struct page *ksm_might_need_to_copy(struct page *page,
                        struct vm_area_struct *vma, unsigned long address)
 {
+       struct anon_vma *anon_vma = page_anon_vma(page);
        struct page *new_page;
 
+       if (PageKsm(page)) {
+               if (page_stable_node(page) &&
+                   !(ksm_run & KSM_RUN_UNMERGE))
+                       return page;    /* no need to copy it */
+       } else if (!anon_vma) {
+               return page;            /* no need to copy it */
+       } else if (anon_vma->root == vma->anon_vma->root &&
+                page->index == linear_page_index(vma, address)) {
+               return page;            /* still no need to copy it */
+       }
+       if (!PageUptodate(page))
+               return page;            /* let do_swap_page report the error */
+
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
        if (new_page) {
                copy_user_highpage(new_page, page, address, vma);
 
                SetPageDirty(new_page);
                __SetPageUptodate(new_page);
-               SetPageSwapBacked(new_page);
                __set_page_locked(new_page);
-
-               if (!mlocked_vma_newpage(vma, new_page))
-                       lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
-               else
-                       add_page_to_unevictable_list(new_page);
        }
 
        return new_page;
@@ -1773,64 +2064,115 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage)
        if (stable_node) {
                VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
                stable_node->kpfn = page_to_pfn(newpage);
+               /*
+                * newpage->mapping was set in advance; now we need smp_wmb()
+                * to make sure that the new stable_node->kpfn is visible
+                * to get_ksm_page() before it can see that oldpage->mapping
+                * has gone stale (or that PageSwapCache has been cleared).
+                */
+               smp_wmb();
+               set_page_stable_node(oldpage, NULL);
        }
 }
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
-                                                unsigned long end_pfn)
+static int just_wait(void *word)
 {
-       struct rb_node *node;
+       schedule();
+       return 0;
+}
 
-       for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
-               struct stable_node *stable_node;
+static void wait_while_offlining(void)
+{
+       while (ksm_run & KSM_RUN_OFFLINE) {
+               mutex_unlock(&ksm_thread_mutex);
+               wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
+                               just_wait, TASK_UNINTERRUPTIBLE);
+               mutex_lock(&ksm_thread_mutex);
+       }
+}
 
-               stable_node = rb_entry(node, struct stable_node, node);
+static void ksm_check_stable_tree(unsigned long start_pfn,
+                                 unsigned long end_pfn)
+{
+       struct stable_node *stable_node;
+       struct list_head *this, *next;
+       struct rb_node *node;
+       int nid;
+
+       for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+               node = rb_first(root_stable_tree + nid);
+               while (node) {
+                       stable_node = rb_entry(node, struct stable_node, node);
+                       if (stable_node->kpfn >= start_pfn &&
+                           stable_node->kpfn < end_pfn) {
+                               /*
+                                * Don't get_ksm_page, page has already gone:
+                                * which is why we keep kpfn instead of page*
+                                */
+                               remove_node_from_stable_tree(stable_node);
+                               node = rb_first(root_stable_tree + nid);
+                       } else
+                               node = rb_next(node);
+                       cond_resched();
+               }
+       }
+       list_for_each_safe(this, next, &migrate_nodes) {
+               stable_node = list_entry(this, struct stable_node, list);
                if (stable_node->kpfn >= start_pfn &&
                    stable_node->kpfn < end_pfn)
-                       return stable_node;
+                       remove_node_from_stable_tree(stable_node);
+               cond_resched();
        }
-       return NULL;
 }
 
 static int ksm_memory_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
 {
        struct memory_notify *mn = arg;
-       struct stable_node *stable_node;
 
        switch (action) {
        case MEM_GOING_OFFLINE:
                /*
-                * Keep it very simple for now: just lock out ksmd and
-                * MADV_UNMERGEABLE while any memory is going offline.
-                * mutex_lock_nested() is necessary because lockdep was alarmed
-                * that here we take ksm_thread_mutex inside notifier chain
-                * mutex, and later take notifier chain mutex inside
-                * ksm_thread_mutex to unlock it.   But that's safe because both
-                * are inside mem_hotplug_mutex.
+                * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
+                * and remove_all_stable_nodes() while memory is going offline:
+                * it is unsafe for them to touch the stable tree at this time.
+                * But unmerge_ksm_pages(), rmap lookups and other entry points
+                * which do not need the ksm_thread_mutex are all safe.
                 */
-               mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING);
+               mutex_lock(&ksm_thread_mutex);
+               ksm_run |= KSM_RUN_OFFLINE;
+               mutex_unlock(&ksm_thread_mutex);
                break;
 
        case MEM_OFFLINE:
                /*
                 * Most of the work is done by page migration; but there might
                 * be a few stable_nodes left over, still pointing to struct
-                * pages which have been offlined: prune those from the tree.
+                * pages which have been offlined: prune those from the tree,
+                * otherwise get_ksm_page() might later try to access a
+                * non-existent struct page.
                 */
-               while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
-                                       mn->start_pfn + mn->nr_pages)) != NULL)
-                       remove_node_from_stable_tree(stable_node);
+               ksm_check_stable_tree(mn->start_pfn,
+                                     mn->start_pfn + mn->nr_pages);
                /* fallthrough */
 
        case MEM_CANCEL_OFFLINE:
+               mutex_lock(&ksm_thread_mutex);
+               ksm_run &= ~KSM_RUN_OFFLINE;
                mutex_unlock(&ksm_thread_mutex);
+
+               smp_mb();       /* wake_up_bit advises this */
+               wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
                break;
        }
        return NOTIFY_OK;
 }
+#else
+static void wait_while_offlining(void)
+{
+}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 #ifdef CONFIG_SYSFS
@@ -1893,7 +2235,7 @@ KSM_ATTR(pages_to_scan);
 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
                        char *buf)
 {
-       return sprintf(buf, "%u\n", ksm_run);
+       return sprintf(buf, "%lu\n", ksm_run);
 }
 
 static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1916,6 +2258,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
         */
 
        mutex_lock(&ksm_thread_mutex);
+       wait_while_offlining();
        if (ksm_run != flags) {
                ksm_run = flags;
                if (flags & KSM_RUN_UNMERGE) {
@@ -1937,6 +2280,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
 }
 KSM_ATTR(run);
 
+#ifdef CONFIG_NUMA
+static ssize_t merge_across_nodes_show(struct kobject *kobj,
+                               struct kobj_attribute *attr, char *buf)
+{
+       return sprintf(buf, "%u\n", ksm_merge_across_nodes);
+}
+
+static ssize_t merge_across_nodes_store(struct kobject *kobj,
+                                  struct kobj_attribute *attr,
+                                  const char *buf, size_t count)
+{
+       int err;
+       unsigned long knob;
+
+       err = kstrtoul(buf, 10, &knob);
+       if (err)
+               return err;
+       if (knob > 1)
+               return -EINVAL;
+
+       mutex_lock(&ksm_thread_mutex);
+       wait_while_offlining();
+       if (ksm_merge_across_nodes != knob) {
+               if (ksm_pages_shared || remove_all_stable_nodes())
+                       err = -EBUSY;
+               else if (root_stable_tree == one_stable_tree) {
+                       struct rb_root *buf;
+                       /*
+                        * This is the first time that we switch away from the
+                        * default of merging across nodes: must now allocate
+                        * a buffer to hold as many roots as may be needed.
+                        * Allocate stable and unstable together:
+                        * MAXSMP NODES_SHIFT 10 will use 16kB.
+                        */
+                       buf = kcalloc(nr_node_ids + nr_node_ids,
+                               sizeof(*buf), GFP_KERNEL | __GFP_ZERO);
+                       /* Let us assume that RB_ROOT is NULL is zero */
+                       if (!buf)
+                               err = -ENOMEM;
+                       else {
+                               root_stable_tree = buf;
+                               root_unstable_tree = buf + nr_node_ids;
+                               /* Stable tree is empty but not the unstable */
+                               root_unstable_tree[0] = one_unstable_tree[0];
+                       }
+               }
+               if (!err) {
+                       ksm_merge_across_nodes = knob;
+                       ksm_nr_node_ids = knob ? 1 : nr_node_ids;
+               }
+       }
+       mutex_unlock(&ksm_thread_mutex);
+
+       return err ? err : count;
+}
+KSM_ATTR(merge_across_nodes);
+#endif
+
 static ssize_t pages_shared_show(struct kobject *kobj,
                                 struct kobj_attribute *attr, char *buf)
 {
@@ -1991,6 +2392,9 @@ static struct attribute *ksm_attrs[] = {
        &pages_unshared_attr.attr,
        &pages_volatile_attr.attr,
        &full_scans_attr.attr,
+#ifdef CONFIG_NUMA
+       &merge_across_nodes_attr.attr,
+#endif
        NULL,
 };
 
@@ -2029,10 +2433,7 @@ static int __init ksm_init(void)
 #endif /* CONFIG_SYSFS */
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-       /*
-        * Choose a high priority since the callback takes ksm_thread_mutex:
-        * later callbacks could only be taking locks which nest within that.
-        */
+       /* There is no significance to this priority 100 */
        hotplug_memory_notifier(ksm_memory_callback, 100);
 #endif
        return 0;
index 03dfa5c..c58c94b 100644 (file)
@@ -16,6 +16,9 @@
 #include <linux/ksm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -131,6 +134,84 @@ out:
        return error;
 }
 
+#ifdef CONFIG_SWAP
+static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
+       unsigned long end, struct mm_walk *walk)
+{
+       pte_t *orig_pte;
+       struct vm_area_struct *vma = walk->private;
+       unsigned long index;
+
+       if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+               return 0;
+
+       for (index = start; index != end; index += PAGE_SIZE) {
+               pte_t pte;
+               swp_entry_t entry;
+               struct page *page;
+               spinlock_t *ptl;
+
+               orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+               pte = *(orig_pte + ((index - start) / PAGE_SIZE));
+               pte_unmap_unlock(orig_pte, ptl);
+
+               if (pte_present(pte) || pte_none(pte) || pte_file(pte))
+                       continue;
+               entry = pte_to_swp_entry(pte);
+               if (unlikely(non_swap_entry(entry)))
+                       continue;
+
+               page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
+                                                               vma, index);
+               if (page)
+                       page_cache_release(page);
+       }
+
+       return 0;
+}
+
+static void force_swapin_readahead(struct vm_area_struct *vma,
+               unsigned long start, unsigned long end)
+{
+       struct mm_walk walk = {
+               .mm = vma->vm_mm,
+               .pmd_entry = swapin_walk_pmd_entry,
+               .private = vma,
+       };
+
+       walk_page_range(start, end, &walk);
+
+       lru_add_drain();        /* Push any new pages onto the LRU now */
+}
+
+static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+               unsigned long start, unsigned long end,
+               struct address_space *mapping)
+{
+       pgoff_t index;
+       struct page *page;
+       swp_entry_t swap;
+
+       for (; start < end; start += PAGE_SIZE) {
+               index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+               page = find_get_page(mapping, index);
+               if (!radix_tree_exceptional_entry(page)) {
+                       if (page)
+                               page_cache_release(page);
+                       continue;
+               }
+               swap = radix_to_swp_entry(page);
+               page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
+                                                               NULL, 0);
+               if (page)
+                       page_cache_release(page);
+       }
+
+       lru_add_drain();        /* Push any new pages onto the LRU now */
+}
+#endif         /* CONFIG_SWAP */
+
 /*
  * Schedule all required I/O operations.  Do not wait for completion.
  */
@@ -140,6 +221,18 @@ static long madvise_willneed(struct vm_area_struct * vma,
 {
        struct file *file = vma->vm_file;
 
+#ifdef CONFIG_SWAP
+       if (!file || mapping_cap_swap_backed(file->f_mapping)) {
+               *prev = vma;
+               if (!file)
+                       force_swapin_readahead(vma, start, end);
+               else
+                       force_shm_swapin_readahead(vma, start, end,
+                                               file->f_mapping);
+               return 0;
+       }
+#endif
+
        if (!file)
                return -EBADF;
 
@@ -371,6 +464,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
        int error = -EINVAL;
        int write;
        size_t len;
+       struct blk_plug plug;
 
 #ifdef CONFIG_MEMORY_FAILURE
        if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -410,18 +504,19 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
        if (vma && start > vma->vm_start)
                prev = vma;
 
+       blk_start_plug(&plug);
        for (;;) {
                /* Still start < end. */
                error = -ENOMEM;
                if (!vma)
-                       goto out;
+                       goto out_plug;
 
                /* Here start < (end|vma->vm_end). */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                        if (start >= end)
-                               goto out;
+                               goto out_plug;
                }
 
                /* Here vma->vm_start <= start < (end|vma->vm_end) */
@@ -432,18 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                error = madvise_vma(vma, &prev, start, tmp, behavior);
                if (error)
-                       goto out;
+                       goto out_plug;
                start = tmp;
                if (prev && start < prev->vm_end)
                        start = prev->vm_end;
                error = unmapped_error;
                if (start >= end)
-                       goto out;
+                       goto out_plug;
                if (prev)
                        vma = prev->vm_next;
                else    /* madvise_remove dropped mmap_sem */
                        vma = find_vma(current->mm, start);
        }
+out_plug:
+       blk_finish_plug(&plug);
 out:
        if (write)
                up_write(&current->mm->mmap_sem);
index b8d9147..1bcd9b9 100644 (file)
@@ -92,9 +92,58 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
  *
  * Find @size free area aligned to @align in the specified range and node.
  *
+ * If we have CONFIG_HAVE_MEMBLOCK_NODE_MAP defined, we need to check if the
+ * memory we found is not in hotpluggable ranges.
+ *
  * RETURNS:
  * Found address on success, %0 on failure.
  */
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
+                                       phys_addr_t end, phys_addr_t size,
+                                       phys_addr_t align, int nid)
+{
+       phys_addr_t this_start, this_end, cand;
+       u64 i;
+       int curr = movablemem_map.nr_map - 1;
+
+       /* pump up @end */
+       if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
+               end = memblock.current_limit;
+
+       /* avoid allocating the first page */
+       start = max_t(phys_addr_t, start, PAGE_SIZE);
+       end = max(start, end);
+
+       for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+               this_start = clamp(this_start, start, end);
+               this_end = clamp(this_end, start, end);
+
+restart:
+               if (this_end <= this_start || this_end < size)
+                       continue;
+
+               for (; curr >= 0; curr--) {
+                       if ((movablemem_map.map[curr].start_pfn << PAGE_SHIFT)
+                           < this_end)
+                               break;
+               }
+
+               cand = round_down(this_end - size, align);
+               if (curr >= 0 &&
+                   cand < movablemem_map.map[curr].end_pfn << PAGE_SHIFT) {
+                       this_end = movablemem_map.map[curr].start_pfn
+                                  << PAGE_SHIFT;
+                       goto restart;
+               }
+
+               if (cand >= this_start)
+                       return cand;
+       }
+
+       return 0;
+}
+#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
                                        phys_addr_t end, phys_addr_t size,
                                        phys_addr_t align, int nid)
@@ -123,6 +172,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
        }
        return 0;
 }
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 /**
  * memblock_find_in_range - find free area in given range
index fbb60b1..53b8201 100644 (file)
@@ -120,6 +120,14 @@ static const char * const mem_cgroup_events_names[] = {
        "pgmajfault",
 };
 
+static const char * const mem_cgroup_lru_names[] = {
+       "inactive_anon",
+       "active_anon",
+       "inactive_file",
+       "active_file",
+       "unevictable",
+};
+
 /*
  * Per memcg event counter is incremented at every pagein/pageout. With THP,
  * it will be incremated by the number of pages. This counter is used for
@@ -172,7 +180,7 @@ struct mem_cgroup_per_node {
 };
 
 struct mem_cgroup_lru_info {
-       struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
+       struct mem_cgroup_per_node *nodeinfo[0];
 };
 
 /*
@@ -276,17 +284,6 @@ struct mem_cgroup {
         */
        struct res_counter kmem;
        /*
-        * Per cgroup active and inactive list, similar to the
-        * per zone LRU lists.
-        */
-       struct mem_cgroup_lru_info info;
-       int last_scanned_node;
-#if MAX_NUMNODES > 1
-       nodemask_t      scan_nodes;
-       atomic_t        numainfo_events;
-       atomic_t        numainfo_updating;
-#endif
-       /*
         * Should the accounting and control be hierarchical, per subtree?
         */
        bool use_hierarchy;
@@ -349,8 +346,29 @@ struct mem_cgroup {
         /* Index in the kmem_cache->memcg_params->memcg_caches array */
        int kmemcg_id;
 #endif
+
+       int last_scanned_node;
+#if MAX_NUMNODES > 1
+       nodemask_t      scan_nodes;
+       atomic_t        numainfo_events;
+       atomic_t        numainfo_updating;
+#endif
+       /*
+        * Per cgroup active and inactive list, similar to the
+        * per zone LRU lists.
+        *
+        * WARNING: This has to be the last element of the struct. Don't
+        * add new fields after this point.
+        */
+       struct mem_cgroup_lru_info info;
 };
 
+static size_t memcg_size(void)
+{
+       return sizeof(struct mem_cgroup) +
+               nr_node_ids * sizeof(struct mem_cgroup_per_node);
+}
+
 /* internal only representation about the status of kmem accounting. */
 enum {
        KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
@@ -398,8 +416,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
 
 /* Stuffs for move charges at task migration. */
 /*
- * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
- * left-shifted bitmap of these types.
+ * Types of charges to be moved. "move_charge_at_immigrate" and
+ * "immigrate_flags" are treated as a left-shifted bitmap of these types.
  */
 enum move_type {
        MOVE_CHARGE_TYPE_ANON,  /* private anonymous page and swap of it */
@@ -412,6 +430,7 @@ static struct move_charge_struct {
        spinlock_t        lock; /* for from, to */
        struct mem_cgroup *from;
        struct mem_cgroup *to;
+       unsigned long immigrate_flags;
        unsigned long precharge;
        unsigned long moved_charge;
        unsigned long moved_swap;
@@ -424,14 +443,12 @@ static struct move_charge_struct {
 
 static bool move_anon(void)
 {
-       return test_bit(MOVE_CHARGE_TYPE_ANON,
-                                       &mc.to->move_charge_at_immigrate);
+       return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
 }
 
 static bool move_file(void)
 {
-       return test_bit(MOVE_CHARGE_TYPE_FILE,
-                                       &mc.to->move_charge_at_immigrate);
+       return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
 }
 
 /*
@@ -471,6 +488,13 @@ enum res_type {
 #define MEM_CGROUP_RECLAIM_SHRINK_BIT  0x1
 #define MEM_CGROUP_RECLAIM_SHRINK      (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
 
+/*
+ * The memcg_create_mutex will be held whenever a new cgroup is created.
+ * As a consequence, any change that needs to protect against new child cgroups
+ * appearing has to hold it as well.
+ */
+static DEFINE_MUTEX(memcg_create_mutex);
+
 static void mem_cgroup_get(struct mem_cgroup *memcg);
 static void mem_cgroup_put(struct mem_cgroup *memcg);
 
@@ -627,6 +651,7 @@ static void drain_all_stock_async(struct mem_cgroup *memcg);
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 {
+       VM_BUG_ON((unsigned)nid >= nr_node_ids);
        return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
 }
 
@@ -1371,17 +1396,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
        return inactive * inactive_ratio < active;
 }
 
-int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
-{
-       unsigned long active;
-       unsigned long inactive;
-
-       inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
-       active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
-
-       return (active > inactive);
-}
-
 #define mem_cgroup_from_res_counter(counter, member)   \
        container_of(counter, struct mem_cgroup, member)
 
@@ -1524,8 +1538,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
        spin_unlock_irqrestore(&memcg->move_lock, *flags);
 }
 
+#define K(x) ((x) << (PAGE_SHIFT-10))
 /**
- * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
+ * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
  * @memcg: The memory cgroup that went over limit
  * @p: Task that is going to be killed
  *
@@ -1543,8 +1558,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
         */
        static char memcg_name[PATH_MAX];
        int ret;
+       struct mem_cgroup *iter;
+       unsigned int i;
 
-       if (!memcg || !p)
+       if (!p)
                return;
 
        rcu_read_lock();
@@ -1563,7 +1580,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
        }
        rcu_read_unlock();
 
-       printk(KERN_INFO "Task in %s killed", memcg_name);
+       pr_info("Task in %s killed", memcg_name);
 
        rcu_read_lock();
        ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
@@ -1576,22 +1593,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
        /*
         * Continues from above, so we don't need an KERN_ level
         */
-       printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
+       pr_cont(" as a result of limit of %s\n", memcg_name);
 done:
 
-       printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
+       pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
                res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
                res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
                res_counter_read_u64(&memcg->res, RES_FAILCNT));
-       printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
-               "failcnt %llu\n",
+       pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
                res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
                res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
                res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
-       printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
+       pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
                res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
                res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
                res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
+
+       for_each_mem_cgroup_tree(iter, memcg) {
+               pr_info("Memory cgroup stats");
+
+               rcu_read_lock();
+               ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
+               if (!ret)
+                       pr_cont(" for %s", memcg_name);
+               rcu_read_unlock();
+               pr_cont(":");
+
+               for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+                       if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+                               continue;
+                       pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
+                               K(mem_cgroup_read_stat(iter, i)));
+               }
+
+               for (i = 0; i < NR_LRU_LISTS; i++)
+                       pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
+                               K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+
+               pr_cont("\n");
+       }
 }
 
 /*
@@ -2256,6 +2296,17 @@ static void drain_local_stock(struct work_struct *dummy)
        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
+static void __init memcg_stock_init(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct memcg_stock_pcp *stock =
+                                       &per_cpu(memcg_stock, cpu);
+               INIT_WORK(&stock->work, drain_local_stock);
+       }
+}
+
 /*
  * Cache charges(val) which is from res_counter, to local per_cpu area.
  * This will be consumed by consume_stock() function, later.
@@ -4391,8 +4442,8 @@ void mem_cgroup_print_bad_page(struct page *page)
 
        pc = lookup_page_cgroup_used(page);
        if (pc) {
-               printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
-                      pc, pc->flags, pc->mem_cgroup);
+               pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
+                        pc, pc->flags, pc->mem_cgroup);
        }
 }
 #endif
@@ -4719,6 +4770,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
 }
 
 /*
+ * This mainly exists for tests during the setting of use_hierarchy.
+ * Since this is the very setting we are changing, the current hierarchy value
+ * is meaningless
+ */
+static inline bool __memcg_has_children(struct mem_cgroup *memcg)
+{
+       struct cgroup *pos;
+
+       /* bounce at first found */
+       cgroup_for_each_child(pos, memcg->css.cgroup)
+               return true;
+       return false;
+}
+
+/*
+ * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
+ * to be already dead (as in mem_cgroup_force_empty, for instance).  This is
+ * from mem_cgroup_count_children(), in the sense that we don't really care how
+ * many children we have; we only need to know if we have any.  It also counts
+ * any memcg without hierarchy as infertile.
+ */
+static inline bool memcg_has_children(struct mem_cgroup *memcg)
+{
+       return memcg->use_hierarchy && __memcg_has_children(memcg);
+}
+
+/*
  * Reclaims as many pages from the given memcg as possible and moves
  * the rest to the parent.
  *
@@ -4788,7 +4866,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
        if (parent)
                parent_memcg = mem_cgroup_from_cont(parent);
 
-       cgroup_lock();
+       mutex_lock(&memcg_create_mutex);
 
        if (memcg->use_hierarchy == val)
                goto out;
@@ -4803,7 +4881,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
         */
        if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
                                (val == 1 || val == 0)) {
-               if (list_empty(&cont->children))
+               if (!__memcg_has_children(memcg))
                        memcg->use_hierarchy = val;
                else
                        retval = -EBUSY;
@@ -4811,7 +4889,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
                retval = -EINVAL;
 
 out:
-       cgroup_unlock();
+       mutex_unlock(&memcg_create_mutex);
 
        return retval;
 }
@@ -4896,8 +4974,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 {
        int ret = -EINVAL;
 #ifdef CONFIG_MEMCG_KMEM
-       bool must_inc_static_branch = false;
-
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
        /*
         * For simplicity, we won't allow this to be disabled.  It also can't
@@ -4910,18 +4986,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
         *
         * After it first became limited, changes in the value of the limit are
         * of course permitted.
-        *
-        * Taking the cgroup_lock is really offensive, but it is so far the only
-        * way to guarantee that no children will appear. There are plenty of
-        * other offenders, and they should all go away. Fine grained locking
-        * is probably the way to go here. When we are fully hierarchical, we
-        * can also get rid of the use_hierarchy check.
         */
-       cgroup_lock();
+       mutex_lock(&memcg_create_mutex);
        mutex_lock(&set_limit_mutex);
        if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
-               if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
-                                               !list_empty(&cont->children))) {
+               if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
                        ret = -EBUSY;
                        goto out;
                }
@@ -4933,7 +5002,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
                        res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
                        goto out;
                }
-               must_inc_static_branch = true;
+               static_key_slow_inc(&memcg_kmem_enabled_key);
+               /*
+                * setting the active bit after the inc will guarantee no one
+                * starts accounting before all call sites are patched
+                */
+               memcg_kmem_set_active(memcg);
+
                /*
                 * kmem charges can outlive the cgroup. In the case of slab
                 * pages, for instance, a page contain objects from various
@@ -4945,32 +5020,12 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
                ret = res_counter_set_limit(&memcg->kmem, val);
 out:
        mutex_unlock(&set_limit_mutex);
-       cgroup_unlock();
-
-       /*
-        * We are by now familiar with the fact that we can't inc the static
-        * branch inside cgroup_lock. See disarm functions for details. A
-        * worker here is overkill, but also wrong: After the limit is set, we
-        * must start accounting right away. Since this operation can't fail,
-        * we can safely defer it to here - no rollback will be needed.
-        *
-        * The boolean used to control this is also safe, because
-        * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
-        * able to set it to true;
-        */
-       if (must_inc_static_branch) {
-               static_key_slow_inc(&memcg_kmem_enabled_key);
-               /*
-                * setting the active bit after the inc will guarantee no one
-                * starts accounting before all call sites are patched
-                */
-               memcg_kmem_set_active(memcg);
-       }
-
+       mutex_unlock(&memcg_create_mutex);
 #endif
        return ret;
 }
 
+#ifdef CONFIG_MEMCG_KMEM
 static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 {
        int ret = 0;
@@ -4979,7 +5034,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
                goto out;
 
        memcg->kmem_account_flags = parent->kmem_account_flags;
-#ifdef CONFIG_MEMCG_KMEM
        /*
         * When that happen, we need to disable the static branch only on those
         * memcgs that enabled it. To achieve this, we would be forced to
@@ -5005,10 +5059,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
        mutex_lock(&set_limit_mutex);
        ret = memcg_update_cache_sizes(memcg);
        mutex_unlock(&set_limit_mutex);
-#endif
 out:
        return ret;
 }
+#endif /* CONFIG_MEMCG_KMEM */
 
 /*
  * The user of this function is...
@@ -5148,15 +5202,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 
        if (val >= (1 << NR_MOVE_TYPE))
                return -EINVAL;
+
        /*
-        * We check this value several times in both in can_attach() and
-        * attach(), so we need cgroup lock to prevent this value from being
-        * inconsistent.
+        * No kind of locking is needed in here, because ->can_attach() will
+        * check this value once in the beginning of the process, and then carry
+        * on with stale data. This means that changes to this value will only
+        * affect task migrations starting after the change.
         */
-       cgroup_lock();
        memcg->move_charge_at_immigrate = val;
-       cgroup_unlock();
-
        return 0;
 }
 #else
@@ -5214,14 +5267,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
 }
 #endif /* CONFIG_NUMA */
 
-static const char * const mem_cgroup_lru_names[] = {
-       "inactive_anon",
-       "active_anon",
-       "inactive_file",
-       "active_file",
-       "unevictable",
-};
-
 static inline void mem_cgroup_lru_names_not_uptodate(void)
 {
        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -5335,18 +5380,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
 
        parent = mem_cgroup_from_cont(cgrp->parent);
 
-       cgroup_lock();
+       mutex_lock(&memcg_create_mutex);
 
        /* If under hierarchy, only empty-root can set this value */
-       if ((parent->use_hierarchy) ||
-           (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
-               cgroup_unlock();
+       if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
+               mutex_unlock(&memcg_create_mutex);
                return -EINVAL;
        }
 
        memcg->swappiness = val;
 
-       cgroup_unlock();
+       mutex_unlock(&memcg_create_mutex);
 
        return 0;
 }
@@ -5672,17 +5716,16 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 
        parent = mem_cgroup_from_cont(cgrp->parent);
 
-       cgroup_lock();
+       mutex_lock(&memcg_create_mutex);
        /* oom-kill-disable is a flag for subhierarchy. */
-       if ((parent->use_hierarchy) ||
-           (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
-               cgroup_unlock();
+       if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
+               mutex_unlock(&memcg_create_mutex);
                return -EINVAL;
        }
        memcg->oom_kill_disable = val;
        if (!val)
                memcg_oom_recover(memcg);
-       cgroup_unlock();
+       mutex_unlock(&memcg_create_mutex);
        return 0;
 }
 
@@ -5797,33 +5840,6 @@ static struct cftype mem_cgroup_files[] = {
                .read_seq_string = memcg_numa_stat_show,
        },
 #endif
-#ifdef CONFIG_MEMCG_SWAP
-       {
-               .name = "memsw.usage_in_bytes",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
-               .read = mem_cgroup_read,
-               .register_event = mem_cgroup_usage_register_event,
-               .unregister_event = mem_cgroup_usage_unregister_event,
-       },
-       {
-               .name = "memsw.max_usage_in_bytes",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
-               .trigger = mem_cgroup_reset,
-               .read = mem_cgroup_read,
-       },
-       {
-               .name = "memsw.limit_in_bytes",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
-               .write_string = mem_cgroup_write,
-               .read = mem_cgroup_read,
-       },
-       {
-               .name = "memsw.failcnt",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
-               .trigger = mem_cgroup_reset,
-               .read = mem_cgroup_read,
-       },
-#endif
 #ifdef CONFIG_MEMCG_KMEM
        {
                .name = "kmem.limit_in_bytes",
@@ -5858,6 +5874,36 @@ static struct cftype mem_cgroup_files[] = {
        { },    /* terminate */
 };
 
+#ifdef CONFIG_MEMCG_SWAP
+static struct cftype memsw_cgroup_files[] = {
+       {
+               .name = "memsw.usage_in_bytes",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+               .read = mem_cgroup_read,
+               .register_event = mem_cgroup_usage_register_event,
+               .unregister_event = mem_cgroup_usage_unregister_event,
+       },
+       {
+               .name = "memsw.max_usage_in_bytes",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+               .trigger = mem_cgroup_reset,
+               .read = mem_cgroup_read,
+       },
+       {
+               .name = "memsw.limit_in_bytes",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+               .write_string = mem_cgroup_write,
+               .read = mem_cgroup_read,
+       },
+       {
+               .name = "memsw.failcnt",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+               .trigger = mem_cgroup_reset,
+               .read = mem_cgroup_read,
+       },
+       { },    /* terminate */
+};
+#endif
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
        struct mem_cgroup_per_node *pn;
@@ -5896,9 +5942,9 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
        struct mem_cgroup *memcg;
-       int size = sizeof(struct mem_cgroup);
+       size_t size = memcg_size();
 
-       /* Can be very big if MAX_NUMNODES is very big */
+       /* Can be very big if nr_node_ids is very big */
        if (size < PAGE_SIZE)
                memcg = kzalloc(size, GFP_KERNEL);
        else
@@ -5935,7 +5981,7 @@ out_free:
 static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
        int node;
-       int size = sizeof(struct mem_cgroup);
+       size_t size = memcg_size();
 
        mem_cgroup_remove_from_trees(memcg);
        free_css_id(&mem_cgroup_subsys, &memcg->css);
@@ -6017,19 +6063,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
-#ifdef CONFIG_MEMCG_SWAP
-static void __init enable_swap_cgroup(void)
-{
-       if (!mem_cgroup_disabled() && really_do_swap_account)
-               do_swap_account = 1;
-}
-#else
-static void __init enable_swap_cgroup(void)
-{
-}
-#endif
-
-static int mem_cgroup_soft_limit_tree_init(void)
+static void __init mem_cgroup_soft_limit_tree_init(void)
 {
        struct mem_cgroup_tree_per_node *rtpn;
        struct mem_cgroup_tree_per_zone *rtpz;
@@ -6040,8 +6074,7 @@ static int mem_cgroup_soft_limit_tree_init(void)
                if (!node_state(node, N_NORMAL_MEMORY))
                        tmp = -1;
                rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
-               if (!rtpn)
-                       goto err_cleanup;
+               BUG_ON(!rtpn);
 
                soft_limit_tree.rb_tree_per_node[node] = rtpn;
 
@@ -6051,23 +6084,12 @@ static int mem_cgroup_soft_limit_tree_init(void)
                        spin_lock_init(&rtpz->lock);
                }
        }
-       return 0;
-
-err_cleanup:
-       for_each_node(node) {
-               if (!soft_limit_tree.rb_tree_per_node[node])
-                       break;
-               kfree(soft_limit_tree.rb_tree_per_node[node]);
-               soft_limit_tree.rb_tree_per_node[node] = NULL;
-       }
-       return 1;
-
 }
 
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup *cont)
 {
-       struct mem_cgroup *memcg, *parent;
+       struct mem_cgroup *memcg;
        long error = -ENOMEM;
        int node;
 
@@ -6081,24 +6103,44 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 
        /* root ? */
        if (cont->parent == NULL) {
-               int cpu;
-               enable_swap_cgroup();
-               parent = NULL;
-               if (mem_cgroup_soft_limit_tree_init())
-                       goto free_out;
                root_mem_cgroup = memcg;
-               for_each_possible_cpu(cpu) {
-                       struct memcg_stock_pcp *stock =
-                                               &per_cpu(memcg_stock, cpu);
-                       INIT_WORK(&stock->work, drain_local_stock);
-               }
-       } else {
-               parent = mem_cgroup_from_cont(cont->parent);
-               memcg->use_hierarchy = parent->use_hierarchy;
-               memcg->oom_kill_disable = parent->oom_kill_disable;
+               res_counter_init(&memcg->res, NULL);
+               res_counter_init(&memcg->memsw, NULL);
+               res_counter_init(&memcg->kmem, NULL);
        }
 
-       if (parent && parent->use_hierarchy) {
+       memcg->last_scanned_node = MAX_NUMNODES;
+       INIT_LIST_HEAD(&memcg->oom_notify);
+       atomic_set(&memcg->refcnt, 1);
+       memcg->move_charge_at_immigrate = 0;
+       mutex_init(&memcg->thresholds_lock);
+       spin_lock_init(&memcg->move_lock);
+
+       return &memcg->css;
+
+free_out:
+       __mem_cgroup_free(memcg);
+       return ERR_PTR(error);
+}
+
+static int
+mem_cgroup_css_online(struct cgroup *cont)
+{
+       struct mem_cgroup *memcg, *parent;
+       int error = 0;
+
+       if (!cont->parent)
+               return 0;
+
+       mutex_lock(&memcg_create_mutex);
+       memcg = mem_cgroup_from_cont(cont);
+       parent = mem_cgroup_from_cont(cont->parent);
+
+       memcg->use_hierarchy = parent->use_hierarchy;
+       memcg->oom_kill_disable = parent->oom_kill_disable;
+       memcg->swappiness = mem_cgroup_swappiness(parent);
+
+       if (parent->use_hierarchy) {
                res_counter_init(&memcg->res, &parent->res);
                res_counter_init(&memcg->memsw, &parent->memsw);
                res_counter_init(&memcg->kmem, &parent->kmem);
@@ -6119,20 +6161,12 @@ mem_cgroup_css_alloc(struct cgroup *cont)
                 * much sense so let cgroup subsystem know about this
                 * unfortunate state in our controller.
                 */
-               if (parent && parent != root_mem_cgroup)
+               if (parent != root_mem_cgroup)
                        mem_cgroup_subsys.broken_hierarchy = true;
        }
-       memcg->last_scanned_node = MAX_NUMNODES;
-       INIT_LIST_HEAD(&memcg->oom_notify);
-
-       if (parent)
-               memcg->swappiness = mem_cgroup_swappiness(parent);
-       atomic_set(&memcg->refcnt, 1);
-       memcg->move_charge_at_immigrate = 0;
-       mutex_init(&memcg->thresholds_lock);
-       spin_lock_init(&memcg->move_lock);
 
        error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
+       mutex_unlock(&memcg_create_mutex);
        if (error) {
                /*
                 * We call put now because our (and parent's) refcnts
@@ -6140,12 +6174,10 @@ mem_cgroup_css_alloc(struct cgroup *cont)
                 * call __mem_cgroup_free, so return directly
                 */
                mem_cgroup_put(memcg);
-               return ERR_PTR(error);
+               if (parent->use_hierarchy)
+                       mem_cgroup_put(parent);
        }
-       return &memcg->css;
-free_out:
-       __mem_cgroup_free(memcg);
-       return ERR_PTR(error);
+       return error;
 }
 
 static void mem_cgroup_css_offline(struct cgroup *cont)
@@ -6281,7 +6313,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
         * Because lookup_swap_cache() updates some statistics counter,
         * we call find_get_page() with swapper_space directly.
         */
-       page = find_get_page(&swapper_space, ent.val);
+       page = find_get_page(swap_address_space(ent), ent.val);
        if (do_swap_account)
                entry->val = ent.val;
 
@@ -6322,7 +6354,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
                swp_entry_t swap = radix_to_swp_entry(page);
                if (do_swap_account)
                        *entry = swap;
-               page = find_get_page(&swapper_space, swap.val);
+               page = find_get_page(swap_address_space(swap), swap.val);
        }
 #endif
        return page;
@@ -6532,8 +6564,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
        struct task_struct *p = cgroup_taskset_first(tset);
        int ret = 0;
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
+       unsigned long move_charge_at_immigrate;
 
-       if (memcg->move_charge_at_immigrate) {
+       /*
+        * We are now committed to this value whatever it is. Changes in this
+        * tunable will only affect upcoming migrations, not the current one.
+        * So we need to save it, and keep it going.
+        */
+       move_charge_at_immigrate  = memcg->move_charge_at_immigrate;
+       if (move_charge_at_immigrate) {
                struct mm_struct *mm;
                struct mem_cgroup *from = mem_cgroup_from_task(p);
 
@@ -6553,6 +6592,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
                        spin_lock(&mc.lock);
                        mc.from = from;
                        mc.to = memcg;
+                       mc.immigrate_flags = move_charge_at_immigrate;
                        spin_unlock(&mc.lock);
                        /* We set mc.moving_task later */
 
@@ -6747,6 +6787,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
        .name = "memory",
        .subsys_id = mem_cgroup_subsys_id,
        .css_alloc = mem_cgroup_css_alloc,
+       .css_online = mem_cgroup_css_online,
        .css_offline = mem_cgroup_css_offline,
        .css_free = mem_cgroup_css_free,
        .can_attach = mem_cgroup_can_attach,
@@ -6757,19 +6798,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
        .use_id = 1,
 };
 
-/*
- * The rest of init is performed during ->css_alloc() for root css which
- * happens before initcalls.  hotcpu_notifier() can't be done together as
- * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
- * dependency.  Do it from a subsys_initcall().
- */
-static int __init mem_cgroup_init(void)
-{
-       hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
-       return 0;
-}
-subsys_initcall(mem_cgroup_init);
-
 #ifdef CONFIG_MEMCG_SWAP
 static int __init enable_swap_account(char *s)
 {
@@ -6782,4 +6810,39 @@ static int __init enable_swap_account(char *s)
 }
 __setup("swapaccount=", enable_swap_account);
 
+static void __init memsw_file_init(void)
+{
+       WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
+}
+
+static void __init enable_swap_cgroup(void)
+{
+       if (!mem_cgroup_disabled() && really_do_swap_account) {
+               do_swap_account = 1;
+               memsw_file_init();
+       }
+}
+
+#else
+static void __init enable_swap_cgroup(void)
+{
+}
 #endif
+
+/*
+ * subsys_initcall() for memory controller.
+ *
+ * Some parts like hotcpu_notifier() have to be initialized from this context
+ * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
+ * everything that doesn't depend on a specific mem_cgroup structure should
+ * be initialized from here.
+ */
+static int __init mem_cgroup_init(void)
+{
+       hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
+       enable_swap_cgroup();
+       mem_cgroup_soft_limit_tree_init();
+       memcg_stock_init();
+       return 0;
+}
+subsys_initcall(mem_cgroup_init);
index c6e4dd3..df0694c 100644 (file)
@@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0;
 
 int sysctl_memory_failure_recovery __read_mostly = 1;
 
-atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
+atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
 
 #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
 
@@ -784,12 +784,12 @@ static struct page_state {
        { sc|dirty,     sc|dirty,       "dirty swapcache",      me_swapcache_dirty },
        { sc|dirty,     sc,             "clean swapcache",      me_swapcache_clean },
 
-       { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
-       { unevict,      unevict,        "clean unevictable LRU", me_pagecache_clean },
-
        { mlock|dirty,  mlock|dirty,    "dirty mlocked LRU",    me_pagecache_dirty },
        { mlock,        mlock,          "clean mlocked LRU",    me_pagecache_clean },
 
+       { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
+       { unevict,      unevict,        "clean unevictable LRU", me_pagecache_clean },
+
        { lru|dirty,    lru|dirty,      "dirty LRU",    me_pagecache_dirty },
        { lru|dirty,    lru,            "clean LRU",    me_pagecache_clean },
 
@@ -1021,6 +1021,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        struct page *hpage;
        int res;
        unsigned int nr_pages;
+       unsigned long page_flags;
 
        if (!sysctl_memory_failure_recovery)
                panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -1039,8 +1040,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
                return 0;
        }
 
-       nr_pages = 1 << compound_trans_order(hpage);
-       atomic_long_add(nr_pages, &mce_bad_pages);
+       /*
+        * Currently errors on hugetlbfs pages are measured in hugepage units,
+        * so nr_pages should be 1 << compound_order.  OTOH when errors are on
+        * transparent hugepages, they are supposed to be split and error
+        * measurement is done in normal page units.  So nr_pages should be one
+        * in this case.
+        */
+       if (PageHuge(p))
+               nr_pages = 1 << compound_order(hpage);
+       else /* normal page or thp */
+               nr_pages = 1;
+       atomic_long_add(nr_pages, &num_poisoned_pages);
 
        /*
         * We need/can do nothing about count=0 pages.
@@ -1070,7 +1081,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
                        if (!PageHWPoison(hpage)
                            || (hwpoison_filter(p) && TestClearPageHWPoison(p))
                            || (p != hpage && TestSetPageHWPoison(hpage))) {
-                               atomic_long_sub(nr_pages, &mce_bad_pages);
+                               atomic_long_sub(nr_pages, &num_poisoned_pages);
                                return 0;
                        }
                        set_page_hwpoison_huge_page(hpage);
@@ -1119,6 +1130,15 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        lock_page(hpage);
 
        /*
+        * We use page flags to determine what action should be taken, but
+        * the flags can be modified by the error containment action.  One
+        * example is an mlocked page, where PG_mlocked is cleared by
+        * page_remove_rmap() in try_to_unmap_one(). So to determine page status
+        * correctly, we save a copy of the page flags at this time.
+        */
+       page_flags = p->flags;
+
+       /*
         * unpoison always clear PG_hwpoison inside page lock
         */
        if (!PageHWPoison(p)) {
@@ -1128,7 +1148,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        }
        if (hwpoison_filter(p)) {
                if (TestClearPageHWPoison(p))
-                       atomic_long_sub(nr_pages, &mce_bad_pages);
+                       atomic_long_sub(nr_pages, &num_poisoned_pages);
                unlock_page(hpage);
                put_page(hpage);
                return 0;
@@ -1176,12 +1196,19 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        }
 
        res = -EBUSY;
-       for (ps = error_states;; ps++) {
-               if ((p->flags & ps->mask) == ps->res) {
-                       res = page_action(ps, p, pfn);
+       /*
+        * The first check uses the current page flags which may not have any
+        * relevant information. The second check with the saved page flags is
+        * carried out only if the first check can't determine the page status.
+        */
+       for (ps = error_states;; ps++)
+               if ((p->flags & ps->mask) == ps->res)
                        break;
-               }
-       }
+       if (!ps->mask)
+               for (ps = error_states;; ps++)
+                       if ((page_flags & ps->mask) == ps->res)
+                               break;
+       res = page_action(ps, p, pfn);
 out:
        unlock_page(hpage);
        return res;
@@ -1323,7 +1350,7 @@ int unpoison_memory(unsigned long pfn)
                        return 0;
                }
                if (TestClearPageHWPoison(p))
-                       atomic_long_sub(nr_pages, &mce_bad_pages);
+                       atomic_long_sub(nr_pages, &num_poisoned_pages);
                pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
                return 0;
        }
@@ -1337,7 +1364,7 @@ int unpoison_memory(unsigned long pfn)
         */
        if (TestClearPageHWPoison(page)) {
                pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
-               atomic_long_sub(nr_pages, &mce_bad_pages);
+               atomic_long_sub(nr_pages, &num_poisoned_pages);
                freeit = 1;
                if (PageHuge(page))
                        clear_page_hwpoison_huge_page(page);
@@ -1368,7 +1395,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
  * that is not free, and 1 for any other page type.
  * For 1 the page is returned with increased page count, otherwise not.
  */
-static int get_any_page(struct page *p, unsigned long pfn, int flags)
+static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 {
        int ret;
 
@@ -1393,11 +1420,9 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
        if (!get_page_unless_zero(compound_head(p))) {
                if (PageHuge(p)) {
                        pr_info("%s: %#lx free huge page\n", __func__, pfn);
-                       ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+                       ret = 0;
                } else if (is_free_buddy_page(p)) {
                        pr_info("%s: %#lx free buddy page\n", __func__, pfn);
-                       /* Set hwpoison bit while page is still isolated */
-                       SetPageHWPoison(p);
                        ret = 0;
                } else {
                        pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
@@ -1413,43 +1438,68 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
        return ret;
 }
 
+static int get_any_page(struct page *page, unsigned long pfn, int flags)
+{
+       int ret = __get_any_page(page, pfn, flags);
+
+       if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
+               /*
+                * Try to free it.
+                */
+               put_page(page);
+               shake_page(page, 1);
+
+               /*
+                * Did it turn free?
+                */
+               ret = __get_any_page(page, pfn, 0);
+               if (!PageLRU(page)) {
+                       pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
+                               pfn, page->flags);
+                       return -EIO;
+               }
+       }
+       return ret;
+}
+
 static int soft_offline_huge_page(struct page *page, int flags)
 {
        int ret;
        unsigned long pfn = page_to_pfn(page);
        struct page *hpage = compound_head(page);
 
-       ret = get_any_page(page, pfn, flags);
-       if (ret < 0)
-               return ret;
-       if (ret == 0)
-               goto done;
-
+       /*
+        * This double-check of PageHWPoison is to avoid the race with
+        * memory_failure(). See also comment in __soft_offline_page().
+        */
+       lock_page(hpage);
        if (PageHWPoison(hpage)) {
+               unlock_page(hpage);
                put_page(hpage);
                pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
                return -EBUSY;
        }
+       unlock_page(hpage);
 
        /* Keep page count to indicate a given hugepage is isolated. */
-       ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
+       ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL,
                                MIGRATE_SYNC);
        put_page(hpage);
        if (ret) {
                pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
                        pfn, ret, page->flags);
-               return ret;
-       }
-done:
-       if (!PageHWPoison(hpage))
+       } else {
+               set_page_hwpoison_huge_page(hpage);
+               dequeue_hwpoisoned_huge_page(hpage);
                atomic_long_add(1 << compound_trans_order(hpage),
-                               &mce_bad_pages);
-       set_page_hwpoison_huge_page(hpage);
-       dequeue_hwpoisoned_huge_page(hpage);
+                               &num_poisoned_pages);
+       }
        /* keep elevated page count for bad page */
        return ret;
 }
 
+static int __soft_offline_page(struct page *page, int flags);
+
 /**
  * soft_offline_page - Soft offline a page.
  * @page: page to offline
@@ -1478,9 +1528,11 @@ int soft_offline_page(struct page *page, int flags)
        unsigned long pfn = page_to_pfn(page);
        struct page *hpage = compound_trans_head(page);
 
-       if (PageHuge(page))
-               return soft_offline_huge_page(page, flags);
-       if (PageTransHuge(hpage)) {
+       if (PageHWPoison(page)) {
+               pr_info("soft offline: %#lx page already poisoned\n", pfn);
+               return -EBUSY;
+       }
+       if (!PageHuge(page) && PageTransHuge(hpage)) {
                if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
                        pr_info("soft offline: %#lx: failed to split THP\n",
                                pfn);
@@ -1491,47 +1543,45 @@ int soft_offline_page(struct page *page, int flags)
        ret = get_any_page(page, pfn, flags);
        if (ret < 0)
                return ret;
-       if (ret == 0)
-               goto done;
-
-       /*
-        * Page cache page we can handle?
-        */
-       if (!PageLRU(page)) {
-               /*
-                * Try to free it.
-                */
-               put_page(page);
-               shake_page(page, 1);
-
-               /*
-                * Did it turn free?
-                */
-               ret = get_any_page(page, pfn, 0);
-               if (ret < 0)
-                       return ret;
-               if (ret == 0)
-                       goto done;
-       }
-       if (!PageLRU(page)) {
-               pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
-                       pfn, page->flags);
-               return -EIO;
+       if (ret) { /* for in-use pages */
+               if (PageHuge(page))
+                       ret = soft_offline_huge_page(page, flags);
+               else
+                       ret = __soft_offline_page(page, flags);
+       } else { /* for free pages */
+               if (PageHuge(page)) {
+                       set_page_hwpoison_huge_page(hpage);
+                       dequeue_hwpoisoned_huge_page(hpage);
+                       atomic_long_add(1 << compound_trans_order(hpage),
+                                       &num_poisoned_pages);
+               } else {
+                       SetPageHWPoison(page);
+                       atomic_long_inc(&num_poisoned_pages);
+               }
        }
+       /* keep elevated page count for bad page */
+       return ret;
+}
 
-       lock_page(page);
-       wait_on_page_writeback(page);
+static int __soft_offline_page(struct page *page, int flags)
+{
+       int ret;
+       unsigned long pfn = page_to_pfn(page);
 
        /*
-        * Synchronized using the page lock with memory_failure()
+        * Check PageHWPoison again inside page lock because PageHWPoison
+        * is set by memory_failure() outside page lock. Note that
+        * memory_failure() also double-checks PageHWPoison inside page lock,
+        * so there's no race between soft_offline_page() and memory_failure().
         */
+       lock_page(page);
+       wait_on_page_writeback(page);
        if (PageHWPoison(page)) {
                unlock_page(page);
                put_page(page);
                pr_info("soft offline: %#lx page already poisoned\n", pfn);
                return -EBUSY;
        }
-
        /*
         * Try to invalidate first. This should work for
         * non dirty unmapped page cache pages.
@@ -1544,9 +1594,10 @@ int soft_offline_page(struct page *page, int flags)
         */
        if (ret == 1) {
                put_page(page);
-               ret = 0;
                pr_info("soft_offline: %#lx: invalidated\n", pfn);
-               goto done;
+               SetPageHWPoison(page);
+               atomic_long_inc(&num_poisoned_pages);
+               return 0;
        }
 
        /*
@@ -1563,28 +1614,23 @@ int soft_offline_page(struct page *page, int flags)
        if (!ret) {
                LIST_HEAD(pagelist);
                inc_zone_page_state(page, NR_ISOLATED_ANON +
-                                           page_is_file_cache(page));
+                                       page_is_file_cache(page));
                list_add(&page->lru, &pagelist);
                ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
-                                                       false, MIGRATE_SYNC,
-                                                       MR_MEMORY_FAILURE);
+                                       MIGRATE_SYNC, MR_MEMORY_FAILURE);
                if (ret) {
                        putback_lru_pages(&pagelist);
                        pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
                                pfn, ret, page->flags);
                        if (ret > 0)
                                ret = -EIO;
+               } else {
+                       SetPageHWPoison(page);
+                       atomic_long_inc(&num_poisoned_pages);
                }
        } else {
                pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
                        pfn, ret, page_count(page), page->flags);
        }
-       if (ret)
-               return ret;
-
-done:
-       atomic_long_add(1, &mce_bad_pages);
-       SetPageHWPoison(page);
-       /* keep elevated page count for bad page */
        return ret;
 }
index bb1369f..705473a 100644 (file)
 
 #include "internal.h"
 
+#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
+#endif
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -1458,10 +1462,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
 /**
- * follow_page - look up a page descriptor from a user-virtual address
+ * follow_page_mask - look up a page descriptor from a user-virtual address
  * @vma: vm_area_struct mapping @address
  * @address: virtual address to look up
  * @flags: flags modifying lookup behaviour
+ * @page_mask: on output, *page_mask is set according to the size of the page
  *
  * @flags can have FOLL_ flags set, defined in <linux/mm.h>
  *
@@ -1469,8 +1474,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes);
  * an error pointer if there is a mapping to something not represented
  * by a page descriptor (see also vm_normal_page()).
  */
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
-                       unsigned int flags)
+struct page *follow_page_mask(struct vm_area_struct *vma,
+                             unsigned long address, unsigned int flags,
+                             unsigned int *page_mask)
 {
        pgd_t *pgd;
        pud_t *pud;
@@ -1480,6 +1486,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
        struct page *page;
        struct mm_struct *mm = vma->vm_mm;
 
+       *page_mask = 0;
+
        page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
        if (!IS_ERR(page)) {
                BUG_ON(flags & FOLL_GET);
@@ -1526,6 +1534,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                                page = follow_trans_huge_pmd(vma, address,
                                                             pmd, flags);
                                spin_unlock(&mm->page_table_lock);
+                               *page_mask = HPAGE_PMD_NR - 1;
                                goto out;
                        }
                } else
@@ -1539,8 +1548,24 @@ split_fallthrough:
        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 
        pte = *ptep;
-       if (!pte_present(pte))
-               goto no_page;
+       if (!pte_present(pte)) {
+               swp_entry_t entry;
+               /*
+                * KSM's break_ksm() relies upon recognizing a ksm page
+                * even while it is being migrated, so for that case we
+                * need migration_entry_wait().
+                */
+               if (likely(!(flags & FOLL_MIGRATION)))
+                       goto no_page;
+               if (pte_none(pte) || pte_file(pte))
+                       goto no_page;
+               entry = pte_to_swp_entry(pte);
+               if (!is_migration_entry(entry))
+                       goto no_page;
+               pte_unmap_unlock(ptep, ptl);
+               migration_entry_wait(mm, pmd, address);
+               goto split_fallthrough;
+       }
        if ((flags & FOLL_NUMA) && pte_numa(pte))
                goto no_page;
        if ((flags & FOLL_WRITE) && !pte_write(pte))
@@ -1673,15 +1698,16 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
  * instead of __get_user_pages. __get_user_pages should be used only if
  * you need some special @gup_flags.
  */
-int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-                    unsigned long start, int nr_pages, unsigned int gup_flags,
-                    struct page **pages, struct vm_area_struct **vmas,
-                    int *nonblocking)
+long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+               unsigned long start, unsigned long nr_pages,
+               unsigned int gup_flags, struct page **pages,
+               struct vm_area_struct **vmas, int *nonblocking)
 {
-       int i;
+       long i;
        unsigned long vm_flags;
+       unsigned int page_mask;
 
-       if (nr_pages <= 0)
+       if (!nr_pages)
                return 0;
 
        VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
@@ -1757,6 +1783,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                get_page(page);
                        }
                        pte_unmap(pte);
+                       page_mask = 0;
                        goto next_page;
                }
 
@@ -1774,6 +1801,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                do {
                        struct page *page;
                        unsigned int foll_flags = gup_flags;
+                       unsigned int page_increm;
 
                        /*
                         * If we have a pending SIGKILL, don't keep faulting
@@ -1783,7 +1811,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                return i ? i : -ERESTARTSYS;
 
                        cond_resched();
-                       while (!(page = follow_page(vma, start, foll_flags))) {
+                       while (!(page = follow_page_mask(vma, start,
+                                               foll_flags, &page_mask))) {
                                int ret;
                                unsigned int fault_flags = 0;
 
@@ -1857,13 +1886,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
                                flush_anon_page(vma, page, start);
                                flush_dcache_page(page);
+                               page_mask = 0;
                        }
 next_page:
-                       if (vmas)
+                       if (vmas) {
                                vmas[i] = vma;
-                       i++;
-                       start += PAGE_SIZE;
-                       nr_pages--;
+                               page_mask = 0;
+                       }
+                       page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+                       if (page_increm > nr_pages)
+                               page_increm = nr_pages;
+                       i += page_increm;
+                       start += page_increm * PAGE_SIZE;
+                       nr_pages -= page_increm;
                } while (nr_pages && start < vma->vm_end);
        } while (nr_pages);
        return i;
@@ -1977,9 +2012,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
  *
  * See also get_user_pages_fast, for performance critical applications.
  */
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-               unsigned long start, int nr_pages, int write, int force,
-               struct page **pages, struct vm_area_struct **vmas)
+long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+               unsigned long start, unsigned long nr_pages, int write,
+               int force, struct page **pages, struct vm_area_struct **vmas)
 {
        int flags = FOLL_TOUCH;
 
@@ -2919,7 +2954,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned int flags, pte_t orig_pte)
 {
        spinlock_t *ptl;
-       struct page *page, *swapcache = NULL;
+       struct page *page, *swapcache;
        swp_entry_t entry;
        pte_t pte;
        int locked;
@@ -2970,9 +3005,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 */
                ret = VM_FAULT_HWPOISON;
                delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+               swapcache = page;
                goto out_release;
        }
 
+       swapcache = page;
        locked = lock_page_or_retry(page, mm, flags);
 
        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2990,16 +3027,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
                goto out_page;
 
-       if (ksm_might_need_to_copy(page, vma, address)) {
-               swapcache = page;
-               page = ksm_does_need_to_copy(page, vma, address);
-
-               if (unlikely(!page)) {
-                       ret = VM_FAULT_OOM;
-                       page = swapcache;
-                       swapcache = NULL;
-                       goto out_page;
-               }
+       page = ksm_might_need_to_copy(page, vma, address);
+       if (unlikely(!page)) {
+               ret = VM_FAULT_OOM;
+               page = swapcache;
+               goto out_page;
        }
 
        if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -3044,7 +3076,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        }
        flush_icache_page(vma, page);
        set_pte_at(mm, address, page_table, pte);
-       do_page_add_anon_rmap(page, vma, address, exclusive);
+       if (page == swapcache)
+               do_page_add_anon_rmap(page, vma, address, exclusive);
+       else /* ksm created a completely new copy */
+               page_add_new_anon_rmap(page, vma, address);
        /* It's better to call commit-charge after rmap is established */
        mem_cgroup_commit_charge_swapin(page, ptr);
 
@@ -3052,7 +3087,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
                try_to_free_swap(page);
        unlock_page(page);
-       if (swapcache) {
+       if (page != swapcache) {
                /*
                 * Hold the lock to avoid the swap entry to be reused
                 * until we take the PT lock for the pte_same() check
@@ -3085,7 +3120,7 @@ out_page:
        unlock_page(page);
 out_release:
        page_cache_release(page);
-       if (swapcache) {
+       if (page != swapcache) {
                unlock_page(swapcache);
                page_cache_release(swapcache);
        }
@@ -3821,30 +3856,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-int make_pages_present(unsigned long addr, unsigned long end)
-{
-       int ret, len, write;
-       struct vm_area_struct * vma;
-
-       vma = find_vma(current->mm, addr);
-       if (!vma)
-               return -ENOMEM;
-       /*
-        * We want to touch writable mappings with a write fault in order
-        * to break COW, except for shared mappings because these don't COW
-        * and we would not want to dirty them for nothing.
-        */
-       write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
-       BUG_ON(addr >= end);
-       BUG_ON(end > vma->vm_end);
-       len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
-       ret = get_user_pages(current, current->mm, addr,
-                       len, write, 0, NULL, NULL);
-       if (ret < 0)
-               return ret;
-       return ret == len ? 0 : -EFAULT;
-}
-
 #if !defined(__HAVE_ARCH_GATE_AREA)
 
 #if defined(AT_SYSINFO_EHDR)
index d04ed87..b81a367 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/suspend.h>
 #include <linux/mm_inline.h>
 #include <linux/firmware-map.h>
+#include <linux/stop_machine.h>
 
 #include <asm/tlbflush.h>
 
@@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void get_page_bootmem(unsigned long info,  struct page *page,
-                            unsigned long type)
+void get_page_bootmem(unsigned long info,  struct page *page,
+                     unsigned long type)
 {
        page->lru.next = (struct list_head *) type;
        SetPagePrivate(page);
@@ -124,10 +124,13 @@ void __ref put_page_bootmem(struct page *page)
                mutex_lock(&ppb_lock);
                __free_pages_bootmem(page, 0);
                mutex_unlock(&ppb_lock);
+               totalram_pages++;
        }
 
 }
 
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
        unsigned long *usemap, mapsize, section_nr, i;
@@ -161,6 +164,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
                get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
 
 }
+#else /* CONFIG_SPARSEMEM_VMEMMAP */
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+       unsigned long *usemap, mapsize, section_nr, i;
+       struct mem_section *ms;
+       struct page *page, *memmap;
+
+       if (!pfn_valid(start_pfn))
+               return;
+
+       section_nr = pfn_to_section_nr(start_pfn);
+       ms = __nr_to_section(section_nr);
+
+       memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+       register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+
+       usemap = __nr_to_section(section_nr)->pageblock_flags;
+       page = virt_to_page(usemap);
+
+       mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+
+       for (i = 0; i < mapsize; i++, page++)
+               get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
 void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
@@ -189,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
        }
 
        pfn = pgdat->node_start_pfn;
-       end_pfn = pfn + pgdat->node_spanned_pages;
+       end_pfn = pgdat_end_pfn(pgdat);
 
        /* register_section info */
        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
@@ -203,7 +232,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
                        register_page_bootmem_info_section(pfn);
        }
 }
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
 
 static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
                           unsigned long end_pfn)
@@ -253,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
                set_page_links(pfn_to_page(pfn), zid, nid, pfn);
 }
 
+/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
+ * alloc_bootmem_node_nopanic() */
+static int __ref ensure_zone_is_initialized(struct zone *zone,
+                       unsigned long start_pfn, unsigned long num_pages)
+{
+       if (!zone_is_initialized(zone))
+               return init_currently_empty_zone(zone, start_pfn, num_pages,
+                                                MEMMAP_HOTPLUG);
+       return 0;
+}
+
 static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
                unsigned long start_pfn, unsigned long end_pfn)
 {
@@ -260,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
        unsigned long flags;
        unsigned long z1_start_pfn;
 
-       if (!z1->wait_table) {
-               ret = init_currently_empty_zone(z1, start_pfn,
-                       end_pfn - start_pfn, MEMMAP_HOTPLUG);
-               if (ret)
-                       return ret;
-       }
+       ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
+       if (ret)
+               return ret;
 
        pgdat_resize_lock(z1->zone_pgdat, &flags);
 
        /* can't move pfns which are higher than @z2 */
-       if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
+       if (end_pfn > zone_end_pfn(z2))
                goto out_fail;
        /* the move out part mast at the left most of @z2 */
        if (start_pfn > z2->zone_start_pfn)
@@ -286,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
                z1_start_pfn = start_pfn;
 
        resize_zone(z1, z1_start_pfn, end_pfn);
-       resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
+       resize_zone(z2, end_pfn, zone_end_pfn(z2));
 
        pgdat_resize_unlock(z1->zone_pgdat, &flags);
 
@@ -305,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
        unsigned long flags;
        unsigned long z2_end_pfn;
 
-       if (!z2->wait_table) {
-               ret = init_currently_empty_zone(z2, start_pfn,
-                       end_pfn - start_pfn, MEMMAP_HOTPLUG);
-               if (ret)
-                       return ret;
-       }
+       ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
+       if (ret)
+               return ret;
 
        pgdat_resize_lock(z1->zone_pgdat, &flags);
 
@@ -318,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
        if (z1->zone_start_pfn > start_pfn)
                goto out_fail;
        /* the move out part mast at the right most of @z1 */
-       if (z1->zone_start_pfn + z1->spanned_pages >  end_pfn)
+       if (zone_end_pfn(z1) >  end_pfn)
                goto out_fail;
        /* must included/overlap */
-       if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
+       if (start_pfn >= zone_end_pfn(z1))
                goto out_fail;
 
        /* use end_pfn for z2's end_pfn if z2 is empty */
        if (z2->spanned_pages)
-               z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
+               z2_end_pfn = zone_end_pfn(z2);
        else
                z2_end_pfn = end_pfn;
 
@@ -363,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
        int nid = pgdat->node_id;
        int zone_type;
        unsigned long flags;
+       int ret;
 
        zone_type = zone - pgdat->node_zones;
-       if (!zone->wait_table) {
-               int ret;
+       ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
+       if (ret)
+               return ret;
 
-               ret = init_currently_empty_zone(zone, phys_start_pfn,
-                                               nr_pages, MEMMAP_HOTPLUG);
-               if (ret)
-                       return ret;
-       }
        pgdat_resize_lock(zone->zone_pgdat, &flags);
        grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
        grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
@@ -405,20 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone,
        return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static int __remove_section(struct zone *zone, struct mem_section *ms)
+/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
+static int find_smallest_section_pfn(int nid, struct zone *zone,
+                                    unsigned long start_pfn,
+                                    unsigned long end_pfn)
+{
+       struct mem_section *ms;
+
+       for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
+               ms = __pfn_to_section(start_pfn);
+
+               if (unlikely(!valid_section(ms)))
+                       continue;
+
+               if (unlikely(pfn_to_nid(start_pfn) != nid))
+                       continue;
+
+               if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+                       continue;
+
+               return start_pfn;
+       }
+
+       return 0;
+}
+
+/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
+static int find_biggest_section_pfn(int nid, struct zone *zone,
+                                   unsigned long start_pfn,
+                                   unsigned long end_pfn)
+{
+       struct mem_section *ms;
+       unsigned long pfn;
+
+       /* pfn is the end pfn of a memory section. */
+       pfn = end_pfn - 1;
+       for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
+               ms = __pfn_to_section(pfn);
+
+               if (unlikely(!valid_section(ms)))
+                       continue;
+
+               if (unlikely(pfn_to_nid(pfn) != nid))
+                       continue;
+
+               if (zone && zone != page_zone(pfn_to_page(pfn)))
+                       continue;
+
+               return pfn;
+       }
+
+       return 0;
+}
+
+static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
+                            unsigned long end_pfn)
 {
+       unsigned long zone_start_pfn =  zone->zone_start_pfn;
+       unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+       unsigned long pfn;
+       struct mem_section *ms;
+       int nid = zone_to_nid(zone);
+
+       zone_span_writelock(zone);
+       if (zone_start_pfn == start_pfn) {
+               /*
+                * If the section is the smallest section in the zone, we need
+                * to shrink zone->zone_start_pfn and zone->spanned_pages.
+                * In this case, find the second smallest valid mem_section
+                * for shrinking the zone.
+                */
+               pfn = find_smallest_section_pfn(nid, zone, end_pfn,
+                                               zone_end_pfn);
+               if (pfn) {
+                       zone->zone_start_pfn = pfn;
+                       zone->spanned_pages = zone_end_pfn - pfn;
+               }
+       } else if (zone_end_pfn == end_pfn) {
+               /*
+                * If the section is the biggest section in the zone, we need
+                * to shrink zone->spanned_pages.
+                * In this case, find the second biggest valid mem_section
+                * for shrinking the zone.
+                */
+               pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+                                              start_pfn);
+               if (pfn)
+                       zone->spanned_pages = pfn - zone_start_pfn + 1;
+       }
+
        /*
-        * XXX: Freeing memmap with vmemmap is not implement yet.
-        *      This should be removed later.
+        * If the section is neither the biggest nor the smallest mem_section
+        * in the zone, it only creates a hole in the zone, so we need not
+        * change the zone itself. But the zone may now contain only holes,
+        * so check whether any valid section remains.
         */
-       return -EBUSY;
+       pfn = zone_start_pfn;
+       for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
+               ms = __pfn_to_section(pfn);
+
+               if (unlikely(!valid_section(ms)))
+                       continue;
+
+               if (page_zone(pfn_to_page(pfn)) != zone)
+                       continue;
+
+                /* If the section is the current section, continue the loop */
+               if (start_pfn == pfn)
+                       continue;
+
+               /* If we find valid section, we have nothing to do */
+               zone_span_writeunlock(zone);
+               return;
+       }
+
+       /* The zone has no valid section */
+       zone->zone_start_pfn = 0;
+       zone->spanned_pages = 0;
+       zone_span_writeunlock(zone);
 }
-#else
-static int __remove_section(struct zone *zone, struct mem_section *ms)
+
+static void shrink_pgdat_span(struct pglist_data *pgdat,
+                             unsigned long start_pfn, unsigned long end_pfn)
+{
+       unsigned long pgdat_start_pfn =  pgdat->node_start_pfn;
+       unsigned long pgdat_end_pfn =
+               pgdat->node_start_pfn + pgdat->node_spanned_pages;
+       unsigned long pfn;
+       struct mem_section *ms;
+       int nid = pgdat->node_id;
+
+       if (pgdat_start_pfn == start_pfn) {
+               /*
+                * If the section is the smallest section in the pgdat, we need
+                * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
+                * In this case, find the second smallest valid mem_section
+                * for shrinking the pgdat.
+                */
+               pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
+                                               pgdat_end_pfn);
+               if (pfn) {
+                       pgdat->node_start_pfn = pfn;
+                       pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
+               }
+       } else if (pgdat_end_pfn == end_pfn) {
+               /*
+                * If the section is the biggest section in the pgdat, we need
+                * to shrink pgdat->node_spanned_pages.
+                * In this case, find the second biggest valid mem_section
+                * for shrinking the pgdat.
+                */
+               pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
+                                              start_pfn);
+               if (pfn)
+                       pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
+       }
+
+       /*
+        * If the section is neither the biggest nor the smallest mem_section
+        * in the pgdat, it only creates a hole in the pgdat, so we need not
+        * change the pgdat itself.
+        * But the pgdat may now contain only holes, so check whether any
+        * valid section remains.
+        */
+       pfn = pgdat_start_pfn;
+       for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
+               ms = __pfn_to_section(pfn);
+
+               if (unlikely(!valid_section(ms)))
+                       continue;
+
+               if (pfn_to_nid(pfn) != nid)
+                       continue;
+
+                /* If the section is the current section, continue the loop */
+               if (start_pfn == pfn)
+                       continue;
+
+               /* If we find valid section, we have nothing to do */
+               return;
+       }
+
+       /* The pgdat has no valid section */
+       pgdat->node_start_pfn = 0;
+       pgdat->node_spanned_pages = 0;
+}
+
+static void __remove_zone(struct zone *zone, unsigned long start_pfn)
 {
-       unsigned long flags;
        struct pglist_data *pgdat = zone->zone_pgdat;
+       int nr_pages = PAGES_PER_SECTION;
+       int zone_type;
+       unsigned long flags;
+
+       zone_type = zone - pgdat->node_zones;
+
+       pgdat_resize_lock(zone->zone_pgdat, &flags);
+       shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
+       shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+       pgdat_resize_unlock(zone->zone_pgdat, &flags);
+}
+
+static int __remove_section(struct zone *zone, struct mem_section *ms)
+{
+       unsigned long start_pfn;
+       int scn_nr;
        int ret = -EINVAL;
 
        if (!valid_section(ms))
@@ -428,12 +650,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
        if (ret)
                return ret;
 
-       pgdat_resize_lock(pgdat, &flags);
+       scn_nr = __section_nr(ms);
+       start_pfn = section_nr_to_pfn(scn_nr);
+       __remove_zone(zone, start_pfn);
+
        sparse_remove_one_section(zone, ms);
-       pgdat_resize_unlock(pgdat, &flags);
        return 0;
 }
-#endif
 
 /*
  * Reasonably generic function for adding memory.  It is
@@ -797,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
        unsigned long zholes_size[MAX_NR_ZONES] = {0};
        unsigned long start_pfn = start >> PAGE_SHIFT;
 
-       pgdat = arch_alloc_nodedata(nid);
-       if (!pgdat)
-               return NULL;
+       pgdat = NODE_DATA(nid);
+       if (!pgdat) {
+               pgdat = arch_alloc_nodedata(nid);
+               if (!pgdat)
+                       return NULL;
 
-       arch_refresh_nodedata(nid, pgdat);
+               arch_refresh_nodedata(nid, pgdat);
+       }
 
        /* we can use NODE_DATA(nid) from here */
 
@@ -854,7 +1080,8 @@ out:
 int __ref add_memory(int nid, u64 start, u64 size)
 {
        pg_data_t *pgdat = NULL;
-       int new_pgdat = 0;
+       bool new_pgdat;
+       bool new_node;
        struct resource *res;
        int ret;
 
@@ -865,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size)
        if (!res)
                goto out;
 
-       if (!node_online(nid)) {
+       {       /* Stupid hack to suppress address-never-null warning */
+               void *p = NODE_DATA(nid);
+               new_pgdat = !p;
+       }
+       new_node = !node_online(nid);
+       if (new_node) {
                pgdat = hotadd_new_pgdat(nid, start);
                ret = -ENOMEM;
                if (!pgdat)
                        goto error;
-               new_pgdat = 1;
        }
 
        /* call arch's memory hotadd */
@@ -882,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
        /* we online node here. we can't roll back from here. */
        node_set_online(nid);
 
-       if (new_pgdat) {
+       if (new_node) {
                ret = register_one_node(nid);
                /*
                 * If sysfs file of new node can't create, cpu on the node
@@ -901,8 +1132,7 @@ error:
        /* rollback pgdat allocation and others */
        if (new_pgdat)
                rollback_node_hotadd(nid, pgdat);
-       if (res)
-               release_memory_resource(res);
+       release_memory_resource(res);
 
 out:
        unlock_memory_hotplug();
@@ -1058,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                 * migrate_pages returns # of failed pages.
                 */
                ret = migrate_pages(&source, alloc_migrate_target, 0,
-                                                       true, MIGRATE_SYNC,
-                                                       MR_MEMORY_HOTPLUG);
+                                       MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
                if (ret)
                        putback_lru_pages(&source);
        }
@@ -1381,17 +1610,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
        return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
 }
 
-int remove_memory(u64 start, u64 size)
+/**
+ * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
+ * @start_pfn: start pfn of the memory range
+ * @end_pfn: end pfn of the memory range
+ * @arg: argument passed to func
+ * @func: callback for each memory section walked
+ *
+ * This function walks through all present mem sections in range
+ * [start_pfn, end_pfn) and call func on each mem section.
+ *
+ * Returns the return value of func.
+ */
+static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+               void *arg, int (*func)(struct memory_block *, void *))
 {
        struct memory_block *mem = NULL;
        struct mem_section *section;
-       unsigned long start_pfn, end_pfn;
        unsigned long pfn, section_nr;
        int ret;
 
-       start_pfn = PFN_DOWN(start);
-       end_pfn = start_pfn + PFN_DOWN(size);
-
        for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
                section_nr = pfn_to_section_nr(pfn);
                if (!present_section_nr(section_nr))
@@ -1408,7 +1646,7 @@ int remove_memory(u64 start, u64 size)
                if (!mem)
                        continue;
 
-               ret = offline_memory_block(mem);
+               ret = func(mem, arg);
                if (ret) {
                        kobject_put(&mem->dev.kobj);
                        return ret;
@@ -1420,12 +1658,209 @@ int remove_memory(u64 start, u64 size)
 
        return 0;
 }
+
+/**
+ * offline_memory_block_cb - callback function for offlining memory block
+ * @mem: the memory block to be offlined
+ * @arg: buffer to hold error msg
+ *
+ * Always return 0, and put the error msg in arg if any.
+ */
+static int offline_memory_block_cb(struct memory_block *mem, void *arg)
+{
+       int *ret = arg;
+       int error = offline_memory_block(mem);
+
+       if (error != 0 && *ret == 0)
+               *ret = error;
+
+       return 0;
+}
+
+static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
+{
+       int ret = !is_memblock_offlined(mem);
+
+       if (unlikely(ret))
+               pr_warn("removing memory fails, because memory "
+                       "[%#010llx-%#010llx] is onlined\n",
+                       PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
+                       PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1);
+
+       return ret;
+}
+
+static int check_cpu_on_node(void *data)
+{
+       struct pglist_data *pgdat = data;
+       int cpu;
+
+       for_each_present_cpu(cpu) {
+               if (cpu_to_node(cpu) == pgdat->node_id)
+                       /*
+                        * the cpu on this node isn't removed, and we can't
+                        * offline this node.
+                        */
+                       return -EBUSY;
+       }
+
+       return 0;
+}
+
+static void unmap_cpu_on_node(void *data)
+{
+#ifdef CONFIG_ACPI_NUMA
+       struct pglist_data *pgdat = data;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               if (cpu_to_node(cpu) == pgdat->node_id)
+                       numa_clear_node(cpu);
+#endif
+}
+
+static int check_and_unmap_cpu_on_node(void *data)
+{
+       int ret = check_cpu_on_node(data);
+
+       if (ret)
+               return ret;
+
+       /*
+        * the node will be offlined when we come here, so we can clear
+        * the cpu_to_node() now.
+        */
+
+       unmap_cpu_on_node(data);
+       return 0;
+}
+
+/* offline the node if all memory sections of this node are removed */
+void try_offline_node(int nid)
+{
+       pg_data_t *pgdat = NODE_DATA(nid);
+       unsigned long start_pfn = pgdat->node_start_pfn;
+       unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
+       unsigned long pfn;
+       struct page *pgdat_page = virt_to_page(pgdat);
+       int i;
+
+       for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+               unsigned long section_nr = pfn_to_section_nr(pfn);
+
+               if (!present_section_nr(section_nr))
+                       continue;
+
+               if (pfn_to_nid(pfn) != nid)
+                       continue;
+
+               /*
+                * some memory sections of this node are not removed, and we
+                * can't offline node now.
+                */
+               return;
+       }
+
+       if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
+               return;
+
+       /*
+        * all memory/cpu of this node are removed, we can offline this
+        * node now.
+        */
+       node_set_offline(nid);
+       unregister_one_node(nid);
+
+       if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
+               /* node data is allocated from boot memory */
+               return;
+
+       /* free waittable in each zone */
+       for (i = 0; i < MAX_NR_ZONES; i++) {
+               struct zone *zone = pgdat->node_zones + i;
+
+               if (zone->wait_table)
+                       vfree(zone->wait_table);
+       }
+
+       /*
+        * Since there is no way to guarantee the address of pgdat/zone is not
+        * on the stack of any kernel threads or used by other kernel objects
+        * without reference counting or another synchronizing method, do not
+        * reset node_data and free pgdat here. Just reset it to 0 and reuse
+        * the memory when the node is online again.
+        */
+       memset(pgdat, 0, sizeof(*pgdat));
+}
+EXPORT_SYMBOL(try_offline_node);
+
+int __ref remove_memory(int nid, u64 start, u64 size)
+{
+       unsigned long start_pfn, end_pfn;
+       int ret = 0;
+       int retry = 1;
+
+       start_pfn = PFN_DOWN(start);
+       end_pfn = start_pfn + PFN_DOWN(size);
+
+       /*
+        * When CONFIG_MEMCG is on, one memory block may be used by other
+        * blocks to store page cgroup when onlining pages. But we don't know
+        * in what order pages are onlined. So we iterate twice to offline
+        * memory:
+        * 1st iterate: offline every non primary memory block.
+        * 2nd iterate: offline primary (i.e. first added) memory block.
+        */
+repeat:
+       walk_memory_range(start_pfn, end_pfn, &ret,
+                         offline_memory_block_cb);
+       if (ret) {
+               if (!retry)
+                       return ret;
+
+               retry = 0;
+               ret = 0;
+               goto repeat;
+       }
+
+       lock_memory_hotplug();
+
+       /*
+        * we have offlined all memory blocks like this:
+        *   1. lock memory hotplug
+        *   2. offline a memory block
+        *   3. unlock memory hotplug
+        *
+        * repeat step1-3 to offline the memory block. All memory blocks
+        * must be offlined before removing memory. But we don't hold the
+        * lock in the whole operation. So we should check whether all
+        * memory blocks are offlined.
+        */
+
+       ret = walk_memory_range(start_pfn, end_pfn, NULL,
+                               is_memblock_offlined_cb);
+       if (ret) {
+               unlock_memory_hotplug();
+               return ret;
+       }
+
+       /* remove memmap entry */
+       firmware_map_remove(start, start + size, "System RAM");
+
+       arch_remove_memory(start, size);
+
+       try_offline_node(nid);
+
+       unlock_memory_hotplug();
+
+       return 0;
+}
 #else
 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
        return -EINVAL;
 }
-int remove_memory(u64 start, u64 size)
+int remove_memory(int nid, u64 start, u64 size)
 {
        return -EINVAL;
 }
index e2df1c1..31d2663 100644 (file)
@@ -26,7 +26,7 @@
  *                the allocation to memory nodes instead
  *
  * preferred       Try a specific node first before normal fallback.
- *                As a special case node -1 here means do the allocation
+ *                As a special case NUMA_NO_NODE here means do the allocation
  *                on the local CPU. This is normally identical to default,
  *                but useful to set in a VMA when you have a non default
  *                process policy.
@@ -127,7 +127,7 @@ static struct mempolicy *get_task_policy(struct task_struct *p)
 
        if (!pol) {
                node = numa_node_id();
-               if (node != -1)
+               if (node != NUMA_NO_NODE)
                        pol = &preferred_node_policy[node];
 
                /* preferred_node_policy is not initialised early in boot */
@@ -161,19 +161,7 @@ static const struct mempolicy_operations {
 /* Check that the nodemask contains at least one populated zone */
 static int is_valid_nodemask(const nodemask_t *nodemask)
 {
-       int nd, k;
-
-       for_each_node_mask(nd, *nodemask) {
-               struct zone *z;
-
-               for (k = 0; k <= policy_zone; k++) {
-                       z = &NODE_DATA(nd)->node_zones[k];
-                       if (z->present_pages > 0)
-                               return 1;
-               }
-       }
-
-       return 0;
+       return nodes_intersects(*nodemask, node_states[N_MEMORY]);
 }
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
@@ -270,7 +258,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
        struct mempolicy *policy;
 
        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
-                mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
+                mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 
        if (mode == MPOL_DEFAULT) {
                if (nodes && !nodes_empty(*nodes))
@@ -508,9 +496,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                /*
                 * vm_normal_page() filters out zero pages, but there might
                 * still be PageReserved pages to skip, perhaps in a VDSO.
-                * And we cannot move PageKsm pages sensibly or safely yet.
                 */
-               if (PageReserved(page) || PageKsm(page))
+               if (PageReserved(page))
                        continue;
                nid = page_to_nid(page);
                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -1027,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 
        if (!list_empty(&pagelist)) {
                err = migrate_pages(&pagelist, new_node_page, dest,
-                                                       false, MIGRATE_SYNC,
-                                                       MR_SYSCALL);
+                                       MIGRATE_SYNC, MR_SYSCALL);
                if (err)
                        putback_lru_pages(&pagelist);
        }
@@ -1235,7 +1221,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 
        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
                 start, start + len, mode, mode_flags,
-                nmask ? nodes_addr(*nmask)[0] : -1);
+                nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
 
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 
@@ -1272,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len,
                if (!list_empty(&pagelist)) {
                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
                        nr_failed = migrate_pages(&pagelist, new_vma_page,
-                                               (unsigned long)vma,
-                                               false, MIGRATE_SYNC,
-                                               MR_MEMPOLICY_MBIND);
+                                       (unsigned long)vma,
+                                       MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
                        if (nr_failed)
                                putback_lru_pages(&pagelist);
                }
@@ -1644,6 +1629,26 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
        return pol;
 }
 
+static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
+{
+       enum zone_type dynamic_policy_zone = policy_zone;
+
+       BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
+
+       /*
+        * if policy->v.nodes has movable memory only,
+        * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
+        *
+        * policy->v.nodes is intersected with node_states[N_MEMORY],
+        * so if the following test fails, it implies
+        * policy->v.nodes has movable memory only.
+        */
+       if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
+               dynamic_policy_zone = ZONE_MOVABLE;
+
+       return zone >= dynamic_policy_zone;
+}
+
 /*
  * Return a nodemask representing a mempolicy for filtering nodes for
  * page allocation
@@ -1652,7 +1657,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
        /* Lower zones don't get a nodemask applied for MPOL_BIND */
        if (unlikely(policy->mode == MPOL_BIND) &&
-                       gfp_zone(gfp) >= policy_zone &&
+                       apply_policy_zone(policy, gfp_zone(gfp)) &&
                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
                return &policy->v.nodes;
 
@@ -2308,7 +2313,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
                 * it less likely we act on an unlikely task<->page
                 * relation.
                 */
-               last_nid = page_xchg_last_nid(page, polnid);
+               last_nid = page_nid_xchg_last(page, polnid);
                if (last_nid != polnid)
                        goto out;
        }
@@ -2483,7 +2488,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
                 vma->vm_pgoff,
                 sz, npol ? npol->mode : -1,
                 npol ? npol->flags : -1,
-                npol ? nodes_addr(npol->v.nodes)[0] : -1);
+                npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
 
        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
index 2fd8b4a..3bbaf5d 100644 (file)
@@ -464,7 +464,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
 
        mlock_migrate_page(newpage, page);
        ksm_migrate_page(newpage, page);
-
+       /*
+        * Please do not reorder this without considering how mm/ksm.c's
+        * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
+        */
        ClearPageSwapCache(page);
        ClearPagePrivate(page);
        set_page_private(page, 0);
@@ -698,7 +701,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 }
 
 static int __unmap_and_move(struct page *page, struct page *newpage,
-                       int force, bool offlining, enum migrate_mode mode)
+                               int force, enum migrate_mode mode)
 {
        int rc = -EAGAIN;
        int remap_swapcache = 1;
@@ -728,20 +731,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                lock_page(page);
        }
 
-       /*
-        * Only memory hotplug's offline_pages() caller has locked out KSM,
-        * and can safely migrate a KSM page.  The other cases have skipped
-        * PageKsm along with PageReserved - but it is only now when we have
-        * the page lock that we can be certain it will not go KSM beneath us
-        * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
-        * its pagecount raised, but only here do we take the page lock which
-        * serializes that).
-        */
-       if (PageKsm(page) && !offlining) {
-               rc = -EBUSY;
-               goto unlock;
-       }
-
        /* charge against new page */
        mem_cgroup_prepare_migration(page, newpage, &mem);
 
@@ -768,7 +757,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
         * File Caches may use write_page() or lock_page() in migration, then,
         * just care Anon page here.
         */
-       if (PageAnon(page)) {
+       if (PageAnon(page) && !PageKsm(page)) {
                /*
                 * Only page_lock_anon_vma_read() understands the subtleties of
                 * getting a hold on an anon_vma from outside one of its mms.
@@ -848,7 +837,6 @@ uncharge:
        mem_cgroup_end_migration(mem, page, newpage,
                                 (rc == MIGRATEPAGE_SUCCESS ||
                                  rc == MIGRATEPAGE_BALLOON_SUCCESS));
-unlock:
        unlock_page(page);
 out:
        return rc;
@@ -859,8 +847,7 @@ out:
  * to the newly allocated page in newpage.
  */
 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-                       struct page *page, int force, bool offlining,
-                       enum migrate_mode mode)
+                       struct page *page, int force, enum migrate_mode mode)
 {
        int rc = 0;
        int *result = NULL;
@@ -878,7 +865,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                if (unlikely(split_huge_page(page)))
                        goto out;
 
-       rc = __unmap_and_move(page, newpage, force, offlining, mode);
+       rc = __unmap_and_move(page, newpage, force, mode);
 
        if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
                /*
@@ -938,8 +925,7 @@ out:
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
                                unsigned long private, struct page *hpage,
-                               int force, bool offlining,
-                               enum migrate_mode mode)
+                               int force, enum migrate_mode mode)
 {
        int rc = 0;
        int *result = NULL;
@@ -1001,9 +987,8 @@ out:
  *
  * Return: Number of pages not migrated or error code.
  */
-int migrate_pages(struct list_head *from,
-               new_page_t get_new_page, unsigned long private, bool offlining,
-               enum migrate_mode mode, int reason)
+int migrate_pages(struct list_head *from, new_page_t get_new_page,
+               unsigned long private, enum migrate_mode mode, int reason)
 {
        int retry = 1;
        int nr_failed = 0;
@@ -1024,8 +1009,7 @@ int migrate_pages(struct list_head *from,
                        cond_resched();
 
                        rc = unmap_and_move(get_new_page, private,
-                                               page, pass > 2, offlining,
-                                               mode);
+                                               page, pass > 2, mode);
 
                        switch(rc) {
                        case -ENOMEM:
@@ -1058,15 +1042,13 @@ out:
 }
 
 int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
-                     unsigned long private, bool offlining,
-                     enum migrate_mode mode)
+                     unsigned long private, enum migrate_mode mode)
 {
        int pass, rc;
 
        for (pass = 0; pass < 10; pass++) {
-               rc = unmap_and_move_huge_page(get_new_page,
-                                             private, hpage, pass > 2, offlining,
-                                             mode);
+               rc = unmap_and_move_huge_page(get_new_page, private,
+                                               hpage, pass > 2, mode);
                switch (rc) {
                case -ENOMEM:
                        goto out;
@@ -1152,7 +1134,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
                        goto set_status;
 
                /* Use PageReserved to check for zero page */
-               if (PageReserved(page) || PageKsm(page))
+               if (PageReserved(page))
                        goto put_and_set;
 
                pp->page = page;
@@ -1189,8 +1171,7 @@ set_status:
        err = 0;
        if (!list_empty(&pagelist)) {
                err = migrate_pages(&pagelist, new_page_node,
-                               (unsigned long)pm, 0, MIGRATE_SYNC,
-                               MR_SYSCALL);
+                               (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
                if (err)
                        putback_lru_pages(&pagelist);
        }
@@ -1314,7 +1295,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
 
                err = -ENOENT;
                /* Use PageReserved to check for zero page */
-               if (!page || PageReserved(page) || PageKsm(page))
+               if (!page || PageReserved(page))
                        goto set_status;
 
                err = page_to_nid(page);
@@ -1461,7 +1442,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
  * pages. Currently it only checks the watermarks which crude
  */
 static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
-                                  int nr_migrate_pages)
+                                  unsigned long nr_migrate_pages)
 {
        int z;
        for (z = pgdat->nr_zones - 1; z >= 0; z--) {
@@ -1497,7 +1478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
                                          __GFP_NOWARN) &
                                         ~GFP_IOFS, 0);
        if (newpage)
-               page_xchg_last_nid(newpage, page_last_nid(page));
+               page_nid_xchg_last(newpage, page_nid_last(page));
 
        return newpage;
 }
@@ -1557,39 +1538,40 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
 
 int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 {
-       int ret = 0;
+       int page_lru;
+
+       VM_BUG_ON(compound_order(page) && !PageTransHuge(page));
 
        /* Avoid migrating to a node that is nearly full */
-       if (migrate_balanced_pgdat(pgdat, 1)) {
-               int page_lru;
+       if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
+               return 0;
 
-               if (isolate_lru_page(page)) {
-                       put_page(page);
-                       return 0;
-               }
+       if (isolate_lru_page(page))
+               return 0;
 
-               /* Page is isolated */
-               ret = 1;
-               page_lru = page_is_file_cache(page);
-               if (!PageTransHuge(page))
-                       inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
-               else
-                       mod_zone_page_state(page_zone(page),
-                                       NR_ISOLATED_ANON + page_lru,
-                                       HPAGE_PMD_NR);
+       /*
+        * migrate_misplaced_transhuge_page() skips page migration's usual
+        * check on page_count(), so we must do it here, now that the page
+        * has been isolated: a GUP pin, or any other pin, prevents migration.
+        * The expected page count is 3: 1 for page's mapcount and 1 for the
+        * caller's pin and 1 for the reference taken by isolate_lru_page().
+        */
+       if (PageTransHuge(page) && page_count(page) != 3) {
+               putback_lru_page(page);
+               return 0;
        }
 
+       page_lru = page_is_file_cache(page);
+       mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
+                               hpage_nr_pages(page));
+
        /*
-        * Page is either isolated or there is not enough space on the target
-        * node. If isolated, then it has taken a reference count and the
-        * callers reference can be safely dropped without the page
-        * disappearing underneath us during migration. Otherwise the page is
-        * not to be migrated but the callers reference should still be
-        * dropped so it does not leak.
+        * Isolating the page has taken another reference, so the
+        * caller's reference can be safely dropped without the page
+        * disappearing underneath us during migration.
         */
        put_page(page);
-
-       return ret;
+       return 1;
 }
 
 /*
@@ -1600,7 +1582,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 int migrate_misplaced_page(struct page *page, int node)
 {
        pg_data_t *pgdat = NODE_DATA(node);
-       int isolated = 0;
+       int isolated;
        int nr_remaining;
        LIST_HEAD(migratepages);
 
@@ -1608,42 +1590,43 @@ int migrate_misplaced_page(struct page *page, int node)
         * Don't migrate pages that are mapped in multiple processes.
         * TODO: Handle false sharing detection instead of this hammer
         */
-       if (page_mapcount(page) != 1) {
-               put_page(page);
+       if (page_mapcount(page) != 1)
                goto out;
-       }
 
        /*
         * Rate-limit the amount of data that is being migrated to a node.
         * Optimal placement is no good if the memory bus is saturated and
         * all the time is being spent migrating!
         */
-       if (numamigrate_update_ratelimit(pgdat, 1)) {
-               put_page(page);
+       if (numamigrate_update_ratelimit(pgdat, 1))
                goto out;
-       }
 
        isolated = numamigrate_isolate_page(pgdat, page);
        if (!isolated)
                goto out;
 
        list_add(&page->lru, &migratepages);
-       nr_remaining = migrate_pages(&migratepages,
-                       alloc_misplaced_dst_page,
-                       node, false, MIGRATE_ASYNC,
-                       MR_NUMA_MISPLACED);
+       nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+                                    node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
        if (nr_remaining) {
                putback_lru_pages(&migratepages);
                isolated = 0;
        } else
                count_vm_numa_event(NUMA_PAGE_MIGRATE);
        BUG_ON(!list_empty(&migratepages));
-out:
        return isolated;
+
+out:
+       put_page(page);
+       return 0;
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
 #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+/*
+ * Migrates a THP to a given target node. page must be locked and is unlocked
+ * before returning.
+ */
 int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                                struct vm_area_struct *vma,
                                pmd_t *pmd, pmd_t entry,
@@ -1674,29 +1657,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 
        new_page = alloc_pages_node(node,
                (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
-       if (!new_page) {
-               count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-               goto out_dropref;
-       }
-       page_xchg_last_nid(new_page, page_last_nid(page));
+       if (!new_page)
+               goto out_fail;
 
-       isolated = numamigrate_isolate_page(pgdat, page);
+       page_nid_xchg_last(new_page, page_nid_last(page));
 
-       /*
-        * Failing to isolate or a GUP pin prevents migration. The expected
-        * page count is 2. 1 for anonymous pages without a mapping and 1
-        * for the callers pin. If the page was isolated, the page will
-        * need to be put back on the LRU.
-        */
-       if (!isolated || page_count(page) != 2) {
-               count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+       isolated = numamigrate_isolate_page(pgdat, page);
+       if (!isolated) {
                put_page(new_page);
-               if (isolated) {
-                       putback_lru_page(page);
-                       isolated = 0;
-                       goto out;
-               }
-               goto out_keep_locked;
+               goto out_fail;
        }
 
        /* Prepare a page as a migration target */
@@ -1728,6 +1697,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                putback_lru_page(page);
 
                count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+               isolated = 0;
                goto out;
        }
 
@@ -1772,9 +1742,11 @@ out:
                        -HPAGE_PMD_NR);
        return isolated;
 
+out_fail:
+       count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 out_dropref:
+       unlock_page(page);
        put_page(page);
-out_keep_locked:
        return 0;
 }
 #endif /* CONFIG_NUMA_BALANCING */
index 936b4ce..da2be56 100644 (file)
@@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
        /* shmem/tmpfs may return swap: account for swapcache page too. */
        if (radix_tree_exceptional_entry(page)) {
                swp_entry_t swap = radix_to_swp_entry(page);
-               page = find_get_page(&swapper_space, swap.val);
+               page = find_get_page(swap_address_space(swap), swap.val);
        }
 #endif
        if (page) {
@@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        } else {
 #ifdef CONFIG_SWAP
                                pgoff = entry.val;
-                               *vec = mincore_page(&swapper_space, pgoff);
+                               *vec = mincore_page(swap_address_space(entry),
+                                       pgoff);
 #else
                                WARN_ON(1);
                                *vec = 1;
index c9bd528..e6638f5 100644 (file)
@@ -155,13 +155,12 @@ void munlock_vma_page(struct page *page)
  *
  * vma->vm_mm->mmap_sem must be held for at least read.
  */
-static long __mlock_vma_pages_range(struct vm_area_struct *vma,
-                                   unsigned long start, unsigned long end,
-                                   int *nonblocking)
+long __mlock_vma_pages_range(struct vm_area_struct *vma,
+               unsigned long start, unsigned long end, int *nonblocking)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long addr = start;
-       int nr_pages = (end - start) / PAGE_SIZE;
+       unsigned long nr_pages = (end - start) / PAGE_SIZE;
        int gup_flags;
 
        VM_BUG_ON(start & ~PAGE_MASK);
@@ -186,6 +185,10 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
        if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
                gup_flags |= FOLL_FORCE;
 
+       /*
+        * We made sure addr is within a VMA, so the following will
+        * not result in a stack expansion that recurses back here.
+        */
        return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
                                NULL, NULL, nonblocking);
 }
@@ -202,56 +205,6 @@ static int __mlock_posix_error_return(long retval)
        return retval;
 }
 
-/**
- * mlock_vma_pages_range() - mlock pages in specified vma range.
- * @vma - the vma containing the specfied address range
- * @start - starting address in @vma to mlock
- * @end   - end address [+1] in @vma to mlock
- *
- * For mmap()/mremap()/expansion of mlocked vma.
- *
- * return 0 on success for "normal" vmas.
- *
- * return number of pages [> 0] to be removed from locked_vm on success
- * of "special" vmas.
- */
-long mlock_vma_pages_range(struct vm_area_struct *vma,
-                       unsigned long start, unsigned long end)
-{
-       int nr_pages = (end - start) / PAGE_SIZE;
-       BUG_ON(!(vma->vm_flags & VM_LOCKED));
-
-       /*
-        * filter unlockable vmas
-        */
-       if (vma->vm_flags & (VM_IO | VM_PFNMAP))
-               goto no_mlock;
-
-       if (!((vma->vm_flags & VM_DONTEXPAND) ||
-                       is_vm_hugetlb_page(vma) ||
-                       vma == get_gate_vma(current->mm))) {
-
-               __mlock_vma_pages_range(vma, start, end, NULL);
-
-               /* Hide errors from mmap() and other callers */
-               return 0;
-       }
-
-       /*
-        * User mapped kernel pages or huge pages:
-        * make these pages present to populate the ptes, but
-        * fall thru' to reset VM_LOCKED--no need to unlock, and
-        * return nr_pages so these don't get counted against task's
-        * locked limit.  huge pages are already counted against
-        * locked vm limit.
-        */
-       make_pages_present(start, end);
-
-no_mlock:
-       vma->vm_flags &= ~VM_LOCKED;    /* and don't come back! */
-       return nr_pages;                /* error or pages NOT mlocked */
-}
-
 /*
  * munlock_vma_pages_range() - munlock all pages in the vma range.'
  * @vma - vma containing range to be munlock()ed.
@@ -303,7 +256,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
  *
  * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
  * munlock is a no-op.  However, for some special vmas, we go ahead and
- * populate the ptes via make_pages_present().
+ * populate the ptes.
  *
  * For vmas that pass the filters, merge/split as appropriate.
  */
@@ -391,9 +344,9 @@ static int do_mlock(unsigned long start, size_t len, int on)
 
                /* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
 
-               newflags = vma->vm_flags | VM_LOCKED;
-               if (!on)
-                       newflags &= ~VM_LOCKED;
+               newflags = vma->vm_flags & ~VM_LOCKED;
+               if (on)
+                       newflags |= VM_LOCKED | VM_POPULATE;
 
                tmp = vma->vm_end;
                if (tmp > end)
@@ -416,13 +369,20 @@ static int do_mlock(unsigned long start, size_t len, int on)
        return error;
 }
 
-static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
+/*
+ * __mm_populate - populate and/or mlock pages within a range of address space.
+ *
+ * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
+ * flags. VMAs must be already marked with the desired vm_flags, and
+ * mmap_sem must not be held.
+ */
+int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
 {
        struct mm_struct *mm = current->mm;
        unsigned long end, nstart, nend;
        struct vm_area_struct *vma = NULL;
        int locked = 0;
-       int ret = 0;
+       long ret = 0;
 
        VM_BUG_ON(start & ~PAGE_MASK);
        VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -446,7 +406,8 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
                 * range with the first VMA. Also, skip undesirable VMA types.
                 */
                nend = min(end, vma->vm_end);
-               if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+               if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) !=
+                   VM_POPULATE)
                        continue;
                if (nstart < vma->vm_start)
                        nstart = vma->vm_start;
@@ -498,7 +459,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
                error = do_mlock(start, len, 1);
        up_write(&current->mm->mmap_sem);
        if (!error)
-               error = do_mlock_pages(start, len, 0);
+               error = __mm_populate(start, len, 0);
        return error;
 }
 
@@ -519,18 +480,18 @@ static int do_mlockall(int flags)
        struct vm_area_struct * vma, * prev = NULL;
 
        if (flags & MCL_FUTURE)
-               current->mm->def_flags |= VM_LOCKED;
+               current->mm->def_flags |= VM_LOCKED | VM_POPULATE;
        else
-               current->mm->def_flags &= ~VM_LOCKED;
+               current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE);
        if (flags == MCL_FUTURE)
                goto out;
 
        for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
                vm_flags_t newflags;
 
-               newflags = vma->vm_flags | VM_LOCKED;
-               if (!(flags & MCL_CURRENT))
-                       newflags &= ~VM_LOCKED;
+               newflags = vma->vm_flags & ~VM_LOCKED;
+               if (flags & MCL_CURRENT)
+                       newflags |= VM_LOCKED | VM_POPULATE;
 
                /* Ignore errors */
                mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
@@ -564,10 +525,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
            capable(CAP_IPC_LOCK))
                ret = do_mlockall(flags);
        up_write(&current->mm->mmap_sem);
-       if (!ret && (flags & MCL_CURRENT)) {
-               /* Ignore errors */
-               do_mlock_pages(0, TASK_SIZE, 1);
-       }
+       if (!ret && (flags & MCL_CURRENT))
+               mm_populate(0, TASK_SIZE);
 out:
        return ret;
 }
index 1ffd97a..c280a02 100644 (file)
@@ -69,34 +69,41 @@ void __init mminit_verify_pageflags_layout(void)
        unsigned long or_mask, add_mask;
 
        shift = 8 * sizeof(unsigned long);
-       width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
+       width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
        mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
-               "Section %d Node %d Zone %d Flags %d\n",
+               "Section %d Node %d Zone %d Lastnid %d Flags %d\n",
                SECTIONS_WIDTH,
                NODES_WIDTH,
                ZONES_WIDTH,
+               LAST_NID_WIDTH,
                NR_PAGEFLAGS);
        mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
-               "Section %d Node %d Zone %d\n",
+               "Section %d Node %d Zone %d Lastnid %d\n",
                SECTIONS_SHIFT,
                NODES_SHIFT,
-               ZONES_SHIFT);
-       mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
-               "Section %lu Node %lu Zone %lu\n",
+               ZONES_SHIFT,
+               LAST_NID_SHIFT);
+       mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
+               "Section %lu Node %lu Zone %lu Lastnid %lu\n",
                (unsigned long)SECTIONS_PGSHIFT,
                (unsigned long)NODES_PGSHIFT,
-               (unsigned long)ZONES_PGSHIFT);
-       mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
-               "Zone ID: %lu -> %lu\n",
-               (unsigned long)ZONEID_PGOFF,
-               (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
+               (unsigned long)ZONES_PGSHIFT,
+               (unsigned long)LAST_NID_PGSHIFT);
+       mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
+               "Node/Zone ID: %lu -> %lu\n",
+               (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
+               (unsigned long)ZONEID_PGOFF);
        mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
-               "location: %d -> %d unused %d -> %d flags %d -> %d\n",
+               "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
                shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
 #ifdef NODE_NOT_IN_PAGE_FLAGS
        mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
                "Node not in page flags");
 #endif
+#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+       mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
+               "Last nid not in page flags");
+#endif
 
        if (SECTIONS_WIDTH) {
                shift -= SECTIONS_WIDTH;
index 09da0b2..318e121 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -144,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
                 */
                free -= global_page_state(NR_SHMEM);
 
-               free += nr_swap_pages;
+               free += get_nr_swap_pages();
 
                /*
                 * Any slabs which are created with the
@@ -256,6 +256,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
        unsigned long newbrk, oldbrk;
        struct mm_struct *mm = current->mm;
        unsigned long min_brk;
+       bool populate;
 
        down_write(&mm->mmap_sem);
 
@@ -305,8 +306,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
        /* Ok, looks good - let it rip. */
        if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
                goto out;
+
 set_brk:
        mm->brk = brk;
+       populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
+       up_write(&mm->mmap_sem);
+       if (populate)
+               mm_populate(oldbrk, newbrk - oldbrk);
+       return brk;
+
 out:
        retval = mm->brk;
        up_write(&mm->mmap_sem);
@@ -801,7 +809,7 @@ again:                      remove_next = 1 + (end > next->vm_end);
                anon_vma_interval_tree_post_update_vma(vma);
                if (adjust_next)
                        anon_vma_interval_tree_post_update_vma(next);
-               anon_vma_unlock(anon_vma);
+               anon_vma_unlock_write(anon_vma);
        }
        if (mapping)
                mutex_unlock(&mapping->i_mmap_mutex);
@@ -1154,12 +1162,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
 
 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
-                       unsigned long flags, unsigned long pgoff)
+                       unsigned long flags, unsigned long pgoff,
+                       unsigned long *populate)
 {
        struct mm_struct * mm = current->mm;
        struct inode *inode;
        vm_flags_t vm_flags;
 
+       *populate = 0;
+
        /*
         * Does the application expect PROT_READ to imply PROT_EXEC?
         *
@@ -1280,7 +1291,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                }
        }
 
-       return mmap_region(file, addr, len, flags, vm_flags, pgoff);
+       /*
+        * Set 'VM_NORESERVE' if we should not account for the
+        * memory use of this mapping.
+        */
+       if (flags & MAP_NORESERVE) {
+               /* We honor MAP_NORESERVE if allowed to overcommit */
+               if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+                       vm_flags |= VM_NORESERVE;
+
+               /* hugetlb applies strict overcommit unless MAP_NORESERVE */
+               if (file && is_file_hugepages(file))
+                       vm_flags |= VM_NORESERVE;
+       }
+
+       addr = mmap_region(file, addr, len, vm_flags, pgoff);
+       if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE))
+               *populate = len;
+       return addr;
 }
 
 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
@@ -1395,8 +1423,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 }
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
-                         unsigned long len, unsigned long flags,
-                         vm_flags_t vm_flags, unsigned long pgoff)
+               unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
@@ -1420,20 +1447,6 @@ munmap_back:
                return -ENOMEM;
 
        /*
-        * Set 'VM_NORESERVE' if we should not account for the
-        * memory use of this mapping.
-        */
-       if ((flags & MAP_NORESERVE)) {
-               /* We honor MAP_NORESERVE if allowed to overcommit */
-               if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
-                       vm_flags |= VM_NORESERVE;
-
-               /* hugetlb applies strict overcommit unless MAP_NORESERVE */
-               if (file && is_file_hugepages(file))
-                       vm_flags |= VM_NORESERVE;
-       }
-
-       /*
         * Private writable mapping: check memory availability
         */
        if (accountable_mapping(file, vm_flags)) {
@@ -1531,10 +1544,12 @@ out:
 
        vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
        if (vm_flags & VM_LOCKED) {
-               if (!mlock_vma_pages_range(vma, addr, addr + len))
+               if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
+                                       vma == get_gate_vma(current->mm)))
                        mm->locked_vm += (len >> PAGE_SHIFT);
-       } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
-               make_pages_present(addr, addr + len);
+               else
+                       vma->vm_flags &= ~VM_LOCKED;
+       }
 
        if (file)
                uprobe_mmap(vma);
@@ -2187,9 +2202,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
                return vma;
        if (!prev || expand_stack(prev, addr))
                return NULL;
-       if (prev->vm_flags & VM_LOCKED) {
-               mlock_vma_pages_range(prev, addr, prev->vm_end);
-       }
+       if (prev->vm_flags & VM_LOCKED)
+               __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
        return prev;
 }
 #else
@@ -2215,9 +2229,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
        start = vma->vm_start;
        if (expand_stack(vma, addr))
                return NULL;
-       if (vma->vm_flags & VM_LOCKED) {
-               mlock_vma_pages_range(vma, addr, start);
-       }
+       if (vma->vm_flags & VM_LOCKED)
+               __mlock_vma_pages_range(vma, addr, start, NULL);
        return vma;
 }
 #endif
@@ -2590,10 +2603,8 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 out:
        perf_event_mmap(vma);
        mm->total_vm += len >> PAGE_SHIFT;
-       if (flags & VM_LOCKED) {
-               if (!mlock_vma_pages_range(vma, addr, addr + len))
-                       mm->locked_vm += (len >> PAGE_SHIFT);
-       }
+       if (flags & VM_LOCKED)
+               mm->locked_vm += (len >> PAGE_SHIFT);
        return addr;
 }
 
@@ -2601,10 +2612,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
 {
        struct mm_struct *mm = current->mm;
        unsigned long ret;
+       bool populate;
 
        down_write(&mm->mmap_sem);
        ret = do_brk(addr, len);
+       populate = ((mm->def_flags & VM_LOCKED) != 0);
        up_write(&mm->mmap_sem);
+       if (populate)
+               mm_populate(addr, len);
        return ret;
 }
 EXPORT_SYMBOL(vm_brk);
@@ -3002,7 +3017,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
                if (!__test_and_clear_bit(0, (unsigned long *)
                                          &anon_vma->root->rb_root.rb_node))
                        BUG();
-               anon_vma_unlock(anon_vma);
+               anon_vma_unlock_write(anon_vma);
        }
 }
 
index 8a5ac8c..2175fb0 100644 (file)
@@ -37,49 +37,51 @@ static struct srcu_struct srcu;
 void __mmu_notifier_release(struct mm_struct *mm)
 {
        struct mmu_notifier *mn;
-       struct hlist_node *n;
        int id;
 
        /*
-        * SRCU here will block mmu_notifier_unregister until
-        * ->release returns.
+        * srcu_read_lock() here will block synchronize_srcu() in
+        * mmu_notifier_unregister() until all registered
+        * ->release() callouts this function makes have
+        * returned.
         */
        id = srcu_read_lock(&srcu);
-       hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
-               /*
-                * if ->release runs before mmu_notifier_unregister it
-                * must be handled as it's the only way for the driver
-                * to flush all existing sptes and stop the driver
-                * from establishing any more sptes before all the
-                * pages in the mm are freed.
-                */
-               if (mn->ops->release)
-                       mn->ops->release(mn, mm);
-       srcu_read_unlock(&srcu, id);
-
        spin_lock(&mm->mmu_notifier_mm->lock);
        while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
                mn = hlist_entry(mm->mmu_notifier_mm->list.first,
                                 struct mmu_notifier,
                                 hlist);
+
                /*
-                * We arrived before mmu_notifier_unregister so
-                * mmu_notifier_unregister will do nothing other than
-                * to wait ->release to finish and
-                * mmu_notifier_unregister to return.
+                * Unlink.  This will prevent mmu_notifier_unregister()
+                * from also making the ->release() callout.
                 */
                hlist_del_init_rcu(&mn->hlist);
+               spin_unlock(&mm->mmu_notifier_mm->lock);
+
+               /*
+                * Clear sptes. (see 'release' description in mmu_notifier.h)
+                */
+               if (mn->ops->release)
+                       mn->ops->release(mn, mm);
+
+               spin_lock(&mm->mmu_notifier_mm->lock);
        }
        spin_unlock(&mm->mmu_notifier_mm->lock);
 
        /*
-        * synchronize_srcu here prevents mmu_notifier_release to
-        * return to exit_mmap (which would proceed freeing all pages
-        * in the mm) until the ->release method returns, if it was
-        * invoked by mmu_notifier_unregister.
-        *
-        * The mmu_notifier_mm can't go away from under us because one
-        * mm_count is hold by exit_mmap.
+        * All callouts to ->release() which we have done are complete.
+        * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
+        */
+       srcu_read_unlock(&srcu, id);
+
+       /*
+        * mmu_notifier_unregister() may have unlinked a notifier and may
+        * still be calling out to it.  Additionally, other notifiers
+        * may have been active via vmtruncate() et. al. Block here
+        * to ensure that all notifier callouts for this mm have been
+        * completed and the sptes are really cleaned up before returning
+        * to exit_mmap().
         */
        synchronize_srcu(&srcu);
 }
@@ -170,6 +172,7 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
        }
        srcu_read_unlock(&srcu, id);
 }
+EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
 
 void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
                                  unsigned long start, unsigned long end)
@@ -185,6 +188,7 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
        }
        srcu_read_unlock(&srcu, id);
 }
+EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
 
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
                                    struct mm_struct *mm,
@@ -294,31 +298,31 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 {
        BUG_ON(atomic_read(&mm->mm_count) <= 0);
 
+       spin_lock(&mm->mmu_notifier_mm->lock);
        if (!hlist_unhashed(&mn->hlist)) {
-               /*
-                * SRCU here will force exit_mmap to wait ->release to finish
-                * before freeing the pages.
-                */
                int id;
 
-               id = srcu_read_lock(&srcu);
                /*
-                * exit_mmap will block in mmu_notifier_release to
-                * guarantee ->release is called before freeing the
-                * pages.
+                * Ensure we synchronize up with __mmu_notifier_release().
                 */
+               id = srcu_read_lock(&srcu);
+
+               hlist_del_rcu(&mn->hlist);
+               spin_unlock(&mm->mmu_notifier_mm->lock);
+
                if (mn->ops->release)
                        mn->ops->release(mn, mm);
-               srcu_read_unlock(&srcu, id);
 
-               spin_lock(&mm->mmu_notifier_mm->lock);
-               hlist_del_rcu(&mn->hlist);
+               /*
+                * Allow __mmu_notifier_release() to complete.
+                */
+               srcu_read_unlock(&srcu, id);
+       } else
                spin_unlock(&mm->mmu_notifier_mm->lock);
-       }
 
        /*
-        * Wait any running method to finish, of course including
-        * ->release if it was run by mmu_notifier_relase instead of us.
+        * Wait for any running method to finish, including ->release() if it
+        * was run by __mmu_notifier_release() instead of us.
         */
        synchronize_srcu(&srcu);
 
index 4596d81..2ac0afb 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * linux/mm/mmzone.c
  *
- * management codes for pgdats and zones.
+ * management codes for pgdats, zones and page flags
  */
 
 
@@ -96,3 +96,21 @@ void lruvec_init(struct lruvec *lruvec)
        for_each_lru(lru)
                INIT_LIST_HEAD(&lruvec->lists[lru]);
 }
+
+#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
+int page_nid_xchg_last(struct page *page, int nid)
+{
+       unsigned long old_flags, flags;
+       int last_nid;
+
+       do {
+               old_flags = flags = page->flags;
+               last_nid = page_nid_last(page);
+
+               flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
+               flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
+       } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
+
+       return last_nid;
+}
+#endif
index f9766f4..463a257 100644 (file)
@@ -135,7 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
        pte_unmap(new_pte - 1);
        pte_unmap_unlock(old_pte - 1, old_ptl);
        if (anon_vma)
-               anon_vma_unlock(anon_vma);
+               anon_vma_unlock_write(anon_vma);
        if (mapping)
                mutex_unlock(&mapping->i_mmap_mutex);
 }
@@ -209,7 +209,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 
 static unsigned long move_vma(struct vm_area_struct *vma,
                unsigned long old_addr, unsigned long old_len,
-               unsigned long new_len, unsigned long new_addr)
+               unsigned long new_len, unsigned long new_addr, bool *locked)
 {
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma;
@@ -300,9 +300,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 
        if (vm_flags & VM_LOCKED) {
                mm->locked_vm += new_len >> PAGE_SHIFT;
-               if (new_len > old_len)
-                       mlock_vma_pages_range(new_vma, new_addr + old_len,
-                                                      new_addr + new_len);
+               *locked = true;
        }
 
        return new_addr;
@@ -367,9 +365,8 @@ Eagain:
        return ERR_PTR(-EAGAIN);
 }
 
-static unsigned long mremap_to(unsigned long addr,
-       unsigned long old_len, unsigned long new_addr,
-       unsigned long new_len)
+static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
+               unsigned long new_addr, unsigned long new_len, bool *locked)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
@@ -419,7 +416,7 @@ static unsigned long mremap_to(unsigned long addr,
        if (ret & ~PAGE_MASK)
                goto out1;
 
-       ret = move_vma(vma, addr, old_len, new_len, new_addr);
+       ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
        if (!(ret & ~PAGE_MASK))
                goto out;
 out1:
@@ -457,6 +454,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;
+       bool locked = false;
 
        down_write(&current->mm->mmap_sem);
 
@@ -479,7 +477,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
        if (flags & MREMAP_FIXED) {
                if (flags & MREMAP_MAYMOVE)
-                       ret = mremap_to(addr, old_len, new_addr, new_len);
+                       ret = mremap_to(addr, old_len, new_addr, new_len,
+                                       &locked);
                goto out;
        }
 
@@ -521,8 +520,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                        vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
                        if (vma->vm_flags & VM_LOCKED) {
                                mm->locked_vm += pages;
-                               mlock_vma_pages_range(vma, addr + old_len,
-                                                  addr + new_len);
+                               locked = true;
+                               new_addr = addr;
                        }
                        ret = addr;
                        goto out;
@@ -548,11 +547,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                        goto out;
                }
 
-               ret = move_vma(vma, addr, old_len, new_len, new_addr);
+               ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
        }
 out:
        if (ret & ~PAGE_MASK)
                vm_unacct_memory(charged);
        up_write(&current->mm->mmap_sem);
+       if (locked && new_len > old_len)
+               mm_populate(new_addr + old_len, new_len - old_len);
        return ret;
 }
index b20db4e..da0d210 100644 (file)
@@ -140,10 +140,10 @@ unsigned int kobjsize(const void *objp)
        return PAGE_SIZE << compound_order(page);
 }
 
-int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-                    unsigned long start, int nr_pages, unsigned int foll_flags,
-                    struct page **pages, struct vm_area_struct **vmas,
-                    int *retry)
+long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+                     unsigned long start, unsigned long nr_pages,
+                     unsigned int foll_flags, struct page **pages,
+                     struct vm_area_struct **vmas, int *nonblocking)
 {
        struct vm_area_struct *vma;
        unsigned long vm_flags;
@@ -190,9 +190,10 @@ finish_or_fault:
  *   slab page or a secondary page from a compound page
  * - don't permit access to VMAs that don't support it, such as I/O mappings
  */
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-       unsigned long start, int nr_pages, int write, int force,
-       struct page **pages, struct vm_area_struct **vmas)
+long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+                   unsigned long start, unsigned long nr_pages,
+                   int write, int force, struct page **pages,
+                   struct vm_area_struct **vmas)
 {
        int flags = 0;
 
@@ -1250,7 +1251,8 @@ unsigned long do_mmap_pgoff(struct file *file,
                            unsigned long len,
                            unsigned long prot,
                            unsigned long flags,
-                           unsigned long pgoff)
+                           unsigned long pgoff,
+                           unsigned long *populate)
 {
        struct vm_area_struct *vma;
        struct vm_region *region;
@@ -1260,6 +1262,8 @@ unsigned long do_mmap_pgoff(struct file *file,
 
        kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
 
+       *populate = 0;
+
        /* decide whether we should attempt the mapping, and if so what sort of
         * mapping */
        ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
@@ -1815,9 +1819,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        return ret;
 }
 
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
-                       unsigned int foll_flags)
+struct page *follow_page_mask(struct vm_area_struct *vma,
+                             unsigned long address, unsigned int flags,
+                             unsigned int *page_mask)
 {
+       *page_mask = 0;
        return NULL;
 }
 
@@ -1904,7 +1910,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
                 */
                free -= global_page_state(NR_SHMEM);
 
-               free += nr_swap_pages;
+               free += get_nr_swap_pages();
 
                /*
                 * Any slabs which are created with the
index 0399f14..79e451a 100644 (file)
@@ -386,8 +386,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
        cpuset_print_task_mems_allowed(current);
        task_unlock(current);
        dump_stack();
-       mem_cgroup_print_oom_info(memcg, p);
-       show_mem(SHOW_MEM_FILTER_NODES);
+       if (memcg)
+               mem_cgroup_print_oom_info(memcg, p);
+       else
+               show_mem(SHOW_MEM_FILTER_NODES);
        if (sysctl_oom_dump_tasks)
                dump_tasks(memcg, nodemask);
 }
index 7300c9d..cdc377c 100644 (file)
@@ -241,6 +241,9 @@ static unsigned long global_dirtyable_memory(void)
        if (!vm_highmem_is_dirtyable)
                x -= highmem_dirtyable_memory(x);
 
+       /* Subtract min_free_kbytes */
+       x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10));
+
        return x + 1;   /* Ensure that we never return 0 */
 }
 
index d1107ad..e9075fd 100644 (file)
@@ -202,11 +202,18 @@ static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+/* Movable memory ranges, will also be used by memblock subsystem. */
+struct movablemem_map movablemem_map = {
+       .acpi = false,
+       .nr_map = 0,
+};
+
 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES];
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -240,15 +247,20 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
        int ret = 0;
        unsigned seq;
        unsigned long pfn = page_to_pfn(page);
+       unsigned long sp, start_pfn;
 
        do {
                seq = zone_span_seqbegin(zone);
-               if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
-                       ret = 1;
-               else if (pfn < zone->zone_start_pfn)
+               start_pfn = zone->zone_start_pfn;
+               sp = zone->spanned_pages;
+               if (!zone_spans_pfn(zone, pfn))
                        ret = 1;
        } while (zone_span_seqretry(zone, seq));
 
+       if (ret)
+               pr_err("page %lu outside zone [ %lu - %lu ]\n",
+                       pfn, start_pfn, start_pfn + sp);
+
        return ret;
 }
 
@@ -288,7 +300,7 @@ static void bad_page(struct page *page)
 
        /* Don't complain about poisoned pages */
        if (PageHWPoison(page)) {
-               reset_page_mapcount(page); /* remove PageBuddy */
+               page_mapcount_reset(page); /* remove PageBuddy */
                return;
        }
 
@@ -320,7 +332,7 @@ static void bad_page(struct page *page)
        dump_stack();
 out:
        /* Leave bad fields for debug, except PageBuddy could make trouble */
-       reset_page_mapcount(page); /* remove PageBuddy */
+       page_mapcount_reset(page); /* remove PageBuddy */
        add_taint(TAINT_BAD_PAGE);
 }
 
@@ -533,6 +545,8 @@ static inline void __free_one_page(struct page *page,
        unsigned long uninitialized_var(buddy_idx);
        struct page *buddy;
 
+       VM_BUG_ON(!zone_is_initialized(zone));
+
        if (unlikely(PageCompound(page)))
                if (unlikely(destroy_compound_page(page, order)))
                        return;
@@ -606,7 +620,7 @@ static inline int free_pages_check(struct page *page)
                bad_page(page);
                return 1;
        }
-       reset_page_last_nid(page);
+       page_nid_reset_last(page);
        if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
                page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
        return 0;
@@ -666,7 +680,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                        /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
                        __free_one_page(page, zone, 0, mt);
                        trace_mm_page_pcpu_drain(page, 0, mt);
-                       if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
+                       if (likely(!is_migrate_isolate_page(page))) {
                                __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
                                if (is_migrate_cma(mt))
                                        __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
@@ -684,7 +698,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
        zone->pages_scanned = 0;
 
        __free_one_page(page, zone, order, migratetype);
-       if (unlikely(migratetype != MIGRATE_ISOLATE))
+       if (unlikely(!is_migrate_isolate(migratetype)))
                __mod_zone_freepage_state(zone, 1 << order, migratetype);
        spin_unlock(&zone->lock);
 }
@@ -916,7 +930,9 @@ static int fallbacks[MIGRATE_TYPES][4] = {
        [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
 #endif
        [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
+#ifdef CONFIG_MEMORY_ISOLATION
        [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
+#endif
 };
 
 /*
@@ -981,9 +997,9 @@ int move_freepages_block(struct zone *zone, struct page *page,
        end_pfn = start_pfn + pageblock_nr_pages - 1;
 
        /* Do not cross zone boundaries */
-       if (start_pfn < zone->zone_start_pfn)
+       if (!zone_spans_pfn(zone, start_pfn))
                start_page = page;
-       if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
+       if (!zone_spans_pfn(zone, end_pfn))
                return 0;
 
        return move_freepages(zone, start_page, end_page, migratetype);
@@ -1142,7 +1158,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                        list_add_tail(&page->lru, list);
                if (IS_ENABLED(CONFIG_CMA)) {
                        mt = get_pageblock_migratetype(page);
-                       if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
+                       if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
                                mt = migratetype;
                }
                set_freepage_migratetype(page, mt);
@@ -1277,7 +1293,7 @@ void mark_free_pages(struct zone *zone)
 
        spin_lock_irqsave(&zone->lock, flags);
 
-       max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+       max_zone_pfn = zone_end_pfn(zone);
        for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
                if (pfn_valid(pfn)) {
                        struct page *page = pfn_to_page(pfn);
@@ -1326,7 +1342,7 @@ void free_hot_cold_page(struct page *page, int cold)
         * excessively into the page allocator
         */
        if (migratetype >= MIGRATE_PCPTYPES) {
-               if (unlikely(migratetype == MIGRATE_ISOLATE)) {
+               if (unlikely(is_migrate_isolate(migratetype))) {
                        free_one_page(zone, page, 0, migratetype);
                        goto out;
                }
@@ -1400,7 +1416,7 @@ static int __isolate_free_page(struct page *page, unsigned int order)
        zone = page_zone(page);
        mt = get_pageblock_migratetype(page);
 
-       if (mt != MIGRATE_ISOLATE) {
+       if (!is_migrate_isolate(mt)) {
                /* Obey watermarks as if the page was being allocated */
                watermark = low_wmark_pages(zone) + (1 << order);
                if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
@@ -1419,7 +1435,7 @@ static int __isolate_free_page(struct page *page, unsigned int order)
                struct page *endpage = page + (1 << order) - 1;
                for (; page < endpage; page += pageblock_nr_pages) {
                        int mt = get_pageblock_migratetype(page);
-                       if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
+                       if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
                                set_pageblock_migratetype(page,
                                                          MIGRATE_MOVABLE);
                }
@@ -2615,10 +2631,17 @@ retry_cpuset:
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                        zonelist, high_zoneidx, alloc_flags,
                        preferred_zone, migratetype);
-       if (unlikely(!page))
+       if (unlikely(!page)) {
+               /*
+                * Runtime PM, block IO and its error handling path
+                * can deadlock because I/O on the device might not
+                * complete.
+                */
+               gfp_mask = memalloc_noio_flags(gfp_mask);
                page = __alloc_pages_slowpath(gfp_mask, order,
                                zonelist, high_zoneidx, nodemask,
                                preferred_zone, migratetype);
+       }
 
        trace_mm_page_alloc(page, order, gfp_mask, migratetype);
 
@@ -2790,18 +2813,27 @@ void free_pages_exact(void *virt, size_t size)
 }
 EXPORT_SYMBOL(free_pages_exact);
 
-static unsigned int nr_free_zone_pages(int offset)
+/**
+ * nr_free_zone_pages - count number of pages beyond high watermark
+ * @offset: The zone index of the highest zone
+ *
+ * nr_free_zone_pages() counts the number of counts pages which are beyond the
+ * high watermark within all zones at or below a given zone index.  For each
+ * zone, the number of pages is calculated as:
+ *     present_pages - high_pages
+ */
+static unsigned long nr_free_zone_pages(int offset)
 {
        struct zoneref *z;
        struct zone *zone;
 
        /* Just pick one node, since fallback list is circular */
-       unsigned int sum = 0;
+       unsigned long sum = 0;
 
        struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
 
        for_each_zone_zonelist(zone, z, zonelist, offset) {
-               unsigned long size = zone->present_pages;
+               unsigned long size = zone->managed_pages;
                unsigned long high = high_wmark_pages(zone);
                if (size > high)
                        sum += size - high;
@@ -2810,19 +2842,25 @@ static unsigned int nr_free_zone_pages(int offset)
        return sum;
 }
 
-/*
- * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
+/**
+ * nr_free_buffer_pages - count number of pages beyond high watermark
+ *
+ * nr_free_buffer_pages() counts the number of pages which are beyond the high
+ * watermark within ZONE_DMA and ZONE_NORMAL.
  */
-unsigned int nr_free_buffer_pages(void)
+unsigned long nr_free_buffer_pages(void)
 {
        return nr_free_zone_pages(gfp_zone(GFP_USER));
 }
 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
 
-/*
- * Amount of free RAM allocatable within all zones
+/**
+ * nr_free_pagecache_pages - count number of pages beyond high watermark
+ *
+ * nr_free_pagecache_pages() counts the number of pages which are beyond the
+ * high watermark within all zones.
  */
-unsigned int nr_free_pagecache_pages(void)
+unsigned long nr_free_pagecache_pages(void)
 {
        return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
 }
@@ -2854,7 +2892,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
        val->totalram = pgdat->node_present_pages;
        val->freeram = node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
-       val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
+       val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
        val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
                        NR_FREE_PAGES);
 #else
@@ -2897,7 +2935,9 @@ static void show_migration_types(unsigned char type)
 #ifdef CONFIG_CMA
                [MIGRATE_CMA]           = 'C',
 #endif
+#ifdef CONFIG_MEMORY_ISOLATION
                [MIGRATE_ISOLATE]       = 'I',
+#endif
        };
        char tmp[MIGRATE_TYPES + 1];
        char *p = tmp;
@@ -3236,7 +3276,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
 {
        int n, val;
        int min_val = INT_MAX;
-       int best_node = -1;
+       int best_node = NUMA_NO_NODE;
        const struct cpumask *tmp = cpumask_of_node(0);
 
        /* Use the local node if we haven't already */
@@ -3780,7 +3820,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
         * the block.
         */
        start_pfn = zone->zone_start_pfn;
-       end_pfn = start_pfn + zone->spanned_pages;
+       end_pfn = zone_end_pfn(zone);
        start_pfn = roundup(start_pfn, pageblock_nr_pages);
        reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
                                                        pageblock_order;
@@ -3876,8 +3916,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                set_page_links(page, zone, nid, pfn);
                mminit_verify_page_links(page, zone, nid, pfn);
                init_page_count(page);
-               reset_page_mapcount(page);
-               reset_page_last_nid(page);
+               page_mapcount_reset(page);
+               page_nid_reset_last(page);
                SetPageReserved(page);
                /*
                 * Mark the block movable so that blocks are reserved for
@@ -3894,7 +3934,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                 * pfn out of zone.
                 */
                if ((z->zone_start_pfn <= pfn)
-                   && (pfn < z->zone_start_pfn + z->spanned_pages)
+                   && (pfn < zone_end_pfn(z))
                    && !(pfn & (pageblock_nr_pages - 1)))
                        set_pageblock_migratetype(page, MIGRATE_MOVABLE);
 
@@ -3932,7 +3972,7 @@ static int __meminit zone_batchsize(struct zone *zone)
         *
         * OK, so we don't know how big the cache is.  So guess.
         */
-       batch = zone->present_pages / 1024;
+       batch = zone->managed_pages / 1024;
        if (batch * PAGE_SIZE > 512 * 1024)
                batch = (512 * 1024) / PAGE_SIZE;
        batch /= 4;             /* We effectively *= 4 below */
@@ -4016,7 +4056,7 @@ static void __meminit setup_zone_pageset(struct zone *zone)
 
                if (percpu_pagelist_fraction)
                        setup_pagelist_highmark(pcp,
-                               (zone->present_pages /
+                               (zone->managed_pages /
                                        percpu_pagelist_fraction));
        }
 }
@@ -4372,6 +4412,77 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
        return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
 }
 
+/**
+ * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array.
+ *
+ * zone_movable_limit is initialized as 0. This function will try to get
+ * the first ZONE_MOVABLE pfn of each node from movablemem_map, and
+ * assigne them to zone_movable_limit.
+ * zone_movable_limit[nid] == 0 means no limit for the node.
+ *
+ * Note: Each range is represented as [start_pfn, end_pfn)
+ */
+static void __meminit sanitize_zone_movable_limit(void)
+{
+       int map_pos = 0, i, nid;
+       unsigned long start_pfn, end_pfn;
+
+       if (!movablemem_map.nr_map)
+               return;
+
+       /* Iterate all ranges from minimum to maximum */
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+               /*
+                * If we have found lowest pfn of ZONE_MOVABLE of the node
+                * specified by user, just go on to check next range.
+                */
+               if (zone_movable_limit[nid])
+                       continue;
+
+#ifdef CONFIG_ZONE_DMA
+               /* Skip DMA memory. */
+               if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA])
+                       start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA];
+#endif
+
+#ifdef CONFIG_ZONE_DMA32
+               /* Skip DMA32 memory. */
+               if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32])
+                       start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32];
+#endif
+
+#ifdef CONFIG_HIGHMEM
+               /* Skip lowmem if ZONE_MOVABLE is highmem. */
+               if (zone_movable_is_highmem() &&
+                   start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])
+                       start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
+#endif
+
+               if (start_pfn >= end_pfn)
+                       continue;
+
+               while (map_pos < movablemem_map.nr_map) {
+                       if (end_pfn <= movablemem_map.map[map_pos].start_pfn)
+                               break;
+
+                       if (start_pfn >= movablemem_map.map[map_pos].end_pfn) {
+                               map_pos++;
+                               continue;
+                       }
+
+                       /*
+                        * The start_pfn of ZONE_MOVABLE is either the minimum
+                        * pfn specified by movablemem_map, or 0, which means
+                        * the node has no ZONE_MOVABLE.
+                        */
+                       zone_movable_limit[nid] = max(start_pfn,
+                                       movablemem_map.map[map_pos].start_pfn);
+
+                       break;
+               }
+       }
+}
+
 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
                                        unsigned long zone_type,
@@ -4389,7 +4500,6 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
 
        return zholes_size[zone_type];
 }
-
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
@@ -4573,7 +4683,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                nr_all_pages += freesize;
 
                zone->spanned_pages = size;
-               zone->present_pages = freesize;
+               zone->present_pages = realsize;
                /*
                 * Set an approximate value for lowmem here, it will be adjusted
                 * when the bootmem allocator frees pages into the buddy system.
@@ -4625,7 +4735,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                 * for the buddy allocator to function correctly.
                 */
                start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
-               end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+               end = pgdat_end_pfn(pgdat);
                end = ALIGN(end, MAX_ORDER_NR_PAGES);
                size =  (end - start) * sizeof(struct page);
                map = alloc_remap(pgdat->node_id, size);
@@ -4831,12 +4941,19 @@ static void __init find_zone_movable_pfns_for_nodes(void)
                required_kernelcore = max(required_kernelcore, corepages);
        }
 
-       /* If kernelcore was not specified, there is no ZONE_MOVABLE */
-       if (!required_kernelcore)
+       /*
+        * If neither kernelcore/movablecore nor movablemem_map is specified,
+        * there is no ZONE_MOVABLE. But if movablemem_map is specified, the
+        * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[].
+        */
+       if (!required_kernelcore) {
+               if (movablemem_map.nr_map)
+                       memcpy(zone_movable_pfn, zone_movable_limit,
+                               sizeof(zone_movable_pfn));
                goto out;
+       }
 
        /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
-       find_usable_zone_for_movable();
        usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
 
 restart:
@@ -4864,10 +4981,24 @@ restart:
                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
                        unsigned long size_pages;
 
+                       /*
+                        * Find more memory for kernelcore in
+                        * [zone_movable_pfn[nid], zone_movable_limit[nid]).
+                        */
                        start_pfn = max(start_pfn, zone_movable_pfn[nid]);
                        if (start_pfn >= end_pfn)
                                continue;
 
+                       if (zone_movable_limit[nid]) {
+                               end_pfn = min(end_pfn, zone_movable_limit[nid]);
+                               /* No range left for kernelcore in this node */
+                               if (start_pfn >= end_pfn) {
+                                       zone_movable_pfn[nid] =
+                                                       zone_movable_limit[nid];
+                                       break;
+                               }
+                       }
+
                        /* Account for what is only usable for kernelcore */
                        if (start_pfn < usable_startpfn) {
                                unsigned long kernel_pages;
@@ -4927,12 +5058,12 @@ restart:
        if (usable_nodes && required_kernelcore > usable_nodes)
                goto restart;
 
+out:
        /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
        for (nid = 0; nid < MAX_NUMNODES; nid++)
                zone_movable_pfn[nid] =
                        roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
 
-out:
        /* restore the node_state */
        node_states[N_MEMORY] = saved_node_state;
 }
@@ -4995,6 +5126,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 
        /* Find the PFNs that ZONE_MOVABLE begins at in each node */
        memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+       find_usable_zone_for_movable();
+       sanitize_zone_movable_limit();
        find_zone_movable_pfns_for_nodes();
 
        /* Print out the zone ranges */
@@ -5078,6 +5211,181 @@ static int __init cmdline_parse_movablecore(char *p)
 early_param("kernelcore", cmdline_parse_kernelcore);
 early_param("movablecore", cmdline_parse_movablecore);
 
+/**
+ * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[].
+ * @start_pfn: start pfn of the range to be checked
+ * @end_pfn:   end pfn of the range to be checked (exclusive)
+ *
+ * This function checks if a given memory range [start_pfn, end_pfn) overlaps
+ * the movablemem_map.map[] array.
+ *
+ * Return: index of the first overlapped element in movablemem_map.map[]
+ *         or -1 if they don't overlap each other.
+ */
+int __init movablemem_map_overlap(unsigned long start_pfn,
+                                  unsigned long end_pfn)
+{
+       int overlap;
+
+       if (!movablemem_map.nr_map)
+               return -1;
+
+       for (overlap = 0; overlap < movablemem_map.nr_map; overlap++)
+               if (start_pfn < movablemem_map.map[overlap].end_pfn)
+                       break;
+
+       if (overlap == movablemem_map.nr_map ||
+           end_pfn <= movablemem_map.map[overlap].start_pfn)
+               return -1;
+
+       return overlap;
+}
+
+/**
+ * insert_movablemem_map - Insert a memory range into movablemem_map.map.
+ * @start_pfn: start pfn of the range
+ * @end_pfn:   end pfn of the range
+ *
+ * This function will also merge the overlapped ranges, and sort the array
+ * by start_pfn in monotonic increasing order.
+ */
+void __init insert_movablemem_map(unsigned long start_pfn,
+                                 unsigned long end_pfn)
+{
+       int pos, overlap;
+
+       /*
+        * pos will be at the 1st overlapped range, or the position
+        * where the element should be inserted.
+        */
+       for (pos = 0; pos < movablemem_map.nr_map; pos++)
+               if (start_pfn <= movablemem_map.map[pos].end_pfn)
+                       break;
+
+       /* If there is no overlapped range, just insert the element. */
+       if (pos == movablemem_map.nr_map ||
+           end_pfn < movablemem_map.map[pos].start_pfn) {
+               /*
+                * If pos is not the end of array, we need to move all
+                * the rest elements backward.
+                */
+               if (pos < movablemem_map.nr_map)
+                       memmove(&movablemem_map.map[pos+1],
+                               &movablemem_map.map[pos],
+                               sizeof(struct movablemem_entry) *
+                               (movablemem_map.nr_map - pos));
+               movablemem_map.map[pos].start_pfn = start_pfn;
+               movablemem_map.map[pos].end_pfn = end_pfn;
+               movablemem_map.nr_map++;
+               return;
+       }
+
+       /* overlap will be at the last overlapped range */
+       for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++)
+               if (end_pfn < movablemem_map.map[overlap].start_pfn)
+                       break;
+
+       /*
+        * If there are more ranges overlapped, we need to merge them,
+        * and move the rest elements forward.
+        */
+       overlap--;
+       movablemem_map.map[pos].start_pfn = min(start_pfn,
+                                       movablemem_map.map[pos].start_pfn);
+       movablemem_map.map[pos].end_pfn = max(end_pfn,
+                                       movablemem_map.map[overlap].end_pfn);
+
+       if (pos != overlap && overlap + 1 != movablemem_map.nr_map)
+               memmove(&movablemem_map.map[pos+1],
+                       &movablemem_map.map[overlap+1],
+                       sizeof(struct movablemem_entry) *
+                       (movablemem_map.nr_map - overlap - 1));
+
+       movablemem_map.nr_map -= overlap - pos;
+}
+
+/**
+ * movablemem_map_add_region - Add a memory range into movablemem_map.
+ * @start:     physical start address of the range
+ * @size:      size of the range
+ *
+ * This function transforms the physical address range into pfns, and then
+ * adds the range into movablemem_map by calling insert_movablemem_map().
+ */
+static void __init movablemem_map_add_region(u64 start, u64 size)
+{
+       unsigned long start_pfn, end_pfn;
+
+       /* In case size == 0 or start + size overflows */
+       if (start + size <= start)
+               return;
+
+       if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) {
+               pr_err("movablemem_map: too many entries;"
+                       " ignoring [mem %#010llx-%#010llx]\n",
+                       (unsigned long long) start,
+                       (unsigned long long) (start + size - 1));
+               return;
+       }
+
+       start_pfn = PFN_DOWN(start);
+       end_pfn = PFN_UP(start + size);
+       insert_movablemem_map(start_pfn, end_pfn);
+}
+
+/*
+ * cmdline_parse_movablemem_map - Parse boot option movablemem_map.
+ * @p: The boot option of the following format:
+ *     movablemem_map=nn[KMG]@ss[KMG] or movablemem_map=acpi
+ *
+ * This option sets the memory range [ss, ss+nn) to be used as movable memory.
+ *
+ * Return: 0 on success or -EINVAL on failure.
+ */
+static int __init cmdline_parse_movablemem_map(char *p)
+{
+       char *oldp;
+       u64 start_at, mem_size;
+
+       if (!p)
+               goto err;
+
+       if (!strcmp(p, "acpi"))
+               movablemem_map.acpi = true;
+
+       /*
+        * If the user decides to use info from BIOS, all the other
+        * user-specified ranges will be ignored.
+        */
+       if (movablemem_map.acpi) {
+               if (movablemem_map.nr_map) {
+                       memset(movablemem_map.map, 0,
+                               sizeof(struct movablemem_entry)
+                               * movablemem_map.nr_map);
+                       movablemem_map.nr_map = 0;
+               }
+               return 0;
+       }
+
+       oldp = p;
+       mem_size = memparse(p, &p);
+       if (p == oldp)
+               goto err;
+
+       if (*p == '@') {
+               oldp = ++p;
+               start_at = memparse(p, &p);
+               if (p == oldp || *p != '\0')
+                       goto err;
+
+               movablemem_map_add_region(start_at, mem_size);
+               return 0;
+       }
+err:
+       return -EINVAL;
+}
+early_param("movablemem_map", cmdline_parse_movablemem_map);
+
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 /**
@@ -5160,8 +5468,8 @@ static void calculate_totalreserve_pages(void)
                        /* we treat the high watermark as reserved pages. */
                        max += high_wmark_pages(zone);
 
-                       if (max > zone->present_pages)
-                               max = zone->present_pages;
+                       if (max > zone->managed_pages)
+                               max = zone->managed_pages;
                        reserve_pages += max;
                        /*
                         * Lowmem reserves are not available to
@@ -5193,7 +5501,7 @@ static void setup_per_zone_lowmem_reserve(void)
        for_each_online_pgdat(pgdat) {
                for (j = 0; j < MAX_NR_ZONES; j++) {
                        struct zone *zone = pgdat->node_zones + j;
-                       unsigned long present_pages = zone->present_pages;
+                       unsigned long managed_pages = zone->managed_pages;
 
                        zone->lowmem_reserve[j] = 0;
 
@@ -5207,9 +5515,9 @@ static void setup_per_zone_lowmem_reserve(void)
                                        sysctl_lowmem_reserve_ratio[idx] = 1;
 
                                lower_zone = pgdat->node_zones + idx;
-                               lower_zone->lowmem_reserve[j] = present_pages /
+                               lower_zone->lowmem_reserve[j] = managed_pages /
                                        sysctl_lowmem_reserve_ratio[idx];
-                               present_pages += lower_zone->present_pages;
+                               managed_pages += lower_zone->managed_pages;
                        }
                }
        }
@@ -5228,14 +5536,14 @@ static void __setup_per_zone_wmarks(void)
        /* Calculate total number of !ZONE_HIGHMEM pages */
        for_each_zone(zone) {
                if (!is_highmem(zone))
-                       lowmem_pages += zone->present_pages;
+                       lowmem_pages += zone->managed_pages;
        }
 
        for_each_zone(zone) {
                u64 tmp;
 
                spin_lock_irqsave(&zone->lock, flags);
-               tmp = (u64)pages_min * zone->present_pages;
+               tmp = (u64)pages_min * zone->managed_pages;
                do_div(tmp, lowmem_pages);
                if (is_highmem(zone)) {
                        /*
@@ -5247,13 +5555,10 @@ static void __setup_per_zone_wmarks(void)
                         * deltas controls asynch page reclaim, and so should
                         * not be capped for highmem.
                         */
-                       int min_pages;
+                       unsigned long min_pages;
 
-                       min_pages = zone->present_pages / 1024;
-                       if (min_pages < SWAP_CLUSTER_MAX)
-                               min_pages = SWAP_CLUSTER_MAX;
-                       if (min_pages > 128)
-                               min_pages = 128;
+                       min_pages = zone->managed_pages / 1024;
+                       min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
                        zone->watermark[WMARK_MIN] = min_pages;
                } else {
                        /*
@@ -5314,7 +5619,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
        unsigned int gb, ratio;
 
        /* Zone size in gigabytes */
-       gb = zone->present_pages >> (30 - PAGE_SHIFT);
+       gb = zone->managed_pages >> (30 - PAGE_SHIFT);
        if (gb)
                ratio = int_sqrt(10 * gb);
        else
@@ -5400,7 +5705,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
                return rc;
 
        for_each_zone(zone)
-               zone->min_unmapped_pages = (zone->present_pages *
+               zone->min_unmapped_pages = (zone->managed_pages *
                                sysctl_min_unmapped_ratio) / 100;
        return 0;
 }
@@ -5416,7 +5721,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
                return rc;
 
        for_each_zone(zone)
-               zone->min_slab_pages = (zone->present_pages *
+               zone->min_slab_pages = (zone->managed_pages *
                                sysctl_min_slab_ratio) / 100;
        return 0;
 }
@@ -5458,7 +5763,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
        for_each_populated_zone(zone) {
                for_each_possible_cpu(cpu) {
                        unsigned long  high;
-                       high = zone->present_pages / percpu_pagelist_fraction;
+                       high = zone->managed_pages / percpu_pagelist_fraction;
                        setup_pagelist_highmark(
                                per_cpu_ptr(zone->pageset, cpu), high);
                }
@@ -5645,8 +5950,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
        pfn = page_to_pfn(page);
        bitmap = get_pageblock_bitmap(zone, pfn);
        bitidx = pfn_to_bitidx(zone, pfn);
-       VM_BUG_ON(pfn < zone->zone_start_pfn);
-       VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
+       VM_BUG_ON(!zone_spans_pfn(zone, pfn));
 
        for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
                if (flags & value)
@@ -5744,8 +6048,7 @@ bool is_pageblock_removable_nolock(struct page *page)
 
        zone = page_zone(page);
        pfn = page_to_pfn(page);
-       if (zone->zone_start_pfn > pfn ||
-                       zone->zone_start_pfn + zone->spanned_pages <= pfn)
+       if (!zone_spans_pfn(zone, pfn))
                return false;
 
        return !has_unmovable_pages(zone, page, 0, true);
@@ -5801,14 +6104,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
                                                        &cc->migratepages);
                cc->nr_migratepages -= nr_reclaimed;
 
-               ret = migrate_pages(&cc->migratepages,
-                                   alloc_migrate_target,
-                                   0, false, MIGRATE_SYNC,
-                                   MR_CMA);
+               ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
+                                   0, MIGRATE_SYNC, MR_CMA);
        }
-
-       putback_movable_pages(&cc->migratepages);
-       return ret > 0 ? 0 : ret;
+       if (ret < 0) {
+               putback_movable_pages(&cc->migratepages);
+               return ret;
+       }
+       return 0;
 }
 
 /**
index 3d38edf..807c96b 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -105,7 +105,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
         */
        if (rwsem_is_locked(&anon_vma->root->rwsem)) {
                anon_vma_lock_write(anon_vma);
-               anon_vma_unlock(anon_vma);
+               anon_vma_unlock_write(anon_vma);
        }
 
        kmem_cache_free(anon_vma_cachep, anon_vma);
@@ -191,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
                        avc = NULL;
                }
                spin_unlock(&mm->page_table_lock);
-               anon_vma_unlock(anon_vma);
+               anon_vma_unlock_write(anon_vma);
 
                if (unlikely(allocated))
                        put_anon_vma(allocated);
@@ -308,7 +308,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
        vma->anon_vma = anon_vma;
        anon_vma_lock_write(anon_vma);
        anon_vma_chain_link(vma, avc, anon_vma);
-       anon_vma_unlock(anon_vma);
+       anon_vma_unlock_write(anon_vma);
 
        return 0;
 
index 5dd56f6..1ad7924 100644 (file)
@@ -335,19 +335,19 @@ static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
                                        pgoff_t start, unsigned int nr_pages,
                                        struct page **pages, pgoff_t *indices)
 {
-       unsigned int i;
-       unsigned int ret;
-       unsigned int nr_found;
+       void **slot;
+       unsigned int ret = 0;
+       struct radix_tree_iter iter;
+
+       if (!nr_pages)
+               return 0;
 
        rcu_read_lock();
 restart:
-       nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-                               (void ***)pages, indices, start, nr_pages);
-       ret = 0;
-       for (i = 0; i < nr_found; i++) {
+       radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
                struct page *page;
 repeat:
-               page = radix_tree_deref_slot((void **)pages[i]);
+               page = radix_tree_deref_slot(slot);
                if (unlikely(!page))
                        continue;
                if (radix_tree_exception(page)) {
@@ -364,17 +364,16 @@ repeat:
                        goto repeat;
 
                /* Has the page moved? */
-               if (unlikely(page != *((void **)pages[i]))) {
+               if (unlikely(page != *slot)) {
                        page_cache_release(page);
                        goto repeat;
                }
 export:
-               indices[ret] = indices[i];
+               indices[ret] = iter.index;
                pages[ret] = page;
-               ret++;
+               if (++ret == nr_pages)
+                       break;
        }
-       if (unlikely(!ret && nr_found))
-               goto restart;
        rcu_read_unlock();
        return ret;
 }
@@ -2386,6 +2385,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
                               bool remount)
 {
        char *this_char, *value, *rest;
+       struct mempolicy *mpol = NULL;
        uid_t uid;
        gid_t gid;
 
@@ -2414,7 +2414,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
                        printk(KERN_ERR
                            "tmpfs: No value for mount option '%s'\n",
                            this_char);
-                       return 1;
+                       goto error;
                }
 
                if (!strcmp(this_char,"size")) {
@@ -2463,19 +2463,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
                        if (!gid_valid(sbinfo->gid))
                                goto bad_val;
                } else if (!strcmp(this_char,"mpol")) {
-                       if (mpol_parse_str(value, &sbinfo->mpol))
+                       mpol_put(mpol);
+                       mpol = NULL;
+                       if (mpol_parse_str(value, &mpol))
                                goto bad_val;
                } else {
                        printk(KERN_ERR "tmpfs: Bad mount option %s\n",
                               this_char);
-                       return 1;
+                       goto error;
                }
        }
+       sbinfo->mpol = mpol;
        return 0;
 
 bad_val:
        printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
               value, this_char);
+error:
+       mpol_put(mpol);
        return 1;
 
 }
@@ -2487,6 +2492,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
        unsigned long inodes;
        int error = -EINVAL;
 
+       config.mpol = NULL;
        if (shmem_parse_options(data, &config, true))
                return error;
 
@@ -2511,8 +2517,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
        sbinfo->max_inodes  = config.max_inodes;
        sbinfo->free_inodes = config.max_inodes - inodes;
 
-       mpol_put(sbinfo->mpol);
-       sbinfo->mpol        = config.mpol;      /* transfers initial ref */
+       /*
+        * Preserve previous mempolicy unless mpol remount option was specified.
+        */
+       if (config.mpol) {
+               mpol_put(sbinfo->mpol);
+               sbinfo->mpol = config.mpol;     /* transfers initial ref */
+       }
 out:
        spin_unlock(&sbinfo->stat_lock);
        return error;
@@ -2545,6 +2556,7 @@ static void shmem_put_super(struct super_block *sb)
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 
        percpu_counter_destroy(&sbinfo->used_blocks);
+       mpol_put(sbinfo->mpol);
        kfree(sbinfo);
        sb->s_fs_info = NULL;
 }
index a99fdf7..eeed4a0 100644 (file)
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -360,7 +360,7 @@ static void slob_free(void *block, int size)
                        clear_slob_page_free(sp);
                spin_unlock_irqrestore(&slob_lock, flags);
                __ClearPageSlab(sp);
-               reset_page_mapcount(sp);
+               page_mapcount_reset(sp);
                slob_free_pages(b, 0);
                return;
        }
index ba2ca53..ebcc44e 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1408,7 +1408,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
        __ClearPageSlab(page);
 
        memcg_release_pages(s, order);
-       reset_page_mapcount(page);
+       page_mapcount_reset(page);
        if (current->reclaim_state)
                current->reclaim_state->reclaimed_slab += pages;
        __free_memcg_kmem_pages(page, order);
index 6b5fb76..7ca6dc8 100644 (file)
@@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
 }
 static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
 {
-       return; /* XXX: Not implemented yet */
+       vmemmap_free(memmap, nr_pages);
 }
 static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
 {
+       vmemmap_free(memmap, nr_pages);
 }
 #else
 static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
@@ -697,7 +698,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
        /*
         * Check to see if allocation came from hot-plug-add
         */
-       if (PageSlab(usemap_page)) {
+       if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
                kfree(usemap);
                if (memmap)
                        __kfree_section_memmap(memmap, PAGES_PER_SECTION);
@@ -782,7 +783,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 
        for (i = 0; i < PAGES_PER_SECTION; i++) {
                if (PageHWPoison(&memmap[i])) {
-                       atomic_long_sub(1, &mce_bad_pages);
+                       atomic_long_sub(1, &num_poisoned_pages);
                        ClearPageHWPoison(&memmap[i]);
                }
        }
@@ -796,8 +797,10 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
 {
        struct page *memmap = NULL;
-       unsigned long *usemap = NULL;
+       unsigned long *usemap = NULL, flags;
+       struct pglist_data *pgdat = zone->zone_pgdat;
 
+       pgdat_resize_lock(pgdat, &flags);
        if (ms->section_mem_map) {
                usemap = ms->pageblock_flags;
                memmap = sparse_decode_mem_map(ms->section_mem_map,
@@ -805,6 +808,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
                ms->section_mem_map = 0;
                ms->pageblock_flags = NULL;
        }
+       pgdat_resize_unlock(pgdat, &flags);
 
        clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
        free_section_usemap(memmap, usemap);
index 6310dc2..8a529a0 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
 void __init swap_setup(void)
 {
        unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
-
 #ifdef CONFIG_SWAP
-       bdi_init(swapper_space.backing_dev_info);
+       int i;
+
+       bdi_init(swapper_spaces[0].backing_dev_info);
+       for (i = 0; i < MAX_SWAPFILES; i++) {
+               spin_lock_init(&swapper_spaces[i].tree_lock);
+               INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
+       }
 #endif
 
        /* Use a smaller cluster for small-memory machines */
index 0cb36fb..7efcf15 100644 (file)
@@ -36,12 +36,12 @@ static struct backing_dev_info swap_backing_dev_info = {
        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
 };
 
-struct address_space swapper_space = {
-       .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
-       .tree_lock      = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
-       .a_ops          = &swap_aops,
-       .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
-       .backing_dev_info = &swap_backing_dev_info,
+struct address_space swapper_spaces[MAX_SWAPFILES] = {
+       [0 ... MAX_SWAPFILES - 1] = {
+               .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
+               .a_ops          = &swap_aops,
+               .backing_dev_info = &swap_backing_dev_info,
+       }
 };
 
 #define INC_CACHE_INFO(x)      do { swap_cache_info.x++; } while (0)
@@ -53,13 +53,24 @@ static struct {
        unsigned long find_total;
 } swap_cache_info;
 
+unsigned long total_swapcache_pages(void)
+{
+       int i;
+       unsigned long ret = 0;
+
+       for (i = 0; i < MAX_SWAPFILES; i++)
+               ret += swapper_spaces[i].nrpages;
+       return ret;
+}
+
 void show_swap_cache_info(void)
 {
-       printk("%lu pages in swap cache\n", total_swapcache_pages);
+       printk("%lu pages in swap cache\n", total_swapcache_pages());
        printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
                swap_cache_info.add_total, swap_cache_info.del_total,
                swap_cache_info.find_success, swap_cache_info.find_total);
-       printk("Free swap  = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+       printk("Free swap  = %ldkB\n",
+               get_nr_swap_pages() << (PAGE_SHIFT - 10));
        printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
 }
 
@@ -70,6 +81,7 @@ void show_swap_cache_info(void)
 static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
 {
        int error;
+       struct address_space *address_space;
 
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(PageSwapCache(page));
@@ -79,14 +91,16 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
        SetPageSwapCache(page);
        set_page_private(page, entry.val);
 
-       spin_lock_irq(&swapper_space.tree_lock);
-       error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
+       address_space = swap_address_space(entry);
+       spin_lock_irq(&address_space->tree_lock);
+       error = radix_tree_insert(&address_space->page_tree,
+                                       entry.val, page);
        if (likely(!error)) {
-               total_swapcache_pages++;
+               address_space->nrpages++;
                __inc_zone_page_state(page, NR_FILE_PAGES);
                INC_CACHE_INFO(add_total);
        }
-       spin_unlock_irq(&swapper_space.tree_lock);
+       spin_unlock_irq(&address_space->tree_lock);
 
        if (unlikely(error)) {
                /*
@@ -122,14 +136,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
  */
 void __delete_from_swap_cache(struct page *page)
 {
+       swp_entry_t entry;
+       struct address_space *address_space;
+
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(!PageSwapCache(page));
        VM_BUG_ON(PageWriteback(page));
 
-       radix_tree_delete(&swapper_space.page_tree, page_private(page));
+       entry.val = page_private(page);
+       address_space = swap_address_space(entry);
+       radix_tree_delete(&address_space->page_tree, page_private(page));
        set_page_private(page, 0);
        ClearPageSwapCache(page);
-       total_swapcache_pages--;
+       address_space->nrpages--;
        __dec_zone_page_state(page, NR_FILE_PAGES);
        INC_CACHE_INFO(del_total);
 }
@@ -195,12 +214,14 @@ int add_to_swap(struct page *page)
 void delete_from_swap_cache(struct page *page)
 {
        swp_entry_t entry;
+       struct address_space *address_space;
 
        entry.val = page_private(page);
 
-       spin_lock_irq(&swapper_space.tree_lock);
+       address_space = swap_address_space(entry);
+       spin_lock_irq(&address_space->tree_lock);
        __delete_from_swap_cache(page);
-       spin_unlock_irq(&swapper_space.tree_lock);
+       spin_unlock_irq(&address_space->tree_lock);
 
        swapcache_free(entry, page);
        page_cache_release(page);
@@ -263,7 +284,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
 {
        struct page *page;
 
-       page = find_get_page(&swapper_space, entry.val);
+       page = find_get_page(swap_address_space(entry), entry.val);
 
        if (page)
                INC_CACHE_INFO(find_success);
@@ -290,7 +311,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                 * called after lookup_swap_cache() failed, re-calling
                 * that would confuse statistics.
                 */
-               found_page = find_get_page(&swapper_space, entry.val);
+               found_page = find_get_page(swap_address_space(entry),
+                                       entry.val);
                if (found_page)
                        break;
 
index e97a0e5..c72c648 100644 (file)
@@ -47,9 +47,11 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**);
 
 DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
-long nr_swap_pages;
+atomic_long_t nr_swap_pages;
+/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
 long total_swap_pages;
 static int least_priority;
+static atomic_t highest_priority_index = ATOMIC_INIT(-1);
 
 static const char Bad_file[] = "Bad swap file entry ";
 static const char Unused_file[] = "Unused swap file entry ";
@@ -79,7 +81,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
        struct page *page;
        int ret = 0;
 
-       page = find_get_page(&swapper_space, entry.val);
+       page = find_get_page(swap_address_space(entry), entry.val);
        if (!page)
                return 0;
        /*
@@ -223,7 +225,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
                        si->lowest_alloc = si->max;
                        si->highest_alloc = 0;
                }
-               spin_unlock(&swap_lock);
+               spin_unlock(&si->lock);
 
                /*
                 * If seek is expensive, start searching for new cluster from
@@ -242,7 +244,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
                        if (si->swap_map[offset])
                                last_in_cluster = offset + SWAPFILE_CLUSTER;
                        else if (offset == last_in_cluster) {
-                               spin_lock(&swap_lock);
+                               spin_lock(&si->lock);
                                offset -= SWAPFILE_CLUSTER - 1;
                                si->cluster_next = offset;
                                si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -263,7 +265,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
                        if (si->swap_map[offset])
                                last_in_cluster = offset + SWAPFILE_CLUSTER;
                        else if (offset == last_in_cluster) {
-                               spin_lock(&swap_lock);
+                               spin_lock(&si->lock);
                                offset -= SWAPFILE_CLUSTER - 1;
                                si->cluster_next = offset;
                                si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -277,7 +279,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
                }
 
                offset = scan_base;
-               spin_lock(&swap_lock);
+               spin_lock(&si->lock);
                si->cluster_nr = SWAPFILE_CLUSTER - 1;
                si->lowest_alloc = 0;
        }
@@ -293,9 +295,9 @@ checks:
        /* reuse swap entry of cache-only swap if not busy. */
        if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
                int swap_was_freed;
-               spin_unlock(&swap_lock);
+               spin_unlock(&si->lock);
                swap_was_freed = __try_to_reclaim_swap(si, offset);
-               spin_lock(&swap_lock);
+               spin_lock(&si->lock);
                /* entry was freed successfully, try to use this again */
                if (swap_was_freed)
                        goto checks;
@@ -335,13 +337,13 @@ checks:
                            si->lowest_alloc <= last_in_cluster)
                                last_in_cluster = si->lowest_alloc - 1;
                        si->flags |= SWP_DISCARDING;
-                       spin_unlock(&swap_lock);
+                       spin_unlock(&si->lock);
 
                        if (offset < last_in_cluster)
                                discard_swap_cluster(si, offset,
                                        last_in_cluster - offset + 1);
 
-                       spin_lock(&swap_lock);
+                       spin_lock(&si->lock);
                        si->lowest_alloc = 0;
                        si->flags &= ~SWP_DISCARDING;
 
@@ -355,10 +357,10 @@ checks:
                         * could defer that delay until swap_writepage,
                         * but it's easier to keep this self-contained.
                         */
-                       spin_unlock(&swap_lock);
+                       spin_unlock(&si->lock);
                        wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
                                wait_for_discard, TASK_UNINTERRUPTIBLE);
-                       spin_lock(&swap_lock);
+                       spin_lock(&si->lock);
                } else {
                        /*
                         * Note pages allocated by racing tasks while
@@ -374,14 +376,14 @@ checks:
        return offset;
 
 scan:
-       spin_unlock(&swap_lock);
+       spin_unlock(&si->lock);
        while (++offset <= si->highest_bit) {
                if (!si->swap_map[offset]) {
-                       spin_lock(&swap_lock);
+                       spin_lock(&si->lock);
                        goto checks;
                }
                if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
-                       spin_lock(&swap_lock);
+                       spin_lock(&si->lock);
                        goto checks;
                }
                if (unlikely(--latency_ration < 0)) {
@@ -392,11 +394,11 @@ scan:
        offset = si->lowest_bit;
        while (++offset < scan_base) {
                if (!si->swap_map[offset]) {
-                       spin_lock(&swap_lock);
+                       spin_lock(&si->lock);
                        goto checks;
                }
                if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
-                       spin_lock(&swap_lock);
+                       spin_lock(&si->lock);
                        goto checks;
                }
                if (unlikely(--latency_ration < 0)) {
@@ -404,7 +406,7 @@ scan:
                        latency_ration = LATENCY_LIMIT;
                }
        }
-       spin_lock(&swap_lock);
+       spin_lock(&si->lock);
 
 no_page:
        si->flags -= SWP_SCANNING;
@@ -417,13 +419,34 @@ swp_entry_t get_swap_page(void)
        pgoff_t offset;
        int type, next;
        int wrapped = 0;
+       int hp_index;
 
        spin_lock(&swap_lock);
-       if (nr_swap_pages <= 0)
+       if (atomic_long_read(&nr_swap_pages) <= 0)
                goto noswap;
-       nr_swap_pages--;
+       atomic_long_dec(&nr_swap_pages);
 
        for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
+               hp_index = atomic_xchg(&highest_priority_index, -1);
+               /*
+                * highest_priority_index records current highest priority swap
+                * type which just frees swap entries. If its priority is
+                * higher than that of swap_list.next swap type, we use it.  It
+                * isn't protected by swap_lock, so it can be an invalid value
+                * if the corresponding swap type is swapoff. We double check
+                * the flags here. It's even possible the swap type is swapoff
+                * and swapon again and its priority is changed. In such rare
+                * case, low prority swap type might be used, but eventually
+                * high priority swap will be used after several rounds of
+                * swap.
+                */
+               if (hp_index != -1 && hp_index != type &&
+                   swap_info[type]->prio < swap_info[hp_index]->prio &&
+                   (swap_info[hp_index]->flags & SWP_WRITEOK)) {
+                       type = hp_index;
+                       swap_list.next = type;
+               }
+
                si = swap_info[type];
                next = si->next;
                if (next < 0 ||
@@ -432,22 +455,29 @@ swp_entry_t get_swap_page(void)
                        wrapped++;
                }
 
-               if (!si->highest_bit)
+               spin_lock(&si->lock);
+               if (!si->highest_bit) {
+                       spin_unlock(&si->lock);
                        continue;
-               if (!(si->flags & SWP_WRITEOK))
+               }
+               if (!(si->flags & SWP_WRITEOK)) {
+                       spin_unlock(&si->lock);
                        continue;
+               }
 
                swap_list.next = next;
+
+               spin_unlock(&swap_lock);
                /* This is called for allocating swap entry for cache */
                offset = scan_swap_map(si, SWAP_HAS_CACHE);
-               if (offset) {
-                       spin_unlock(&swap_lock);
+               spin_unlock(&si->lock);
+               if (offset)
                        return swp_entry(type, offset);
-               }
+               spin_lock(&swap_lock);
                next = swap_list.next;
        }
 
-       nr_swap_pages++;
+       atomic_long_inc(&nr_swap_pages);
 noswap:
        spin_unlock(&swap_lock);
        return (swp_entry_t) {0};
@@ -459,19 +489,19 @@ swp_entry_t get_swap_page_of_type(int type)
        struct swap_info_struct *si;
        pgoff_t offset;
 
-       spin_lock(&swap_lock);
        si = swap_info[type];
+       spin_lock(&si->lock);
        if (si && (si->flags & SWP_WRITEOK)) {
-               nr_swap_pages--;
+               atomic_long_dec(&nr_swap_pages);
                /* This is called for allocating swap entry, not cache */
                offset = scan_swap_map(si, 1);
                if (offset) {
-                       spin_unlock(&swap_lock);
+                       spin_unlock(&si->lock);
                        return swp_entry(type, offset);
                }
-               nr_swap_pages++;
+               atomic_long_inc(&nr_swap_pages);
        }
-       spin_unlock(&swap_lock);
+       spin_unlock(&si->lock);
        return (swp_entry_t) {0};
 }
 
@@ -493,7 +523,7 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
                goto bad_offset;
        if (!p->swap_map[offset])
                goto bad_free;
-       spin_lock(&swap_lock);
+       spin_lock(&p->lock);
        return p;
 
 bad_free:
@@ -511,6 +541,27 @@ out:
        return NULL;
 }
 
+/*
+ * This swap type frees swap entry, check if it is the highest priority swap
+ * type which just frees swap entry. get_swap_page() uses
+ * highest_priority_index to search highest priority swap type. The
+ * swap_info_struct.lock can't protect us if there are multiple swap types
+ * active, so we use atomic_cmpxchg.
+ */
+static void set_highest_priority_index(int type)
+{
+       int old_hp_index, new_hp_index;
+
+       do {
+               old_hp_index = atomic_read(&highest_priority_index);
+               if (old_hp_index != -1 &&
+                       swap_info[old_hp_index]->prio >= swap_info[type]->prio)
+                       break;
+               new_hp_index = type;
+       } while (atomic_cmpxchg(&highest_priority_index,
+               old_hp_index, new_hp_index) != old_hp_index);
+}
+
 static unsigned char swap_entry_free(struct swap_info_struct *p,
                                     swp_entry_t entry, unsigned char usage)
 {
@@ -553,10 +604,8 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
                        p->lowest_bit = offset;
                if (offset > p->highest_bit)
                        p->highest_bit = offset;
-               if (swap_list.next >= 0 &&
-                   p->prio > swap_info[swap_list.next]->prio)
-                       swap_list.next = p->type;
-               nr_swap_pages++;
+               set_highest_priority_index(p->type);
+               atomic_long_inc(&nr_swap_pages);
                p->inuse_pages--;
                frontswap_invalidate_page(p->type, offset);
                if (p->flags & SWP_BLKDEV) {
@@ -581,7 +630,7 @@ void swap_free(swp_entry_t entry)
        p = swap_info_get(entry);
        if (p) {
                swap_entry_free(p, entry, 1);
-               spin_unlock(&swap_lock);
+               spin_unlock(&p->lock);
        }
 }
 
@@ -598,7 +647,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
                count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
                if (page)
                        mem_cgroup_uncharge_swapcache(page, entry, count != 0);
-               spin_unlock(&swap_lock);
+               spin_unlock(&p->lock);
        }
 }
 
@@ -617,7 +666,7 @@ int page_swapcount(struct page *page)
        p = swap_info_get(entry);
        if (p) {
                count = swap_count(p->swap_map[swp_offset(entry)]);
-               spin_unlock(&swap_lock);
+               spin_unlock(&p->lock);
        }
        return count;
 }
@@ -699,13 +748,14 @@ int free_swap_and_cache(swp_entry_t entry)
        p = swap_info_get(entry);
        if (p) {
                if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
-                       page = find_get_page(&swapper_space, entry.val);
+                       page = find_get_page(swap_address_space(entry),
+                                               entry.val);
                        if (page && !trylock_page(page)) {
                                page_cache_release(page);
                                page = NULL;
                        }
                }
-               spin_unlock(&swap_lock);
+               spin_unlock(&p->lock);
        }
        if (page) {
                /*
@@ -803,11 +853,13 @@ unsigned int count_swap_pages(int type, int free)
        if ((unsigned int)type < nr_swapfiles) {
                struct swap_info_struct *sis = swap_info[type];
 
+               spin_lock(&sis->lock);
                if (sis->flags & SWP_WRITEOK) {
                        n = sis->pages;
                        if (free)
                                n -= sis->inuse_pages;
                }
+               spin_unlock(&sis->lock);
        }
        spin_unlock(&swap_lock);
        return n;
@@ -822,11 +874,17 @@ unsigned int count_swap_pages(int type, int free)
 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, swp_entry_t entry, struct page *page)
 {
+       struct page *swapcache;
        struct mem_cgroup *memcg;
        spinlock_t *ptl;
        pte_t *pte;
        int ret = 1;
 
+       swapcache = page;
+       page = ksm_might_need_to_copy(page, vma, addr);
+       if (unlikely(!page))
+               return -ENOMEM;
+
        if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
                                         GFP_KERNEL, &memcg)) {
                ret = -ENOMEM;
@@ -845,7 +903,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        get_page(page);
        set_pte_at(vma->vm_mm, addr, pte,
                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
-       page_add_anon_rmap(page, vma, addr);
+       if (page == swapcache)
+               page_add_anon_rmap(page, vma, addr);
+       else /* ksm created a completely new copy */
+               page_add_new_anon_rmap(page, vma, addr);
        mem_cgroup_commit_charge_swapin(page, memcg);
        swap_free(entry);
        /*
@@ -856,6 +917,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 out:
        pte_unmap_unlock(pte, ptl);
 out_nolock:
+       if (page != swapcache) {
+               unlock_page(page);
+               put_page(page);
+       }
        return ret;
 }
 
@@ -1456,7 +1521,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
        p->swap_map = swap_map;
        frontswap_map_set(p, frontswap_map);
        p->flags |= SWP_WRITEOK;
-       nr_swap_pages += p->pages;
+       atomic_long_add(p->pages, &nr_swap_pages);
        total_swap_pages += p->pages;
 
        /* insert swap space into swap_list: */
@@ -1478,15 +1543,19 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
                                unsigned long *frontswap_map)
 {
        spin_lock(&swap_lock);
+       spin_lock(&p->lock);
        _enable_swap_info(p, prio, swap_map, frontswap_map);
        frontswap_init(p->type);
+       spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
 }
 
 static void reinsert_swap_info(struct swap_info_struct *p)
 {
        spin_lock(&swap_lock);
+       spin_lock(&p->lock);
        _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
+       spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
 }
 
@@ -1546,14 +1615,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                /* just pick something that's safe... */
                swap_list.next = swap_list.head;
        }
+       spin_lock(&p->lock);
        if (p->prio < 0) {
                for (i = p->next; i >= 0; i = swap_info[i]->next)
                        swap_info[i]->prio = p->prio--;
                least_priority++;
        }
-       nr_swap_pages -= p->pages;
+       atomic_long_sub(p->pages, &nr_swap_pages);
        total_swap_pages -= p->pages;
        p->flags &= ~SWP_WRITEOK;
+       spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
 
        set_current_oom_origin();
@@ -1572,14 +1643,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
        mutex_lock(&swapon_mutex);
        spin_lock(&swap_lock);
+       spin_lock(&p->lock);
        drain_mmlist();
 
        /* wait for anyone still in scan_swap_map */
        p->highest_bit = 0;             /* cuts scans short */
        while (p->flags >= SWP_SCANNING) {
+               spin_unlock(&p->lock);
                spin_unlock(&swap_lock);
                schedule_timeout_uninterruptible(1);
                spin_lock(&swap_lock);
+               spin_lock(&p->lock);
        }
 
        swap_file = p->swap_file;
@@ -1589,6 +1663,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        p->swap_map = NULL;
        p->flags = 0;
        frontswap_invalidate_area(type);
+       spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
        mutex_unlock(&swapon_mutex);
        vfree(swap_map);
@@ -1794,6 +1869,7 @@ static struct swap_info_struct *alloc_swap_info(void)
        p->flags = SWP_USED;
        p->next = -1;
        spin_unlock(&swap_lock);
+       spin_lock_init(&p->lock);
 
        return p;
 }
@@ -2116,7 +2192,7 @@ void si_swapinfo(struct sysinfo *val)
                if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
                        nr_to_be_unused += si->inuse_pages;
        }
-       val->freeswap = nr_swap_pages + nr_to_be_unused;
+       val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
        val->totalswap = total_swap_pages + nr_to_be_unused;
        spin_unlock(&swap_lock);
 }
@@ -2149,7 +2225,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
        p = swap_info[type];
        offset = swp_offset(entry);
 
-       spin_lock(&swap_lock);
+       spin_lock(&p->lock);
        if (unlikely(offset >= p->max))
                goto unlock_out;
 
@@ -2184,7 +2260,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
        p->swap_map[offset] = count | has_cache;
 
 unlock_out:
-       spin_unlock(&swap_lock);
+       spin_unlock(&p->lock);
 out:
        return err;
 
@@ -2309,7 +2385,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
        }
 
        if (!page) {
-               spin_unlock(&swap_lock);
+               spin_unlock(&si->lock);
                return -ENOMEM;
        }
 
@@ -2357,7 +2433,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
        list_add_tail(&page->lru, &head->lru);
        page = NULL;                    /* now it's attached, don't free it */
 out:
-       spin_unlock(&swap_lock);
+       spin_unlock(&si->lock);
 outer:
        if (page)
                __free_page(page);
index c55e26b..ab1424d 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -5,6 +5,8 @@
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/security.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -355,12 +357,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 {
        unsigned long ret;
        struct mm_struct *mm = current->mm;
+       unsigned long populate;
 
        ret = security_mmap_file(file, prot, flag);
        if (!ret) {
                down_write(&mm->mmap_sem);
-               ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
+               ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
+                                   &populate);
                up_write(&mm->mmap_sem);
+               if (populate)
+                       mm_populate(ret, populate);
        }
        return ret;
 }
@@ -378,6 +384,24 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
 }
 EXPORT_SYMBOL(vm_mmap);
 
+struct address_space *page_mapping(struct page *page)
+{
+       struct address_space *mapping = page->mapping;
+
+       VM_BUG_ON(PageSlab(page));
+#ifdef CONFIG_SWAP
+       if (unlikely(PageSwapCache(page))) {
+               swp_entry_t entry;
+
+               entry.val = page_private(page);
+               mapping = swap_address_space(entry);
+       } else
+#endif
+       if ((unsigned long)mapping & PAGE_MAPPING_ANON)
+               mapping = NULL;
+       return mapping;
+}
+
 /* Tracepoints definitions. */
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
index 5123a16..0f751f2 100644 (file)
@@ -1376,8 +1376,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
                                unsigned long start, unsigned long end)
 {
-       return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
-                                               __builtin_return_address(0));
+       return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
+                                 GFP_KERNEL, __builtin_return_address(0));
 }
 EXPORT_SYMBOL_GPL(__get_vm_area);
 
@@ -1385,8 +1385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
                                       unsigned long start, unsigned long end,
                                       const void *caller)
 {
-       return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
-                                 caller);
+       return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
+                                 GFP_KERNEL, caller);
 }
 
 /**
@@ -1401,14 +1401,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
 {
        return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
-                               -1, GFP_KERNEL, __builtin_return_address(0));
+                                 NUMA_NO_NODE, GFP_KERNEL,
+                                 __builtin_return_address(0));
 }
 
 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
                                const void *caller)
 {
        return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
-                                               -1, GFP_KERNEL, caller);
+                                 NUMA_NO_NODE, GFP_KERNEL, caller);
 }
 
 /**
@@ -1650,7 +1651,7 @@ fail:
  *     @end:           vm area range end
  *     @gfp_mask:      flags for the page level allocator
  *     @prot:          protection mask for the allocated pages
- *     @node:          node to use for allocation or -1
+ *     @node:          node to use for allocation or NUMA_NO_NODE
  *     @caller:        caller's return address
  *
  *     Allocate enough pages to cover @size from the page level
@@ -1706,7 +1707,7 @@ fail:
  *     @align:         desired alignment
  *     @gfp_mask:      flags for the page level allocator
  *     @prot:          protection mask for the allocated pages
- *     @node:          node to use for allocation or -1
+ *     @node:          node to use for allocation or NUMA_NO_NODE
  *     @caller:        caller's return address
  *
  *     Allocate enough pages to cover @size from the page level
@@ -1723,7 +1724,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
 
 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 {
-       return __vmalloc_node(size, 1, gfp_mask, prot, -1,
+       return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
                                __builtin_return_address(0));
 }
 EXPORT_SYMBOL(__vmalloc);
@@ -1746,7 +1747,8 @@ static inline void *__vmalloc_node_flags(unsigned long size,
  */
 void *vmalloc(unsigned long size)
 {
-       return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
+       return __vmalloc_node_flags(size, NUMA_NO_NODE,
+                                   GFP_KERNEL | __GFP_HIGHMEM);
 }
 EXPORT_SYMBOL(vmalloc);
 
@@ -1762,7 +1764,7 @@ EXPORT_SYMBOL(vmalloc);
  */
 void *vzalloc(unsigned long size)
 {
-       return __vmalloc_node_flags(size, -1,
+       return __vmalloc_node_flags(size, NUMA_NO_NODE,
                                GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
 }
 EXPORT_SYMBOL(vzalloc);
@@ -1781,7 +1783,8 @@ void *vmalloc_user(unsigned long size)
 
        ret = __vmalloc_node(size, SHMLBA,
                             GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
-                            PAGE_KERNEL, -1, __builtin_return_address(0));
+                            PAGE_KERNEL, NUMA_NO_NODE,
+                            __builtin_return_address(0));
        if (ret) {
                area = find_vm_area(ret);
                area->flags |= VM_USERMAP;
@@ -1846,7 +1849,7 @@ EXPORT_SYMBOL(vzalloc_node);
 void *vmalloc_exec(unsigned long size)
 {
        return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
-                             -1, __builtin_return_address(0));
+                             NUMA_NO_NODE, __builtin_return_address(0));
 }
 
 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
@@ -1867,7 +1870,7 @@ void *vmalloc_exec(unsigned long size)
 void *vmalloc_32(unsigned long size)
 {
        return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
-                             -1, __builtin_return_address(0));
+                             NUMA_NO_NODE, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(vmalloc_32);
 
@@ -1884,7 +1887,7 @@ void *vmalloc_32_user(unsigned long size)
        void *ret;
 
        ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
-                            -1, __builtin_return_address(0));
+                            NUMA_NO_NODE, __builtin_return_address(0));
        if (ret) {
                area = find_vm_area(ret);
                area->flags |= VM_USERMAP;
index 196709f..88c5fed 100644 (file)
@@ -128,7 +128,7 @@ struct scan_control {
  * From 0 .. 100.  Higher means more swappy.
  */
 int vm_swappiness = 60;
-long vm_total_pages;   /* The total number of pages which the VM controls */
+unsigned long vm_total_pages;  /* The total number of pages which the VM controls */
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
@@ -1579,16 +1579,6 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
 }
 #endif
 
-static int inactive_file_is_low_global(struct zone *zone)
-{
-       unsigned long active, inactive;
-
-       active = zone_page_state(zone, NR_ACTIVE_FILE);
-       inactive = zone_page_state(zone, NR_INACTIVE_FILE);
-
-       return (active > inactive);
-}
-
 /**
  * inactive_file_is_low - check if file pages need to be deactivated
  * @lruvec: LRU vector to check
@@ -1605,10 +1595,13 @@ static int inactive_file_is_low_global(struct zone *zone)
  */
 static int inactive_file_is_low(struct lruvec *lruvec)
 {
-       if (!mem_cgroup_disabled())
-               return mem_cgroup_inactive_file_is_low(lruvec);
+       unsigned long inactive;
+       unsigned long active;
+
+       inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
+       active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
 
-       return inactive_file_is_low_global(lruvec_zone(lruvec));
+       return active > inactive;
 }
 
 static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
@@ -1638,6 +1631,13 @@ static int vmscan_swappiness(struct scan_control *sc)
        return mem_cgroup_swappiness(sc->target_mem_cgroup);
 }
 
+enum scan_balance {
+       SCAN_EQUAL,
+       SCAN_FRACT,
+       SCAN_ANON,
+       SCAN_FILE,
+};
+
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned.  The relative value of each set of LRU lists is determined
@@ -1650,15 +1650,16 @@ static int vmscan_swappiness(struct scan_control *sc)
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                           unsigned long *nr)
 {
-       unsigned long anon, file, free;
+       struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+       u64 fraction[2];
+       u64 denominator = 0;    /* gcc */
+       struct zone *zone = lruvec_zone(lruvec);
        unsigned long anon_prio, file_prio;
+       enum scan_balance scan_balance;
+       unsigned long anon, file, free;
+       bool force_scan = false;
        unsigned long ap, fp;
-       struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
-       u64 fraction[2], denominator;
        enum lru_list lru;
-       int noswap = 0;
-       bool force_scan = false;
-       struct zone *zone = lruvec_zone(lruvec);
 
        /*
         * If the zone or memcg is small, nr[l] can be 0.  This
@@ -1676,11 +1677,30 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                force_scan = true;
 
        /* If we have no swap space, do not bother scanning anon pages. */
-       if (!sc->may_swap || (nr_swap_pages <= 0)) {
-               noswap = 1;
-               fraction[0] = 0;
-               fraction[1] = 1;
-               denominator = 1;
+       if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
+               scan_balance = SCAN_FILE;
+               goto out;
+       }
+
+       /*
+        * Global reclaim will swap to prevent OOM even with no
+        * swappiness, but memcg users want to use this knob to
+        * disable swapping for individual groups completely when
+        * using the memory controller's swap limit feature would be
+        * too expensive.
+        */
+       if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
+               scan_balance = SCAN_FILE;
+               goto out;
+       }
+
+       /*
+        * Do not apply any pressure balancing cleverness when the
+        * system is close to OOM, scan both anon and file equally
+        * (unless the swappiness setting disagrees with swapping).
+        */
+       if (!sc->priority && vmscan_swappiness(sc)) {
+               scan_balance = SCAN_EQUAL;
                goto out;
        }
 
@@ -1689,30 +1709,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
        file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
                get_lru_size(lruvec, LRU_INACTIVE_FILE);
 
+       /*
+        * If it's foreseeable that reclaiming the file cache won't be
+        * enough to get the zone back into a desirable shape, we have
+        * to swap.  Better start now and leave the - probably heavily
+        * thrashing - remaining file pages alone.
+        */
        if (global_reclaim(sc)) {
-               free  = zone_page_state(zone, NR_FREE_PAGES);
+               free = zone_page_state(zone, NR_FREE_PAGES);
                if (unlikely(file + free <= high_wmark_pages(zone))) {
-                       /*
-                        * If we have very few page cache pages, force-scan
-                        * anon pages.
-                        */
-                       fraction[0] = 1;
-                       fraction[1] = 0;
-                       denominator = 1;
-                       goto out;
-               } else if (!inactive_file_is_low_global(zone)) {
-                       /*
-                        * There is enough inactive page cache, do not
-                        * reclaim anything from the working set right now.
-                        */
-                       fraction[0] = 0;
-                       fraction[1] = 1;
-                       denominator = 1;
+                       scan_balance = SCAN_ANON;
                        goto out;
                }
        }
 
        /*
+        * There is enough inactive page cache, do not reclaim
+        * anything from the anonymous working set right now.
+        */
+       if (!inactive_file_is_low(lruvec)) {
+               scan_balance = SCAN_FILE;
+               goto out;
+       }
+
+       scan_balance = SCAN_FRACT;
+
+       /*
         * With swappiness at 100, anonymous and file have the same priority.
         * This scanning priority is essentially the inverse of IO cost.
         */
@@ -1759,19 +1781,92 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 out:
        for_each_evictable_lru(lru) {
                int file = is_file_lru(lru);
+               unsigned long size;
                unsigned long scan;
 
-               scan = get_lru_size(lruvec, lru);
-               if (sc->priority || noswap || !vmscan_swappiness(sc)) {
-                       scan >>= sc->priority;
-                       if (!scan && force_scan)
-                               scan = SWAP_CLUSTER_MAX;
+               size = get_lru_size(lruvec, lru);
+               scan = size >> sc->priority;
+
+               if (!scan && force_scan)
+                       scan = min(size, SWAP_CLUSTER_MAX);
+
+               switch (scan_balance) {
+               case SCAN_EQUAL:
+                       /* Scan lists relative to size */
+                       break;
+               case SCAN_FRACT:
+                       /*
+                        * Scan types proportional to swappiness and
+                        * their relative recent reclaim efficiency.
+                        */
                        scan = div64_u64(scan * fraction[file], denominator);
+                       break;
+               case SCAN_FILE:
+               case SCAN_ANON:
+                       /* Scan one type exclusively */
+                       if ((scan_balance == SCAN_FILE) != file)
+                               scan = 0;
+                       break;
+               default:
+                       /* Look ma, no brain */
+                       BUG();
                }
                nr[lru] = scan;
        }
 }
 
+/*
+ * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
+ */
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+       unsigned long nr[NR_LRU_LISTS];
+       unsigned long nr_to_scan;
+       enum lru_list lru;
+       unsigned long nr_reclaimed = 0;
+       unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+       struct blk_plug plug;
+
+       get_scan_count(lruvec, sc, nr);
+
+       blk_start_plug(&plug);
+       while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+                                       nr[LRU_INACTIVE_FILE]) {
+               for_each_evictable_lru(lru) {
+                       if (nr[lru]) {
+                               nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
+                               nr[lru] -= nr_to_scan;
+
+                               nr_reclaimed += shrink_list(lru, nr_to_scan,
+                                                           lruvec, sc);
+                       }
+               }
+               /*
+                * On large memory systems, scan >> priority can become
+                * really large. This is fine for the starting priority;
+                * we want to put equal scanning pressure on each zone.
+                * However, if the VM has a harder time of freeing pages,
+                * with multiple processes reclaiming pages, the total
+                * freeing target can get unreasonably large.
+                */
+               if (nr_reclaimed >= nr_to_reclaim &&
+                   sc->priority < DEF_PRIORITY)
+                       break;
+       }
+       blk_finish_plug(&plug);
+       sc->nr_reclaimed += nr_reclaimed;
+
+       /*
+        * Even if we did not try to evict anon pages at all, we want to
+        * rebalance the anon lru active/inactive ratio.
+        */
+       if (inactive_anon_is_low(lruvec))
+               shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+                                  sc, LRU_ACTIVE_ANON);
+
+       throttle_vm_writeout(sc->gfp_mask);
+}
+
 /* Use reclaim/compaction for costly allocs or under memory pressure */
 static bool in_reclaim_compaction(struct scan_control *sc)
 {
@@ -1790,7 +1885,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
  * calls try_to_compact_zone() that it will have enough free pages to succeed.
  * It will give up earlier than that if there is difficulty reclaiming pages.
  */
-static inline bool should_continue_reclaim(struct lruvec *lruvec,
+static inline bool should_continue_reclaim(struct zone *zone,
                                        unsigned long nr_reclaimed,
                                        unsigned long nr_scanned,
                                        struct scan_control *sc)
@@ -1830,15 +1925,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
         * inactive lists are large enough, continue reclaiming
         */
        pages_for_compaction = (2UL << sc->order);
-       inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
-       if (nr_swap_pages > 0)
-               inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
+       inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
+       if (get_nr_swap_pages() > 0)
+               inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
        if (sc->nr_reclaimed < pages_for_compaction &&
                        inactive_lru_pages > pages_for_compaction)
                return true;
 
        /* If compaction would go ahead or the allocation would succeed, stop */
-       switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
+       switch (compaction_suitable(zone, sc->order)) {
        case COMPACT_PARTIAL:
        case COMPACT_CONTINUE:
                return false;
@@ -1847,98 +1942,48 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
        }
 }
 
-/*
- * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
- */
-static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
-       unsigned long nr[NR_LRU_LISTS];
-       unsigned long nr_to_scan;
-       enum lru_list lru;
        unsigned long nr_reclaimed, nr_scanned;
-       unsigned long nr_to_reclaim = sc->nr_to_reclaim;
-       struct blk_plug plug;
-
-restart:
-       nr_reclaimed = 0;
-       nr_scanned = sc->nr_scanned;
-       get_scan_count(lruvec, sc, nr);
-
-       blk_start_plug(&plug);
-       while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
-                                       nr[LRU_INACTIVE_FILE]) {
-               for_each_evictable_lru(lru) {
-                       if (nr[lru]) {
-                               nr_to_scan = min_t(unsigned long,
-                                                  nr[lru], SWAP_CLUSTER_MAX);
-                               nr[lru] -= nr_to_scan;
-
-                               nr_reclaimed += shrink_list(lru, nr_to_scan,
-                                                           lruvec, sc);
-                       }
-               }
-               /*
-                * On large memory systems, scan >> priority can become
-                * really large. This is fine for the starting priority;
-                * we want to put equal scanning pressure on each zone.
-                * However, if the VM has a harder time of freeing pages,
-                * with multiple processes reclaiming pages, the total
-                * freeing target can get unreasonably large.
-                */
-               if (nr_reclaimed >= nr_to_reclaim &&
-                   sc->priority < DEF_PRIORITY)
-                       break;
-       }
-       blk_finish_plug(&plug);
-       sc->nr_reclaimed += nr_reclaimed;
 
-       /*
-        * Even if we did not try to evict anon pages at all, we want to
-        * rebalance the anon lru active/inactive ratio.
-        */
-       if (inactive_anon_is_low(lruvec))
-               shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
-                                  sc, LRU_ACTIVE_ANON);
-
-       /* reclaim/compaction might need reclaim to continue */
-       if (should_continue_reclaim(lruvec, nr_reclaimed,
-                                   sc->nr_scanned - nr_scanned, sc))
-               goto restart;
+       do {
+               struct mem_cgroup *root = sc->target_mem_cgroup;
+               struct mem_cgroup_reclaim_cookie reclaim = {
+                       .zone = zone,
+                       .priority = sc->priority,
+               };
+               struct mem_cgroup *memcg;
 
-       throttle_vm_writeout(sc->gfp_mask);
-}
+               nr_reclaimed = sc->nr_reclaimed;
+               nr_scanned = sc->nr_scanned;
 
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
-{
-       struct mem_cgroup *root = sc->target_mem_cgroup;
-       struct mem_cgroup_reclaim_cookie reclaim = {
-               .zone = zone,
-               .priority = sc->priority,
-       };
-       struct mem_cgroup *memcg;
+               memcg = mem_cgroup_iter(root, NULL, &reclaim);
+               do {
+                       struct lruvec *lruvec;
 
-       memcg = mem_cgroup_iter(root, NULL, &reclaim);
-       do {
-               struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+                       lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
-               shrink_lruvec(lruvec, sc);
+                       shrink_lruvec(lruvec, sc);
 
-               /*
-                * Limit reclaim has historically picked one memcg and
-                * scanned it with decreasing priority levels until
-                * nr_to_reclaim had been reclaimed.  This priority
-                * cycle is thus over after a single memcg.
-                *
-                * Direct reclaim and kswapd, on the other hand, have
-                * to scan all memory cgroups to fulfill the overall
-                * scan target for the zone.
-                */
-               if (!global_reclaim(sc)) {
-                       mem_cgroup_iter_break(root, memcg);
-                       break;
-               }
-               memcg = mem_cgroup_iter(root, memcg, &reclaim);
-       } while (memcg);
+                       /*
+                        * Direct reclaim and kswapd have to scan all memory
+                        * cgroups to fulfill the overall scan target for the
+                        * zone.
+                        *
+                        * Limit reclaim, on the other hand, only cares about
+                        * nr_to_reclaim pages to be reclaimed and it will
+                        * retry with decreasing priority if one round over the
+                        * whole hierarchy is not sufficient.
+                        */
+                       if (!global_reclaim(sc) &&
+                                       sc->nr_reclaimed >= sc->nr_to_reclaim) {
+                               mem_cgroup_iter_break(root, memcg);
+                               break;
+                       }
+                       memcg = mem_cgroup_iter(root, memcg, &reclaim);
+               } while (memcg);
+       } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
+                                        sc->nr_scanned - nr_scanned, sc));
 }
 
 /* Returns true if compaction should go ahead for a high-order request */
@@ -1958,7 +2003,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
         * a reasonable chance of completing and allocating the page
         */
        balance_gap = min(low_wmark_pages(zone),
-               (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+               (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
                        KSWAPD_ZONE_BALANCE_GAP_RATIO);
        watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
        watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
@@ -2150,6 +2195,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                        goto out;
 
                /*
+                * If we're getting trouble reclaiming, start doing
+                * writepage even in laptop mode.
+                */
+               if (sc->priority < DEF_PRIORITY - 2)
+                       sc->may_writepage = 1;
+
+               /*
                 * Try to write back as many pages as we just scanned.  This
                 * tends to cause slow streaming writers to write data to the
                 * disk smoothly, at the dirtying rate, which is nice.   But
@@ -2300,7 +2352,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 {
        unsigned long nr_reclaimed;
        struct scan_control sc = {
-               .gfp_mask = gfp_mask,
+               .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
                .may_writepage = !laptop_mode,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .may_unmap = 1,
@@ -2473,7 +2525,7 @@ static bool zone_balanced(struct zone *zone, int order,
  */
 static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 {
-       unsigned long present_pages = 0;
+       unsigned long managed_pages = 0;
        unsigned long balanced_pages = 0;
        int i;
 
@@ -2484,7 +2536,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
                if (!populated_zone(zone))
                        continue;
 
-               present_pages += zone->present_pages;
+               managed_pages += zone->managed_pages;
 
                /*
                 * A special case here:
@@ -2494,18 +2546,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
                 * they must be considered balanced here as well!
                 */
                if (zone->all_unreclaimable) {
-                       balanced_pages += zone->present_pages;
+                       balanced_pages += zone->managed_pages;
                        continue;
                }
 
                if (zone_balanced(zone, order, 0, i))
-                       balanced_pages += zone->present_pages;
+                       balanced_pages += zone->managed_pages;
                else if (!order)
                        return false;
        }
 
        if (order)
-               return balanced_pages >= (present_pages >> 2);
+               return balanced_pages >= (managed_pages >> 2);
        else
                return true;
 }
@@ -2564,7 +2616,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                                                        int *classzone_idx)
 {
-       struct zone *unbalanced_zone;
+       bool pgdat_is_balanced = false;
        int i;
        int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
        unsigned long total_scanned;
@@ -2595,9 +2647,6 @@ loop_again:
 
        do {
                unsigned long lru_pages = 0;
-               int has_under_min_watermark_zone = 0;
-
-               unbalanced_zone = NULL;
 
                /*
                 * Scan in the highmem->dma direction for the highest
@@ -2638,8 +2687,11 @@ loop_again:
                                zone_clear_flag(zone, ZONE_CONGESTED);
                        }
                }
-               if (i < 0)
+
+               if (i < 0) {
+                       pgdat_is_balanced = true;
                        goto out;
+               }
 
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
@@ -2689,7 +2741,7 @@ loop_again:
                         * of the zone, whichever is smaller.
                         */
                        balance_gap = min(low_wmark_pages(zone),
-                               (zone->present_pages +
+                               (zone->managed_pages +
                                        KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
                                KSWAPD_ZONE_BALANCE_GAP_RATIO);
                        /*
@@ -2720,12 +2772,10 @@ loop_again:
                        }
 
                        /*
-                        * If we've done a decent amount of scanning and
-                        * the reclaim ratio is low, start doing writepage
-                        * even in laptop mode
+                        * If we're getting trouble reclaiming, start doing
+                        * writepage even in laptop mode.
                         */
-                       if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-                           total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
+                       if (sc.priority < DEF_PRIORITY - 2)
                                sc.may_writepage = 1;
 
                        if (zone->all_unreclaimable) {
@@ -2734,17 +2784,7 @@ loop_again:
                                continue;
                        }
 
-                       if (!zone_balanced(zone, testorder, 0, end_zone)) {
-                               unbalanced_zone = zone;
-                               /*
-                                * We are still under min water mark.  This
-                                * means that we have a GFP_ATOMIC allocation
-                                * failure risk. Hurry up!
-                                */
-                               if (!zone_watermark_ok_safe(zone, order,
-                                           min_wmark_pages(zone), end_zone, 0))
-                                       has_under_min_watermark_zone = 1;
-                       } else {
+                       if (zone_balanced(zone, testorder, 0, end_zone))
                                /*
                                 * If a zone reaches its high watermark,
                                 * consider it to be no longer congested. It's
@@ -2753,8 +2793,6 @@ loop_again:
                                 * speculatively avoid congestion waits
                                 */
                                zone_clear_flag(zone, ZONE_CONGESTED);
-                       }
-
                }
 
                /*
@@ -2766,17 +2804,9 @@ loop_again:
                                pfmemalloc_watermark_ok(pgdat))
                        wake_up(&pgdat->pfmemalloc_wait);
 
-               if (pgdat_balanced(pgdat, order, *classzone_idx))
+               if (pgdat_balanced(pgdat, order, *classzone_idx)) {
+                       pgdat_is_balanced = true;
                        break;          /* kswapd: all done */
-               /*
-                * OK, kswapd is getting into trouble.  Take a nap, then take
-                * another pass across the zones.
-                */
-               if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
-                       if (has_under_min_watermark_zone)
-                               count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
-                       else if (unbalanced_zone)
-                               wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
                }
 
                /*
@@ -2788,9 +2818,9 @@ loop_again:
                if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
                        break;
        } while (--sc.priority >= 0);
-out:
 
-       if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
+out:
+       if (!pgdat_is_balanced) {
                cond_resched();
 
                try_to_freeze();
@@ -3053,7 +3083,7 @@ unsigned long global_reclaimable_pages(void)
        nr = global_page_state(NR_ACTIVE_FILE) +
             global_page_state(NR_INACTIVE_FILE);
 
-       if (nr_swap_pages > 0)
+       if (get_nr_swap_pages() > 0)
                nr += global_page_state(NR_ACTIVE_ANON) +
                      global_page_state(NR_INACTIVE_ANON);
 
@@ -3067,7 +3097,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
        nr = zone_page_state(zone, NR_ACTIVE_FILE) +
             zone_page_state(zone, NR_INACTIVE_FILE);
 
-       if (nr_swap_pages > 0)
+       if (get_nr_swap_pages() > 0)
                nr += zone_page_state(zone, NR_ACTIVE_ANON) +
                      zone_page_state(zone, NR_INACTIVE_ANON);
 
@@ -3280,9 +3310,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
                .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
                .may_swap = 1,
-               .nr_to_reclaim = max_t(unsigned long, nr_pages,
-                                      SWAP_CLUSTER_MAX),
-               .gfp_mask = gfp_mask,
+               .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+               .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
                .order = order,
                .priority = ZONE_RECLAIM_PRIORITY,
        };
index 9800306..e1d8ed1 100644 (file)
@@ -142,7 +142,7 @@ int calculate_normal_threshold(struct zone *zone)
         * 125          1024            10      16-32 GB        9
         */
 
-       mem = zone->present_pages >> (27 - PAGE_SHIFT);
+       mem = zone->managed_pages >> (27 - PAGE_SHIFT);
 
        threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
 
@@ -628,7 +628,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
 #ifdef CONFIG_CMA
        "CMA",
 #endif
+#ifdef CONFIG_MEMORY_ISOLATION
        "Isolate",
+#endif
 };
 
 static void *frag_start(struct seq_file *m, loff_t *pos)
@@ -768,7 +770,6 @@ const char * const vmstat_text[] = {
        "kswapd_inodesteal",
        "kswapd_low_wmark_hit_quickly",
        "kswapd_high_wmark_hit_quickly",
-       "kswapd_skip_congestion_wait",
        "pageoutrun",
        "allocstall",
 
@@ -890,7 +891,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
        int mtype;
        unsigned long pfn;
        unsigned long start_pfn = zone->zone_start_pfn;
-       unsigned long end_pfn = start_pfn + zone->spanned_pages;
+       unsigned long end_pfn = zone_end_pfn(zone);
        unsigned long count[MIGRATE_TYPES] = { 0, };
 
        for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
index fd05c81..de2e950 100644 (file)
@@ -87,7 +87,7 @@ struct virtio_chan {
        /* This is global limit. Since we don't have a global structure,
         * will be placing it in each channel.
         */
-       int p9_max_pages;
+       unsigned long p9_max_pages;
        /* Scatterlist: can be too big for stack. */
        struct scatterlist sg[VIRTQUEUE_NUM];
 
index a5b89a6..7427ab5 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/vmalloc.h>
 #include <linux/export.h>
 #include <linux/jiffies.h>
+#include <linux/pm_runtime.h>
 
 #include "net-sysfs.h"
 
@@ -1257,6 +1258,8 @@ void netdev_unregister_kobject(struct net_device * net)
 
        remove_queue_kobjects(net);
 
+       pm_runtime_set_memalloc_noio(dev, false);
+
        device_del(dev);
 }
 
@@ -1301,6 +1304,8 @@ int netdev_register_kobject(struct net_device *net)
                return error;
        }
 
+       pm_runtime_set_memalloc_noio(dev, true);
+
        return error;
 }