mm: THP: introducing a fine-grained transparent hugepage technique for ARM64 architecture 45/263545/5
author     Sung-hun Kim <sfoon.kim@samsung.com>
           Fri, 2 Jul 2021 09:43:36 +0000 (18:43 +0900)
committer  Sung-hun Kim <sfoon.kim@samsung.com>
           Tue, 7 Sep 2021 07:28:29 +0000 (16:28 +0900)
Transparent hugepage (THP) is one of the promising solutions for
dealing with increased memory footprints, but so far it has mostly
been targeted at server-side environments.

This patch claims that embedded systems can also benefit from THP
when dealing with the increased, yet still relatively small, memory
footprints of their applications.

The ARM64 architecture provides a fine-grained hugepage feature (the
contiguous PTE hint) that supports 64KB hugepages in addition to the
commonly used 2MB hugepages. This patch uses both hugepage sizes,
chosen according to the size of the virtual memory area to be mapped.
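
For reference, with 4KB base pages the two hugepage sizes used here
work out as in the following standalone sketch (illustrative
arithmetic only; the constants mirror CONT_PTE_SHIFT and PMD_SHIFT and
are not kernel code):

    #include <stdio.h>

    /* Illustrative constants for arm64 with 4KB base pages. */
    #define PAGE_SHIFT      12
    #define CONT_PTE_SHIFT  16      /* 16 contiguous PTEs */
    #define PMD_SHIFT       21      /* 512 PTEs per PMD */

    int main(void)
    {
            unsigned long cont_pte_nr = 1UL << (CONT_PTE_SHIFT - PAGE_SHIFT);
            unsigned long pmd_nr = 1UL << (PMD_SHIFT - PAGE_SHIFT);

            printf("64KB hugepage: %lu base pages (order %d)\n",
                   cont_pte_nr, CONT_PTE_SHIFT - PAGE_SHIFT);
            printf("2MB hugepage:  %lu base pages (order %d)\n",
                   pmd_nr, PMD_SHIFT - PAGE_SHIFT);
            return 0;
    }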

This patch implements an eager-and-conservative policy. With this
policy, the kernel does not allocate 2MB hugepages at page fault time,
in order to avoid inflating page fault latency. Instead, it allocates
64KB hugepages in the fault path. Since a 64KB hugepage requires a
much lower allocation order than a 2MB hugepage, memory management
work such as memory compaction has far less impact on user-visible
latency.
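
The fault-path side of the policy can be pictured as in the sketch
below (hypothetical names, not the code added by this patch):

    /* Sketch: only a low-order 64KB hugepage is eligible at fault time. */
    #define ORDER_64KB      4       /* 64KB = 16 x 4KB pages  */
    #define ORDER_2MB       9       /* 2MB  = 512 x 4KB pages */

    static int fault_time_hugepage_order(int vma_fits_aligned_64kb)
    {
            if (!vma_fits_aligned_64kb)
                    return 0;               /* single 4KB page */
            return ORDER_64KB;              /* never order-9 here */
    }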

In the background, khugepaged builds both 64KB and 2MB hugepages for
both anonymous and file pages, choosing the size according to the
virtual memory area.
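
Roughly, the size selection can be summarized as below (a sketch with
a hypothetical helper; the real code also checks alignment, page state
and sysfs settings):

    /* Sketch: khugepaged picks the hugepage size from the VMA size. */
    #define SZ_64K  (1UL << 16)
    #define SZ_2M   (1UL << 21)

    static unsigned long khugepaged_target_size(unsigned long vma_size)
    {
            if (vma_size >= SZ_2M)
                    return SZ_2M;   /* large areas collapse to 2MB */
            if (vma_size >= SZ_64K)
                    return SZ_64K;  /* smaller areas collapse to 64KB */
            return 0;               /* too small for any hugepage */
    }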

Moreover, the proposed fine-grained THP (fTHP) supports hugepage
mappings for pages in CMA. Since CMA pages are already physically
contiguous, fTHP simply allows hugepage mappings for 64KB- or
2MB-aligned memory areas.
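
For the 64KB case this reduces to an alignment and size check, roughly
as in the sketch below (hypothetical helper; the in-tree check is
remap_try_huge_pte() in arch/arm64/mm/huge_memory.c, which uses
CONT_PTE_SIZE):

    #include <stdbool.h>

    /*
     * Sketch: CMA memory is already physically contiguous, so a huge
     * mapping only needs matching virtual/physical alignment and size.
     */
    static bool cma_can_map_huge(unsigned long vaddr, unsigned long paddr,
                                 unsigned long len, unsigned long hp_size)
    {
            return !(vaddr & (hp_size - 1)) &&
                   !(paddr & (hp_size - 1)) &&
                   len >= hp_size;
    }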

The proposed method achieves up to 32% higher throughput than the
Linux kernel with default THP when running a read workload in
lmbench [1] with a buffer that fits in the CPU last-level cache. For
large buffers (bigger than 2MB), the proposed method shows throughput
similar to default THP in the Linux kernel.

[1] LMbench - Tools for performance analysis:
http://lmbench.sourceforge.net

Change-Id: I750528db8f04b37fda39052bea775d18ca5d53fb
Signed-off-by: Sung-hun Kim <sfoon.kim@samsung.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
42 files changed:
arch/arm64/include/asm/finegrained_thp.h [new file with mode: 0644]
arch/arm64/include/asm/huge_mm.h [new file with mode: 0644]
arch/arm64/include/asm/pgtable.h
arch/arm64/mm/Makefile
arch/arm64/mm/finegrained_thp.c [new file with mode: 0644]
arch/arm64/mm/huge_memory.c [new file with mode: 0644]
arch/arm64/mm/mmu.c
fs/proc/meminfo.c
include/asm-generic/finegrained_thp.h [new file with mode: 0644]
include/asm-generic/huge_mm.h [new file with mode: 0644]
include/linux/huge_mm.h
include/linux/mm.h
include/linux/mmu_notifier.h
include/linux/mmzone.h
include/linux/pgtable.h
include/linux/rmap.h
include/linux/swapops.h
include/linux/vm_event_item.h
include/uapi/asm-generic/mman-common.h
kernel/dma/Kconfig
kernel/events/uprobes.c
mm/Kconfig
mm/filemap.c
mm/gup.c
mm/huge_memory.c
mm/internal.h
mm/ioremap.c
mm/khugepaged.c
mm/madvise.c
mm/memcontrol.c
mm/memory.c
mm/migrate.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/rmap.c
mm/shmem.c
mm/swap_slots.c
mm/swapfile.c
mm/truncate.c
mm/vmscan.c
mm/vmstat.c

diff --git a/arch/arm64/include/asm/finegrained_thp.h b/arch/arm64/include/asm/finegrained_thp.h
new file mode 100644
index 0000000..6f3d9bb
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef __ASM_FINEGRAINED_THP_H
+#define __ASM_FINEGRAINED_THP_H
+#ifdef CONFIG_FINEGRAINED_THP
+extern void khugepaged_mem_hook(struct mm_struct *mm,
+                       unsigned long addr, long diff, const char *debug);
+#else /* CONFIG_FINEGRAINED_THP */
+static inline void khugepaged_mem_hook(struct mm_struct *mm,
+                       unsigned long addr, long diff, const char *debug)
+{}
+#endif /* CONFIG_FINEGRAINED_THP */
+#endif /* __ASM_FINEGRAINED_THP_H */
diff --git a/arch/arm64/include/asm/huge_mm.h b/arch/arm64/include/asm/huge_mm.h
new file mode 100644
index 0000000..cc44800
--- /dev/null
@@ -0,0 +1,261 @@
+#ifndef __ASM_HUGE_MM_H
+#define __ASM_HUGE_MM_H
+
+#ifdef CONFIG_FINEGRAINED_THP
+#include <linux/mm.h> /* for compound_order/compound_nr */
+#include <asm/pgtable.h>
+
+#define HPAGE_CONT_PTE_MASK CONT_PTE_MASK
+#define HPAGE_CONT_PTE_SIZE CONT_PTE_SIZE
+#define HPAGE_CONT_PTE_ORDER (CONT_PTE_SHIFT-PAGE_SHIFT)
+#define HPAGE_CONT_PTE_NR (1 << HPAGE_CONT_PTE_ORDER)
+
+extern int copy_huge_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+                        pte_t *dst_pte, pte_t *src_pte, unsigned long addr,
+                        struct vm_area_struct *vma, int *rss);
+
+extern vm_fault_t arm64_do_huge_pte_anonymous_page(struct vm_fault *vmf);
+
+static inline vm_fault_t arch_do_huge_pte_anonymous_page(
+                       struct vm_fault *vmf)
+{
+       return arm64_do_huge_pte_anonymous_page(vmf);
+}
+
+extern void huge_cont_pte_set_accessed(struct vm_fault *vmf, pte_t orig_pte);
+extern int change_huge_pte(struct vm_area_struct *vma, pte_t *pte,
+                       unsigned long addr, pgprot_t newprot,
+                       unsigned long cp_flags);
+
+extern pte_t ptep_huge_clear_flush(struct vm_area_struct *vma,
+                               unsigned long address, pte_t *ptep);
+
+/*
+ * The code below should be moved to arm64-dependent code.
+ * Most of it is borrowed from arch/arm64/mm/hugetlbpage.c.
+ */
+
+#define HPAGE_CONT_PTE_CACHE_INDEX_MASK (HPAGE_CONT_PTE_NR - 1)
+
+static inline bool transhuge_adv_vma_suitable(struct vm_area_struct *vma,
+               unsigned long haddr)
+{
+       /* Don't have to check pgoff for anonymous vma */
+       if (!vma_is_anonymous(vma)) {
+               if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CONT_PTE_CACHE_INDEX_MASK)
+                       != (vma->vm_pgoff & HPAGE_CONT_PTE_CACHE_INDEX_MASK))
+                       return false;
+       }
+
+       if (haddr < vma->vm_start || haddr + HPAGE_CONT_PTE_SIZE >= vma->vm_end)
+               return false;
+       return true;
+}
+
+static inline pgprot_t thp_pte_pgprot(pte_t pte)
+{
+       unsigned long pfn = pte_pfn(pte);
+
+       return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
+}
+
+static inline pte_t arm64_make_huge_pte(struct page *hpage,
+               struct vm_area_struct *vma)
+{
+       return pte_mkcont(pte_mkhuge(mk_pte(hpage, vma->vm_page_prot)));
+}
+
+static inline pte_t arch_make_huge_pte(struct page *hpage,
+               struct vm_area_struct *vma)
+{
+       return arm64_make_huge_pte(hpage, vma);
+}
+
+static inline void arm64_clear_and_flush(struct mm_struct *mm,
+                                       unsigned long addr,
+                                       pte_t *ptep,
+                                       unsigned long pgsize,
+                                       unsigned long ncontig)
+{
+       int i;
+       struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+       unsigned long saddr = addr;
+
+       for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
+               pte_clear(mm, addr, ptep);
+
+       flush_tlb_range(&vma, saddr, addr);
+}
+
+extern int memcmp_pages(struct page *page1, struct page *page2);
+
+static inline void arm64_set_huge_pte_at(struct mm_struct *mm,
+                       unsigned long addr, pte_t *ptep, pte_t pte, unsigned long headoff)
+{
+       int i;
+       unsigned long pfn;
+       pgprot_t hugeprot;
+
+       pfn = pte_pfn(pte);
+       hugeprot = thp_pte_pgprot(pte);
+
+       arm64_clear_and_flush(mm, addr, ptep, PAGE_SIZE, HPAGE_CONT_PTE_NR);
+
+       for (i = 0; i < HPAGE_CONT_PTE_NR; i++, ptep++, addr += PAGE_SIZE, pfn += 1)
+               set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
+}
+
+static inline void arch_set_huge_pte_at(struct mm_struct *mm,
+                       unsigned long addr, pte_t *ptep, pte_t pte, unsigned long headoff)
+{
+       arm64_set_huge_pte_at(mm, addr, ptep, pte, headoff);
+}
+
+static inline void arch_clear_huge_pte_range(struct mm_struct *mm,
+                       unsigned long addr, pte_t *ptep)
+{
+       arm64_clear_and_flush(mm, addr, ptep, PAGE_SIZE, HPAGE_CONT_PTE_NR);
+}
+
+extern vm_fault_t arm64_do_set_huge_pte(struct vm_fault *vmf, struct page *page);
+
+static inline vm_fault_t arch_do_set_huge_pte(struct vm_fault *vmf,
+                       struct page *page)
+{
+       return arm64_do_set_huge_pte(vmf, page);
+}
+
+extern vm_fault_t arm64_wp_huge_pte(struct vm_fault *vmf, pte_t orig_pte);
+
+static inline int arch_do_wp_page(struct vm_fault *vmf, pte_t entry)
+{
+       int ret = VM_FAULT_FALLBACK;
+
+       if (pte_cont(entry))
+               ret = arm64_wp_huge_pte(vmf, entry);
+       return ret;
+}
+
+extern void huge_cont_pte_set_accessed(struct vm_fault *vmf, pte_t orig_pte);
+
+static inline bool arch_huge_pte_set_accessed(struct vm_fault *vmf, pte_t entry)
+{
+       if (pte_cont(entry)) {
+               huge_cont_pte_set_accessed(vmf, entry);
+               return true;
+       }
+       return false;
+}
+
+static inline pte_t arch_pte_clearhuge(pte_t pte)
+{
+       if (pte_cont(pte))
+               return pte_clearhuge(pte);
+       return pte;
+}
+
+extern int arm64_remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+                       unsigned long addr, unsigned long end,
+                       unsigned long pfn, pgprot_t prot);
+
+static inline int arch_remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+                       unsigned long addr, unsigned long end,
+                       unsigned long pfn, pgprot_t prot)
+{
+       return arm64_remap_pte_range(mm, pmd, addr, end, pfn, prot);
+}
+
+void __split_huge_pte(struct vm_area_struct *vma, pmd_t *pmd,
+               pte_t *pte, unsigned long address,
+               bool freeze, struct page *page);
+
+#define split_huge_pte(__vma, __pmd, __pte, __address)                         \
+       do {                                                            \
+               pte_t *____pte = (__pte);                               \
+               if (is_swap_pte(*____pte) || pte_cont(*____pte) \
+                                       || pte_devmap(*____pte))        \
+                       __split_huge_pte(__vma, __pmd, __pte, __address,        \
+                                               false, NULL);           \
+       }  while (0)
+
+void split_huge_pte_address(struct vm_area_struct *vma, unsigned long address,
+               bool freeze, struct page *page);
+extern bool arm64_hugepage_vma_shmem_check(struct vm_area_struct *vma,
+                                       unsigned long vm_flags, int nr_pages);
+extern bool arm64_hugepage_vma_file_check(struct vm_area_struct *vma,
+                                       unsigned long vm_flags, int nr_pages);
+
+static inline bool arch_hugepage_vma_shmem_check(
+                               struct vm_area_struct *vma,
+                               unsigned long vm_flags)
+{
+       return arm64_hugepage_vma_shmem_check(vma, vm_flags,
+                                               HPAGE_CONT_PTE_NR);
+}
+
+static inline bool arch_hugepage_vma_file_check(
+                               struct vm_area_struct *vma,
+                               unsigned long vm_flags)
+{
+       return arm64_hugepage_vma_file_check(vma, vm_flags,
+                                               HPAGE_CONT_PTE_NR);
+}
+
+#else /* CONFIG_FINEGRAINED_THP */
+
+static inline int arch_do_wp_page(struct vm_fault *vmf, pte_t entry)
+{
+       return VM_FAULT_FALLBACK;
+}
+
+static inline bool arch_huge_pte_set_accessed(struct vm_fault *vmf,
+                               pte_t entry)
+{
+       return false;
+}
+
+static inline pte_t arch_pte_clearhuge(pte_t pte)
+{
+       return pte;
+}
+
+static inline pte_t arch_make_huge_pte(struct page *hpage,
+               struct vm_area_struct *vma)
+{
+       return mk_pte(hpage, vma->vm_page_prot);
+}
+
+static inline vm_fault_t arch_do_huge_pte_anonymous_page(struct vm_fault *vmf)
+{
+       return VM_FAULT_FALLBACK;
+}
+
+static inline vm_fault_t arch_do_set_huge_pte(struct vm_fault *vmf,
+                       struct page *page)
+{
+       return VM_FAULT_FALLBACK;
+}
+
+static inline void arch_set_huge_pte_at(struct mm_struct *mm,
+                       unsigned long addr, pte_t *ptep, pte_t pte, unsigned long headoff)
+{}
+
+static inline void arch_clear_huge_pte_range(struct mm_struct *mm,
+                       unsigned long addr, pte_t *ptep)
+{}
+
+static inline bool arch_hugepage_vma_shmem_check(
+                               struct vm_area_struct *vma,
+                               unsigned long vm_flags)
+{
+       return false;
+}
+
+static inline bool arch_hugepage_vma_file_check(
+                               struct vm_area_struct *vma,
+                               unsigned long vm_flags)
+{
+       return false;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+#endif /* __ASM_HUGE_MM_H */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 717f13d..05ff2c5 100644
@@ -153,6 +153,14 @@ static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot)
        return pte;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static inline pte_t pte_clearhuge(pte_t pte)
+{
+       pte = clear_pte_bit(pte, __pgprot(PTE_CONT));
+       return pte;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
 static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot)
 {
        pmd_val(pmd) &= ~pgprot_val(prot);
@@ -325,6 +333,14 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
  */
 #define pte_mkhuge(pte)                (__pte(pte_val(pte) & ~PTE_TABLE_BIT))
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_FINEGRAINED_THP
+/* 64KB hugepage definition for THP */
+#define pte_trans_huge(pte)    (pte_val(pte) && !(pte_val(pte) & PTE_TABLE_BIT))
+#endif /* CONFIG_FINEGRAINED_THP */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+
 /*
  * Hugetlb definitions.
  */
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index 5ead3c3..a1d152e 100644
@@ -9,6 +9,8 @@ obj-$(CONFIG_PTDUMP_DEBUGFS)    += ptdump_debugfs.o
 obj-$(CONFIG_NUMA)             += numa.o
 obj-$(CONFIG_DEBUG_VIRTUAL)    += physaddr.o
 obj-$(CONFIG_ARM64_MTE)                += mteswap.o
+obj-$(CONFIG_FINEGRAINED_THP) += huge_memory.o
+obj-$(CONFIG_FINEGRAINED_THP) += finegrained_thp.o
 KASAN_SANITIZE_physaddr.o      += n
 
 obj-$(CONFIG_KASAN)            += kasan_init.o
diff --git a/arch/arm64/mm/finegrained_thp.c b/arch/arm64/mm/finegrained_thp.c
new file mode 100644
index 0000000..5ebb4ac
--- /dev/null
@@ -0,0 +1,26 @@
+#include <linux/shmem_fs.h>
+#include <asm/huge_mm.h>
+
+bool arm64_hugepage_vma_shmem_check(struct vm_area_struct *vma,
+                                       unsigned long vm_flags, int nr_pages)
+{
+       /* Enabled via shmem mount options or sysfs settings. */
+       if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
+               return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+                               nr_pages);
+       }
+       return false;
+}
+
+bool arm64_hugepage_vma_file_check(struct vm_area_struct *vma,
+                                       unsigned long vm_flags, int nr_pages)
+{
+       /* Read-only file mappings need to be aligned for THP to work. */
+       if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+           (vm_flags & VM_DENYWRITE)) {
+               return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+                               nr_pages);
+       }
+       return false;
+}
+
diff --git a/arch/arm64/mm/huge_memory.c b/arch/arm64/mm/huge_memory.c
new file mode 100644
index 0000000..2ef1a21
--- /dev/null
@@ -0,0 +1,1090 @@
+/*
+ * Hugepage support for arm64 architecture
+ *
+ * 21.08.07.
+ *
+ */
+
+#include <linux/huge_mm.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/khugepaged.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/oom.h>
+
+#include <asm/huge_mm.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+#ifdef CONFIG_FINEGRAINED_THP
+pte_t ptep_huge_clear_flush(struct vm_area_struct *vma,
+                               unsigned long address, pte_t *ptep)
+{
+       pte_t pte;
+       int i;
+
+       VM_BUG_ON(address & ~HPAGE_CONT_PTE_MASK);
+       VM_BUG_ON(!pte_cont(*ptep));
+       pte = ptep_get_and_clear(vma->vm_mm, address, ptep);
+
+       for (i = 1; i < HPAGE_CONT_PTE_NR; i++)
+               ptep_get_and_clear(vma->vm_mm, address + PAGE_SIZE * i, ptep + i);
+
+       flush_tlb_range(vma, address, address + HPAGE_CONT_PTE_SIZE);
+       return pte;
+}
+
+#define USE_THP_PRINT_CONT_TABLE
+#ifdef USE_THP_PRINT_CONT_TABLE
+void thp_print_cont_pte_table(struct mm_struct *mm,
+                       unsigned long addr, pte_t *ptep, unsigned long line)
+{
+       int i, pid = 0;
+
+       if (mm->owner) {
+               pr_info("THP: %s from %lu proc-%d(%s)\n", __func__, line,
+                               task_pid_nr(mm->owner), mm->owner->comm);
+               pid = task_pid_nr(mm->owner);
+       } else
+               pr_info("THP: %s from %lu\n", __func__, line);
+       for (i = 0; i < HPAGE_CONT_PTE_NR; i++, ptep++, addr += PAGE_SIZE) {
+               pr_info("%lx: %llx pid(%d)\n", addr, pte_val(*ptep), pid);
+       }
+}
+#else
+void thp_print_cont_pte_table(struct mm_struct *mm,
+                       unsigned long addr, pte_t *ptep, unsigned long line)
+{}
+#endif /* USE_THP_PRINT_CONT_TABLE */
+
+/*
+ * always: directly stall for all thp allocations
+ * defer: wake kswapd and fail if not immediately available
+ * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
+ *               fail if not immediately available
+ * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
+ *         available
+ * never: never stall for any thp allocation
+ */
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+{
+       const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+
+       /* Always do synchronous compaction */
+       if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+               return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+       /* Kick kcompactd and fail quickly */
+       if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+       /* Synchronous compaction if madvised, otherwise kick kcompactd */
+       if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+               return GFP_TRANSHUGE_LIGHT |
+                       (vma_madvised ? __GFP_DIRECT_RECLAIM :
+                                       __GFP_KSWAPD_RECLAIM);
+
+       /* Only do synchronous compaction if madvised */
+       if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+               return GFP_TRANSHUGE_LIGHT |
+                      (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
+       return GFP_TRANSHUGE_LIGHT;
+}
+
+/*
+ * a caller must hold both locks of dst and src
+ */
+int copy_huge_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+                 pte_t *dst_pte, pte_t *src_pte, unsigned long haddr,
+                 struct vm_area_struct *vma, int *rss)
+{
+       struct page *src_page;
+       unsigned long addr = haddr;
+       pte_t pte, *_pte;
+
+       pte = *src_pte;
+
+       src_page = vm_normal_page(vma, addr, pte);
+       if (!src_page)
+               return -EAGAIN;
+
+       VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+       get_page(src_page);
+       page_dup_rmap(src_page, true);
+       if (rss)
+               rss[MM_ANONPAGES] += HPAGE_CONT_PTE_NR;
+       else
+               add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR);
+
+       _pte = src_pte;
+       while (addr < haddr + HPAGE_CONT_PTE_SIZE) {
+               ptep_set_wrprotect(src_mm, addr, _pte);
+               addr += PAGE_SIZE;
+       }
+       pte = pte_mkold(pte_wrprotect(pte));
+       arm64_set_huge_pte_at(dst_mm, haddr, dst_pte, pte, 0);
+
+       return 0;
+}
+
+vm_fault_t arm64_do_set_huge_pte(struct vm_fault *vmf, struct page *page)
+{
+       int i;
+       pte_t entry;
+       struct vm_area_struct *vma = vmf->vma;
+       bool write = vmf->flags & FAULT_FLAG_WRITE;
+       unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+       pgoff_t index, pgoff, addroff, headoff;
+       vm_fault_t ret = VM_FAULT_FALLBACK;
+
+       if (!transhuge_adv_vma_suitable(vma, haddr))
+               return VM_FAULT_FALLBACK;
+
+       page = compound_head(page);
+       index = page->index;
+       pgoff = vmf->pgoff;
+       addroff = (vmf->address - haddr) >> PAGE_SHIFT;
+
+       if (pgoff - index != addroff)
+               return VM_FAULT_FALLBACK;
+
+       /*
+        * Archs like ppc64 need additional space to store information
+        * related to pte entry. Use the preallocated table for that.
+        */
+       if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
+               vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
+               if (!vmf->prealloc_pte)
+                       return VM_FAULT_OOM;
+               smp_wmb(); /* See comment in __pte_alloc() */
+       }
+
+       if (unlikely(pmd_none(*vmf->pmd))) {
+               if (pte_alloc(vma->vm_mm, vmf->pmd))
+                       return VM_FAULT_OOM;
+               smp_wmb();
+       }
+
+       /* The head offset indicates the position of the first page in the hugepage */
+       headoff = (addroff + (HPAGE_CONT_PTE_NR - pgoff)) % HPAGE_CONT_PTE_NR;
+       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, haddr, &vmf->ptl);
+       if (!vmf->pte || unlikely(!pte_none(*vmf->pte))) {
+               spin_unlock(vmf->ptl);
+               vmf->pte = NULL;
+               return ret;
+       }
+
+       entry = arm64_make_huge_pte(compound_head(page), vma);
+       if (write)
+               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+       for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
+               flush_icache_page(vma, page + i);
+       if (write && !(vma->vm_flags & VM_SHARED)) {
+               add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_CONT_PTE_NR);
+               if (PageAnon(page))
+                       page_add_new_anon_rmap(page, vma, haddr, true);
+       } else {
+               add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_CONT_PTE_NR);
+               page_add_file_rmap(page, true);
+       }
+
+       arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte, entry, headoff);
+       update_mmu_cache_pmd(vma, haddr, vmf->pmd);
+       count_vm_event(THP_FILE_MAPPED);
+       return 0;
+}
+
+static vm_fault_t arm64_do_huge_pte_wp_page_fallback(struct vm_fault *vmf,
+                       pte_t orig_pte, struct page *page)
+{
+       struct vm_area_struct *vma = vmf->vma;
+       unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+       int i;
+       vm_fault_t ret = 0;
+       struct page **pages;
+       struct mmu_notifier_range range;
+
+       pages = kmalloc_array(HPAGE_CONT_PTE_NR, sizeof(struct page *),
+                             GFP_KERNEL);
+       if (unlikely(!pages)) {
+               ret |= VM_FAULT_OOM;
+               goto out;
+       }
+
+       for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
+               pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
+                                              vmf->address);
+               if (unlikely(!pages[i] ||
+                            mem_cgroup_charge(pages[i], vma->vm_mm,
+                                    GFP_KERNEL))) {
+                       if (pages[i])
+                               put_page(pages[i]);
+                       while (--i >= 0) {
+                               put_page(pages[i]);
+                       }
+                       kfree(pages);
+                       ret |= VM_FAULT_OOM;
+                       goto out;
+               }
+       }
+
+       for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
+               copy_user_highpage(pages[i], page + i,
+                                  haddr + PAGE_SIZE * i, vma);
+               __SetPageUptodate(pages[i]);
+               cond_resched();
+       }
+
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+                               haddr, haddr + HPAGE_CONT_PTE_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
+
+       vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+       if (unlikely(!pte_same(*vmf->pte, orig_pte)))
+               goto out_free_pages;
+       VM_BUG_ON_PAGE(!PageHead(page), page);
+
+       /*
+        * Leave pmd empty until pte is filled note we must notify here as
+        * concurrent CPU thread might write to new page before the call to
+        * mmu_notifier_invalidate_range_end() happens which can lead to a
+        * device seeing memory write in different order than CPU.
+        *
+        * See Documentation/vm/mmu_notifier.rst
+        */
+       vmf->pte = pte_offset_map(vmf->pmd, haddr);
+       ptep_huge_clear_flush_notify(vma, haddr, vmf->pte);
+
+       for (i = 0; i < HPAGE_CONT_PTE_NR; i++, haddr += PAGE_SIZE) {
+               pte_t entry;
+               entry = mk_pte(pages[i], vma->vm_page_prot);
+               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+               set_page_private(pages[i], 0);
+
+               page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
+               lru_cache_add_inactive_or_unevictable(pages[i], vma);
+               vmf->pte = pte_offset_map(vmf->pmd, haddr);
+               VM_BUG_ON(!pte_none(*vmf->pte));
+               set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
+               pte_unmap(vmf->pte);
+       }
+       kfree(pages);
+
+       smp_wmb(); /* make pte visible before pmd */
+       page_remove_rmap(page, true);
+       spin_unlock(vmf->ptl);
+
+       /*
+        * No need to double call mmu_notifier->invalidate_range() callback as
+        * the above pmdp_huge_clear_flush_notify() did already call it.
+        */
+       mmu_notifier_invalidate_range_only_end(&range);
+
+       ret |= VM_FAULT_WRITE;
+       put_page(page);
+
+out:
+       return ret;
+
+out_free_pages:
+       spin_unlock(vmf->ptl);
+       mmu_notifier_invalidate_range_end(&range);
+       for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
+               set_page_private(pages[i], 0);
+               put_page(pages[i]);
+       }
+       kfree(pages);
+       goto out;
+}
+
+vm_fault_t arm64_do_huge_pte_wp_page(struct vm_fault *vmf, pte_t orig_pte)
+{
+       struct vm_area_struct *vma = vmf->vma;
+       struct page *page = NULL, *new_page;
+       unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+       struct mmu_notifier_range range;
+       gfp_t huge_gfp;                 /* for allocation and charge */
+       vm_fault_t ret = 0;
+
+       vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
+       VM_BUG_ON_VMA(!vma->anon_vma, vma);
+
+       spin_lock(vmf->ptl);
+       if (unlikely(!pte_same(*vmf->pte, orig_pte))) {
+               spin_unlock(vmf->ptl);
+               return ret;
+       }
+
+       page = pte_page(orig_pte);
+       VM_BUG_ON_PAGE(!PageCompound(page), page);
+       page = compound_head(page);
+       /*
+        * We can only reuse the page if nobody else maps the huge page or it's
+        * part.
+        */
+       if (!trylock_page(page)) {
+               get_page(page);
+               spin_unlock(vmf->ptl);
+               lock_page(page);
+               spin_lock(vmf->ptl);
+               if (unlikely(!pte_same(*vmf->pte, orig_pte))) {
+                       spin_unlock(vmf->ptl);
+                       unlock_page(page);
+                       put_page(page);
+                       return 0;
+               }
+               put_page(page);
+       }
+
+       if (reuse_swap_page(page, NULL)) {
+               huge_cont_pte_set_accessed(vmf, orig_pte);
+               unlock_page(page);
+               spin_unlock(vmf->ptl);
+               return VM_FAULT_WRITE;
+       }
+       unlock_page(page);
+       get_page(page);
+       spin_unlock(vmf->ptl);
+
+       /*
+        * For a 2MB hugepage, the kernel simply splits it
+        * into standard-sized pages and falls back to the
+        * normal page fault handling path.
+        *
+        * For a 64KB hugepage, I think allocate-on-COW can
+        * provide a performance benefit. This is because
+        * copying the contents of a 2MB page takes
+        * significant time, but a 64KB page is much smaller
+        * than a 2MB page, so I expect the overhead to be
+        * negligible.
+        *
+        * TODO: accounting time overhead of below procedure
+        */
+#ifdef CONFIG_THP_CONSERVATIVE
+        goto fallback;
+#endif
+       if (__transparent_hugepage_enabled(vma)) {
+               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr,
+                               HPAGE_CONT_PTE_ORDER);
+       } else
+               new_page = NULL;
+
+       if (likely(new_page)) {
+               prep_transhuge_page(new_page);
+       } else {
+               if (!page) {
+                       split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address);
+                       ret |= VM_FAULT_FALLBACK;
+               } else {
+                       ret = arm64_do_huge_pte_wp_page_fallback(vmf, orig_pte, page);
+                       if (ret & VM_FAULT_OOM) {
+                               split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address);
+                               ret |= VM_FAULT_FALLBACK;
+                       }
+                       put_page(page);
+               }
+               count_vm_event(THP_FAULT_FALLBACK);
+               goto out;
+       }
+
+       if (unlikely(mem_cgroup_charge(new_page, vma->vm_mm,
+                                       huge_gfp))) {
+               put_page(new_page);
+               split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address);
+               if (page)
+                       put_page(page);
+               ret |= VM_FAULT_FALLBACK;
+               count_vm_event(THP_FAULT_FALLBACK);
+               goto out;
+       }
+
+       count_vm_event(THP_FAULT_ALLOC);
+       count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+
+       if (!page)
+               clear_huge_page(new_page, vmf->address, HPAGE_CONT_PTE_NR);
+       else
+               copy_user_huge_page(new_page, page, vmf->address,
+                                   vma, HPAGE_CONT_PTE_NR);
+       __SetPageUptodate(new_page);
+
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+                               haddr, haddr + HPAGE_CONT_PTE_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
+
+       spin_lock(vmf->ptl);
+       if (page)
+               put_page(page);
+       if (unlikely(!pte_same(*vmf->pte, orig_pte))) {
+               spin_unlock(vmf->ptl);
+               mem_cgroup_uncharge(new_page);
+               put_page(new_page);
+               goto out_mn;
+       } else {
+               pte_t entry;
+
+               entry = arm64_make_huge_pte(new_page, vma);
+               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+
+               vmf->pte = pte_offset_map(vmf->pmd, haddr);
+
+               page_add_new_anon_rmap(new_page, vma, haddr, true);
+               lru_cache_add_inactive_or_unevictable(new_page, vma);
+
+               arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte, entry, 0);
+               update_mmu_cache(vma, vmf->address, vmf->pte);
+
+               if (!page) {
+                       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR);
+               } else {
+                       VM_BUG_ON_PAGE(!PageHead(page), page);
+                       page_remove_rmap(page, true);
+                       put_page(page);
+               }
+               ret |= VM_FAULT_WRITE;
+       }
+       spin_unlock(vmf->ptl);
+out_mn:
+       /*
+        * No need to double call mmu_notifier->invalidate_range() callback as
+        * the above pmdp_huge_clear_flush_notify() did already call it.
+        */
+       mmu_notifier_invalidate_range_only_end(&range);
+out:
+       return ret;
+#ifdef CONFIG_THP_CONSERVATIVE
+fallback:
+       __split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address, false, NULL);
+       return VM_FAULT_FALLBACK;
+#endif /* CONFIG_THP_CONSERVATIVE */
+}
+
+/* the caller must hold lock */
+vm_fault_t arm64_wp_huge_pte(struct vm_fault *vmf, pte_t orig_pte)
+{
+       unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+       pte_t *hpte_p;
+
+       if (vma_is_anonymous(vmf->vma)) {
+               spin_unlock(vmf->ptl);
+               return arm64_do_huge_pte_wp_page(vmf, orig_pte);
+       }
+
+       VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
+
+       hpte_p = pte_offset_map(vmf->pmd, haddr);
+       spin_unlock(vmf->ptl);
+       __split_huge_pte(vmf->vma, vmf->pmd, hpte_p, haddr, false, NULL);
+       spin_lock(vmf->ptl);
+
+       return VM_FAULT_FALLBACK;
+}
+
+static inline int check_huge_pte_range(pte_t *head)
+{
+       int i;
+
+       for (i = 0; i < HPAGE_CONT_PTE_NR; i++, head++) {
+               if (!pte_none(*head))
+                       return 1;
+       }
+       return 0;
+}
+
+void thp_print_cont_pte_table(struct mm_struct *mm,
+                       unsigned long addr, pte_t *ptep, unsigned long line);
+
+static vm_fault_t __do_huge_pte_anonymous_page(struct vm_fault *vmf,
+                       struct page *page, gfp_t gfp)
+{
+       struct vm_area_struct *vma = vmf->vma;
+       unsigned long offset, haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+       pte_t entry;
+       vm_fault_t ret = 0;
+
+       VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+       if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
+               put_page(page);
+               count_vm_event(THP_FAULT_FALLBACK);
+               count_vm_event(THP_FAULT_FALLBACK_CHARGE);
+               return VM_FAULT_FALLBACK;
+       }
+       cgroup_throttle_swaprate(page, gfp);
+
+       clear_huge_page(compound_head(page), haddr, HPAGE_CONT_PTE_NR);
+       /*
+        * The memory barrier inside __SetPageUptodate makes sure that
+        * clear_huge_page writes become visible before the set_pmd_at()
+        * write.
+        */
+       __SetPageUptodate(page);
+
+       vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+       ret = check_stable_address_space(vma->vm_mm);
+       if (ret)
+               goto unlock_release;
+
+       if (userfaultfd_missing(vma)) {
+               spin_unlock(vmf->ptl);
+               put_page(page);
+               ret = handle_userfault(vmf, VM_UFFD_MISSING);
+               VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+               return ret;
+       }
+
+       entry = arm64_make_huge_pte(page, vma);
+       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+       offset = (vmf->address - haddr) >> PAGE_SHIFT;
+       vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+       if (!pte_none(*vmf->pte)) {
+               ret = VM_FAULT_FALLBACK;
+               goto unlock_release;
+       }
+       if (check_huge_pte_range(vmf->pte - offset)) {
+               /* recheck */
+               /* TODO: COPY? */
+               ret = VM_FAULT_FALLBACK;
+               goto unlock_release;
+       }
+
+       page_add_new_anon_rmap(page, vma, haddr, true);
+       lru_cache_add_inactive_or_unevictable(page, vma);
+       arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte - offset, entry, 0);
+       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR);
+
+       spin_unlock(vmf->ptl);
+
+       count_vm_event(THP_FAULT_ALLOC);
+       count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+
+       return 0;
+
+unlock_release:
+       spin_unlock(vmf->ptl);
+       put_page(page);
+
+       return ret;
+}
+
+vm_fault_t arm64_do_huge_pte_anonymous_page(struct vm_fault *vmf)
+{
+       struct vm_area_struct *vma = vmf->vma;
+       struct page *page;
+       unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+       spinlock_t *ptl;
+       gfp_t gfp;
+
+       if (!transhuge_adv_vma_suitable(vma, haddr))
+               return VM_FAULT_FALLBACK;
+       if (unlikely(anon_vma_prepare(vma)))
+               return VM_FAULT_OOM;
+       if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
+               return VM_FAULT_OOM;
+       if (!(vmf->flags & FAULT_FLAG_WRITE) &&
+                       !mm_forbids_zeropage(vma->vm_mm) &&
+                       transparent_hugepage_use_zero_page()) {
+               return VM_FAULT_FALLBACK;
+       }
+       ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+       vmf->pte = pte_offset_map(vmf->pmd, haddr);
+       if (check_huge_pte_range(vmf->pte)) {
+               pte_unmap(vmf->pte);
+               spin_unlock(ptl);
+               return VM_FAULT_FALLBACK;
+       }
+       pte_unmap(vmf->pte);
+       spin_unlock(ptl);
+
+       gfp = alloc_hugepage_direct_gfpmask(vma);
+       page = alloc_hugepage_vma(gfp, vma,
+                               haddr,
+                               HPAGE_CONT_PTE_ORDER);
+       if (unlikely(!page)) {
+               count_vm_event(THP_FAULT_FALLBACK);
+               return VM_FAULT_FALLBACK;
+       }
+       prep_transhuge_page(page);
+       return __do_huge_pte_anonymous_page(vmf, page, gfp);
+}
+
+bool zap_cont_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+                       pmd_t *pmd, pte_t **ptep, unsigned long *addr,
+                       unsigned long end, struct page *page,
+                       int *rss, spinlock_t *ptl)
+{
+       struct mm_struct *mm = tlb->mm;
+       unsigned long haddr = (*addr) & HPAGE_CONT_PTE_MASK;
+       unsigned long range_end =
+               ((haddr + HPAGE_CONT_PTE_SIZE) > end) ? end :
+               haddr + HPAGE_CONT_PTE_SIZE;
+       size_t size = range_end - haddr;
+       unsigned long map_count = size >> PAGE_SHIFT;
+       pte_t *pte;
+
+       pte = pte_offset_map(pmd, haddr);
+
+       if ((*addr) == haddr && haddr + HPAGE_CONT_PTE_SIZE <= range_end) {
+               arm64_clear_and_flush(mm, *addr, pte, PAGE_SIZE, map_count);
+               page_remove_rmap(compound_head(page), true);
+               rss[mm_counter(page)] -= map_count;
+               __tlb_adjust_range(tlb, *addr, size);
+               __tlb_remove_tlb_entry(tlb, pte, *addr);
+               tlb_remove_page_size(tlb, page, size);
+
+               *addr += size;
+               pte += map_count;
+
+               if (*addr >= end)
+                       *addr = end - PAGE_SIZE;
+
+               *ptep = pte;
+       } else {
+               if (haddr < vma->vm_start) {
+                       pr_err("haddr(%lx) is less than vm start(%lx)\n",
+                                       haddr, vma->vm_start);
+                       thp_print_cont_pte_table(mm, haddr, pte, __LINE__);
+               }
+
+               spin_unlock(ptl);
+               __split_huge_pte(vma, pmd, pte, haddr, false, NULL);
+               spin_lock(ptl);
+       }
+
+       pte_unmap(pte);
+
+       return map_count == HPAGE_CONT_PTE_NR;
+}
+
+/* caller must hold a proper lock */
+void huge_cont_pte_set_accessed(struct vm_fault *vmf, pte_t orig_pte)
+{
+       int i;
+       pte_t entry, *pte;
+       unsigned long haddr;
+       bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+       haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+       pte = pte_offset_map(vmf->pmd, haddr);
+
+       for (i = 0; i < HPAGE_CONT_PTE_NR; i++, pte++, haddr += PAGE_SIZE) {
+               entry = pte_mkyoung(*pte);
+               if (write)
+                       entry = pte_mkwrite(pte_mkdirty(entry));
+               ptep_set_access_flags(vmf->vma, haddr, pte, entry, write);
+       }
+       update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
+}
+
+/*
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
+ */
+static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+{
+       return pmd_write(pmd) ||
+              ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+}
+
+extern void mlock_vma_page(struct page *page);
+extern void clear_page_mlock(struct page *page);
+
+struct page *follow_trans_huge_pte(struct vm_area_struct *vma,
+                                  unsigned long addr,
+                                  pmd_t *pmd,
+                                  unsigned int flags)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       struct page *page = NULL;
+       pte_t *pte;
+
+       assert_spin_locked(pmd_lockptr(mm, pmd));
+
+       if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
+               goto out;
+
+       /* Avoid dumping huge zero page */
+       if ((flags & FOLL_DUMP))
+               return ERR_PTR(-EFAULT);
+
+       /* Full NUMA hinting faults to serialise migration in fault paths */
+       if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
+               goto out;
+
+       pte = pte_offset_map(pmd, addr);
+       page = pte_page(*pte);
+       VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+
+       if (!try_grab_page(page, flags))
+               return ERR_PTR(-ENOMEM);
+
+       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+               /*
+                * We don't mlock() pte-mapped THPs. This way we can avoid
+                * leaking mlocked pages into non-VM_LOCKED VMAs.
+                *
+                * For anon THP:
+                *
+                * In most cases the pmd is the only mapping of the page as we
+                * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+                * writable private mappings in populate_vma_page_range().
+                *
+                * The only scenario when we have the page shared here is if we
+                * mlocking read-only mapping shared over fork(). We skip
+                * mlocking such pages.
+                *
+                * For file THP:
+                *
+                * We can expect PageDoubleMap() to be stable under page lock:
+                * for file pages we set it in page_add_file_rmap(), which
+                * requires page to be locked.
+                */
+
+               if (PageAnon(page) && compound_mapcount(page) != 1)
+                       goto skip_mlock;
+               if (PageDoubleMap(page) || !page->mapping)
+                       goto skip_mlock;
+               if (!trylock_page(page))
+                       goto skip_mlock;
+               if (page->mapping && !PageDoubleMap(page))
+                       mlock_vma_page(page);
+               unlock_page(page);
+       }
+skip_mlock:
+       page += (addr & ~HPAGE_CONT_PTE_MASK) >> PAGE_SHIFT;
+       VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
+
+out:
+       return page;
+}
+
+static inline pte_t ptep_invalidate(struct vm_area_struct *vma,
+                               unsigned long address, pte_t *ptep)
+{
+       return __pte(xchg_relaxed(&pte_val(*ptep), (pte_val(*ptep) & ~PTE_VALID)));
+}
+
+extern atomic_long_t nr_phys_cont_pte_pages;
+
+static int remap_try_huge_pte(struct mm_struct *mm, pte_t *pte, unsigned long addr,
+                               unsigned long end, unsigned long pfn,
+                               pgprot_t prot)
+{
+       phys_addr_t phys_addr = __pfn_to_phys(pfn);
+       pte_t entry;
+
+       if ((end - addr) != CONT_PTE_SIZE)
+               return 0;
+
+       if (!IS_ALIGNED(addr, CONT_PTE_SIZE))
+               return 0;
+
+       if (!IS_ALIGNED(phys_addr, CONT_PTE_SIZE))
+               return 0;
+
+       entry = pte_mkspecial(pte_mkcont(pte_mkhuge(pfn_pte(pfn, prot))));
+       arch_set_huge_pte_at(mm, addr, pte, entry, 0);
+
+       atomic_long_add(HPAGE_CONT_PTE_NR, &nr_phys_cont_pte_pages);
+
+       return 1;
+}
+
+int arm64_remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+                       unsigned long addr, unsigned long end,
+                       unsigned long pfn, pgprot_t prot)
+{
+       pte_t *pte, *mapped_pte;
+       unsigned long next;
+       spinlock_t *ptl;
+       int err = 0;
+
+       mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+       if (!pte)
+               return -ENOMEM;
+       arch_enter_lazy_mmu_mode();
+       do {
+               BUG_ON(!pte_none(*pte));
+               if (!pfn_modify_allowed(pfn, prot)) {
+                       err = -EACCES;
+                       break;
+               }
+
+               next = pte_cont_addr_end(addr, end);
+               if (remap_try_huge_pte(mm, pte, addr, next, pfn, prot)) {
+                       pte += HPAGE_CONT_PTE_NR;
+                       pfn += HPAGE_CONT_PTE_NR;
+                       addr += HPAGE_CONT_PTE_SIZE;
+               } else {
+                       set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
+                       pfn++;
+                       pte++;
+                       addr += PAGE_SIZE;
+               }
+       } while (addr != end);
+       arch_leave_lazy_mmu_mode();
+       pte_unmap_unlock(mapped_pte, ptl);
+       return err;
+}
+
+/* caller must hold appropriate lock (pmd lock) */
+int change_huge_pte(struct vm_area_struct *vma, pte_t *pte,
+               unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       pte_t entry;
+       bool preserve_write;
+       bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+       int i, ret;
+
+       preserve_write = prot_numa && pte_write(*pte);
+       ret = 1;
+
+       /* Currently, we don't consider NUMA cases; they are left
+        * for future work. */
+       if (prot_numa && is_huge_zero_page(pte_page(*pte)))
+               goto out;
+
+       if (prot_numa && pte_protnone(*pte))
+               goto out;
+
+       for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
+               entry = ptep_invalidate(vma, addr, pte);
+               entry = pte_modify(entry, newprot);
+               if (preserve_write)
+                       entry = pte_mk_savedwrite(entry);
+               entry = pte_mkcont(entry);
+
+               set_pte_at(mm, addr, pte, entry);
+               pte++;
+               addr += PAGE_SIZE;
+       }
+
+       flush_tlb_range(vma, addr, addr + HPAGE_CONT_PTE_SIZE);
+       ret = HPAGE_CONT_PTE_NR;
+out:
+       return ret;
+}
+
+static void __split_huge_pte_locked(struct vm_area_struct *vma, pte_t *pte,
+               unsigned long haddr, bool freeze)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       struct page *page;
+       pte_t old_pte, _pte;
+       bool young, write, soft_dirty, pte_migration = false, uffd_wp = false;
+       unsigned long addr;
+       int i;
+
+       VM_BUG_ON(haddr & ~HPAGE_CONT_PTE_MASK);
+       VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+       VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_CONT_PTE_SIZE, vma);
+
+       count_vm_event(THP_SPLIT_CONT_PTE);
+
+       if (!vma_is_anonymous(vma)) {
+               _pte = ptep_huge_clear_flush_notify(vma, haddr, pte);
+               if (vma_is_dax(vma))
+                       return;
+               page = pte_page(_pte);
+               if (!PageDirty(page) && pte_dirty(_pte))
+                       set_page_dirty(page);
+               if (!PageReferenced(page) && pte_young(_pte))
+                       SetPageReferenced(page);
+               page_remove_rmap(page, true);
+               put_page(page);
+               add_mm_counter(mm, mm_counter_file(page), -HPAGE_CONT_PTE_NR);
+               return;
+       } else if (is_huge_zero_page(pte_page(*pte))) {
+               pr_err("contiguous pte mapping for zero anon pages are not supported yet");
+               BUG();
+       }
+
+       old_pte = ptep_huge_clear_flush_notify(vma, haddr, pte);
+
+       pte_migration = is_pte_migration_entry(old_pte);
+       if (unlikely(pte_migration)) {
+               swp_entry_t entry;
+
+               entry = pte_to_swp_entry(old_pte);
+               page = pfn_to_page(swp_offset(entry));
+               write = is_write_migration_entry(entry);
+               young = false;
+               soft_dirty = pte_swp_soft_dirty(old_pte);
+               uffd_wp = pte_swp_uffd_wp(old_pte);
+       } else {
+               page = pte_page(old_pte);
+               if (pte_dirty(old_pte))
+                       SetPageDirty(page);
+               write = pte_write(old_pte);
+               young = pte_young(old_pte);
+               soft_dirty = pte_soft_dirty(old_pte);
+               uffd_wp = pte_uffd_wp(old_pte);
+       }
+
+       VM_BUG_ON_PAGE(!page_count(page), page);
+       page_ref_add(page, HPAGE_CONT_PTE_NR - 1);
+
+       for (i = 0, addr = haddr; i < HPAGE_CONT_PTE_NR;
+                               i++, addr += PAGE_SIZE, pte++) {
+               pte_t entry;
+
+               if (freeze || pte_migration) {
+                       swp_entry_t swp_entry;
+                       swp_entry = make_migration_entry(page + i, write);
+                       entry = swp_entry_to_pte(swp_entry);
+                       if (soft_dirty)
+                               entry = pte_swp_mksoft_dirty(entry);
+                       if (uffd_wp)
+                               entry = pte_swp_mkuffd_wp(entry);
+               } else {
+                       entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
+                       entry = maybe_mkwrite(entry, vma);
+                       if (!write)
+                               entry = pte_wrprotect(entry);
+                       if (!young)
+                               entry = pte_mkold(entry);
+                       if (soft_dirty)
+                               entry = pte_mksoft_dirty(entry);
+                       if (uffd_wp)
+                               entry = pte_mkuffd_wp(entry);
+               }
+               //BUG_ON(!pte_none(*pte));
+               set_pte_at(mm, addr, pte, entry);
+               if (!pte_migration)
+                       atomic_inc(&page[i]._mapcount);
+               pte_unmap(pte);
+       }
+
+       if (!pte_migration) {
+               /*
+                * Set PG_double_map before dropping compound_mapcount to avoid
+                * false-negative page_mapped().
+                */
+               if (compound_mapcount(page) > 1 &&
+                               !TestSetPageDoubleMap(page)) {
+                       for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
+                               atomic_inc(&page[i]._mapcount);
+               }
+
+               lock_page_memcg(page);
+               if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+                       /* Last compound_mapcount is gone. */
+                       __dec_lruvec_page_state(page, NR_ANON_64KB_THPS);
+                       if (TestClearPageDoubleMap(page)) {
+                               /* No need in mapcount reference anymore */
+                               for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
+                                       atomic_dec(&page[i]._mapcount);
+                       }
+               }
+               unlock_page_memcg(page);
+       }
+
+       smp_wmb();
+
+       if (freeze) {
+               for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
+                       page_remove_rmap(page + i, false);
+                       put_page(page + i);
+               }
+       }
+}
+
+void __split_huge_pte(struct vm_area_struct *vma, pmd_t *pmd,
+               pte_t *pte, unsigned long address,
+               bool freeze, struct page *page)
+{
+       spinlock_t *ptl;
+       struct mmu_notifier_range range;
+       pte_t _pte;
+       bool locked = false;
+
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+                               address & HPAGE_CONT_PTE_MASK,
+                               (address & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
+       ptl = pmd_lock(vma->vm_mm, pmd);
+
+       if (page) {
+               VM_WARN_ON_ONCE(!PageLocked(page));
+               if (page != pte_page(*pte))
+                       goto out;
+       }
+repeat:
+       if (pte_cont(*pte)) {
+               if (!page) {
+                       page = pte_page(*pte);
+                       /*
+                        * An anonymous page must be locked, to ensure that a
+                        * concurrent reuse_swap_page() sees stable mapcount;
+                        * but reuse_swap_page() is not used on shmem or file,
+                        * and page lock must not be taken when zap_pte_range()
+                        * calls __split_huge_pte() while i_mmap_lock is held.
+                        */
+                       if (PageAnon(page)) {
+                               if (unlikely(!trylock_page(page))) {
+                                       _pte = *pte;
+                                       get_page(page);
+                                       spin_unlock(ptl);
+                                       lock_page(page);
+                                       spin_lock(ptl);
+                                       if (unlikely(!pte_same(*pte, _pte))) {
+                                               unlock_page(page);
+                                               put_page(page);
+                                               page = NULL;
+                                               goto repeat;
+                                       }
+                                       put_page(page);
+                               }
+                               locked = true;
+                       }
+               }
+               if (PageMlocked(page))
+                       clear_page_mlock(page);
+       } else if (!(pte_devmap(*pte) || is_pte_migration_entry(*pte)))
+               goto out;
+       __split_huge_pte_locked(vma, pte, range.start, freeze);
+out:
+       spin_unlock(ptl);
+       if (locked && page)
+               unlock_page(page);
+       mmu_notifier_invalidate_range_only_end(&range);
+}
+
+void split_huge_pte_address(struct vm_area_struct *vma, unsigned long address,
+               bool freeze, struct page *page)
+{
+       unsigned long haddr = address & HPAGE_CONT_PTE_MASK;
+       pgd_t *pgd;
+       p4d_t *p4d;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = pgd_offset(vma->vm_mm, haddr);
+       if (!pgd_present(*pgd))
+               return;
+
+       p4d = p4d_offset(pgd, haddr);
+       if (!p4d_present(*p4d))
+               return;
+
+       pud = pud_offset(p4d, haddr);
+       if (!pud_present(*pud))
+               return;
+
+       pmd = pmd_offset(pud, haddr);
+       if (!pmd_present(*pmd))
+               return;
+
+       pte = pte_offset_map(pmd, haddr);
+       if (!pte_present(*pte))
+               return;
+
+       __split_huge_pte(vma, pmd, pte, haddr, freeze, page);
+}
+#endif /* CONFIG_FINEGRAINED_THP */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 6aabf1e..a32cc50 100644
@@ -1365,6 +1365,22 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
        return 1;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+int cont_pte_set_huge(pte_t *ptep, phys_addr_t phys, pgprot_t prot)
+{
+       int i;
+       pte_t new_pte;
+
+       for (i = 0; i < HPAGE_CONT_PTE_NR; i++, phys += PAGE_SIZE, ptep++) {
+               new_pte = pfn_pte(__phys_to_pfn(phys), prot);
+               new_pte = pte_mkcont(new_pte);
+               set_pte(ptep, new_pte);
+       }
+
+       return 1;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
 int pud_clear_huge(pud_t *pudp)
 {
        if (!pud_sect(READ_ONCE(*pudp)))
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 887a553..9a78266 100644
@@ -130,15 +130,33 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        show_val_kb(m, "AnonHugePages:  ",
                    global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);
+#ifdef CONFIG_FINEGRAINED_THP
+       show_val_kb(m, "Anon64KBPages:  ",
+                   global_node_page_state(NR_ANON_64KB_THPS) * HPAGE_CONT_PTE_NR);
+#endif /* CONFIG_FINEGRAINED_THP */
        show_val_kb(m, "ShmemHugePages: ",
                    global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
        show_val_kb(m, "ShmemPmdMapped: ",
                    global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
+#ifdef CONFIG_FINEGRAINED_THP
+       show_val_kb(m, "ShmemPteMapped: ",
+                   global_node_page_state(NR_SHMEM_PTEMAPPED) * HPAGE_CONT_PTE_NR);
+       show_val_kb(m, "File64KBPages:  ",
+                       global_node_page_state(NR_FILE_64KB_THPS) * HPAGE_CONT_PTE_NR);
+#endif /* CONFIG_FINEGRAINED_THP */
        show_val_kb(m, "FileHugePages:  ",
                    global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
        show_val_kb(m, "FilePmdMapped:  ",
                    global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
+#ifdef CONFIG_FINEGRAINED_THP
+       show_val_kb(m, "FileCPteMapped: ",
+                   global_node_page_state(NR_FILE_PTEMAPPED) * HPAGE_CONT_PTE_NR);
+#endif /* CONFIG_FINEGRAINED_THP */
 #endif
+       show_val_kb(m, "PhysCPteMapped: ",
+                   phys_cont_pte_pages());
+       show_val_kb(m, "PhysPmdMapped:  ",
+                   phys_huge_pmd_pages() * HPAGE_PMD_NR);
 
 #ifdef CONFIG_CMA
        show_val_kb(m, "CmaTotal:       ", totalcma_pages);
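The hunk above adds several new /proc/meminfo fields. For reference, a minimal user-space sketch (not part of this patch; only the field names are taken from the hunk above, everything else is illustrative) that dumps just the new counters:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* field names exported by the patched meminfo_proc_show() */
	static const char *keys[] = {
		"Anon64KBPages:", "ShmemPteMapped:", "File64KBPages:",
		"FileCPteMapped:", "PhysCPteMapped:", "PhysPmdMapped:",
	};
	char line[256];
	unsigned int i;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
			if (strncmp(line, keys[i], strlen(keys[i])) == 0)
				fputs(line, stdout);
	fclose(f);
	return 0;
}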
diff --git a/include/asm-generic/finegrained_thp.h b/include/asm-generic/finegrained_thp.h
new file mode 100644 (file)
index 0000000..08a3461
--- /dev/null
@@ -0,0 +1,9 @@
+/* a generic header for fine-grained thp */
+#ifndef __ASM_FINEGRAINED_THP_H
+#define __ASM_FINEGRAINED_THP_H
+#ifndef CONFIG_FINEGRAINED_THP
+static inline void khugepaged_mem_hook(struct mm_struct *mm,
+			unsigned long addr, long diff, const char *debug)
+{}
+#endif /* !CONFIG_FINEGRAINED_THP */
+#endif /* __ASM_FINEGRAINED_THP_H */
diff --git a/include/asm-generic/huge_mm.h b/include/asm-generic/huge_mm.h
new file mode 100644 (file)
index 0000000..48527cf
--- /dev/null
@@ -0,0 +1,57 @@
+/* a generic header for architecture-dependent hugepage */
+#ifndef __ASM_HUGE_MM_H
+#define __ASM_HUGE_MM_H
+#ifndef CONFIG_FINEGRAINED_THP
+static inline int arch_do_wp_page(struct vm_fault *vmf, pte_t entry)
+{
+       return VM_FAULT_FALLBACK;
+}
+
+static inline bool arch_huge_pte_set_accessed(struct vm_fault *vmf,
+                               pte_t entry)
+{
+       return false;
+}
+
+static inline pte_t arch_pte_clearhuge(pte_t pte)
+{
+       return pte;
+}
+
+static inline pte_t arch_make_huge_pte(struct page *hpage,
+               struct vm_area_struct *vma)
+{
+       return mk_pte(hpage, vma->vm_page_prot);
+}
+
+static inline void khugepaged_mem_hook(struct mm_struct *mm,
+                       unsigned long addr, long diff, const char *debug)
+{}
+
+static inline vm_fault_t arch_do_set_huge_pte(struct vm_fault *vmf,
+                       struct page *page)
+{ return VM_FAULT_FALLBACK; }
+
+static inline void arch_set_huge_pte_at(struct mm_struct *mm,
+                       unsigned long addr, pte_t *ptep, pte_t pte, unsigned long headoff)
+{}
+
+static inline void arch_clear_huge_pte_range(struct mm_struct *mm,
+                       unsigned long addr, pte_t *ptep)
+{}
+
+static inline bool arch_hugepage_vma_shmem_check(
+                               struct vm_area_struct *vma,
+                               unsigned long vm_flags)
+{
+       return false;
+}
+
+static inline bool arch_hugepage_vma_file_check(
+                               struct vm_area_struct *vma,
+                               unsigned long vm_flags)
+{
+       return false;
+}
+#endif /* !CONFIG_FINEGRAINED_THP */
+#endif /* __ASM_HUGE_MM_H */
index 0365aa9..4f8818c 100644 (file)
@@ -6,6 +6,9 @@
 #include <linux/mm_types.h>
 
 #include <linux/fs.h> /* only for vma_is_dax() */
+#ifdef CONFIG_FINEGRAINED_THP
+#include <asm/huge_mm.h> /* for compound_order/compound_nr */
+#endif
 
 extern vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
 extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -272,8 +275,13 @@ static inline struct page *thp_head(struct page *page)
 static inline unsigned int thp_order(struct page *page)
 {
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
+#ifdef CONFIG_FINEGRAINED_THP
+       if (PageHead(page))
+               return page[1].compound_order;
+#else
        if (PageHead(page))
                return HPAGE_PMD_ORDER;
+#endif
        return 0;
 }
 
@@ -285,7 +293,11 @@ static inline int thp_nr_pages(struct page *page)
 {
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
        if (PageHead(page))
+#ifdef CONFIG_FINEGRAINED_THP
+               return page[1].compound_nr;
+#else
                return HPAGE_PMD_NR;
+#endif
        return 1;
 }
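With fTHP a transparent hugepage is either a 64KB (contiguous-PTE) or a 2MB (PMD) page, so thp_order()/thp_nr_pages() above read the stored compound order instead of assuming HPAGE_PMD_ORDER. A stand-alone sketch of the arithmetic relied on here, for ARM64 with 4KB base pages (the HPAGE_* values are restated locally for illustration, not taken from kernel headers):

#include <assert.h>

#define PAGE_SHIFT		12	/* 4KB base pages */
#define HPAGE_PMD_ORDER		9	/* 2MB  = 4KB << 9 */
#define HPAGE_CONT_PTE_ORDER	4	/* 64KB = 4KB << 4 */

int main(void)
{
	/* thp_nr_pages() returns 512 for a 2MB THP ... */
	assert((1 << HPAGE_PMD_ORDER) == 512);
	assert((1UL << (PAGE_SHIFT + HPAGE_PMD_ORDER)) == 2UL * 1024 * 1024);
	/* ... and 16 for a 64KB contiguous-PTE THP. */
	assert((1 << HPAGE_CONT_PTE_ORDER) == 16);
	assert((1UL << (PAGE_SHIFT + HPAGE_CONT_PTE_ORDER)) == 64UL * 1024);
	return 0;
}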
 
index b8eadd9..de2371d 100644 (file)
@@ -2803,6 +2803,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_PIN       0x40000 /* pages must be released via unpin_user_page */
 #define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */
 
+#define FOLL_SPLIT_PTE 0x100000 /* split huge pte before returning */
+
 /*
  * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
  * other. Here is what they mean, and how to use them:
@@ -3151,6 +3153,9 @@ static inline int pages_identical(struct page *page1, struct page *page2)
        return !memcmp_pages(page1, page2);
 }
 
+extern unsigned long phys_cont_pte_pages(void);
+extern unsigned long phys_huge_pmd_pages(void);
+
 #ifdef CONFIG_MAPPING_DIRTY_HELPERS
 unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
                                                pgoff_t first_index, pgoff_t nr,
index b820078..104ff57 100644 (file)
@@ -592,6 +592,21 @@ static inline void mmu_notifier_range_init_migrate(
        ___pte;                                                         \
 })
 
+#ifdef CONFIG_FINEGRAINED_THP
+#define        ptep_huge_clear_flush_notify(__vma, __address, __ptep)          \
+({                                                                     \
+       unsigned long ___addr = __address & HPAGE_CONT_PTE_MASK;                        \
+       struct mm_struct *___mm = (__vma)->vm_mm;                       \
+       pte_t ___pte;                                                   \
+                                                                       \
+       ___pte = ptep_huge_clear_flush(__vma, __address, __ptep);               \
+       mmu_notifier_invalidate_range(___mm, ___addr,                   \
+                                       ___addr + HPAGE_CONT_PTE_SIZE);         \
+                                                                       \
+       ___pte;                                                         \
+})
+#endif /* CONFIG_FINEGRAINED_THP */
+
 #define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd)            \
 ({                                                                     \
        unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;              \
@@ -737,6 +752,10 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
 #define pudp_huge_clear_flush_notify pudp_huge_clear_flush
 #define set_pte_at_notify set_pte_at
 
+#ifdef CONFIG_FINEGRAINED_THP
+#define        ptep_huge_clear_flush_notify ptep_huge_clear_flush
+#endif
+
 static inline void mmu_notifier_synchronize(void)
 {
 }
index 9d0c454..26df92e 100644 (file)
@@ -193,9 +193,19 @@ enum node_stat_item {
        NR_SHMEM,               /* shmem pages (included tmpfs/GEM pages) */
        NR_SHMEM_THPS,
        NR_SHMEM_PMDMAPPED,
+#ifdef CONFIG_FINEGRAINED_THP
+       NR_SHMEM_PTEMAPPED,
+       NR_FILE_64KB_THPS,
+#endif /* CONFIG_FINEGRAINED_THP */
        NR_FILE_THPS,
+#ifdef CONFIG_FINEGRAINED_THP
+       NR_FILE_PTEMAPPED,
+#endif /* CONFIG_FINEGRAINED_THP */
        NR_FILE_PMDMAPPED,
        NR_ANON_THPS,
+#ifdef CONFIG_FINEGRAINED_THP
+       NR_ANON_64KB_THPS,
+#endif
        NR_VMSCAN_WRITE,
        NR_VMSCAN_IMMEDIATE,    /* Prioritise for reclaim when writeback ends */
        NR_DIRTIED,             /* page dirtyings since bootup */
index 7c869ea..62a80bf 100644 (file)
@@ -1300,7 +1300,9 @@ static inline int p4d_clear_huge(p4d_t *p4d)
        return 0;
 }
 #endif /* !__PAGETABLE_P4D_FOLDED */
-
+#ifdef CONFIG_FINEGRAINED_THP
+int cont_pte_set_huge(pte_t *pte, phys_addr_t addr, pgprot_t prot);
+#endif /* CONFIG_FINEGRAINED_THP */
 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
 int pud_clear_huge(pud_t *pud);
@@ -1309,6 +1311,12 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
 int pud_free_pmd_page(pud_t *pud, unsigned long addr);
 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
 #else  /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+#ifdef CONFIG_FINEGRAINED_THP
+static inline int cont_pte_set_huge(pte_t *pte, phys_addr_t addr, pgprot_t prot)
+{
+       return 0;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
 static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
 {
        return 0;
index def5c62..6d6f374 100644 (file)
@@ -98,6 +98,9 @@ enum ttu_flags {
        TTU_RMAP_LOCKED         = 0x80, /* do not grab rmap lock:
                                         * caller holds it */
        TTU_SPLIT_FREEZE        = 0x100,                /* freeze pte under splitting thp */
+#ifdef CONFIG_FINEGRAINED_THP
+       TTU_SPLIT_HUGE_PTE      = 0x200,        /* split huge PTE if any */
+#endif
 };
 
 #ifdef CONFIG_MMU
index d9b7c91..71aa4b7 100644 (file)
@@ -274,6 +274,12 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
 {
        return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
 }
+#ifdef CONFIG_FINEGRAINED_THP
+static inline int is_pte_migration_entry(pte_t pte)
+{
+       return !pte_present(pte) && is_migration_entry(pte_to_swp_entry(pte));
+}
+#endif
 #else
 static inline void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page)
@@ -303,6 +309,13 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
 {
        return 0;
 }
+
+#ifdef CONFIG_FINEGRAINED_THP
+static inline int is_pte_migration_entry(pte_t pte)
+{
+       return 0;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
 #endif
 
 #ifdef CONFIG_MEMORY_FAILURE
index 21d7c7f..77e4bdd 100644 (file)
@@ -95,6 +95,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
                THP_SPLIT_PAGE,
                THP_SPLIT_PAGE_FAILED,
                THP_DEFERRED_SPLIT_PAGE,
+#ifdef CONFIG_FINEGRAINED_THP
+               THP_SPLIT_CONT_PTE,
+#endif
                THP_SPLIT_PMD,
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
                THP_SPLIT_PUD,
index f94f65d..f5d33b8 100644 (file)
 #define MAP_SYNC               0x080000 /* perform synchronous page faults for the mapping */
 #define MAP_FIXED_NOREPLACE    0x100000        /* MAP_FIXED which doesn't unmap underlying mapping */
 
+#ifdef CONFIG_FINEGRAINED_THP
+#define MAP_FILE_THP   0x200000        /* map the file with transparent hugepages */
+#endif
+
 #define MAP_UNINITIALIZED 0x4000000    /* For anonymous mmap, memory could be
                                         * uninitialized */
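How user space is expected to pass MAP_FILE_THP is not shown in this hunk, so treat the following as a sketch only: the flag value comes from the definition above, while its semantics (a request to back a file mapping with transparent hugepages) are an assumption here. The mapping is never accessed, so the file length does not matter.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_FILE_THP
#define MAP_FILE_THP 0x200000	/* value taken from the uapi hunk above */
#endif

int main(void)
{
	size_t len = 64 * 1024;		/* one fTHP-sized (64KB) chunk */
	int fd = open("/proc/self/exe", O_RDONLY);
	void *p;

	if (fd < 0)
		return 1;
	/* assumed semantics: hint that this file mapping should use THPs */
	p = mmap(NULL, len, PROT_READ, MAP_PRIVATE | MAP_FILE_THP, fd, 0);
	if (p == MAP_FAILED)
		perror("mmap");
	else
		munmap(p, len);
	close(fd);
	return 0;
}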
 
index c99de4a..f3bb8b2 100644 (file)
@@ -177,6 +177,7 @@ endchoice
 
 config CMA_ALIGNMENT
        int "Maximum PAGE_SIZE order of alignment for contiguous buffers"
+       range 9 12 if FINEGRAINED_THP
        range 2 12
        default 8
        help
index 00b0358..50bc0fd 100644 (file)
@@ -475,7 +475,14 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
 
 retry:
        if (is_register)
+#ifdef CONFIG_FINEGRAINED_THP
+       {
+               gup_flags |= FOLL_SPLIT_PMD | FOLL_SPLIT_PTE;
+		pr_info("THP-%s: setting FOLL_SPLIT_PTE (comm: %s)\n", __func__, current->comm);
+       }
+#else /* CONFIG_FINEGRAINED_THP */
                gup_flags |= FOLL_SPLIT_PMD;
+#endif /* CONFIG_FINEGRAINED_THP */
        /* Read the page with vaddr into memory */
        ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
                                    &old_page, &vma, NULL);
index ffcae7b..3965f52 100644 (file)
@@ -875,6 +875,23 @@ config READ_ONLY_THP_FOR_FS
          support of file THPs will be developed in the next few release
          cycles.
 
+config FINEGRAINED_THP
+       bool "Fine-grained THP support (EXPERIMENTAL)"
+       depends on TRANSPARENT_HUGEPAGE
+
+       help
+	  Allow khugepaged to create 64KB hugepages and allow 64KB
+	  hugepages to be allocated directly on page faults.
+
+	  This is currently supported only on the ARM64 architecture.
+
+config THP_CONSERVATIVE
+       bool "A conservative policy for fTHP (EXPERIMENTAL)"
+       depends on FINEGRAINED_THP
+
+       help
+	  With the conservative policy, only khugepaged can create hugepages.
+
 config ARCH_HAS_PTE_SPECIAL
        bool
 
index 125b69f..02099ca 100644 (file)
@@ -206,7 +206,14 @@ static void unaccount_page_cache_page(struct address_space *mapping,
                if (PageTransHuge(page))
                        __dec_node_page_state(page, NR_SHMEM_THPS);
        } else if (PageTransHuge(page)) {
+#ifdef CONFIG_FINEGRAINED_THP
+               if (thp_nr_pages(page) == HPAGE_PMD_NR)
+                       __dec_node_page_state(page, NR_FILE_THPS);
+               else
+                       __dec_node_page_state(page, NR_FILE_64KB_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
                __dec_node_page_state(page, NR_FILE_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
                filemap_nr_thps_dec(mapping);
        }
 
index 054ff92..cd32ef8 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -447,6 +447,10 @@ retry:
                        return ERR_PTR(ret);
                goto retry;
        }
+#ifdef CONFIG_FINEGRAINED_THP
+       else if (flags & FOLL_SPLIT_PTE && pte_cont(pte))
+               split_huge_pte(vma, pmd, ptep, address);
+#endif /* CONFIG_FINEGRAINED_THP */
 
        /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
        if (unlikely(!try_grab_page(page, flags))) {
index 4a78514..20ea663 100644 (file)
@@ -753,6 +753,13 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
        return __do_huge_pmd_anonymous_page(vmf, page, gfp);
 }
 
+#ifndef CONFIG_FINEGRAINED_THP
+vm_fault_t do_huge_pte_anonymous_page(struct vm_fault *vmf)
+{
+       return VM_FAULT_FALLBACK;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
                pgtable_t pgtable)
@@ -1109,6 +1116,9 @@ out:
        return ret;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+#endif /* CONFIG_FINEGRAINED_THP */
+
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
                pud_t *pud, int flags)
@@ -1660,6 +1670,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        if (vma_is_special_huge(vma)) {
                if (arch_needs_pgtable_deposit())
                        zap_deposited_table(tlb->mm, pmd);
+               atomic_long_dec(&nr_phys_huge_pmd_pages);
                spin_unlock(ptl);
                if (is_huge_zero_pmd(orig_pmd))
                        tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
@@ -2183,6 +2194,61 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        }
 }
 
+static int thp_pte_alloc_locked(struct mm_struct *mm, pmd_t *pmd)
+{
+       pgtable_t new = pte_alloc_one(mm);
+       if (!new)
+               return -ENOMEM;
+
+       if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
+               mm_inc_nr_ptes(mm);
+               pmd_populate(mm, pmd, new);
+               new = NULL;
+       }
+       if (new)
+               pte_free(mm, new);
+       return 0;
+}
+
+static int thp_remap_pte_range_locked(struct mm_struct *mm, pmd_t *pmd,
+                       unsigned long addr, unsigned long end,
+                       unsigned long pfn, pgprot_t prot)
+{
+       pte_t *pte;
+       int err = 0;
+
+       err = thp_pte_alloc_locked(mm, pmd);
+       if (err)
+               return err;
+
+       pte = pte_offset_map(pmd, addr);
+       if (!pte)
+               return -ENOMEM;
+
+       arch_enter_lazy_mmu_mode();
+       do {
+               BUG_ON(!pte_none(*pte));
+               if (!pfn_modify_allowed(pfn, prot)) {
+                       err = -EACCES;
+                       break;
+               }
+
+               set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
+               pfn++;
+               pte++;
+               addr += PAGE_SIZE;
+       } while (addr != end);
+       arch_leave_lazy_mmu_mode();
+       return err;
+}
+
+static inline pgprot_t thp_pmd_pgprot(pmd_t pmd)
+{
+       unsigned long pfn = pmd_pfn(pmd);
+
+       return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd));
+}
+
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze, struct page *page)
 {
@@ -2209,7 +2275,19 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
        }
 
 repeat:
-       if (pmd_trans_huge(*pmd)) {
+       if (pmd_trans_huge(*pmd) && !vm_normal_page_pmd(vma, address, *pmd)) {
+               struct mm_struct *mm = vma->vm_mm;
+               unsigned long haddr = address & HPAGE_PMD_MASK;
+               pmd_t orig_pmd;
+
+               orig_pmd = pmdp_huge_get_and_clear_full(vma, haddr, pmd, 0);
+               atomic_long_dec(&nr_phys_huge_pmd_pages);
+               thp_remap_pte_range_locked(mm, pmd, haddr,
+                                          haddr + HPAGE_PMD_SIZE,
+                                          pmd_pfn(orig_pmd),
+                                          thp_pmd_pgprot(orig_pmd));
+               goto out;
+       } else if (pmd_trans_huge(*pmd) && vm_normal_page_pmd(vma, address, *pmd)) {
                if (!page) {
                        page = pmd_page(*pmd);
                        /*
@@ -2301,7 +2379,12 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
            (start & HPAGE_PMD_MASK) >= vma->vm_start &&
            (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
                split_huge_pmd_address(vma, start, false, NULL);
-
+#ifdef CONFIG_FINEGRAINED_THP
+       if (start & ~HPAGE_CONT_PTE_MASK &&
+               (start & HPAGE_CONT_PTE_MASK) >= vma->vm_start &&
+               (start & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE <= vma->vm_end)
+               split_huge_pte_address(vma, start, false, NULL);
+#endif
        /*
         * If the new end address isn't hpage aligned and it could
         * previously contain an hugepage: check if we need to split
@@ -2311,6 +2394,12 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
            (end & HPAGE_PMD_MASK) >= vma->vm_start &&
            (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
                split_huge_pmd_address(vma, end, false, NULL);
+#ifdef CONFIG_FINEGRAINED_THP
+       if (end & ~HPAGE_CONT_PTE_MASK &&
+               (end & HPAGE_CONT_PTE_MASK) >= vma->vm_start &&
+               (end & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE <= vma->vm_end)
+               split_huge_pte_address(vma, end, false, NULL);
+#endif
 
        /*
         * If we're also updating the vma->vm_next->vm_start, if the new
@@ -2325,17 +2414,34 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
                    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
                    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
                        split_huge_pmd_address(next, nstart, false, NULL);
+#ifdef CONFIG_FINEGRAINED_THP
+               if (nstart & ~HPAGE_CONT_PTE_MASK &&
+                       (nstart & HPAGE_CONT_PTE_MASK) >= next->vm_start &&
+                       (nstart & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE <= next->vm_end)
+                       split_huge_pte_address(next, nstart, false, NULL);
+#endif
        }
 }
 
 static void unmap_page(struct page *page)
 {
+#ifdef CONFIG_FINEGRAINED_THP
+       enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
+               TTU_RMAP_LOCKED;
+#else
        enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
                TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
+#endif
        bool unmap_success;
 
        VM_BUG_ON_PAGE(!PageHead(page), page);
 
+#ifdef CONFIG_FINEGRAINED_THP
+       if (compound_order(page) == HPAGE_PMD_ORDER)
+               ttu_flags |= TTU_SPLIT_HUGE_PMD;
+       else
+               ttu_flags |= TTU_SPLIT_HUGE_PTE;
+#endif /* CONFIG_FINEGRAINED_THP */
        if (PageAnon(page))
                ttu_flags |= TTU_SPLIT_FREEZE;
 
@@ -2720,8 +2826,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                if (mapping) {
                        if (PageSwapBacked(head))
                                __dec_node_page_state(head, NR_SHMEM_THPS);
-                       else
+                       else {
+#ifdef CONFIG_FINEGRAINED_THP
+                               if (thp_nr_pages(head) == HPAGE_CONT_PTE_NR)
+                                       __dec_node_page_state(head, NR_FILE_64KB_THPS);
+                               else
+#endif /* CONFIG_FINEGRAINED_THP */
                                __dec_node_page_state(head, NR_FILE_THPS);
+                       }
                }
 
                __split_huge_page(page, list, end, flags);
index c43ccdd..171d962 100644 (file)
@@ -612,6 +612,9 @@ static inline bool is_migrate_highatomic_page(struct page *page)
 
 void setup_zone_pageset(struct zone *zone);
 
+extern atomic_long_t nr_phys_cont_pte_pages;
+extern atomic_long_t nr_phys_huge_pmd_pages;
+
 struct migration_target_control {
        int nid;                /* preferred node id */
        nodemask_t *nmask;
index 5fa1ab4..50a9121 100644 (file)
@@ -21,6 +21,10 @@ static int __read_mostly ioremap_pud_capable;
 static int __read_mostly ioremap_pmd_capable;
 static int __read_mostly ioremap_huge_disabled;
 
+#ifdef CONFIG_FINEGRAINED_THP
+static int __read_mostly ioremap_cont_pte_capable;
+#endif
+
 static int __init set_nohugeiomap(char *str)
 {
        ioremap_huge_disabled = 1;
@@ -55,12 +59,45 @@ static inline int ioremap_pmd_enabled(void)
        return ioremap_pmd_capable;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static inline int ioremap_cont_pte_enabled(void)
+{
+       return ioremap_cont_pte_capable;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
 #else  /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
 static inline int ioremap_p4d_enabled(void) { return 0; }
 static inline int ioremap_pud_enabled(void) { return 0; }
 static inline int ioremap_pmd_enabled(void) { return 0; }
+#ifdef CONFIG_FINEGRAINED_THP
+static inline int ioremap_cont_pte_enabled(void) { return 0; }
+#endif /* CONFIG_FINEGRAINED_THP */
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
+#ifdef CONFIG_FINEGRAINED_THP
+static int ioremap_try_huge_pte(pte_t *pte, unsigned long addr,
+                               unsigned long end, phys_addr_t phys_addr,
+                               pgprot_t prot)
+{
+       int i;
+
+       if (!ioremap_cont_pte_enabled())
+               return 0;
+	if ((end - addr) < CONT_PTE_SIZE)
+               return 0;
+       if (!IS_ALIGNED(addr, CONT_PTE_SIZE))
+               return 0;
+       if (!IS_ALIGNED(phys_addr, CONT_PTE_SIZE))
+               return 0;
+
+       for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
+               if (pte_present(*(pte + i)))
+                       return 0;
+       return cont_pte_set_huge(pte, phys_addr, prot);
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
 static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
                unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
                pgtbl_mod_mask *mask)
@@ -73,9 +110,23 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
        if (!pte)
                return -ENOMEM;
        do {
+#ifdef CONFIG_FINEGRAINED_THP
+		if (addr + HPAGE_CONT_PTE_SIZE <= end &&
+                               ioremap_try_huge_pte(pte, addr, end, phys_addr, prot)) {
+                       pte += HPAGE_CONT_PTE_NR - 1;
+                       pfn += HPAGE_CONT_PTE_NR;
+                       addr += HPAGE_CONT_PTE_SIZE - PAGE_SIZE;
+                       phys_addr += HPAGE_CONT_PTE_SIZE;
+                       continue;
+               }
+
+#endif /* CONFIG_FINEGRAINED_THP */
                BUG_ON(!pte_none(*pte));
                set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
                pfn++;
+#ifdef CONFIG_FINEGRAINED_THP
+               phys_addr += PAGE_SIZE;
+#endif /* CONFIG_FINEGRAINED_THP */
        } while (pte++, addr += PAGE_SIZE, addr != end);
        *mask |= PGTBL_PTE_MODIFIED;
        return 0;
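ioremap_try_huge_pte() above only installs a contiguous-PTE mapping when the remaining range is at least HPAGE_CONT_PTE_SIZE and both the virtual and the physical address are 64KB aligned (with 4KB base pages). A hypothetical driver sketch of the case it targets (DEMO_MMIO_BASE and the fthp_ioremap_demo_* names are made up; whether the mapping really ends up using cont-PTEs also depends on the arch reporting ioremap_cont_pte_capable):

#include <linux/init.h>
#include <linux/io.h>
#include <linux/module.h>
#include <linux/sizes.h>

#define DEMO_MMIO_BASE	0x40000000UL	/* made-up, platform-specific address */

static void __iomem *demo_regs;

static int __init fthp_ioremap_demo_init(void)
{
	/* 64KB-aligned, 64KB-sized region: eligible for a single cont-PTE mapping */
	demo_regs = ioremap(DEMO_MMIO_BASE, SZ_64K);
	return demo_regs ? 0 : -ENOMEM;
}

static void __exit fthp_ioremap_demo_exit(void)
{
	iounmap(demo_regs);
}

module_init(fthp_ioremap_demo_init);
module_exit(fthp_ioremap_demo_exit);
MODULE_LICENSE("GPL");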
index abab394..aa96e8e 100644 (file)
@@ -21,6 +21,8 @@
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
+#include <asm/finegrained_thp.h>
+#include <asm/huge_mm.h>
 #include "internal.h"
 
 enum scan_result {
@@ -78,6 +80,32 @@ static unsigned int khugepaged_max_ptes_none __read_mostly;
 static unsigned int khugepaged_max_ptes_swap __read_mostly;
 static unsigned int khugepaged_max_ptes_shared __read_mostly;
 
+#ifdef CONFIG_FINEGRAINED_THP
+/*
+ * thp_scan_hint:
+ * used to give khugepaged a hint about which
+ * address space has been changed recently.
+ */
+struct thp_scan_hint {
+       struct mm_slot *slot;
+       struct vm_area_struct *vma;
+       unsigned long diff;             /* memory difference */
+       unsigned long jiffies;          /* time stamp for profiling purpose */
+       struct list_head hint_list;
+};
+
+/* THP type descriptor */
+enum {
+       THP_TYPE_FAIL,  /* cannot make hugepage */
+       THP_TYPE_64KB,  /* 64KB hugepage can be made, use CONT_PTE */
+       THP_TYPE_2MB,   /* 2MB hugepage can be made, use PMD */
+};
+
+static unsigned int khugepaged_max_ptes_none_64kb __read_mostly;
+static unsigned int khugepaged_max_ptes_swap_64kb __read_mostly;
+static unsigned int khugepaged_max_ptes_shared_64kb __read_mostly;
+#endif /* CONFIG_FINEGRAINED_THP */
+
 #define MM_SLOTS_HASH_BITS 10
 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
 
@@ -113,10 +141,18 @@ struct khugepaged_scan {
        struct list_head mm_head;
        struct mm_slot *mm_slot;
        unsigned long address;
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_type;
+       int nr_hint;
+       struct list_head hint_list;
+#endif /* CONFIG_FINEGRAINED_THP */
 };
 
 static struct khugepaged_scan khugepaged_scan = {
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+#ifdef CONFIG_FINEGRAINED_THP
+       .hint_list = LIST_HEAD_INIT(khugepaged_scan.hint_list),
+#endif
 };
 
 #ifdef CONFIG_SYSFS
@@ -394,6 +430,11 @@ int __init khugepaged_init(void)
        khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
        khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       khugepaged_max_ptes_none_64kb = HPAGE_CONT_PTE_NR - 1;
+       khugepaged_max_ptes_swap_64kb = HPAGE_CONT_PTE_NR / 8;
+       khugepaged_max_ptes_shared_64kb = HPAGE_CONT_PTE_NR / 2;
+#endif
        return 0;
 }
 
@@ -437,6 +478,10 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
        return atomic_read(&mm->mm_users) == 0;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static void clear_hint_list(struct mm_slot *slot);
+#endif /* CONFIG_FINEGRAINED_THP */
+
 static bool hugepage_vma_check(struct vm_area_struct *vma,
                               unsigned long vm_flags)
 {
@@ -445,8 +490,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
            test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
                return false;
 
+       /* Check arch-dependent shmem hugepage available */
+       if (arch_hugepage_vma_shmem_check(vma, vm_flags))
+               return true;
        /* Enabled via shmem mount options or sysfs settings. */
-       if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
+       else if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
                return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
                                HPAGE_PMD_NR);
        }
@@ -455,8 +503,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
        if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
                return false;
 
+       /* Check arch-dependent file hugepage available */
+       if (arch_hugepage_vma_file_check(vma, vm_flags))
+               return true;
        /* Read-only file mappings need to be aligned for THP to work. */
-       if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+       else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
            (vm_flags & VM_DENYWRITE)) {
                return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
                                HPAGE_PMD_NR);
@@ -519,6 +570,12 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
        hend = vma->vm_end & HPAGE_PMD_MASK;
        if (hstart < hend)
                return khugepaged_enter(vma, vm_flags);
+#ifdef CONFIG_FINEGRAINED_THP
+       hstart = (vma->vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK;
+       hend = vma->vm_end & HPAGE_CONT_PTE_MASK;
+       if (hstart < hend)
+               return khugepaged_enter(vma, vm_flags);
+#endif /* CONFIG_FINEGRAINED_THP */
        return 0;
 }
 
@@ -530,6 +587,9 @@ void __khugepaged_exit(struct mm_struct *mm)
        spin_lock(&khugepaged_mm_lock);
        mm_slot = get_mm_slot(mm);
        if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+#ifdef CONFIG_FINEGRAINED_THP
+               clear_hint_list(mm_slot);
+#endif
                hash_del(&mm_slot->hash);
                list_del(&mm_slot->mm_node);
                free = 1;
@@ -594,23 +654,56 @@ static bool is_refcount_suitable(struct page *page)
        return page_count(page) == expected_refcount;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+                                       unsigned long address,
+                                       pte_t *pte,
+                                       struct list_head *compound_pagelist,
+                                       int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                                        unsigned long address,
                                        pte_t *pte,
                                        struct list_head *compound_pagelist)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        struct page *page = NULL;
        pte_t *_pte;
        int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
        bool writable = false;
+#ifdef CONFIG_FINEGRAINED_THP
+       int max_ptes_shared, max_ptes_none;
+       int hpage_nr;
+
+       if (hpage_type == THP_TYPE_64KB) {
+               hpage_nr = HPAGE_CONT_PTE_NR;
+               max_ptes_shared = khugepaged_max_ptes_shared_64kb;
+               max_ptes_none = khugepaged_max_ptes_none_64kb;
+       } else {
+               hpage_nr = HPAGE_PMD_NR;
+               max_ptes_shared = khugepaged_max_ptes_shared;
+               max_ptes_none = khugepaged_max_ptes_none;
+       }
+#endif /* CONFIG_FINEGRAINED_THP */
 
-       for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+       for (_pte = pte;
+#ifdef CONFIG_FINEGRAINED_THP
+               _pte < pte + hpage_nr;
+#else
+               _pte < pte+HPAGE_PMD_NR;
+#endif
             _pte++, address += PAGE_SIZE) {
                pte_t pteval = *_pte;
                if (pte_none(pteval) || (pte_present(pteval) &&
                                is_zero_pfn(pte_pfn(pteval)))) {
+#ifdef CONFIG_FINEGRAINED_THP
                        if (!userfaultfd_armed(vma) &&
-                           ++none_or_zero <= khugepaged_max_ptes_none) {
+                           ++none_or_zero <= max_ptes_none)
+#else /* CONFIG_FINEGRAINED_THP */
+                       if (!userfaultfd_armed(vma) &&
+                           ++none_or_zero <= khugepaged_max_ptes_none)
+#endif /* CONFIG_FINEGRAINED_THP */
+                       {
                                continue;
                        } else {
                                result = SCAN_EXCEED_NONE_PTE;
@@ -629,8 +722,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
                VM_BUG_ON_PAGE(!PageAnon(page), page);
 
+#ifdef CONFIG_FINEGRAINED_THP
+               if (page_mapcount(page) > 1 &&
+                               ++shared > max_ptes_shared)
+#else /* CONFIG_FINEGRAINED_THP */
                if (page_mapcount(page) > 1 &&
-                               ++shared > khugepaged_max_ptes_shared) {
+                               ++shared > khugepaged_max_ptes_shared)
+#endif /* CONFIG_FINEGRAINED_THP */
+               {
                        result = SCAN_EXCEED_SHARED_PTE;
                        goto out;
                }
@@ -732,15 +831,34 @@ out:
        return 0;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+                                     struct vm_area_struct *vma,
+                                     unsigned long address,
+                                     spinlock_t *ptl,
+                                     struct list_head *compound_pagelist,
+                                     int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
                                      struct vm_area_struct *vma,
                                      unsigned long address,
                                      spinlock_t *ptl,
                                      struct list_head *compound_pagelist)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        struct page *src_page, *tmp;
        pte_t *_pte;
-       for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_nr = (hpage_type == THP_TYPE_64KB ?
+                                       HPAGE_CONT_PTE_NR : HPAGE_PMD_NR);
+#endif
+
+       for (_pte = pte;
+#ifdef CONFIG_FINEGRAINED_THP
+                               _pte < pte + hpage_nr;
+#else
+                               _pte < pte + HPAGE_PMD_NR;
+#endif
                                _pte++, page++, address += PAGE_SIZE) {
                pte_t pteval = *_pte;
 
@@ -894,12 +1012,21 @@ static int khugepaged_find_target_node(void)
        return 0;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static inline struct page *alloc_khugepaged_hugepage(int hpage_order)
+#else
 static inline struct page *alloc_khugepaged_hugepage(void)
+#endif
 {
        struct page *page;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
+                          hpage_order);
+#else
        page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
                           HPAGE_PMD_ORDER);
+#endif
        if (page)
                prep_transhuge_page(page);
        return page;
@@ -910,7 +1037,11 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
        struct page *hpage;
 
        do {
+#ifdef CONFIG_FINEGRAINED_THP
+               hpage = alloc_khugepaged_hugepage(HPAGE_PMD_ORDER);
+#else
                hpage = alloc_khugepaged_hugepage();
+#endif
                if (!hpage) {
                        count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                        if (!*wait)
@@ -948,6 +1079,21 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
        return true;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node, int hpage_type)
+{
+       struct page *page;
+
+       if (hpage_type == THP_TYPE_64KB)
+               page = alloc_khugepaged_hugepage(HPAGE_CONT_PTE_ORDER);
+       else {
+               VM_BUG_ON(!*hpage);
+               page = *hpage;
+       }
+       return page;
+}
+#else /* CONFIG_FINEGRAINED_THP */
 static struct page *
 khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
 {
@@ -955,6 +1101,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
 
        return  *hpage;
 }
+#endif /* CONFIG_FINEGRAINED_THP */
 #endif
 
 /*
@@ -964,8 +1111,13 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
  * value (scan code).
  */
 
+#ifdef CONFIG_FINEGRAINED_THP
+static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
+               struct vm_area_struct **vmap, int hpage_type)
+#else
 static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
                struct vm_area_struct **vmap)
+#endif
 {
        struct vm_area_struct *vma;
        unsigned long hstart, hend;
@@ -977,6 +1129,17 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
        if (!vma)
                return SCAN_VMA_NULL;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_64KB) {
+               hstart = (vma->vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK;
+               hend = vma->vm_end & HPAGE_CONT_PTE_MASK;
+               if (address < hstart || address + HPAGE_CONT_PTE_SIZE > hend)
+                       return SCAN_ADDRESS_RANGE;
+               if (!hugepage_vma_check(vma, vma->vm_flags))
+                       return SCAN_VMA_CHECK;
+               return 0;
+       }
+#endif /* CONFIG_FINEGRAINED_THP */
        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = vma->vm_end & HPAGE_PMD_MASK;
        if (address < hstart || address + HPAGE_PMD_SIZE > hend)
@@ -997,10 +1160,17 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
  * but with mmap_lock held to protect against vma changes.
  */
 
+#ifdef CONFIG_FINEGRAINED_THP
+static bool __collapse_huge_page_swapin(struct mm_struct *mm,
+                                       struct vm_area_struct *vma,
+                                       unsigned long address, pmd_t *pmd,
+                                       int referenced, int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static bool __collapse_huge_page_swapin(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmd,
                                        int referenced)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        int swapped_in = 0;
        vm_fault_t ret = 0;
@@ -1011,9 +1181,18 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
                .pmd = pmd,
                .pgoff = linear_page_index(vma, address),
        };
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_size = (hpage_type == THP_TYPE_64KB) ?
+                                               HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE;
+#endif
 
        vmf.pte = pte_offset_map(pmd, address);
-       for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+       for (;
+#ifdef CONFIG_FINEGRAINED_THP
+                       vmf.address < address + hpage_size;
+#else
+                       vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+#endif
                        vmf.pte++, vmf.address += PAGE_SIZE) {
                vmf.orig_pte = *vmf.pte;
                if (!is_swap_pte(vmf.orig_pte))
@@ -1024,7 +1203,12 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
                /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
                if (ret & VM_FAULT_RETRY) {
                        mmap_read_lock(mm);
-                       if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
+#ifdef CONFIG_FINEGRAINED_THP
+                       if (hugepage_vma_revalidate(mm, address, &vmf.vma, hpage_type))
+#else
+                       if (hugepage_vma_revalidate(mm, address, &vmf.vma))
+#endif
+                       {
                                /* vma is no longer available, don't continue to swapin */
                                trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
                                return false;
@@ -1053,10 +1237,18 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
        return true;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static void collapse_huge_page(struct mm_struct *mm,
+                                  unsigned long address,
+                                  struct page **hpage,
+                                  int node, int referenced, int unmapped,
+                                  int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static void collapse_huge_page(struct mm_struct *mm,
                                   unsigned long address,
                                   struct page **hpage,
                                   int node, int referenced, int unmapped)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        LIST_HEAD(compound_pagelist);
        pmd_t *pmd, _pmd;
@@ -1069,7 +1261,14 @@ static void collapse_huge_page(struct mm_struct *mm,
        struct mmu_notifier_range range;
        gfp_t gfp;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       pte_t _pte;
+
+       VM_BUG_ON(address & (hpage_type == THP_TYPE_64KB ?
+                               ~HPAGE_CONT_PTE_MASK : ~HPAGE_PMD_MASK));
+#else
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#endif
 
        /* Only allocate from the target node */
        gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
@@ -1081,7 +1280,11 @@ static void collapse_huge_page(struct mm_struct *mm,
         * that. We will recheck the vma after taking it again in write mode.
         */
        mmap_read_unlock(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+       new_page = khugepaged_alloc_page(hpage, gfp, node, hpage_type);
+#else
        new_page = khugepaged_alloc_page(hpage, gfp, node);
+#endif
        if (!new_page) {
                result = SCAN_ALLOC_HUGE_PAGE_FAIL;
                goto out_nolock;
@@ -1094,7 +1297,11 @@ static void collapse_huge_page(struct mm_struct *mm,
        count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
 
        mmap_read_lock(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+       result = hugepage_vma_revalidate(mm, address, &vma, hpage_type);
+#else
        result = hugepage_vma_revalidate(mm, address, &vma);
+#endif
        if (result) {
                mmap_read_unlock(mm);
                goto out_nolock;
@@ -1112,11 +1319,19 @@ static void collapse_huge_page(struct mm_struct *mm,
         * If it fails, we release mmap_lock and jump out_nolock.
         * Continuing to collapse causes inconsistency.
         */
+#ifdef CONFIG_FINEGRAINED_THP
+       if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
+                                                    pmd, referenced, hpage_type)) {
+               mmap_read_unlock(mm);
+               goto out_nolock;
+       }
+#else /* CONFIG_FINEGRAINED_THP */
        if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
                                                     pmd, referenced)) {
                mmap_read_unlock(mm);
                goto out_nolock;
        }
+#endif /* CONFIG_FINEGRAINED_THP*/
 
        mmap_read_unlock(mm);
        /*
@@ -1125,7 +1340,11 @@ static void collapse_huge_page(struct mm_struct *mm,
         * handled by the anon_vma lock + PG_lock.
         */
        mmap_write_lock(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+       result = hugepage_vma_revalidate(mm, address, &vma, hpage_type);
+#else
        result = hugepage_vma_revalidate(mm, address, &vma);
+#endif
        if (result)
                goto out;
        /* check if the pmd is still valid */
@@ -1134,8 +1353,14 @@ static void collapse_huge_page(struct mm_struct *mm,
 
        anon_vma_lock_write(vma->anon_vma);
 
+#ifdef CONFIG_FINEGRAINED_THP
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+                               address, address + (hpage_type == THP_TYPE_64KB ?
+                               HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE));
+#else
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
                                address, address + HPAGE_PMD_SIZE);
+#endif
        mmu_notifier_invalidate_range_start(&range);
 
        pte = pte_offset_map(pmd, address);
@@ -1148,16 +1373,38 @@ static void collapse_huge_page(struct mm_struct *mm,
         * huge and small TLB entries for the same virtual address
         * to avoid the risk of CPU bugs in that area.
         */
-       _pmd = pmdp_collapse_flush(vma, address, pmd);
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_64KB)
+		/* FIXME: clearing the ptes here would make
+		 * __collapse_huge_page_isolate and __collapse_huge_page_copy
+		 * fail; __collapse_huge_page_copy clears the ptes itself.
+		 */
+               ;
+       else
+#endif /* CONFIG_FINEGRAINED_THP */
+               _pmd = pmdp_collapse_flush(vma, address, pmd);
        spin_unlock(pmd_ptl);
        mmu_notifier_invalidate_range_end(&range);
 
        spin_lock(pte_ptl);
+#ifdef CONFIG_FINEGRAINED_THP
+       isolated = __collapse_huge_page_isolate(vma, address, pte,
+                       &compound_pagelist, hpage_type);
+#else /* CONFIG_FINEGRAINED_THP */
        isolated = __collapse_huge_page_isolate(vma, address, pte,
                        &compound_pagelist);
+#endif /* CONFIG_FINEGRAINED_THP */
        spin_unlock(pte_ptl);
 
        if (unlikely(!isolated)) {
+#ifdef CONFIG_FINEGRAINED_THP
+               if (hpage_type == THP_TYPE_64KB) {
+                       pte_unmap(pte);
+                       anon_vma_unlock_write(vma->anon_vma);
+                       result = SCAN_FAIL;
+                       goto out;
+               }
+#endif /* CONFIG_FINEGRAINED_THP */
                pte_unmap(pte);
                spin_lock(pmd_ptl);
                BUG_ON(!pmd_none(*pmd));
@@ -1179,15 +1426,34 @@ static void collapse_huge_page(struct mm_struct *mm,
         */
        anon_vma_unlock_write(vma->anon_vma);
 
+#ifdef CONFIG_FINEGRAINED_THP
+       __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
+                       &compound_pagelist, hpage_type);
+#else /* CONFIG_FINEGRAINED_THP */
        __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
                        &compound_pagelist);
+#endif /* CONFIG_FINEGRAINED_THP */
        pte_unmap(pte);
        __SetPageUptodate(new_page);
+
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_64KB) {
+               /* 64KB hugepage */
+               _pte = arch_make_huge_pte(new_page, vma);
+               _pte = maybe_mkwrite(pte_mkdirty(_pte), vma);
+       } else {
+               /* 2MB hugepage */
+               pgtable = pmd_pgtable(_pmd);
+
+               _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+               _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+       }
+#else /* CONFIG_FINEGRAINED_THP */
        pgtable = pmd_pgtable(_pmd);
 
        _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
        _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
-
+#endif /* CONFIG_FINEGRAINED_THP */
        /*
         * spin_lock() below is not the equivalent of smp_wmb(), so
         * this is needed to avoid the copy_huge_page writes to become
@@ -1196,15 +1462,32 @@ static void collapse_huge_page(struct mm_struct *mm,
        smp_wmb();
 
        spin_lock(pmd_ptl);
-       BUG_ON(!pmd_none(*pmd));
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_2MB)
+#endif
+               BUG_ON(!pmd_none(*pmd));
        page_add_new_anon_rmap(new_page, vma, address, true);
        lru_cache_add_inactive_or_unevictable(new_page, vma);
+
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_64KB)
+               arch_set_huge_pte_at(mm, address, pte, _pte, 0);
+       else {
+               pgtable_trans_huge_deposit(mm, pmd, pgtable);
+               set_pmd_at(mm, address, pmd, _pmd);
+       }
+       update_mmu_cache_pmd(vma, address, pmd);
+#else /* CONFIG_FINEGRAINED_THP */
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
        update_mmu_cache_pmd(vma, address, pmd);
+#endif /* CONFIG_FINEGRAINED_THP */
        spin_unlock(pmd_ptl);
 
-       *hpage = NULL;
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_2MB)
+#endif
+               *hpage = NULL;
 
        khugepaged_pages_collapsed++;
        result = SCAN_SUCCEED;
@@ -1213,16 +1496,27 @@ out_up_write:
 out_nolock:
        if (!IS_ERR_OR_NULL(*hpage))
                mem_cgroup_uncharge(*hpage);
+#ifdef CONFIG_FINEGRAINED_THP
+       if (result != SCAN_SUCCEED && new_page && hpage_type == THP_TYPE_64KB)
+               put_page(new_page);
+#endif
        trace_mm_collapse_huge_page(mm, isolated, result);
        return;
 out:
        goto out_up_write;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+                              struct vm_area_struct *vma,
+                              unsigned long address,
+                              struct page **hpage, int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static int khugepaged_scan_pmd(struct mm_struct *mm,
                               struct vm_area_struct *vma,
                               unsigned long address,
                               struct page **hpage)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        pmd_t *pmd;
        pte_t *pte, *_pte;
@@ -1234,7 +1528,26 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
        int node = NUMA_NO_NODE, unmapped = 0;
        bool writable = false;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_nr;
+       int max_ptes_swap, max_ptes_none, max_ptes_shared;
+
+       if (hpage_type == THP_TYPE_64KB) {
+               VM_BUG_ON(address & ~HPAGE_CONT_PTE_MASK);
+               hpage_nr = HPAGE_CONT_PTE_NR;
+               max_ptes_swap = khugepaged_max_ptes_swap_64kb;
+               max_ptes_none = khugepaged_max_ptes_none_64kb;
+               max_ptes_shared = khugepaged_max_ptes_shared_64kb;
+       } else {
+               VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+               hpage_nr = HPAGE_PMD_NR;
+               max_ptes_swap = khugepaged_max_ptes_swap;
+               max_ptes_none = khugepaged_max_ptes_none;
+               max_ptes_shared = khugepaged_max_ptes_shared;
+       }
+#else /* CONFIG_FINEGRAINED_THP */
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#endif /* CONFIG_FINEGRAINED_THP */
 
        pmd = mm_find_pmd(mm, address);
        if (!pmd) {
@@ -1244,11 +1557,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 
        memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-       for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+       for (_address = address, _pte = pte;
+#ifdef CONFIG_FINEGRAINED_THP
+               _pte < pte + hpage_nr;
+#else
+               _pte < pte+HPAGE_PMD_NR;
+#endif
             _pte++, _address += PAGE_SIZE) {
                pte_t pteval = *_pte;
                if (is_swap_pte(pteval)) {
-                       if (++unmapped <= khugepaged_max_ptes_swap) {
+#ifdef CONFIG_FINEGRAINED_THP
+                       if (++unmapped <= max_ptes_swap)
+#else
+                       if (++unmapped <= khugepaged_max_ptes_swap)
+#endif
+                       {
                                /*
                                 * Always be strict with uffd-wp
                                 * enabled swap entries.  Please see
@@ -1266,7 +1589,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                }
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
                        if (!userfaultfd_armed(vma) &&
-                           ++none_or_zero <= khugepaged_max_ptes_none) {
+#ifdef CONFIG_FINEGRAINED_THP
+                           ++none_or_zero <= max_ptes_none
+#else
+                           ++none_or_zero <= khugepaged_max_ptes_none
+#endif
+                       )
+                       {
                                continue;
                        } else {
                                result = SCAN_EXCEED_NONE_PTE;
@@ -1299,8 +1628,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                        goto out_unmap;
                }
 
+#ifdef CONFIG_FINEGRAINED_THP
+               if (PageCompound(page) && PageTransHuge(compound_head(page))) {
+                       result = SCAN_PAGE_COMPOUND;
+                       goto out_unmap;
+               }
+
+               if (page_mapcount(page) > 1 &&
+                               ++shared > max_ptes_shared)
+#else
                if (page_mapcount(page) > 1 &&
-                               ++shared > khugepaged_max_ptes_shared) {
+                               ++shared > khugepaged_max_ptes_shared)
+#endif
+               {
                        result = SCAN_EXCEED_SHARED_PTE;
                        goto out_unmap;
                }
@@ -1371,8 +1711,13 @@ out_unmap:
        if (ret) {
                node = khugepaged_find_target_node();
                /* collapse_huge_page will return with the mmap_lock released */
+#ifdef CONFIG_FINEGRAINED_THP
+               collapse_huge_page(mm, address, hpage, node,
+                               referenced, unmapped, hpage_type);
+#else
                collapse_huge_page(mm, address, hpage, node,
                                referenced, unmapped);
+#endif
        }
 out:
        trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
@@ -1387,6 +1732,9 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
        lockdep_assert_held(&khugepaged_mm_lock);
 
        if (khugepaged_test_exit(mm)) {
+#ifdef CONFIG_FINEGRAINED_THP
+               clear_hint_list(mm_slot);
+#endif
                /* free mm_slot */
                hash_del(&mm_slot->hash);
                list_del(&mm_slot->mm_node);
@@ -1408,15 +1756,29 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
  * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
  * khugepaged should try to collapse the page table.
  */
+#ifdef CONFIG_FINEGRAINED_THP
+static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+                                        unsigned long addr, int hpage_type)
+#else
 static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
                                         unsigned long addr)
+#endif
 {
        struct mm_slot *mm_slot;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       VM_BUG_ON(addr & (hpage_type == THP_TYPE_64KB ?
+					~HPAGE_CONT_PTE_MASK : ~HPAGE_PMD_MASK));
+#else
        VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+#endif
 
        spin_lock(&khugepaged_mm_lock);
        mm_slot = get_mm_slot(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_64KB)
+               addr |= 0x01;
+#endif
        if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
                mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
        spin_unlock(&khugepaged_mm_lock);
@@ -1440,10 +1802,26 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
        spinlock_t *ptl;
        int count = 0;
        int i;
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_type = (addr & 0x01) ? THP_TYPE_64KB : THP_TYPE_2MB;
+       int hpage_nr = (hpage_type == THP_TYPE_64KB) ?
+                                                       HPAGE_CONT_PTE_NR : HPAGE_PMD_NR;
+       int hpage_size = (hpage_type == THP_TYPE_64KB) ?
+                                                       HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE;
+
+       if (hpage_type == THP_TYPE_64KB)
+               haddr = addr & HPAGE_CONT_PTE_MASK;
+#endif
 
+#ifdef CONFIG_FINEGRAINED_THP
+       if (!vma || !vma->vm_file ||
+           vma->vm_start > haddr || vma->vm_end < haddr + hpage_size)
+               return;
+#else /* CONFIG_FINEGRAINED_THP */
        if (!vma || !vma->vm_file ||
            vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
                return;
+#endif /* CONFIG_FINEGRAINED_THP */
 
        /*
         * This vm_flags may not have VM_HUGEPAGE if the page was not
@@ -1470,7 +1848,12 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 
        /* step 1: check all mapped PTEs are to the right huge page */
        for (i = 0, addr = haddr, pte = start_pte;
-            i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+#ifdef CONFIG_FINEGRAINED_THP
+            i < hpage_nr;
+#else
+            i < HPAGE_PMD_NR;
+#endif
+            i++, addr += PAGE_SIZE, pte++) {
                struct page *page;
 
                /* empty pte, skip */
@@ -1494,7 +1877,12 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 
        /* step 2: adjust rmap */
        for (i = 0, addr = haddr, pte = start_pte;
-            i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+#ifdef CONFIG_FINEGRAINED_THP
+               i < hpage_nr;
+#else
+           i < HPAGE_PMD_NR;
+#endif
+            i++, addr += PAGE_SIZE, pte++) {
                struct page *page;
 
                if (pte_none(*pte))
@@ -1513,10 +1901,23 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 
        /* step 4: collapse pmd */
        ptl = pmd_lock(vma->vm_mm, pmd);
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_64KB) {
+               pte_t *ptep = pte_offset_map(pmd, haddr);
+               arch_clear_huge_pte_range(vma->vm_mm, haddr, ptep);
+               spin_unlock(ptl);
+       } else {
+               _pmd = pmdp_collapse_flush(vma, haddr, pmd);
+               spin_unlock(ptl);
+               mm_dec_nr_ptes(mm);
+               pte_free(mm, pmd_pgtable(_pmd));
+       }
+#else /* CONFIG_FINEGRAINED_THP*/
        _pmd = pmdp_collapse_flush(vma, haddr, pmd);
        spin_unlock(ptl);
        mm_dec_nr_ptes(mm);
        pte_free(mm, pmd_pgtable(_pmd));
+#endif /* CONFIG_FINEGRAINED_THP */
 
 drop_hpage:
        unlock_page(hpage);
@@ -1551,12 +1952,22 @@ out:
        return 0;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
+                                                       int hpage_type)
+#else
 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+#endif
 {
        struct vm_area_struct *vma;
        struct mm_struct *mm;
        unsigned long addr;
        pmd_t *pmd, _pmd;
+#ifdef CONFIG_FINEGRAINED_THP
+       pte_t *ptep;
+       int hpage_size = (hpage_type == THP_TYPE_64KB) ?
+                               HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE;
+#endif /* CONFIG_FINEGRAINED_THP */
 
        i_mmap_lock_write(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1579,6 +1990,45 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                if (vma->anon_vma)
                        continue;
                addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+#ifdef CONFIG_FINEGRAINED_THP
+               if (hpage_type == THP_TYPE_64KB && addr & ~HPAGE_CONT_PTE_MASK)
+                       continue;
+               else if (hpage_type == THP_TYPE_2MB && addr & ~HPAGE_PMD_MASK)
+                       continue;
+               if (vma->vm_end < addr + hpage_size)
+                       continue;
+
+               mm = vma->vm_mm;
+               pmd = mm_find_pmd(mm, addr);
+               if (!pmd)
+                       continue;
+               if (mmap_write_trylock(mm)) {
+                       spinlock_t *ptl = pmd_lock(mm, pmd);
+                       if (hpage_type == THP_TYPE_64KB) {
+                               /* 64KB hugepage */
+                               ptep = pte_offset_map(pmd, addr);
+				/* pte mappings will be re-established at page-fault time */
+                               arch_clear_huge_pte_range(mm, addr, ptep);
+                               spin_unlock(ptl);
+                       } else {
+                               /* 2MB hugepage */
+				/*
+				 * We need exclusive mmap_lock to retract page table.
+				 *
+				 * We use trylock due to lock inversion: we need to acquire
+				 * mmap_lock while holding page lock. Fault path does it in
+				 * reverse order. Trylock is a way to avoid deadlock.
+				 */
+                               _pmd = pmdp_collapse_flush(vma, addr, pmd);
+                               spin_unlock(ptl);
+
+                               mm_dec_nr_ptes(mm);
+                               pte_free(mm, pmd_pgtable(_pmd));
+                       }
+                       mmap_write_unlock(mm);
+               } else
+                       khugepaged_add_pte_mapped_thp(vma->vm_mm, addr, hpage_type);
+#else /* CONFIG_FINEGRAINED_THP */
                if (addr & ~HPAGE_PMD_MASK)
                        continue;
                if (vma->vm_end < addr + HPAGE_PMD_SIZE)
@@ -1608,6 +2058,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                        /* Try again later */
                        khugepaged_add_pte_mapped_thp(mm, addr);
                }
+#endif /* CONFIG_FINEGRAINED_THP */
        }
        i_mmap_unlock_write(mapping);
 }
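
The trylock in the hunk above works around the usual lock-ordering problem: this path needs mmap_lock while already holding the page lock, whereas the fault path takes the two locks in the opposite order, so on contention the work is deferred via khugepaged_add_pte_mapped_thp() instead of blocking. A minimal user-space sketch of that pattern, with pthread mutexes standing in for the two kernel locks (the function names and the "retract" step are illustrative only):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;	/* held first on this path */
static pthread_mutex_t mmap_lock = PTHREAD_MUTEX_INITIALIZER;	/* fault path takes this first */

/* Needs both locks but is entered with page_lock already held. */
static bool retract_with_trylock(void)
{
	if (pthread_mutex_trylock(&mmap_lock) != 0) {
		/*
		 * Contended: another thread holds mmap_lock and may be
		 * waiting for page_lock, so back off and let a later pass
		 * retry instead of deadlocking.
		 */
		return false;
	}
	/* ... retract the page table here ... */
	pthread_mutex_unlock(&mmap_lock);
	return true;
}

int main(void)
{
	pthread_mutex_lock(&page_lock);
	printf("retracted now: %s\n", retract_with_trylock() ? "yes" : "deferred");
	pthread_mutex_unlock(&page_lock);
	return 0;
}

The important property is that the contended case returns without sleeping and leaves the collapse to a later khugepaged pass.
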
@@ -1630,26 +2081,52 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
  *    + restore gaps in the page cache;
  *    + unlock and free huge page;
  */
+#ifdef CONFIG_FINEGRAINED_THP
+static void collapse_file(struct mm_struct *mm,
+               struct file *file, pgoff_t start,
+               struct page **hpage, int node, int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static void collapse_file(struct mm_struct *mm,
                struct file *file, pgoff_t start,
                struct page **hpage, int node)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        struct address_space *mapping = file->f_mapping;
        gfp_t gfp;
        struct page *new_page;
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_nr = (hpage_type == THP_TYPE_64KB ?
+                                       HPAGE_CONT_PTE_NR : HPAGE_PMD_NR);
+       int hpage_order = (hpage_type == THP_TYPE_64KB ?
+                                       HPAGE_CONT_PTE_ORDER : HPAGE_PMD_ORDER);
+       pgoff_t index, end = start + hpage_nr;
+#else /* CONFIG_FINEGRAINED_THP */
        pgoff_t index, end = start + HPAGE_PMD_NR;
+#endif /* CONFIG_FINEGRAINED_THP */
        LIST_HEAD(pagelist);
+#ifdef CONFIG_FINEGRAINED_THP
+       XA_STATE_ORDER(xas, &mapping->i_pages, start, hpage_order);
+#else
        XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
+#endif
        int nr_none = 0, result = SCAN_SUCCEED;
        bool is_shmem = shmem_file(file);
 
        VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
+#ifdef CONFIG_FINEGRAINED_THP
+       VM_BUG_ON(start & (hpage_nr - 1));
+#else
        VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
+#endif
 
        /* Only allocate from the target node */
        gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       new_page = khugepaged_alloc_page(hpage, gfp, node, hpage_type);
+#else
        new_page = khugepaged_alloc_page(hpage, gfp, node);
+#endif
        if (!new_page) {
                result = SCAN_ALLOC_HUGE_PAGE_FAIL;
                goto out;
@@ -1857,7 +2334,14 @@ out_unlock:
        if (is_shmem)
                __inc_node_page_state(new_page, NR_SHMEM_THPS);
        else {
+#ifdef CONFIG_FINEGRAINED_THP
+               if (hpage_type == THP_TYPE_64KB)
+                       __inc_node_page_state(new_page, NR_FILE_64KB_THPS);
+               else
+                       __inc_node_page_state(new_page, NR_FILE_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
                __inc_node_page_state(new_page, NR_FILE_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
                filemap_nr_thps_inc(mapping);
        }
 
@@ -1873,6 +2357,9 @@ xa_unlocked:
 
        if (result == SCAN_SUCCEED) {
                struct page *page, *tmp;
+#ifdef CONFIG_FINEGRAINED_THP
+               int offset = 0;
+#endif
 
                /*
                 * Replacing old pages with new one has succeeded, now we
@@ -1880,12 +2367,28 @@ xa_unlocked:
                 */
                index = start;
                list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+#ifdef CONFIG_FINEGRAINED_THP
+                       if (hpage_type != THP_TYPE_64KB) {
+                               while (index < page->index) {
+                                       clear_highpage(new_page + (index % HPAGE_PMD_NR));
+                                       index++;
+                               }
+                       }
+
+                       if (hpage_type == THP_TYPE_64KB) {
+                               copy_highpage(new_page + offset, page);
+                               offset++;
+                       } else
+                               copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
+                                               page);
+#else /* CONFIG_FINEGRAINED_THP */
                        while (index < page->index) {
                                clear_highpage(new_page + (index % HPAGE_PMD_NR));
                                index++;
                        }
                        copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
                                        page);
+#endif /* CONFIG_FINEGRAINED_THP */
                        list_del(&page->lru);
                        page->mapping = NULL;
                        page_ref_unfreeze(page, 1);
@@ -1895,13 +2398,32 @@ xa_unlocked:
                        put_page(page);
                        index++;
                }
+#ifdef CONFIG_FINEGRAINED_THP
+               if (hpage_type == THP_TYPE_64KB) {
+                       while (index < end) {
+                               clear_highpage(new_page + offset);
+                               offset++;
+                               index++;
+                       }
+               } else {
+                       while (index < end) {
+                               clear_highpage(new_page + (index % HPAGE_PMD_NR));
+                               index++;
+                       }
+               }
+#else /* CONFIG_FINEGRAINED_THP */
                while (index < end) {
                        clear_highpage(new_page + (index % HPAGE_PMD_NR));
                        index++;
                }
+#endif /* CONFIG_FINEGRAINED_THP */
 
                SetPageUptodate(new_page);
+#ifdef CONFIG_FINEGRAINED_THP
+               page_ref_add(new_page, hpage_nr - 1);
+#else
                page_ref_add(new_page, HPAGE_PMD_NR - 1);
+#endif
                if (is_shmem)
                        set_page_dirty(new_page);
                lru_cache_add(new_page);
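
In the copy loop above, the 2MB path places each old page at index % HPAGE_PMD_NR inside the new compound page and clears every index that had no page, while the 64KB path copies in arrival order because its range is already contiguous. A hedged sketch of the 2MB-style placement using plain buffers, with strcpy/memset standing in for copy_highpage()/clear_highpage() and deliberately tiny sizes (the page contents and hole positions are invented for the demo):

#include <stdio.h>
#include <string.h>

#define NR_SUBPAGES	8	/* stand-in for HPAGE_PMD_NR */
#define PAGE_SZ		16	/* tiny "pages" for the demo */

int main(void)
{
	char new_page[NR_SUBPAGES][PAGE_SZ];
	/* Sparse old pages: holes at indices 1, 3, 6 and 7. */
	int old_index[]   = { 0, 2, 4, 5 };
	const char *old[] = { "zero", "two", "four", "five" };
	int start = 0, index = start, end = start + NR_SUBPAGES;

	for (int i = 0; i < 4; i++) {
		while (index < old_index[i]) {		/* clear_highpage() */
			memset(new_page[index % NR_SUBPAGES], 0, PAGE_SZ);
			index++;
		}
		strcpy(new_page[old_index[i] % NR_SUBPAGES], old[i]);	/* copy_highpage() */
		index++;
	}
	while (index < end) {				/* trailing holes */
		memset(new_page[index % NR_SUBPAGES], 0, PAGE_SZ);
		index++;
	}

	for (int i = 0; i < NR_SUBPAGES; i++)
		printf("subpage %d: \"%s\"\n", i, new_page[i]);
	return 0;
}
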
@@ -1909,9 +2431,14 @@ xa_unlocked:
                /*
                 * Remove pte page tables, so we can re-fault the page as huge.
                 */
+#ifdef CONFIG_FINEGRAINED_THP
+               retract_page_tables(mapping, start, hpage_type);
+               if (hpage_type == THP_TYPE_2MB)
+                       *hpage = NULL;
+#else /* CONFIG_FINEGRAINED_THP */
                retract_page_tables(mapping, start);
                *hpage = NULL;
-
+#endif /* CONFIG_FINEGRAINED_THP */
                khugepaged_pages_collapsed++;
        } else {
                struct page *page;
@@ -1956,14 +2483,24 @@ xa_unlocked:
 
        unlock_page(new_page);
 out:
+#ifdef CONFIG_FINEGRAINED_THP
+       if (result != SCAN_SUCCEED && new_page && hpage_type == THP_TYPE_64KB)
+               put_page(new_page);
+#endif
        VM_BUG_ON(!list_empty(&pagelist));
        if (!IS_ERR_OR_NULL(*hpage))
                mem_cgroup_uncharge(*hpage);
        /* TODO: tracepoints */
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static void khugepaged_scan_file(struct mm_struct *mm,
+               struct file *file, pgoff_t start, struct page **hpage,
+               int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static void khugepaged_scan_file(struct mm_struct *mm,
                struct file *file, pgoff_t start, struct page **hpage)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        struct page *page = NULL;
        struct address_space *mapping = file->f_mapping;
@@ -1971,17 +2508,43 @@ static void khugepaged_scan_file(struct mm_struct *mm,
        int present, swap;
        int node = NUMA_NO_NODE;
        int result = SCAN_SUCCEED;
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_nr;
+       int max_ptes_swap, max_ptes_none, max_ptes_shared;
+
+       if (hpage_type == THP_TYPE_64KB) {
+               hpage_nr = HPAGE_CONT_PTE_NR; /* 64KB */
+               max_ptes_swap = khugepaged_max_ptes_swap_64kb;
+               max_ptes_none = khugepaged_max_ptes_none_64kb;
+               max_ptes_shared = khugepaged_max_ptes_shared_64kb;
+       } else {
+               hpage_nr = HPAGE_PMD_NR; /* 2MB */
+               max_ptes_swap = khugepaged_max_ptes_swap;
+               max_ptes_none = khugepaged_max_ptes_none;
+               max_ptes_shared = khugepaged_max_ptes_shared;
+       }
+#endif /* CONFIG_FINEGRAINED_THP */
 
        present = 0;
        swap = 0;
        memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
        rcu_read_lock();
-       xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
+#ifdef CONFIG_FINEGRAINED_THP
+       xas_for_each(&xas, page, start + hpage_nr - 1)
+#else
+       xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1)
+#endif
+       {
                if (xas_retry(&xas, page))
                        continue;
 
                if (xa_is_value(page)) {
-                       if (++swap > khugepaged_max_ptes_swap) {
+#ifdef CONFIG_FINEGRAINED_THP
+                       if (++swap > max_ptes_swap)
+#else
+                       if (++swap > khugepaged_max_ptes_swap)
+#endif
+                       {
                                result = SCAN_EXCEED_SWAP_PTE;
                                break;
                        }
@@ -2027,19 +2590,34 @@ static void khugepaged_scan_file(struct mm_struct *mm,
        rcu_read_unlock();
 
        if (result == SCAN_SUCCEED) {
-               if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+#ifdef CONFIG_FINEGRAINED_THP
+               if (present < hpage_nr - max_ptes_none)
+#else
+               if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none)
+#endif
+               {
                        result = SCAN_EXCEED_NONE_PTE;
                } else {
                        node = khugepaged_find_target_node();
+#ifdef CONFIG_FINEGRAINED_THP
+                       collapse_file(mm, file, start, hpage, node, hpage_type);
+#else
                        collapse_file(mm, file, start, hpage, node);
+#endif
                }
        }
 
        /* TODO: tracepoints */
 }
 #else
+#ifdef CONFIG_FINEGRAINED_THP
+static void khugepaged_scan_file(struct mm_struct *mm,
+               struct file *file, pgoff_t start, struct page **hpage,
+               int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static void khugepaged_scan_file(struct mm_struct *mm,
                struct file *file, pgoff_t start, struct page **hpage)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        BUILD_BUG();
 }
@@ -2050,6 +2628,220 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
 }
 #endif
 
+#ifdef CONFIG_FINEGRAINED_THP
+/*
+ * If the return value is greater than 0, the vma can host a hugepage and
+ * the computed hugepage start/end are returned through *hstart/*hend;
+ * otherwise the vma cannot host a hugepage.
+ */
+static inline int hugepage_determine_htype(unsigned long vm_start,
+		unsigned long vm_end, unsigned long *hstart, unsigned long *hend)
+{
+       unsigned long start, end;
+
+       /* determine 2MB hugepage */
+       start = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+       end = vm_end & HPAGE_PMD_MASK;
+       if (start >= end) {
+               /* determine 64KB hugepage */
+               start = (vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK;
+               end = vm_end & HPAGE_CONT_PTE_MASK;
+               if (start >= end)
+                       return THP_TYPE_FAIL;
+               *hstart = start;
+               *hend = end;
+               return THP_TYPE_64KB;
+       }
+       *hstart = start;
+       *hend = end;
+       return THP_TYPE_2MB;
+}
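
hugepage_determine_htype() uses the usual round-up/round-down arithmetic: (addr + ~MASK) & MASK rounds the VMA start up to the next hugepage boundary, addr & MASK rounds the end down, and a hugepage type is only viable if the rounded-up start still lies below the rounded-down end; 2MB is tried first and 64KB is the fallback. A stand-alone sketch of that decision, assuming 4KB base pages (so 2MB PMD hugepages and 64KB contiguous-PTE hugepages); the macros and sample addresses are illustrative, not taken from the patch:

#include <stdio.h>

#define SZ_64K	(64UL * 1024)
#define SZ_2M	(2UL * 1024 * 1024)
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))

int main(void)
{
	unsigned long vm_start = 0x400000UL + 0x3000UL;	/* unaligned VMA start */
	unsigned long vm_end   = vm_start + 3 * SZ_2M;	/* roughly 6MB VMA */
	unsigned long start, end;

	start = ALIGN_UP(vm_start, SZ_2M);
	end   = ALIGN_DOWN(vm_end, SZ_2M);
	if (start < end) {
		printf("2MB candidate: [%#lx, %#lx)\n", start, end);
		return 0;
	}

	start = ALIGN_UP(vm_start, SZ_64K);
	end   = ALIGN_DOWN(vm_end, SZ_64K);
	if (start < end)
		printf("64KB candidate: [%#lx, %#lx)\n", start, end);
	else
		printf("VMA too small or misaligned for any hugepage\n");
	return 0;
}
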
+
+enum {
+       KHUGEPAGE_SCAN_CONTINUE,
+       KHUGEPAGE_SCAN_BREAK,
+       KHUGEPAGE_SCAN_BREAK_MMAP_LOCK,
+};
+
+static unsigned int khugepaged_scan_vma(struct mm_struct *mm,
+                       struct vm_area_struct *vma, struct page **hpage,
+                       unsigned int pages, int *progress)
+{
+       unsigned long hstart, hend;
+       int hpage_type, ret;
+       int hpage_size, hpage_nr;
+
+       if (!hugepage_vma_check(vma, vma->vm_flags))
+               return KHUGEPAGE_SCAN_CONTINUE;
+
+       hpage_type = hugepage_determine_htype(
+                               (vma->vm_start > khugepaged_scan.address) ?
+                               vma->vm_start : khugepaged_scan.address,
+                               vma->vm_end, &hstart, &hend);
+
+       if (hpage_type == THP_TYPE_FAIL)
+               return KHUGEPAGE_SCAN_CONTINUE;
+       if (khugepaged_scan.address > hend)
+               return KHUGEPAGE_SCAN_CONTINUE;
+       if (khugepaged_scan.address < hstart)
+               khugepaged_scan.address = hstart;
+
+       if (hpage_type == THP_TYPE_64KB) {
+               VM_BUG_ON(khugepaged_scan.address & ~HPAGE_CONT_PTE_MASK);
+               hpage_size = HPAGE_CONT_PTE_SIZE; /* 64KB */
+               hpage_nr = HPAGE_CONT_PTE_NR;
+       } else if (hpage_type == THP_TYPE_2MB) {
+               VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+               hpage_size = HPAGE_PMD_SIZE; /* 2MB */
+               hpage_nr = HPAGE_PMD_NR;
+               if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+                   !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+                               HPAGE_PMD_NR)) {
+                       /* fallback, vma or file not aligned to 2MB */
+                       hpage_size = HPAGE_CONT_PTE_SIZE; /* 64KB */
+                       hpage_nr = HPAGE_CONT_PTE_NR;
+                       hpage_type = THP_TYPE_64KB;
+               }
+       } else
+               BUG();
+
+       while (khugepaged_scan.address < hend) {
+               if (khugepaged_scan.address + hpage_size >= hend) {
+                       if (khugepaged_scan.address + HPAGE_CONT_PTE_SIZE < hend) {
+                               hpage_size = HPAGE_CONT_PTE_SIZE;
+                               hpage_nr = HPAGE_CONT_PTE_NR;
+                               hpage_type = THP_TYPE_64KB;
+                       }
+               }
+               ret = 0;
+               cond_resched();
+               if (unlikely(khugepaged_test_exit(mm)))
+                       return KHUGEPAGE_SCAN_BREAK;
+
+               VM_BUG_ON(khugepaged_scan.address < hstart ||
+                               khugepaged_scan.address + hpage_size >
+                               hend);
+               if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+                       struct file *file = get_file(vma->vm_file);
+                       pgoff_t pgoff = linear_page_index(vma,
+                                       khugepaged_scan.address);
+
+                       mmap_read_unlock(mm);
+                       ret = 1;
+                       khugepaged_scan_file(mm, file, pgoff, hpage, hpage_type);
+                       fput(file);
+               } else {
+                       ret = khugepaged_scan_pmd(mm, vma,
+                                       khugepaged_scan.address,
+                                       hpage, hpage_type);
+               }
+               /* move to next address */
+               khugepaged_scan.address += hpage_size;
+               *progress += hpage_nr;
+               if (ret)
+			/* we released mmap_lock so break the loop */
+                       return KHUGEPAGE_SCAN_BREAK_MMAP_LOCK;
+               if (*progress >= pages)
+                       return KHUGEPAGE_SCAN_BREAK;
+       }
+       return KHUGEPAGE_SCAN_CONTINUE;
+}
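
khugepaged_scan_vma() reports back through the three KHUGEPAGE_SCAN_* codes above: continue with the next VMA, stop because the page budget for this pass is exhausted, or stop because mmap_lock was dropped (the file-scan path releases it before calling khugepaged_scan_file()). A toy sketch of a caller loop dispatching on such codes; the scanner, the VMA numbering and the labels are invented for illustration:

#include <stdio.h>

enum { SCAN_CONTINUE, SCAN_BREAK, SCAN_BREAK_MMAP_LOCK };

/* Toy scanner: pretend VMA 3 drops the lock and VMA 5 exhausts the budget. */
static int scan_one_vma(int vma_id)
{
	if (vma_id == 3)
		return SCAN_BREAK_MMAP_LOCK;
	if (vma_id == 5)
		return SCAN_BREAK;
	return SCAN_CONTINUE;
}

int main(void)
{
	for (int vma = 1; vma <= 10; vma++) {
		switch (scan_one_vma(vma)) {
		case SCAN_CONTINUE:
			continue;			/* next VMA */
		case SCAN_BREAK_MMAP_LOCK:
			printf("vma %d: lock already dropped, skip the unlock\n", vma);
			goto out_lock_dropped;
		case SCAN_BREAK:
			printf("vma %d: page budget exhausted\n", vma);
			goto out;
		}
	}
out:
	printf("would release the read lock here\n");
out_lock_dropped:
	return 0;
}
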
+
+static struct thp_scan_hint *find_scan_hint(struct mm_slot *slot,
+                                                               unsigned long addr)
+{
+       struct thp_scan_hint *hint;
+
+       list_for_each_entry(hint, &khugepaged_scan.hint_list, hint_list) {
+               if (hint->slot == slot)
+                       return hint;
+       }
+       return NULL;
+}
+
+#ifdef CONFIG_THP_CONSERVATIVE
+/* caller must hold a proper mmap_lock */
+void khugepaged_mem_hook(struct mm_struct *mm, unsigned long addr,
+               long diff, const char *debug)
+{
+       struct mm_slot *slot;
+       struct vm_area_struct *vma;
+       struct thp_scan_hint *hint;
+       bool wakeup = false;
+       bool retry = false;
+
+       vma = find_vma(mm, addr);
+       if (!hugepage_vma_check(vma, vma->vm_flags))
+               return;
+
+again:
+       spin_lock(&khugepaged_mm_lock);
+       slot = get_mm_slot(mm);
+       if (!slot) {
+               /* make a new slot or go out */
+               spin_unlock(&khugepaged_mm_lock);
+               if (retry)
+                       return;
+               if (__khugepaged_enter(mm))
+                       return;
+               retry = true;
+               goto again;
+       }
+
+       hint = find_scan_hint(slot, addr);
+       if (!hint) {
+               spin_unlock(&khugepaged_mm_lock);
+		hint = kzalloc(sizeof(struct thp_scan_hint), GFP_KERNEL);
+		if (!hint)
+			return;
+		hint->vma = vma;
+               hint->slot = slot;
+               hint->diff = 0;
+               hint->jiffies = jiffies;
+               spin_lock(&khugepaged_mm_lock);
+               list_add(&hint->hint_list, &khugepaged_scan.hint_list);
+               khugepaged_scan.nr_hint++;
+       }
+       hint->diff += diff;
+	if (hint->diff >= HPAGE_CONT_PTE_SIZE)
+		wakeup = true;
+       spin_unlock(&khugepaged_mm_lock);
+
+	/* if possible, wake khugepaged up for starting a scan */
+	if (wakeup)
+		wake_up_interruptible(&khugepaged_wait);
+}
+#else /* CONFIG_THP_CONSERVATIVE */
+void khugepaged_mem_hook(struct mm_struct *mm,
+                       unsigned long addr, long diff, const char *debug)
+{}
+#endif /* CONFIG_THP_CONSERVATIVE */
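
Under CONFIG_THP_CONSERVATIVE, khugepaged_mem_hook() above is what triggers proactive scanning: each mm accumulates the amount of newly mapped memory in a hint and khugepaged is only woken once the growth could cover at least one 64KB hugepage. A simplified stand-alone model of that accumulation; the structure and threshold handling are reduced to the essentials and are not the patch's types:

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_CONT_PTE_SIZE	(64UL * 1024)	/* 64KB, assuming 4KB base pages */

struct scan_hint {
	long diff;	/* bytes mapped since the last scan of this mm */
};

/* Returns true once the accumulated growth justifies waking the scanner. */
static bool mem_hook(struct scan_hint *hint, long bytes_mapped)
{
	hint->diff += bytes_mapped;
	return hint->diff >= (long)HPAGE_CONT_PTE_SIZE;
}

int main(void)
{
	struct scan_hint hint = { 0 };
	long maps[] = { 16 * 1024, 16 * 1024, 40 * 1024 };	/* three mappings */

	for (int i = 0; i < 3; i++)
		printf("after %ld more bytes: wake khugepaged? %s\n",
		       maps[i], mem_hook(&hint, maps[i]) ? "yes" : "no");
	return 0;
}
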
+
+static void clear_hint_list(struct mm_slot *slot)
+{
+       struct thp_scan_hint *hint;
+       hint = find_scan_hint(slot, 0);
+       if (hint) {
+               list_del(&hint->hint_list);
+               kfree(hint);
+               khugepaged_scan.nr_hint--;
+       }
+}
+
+static struct thp_scan_hint *get_next_hint(void)
+{
+       if (!list_empty(&khugepaged_scan.hint_list)) {
+               struct thp_scan_hint *hint = list_first_entry(
+                                       &khugepaged_scan.hint_list,
+                                       struct thp_scan_hint, hint_list);
+               list_del(&hint->hint_list);
+               khugepaged_scan.nr_hint--;
+               return hint;
+       }
+       return NULL;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                                            struct page **hpage)
        __releases(&khugepaged_mm_lock)
@@ -2063,6 +2855,38 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
        VM_BUG_ON(!pages);
        lockdep_assert_held(&khugepaged_mm_lock);
 
+#ifdef CONFIG_FINEGRAINED_THP
+       if (khugepaged_scan.mm_slot)
+               mm_slot = khugepaged_scan.mm_slot;
+       else if (!list_empty(&khugepaged_scan.hint_list)) {
+               struct thp_scan_hint *hint;
+               long mem_diff;
+               unsigned long jiffies_diff;
+
+get_next_hint:
+               hint = get_next_hint();
+               if (!hint)
+                       goto get_next_slot;
+
+               mm_slot = hint->slot;
+               mem_diff = hint->diff;
+               jiffies_diff = jiffies - hint->jiffies;
+               kfree(hint);
+               clear_hint_list(mm_slot);
+
+               if (khugepaged_test_exit(mm_slot->mm))
+                       goto get_next_hint;
+               khugepaged_scan.address = 0;
+               khugepaged_scan.mm_slot = mm_slot;
+       } else {
+get_next_slot:
+               mm_slot = list_entry(khugepaged_scan.mm_head.next,
+                                    struct mm_slot, mm_node);
+               clear_hint_list(mm_slot);
+               khugepaged_scan.address = 0;
+               khugepaged_scan.mm_slot = mm_slot;
+       }
+#else /* CONFIG_FINEGRAINED_THP */
        if (khugepaged_scan.mm_slot)
                mm_slot = khugepaged_scan.mm_slot;
        else {
@@ -2071,6 +2895,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                khugepaged_scan.address = 0;
                khugepaged_scan.mm_slot = mm_slot;
        }
+#endif /* CONFIG_FINEGRAINED_THP */
        spin_unlock(&khugepaged_mm_lock);
        khugepaged_collapse_pte_mapped_thps(mm_slot);
 
@@ -2087,13 +2912,28 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 
        progress++;
        for (; vma; vma = vma->vm_next) {
+#ifdef CONFIG_FINEGRAINED_THP
+               int ret;
+#else
                unsigned long hstart, hend;
+#endif
 
                cond_resched();
                if (unlikely(khugepaged_test_exit(mm))) {
                        progress++;
                        break;
                }
+#ifdef CONFIG_FINEGRAINED_THP
+               ret = khugepaged_scan_vma(mm, vma, hpage, pages, &progress);
+
+               if (ret == KHUGEPAGE_SCAN_CONTINUE) {
+                       progress++;
+                       continue;
+               } else if (ret == KHUGEPAGE_SCAN_BREAK)
+                       goto breakouterloop;
+               else if (ret == KHUGEPAGE_SCAN_BREAK_MMAP_LOCK)
+                       goto breakouterloop_mmap_lock;
+#else /* CONFIG_FINEGRAINED_THP */
                if (!hugepage_vma_check(vma, vma->vm_flags)) {
 skip:
                        progress++;
@@ -2143,6 +2983,7 @@ skip:
                        if (progress >= pages)
                                goto breakouterloop;
                }
+#endif /* CONFIG_FINEGRAINED_THP */
        }
 breakouterloop:
        mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
@@ -2160,6 +3001,53 @@ breakouterloop_mmap_lock:
                 * khugepaged runs here, khugepaged_exit will find
                 * mm_slot not pointing to the exiting mm.
                 */
+#ifdef CONFIG_FINEGRAINED_THP
+               if (!list_empty(&khugepaged_scan.hint_list)) {
+                       unsigned long jiffies_diff;
+                       long mem_diff;
+                       struct thp_scan_hint *hint;
+                       struct mm_slot *next_slot;
+
+get_next_hint2:
+                       hint = get_next_hint();
+
+                       if (!hint) {
+                               /* no more hint */
+                               if (mm_slot->mm_node.next != &khugepaged_scan.mm_head)
+                                       goto get_next_slot2;
+                               else
+                                       goto end_loop;
+                       }
+
+                       mem_diff = hint->diff;
+                       jiffies_diff = jiffies - hint->jiffies;
+                       next_slot = hint->slot;
+                       kfree(hint);
+
+                       if (next_slot == mm_slot)
+                               goto get_next_hint2;
+
+                       if (!khugepaged_test_exit(next_slot->mm)) {
+                               list_move(&next_slot->mm_node, &mm_slot->mm_node);
+                               clear_hint_list(next_slot);
+                       } else
+                               goto get_next_hint2;
+
+                       khugepaged_scan.mm_slot = next_slot;
+                       khugepaged_scan.address = 0;
+               } else if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+get_next_slot2:
+                       khugepaged_scan.mm_slot = list_entry(
+                               mm_slot->mm_node.next,
+                               struct mm_slot, mm_node);
+                       clear_hint_list(khugepaged_scan.mm_slot);
+                       khugepaged_scan.address = 0;
+               } else {
+end_loop:
+                       khugepaged_scan.mm_slot = NULL;
+                       khugepaged_full_scans++;
+               }
+#else /* CONFIG_FINEGRAINED_THP */
                if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
                        khugepaged_scan.mm_slot = list_entry(
                                mm_slot->mm_node.next,
@@ -2169,7 +3057,7 @@ breakouterloop_mmap_lock:
                        khugepaged_scan.mm_slot = NULL;
                        khugepaged_full_scans++;
                }
-
+#endif /* CONFIG_FINEGRAINED_THP */
                collect_mm_slot(mm_slot);
        }
 
@@ -2250,6 +3138,9 @@ static void khugepaged_wait_work(void)
                wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
 }
 
+#include <linux/delay.h>
+bool eager_allocation = false;
+
 static int khugepaged(void *none)
 {
        struct mm_slot *mm_slot;
index 24abc79..fdf4f2a 100644 (file)
@@ -407,6 +407,11 @@ regular_page:
                if (!page)
                        continue;
 
+#ifdef CONFIG_FINEGRAINED_THP
+               if (pte_cont(ptent))
+                       split_huge_pte_address(vma, addr, false, NULL);
+#endif
+
                /*
                 * Creating a THP page is expensive so split it only if we
                 * are sure it's worth. Split it if we are only owner.
@@ -616,6 +621,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                if (!page)
                        continue;
 
+#ifdef CONFIG_FINEGRAINED_THP
+               if (pte_cont(ptent))
+                       split_huge_pte_address(vma, addr, false, NULL);
+#endif /* CONFIG_FINEGRAINED_THP */
+
                /*
                 * If pmd isn't transhuge but the page is THP and
                 * is owned by only this process, split it and
index 3c99200..429e738 100644 (file)
@@ -3276,16 +3276,26 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {
+#ifdef CONFIG_FINEGRAINED_THP
+       int page_nr = compound_nr(head);
+#endif
        struct mem_cgroup *memcg = head->mem_cgroup;
        int i;
 
        if (mem_cgroup_disabled())
                return;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       for (i = 1; i < page_nr; i++) {
+               css_get(&memcg->css);
+               head[i].mem_cgroup = memcg;
+       }
+#else /* CONFIG_FINEGRAINED_THP */
        for (i = 1; i < HPAGE_PMD_NR; i++) {
                css_get(&memcg->css);
                head[i].mem_cgroup = memcg;
        }
+#endif /* CONFIG_FINEGRAINED_THP */
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
index 2e59295..085287f 100644 (file)
@@ -82,6 +82,8 @@
 #include <linux/uaccess.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+#include <asm/huge_mm.h>
+#include <asm/finegrained_thp.h>
 
 #include "pgalloc-track.h"
 #include "internal.h"
@@ -146,6 +148,19 @@ EXPORT_SYMBOL(zero_pfn);
 
 unsigned long highest_memmap_pfn __read_mostly;
 
+atomic_long_t nr_phys_cont_pte_pages;
+atomic_long_t nr_phys_huge_pmd_pages;
+
+unsigned long phys_cont_pte_pages(void)
+{
+       return atomic_long_read(&nr_phys_cont_pte_pages);
+}
+
+unsigned long phys_huge_pmd_pages(void)
+{
+       return atomic_long_read(&nr_phys_huge_pmd_pages);
+}
+
 /*
  * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
  */
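
nr_phys_cont_pte_pages and nr_phys_huge_pmd_pages count, in base pages, how much physically contiguous memory is currently mapped through contiguous PTEs or huge PMDs by the remap paths later in this file, so that the totals can be reported. A user-space analogue of that bookkeeping using C11 atomics; the accounting call sites below are hypothetical:

#include <stdatomic.h>
#include <stdio.h>

static atomic_long nr_phys_cont_pte_pages;
static atomic_long nr_phys_huge_pmd_pages;

static void account_remap(long nr_pages, int used_huge_pmd)
{
	if (used_huge_pmd)
		atomic_fetch_add(&nr_phys_huge_pmd_pages, nr_pages);
	else
		atomic_fetch_add(&nr_phys_cont_pte_pages, nr_pages);
}

int main(void)
{
	account_remap(512, 1);	/* one 2MB block mapped with a huge PMD */
	account_remap(16, 0);	/* one 64KB block mapped with contiguous PTEs */
	printf("huge-PMD pages: %ld, cont-PTE pages: %ld\n",
	       atomic_load(&nr_phys_huge_pmd_pages),
	       atomic_load(&nr_phys_cont_pte_pages));
	return 0;
}
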
@@ -208,6 +223,11 @@ static void check_sync_rss_stat(struct task_struct *task)
 
 #endif /* SPLIT_RSS_COUNTING */
 
+#ifdef CONFIG_FINEGRAINED_THP
+void thp_print_cont_pte_table(struct mm_struct *mm,
+                       unsigned long addr, pte_t *ptep, unsigned long line);
+#endif /* CONFIG_FINEGRAINED_THP */
+
 /*
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
@@ -730,6 +750,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                         */
                        make_migration_entry_read(&entry);
                        pte = swp_entry_to_pte(entry);
+                       pte = arch_pte_clearhuge(pte);
                        if (pte_swp_soft_dirty(*src_pte))
                                pte = pte_swp_mksoft_dirty(pte);
                        if (pte_swp_uffd_wp(*src_pte))
@@ -763,11 +784,13 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                    is_cow_mapping(vm_flags)) {
                        make_device_private_entry_read(&entry);
                        pte = swp_entry_to_pte(entry);
+                       pte = arch_pte_clearhuge(pte);
                        if (pte_swp_uffd_wp(*src_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        }
+       pte = arch_pte_clearhuge(pte);
        set_pte_at(dst_mm, addr, dst_pte, pte);
        return 0;
 }
@@ -860,6 +883,11 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
        page = vm_normal_page(src_vma, addr, pte);
        if (page) {
                int retval;
+		/*
+		 * When a 64KB hugepage mapping is copied, clear the
+		 * contiguous bit so the child starts out with ordinary
+		 * (non-contiguous) PTE mappings.
+		 */
+               pte = arch_pte_clearhuge(pte);
 
                retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
                                           addr, rss, prealloc, pte, page);
@@ -887,7 +915,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
        if (vm_flags & VM_SHARED)
                pte = pte_mkclean(pte);
        pte = pte_mkold(pte);
-
+       pte = arch_pte_clearhuge(pte);
        /*
         * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
         * does not have the VM_UFFD_WP, which means that the uffd
@@ -965,6 +993,7 @@ again:
                        progress++;
                        continue;
                }
+
                if (unlikely(!pte_present(*src_pte))) {
                        entry.val = copy_nonpresent_pte(dst_mm, src_mm,
                                                        dst_pte, src_pte,
@@ -974,6 +1003,7 @@ again:
                        progress += 8;
                        continue;
                }
+
                /* copy_present_pte() will clear `*prealloc' if consumed */
                ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
                                       addr, rss, &prealloc);
@@ -1123,6 +1153,21 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
        return 0;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+bool zap_cont_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+                       pmd_t *pmd, pte_t **ptep, unsigned long *addr,
+                       unsigned long end, struct page *page,
+                       int *rss, spinlock_t *ptl);
+#else /* CONFIG_FINEGRAINED_THP */
+bool zap_cont_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+                       pmd_t *pmd, pte_t **ptep, unsigned long *addr,
+                       unsigned long end, struct page *page,
+                       int *rss, spinlock_t *ptl)
+{
+       return false;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
@@ -1245,6 +1290,16 @@ again:
                                    details->check_mapping != page_rmapping(page))
                                        continue;
                        }
+#ifdef CONFIG_FINEGRAINED_THP
+                       if (page && pte_cont(ptent) && PageTransHuge(compound_head(page))) {
+                               if (zap_cont_pte_range(tlb, vma, pmd, &pte,
+                                               &addr, end, page, rss, ptl)) {
+                                       force_flush = 1;
+                                       break;
+                               }
+                       } else if (pte_cont(ptent))
+                               atomic_long_dec(&nr_phys_cont_pte_pages);
+#endif /* CONFIG_FINEGRAINED_THP */
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
                        tlb_remove_tlb_entry(tlb, pte, addr);
@@ -2156,16 +2211,26 @@ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
 }
 EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
 
+
 /*
  * maps a range of physical memory into the requested pages. the old
  * mappings are removed. any references to nonexistent pages results
  * in null mappings (currently treated as "copy-on-access")
  */
+#ifdef CONFIG_FINEGRAINED_THP
+static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+                       unsigned long addr, unsigned long end,
+                       unsigned long pfn, pgprot_t prot)
+{
+       return arch_remap_pte_range(mm, pmd, addr, end, pfn, prot);
+}
+#else /* CONFIG_FINEGRAINED_THP */
 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
 {
        pte_t *pte, *mapped_pte;
        spinlock_t *ptl;
        int err = 0;
 
@@ -2179,13 +2244,50 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                        err = -EACCES;
                        break;
                }
                set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
                pfn++;
-       } while (pte++, addr += PAGE_SIZE, addr != end);
+               pte++;
+               addr += PAGE_SIZE;
+       } while (addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(mapped_pte, ptl);
        return err;
 }
+#endif /* CONFIG_FINEGRAINED_THP */
+
+static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+                               unsigned long end, unsigned long pfn,
+                               pgprot_t prot)
+{
+       phys_addr_t phys_addr = __pfn_to_phys(pfn);
+        spinlock_t *ptl;
+       int ret;
+
+       if ((end - addr) != PMD_SIZE)
+               return 0;
+
+       if (!IS_ALIGNED(addr, PMD_SIZE))
+               return 0;
+
+       if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+               return 0;
+
+       /* fixme - is this correct? */
+       if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) {
+               pr_info("%s %d - freed pmd page??\n", __func__, __LINE__);
+               return 0;
+       }
+
+       ptl = pmd_lock(mm, pmd);
+       ret = pmd_set_huge(pmd, phys_addr, prot);
+       spin_unlock(ptl);
+
+       atomic_long_inc(&nr_phys_huge_pmd_pages);
+
+       return ret;
+}
 
 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
                        unsigned long addr, unsigned long end,
@@ -2202,6 +2304,11 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
        VM_BUG_ON(pmd_trans_huge(*pmd));
        do {
                next = pmd_addr_end(addr, end);
+
+               if (remap_try_huge_pmd(mm, pmd, addr, next,
+                                      pfn + (addr >> PAGE_SHIFT), prot))
+                       continue;
+
                err = remap_pte_range(mm, pmd, addr, next,
                                pfn + (addr >> PAGE_SHIFT), prot);
                if (err)
@@ -3480,6 +3587,8 @@ out_release:
        return ret;
 }
 
+extern bool eager_allocation;
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -3538,6 +3647,22 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
        /* Allocate our own private page. */
        if (unlikely(anon_vma_prepare(vma)))
                goto oom;
+#ifdef CONFIG_FINEGRAINED_THP
+#ifndef CONFIG_THP_CONSERVATIVE
+	/*
+	 * 64KB hugepage creation at page-fault time is only allowed
+	 * under the aggressive or near-conservative policy.
+	 */
+       if (__transparent_hugepage_enabled(vma)) {
+               ret = arch_do_huge_pte_anonymous_page(vmf);
+               if (!(ret & VM_FAULT_FALLBACK)) {
+                       return ret;
+               }
+               ret = 0;
+       }
+#endif /* CONFIG_THP_CONSERVATIVE */
+#endif /* CONFIG_FINEGRAINED_THP */
+
        page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
        if (!page)
                goto oom;
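
The do_anonymous_page() hunk above is the fault-time half of the eager-and-conservative policy: when THP is enabled for the vma and the policy permits fault-time allocation, a 64KB hugepage is attempted first, and a VM_FAULT_FALLBACK result simply drops through to the ordinary 4KB allocation below. A simplified sketch of that fallback chain; the result codes and helper functions are stand-ins, not the kernel's:

#include <stdio.h>

enum fault_result { FAULT_OK, FAULT_FALLBACK, FAULT_OOM };

/* Stand-in allocators; here the 64KB attempt "fails" and falls back. */
static enum fault_result try_64kb_hugepage(void)	{ return FAULT_FALLBACK; }
static enum fault_result alloc_base_page(void)		{ return FAULT_OK; }

static enum fault_result anonymous_fault(void)
{
	enum fault_result ret = try_64kb_hugepage();

	if (ret != FAULT_FALLBACK)	/* success or a hard error: done */
		return ret;
	/* The hugepage attempt fell back: serve the fault with a 4KB page. */
	return alloc_base_page();
}

int main(void)
{
	printf("fault handled with %s\n",
	       anonymous_fault() == FAULT_OK ? "a base page" : "an error");
	return 0;
}
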
@@ -3786,6 +3911,14 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
        BUILD_BUG();
        return 0;
 }
+
+#ifdef CONFIG_FINEGRAINED_THP
+static vm_fault_t arch_do_set_huge_pte(struct vm_fault *vmf, struct page *page)
+{
+       BUILD_BUG();
+       return 0;
+}
+#endif
 #endif
 
 /**
@@ -3810,12 +3943,23 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
        pte_t entry;
        vm_fault_t ret;
 
-       if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
+       if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
+                       compound_nr(compound_head(page)) == HPAGE_PMD_NR) {
                ret = do_set_pmd(vmf, page);
                if (ret != VM_FAULT_FALLBACK)
                        return ret;
        }
 
+#ifdef CONFIG_FINEGRAINED_THP
+       /* PageTransHuge cannot find hugepage if the page is not a head */
+       if (PageTransCompound(page) &&
+                       compound_nr(compound_head(page)) == HPAGE_CONT_PTE_NR) {
+               ret = arch_do_set_huge_pte(vmf, page);
+               if (ret != VM_FAULT_FALLBACK)
+                       return ret;
+       }
+#endif /* CONFIG_FINEGRAINED_THP */
+
        if (!vmf->pte) {
                ret = pte_alloc_one_map(vmf);
                if (ret)
@@ -3827,7 +3971,11 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                return VM_FAULT_NOPAGE;
        }
 
        flush_icache_page(vma, page);
        entry = mk_pte(page, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
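
alloc_set_pte() now dispatches on the size of the compound page that backs the fault: a 512-page compound page (2MB with 4KB base pages) goes through do_set_pmd(), a 16-page one (64KB) through arch_do_set_huge_pte(), and anything else is mapped with a single PTE. A small sketch of that dispatch; only the function names in the comments refer to the patch, the rest is illustrative:

#include <stdio.h>

#define HPAGE_PMD_NR		512	/* 2MB / 4KB */
#define HPAGE_CONT_PTE_NR	16	/* 64KB / 4KB */

enum map_kind { MAP_PMD, MAP_CONT_PTE, MAP_PTE };

static enum map_kind choose_mapping(unsigned int compound_pages)
{
	if (compound_pages == HPAGE_PMD_NR)
		return MAP_PMD;		/* do_set_pmd() in the patch */
	if (compound_pages == HPAGE_CONT_PTE_NR)
		return MAP_CONT_PTE;	/* arch_do_set_huge_pte() */
	return MAP_PTE;			/* ordinary per-page PTE */
}

int main(void)
{
	static const char *names[] = { "PMD (2MB)", "contiguous PTE (64KB)", "single PTE" };
	unsigned int sizes[] = { 512, 16, 1 };

	for (int i = 0; i < 3; i++)
		printf("%3u pages -> %s\n", sizes[i], names[choose_mapping(sizes[i])]);
	return 0;
}
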
@@ -4056,7 +4204,6 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
                goto uncharge_out;
        if (ret & VM_FAULT_DONE_COW)
                return ret;
-
        copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
        __SetPageUptodate(vmf->cow_page);
 
@@ -4269,10 +4416,37 @@ out:
        return 0;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static inline vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
+{
+#ifdef CONFIG_FINEGRAINED_THP
+	/*
+	 * 2MB hugepages are not allocated at page-fault time under the
+	 * fine-grained THP policy; fall back so the fault is served with
+	 * base pages (or a 64KB hugepage) and leave 2MB promotion to
+	 * khugepaged.
+	 */
+	return VM_FAULT_FALLBACK;
+#endif
+	return do_huge_pmd_anonymous_page(vmf);
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
 static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 {
        if (vma_is_anonymous(vmf->vma))
+#ifdef CONFIG_FINEGRAINED_THP
+               return __do_huge_pmd_anonymous_page(vmf);
+#else
                return do_huge_pmd_anonymous_page(vmf);
+#endif
        if (vmf->vma->vm_ops->huge_fault)
                return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
        return VM_FAULT_FALLBACK;
@@ -4299,6 +4473,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
        return VM_FAULT_FALLBACK;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+vm_fault_t wp_huge_pte(struct vm_fault *vmf, pte_t orig_pte);
+#endif /* CONFIG_FINEGRAINED_THP */
+
 static vm_fault_t create_huge_pud(struct vm_fault *vmf)
 {
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                    \
@@ -4407,8 +4585,15 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
                goto unlock;
        }
        if (vmf->flags & FAULT_FLAG_WRITE) {
-               if (!pte_write(entry))
+               if (!pte_write(entry)) {
+                       int ret = arch_do_wp_page(vmf, entry);
+
+                       if (!(ret & VM_FAULT_FALLBACK))
+                               return ret;
                        return do_wp_page(vmf);
+               }
+               if (arch_huge_pte_set_accessed(vmf, entry))
+                       goto unlock;
                entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
index ba56339..b16e340 100644 (file)
@@ -266,6 +266,16 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
                                page_dup_rmap(new, true);
                } else
 #endif
+#ifdef CONFIG_FINEGRAINED_THP
+               if (PageTransHuge(new)) {
+                       pte = pte_mkcont(pte_mkhuge(pte));
+                       arch_set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte, 0);
+                       if (PageAnon(new))
+                               page_add_anon_rmap(new, vma, pvmw.address, true);
+                       else
+                               page_dup_rmap(new, true);
+               } else
+#endif /* CONFIG_FINEGRAINED_THP */
                {
                        set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
 
index 5c8b448..02eb014 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -52,6 +52,7 @@
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
 #include <asm/mmu_context.h>
+#include <asm/finegrained_thp.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/mmap.h>
@@ -271,6 +272,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 
 success:
        populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
+       if (newbrk > oldbrk)
+               khugepaged_mem_hook(mm, origbrk, newbrk - oldbrk, __func__);
        if (downgraded)
                mmap_read_unlock(mm);
        else
@@ -1445,6 +1448,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       if ((len >> PAGE_SHIFT) >= HPAGE_CONT_PTE_NR &&
+                       file && addr == 0)
+               flags |= MAP_FILE_THP;
+#endif
+
        /* Obtain the address to map to. we verify (or select) it and ensure
         * that it represents a valid section of the address space.
         */
@@ -1867,6 +1876,12 @@ unmap_writable:
                        allow_write_access(file);
        }
        file = vma->vm_file;
+       if (file && (vm_flags & VM_DENYWRITE))
+               /* read-only file pages */
+               khugepaged_mem_hook(mm, addr, len, __func__);
+       else if (!file && !vma->vm_ops)
+               /* anonymous pages */
+               khugepaged_mem_hook(mm, addr, len, __func__);
 out:
        perf_event_mmap(vma);
 
@@ -2190,6 +2205,19 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
        info.high_limit = mmap_end;
        info.align_mask = 0;
        info.align_offset = 0;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (!addr && len >= HPAGE_PMD_SIZE) {
+               info.align_mask = HPAGE_PMD_SIZE - 1;
+               info.align_offset = HPAGE_PMD_SIZE;
+#ifdef CONFIG_FINEGRAINED_THP
+       } else if (!addr && len >= HPAGE_CONT_PTE_SIZE) {
+               info.align_mask = HPAGE_CONT_PTE_SIZE - 1;
+               info.align_offset = HPAGE_CONT_PTE_SIZE;
+#endif
+       }
+#endif
+
        return vm_unmapped_area(&info);
 }
 #endif
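
Both get_unmapped_area hunks bias the gap search so that large hint-less requests start on a 2MB boundary when the length is at least 2MB, or on a 64KB boundary when it is at least 64KB, which is what later allows the fault path and khugepaged to map them with hugepages. A rough sketch of the alignment choice and of how an aligned start could be derived from a candidate gap; in the kernel vm_unmapped_area() does this work, and the candidate address below is made up:

#include <stdio.h>

#define SZ_64K	(64UL << 10)
#define SZ_2M	(2UL << 20)

/* Alignment the patch would request for a mapping with no address hint. */
static unsigned long pick_alignment(unsigned long len)
{
	if (len >= SZ_2M)
		return SZ_2M;
	if (len >= SZ_64K)
		return SZ_64K;
	return 0;	/* no special alignment requested */
}

int main(void)
{
	unsigned long candidate = 0x7f1234567000UL;	/* hypothetical gap start */
	unsigned long lens[] = { 4096, 128UL << 10, 8UL << 20 };

	for (int i = 0; i < 3; i++) {
		unsigned long a = pick_alignment(lens[i]);
		unsigned long addr = a ? (candidate + a - 1) & ~(a - 1) : candidate;

		printf("len=%8lu -> align=%7lu, start=%#lx\n", lens[i], a, addr);
	}
	return 0;
}
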
@@ -2232,6 +2260,19 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
        info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
        info.align_mask = 0;
        info.align_offset = 0;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (!addr && len >= HPAGE_PMD_SIZE) {
+               info.align_mask = HPAGE_PMD_SIZE - 1;
+               info.align_offset = HPAGE_PMD_SIZE;
+#ifdef CONFIG_FINEGRAINED_THP
+       } else if (!addr && len >= HPAGE_CONT_PTE_SIZE) {
+               info.align_mask = HPAGE_CONT_PTE_SIZE - 1;
+               info.align_offset = HPAGE_CONT_PTE_SIZE;
+#endif
+       }
+#endif
+
        addr = vm_unmapped_area(&info);
 
        /*
index 56c02be..956745f 100644 (file)
@@ -77,6 +77,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        pte_t ptent;
                        bool preserve_write = prot_numa && pte_write(oldpte);
 
+#ifdef CONFIG_FINEGRAINED_THP
+                       if (pte_cont(oldpte)) {
+                               spin_unlock(ptl);
+                               __split_huge_pte(vma, pmd, pte, addr, false, NULL);
+                               spin_lock(ptl);
+                       }
+#endif /* CONFIG_FINEGRAINED_THP */
                        /*
                         * Avoid trapping faults against the zero or KSM
                         * pages. See similar comment in change_huge_pmd.
index 138abba..dc23cef 100644 (file)
@@ -161,6 +161,17 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                if (pte_none(*old_pte))
                        continue;
 
+#ifdef CONFIG_FINEGRAINED_THP
+               if (pte_cont(*old_pte)) {
+			/*
+			 * Contiguous ptes are about to be moved and their
+			 * alignment at the destination cannot be guaranteed,
+			 * so simply split them first.
+			 */
+                       split_huge_pte_address(vma, old_addr, false, NULL);
+               }
+#endif /* CONFIG_FINEGRAINED_THP */
+
                pte = ptep_get_and_clear(mm, old_addr, old_pte);
                /*
                 * If we are remapping a valid PTE, make sure
index 6657000..64de8c1 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1138,7 +1138,16 @@ void do_page_add_anon_rmap(struct page *page,
                 * disabled.
                 */
                if (compound)
+#ifdef CONFIG_FINEGRAINED_THP
+               {
+                       if (nr == HPAGE_PMD_NR)
+                               __inc_lruvec_page_state(page, NR_ANON_THPS);
+                       else
+                               __inc_lruvec_page_state(page, NR_ANON_64KB_THPS);
+               }
+#else /* CONFIG_FINEGRAINED_THP */
                        __inc_lruvec_page_state(page, NR_ANON_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
                __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
        }
 
@@ -1179,8 +1188,14 @@ void page_add_new_anon_rmap(struct page *page,
                atomic_set(compound_mapcount_ptr(page), 0);
                if (hpage_pincount_available(page))
                        atomic_set(compound_pincount_ptr(page), 0);
-
+#ifdef CONFIG_FINEGRAINED_THP
+               if (nr == HPAGE_PMD_NR)
+                       __inc_lruvec_page_state(page, NR_ANON_THPS);
+               else
+                       __inc_lruvec_page_state(page, NR_ANON_64KB_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
                __inc_lruvec_page_state(page, NR_ANON_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
        } else {
                /* Anon THP always mapped first with PMD */
                VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -1212,9 +1227,19 @@ void page_add_file_rmap(struct page *page, bool compound)
                if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
                        goto out;
                if (PageSwapBacked(page))
+#ifdef CONFIG_FINEGRAINED_THP
+                       __inc_node_page_state(page, nr == HPAGE_PMD_NR ?
+                                       NR_SHMEM_PMDMAPPED : NR_SHMEM_PTEMAPPED);
+#else
                        __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+#endif
                else
+#ifdef CONFIG_FINEGRAINED_THP
+                       __inc_node_page_state(page, nr == HPAGE_PMD_NR ?
+                                       NR_FILE_PMDMAPPED : NR_FILE_PTEMAPPED);
+#else
                        __inc_node_page_state(page, NR_FILE_PMDMAPPED);
+#endif
        } else {
                if (PageTransCompound(page) && page_mapping(page)) {
                        VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1253,9 +1278,19 @@ static void page_remove_file_rmap(struct page *page, bool compound)
                if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
                        return;
                if (PageSwapBacked(page))
+#ifdef CONFIG_FINEGRAINED_THP
+                       __dec_node_page_state(page, nr == HPAGE_PMD_NR ?
+                                       NR_SHMEM_PMDMAPPED : NR_SHMEM_PTEMAPPED);
+#else
                        __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+#endif
                else
+#ifdef CONFIG_FINEGRAINED_THP
+                       __dec_node_page_state(page, nr == HPAGE_PMD_NR ?
+                                       NR_FILE_PMDMAPPED : NR_FILE_PTEMAPPED);
+#else
                        __dec_node_page_state(page, NR_FILE_PMDMAPPED);
+#endif
        } else {
                if (!atomic_add_negative(-1, &page->_mapcount))
                        return;
@@ -1286,7 +1321,14 @@ static void page_remove_anon_compound_rmap(struct page *page)
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       if (thp_nr_pages(page) == HPAGE_PMD_NR)
+               __dec_lruvec_page_state(page, NR_ANON_THPS);
+       else
+               __dec_lruvec_page_state(page, NR_ANON_64KB_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
        __dec_lruvec_page_state(page, NR_ANON_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
 
        if (TestClearPageDoubleMap(page)) {
                /*
@@ -1348,8 +1390,12 @@ void page_remove_rmap(struct page *page, bool compound)
         */
        __dec_lruvec_page_state(page, NR_ANON_MAPPED);
 
-       if (unlikely(PageMlocked(page)))
-               clear_page_mlock(page);
+       if (unlikely(PageMlocked(page))) {
+               if (unlikely(PageTransCompound(page)))
+                       clear_page_mlock(compound_head(page));
+               else
+                       clear_page_mlock(page);
+       }
 
        if (PageTransCompound(page))
                deferred_split_huge_page(compound_head(page));
@@ -1398,6 +1444,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                flags & TTU_SPLIT_FREEZE, page);
        }
 
+#ifdef CONFIG_FINEGRAINED_THP
+       if (flags & TTU_SPLIT_HUGE_PTE)
+               split_huge_pte_address(vma, address,
+                               flags & TTU_SPLIT_FREEZE, page);
+#endif
+
        /*
         * For THP, we have to assume the worse case ie pmd for invalidation.
         * For hugetlb, it could be much worse if we need to do pud
@@ -1466,6 +1518,33 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                         * do this outside rmap routines.
                         */
                        VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+#ifdef CONFIG_FINEGRAINED_THP
+                       if (thp_nr_pages(page) == HPAGE_PMD_NR &&
+                                       huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+                               /*
+                                * huge_pmd_unshare unmapped an entire PMD
+                                * page.  There is no way of knowing exactly
+                                * which PMDs may be cached for this mm, so
+                                * we must flush them all.  start/end were
+                                * already adjusted above to cover this range.
+                                */
+                               flush_cache_range(vma, range.start, range.end);
+                               flush_tlb_range(vma, range.start, range.end);
+                               mmu_notifier_invalidate_range(mm, range.start,
+                                                             range.end);
+                               /*
+                                * The ref count of the PMD page was dropped
+                                * which is part of the way map counting
+                                * is done for shared PMDs.  Return 'true'
+                                * here.  When there is no other sharing,
+                                * huge_pmd_unshare returns false and we will
+                                * unmap the actual page and drop map count
+                                * to zero.
+                                */
+                               page_vma_mapped_walk_done(&pvmw);
+                               break;
+                       }
+#else /* CONFIG_FINEGRAINED_THP */
                        if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
                                /*
                                 * huge_pmd_unshare unmapped an entire PMD
@@ -1491,6 +1570,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
+#endif /* CONFIG_FINEGRAINED_THP */
                }
 
                if (IS_ENABLED(CONFIG_MIGRATION) &&
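
The rmap hunks above split anonymous THP accounting by compound size: a PMD-sized page still decrements NR_ANON_THPS, while anything smaller is counted against the new NR_ANON_64KB_THPS. A minimal standalone sketch of that decision follows (plain C compiled outside the kernel; HPAGE_PMD_NR and HPAGE_CONT_PTE_NR are hard-coded to 512 and 16 on the assumption of 4KB base pages, and the enum only mimics the counter names):

#include <stdio.h>

/* Assumed 4KB base pages: a 2MB PMD hugepage spans 512 of them,
 * a 64KB contiguous-PTE hugepage spans 16. */
#define HPAGE_PMD_NR		512
#define HPAGE_CONT_PTE_NR	16

enum anon_thp_counter { NR_ANON_THPS, NR_ANON_64KB_THPS };

/* Mirrors the accounting decision in the rmap hunk: the counter is
 * chosen purely from the number of base pages in the compound page. */
static enum anon_thp_counter pick_counter(int nr_pages)
{
	return nr_pages == HPAGE_PMD_NR ? NR_ANON_THPS : NR_ANON_64KB_THPS;
}

int main(void)
{
	printf("2MB THP  -> %s\n", pick_counter(HPAGE_PMD_NR) == NR_ANON_THPS
	       ? "NR_ANON_THPS" : "NR_ANON_64KB_THPS");
	printf("64KB THP -> %s\n", pick_counter(HPAGE_CONT_PTE_NR) == NR_ANON_THPS
	       ? "NR_ANON_THPS" : "NR_ANON_64KB_THPS");
	return 0;
}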
diff --git a/mm/shmem.c b/mm/shmem.c
index 537c137..01c9b74 100644 (file)
@@ -884,9 +884,15 @@ static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
                return true;
 
        /* Just proceed to delete a huge page wholly within the range punched */
+#ifdef CONFIG_FINEGRAINED_THP
+       if (PageHead(page) &&
+           page->index >= start && page->index + thp_nr_pages(page) <= end)
+               return true;
+#else
        if (PageHead(page) &&
            page->index >= start && page->index + HPAGE_PMD_NR <= end)
                return true;
+#endif /* CONFIG_FINEGRAINED_THP */
 
        /* Try to split huge page, so we can truly punch the hole or truncate */
        return split_huge_page(page) >= 0;
@@ -1035,9 +1041,15 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                                        clear_highpage(page);
                                        flush_dcache_page(page);
                                        set_page_dirty(page);
+#ifdef CONFIG_FINEGRAINED_THP
+                                       if (index <
+                                           round_up(start, thp_nr_pages(page)))
+                                               start = index + 1;
+#else /* CONFIG_FINEGRAINED_THP */
                                        if (index <
                                            round_up(start, HPAGE_PMD_NR))
                                                start = index + 1;
+#endif /* CONFIG_FINEGRAINED_THP */
                                }
                        }
                        unlock_page(page);
@@ -1531,22 +1543,40 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
        return page;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static struct page *shmem_alloc_hugepage(gfp_t gfp,
+               struct shmem_inode_info *info, pgoff_t index, int page_nr)
+#else /* CONFIG_FINEGRAINED_THP */
 static struct page *shmem_alloc_hugepage(gfp_t gfp,
                struct shmem_inode_info *info, pgoff_t index)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        struct vm_area_struct pvma;
        struct address_space *mapping = info->vfs_inode.i_mapping;
        pgoff_t hindex;
        struct page *page;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       hindex = round_down(index, page_nr);
+       if (xa_find(&mapping->i_pages, &hindex, hindex + page_nr - 1,
+                                                               XA_PRESENT))
+               return NULL;
+#else /* CONFIG_FINEGRAINED_THP */
        hindex = round_down(index, HPAGE_PMD_NR);
        if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
                                                                XA_PRESENT))
                return NULL;
+#endif /* CONFIG_FINEGRAINED_THP */
 
        shmem_pseudo_vma_init(&pvma, info, hindex);
+#ifdef CONFIG_FINEGRAINED_THP
+       page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
+                       page_nr == HPAGE_PMD_NR ? HPAGE_PMD_ORDER : HPAGE_CONT_PTE_ORDER,
+                       &pvma, 0, numa_node_id(), true);
+#else /* CONFIG_FINEGRAINED_THP */
        page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
                        HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+#endif /* CONFIG_FINEGRAINED_THP */
        shmem_pseudo_vma_destroy(&pvma);
        if (page)
                prep_transhuge_page(page);
@@ -1568,9 +1598,15 @@ static struct page *shmem_alloc_page(gfp_t gfp,
        return page;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
+               struct inode *inode,
+               pgoff_t index, bool huge, int page_nr)
+#else /* CONFIG_FINEGRAINED_THP */
 static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
                struct inode *inode,
                pgoff_t index, bool huge)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct page *page;
@@ -1579,13 +1615,21 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
 
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                huge = false;
+#ifdef CONFIG_FINEGRAINED_THP
+       nr = huge ? page_nr : 1;
+#else
        nr = huge ? HPAGE_PMD_NR : 1;
+#endif
 
        if (!shmem_inode_acct_block(inode, nr))
                goto failed;
 
        if (huge)
+#ifdef CONFIG_FINEGRAINED_THP
+               page = shmem_alloc_hugepage(gfp, info, index, nr);
+#else
                page = shmem_alloc_hugepage(gfp, info, index);
+#endif
        else
                page = shmem_alloc_page(gfp, info, index);
        if (page) {
@@ -1805,6 +1849,9 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
        int error;
        int once = 0;
        int alloced = 0;
+#ifdef CONFIG_FINEGRAINED_THP
+       int nr_pages = HPAGE_PMD_NR;
+#endif
 
        if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
                return -EFBIG;
@@ -1835,6 +1882,11 @@ repeat:
        if (page && sgp == SGP_WRITE)
                mark_page_accessed(page);
 
+#ifdef CONFIG_FINEGRAINED_THP
+       if (page)
+               nr_pages = thp_nr_pages(page);
+#endif
+
        /* fallocated page? */
        if (page && !PageUptodate(page)) {
                if (sgp != SGP_READ)
@@ -1870,12 +1922,21 @@ repeat:
        case SHMEM_HUGE_WITHIN_SIZE: {
                loff_t i_size;
                pgoff_t off;
-
+#ifdef CONFIG_FINEGRAINED_THP
+               off = round_up(index, nr_pages);
+#else
                off = round_up(index, HPAGE_PMD_NR);
+#endif
                i_size = round_up(i_size_read(inode), PAGE_SIZE);
+#ifdef CONFIG_FINEGRAINED_THP
+               if (i_size >= nr_pages * PAGE_SIZE &&
+                   i_size >> PAGE_SHIFT >= off)
+                       goto alloc_huge;
+#else
                if (i_size >= HPAGE_PMD_SIZE &&
                    i_size >> PAGE_SHIFT >= off)
                        goto alloc_huge;
+#endif
 
                fallthrough;
        }
@@ -1887,11 +1948,20 @@ repeat:
        }
 
 alloc_huge:
+#ifdef CONFIG_FINEGRAINED_THP
+       page = shmem_alloc_and_acct_page(gfp, inode, index, true, nr_pages);
+#else
        page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+#endif
        if (IS_ERR(page)) {
 alloc_nohuge:
+#ifdef CONFIG_FINEGRAINED_THP
+               page = shmem_alloc_and_acct_page(gfp, inode,
+                                                index, false, 1);
+#else
                page = shmem_alloc_and_acct_page(gfp, inode,
                                                 index, false);
+#endif
        }
        if (IS_ERR(page)) {
                int retry = 5;
@@ -1917,7 +1987,11 @@ alloc_nohuge:
        }
 
        if (PageTransHuge(page))
+#ifdef CONFIG_FINEGRAINED_THP
+               hindex = round_down(index, nr_pages);
+#else
                hindex = round_down(index, HPAGE_PMD_NR);
+#endif
        else
                hindex = index;
 
@@ -1938,6 +2012,27 @@ alloc_nohuge:
        spin_unlock_irq(&info->lock);
        alloced = true;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       if (PageTransHuge(page) &&
+           DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
+                       hindex + nr_pages - 1) {
+               /*
+                * Part of the huge page is beyond i_size: subject
+                * to shrink under memory pressure.
+                */
+               spin_lock(&sbinfo->shrinklist_lock);
+               /*
+                * _careful to defend against unlocked access to
+                * ->shrink_list in shmem_unused_huge_shrink()
+                */
+               if (list_empty_careful(&info->shrinklist)) {
+                       list_add_tail(&info->shrinklist,
+                                     &sbinfo->shrinklist);
+                       sbinfo->shrinklist_len++;
+               }
+               spin_unlock(&sbinfo->shrinklist_lock);
+       }
+#else /* CONFIG_FINEGRAINED_THP */
        if (PageTransHuge(page) &&
            DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
                        hindex + HPAGE_PMD_NR - 1) {
@@ -1957,7 +2052,7 @@ alloc_nohuge:
                }
                spin_unlock(&sbinfo->shrinklist_lock);
        }
-
+#endif /* CONFIG_FINEGRAINED_THP */
        /*
         * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
         */
@@ -2547,13 +2642,21 @@ shmem_write_end(struct file *file, struct address_space *mapping,
                struct page *head = compound_head(page);
                if (PageTransCompound(page)) {
                        int i;
-
+#ifdef CONFIG_FINEGRAINED_THP
+                       for (i = 0; i < thp_nr_pages(page); i++) {
+                               if (head + i == page)
+                                       continue;
+                               clear_highpage(head + i);
+                               flush_dcache_page(head + i);
+                       }
+#else /* CONFIG_FINEGRAINED_THP */
                        for (i = 0; i < HPAGE_PMD_NR; i++) {
                                if (head + i == page)
                                        continue;
                                clear_highpage(head + i);
                                flush_dcache_page(head + i);
                        }
+#endif /* CONFIG_FINEGRAINED_THP */
                }
                if (copied < PAGE_SIZE) {
                        unsigned from = pos & (PAGE_SIZE - 1);
@@ -4102,6 +4205,12 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
                        if (i_size >= HPAGE_PMD_SIZE &&
                                        i_size >> PAGE_SHIFT >= off)
                                return true;
+#ifdef CONFIG_FINEGRAINED_THP
+                       off = round_up(vma->vm_pgoff, HPAGE_CONT_PTE_NR);
+                       if (i_size >= HPAGE_CONT_PTE_SIZE &&
+                                       i_size >> PAGE_SHIFT >= off)
+                               return true;
+#endif /* CONFIG_FINEGRAINED_THP */
                        fallthrough;
                case SHMEM_HUGE_ADVISE:
                        /* TODO: implement fadvise() hints */
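
The shmem changes thread a page_nr argument down to shmem_alloc_hugepage() so that both the allocation order and the head index follow the requested hugepage size instead of a fixed HPAGE_PMD_NR. The sketch below models just that selection; HPAGE_PMD_ORDER = 9 and HPAGE_CONT_PTE_ORDER = 4 are the values one would expect with 4KB base pages, but they are assumptions here, not taken from the patch:

#include <stdio.h>

/* Assumed values for 4KB base pages; the patch defines the real ones
 * per-architecture, so treat these as illustrative only. */
#define HPAGE_PMD_NR		512
#define HPAGE_PMD_ORDER		9
#define HPAGE_CONT_PTE_NR	16
#define HPAGE_CONT_PTE_ORDER	4

/* Order handed to the allocator: the caller of shmem_alloc_hugepage()
 * asks for page_nr base pages and gets the matching buddy order. */
static int shmem_huge_order(int page_nr)
{
	return page_nr == HPAGE_PMD_NR ? HPAGE_PMD_ORDER : HPAGE_CONT_PTE_ORDER;
}

/* Head index: the faulting index rounded down to the hugepage size. */
static unsigned long shmem_huge_hindex(unsigned long index, int page_nr)
{
	return index - (index % page_nr);
}

int main(void)
{
	printf("page_nr=%d: order=%d, hindex(1000)=%lu\n", HPAGE_PMD_NR,
	       shmem_huge_order(HPAGE_PMD_NR), shmem_huge_hindex(1000, HPAGE_PMD_NR));
	printf("page_nr=%d:  order=%d, hindex(1000)=%lu\n", HPAGE_CONT_PTE_NR,
	       shmem_huge_order(HPAGE_CONT_PTE_NR), shmem_huge_hindex(1000, HPAGE_CONT_PTE_NR));
	return 0;
}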
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0357fbe..fed073f 100644 (file)
@@ -312,7 +312,11 @@ swp_entry_t get_swap_page(struct page *page)
 
        if (PageTransHuge(page)) {
                if (IS_ENABLED(CONFIG_THP_SWAP))
+#ifdef CONFIG_FINEGRAINED_THP
+                       get_swap_pages(1, &entry, thp_nr_pages(page));
+#else
                        get_swap_pages(1, &entry, HPAGE_PMD_NR);
+#endif
                goto out;
        }
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5256c10..310e06a 100644 (file)
@@ -1673,7 +1673,12 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
        }
        if (map)
                ci = lock_cluster(si, offset);
-       for (i = 0; i < HPAGE_PMD_NR; i++) {
+#ifdef CONFIG_FINEGRAINED_THP
+       for (i = 0; i < thp_nr_pages(page); i++)
+#else
+       for (i = 0; i < HPAGE_PMD_NR; i++)
+#endif
+       {
                mapcount = atomic_read(&page[i]._mapcount) + 1;
                _total_mapcount += mapcount;
                if (map) {
@@ -1685,7 +1690,11 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
        unlock_cluster(ci);
        if (PageDoubleMap(page)) {
                map_swapcount -= 1;
+#ifdef CONFIG_FINEGRAINED_THP
+               _total_mapcount -= thp_nr_pages(page);
+#else
                _total_mapcount -= HPAGE_PMD_NR;
+#endif
        }
        mapcount = compound_mapcount(page);
        map_swapcount += mapcount;
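
The swapfile hunk only replaces the fixed HPAGE_PMD_NR bound with the compound page's real size, both in the per-subpage loop and in the PageDoubleMap correction. A stripped-down arithmetic model of that sum (made-up mapcount data, no kernel API; _mapcount is stored biased by -1 in the kernel, hence the "+ 1"):

#include <stdio.h>

/* Arithmetic-only model of the patched loop: sum per-subpage mapcounts
 * over the compound page's real size, then apply the PageDoubleMap
 * correction with that same size rather than a fixed 512. */
static int total_mapcount_model(const int *raw_mapcount, int nr_pages,
				int double_mapped)
{
	int i, total = 0;

	for (i = 0; i < nr_pages; i++)
		total += raw_mapcount[i] + 1;
	if (double_mapped)
		total -= nr_pages;
	return total;
}

int main(void)
{
	int raw_64kb[16] = { 0 };	/* each subpage mapped exactly once */

	printf("64KB THP, not double mapped: %d\n",
	       total_mapcount_model(raw_64kb, 16, 0));
	printf("64KB THP, double mapped:     %d\n",
	       total_mapcount_model(raw_64kb, 16, 1));
	return 0;
}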
diff --git a/mm/truncate.c b/mm/truncate.c
index 960edf5..c981ef5 100644 (file)
@@ -566,8 +566,13 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
                                unlock_page(page);
                                continue;
                        } else if (PageTransHuge(page)) {
+#ifdef CONFIG_FINEGRAINED_THP
+                               index += thp_nr_pages(page) - 1;
+                               i += thp_nr_pages(page) - 1;
+#else /* CONFIG_FINEGRAINED_THP */
                                index += HPAGE_PMD_NR - 1;
                                i += HPAGE_PMD_NR - 1;
+#endif /* CONFIG_FINEGRAINED_THP */
                                /*
                                 * 'end' is in the middle of THP. Don't
                                 * invalidate the page as the part outside of
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 67d3833..5c23848 100644 (file)
@@ -1302,7 +1302,16 @@ static unsigned int shrink_page_list(struct list_head *page_list,
                        bool was_swapbacked = PageSwapBacked(page);
 
                        if (unlikely(PageTransHuge(page)))
+#ifdef CONFIG_FINEGRAINED_THP
+                       {
+                               if (nr_pages == HPAGE_PMD_NR)
+                                       flags |= TTU_SPLIT_HUGE_PMD;
+                               else
+                                       flags |= TTU_SPLIT_HUGE_PTE;
+                       }
+#else /* CONFIG_FINEGRAINED_THP */
                                flags |= TTU_SPLIT_HUGE_PMD;
+#endif /* CONFIG_FINEGRAINED_THP */
 
                        if (!try_to_unmap(page, flags)) {
                                stat->nr_unmap_fail += nr_pages;
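
In shrink_page_list() the patch now picks the unmap-split flag from the THP's size: PMD-sized pages keep TTU_SPLIT_HUGE_PMD, 64KB pages get the new TTU_SPLIT_HUGE_PTE. A standalone sketch of that choice (the flag bit values below are placeholders, not the kernel's definitions):

#include <stdio.h>

#define HPAGE_PMD_NR		512
/* Placeholder bit values; the kernel defines the real TTU_* flags. */
#define TTU_SPLIT_HUGE_PMD	(1u << 0)
#define TTU_SPLIT_HUGE_PTE	(1u << 1)

/* Mirrors the reclaim-path decision above: a PMD-sized THP is split at
 * PMD level, a smaller (64KB) THP at contiguous-PTE level. */
static unsigned int reclaim_split_flag(int nr_pages)
{
	return nr_pages == HPAGE_PMD_NR ? TTU_SPLIT_HUGE_PMD : TTU_SPLIT_HUGE_PTE;
}

int main(void)
{
	printf("512-page THP -> %s\n", reclaim_split_flag(512) == TTU_SPLIT_HUGE_PMD
	       ? "TTU_SPLIT_HUGE_PMD" : "TTU_SPLIT_HUGE_PTE");
	printf("16-page THP  -> %s\n", reclaim_split_flag(16) == TTU_SPLIT_HUGE_PMD
	       ? "TTU_SPLIT_HUGE_PMD" : "TTU_SPLIT_HUGE_PTE");
	return 0;
}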
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 2cf6681..42f5ef2 100644 (file)
@@ -1201,9 +1201,19 @@ const char * const vmstat_text[] = {
        "nr_shmem",
        "nr_shmem_hugepages",
        "nr_shmem_pmdmapped",
+#ifdef CONFIG_FINEGRAINED_THP
+       "nr_shmem_ptemapped",
+       "nr_file_64kb_hugepages",
+#endif
        "nr_file_hugepages",
        "nr_file_pmdmapped",
+#ifdef CONFIG_FINEGRAINED_THP
+       "nr_file_ptemapped",
+#endif
        "nr_anon_transparent_hugepages",
+#ifdef CONFIG_FINEGRAINED_THP
+       "nr_anon_64KB_transparent_hugepages",
+#endif
        "nr_vmscan_write",
        "nr_vmscan_immediate_reclaim",
        "nr_dirtied",
@@ -1323,6 +1333,9 @@ const char * const vmstat_text[] = {
        "thp_split_page",
        "thp_split_page_failed",
        "thp_deferred_split_page",
+#ifdef CONFIG_FINEGRAINED_THP
+       "thp_split_cont_pte",
+#endif
        "thp_split_pmd",
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
        "thp_split_pud",