diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4e3dff1..f88ad1b 100644
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
+#ifdef CONFIG_FINEGRAINED_THP
+#include <asm/finegrained_thp.h>
+#include <asm/huge_mm.h>
+#else
+#include <asm-generic/finegrained_thp.h>
+#include <asm-generic/huge_mm.h>
+#endif
 #include "internal.h"
 
 enum scan_result {
@@ -78,6 +85,32 @@ static unsigned int khugepaged_max_ptes_none __read_mostly;
 static unsigned int khugepaged_max_ptes_swap __read_mostly;
 static unsigned int khugepaged_max_ptes_shared __read_mostly;
 
+#ifdef CONFIG_FINEGRAINED_THP
+/*
+ * thp_scan_hint:
+ * used to tell khugepaged which address space
+ * has changed recently and should be scanned.
+ */
+struct thp_scan_hint {
+       struct mm_slot *slot;
+       struct vm_area_struct *vma;
+       unsigned long diff;             /* accumulated size change, in bytes */
+       unsigned long jiffies;          /* time stamp for profiling purposes */
+       struct list_head hint_list;
+};
+
+/* THP type descriptor */
+enum {
+       THP_TYPE_FAIL,  /* cannot make hugepage */
+       THP_TYPE_64KB,  /* 64KB hugepage can be made, use CONT_PTE */
+       THP_TYPE_2MB,   /* 2MB hugepage can be made, use PMD */
+};
+
+static unsigned int khugepaged_max_ptes_none_64kb __read_mostly;
+static unsigned int khugepaged_max_ptes_swap_64kb __read_mostly;
+static unsigned int khugepaged_max_ptes_shared_64kb __read_mostly;
+#endif /* CONFIG_FINEGRAINED_THP */
+
 #define MM_SLOTS_HASH_BITS 10
 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
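The hint machinery introduced above is easier to see in isolation. Below is a stand-alone, user-space sketch (not kernel code) of the bookkeeping that khugepaged_mem_hook() performs later in this patch: address-space growth is accumulated per mm, and the scanner is woken once at least one 64KB unit has built up. The sketch_* names, the simplified list handling and the 64KB constant are illustrative assumptions; the real code uses mm_slot, khugepaged_mm_lock and khugepaged_wait.

/* Stand-alone illustration only; simplified types, no locking. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define SKETCH_64KB (64UL * 1024)

struct sketch_hint {
        int mm_id;                  /* stands in for struct mm_slot * */
        unsigned long diff;         /* accumulated growth in bytes */
        struct sketch_hint *next;
};

static struct sketch_hint *hint_list;

static struct sketch_hint *find_hint(int mm_id)
{
        struct sketch_hint *h;

        for (h = hint_list; h; h = h->next)
                if (h->mm_id == mm_id)
                        return h;
        return NULL;
}

/* Shape of khugepaged_mem_hook(): record growth, report whether to wake. */
static bool mem_hook(int mm_id, long diff)
{
        struct sketch_hint *h = find_hint(mm_id);

        if (!h) {
                h = calloc(1, sizeof(*h));
                if (!h)
                        return false;
                h->mm_id = mm_id;
                h->next = hint_list;
                hint_list = h;
        }
        h->diff += diff;
        return h->diff >= SKETCH_64KB;  /* enough growth for one 64KB THP */
}

int main(void)
{
        printf("wake after 32KB: %d\n", mem_hook(1, 32 * 1024)); /* 0 */
        printf("wake after 64KB: %d\n", mem_hook(1, 32 * 1024)); /* 1 */
        return 0;
}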
 
@@ -113,10 +146,18 @@ struct khugepaged_scan {
        struct list_head mm_head;
        struct mm_slot *mm_slot;
        unsigned long address;
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_type;
+       int nr_hint;
+       struct list_head hint_list;
+#endif /* CONFIG_FINEGRAINED_THP */
 };
 
 static struct khugepaged_scan khugepaged_scan = {
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+#ifdef CONFIG_FINEGRAINED_THP
+       .hint_list = LIST_HEAD_INIT(khugepaged_scan.hint_list),
+#endif
 };
 
 #ifdef CONFIG_SYSFS
@@ -394,6 +435,11 @@ int __init khugepaged_init(void)
        khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
        khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       khugepaged_max_ptes_none_64kb = HPAGE_CONT_PTE_NR - 1;
+       khugepaged_max_ptes_swap_64kb = HPAGE_CONT_PTE_NR / 8;
+       khugepaged_max_ptes_shared_64kb = HPAGE_CONT_PTE_NR / 2;
+#endif
        return 0;
 }
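For reference, assuming 4KB base pages (the usual geometry for contiguous-PTE hugepages, giving HPAGE_PMD_NR == 512 and HPAGE_CONT_PTE_NR == 16), the 64KB defaults initialised above work out to max_ptes_none_64kb = 16 - 1 = 15, max_ptes_swap_64kb = 16 / 8 = 2 and max_ptes_shared_64kb = 16 / 2 = 8, i.e. the same ratios as the PMD-sized defaults of 511, 64 and 256.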
 
@@ -437,21 +483,42 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
        return atomic_read(&mm->mm_users) == 0;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static void clear_hint_list(struct mm_slot *slot);
+#endif /* CONFIG_FINEGRAINED_THP */
+
 static bool hugepage_vma_check(struct vm_area_struct *vma,
                               unsigned long vm_flags)
 {
-       if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
-           (vm_flags & VM_NOHUGEPAGE) ||
-           test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+       if (!transhuge_vma_enabled(vma, vm_flags))
+               return false;
+
+       if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
+                               vma->vm_pgoff, HPAGE_PMD_NR))
                return false;
 
-       if (shmem_file(vma->vm_file) ||
-           (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
-            vma->vm_file &&
-            (vm_flags & VM_DENYWRITE))) {
-               return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
-                               HPAGE_PMD_NR);
+       /* Check whether arch-dependent shmem hugepages are available */
+       if (arch_hugepage_vma_shmem_check(vma, vm_flags))
+               return true;
+       /* Enabled via shmem mount options or sysfs settings. */
+       if (shmem_file(vma->vm_file))
+               return shmem_huge_enabled(vma);
+
+       /* THP settings require madvise. */
+       if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+               return false;
+
+       /* Check whether arch-dependent file hugepages are available */
+       if (arch_hugepage_vma_file_check(vma, vm_flags))
+               return true;
+       /* Only regular files are valid */
+       else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+           (vm_flags & VM_DENYWRITE)) {
+               struct inode *inode = vma->vm_file->f_inode;
+
+               return S_ISREG(inode->i_mode);
        }
+
        if (!vma->anon_vma || vma->vm_ops)
                return false;
        if (vma_is_temporary_stack(vma))
@@ -509,6 +576,12 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
        hend = vma->vm_end & HPAGE_PMD_MASK;
        if (hstart < hend)
                return khugepaged_enter(vma, vm_flags);
+#ifdef CONFIG_FINEGRAINED_THP
+       hstart = (vma->vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK;
+       hend = vma->vm_end & HPAGE_CONT_PTE_MASK;
+       if (hstart < hend)
+               return khugepaged_enter(vma, vm_flags);
+#endif /* CONFIG_FINEGRAINED_THP */
        return 0;
 }
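A quick worked example of the file-alignment test added to hugepage_vma_check() above, as a stand-alone sketch assuming 4KB base pages (PAGE_SHIFT == 12, HPAGE_PMD_NR == 512): a file-backed VMA is only a collapse candidate when its virtual start address and its file offset are congruent modulo 2MB.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT      12              /* assumes 4KB base pages */
#define HPAGE_PMD_NR    512             /* 2MB / 4KB */

/* Same test as in hugepage_vma_check(): vm_start and the file offset
 * (vm_pgoff, counted in pages) must be congruent modulo the hugepage size.
 */
static bool pmd_aligned(unsigned long vm_start, unsigned long vm_pgoff)
{
        return ((vm_start >> PAGE_SHIFT) - vm_pgoff) % HPAGE_PMD_NR == 0;
}

int main(void)
{
        /* mapped at 4MB, file offset 0: both 2MB-aligned, eligible */
        printf("%d\n", pmd_aligned(0x400000, 0));       /* 1 */
        /* mapped at 4MB, file offset 1MB (256 pages): not congruent */
        printf("%d\n", pmd_aligned(0x400000, 256));     /* 0 */
        return 0;
}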
 
@@ -520,6 +593,9 @@ void __khugepaged_exit(struct mm_struct *mm)
        spin_lock(&khugepaged_mm_lock);
        mm_slot = get_mm_slot(mm);
        if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+#ifdef CONFIG_FINEGRAINED_THP
+               clear_hint_list(mm_slot);
+#endif
                hash_del(&mm_slot->hash);
                list_del(&mm_slot->mm_node);
                free = 1;
@@ -584,23 +660,56 @@ static bool is_refcount_suitable(struct page *page)
        return page_count(page) == expected_refcount;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+                                       unsigned long address,
+                                       pte_t *pte,
+                                       struct list_head *compound_pagelist,
+                                       int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                                        unsigned long address,
                                        pte_t *pte,
                                        struct list_head *compound_pagelist)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        struct page *page = NULL;
        pte_t *_pte;
        int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
        bool writable = false;
+#ifdef CONFIG_FINEGRAINED_THP
+       int max_ptes_shared, max_ptes_none;
+       int hpage_nr;
+
+       if (hpage_type == THP_TYPE_64KB) {
+               hpage_nr = HPAGE_CONT_PTE_NR;
+               max_ptes_shared = khugepaged_max_ptes_shared_64kb;
+               max_ptes_none = khugepaged_max_ptes_none_64kb;
+       } else {
+               hpage_nr = HPAGE_PMD_NR;
+               max_ptes_shared = khugepaged_max_ptes_shared;
+               max_ptes_none = khugepaged_max_ptes_none;
+       }
+#endif /* CONFIG_FINEGRAINED_THP */
 
-       for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+       for (_pte = pte;
+#ifdef CONFIG_FINEGRAINED_THP
+               _pte < pte + hpage_nr;
+#else
+               _pte < pte+HPAGE_PMD_NR;
+#endif
             _pte++, address += PAGE_SIZE) {
                pte_t pteval = *_pte;
                if (pte_none(pteval) || (pte_present(pteval) &&
                                is_zero_pfn(pte_pfn(pteval)))) {
+#ifdef CONFIG_FINEGRAINED_THP
                        if (!userfaultfd_armed(vma) &&
-                           ++none_or_zero <= khugepaged_max_ptes_none) {
+                           ++none_or_zero <= max_ptes_none)
+#else /* CONFIG_FINEGRAINED_THP */
+                       if (!userfaultfd_armed(vma) &&
+                           ++none_or_zero <= khugepaged_max_ptes_none)
+#endif /* CONFIG_FINEGRAINED_THP */
+                       {
                                continue;
                        } else {
                                result = SCAN_EXCEED_NONE_PTE;
@@ -619,8 +728,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
                VM_BUG_ON_PAGE(!PageAnon(page), page);
 
+#ifdef CONFIG_FINEGRAINED_THP
+               if (page_mapcount(page) > 1 &&
+                               ++shared > max_ptes_shared)
+#else /* CONFIG_FINEGRAINED_THP */
                if (page_mapcount(page) > 1 &&
-                               ++shared > khugepaged_max_ptes_shared) {
+                               ++shared > khugepaged_max_ptes_shared)
+#endif /* CONFIG_FINEGRAINED_THP */
+               {
                        result = SCAN_EXCEED_SHARED_PTE;
                        goto out;
                }
@@ -704,17 +819,17 @@ next:
                if (pte_write(pteval))
                        writable = true;
        }
-       if (likely(writable)) {
-               if (likely(referenced)) {
-                       result = SCAN_SUCCEED;
-                       trace_mm_collapse_huge_page_isolate(page, none_or_zero,
-                                                           referenced, writable, result);
-                       return 1;
-               }
-       } else {
+
+       if (unlikely(!writable)) {
                result = SCAN_PAGE_RO;
+       } else if (unlikely(!referenced)) {
+               result = SCAN_LACK_REFERENCED_PAGE;
+       } else {
+               result = SCAN_SUCCEED;
+               trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+                                                   referenced, writable, result);
+               return 1;
        }
-
 out:
        release_pte_pages(pte, _pte, compound_pagelist);
        trace_mm_collapse_huge_page_isolate(page, none_or_zero,
@@ -722,15 +837,34 @@ out:
        return 0;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+                                     struct vm_area_struct *vma,
+                                     unsigned long address,
+                                     spinlock_t *ptl,
+                                     struct list_head *compound_pagelist,
+                                     int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
                                      struct vm_area_struct *vma,
                                      unsigned long address,
                                      spinlock_t *ptl,
                                      struct list_head *compound_pagelist)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        struct page *src_page, *tmp;
        pte_t *_pte;
-       for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_nr = (hpage_type == THP_TYPE_64KB ?
+                                       HPAGE_CONT_PTE_NR : HPAGE_PMD_NR);
+#endif
+
+       for (_pte = pte;
+#ifdef CONFIG_FINEGRAINED_THP
+                               _pte < pte + hpage_nr;
+#else
+                               _pte < pte + HPAGE_PMD_NR;
+#endif
                                _pte++, page++, address += PAGE_SIZE) {
                pte_t pteval = *_pte;
 
@@ -884,12 +1018,21 @@ static int khugepaged_find_target_node(void)
        return 0;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static inline struct page *alloc_khugepaged_hugepage(int hpage_order)
+#else
 static inline struct page *alloc_khugepaged_hugepage(void)
+#endif
 {
        struct page *page;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
+                          hpage_order);
+#else
        page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
                           HPAGE_PMD_ORDER);
+#endif
        if (page)
                prep_transhuge_page(page);
        return page;
@@ -900,7 +1043,11 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
        struct page *hpage;
 
        do {
+#ifdef CONFIG_FINEGRAINED_THP
+               hpage = alloc_khugepaged_hugepage(HPAGE_PMD_ORDER);
+#else
                hpage = alloc_khugepaged_hugepage();
+#endif
                if (!hpage) {
                        count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                        if (!*wait)
@@ -938,6 +1085,21 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
        return true;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node, int hpage_type)
+{
+       struct page *page;
+
+       if (hpage_type == THP_TYPE_64KB)
+               page = alloc_khugepaged_hugepage(HPAGE_CONT_PTE_ORDER);
+       else {
+               VM_BUG_ON(!*hpage);
+               page = *hpage;
+       }
+       return page;
+}
+#else /* CONFIG_FINEGRAINED_THP */
 static struct page *
 khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
 {
@@ -945,6 +1107,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
 
        return  *hpage;
 }
+#endif /* CONFIG_FINEGRAINED_THP */
 #endif
 
 /*
@@ -954,8 +1117,13 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
  * value (scan code).
  */
 
+#ifdef CONFIG_FINEGRAINED_THP
+static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
+               struct vm_area_struct **vmap, int hpage_type)
+#else
 static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
                struct vm_area_struct **vmap)
+#endif
 {
        struct vm_area_struct *vma;
        unsigned long hstart, hend;
@@ -967,6 +1135,17 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
        if (!vma)
                return SCAN_VMA_NULL;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_64KB) {
+               hstart = (vma->vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK;
+               hend = vma->vm_end & HPAGE_CONT_PTE_MASK;
+               if (address < hstart || address + HPAGE_CONT_PTE_SIZE > hend)
+                       return SCAN_ADDRESS_RANGE;
+               if (!hugepage_vma_check(vma, vma->vm_flags))
+                       return SCAN_VMA_CHECK;
+               return 0;
+       }
+#endif /* CONFIG_FINEGRAINED_THP */
        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = vma->vm_end & HPAGE_PMD_MASK;
        if (address < hstart || address + HPAGE_PMD_SIZE > hend)
@@ -987,10 +1166,17 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
  * but with mmap_lock held to protect against vma changes.
  */
 
+#ifdef CONFIG_FINEGRAINED_THP
+static bool __collapse_huge_page_swapin(struct mm_struct *mm,
+                                       struct vm_area_struct *vma,
+                                       unsigned long address, pmd_t *pmd,
+                                       int referenced, int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static bool __collapse_huge_page_swapin(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmd,
                                        int referenced)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        int swapped_in = 0;
        vm_fault_t ret = 0;
@@ -1001,9 +1187,18 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
                .pmd = pmd,
                .pgoff = linear_page_index(vma, address),
        };
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_size = (hpage_type == THP_TYPE_64KB) ?
+                                               HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE;
+#endif
 
        vmf.pte = pte_offset_map(pmd, address);
-       for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+       for (;
+#ifdef CONFIG_FINEGRAINED_THP
+                       vmf.address < address + hpage_size;
+#else
+                       vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+#endif
                        vmf.pte++, vmf.address += PAGE_SIZE) {
                vmf.orig_pte = *vmf.pte;
                if (!is_swap_pte(vmf.orig_pte))
@@ -1014,7 +1209,12 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
                /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
                if (ret & VM_FAULT_RETRY) {
                        mmap_read_lock(mm);
-                       if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
+#ifdef CONFIG_FINEGRAINED_THP
+                       if (hugepage_vma_revalidate(mm, address, &vmf.vma, hpage_type))
+#else
+                       if (hugepage_vma_revalidate(mm, address, &vmf.vma))
+#endif
+                       {
                                /* vma is no longer available, don't continue to swapin */
                                trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
                                return false;
@@ -1043,10 +1243,18 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
        return true;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static void collapse_huge_page(struct mm_struct *mm,
+                                  unsigned long address,
+                                  struct page **hpage,
+                                  int node, int referenced, int unmapped,
+                                  int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static void collapse_huge_page(struct mm_struct *mm,
                                   unsigned long address,
                                   struct page **hpage,
                                   int node, int referenced, int unmapped)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        LIST_HEAD(compound_pagelist);
        pmd_t *pmd, _pmd;
@@ -1059,7 +1267,14 @@ static void collapse_huge_page(struct mm_struct *mm,
        struct mmu_notifier_range range;
        gfp_t gfp;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       pte_t _pte;
+
+       VM_BUG_ON(address & (hpage_type == THP_TYPE_64KB ?
+                               ~HPAGE_CONT_PTE_MASK : ~HPAGE_PMD_MASK));
+#else
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#endif
 
        /* Only allocate from the target node */
        gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
@@ -1071,7 +1286,11 @@ static void collapse_huge_page(struct mm_struct *mm,
         * that. We will recheck the vma after taking it again in write mode.
         */
        mmap_read_unlock(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+       new_page = khugepaged_alloc_page(hpage, gfp, node, hpage_type);
+#else
        new_page = khugepaged_alloc_page(hpage, gfp, node);
+#endif
        if (!new_page) {
                result = SCAN_ALLOC_HUGE_PAGE_FAIL;
                goto out_nolock;
@@ -1084,7 +1303,11 @@ static void collapse_huge_page(struct mm_struct *mm,
        count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
 
        mmap_read_lock(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+       result = hugepage_vma_revalidate(mm, address, &vma, hpage_type);
+#else
        result = hugepage_vma_revalidate(mm, address, &vma);
+#endif
        if (result) {
                mmap_read_unlock(mm);
                goto out_nolock;
@@ -1102,11 +1325,19 @@ static void collapse_huge_page(struct mm_struct *mm,
         * If it fails, we release mmap_lock and jump out_nolock.
         * Continuing to collapse causes inconsistency.
         */
+#ifdef CONFIG_FINEGRAINED_THP
+       if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
+                                                    pmd, referenced, hpage_type)) {
+               mmap_read_unlock(mm);
+               goto out_nolock;
+       }
+#else /* CONFIG_FINEGRAINED_THP */
        if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
                                                     pmd, referenced)) {
                mmap_read_unlock(mm);
                goto out_nolock;
        }
+#endif /* CONFIG_FINEGRAINED_THP*/
 
        mmap_read_unlock(mm);
        /*
@@ -1115,7 +1346,11 @@ static void collapse_huge_page(struct mm_struct *mm,
         * handled by the anon_vma lock + PG_lock.
         */
        mmap_write_lock(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+       result = hugepage_vma_revalidate(mm, address, &vma, hpage_type);
+#else
        result = hugepage_vma_revalidate(mm, address, &vma);
+#endif
        if (result)
                goto out;
        /* check if the pmd is still valid */
@@ -1124,8 +1359,14 @@ static void collapse_huge_page(struct mm_struct *mm,
 
        anon_vma_lock_write(vma->anon_vma);
 
+#ifdef CONFIG_FINEGRAINED_THP
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+                               address, address + (hpage_type == THP_TYPE_64KB ?
+                               HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE));
+#else
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
                                address, address + HPAGE_PMD_SIZE);
+#endif
        mmu_notifier_invalidate_range_start(&range);
 
        pte = pte_offset_map(pmd, address);
@@ -1138,16 +1379,38 @@ static void collapse_huge_page(struct mm_struct *mm,
         * huge and small TLB entries for the same virtual address
         * to avoid the risk of CPU bugs in that area.
         */
-       _pmd = pmdp_collapse_flush(vma, address, pmd);
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_64KB)
+               /* FIXME: clearing ptes here causes
+                * __collapse_huge_page_isolate and __collapse_huge_page_copy
+                * to fail; __collapse_huge_page_copy clears the ptes itself
+                */
+               flush_tlb_range(vma, address, address + HPAGE_CONT_PTE_SIZE);
+       else
+#endif /* CONFIG_FINEGRAINED_THP */
+               _pmd = pmdp_collapse_flush(vma, address, pmd);
        spin_unlock(pmd_ptl);
        mmu_notifier_invalidate_range_end(&range);
 
        spin_lock(pte_ptl);
+#ifdef CONFIG_FINEGRAINED_THP
+       isolated = __collapse_huge_page_isolate(vma, address, pte,
+                       &compound_pagelist, hpage_type);
+#else /* CONFIG_FINEGRAINED_THP */
        isolated = __collapse_huge_page_isolate(vma, address, pte,
                        &compound_pagelist);
+#endif /* CONFIG_FINEGRAINED_THP */
        spin_unlock(pte_ptl);
 
        if (unlikely(!isolated)) {
+#ifdef CONFIG_FINEGRAINED_THP
+               if (hpage_type == THP_TYPE_64KB) {
+                       pte_unmap(pte);
+                       anon_vma_unlock_write(vma->anon_vma);
+                       result = SCAN_FAIL;
+                       goto out;
+               }
+#endif /* CONFIG_FINEGRAINED_THP */
                pte_unmap(pte);
                spin_lock(pmd_ptl);
                BUG_ON(!pmd_none(*pmd));
@@ -1169,15 +1432,34 @@ static void collapse_huge_page(struct mm_struct *mm,
         */
        anon_vma_unlock_write(vma->anon_vma);
 
+#ifdef CONFIG_FINEGRAINED_THP
+       __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
+                       &compound_pagelist, hpage_type);
+#else /* CONFIG_FINEGRAINED_THP */
        __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
                        &compound_pagelist);
+#endif /* CONFIG_FINEGRAINED_THP */
        pte_unmap(pte);
        __SetPageUptodate(new_page);
+
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_64KB) {
+               /* 64KB hugepage */
+               _pte = arch_make_huge_pte(new_page, vma);
+               _pte = maybe_mkwrite(pte_mkdirty(_pte), vma);
+       } else {
+               /* 2MB hugepage */
+               pgtable = pmd_pgtable(_pmd);
+
+               _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+               _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+       }
+#else /* CONFIG_FINEGRAINED_THP */
        pgtable = pmd_pgtable(_pmd);
 
        _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
        _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
-
+#endif /* CONFIG_FINEGRAINED_THP */
        /*
         * spin_lock() below is not the equivalent of smp_wmb(), so
         * this is needed to avoid the copy_huge_page writes to become
@@ -1186,15 +1468,32 @@ static void collapse_huge_page(struct mm_struct *mm,
        smp_wmb();
 
        spin_lock(pmd_ptl);
-       BUG_ON(!pmd_none(*pmd));
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_2MB)
+#endif
+               BUG_ON(!pmd_none(*pmd));
        page_add_new_anon_rmap(new_page, vma, address, true);
        lru_cache_add_inactive_or_unevictable(new_page, vma);
+
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_64KB)
+               arch_set_huge_pte_at(mm, address, pte, _pte, 0);
+       else {
+               pgtable_trans_huge_deposit(mm, pmd, pgtable);
+               set_pmd_at(mm, address, pmd, _pmd);
+       }
+       update_mmu_cache_pmd(vma, address, pmd);
+#else /* CONFIG_FINEGRAINED_THP */
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
        update_mmu_cache_pmd(vma, address, pmd);
+#endif /* CONFIG_FINEGRAINED_THP */
        spin_unlock(pmd_ptl);
 
-       *hpage = NULL;
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_2MB)
+#endif
+               *hpage = NULL;
 
        khugepaged_pages_collapsed++;
        result = SCAN_SUCCEED;
@@ -1203,16 +1502,27 @@ out_up_write:
 out_nolock:
        if (!IS_ERR_OR_NULL(*hpage))
                mem_cgroup_uncharge(*hpage);
+#ifdef CONFIG_FINEGRAINED_THP
+       if (result != SCAN_SUCCEED && new_page && hpage_type == THP_TYPE_64KB)
+               put_page(new_page);
+#endif
        trace_mm_collapse_huge_page(mm, isolated, result);
        return;
 out:
        goto out_up_write;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+                              struct vm_area_struct *vma,
+                              unsigned long address,
+                              struct page **hpage, int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static int khugepaged_scan_pmd(struct mm_struct *mm,
                               struct vm_area_struct *vma,
                               unsigned long address,
                               struct page **hpage)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        pmd_t *pmd;
        pte_t *pte, *_pte;
@@ -1224,7 +1534,26 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
        int node = NUMA_NO_NODE, unmapped = 0;
        bool writable = false;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_nr;
+       int max_ptes_swap, max_ptes_none, max_ptes_shared;
+
+       if (hpage_type == THP_TYPE_64KB) {
+               VM_BUG_ON(address & ~HPAGE_CONT_PTE_MASK);
+               hpage_nr = HPAGE_CONT_PTE_NR;
+               max_ptes_swap = khugepaged_max_ptes_swap_64kb;
+               max_ptes_none = khugepaged_max_ptes_none_64kb;
+               max_ptes_shared = khugepaged_max_ptes_shared_64kb;
+       } else {
+               VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+               hpage_nr = HPAGE_PMD_NR;
+               max_ptes_swap = khugepaged_max_ptes_swap;
+               max_ptes_none = khugepaged_max_ptes_none;
+               max_ptes_shared = khugepaged_max_ptes_shared;
+       }
+#else /* CONFIG_FINEGRAINED_THP */
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#endif /* CONFIG_FINEGRAINED_THP */
 
        pmd = mm_find_pmd(mm, address);
        if (!pmd) {
@@ -1234,11 +1563,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 
        memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-       for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+       for (_address = address, _pte = pte;
+#ifdef CONFIG_FINEGRAINED_THP
+               _pte < pte + hpage_nr;
+#else
+               _pte < pte+HPAGE_PMD_NR;
+#endif
             _pte++, _address += PAGE_SIZE) {
                pte_t pteval = *_pte;
                if (is_swap_pte(pteval)) {
-                       if (++unmapped <= khugepaged_max_ptes_swap) {
+#ifdef CONFIG_FINEGRAINED_THP
+                       if (++unmapped <= max_ptes_swap)
+#else
+                       if (++unmapped <= khugepaged_max_ptes_swap)
+#endif
+                       {
                                /*
                                 * Always be strict with uffd-wp
                                 * enabled swap entries.  Please see
@@ -1256,7 +1595,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                }
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
                        if (!userfaultfd_armed(vma) &&
-                           ++none_or_zero <= khugepaged_max_ptes_none) {
+#ifdef CONFIG_FINEGRAINED_THP
+                           ++none_or_zero <= max_ptes_none
+#else
+                           ++none_or_zero <= khugepaged_max_ptes_none
+#endif
+                       )
+                       {
                                continue;
                        } else {
                                result = SCAN_EXCEED_NONE_PTE;
@@ -1289,8 +1634,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                        goto out_unmap;
                }
 
+#ifdef CONFIG_FINEGRAINED_THP
+               if (PageCompound(page) && PageTransHuge(compound_head(page))) {
+                       result = SCAN_PAGE_COMPOUND;
+                       goto out_unmap;
+               }
+
                if (page_mapcount(page) > 1 &&
-                               ++shared > khugepaged_max_ptes_shared) {
+                               ++shared > max_ptes_shared)
+#else
+               if (page_mapcount(page) > 1 &&
+                               ++shared > khugepaged_max_ptes_shared)
+#endif
+               {
                        result = SCAN_EXCEED_SHARED_PTE;
                        goto out_unmap;
                }
@@ -1361,8 +1717,13 @@ out_unmap:
        if (ret) {
                node = khugepaged_find_target_node();
                /* collapse_huge_page will return with the mmap_lock released */
+#ifdef CONFIG_FINEGRAINED_THP
+               collapse_huge_page(mm, address, hpage, node,
+                               referenced, unmapped, hpage_type);
+#else
                collapse_huge_page(mm, address, hpage, node,
                                referenced, unmapped);
+#endif
        }
 out:
        trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
@@ -1377,6 +1738,9 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
        lockdep_assert_held(&khugepaged_mm_lock);
 
        if (khugepaged_test_exit(mm)) {
+#ifdef CONFIG_FINEGRAINED_THP
+               clear_hint_list(mm_slot);
+#endif
                /* free mm_slot */
                hash_del(&mm_slot->hash);
                list_del(&mm_slot->mm_node);
@@ -1398,15 +1762,29 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
  * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
  * khugepaged should try to collapse the page table.
  */
+#ifdef CONFIG_FINEGRAINED_THP
+static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+                                        unsigned long addr, int hpage_type)
+#else
 static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
                                         unsigned long addr)
+#endif
 {
        struct mm_slot *mm_slot;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       VM_BUG_ON(addr & (hpage_type == THP_TYPE_64KB ?
+                                       ~HPAGE_CONT_PTE_MASK :~HPAGE_PMD_MASK));
+#else
        VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+#endif
 
        spin_lock(&khugepaged_mm_lock);
        mm_slot = get_mm_slot(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_64KB)
+               addr |= 0x01;
+#endif
        if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
                mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
        spin_unlock(&khugepaged_mm_lock);
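The addr |= 0x01 above piggybacks the hugepage type on the address stored in pte_mapped_thp[]: a 64KB (CONT_PTE) region is at least 64KB-aligned, so bit 0 is free to serve as a tag, and collapse_pte_mapped_thp() decodes it again in the next hunk. Below is a stand-alone sketch of that encode/decode, assuming 4KB base pages so the 64KB alignment mask is ~0xffffUL; the SKETCH_* names are illustrative only.

#include <stdio.h>

#define SKETCH_CONT_PTE_MASK    (~0xffffUL)     /* 64KB alignment, 4KB pages assumed */

enum { SKETCH_TYPE_64KB, SKETCH_TYPE_2MB };

/* Encode: aligned addresses have bit 0 clear, so it can carry the type. */
static unsigned long encode(unsigned long addr, int type)
{
        return type == SKETCH_TYPE_64KB ? (addr | 0x01) : addr;
}

/* Decode: recover the type from bit 0, then re-align the address. */
static void decode(unsigned long slot, int *type, unsigned long *haddr)
{
        *type = (slot & 0x01) ? SKETCH_TYPE_64KB : SKETCH_TYPE_2MB;
        *haddr = slot & SKETCH_CONT_PTE_MASK;
}

int main(void)
{
        int type;
        unsigned long haddr;

        decode(encode(0x7f0000, SKETCH_TYPE_64KB), &type, &haddr);
        printf("type=%d haddr=0x%lx\n", type, haddr);   /* type=0 haddr=0x7f0000 */
        return 0;
}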
@@ -1430,10 +1808,26 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
        spinlock_t *ptl;
        int count = 0;
        int i;
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_type = (addr & 0x01) ? THP_TYPE_64KB : THP_TYPE_2MB;
+       int hpage_nr = (hpage_type == THP_TYPE_64KB) ?
+                                                       HPAGE_CONT_PTE_NR : HPAGE_PMD_NR;
+       int hpage_size = (hpage_type == THP_TYPE_64KB) ?
+                                                       HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE;
+
+       if (hpage_type == THP_TYPE_64KB)
+               haddr = addr & HPAGE_CONT_PTE_MASK;
+#endif
 
+#ifdef CONFIG_FINEGRAINED_THP
+       if (!vma || !vma->vm_file ||
+           vma->vm_start > haddr || vma->vm_end < haddr + hpage_size)
+               return;
+#else /* CONFIG_FINEGRAINED_THP */
        if (!vma || !vma->vm_file ||
            vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
                return;
+#endif /* CONFIG_FINEGRAINED_THP */
 
        /*
         * This vm_flags may not have VM_HUGEPAGE if the page was not
@@ -1457,10 +1851,21 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
                goto drop_hpage;
 
        start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+#ifdef CONFIG_FINEGRAINED_THP
+       if (pte_cont(*start_pte)) {
+               pte_unmap_unlock(start_pte, ptl);
+               goto drop_hpage;
+       }
+#endif
 
        /* step 1: check all mapped PTEs are to the right huge page */
        for (i = 0, addr = haddr, pte = start_pte;
-            i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+#ifdef CONFIG_FINEGRAINED_THP
+            i < hpage_nr;
+#else
+            i < HPAGE_PMD_NR;
+#endif
+            i++, addr += PAGE_SIZE, pte++) {
                struct page *page;
 
                /* empty pte, skip */
@@ -1484,7 +1889,12 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 
        /* step 2: adjust rmap */
        for (i = 0, addr = haddr, pte = start_pte;
-            i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+#ifdef CONFIG_FINEGRAINED_THP
+               i < hpage_nr;
+#else
+           i < HPAGE_PMD_NR;
+#endif
+            i++, addr += PAGE_SIZE, pte++) {
                struct page *page;
 
                if (pte_none(*pte))
@@ -1503,10 +1913,23 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 
        /* step 4: collapse pmd */
        ptl = pmd_lock(vma->vm_mm, pmd);
+#ifdef CONFIG_FINEGRAINED_THP
+       if (hpage_type == THP_TYPE_64KB) {
+               pte_t *ptep = pte_offset_map(pmd, haddr);
+               arch_clear_huge_pte_range(vma->vm_mm, haddr, ptep);
+               spin_unlock(ptl);
+       } else {
+               _pmd = pmdp_collapse_flush(vma, haddr, pmd);
+               spin_unlock(ptl);
+               mm_dec_nr_ptes(mm);
+               pte_free(mm, pmd_pgtable(_pmd));
+       }
+#else /* CONFIG_FINEGRAINED_THP*/
        _pmd = pmdp_collapse_flush(vma, haddr, pmd);
        spin_unlock(ptl);
        mm_dec_nr_ptes(mm);
        pte_free(mm, pmd_pgtable(_pmd));
+#endif /* CONFIG_FINEGRAINED_THP */
 
 drop_hpage:
        unlock_page(hpage);
@@ -1541,12 +1964,22 @@ out:
        return 0;
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
+                                                       int hpage_type)
+#else
 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+#endif
 {
        struct vm_area_struct *vma;
        struct mm_struct *mm;
        unsigned long addr;
        pmd_t *pmd, _pmd;
+#ifdef CONFIG_FINEGRAINED_THP
+       pte_t *ptep;
+       int hpage_size = (hpage_type == THP_TYPE_64KB) ?
+                               HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE;
+#endif /* CONFIG_FINEGRAINED_THP */
 
        i_mmap_lock_write(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1569,6 +2002,45 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                if (vma->anon_vma)
                        continue;
                addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+#ifdef CONFIG_FINEGRAINED_THP
+               if (hpage_type == THP_TYPE_64KB && addr & ~HPAGE_CONT_PTE_MASK)
+                       continue;
+               else if (hpage_type == THP_TYPE_2MB && addr & ~HPAGE_PMD_MASK)
+                       continue;
+               if (vma->vm_end < addr + hpage_size)
+                       continue;
+
+               mm = vma->vm_mm;
+               pmd = mm_find_pmd(mm, addr);
+               if (!pmd)
+                       continue;
+               if (mmap_write_trylock(mm)) {
+                       spinlock_t *ptl = pmd_lock(mm, pmd);
+                       if (hpage_type == THP_TYPE_64KB) {
+                               /* 64KB hugepage */
+                               ptep = pte_offset_map(pmd, addr);
+                               /* pte maps are established on page fault handling */
+                               arch_clear_huge_pte_range(mm, addr, ptep);
+                               spin_unlock(ptl);
+                       } else {
+                               /* 2MB hugepage */
+                               /*
+                                * We need exclusive mmap_lock to retract page table.
+                                *
+                                * We use trylock due to lock inversion: we need to acquire
+                                * mmap_lock while holding page lock. Fault path does it in
+                                * reverse order. Trylock is a way to avoid deadlock.
+                                */
+                               _pmd = pmdp_collapse_flush(vma, addr, pmd);
+                               spin_unlock(ptl);
+
+                               mm_dec_nr_ptes(mm);
+                               pte_free(mm, pmd_pgtable(_pmd));
+                       }
+                       mmap_write_unlock(mm);
+               } else
+                       khugepaged_add_pte_mapped_thp(vma->vm_mm, addr, hpage_type);
+#else /* CONFIG_FINEGRAINED_THP */
                if (addr & ~HPAGE_PMD_MASK)
                        continue;
                if (vma->vm_end < addr + HPAGE_PMD_SIZE)
@@ -1598,6 +2070,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                        /* Try again later */
                        khugepaged_add_pte_mapped_thp(mm, addr);
                }
+#endif /* CONFIG_FINEGRAINED_THP */
        }
        i_mmap_unlock_write(mapping);
 }
@@ -1620,26 +2093,52 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
  *    + restore gaps in the page cache;
  *    + unlock and free huge page;
  */
+#ifdef CONFIG_FINEGRAINED_THP
+static void collapse_file(struct mm_struct *mm,
+               struct file *file, pgoff_t start,
+               struct page **hpage, int node, int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static void collapse_file(struct mm_struct *mm,
                struct file *file, pgoff_t start,
                struct page **hpage, int node)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        struct address_space *mapping = file->f_mapping;
        gfp_t gfp;
        struct page *new_page;
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_nr = (hpage_type == THP_TYPE_64KB ?
+                                       HPAGE_CONT_PTE_NR : HPAGE_PMD_NR);
+       int hpage_order = (hpage_type == THP_TYPE_64KB ?
+                                       HPAGE_CONT_PTE_ORDER : HPAGE_PMD_ORDER);
+       pgoff_t index, end = start + hpage_nr;
+#else /* CONFIG_FINEGRAINED_THP */
        pgoff_t index, end = start + HPAGE_PMD_NR;
+#endif /* CONFIG_FINEGRAINED_THP */
        LIST_HEAD(pagelist);
+#ifdef CONFIG_FINEGRAINED_THP
+       XA_STATE_ORDER(xas, &mapping->i_pages, start, hpage_order);
+#else
        XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
+#endif
        int nr_none = 0, result = SCAN_SUCCEED;
        bool is_shmem = shmem_file(file);
 
        VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
+#ifdef CONFIG_FINEGRAINED_THP
+       VM_BUG_ON(start & (hpage_nr - 1));
+#else
        VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
+#endif
 
        /* Only allocate from the target node */
        gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
 
+#ifdef CONFIG_FINEGRAINED_THP
+       new_page = khugepaged_alloc_page(hpage, gfp, node, hpage_type);
+#else
        new_page = khugepaged_alloc_page(hpage, gfp, node);
+#endif
        if (!new_page) {
                result = SCAN_ALLOC_HUGE_PAGE_FAIL;
                goto out;
@@ -1750,6 +2249,10 @@ static void collapse_file(struct mm_struct *mm,
                                filemap_flush(mapping);
                                result = SCAN_FAIL;
                                goto xa_unlocked;
+                       } else if (PageWriteback(page)) {
+                               xas_unlock_irq(&xas);
+                               result = SCAN_FAIL;
+                               goto xa_unlocked;
                        } else if (trylock_page(page)) {
                                get_page(page);
                                xas_unlock_irq(&xas);
@@ -1785,7 +2288,8 @@ static void collapse_file(struct mm_struct *mm,
                        goto out_unlock;
                }
 
-               if (!is_shmem && PageDirty(page)) {
+               if (!is_shmem && (PageDirty(page) ||
+                                 PageWriteback(page))) {
                        /*
                         * khugepaged only works on read-only fd, so this
                         * page is dirty because it hasn't been flushed
@@ -1845,9 +2349,23 @@ out_unlock:
        }
 
        if (is_shmem)
+#ifdef CONFIG_FINEGRAINED_THP
+               if (hpage_type == THP_TYPE_64KB)
+                       __inc_node_page_state(new_page, NR_SHMEM_64KB_THPS);
+               else
+                       __inc_node_page_state(new_page, NR_SHMEM_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
                __inc_node_page_state(new_page, NR_SHMEM_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
        else {
+#ifdef CONFIG_FINEGRAINED_THP
+               if (hpage_type == THP_TYPE_64KB)
+                       __inc_node_page_state(new_page, NR_FILE_64KB_THPS);
+               else
+                       __inc_node_page_state(new_page, NR_FILE_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
                __inc_node_page_state(new_page, NR_FILE_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
                filemap_nr_thps_inc(mapping);
        }
 
@@ -1863,6 +2381,9 @@ xa_unlocked:
 
        if (result == SCAN_SUCCEED) {
                struct page *page, *tmp;
+#ifdef CONFIG_FINEGRAINED_THP
+               int offset = 0;
+#endif
 
                /*
                 * Replacing old pages with new one has succeeded, now we
@@ -1870,12 +2391,28 @@ xa_unlocked:
                 */
                index = start;
                list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+#ifdef CONFIG_FINEGRAINED_THP
+                       if (hpage_type != THP_TYPE_64KB) {
+                               while (index < page->index) {
+                                       clear_highpage(new_page + (index % HPAGE_PMD_NR));
+                                       index++;
+                               }
+                       }
+
+                       if (hpage_type == THP_TYPE_64KB) {
+                               copy_highpage(new_page + offset, page);
+                               offset++;
+                       } else
+                               copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
+                                               page);
+#else /* CONFIG_FINEGRAINED_THP */
                        while (index < page->index) {
                                clear_highpage(new_page + (index % HPAGE_PMD_NR));
                                index++;
                        }
                        copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
                                        page);
+#endif /* CONFIG_FINEGRAINED_THP */
                        list_del(&page->lru);
                        page->mapping = NULL;
                        page_ref_unfreeze(page, 1);
@@ -1885,13 +2422,32 @@ xa_unlocked:
                        put_page(page);
                        index++;
                }
+#ifdef CONFIG_FINEGRAINED_THP
+               if (hpage_type == THP_TYPE_64KB) {
+                       while (index < end) {
+                               clear_highpage(new_page + offset);
+                               offset++;
+                               index++;
+                       }
+               } else {
+                       while (index < end) {
+                               clear_highpage(new_page + (index % HPAGE_PMD_NR));
+                               index++;
+                       }
+               }
+#else /* CONFIG_FINEGRAINED_THP */
                while (index < end) {
                        clear_highpage(new_page + (index % HPAGE_PMD_NR));
                        index++;
                }
+#endif /* CONFIG_FINEGRAINED_THP */
 
                SetPageUptodate(new_page);
+#ifdef CONFIG_FINEGRAINED_THP
+               page_ref_add(new_page, hpage_nr - 1);
+#else
                page_ref_add(new_page, HPAGE_PMD_NR - 1);
+#endif
                if (is_shmem)
                        set_page_dirty(new_page);
                lru_cache_add(new_page);
@@ -1899,9 +2455,14 @@ xa_unlocked:
                /*
                 * Remove pte page tables, so we can re-fault the page as huge.
                 */
+#ifdef CONFIG_FINEGRAINED_THP
+               retract_page_tables(mapping, start, hpage_type);
+               if (hpage_type == THP_TYPE_2MB)
+                       *hpage = NULL;
+#else /* CONFIG_FINEGRAINED_THP */
                retract_page_tables(mapping, start);
                *hpage = NULL;
-
+#endif /* CONFIG_FINEGRAINED_THP */
                khugepaged_pages_collapsed++;
        } else {
                struct page *page;
@@ -1946,14 +2507,24 @@ xa_unlocked:
 
        unlock_page(new_page);
 out:
+#ifdef CONFIG_FINEGRAINED_THP
+       if (result != SCAN_SUCCEED && new_page && hpage_type == THP_TYPE_64KB)
+               put_page(new_page);
+#endif
        VM_BUG_ON(!list_empty(&pagelist));
        if (!IS_ERR_OR_NULL(*hpage))
                mem_cgroup_uncharge(*hpage);
        /* TODO: tracepoints */
 }
 
+#ifdef CONFIG_FINEGRAINED_THP
+static void khugepaged_scan_file(struct mm_struct *mm,
+               struct file *file, pgoff_t start, struct page **hpage,
+               int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static void khugepaged_scan_file(struct mm_struct *mm,
                struct file *file, pgoff_t start, struct page **hpage)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        struct page *page = NULL;
        struct address_space *mapping = file->f_mapping;
@@ -1961,17 +2532,43 @@ static void khugepaged_scan_file(struct mm_struct *mm,
        int present, swap;
        int node = NUMA_NO_NODE;
        int result = SCAN_SUCCEED;
+#ifdef CONFIG_FINEGRAINED_THP
+       int hpage_nr;
+       int max_ptes_swap, max_ptes_none, max_ptes_shared;
+
+       if (hpage_type == THP_TYPE_64KB) {
+               hpage_nr = HPAGE_CONT_PTE_NR; /* 64KB */
+               max_ptes_swap = khugepaged_max_ptes_swap_64kb;
+               max_ptes_none = khugepaged_max_ptes_none_64kb;
+               max_ptes_shared = khugepaged_max_ptes_shared_64kb;
+       } else {
+               hpage_nr = HPAGE_PMD_NR; /* 2MB */
+               max_ptes_swap = khugepaged_max_ptes_swap;
+               max_ptes_none = khugepaged_max_ptes_none;
+               max_ptes_shared = khugepaged_max_ptes_shared;
+       }
+#endif /* CONFIG_FINEGRAINED_THP */
 
        present = 0;
        swap = 0;
        memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
        rcu_read_lock();
-       xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
+#ifdef CONFIG_FINEGRAINED_THP
+       xas_for_each(&xas, page, start + hpage_nr - 1)
+#else
+       xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1)
+#endif
+       {
                if (xas_retry(&xas, page))
                        continue;
 
                if (xa_is_value(page)) {
-                       if (++swap > khugepaged_max_ptes_swap) {
+#ifdef CONFIG_FINEGRAINED_THP
+                       if (++swap > max_ptes_swap)
+#else
+                       if (++swap > khugepaged_max_ptes_swap)
+#endif
+                       {
                                result = SCAN_EXCEED_SWAP_PTE;
                                break;
                        }
@@ -2017,19 +2614,34 @@ static void khugepaged_scan_file(struct mm_struct *mm,
        rcu_read_unlock();
 
        if (result == SCAN_SUCCEED) {
-               if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+#ifdef CONFIG_FINEGRAINED_THP
+               if (present < hpage_nr - max_ptes_none)
+#else
+               if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none)
+#endif
+               {
                        result = SCAN_EXCEED_NONE_PTE;
                } else {
                        node = khugepaged_find_target_node();
+#ifdef CONFIG_FINEGRAINED_THP
+                       collapse_file(mm, file, start, hpage, node, hpage_type);
+#else
                        collapse_file(mm, file, start, hpage, node);
+#endif
                }
        }
 
        /* TODO: tracepoints */
 }
 #else
+#ifdef CONFIG_FINEGRAINED_THP
+static void khugepaged_scan_file(struct mm_struct *mm,
+               struct file *file, pgoff_t start, struct page **hpage,
+               int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
 static void khugepaged_scan_file(struct mm_struct *mm,
                struct file *file, pgoff_t start, struct page **hpage)
+#endif /* CONFIG_FINEGRAINED_THP */
 {
        BUILD_BUG();
 }
@@ -2040,6 +2652,220 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
 }
 #endif
 
+#ifdef CONFIG_FINEGRAINED_THP
+/*
+ * If the return value is > 0, the vma can host a hugepage and the
+ * calculated hugepage start and end are stored through the pointers;
+ * otherwise the vma cannot host a hugepage.
+ */
+static inline int hugepage_determine_htype(unsigned long vm_start,
+               unsigned long vm_end, unsigned long *hstart, unsigned long *hend)
+{
+       unsigned long start, end;
+
+       /* determine 2MB hugepage */
+       start = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+       end = vm_end & HPAGE_PMD_MASK;
+       if (start >= end) {
+               /* determine 64KB hugepage */
+               start = (vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK;
+               end = vm_end & HPAGE_CONT_PTE_MASK;
+               if (start >= end)
+                       return THP_TYPE_FAIL;
+               *hstart = start;
+               *hend = end;
+               return THP_TYPE_64KB;
+       }
+       *hstart = start;
+       *hend = end;
+       return THP_TYPE_2MB;
+}
+
+enum {
+       KHUGEPAGE_SCAN_CONTINUE,
+       KHUGEPAGE_SCAN_BREAK,
+       KHUGEPAGE_SCAN_BREAK_MMAP_LOCK,
+};
+
+static unsigned int khugepaged_scan_vma(struct mm_struct *mm,
+                       struct vm_area_struct *vma, struct page **hpage,
+                       unsigned int pages, int *progress)
+{
+       unsigned long hstart, hend;
+       int hpage_type, ret;
+       int hpage_size, hpage_nr;
+
+       if (!hugepage_vma_check(vma, vma->vm_flags))
+               return KHUGEPAGE_SCAN_CONTINUE;
+
+       hpage_type = hugepage_determine_htype(
+                               (vma->vm_start > khugepaged_scan.address) ?
+                               vma->vm_start : khugepaged_scan.address,
+                               vma->vm_end, &hstart, &hend);
+
+       if (hpage_type == THP_TYPE_FAIL)
+               return KHUGEPAGE_SCAN_CONTINUE;
+       if (khugepaged_scan.address > hend)
+               return KHUGEPAGE_SCAN_CONTINUE;
+       if (khugepaged_scan.address < hstart)
+               khugepaged_scan.address = hstart;
+
+       if (hpage_type == THP_TYPE_64KB) {
+               VM_BUG_ON(khugepaged_scan.address & ~HPAGE_CONT_PTE_MASK);
+               hpage_size = HPAGE_CONT_PTE_SIZE; /* 64KB */
+               hpage_nr = HPAGE_CONT_PTE_NR;
+       } else if (hpage_type == THP_TYPE_2MB) {
+               VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+               hpage_size = HPAGE_PMD_SIZE; /* 2MB */
+               hpage_nr = HPAGE_PMD_NR;
+               if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+                   !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+                               HPAGE_PMD_NR)) {
+                       /* fallback, vma or file not aligned to 2MB */
+                       hpage_size = HPAGE_CONT_PTE_SIZE; /* 64KB */
+                       hpage_nr = HPAGE_CONT_PTE_NR;
+                       hpage_type = THP_TYPE_64KB;
+               }
+       } else
+               BUG();
+
+       while (khugepaged_scan.address < hend) {
+               if (khugepaged_scan.address + hpage_size > hend) {
+                       if (khugepaged_scan.address + HPAGE_CONT_PTE_SIZE < hend) {
+                               hpage_size = HPAGE_CONT_PTE_SIZE;
+                               hpage_nr = HPAGE_CONT_PTE_NR;
+                               hpage_type = THP_TYPE_64KB;
+                       }
+               }
+               ret = 0;
+               cond_resched();
+               if (unlikely(khugepaged_test_exit(mm)))
+                       return KHUGEPAGE_SCAN_BREAK;
+
+               VM_BUG_ON(khugepaged_scan.address < hstart ||
+                               khugepaged_scan.address + hpage_size >
+                               hend);
+               if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+                       struct file *file = get_file(vma->vm_file);
+                       pgoff_t pgoff = linear_page_index(vma,
+                                       khugepaged_scan.address);
+
+                       mmap_read_unlock(mm);
+                       ret = 1;
+                       khugepaged_scan_file(mm, file, pgoff, hpage, hpage_type);
+                       fput(file);
+               } else {
+                       ret = khugepaged_scan_pmd(mm, vma,
+                                       khugepaged_scan.address,
+                                       hpage, hpage_type);
+               }
+               /* move to next address */
+               khugepaged_scan.address += hpage_size;
+               *progress += hpage_nr;
+               if (ret)
+                       /* we released mmap_lock so break loop */
+                       return KHUGEPAGE_SCAN_BREAK_MMAP_LOCK;
+               if (*progress >= pages)
+                       return KHUGEPAGE_SCAN_BREAK;
+       }
+       return KHUGEPAGE_SCAN_CONTINUE;
+}
+
+static struct thp_scan_hint *find_scan_hint(struct mm_slot *slot,
+                                                               unsigned long addr)
+{
+       struct thp_scan_hint *hint;
+
+       list_for_each_entry(hint, &khugepaged_scan.hint_list, hint_list) {
+               if (hint->slot == slot)
+                       return hint;
+       }
+       return NULL;
+}
+
+#ifdef CONFIG_THP_CONSERVATIVE
+/* caller must hold a proper mmap_lock */
+void khugepaged_mem_hook(struct mm_struct *mm, unsigned long addr,
+               long diff, const char *debug)
+{
+       struct mm_slot *slot;
+       struct vm_area_struct *vma;
+       struct thp_scan_hint *hint;
+       bool wakeup = false;
+       bool retry = false;
+
+       vma = find_vma(mm, addr);
+       if (!hugepage_vma_check(vma, vma->vm_flags))
+               return;
+
+again:
+       spin_lock(&khugepaged_mm_lock);
+       slot = get_mm_slot(mm);
+       if (!slot) {
+               /* make a new slot or bail out */
+               spin_unlock(&khugepaged_mm_lock);
+               if (retry)
+                       return;
+               if (__khugepaged_enter(mm))
+                       return;
+               retry = true;
+               goto again;
+       }
+
+       hint = find_scan_hint(slot, addr);
+       if (!hint) {
+               spin_unlock(&khugepaged_mm_lock);
+               hint = kzalloc(sizeof(struct thp_scan_hint), GFP_KERNEL);
+               if (!hint)
+                       return;
+               hint->vma = vma;
+               hint->slot = slot;
+               hint->diff = 0;
+               hint->jiffies = jiffies;
+               spin_lock(&khugepaged_mm_lock);
+               list_add(&hint->hint_list, &khugepaged_scan.hint_list);
+               khugepaged_scan.nr_hint++;
+       }
+       hint->diff += diff;
+       if (hint->diff >= HPAGE_CONT_PTE_SIZE)
+               wakeup = true;
+       spin_unlock(&khugepaged_mm_lock);
+
+       /* if possible, wake khugepaged up for starting a scan */
+       if (wakeup)
+               wake_up_interruptible(&khugepaged_wait);
+}
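+/*
+ * Illustrative example (not an actual call site in this patch): code that
+ * just populated a page in an mm could report it as, e.g.,
+ *
+ *     khugepaged_mem_hook(vma->vm_mm, address, PAGE_SIZE, __func__);
+ *
+ * so that khugepaged prioritises rescanning that mm once enough change
+ * has accumulated.
+ */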
+#else /* CONFIG_THP_CONSERVATIVE */
+void khugepaged_mem_hook(struct mm_struct *mm,
+                       unsigned long addr, long diff, const char *debug)
+{}
+#endif /* CONFIG_THP_CONSERVATIVE */
+
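+/*
+ * Drop the pending hint, if any, that refers to @slot, typically right
+ * before the slot is scanned or torn down so stale hints do not linger.
+ */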
+static void clear_hint_list(struct mm_slot *slot)
+{
+       struct thp_scan_hint *hint;
+
+       hint = find_scan_hint(slot, 0);
+       if (hint) {
+               list_del(&hint->hint_list);
+               kfree(hint);
+               khugepaged_scan.nr_hint--;
+       }
+}
+
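+/*
+ * Detach and return the next hint from the global hint list, or NULL if
+ * the list is empty.  The caller owns (and must kfree) the returned hint.
+ */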
+static struct thp_scan_hint *get_next_hint(void)
+{
+       if (!list_empty(&khugepaged_scan.hint_list)) {
+               struct thp_scan_hint *hint = list_first_entry(
+                                       &khugepaged_scan.hint_list,
+                                       struct thp_scan_hint, hint_list);
+               list_del(&hint->hint_list);
+               khugepaged_scan.nr_hint--;
+               return hint;
+       }
+       return NULL;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                                            struct page **hpage)
        __releases(&khugepaged_mm_lock)
@@ -2053,6 +2879,38 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
        VM_BUG_ON(!pages);
        lockdep_assert_held(&khugepaged_mm_lock);
 
+#ifdef CONFIG_FINEGRAINED_THP
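+       /*
+        * Prefer an mm that recently produced a scan hint; fall back to the
+        * usual round-robin over mm_head when no hints are pending.
+        */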
+       if (khugepaged_scan.mm_slot)
+               mm_slot = khugepaged_scan.mm_slot;
+       else if (!list_empty(&khugepaged_scan.hint_list)) {
+               struct thp_scan_hint *hint;
+
+get_next_hint:
+               hint = get_next_hint();
+               if (!hint)
+                       goto get_next_slot;
+
+               mm_slot = hint->slot;
+               kfree(hint);
+               clear_hint_list(mm_slot);
+
+               if (khugepaged_test_exit(mm_slot->mm))
+                       goto get_next_hint;
+               khugepaged_scan.address = 0;
+               khugepaged_scan.mm_slot = mm_slot;
+       } else {
+get_next_slot:
+               mm_slot = list_entry(khugepaged_scan.mm_head.next,
+                                    struct mm_slot, mm_node);
+               clear_hint_list(mm_slot);
+               khugepaged_scan.address = 0;
+               khugepaged_scan.mm_slot = mm_slot;
+       }
+#else /* CONFIG_FINEGRAINED_THP */
        if (khugepaged_scan.mm_slot)
                mm_slot = khugepaged_scan.mm_slot;
        else {
@@ -2061,6 +2919,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                khugepaged_scan.address = 0;
                khugepaged_scan.mm_slot = mm_slot;
        }
+#endif /* CONFIG_FINEGRAINED_THP */
        spin_unlock(&khugepaged_mm_lock);
        khugepaged_collapse_pte_mapped_thps(mm_slot);
 
@@ -2077,13 +2936,28 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 
        progress++;
        for (; vma; vma = vma->vm_next) {
+#ifdef CONFIG_FINEGRAINED_THP
+               int ret;
+#else
                unsigned long hstart, hend;
+#endif
 
                cond_resched();
                if (unlikely(khugepaged_test_exit(mm))) {
                        progress++;
                        break;
                }
+#ifdef CONFIG_FINEGRAINED_THP
+               ret = khugepaged_scan_vma(mm, vma, hpage, pages, &progress);
+
+               if (ret == KHUGEPAGE_SCAN_CONTINUE) {
+                       progress++;
+                       continue;
+               } else if (ret == KHUGEPAGE_SCAN_BREAK)
+                       goto breakouterloop;
+               else if (ret == KHUGEPAGE_SCAN_BREAK_MMAP_LOCK)
+                       goto breakouterloop_mmap_lock;
+#else /* CONFIG_FINEGRAINED_THP */
                if (!hugepage_vma_check(vma, vma->vm_flags)) {
 skip:
                        progress++;
@@ -2133,6 +3007,7 @@ skip:
                        if (progress >= pages)
                                goto breakouterloop;
                }
+#endif /* CONFIG_FINEGRAINED_THP */
        }
 breakouterloop:
        mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
@@ -2150,6 +3025,53 @@ breakouterloop_mmap_lock:
                 * khugepaged runs here, khugepaged_exit will find
                 * mm_slot not pointing to the exiting mm.
                 */
+#ifdef CONFIG_FINEGRAINED_THP
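+               /*
+                * Pick the next mm to scan: prefer a hinted slot and move it
+                * right after the current one so round-robin order is kept.
+                */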
+               if (!list_empty(&khugepaged_scan.hint_list)) {
+                       struct thp_scan_hint *hint;
+                       struct mm_slot *next_slot;
+
+get_next_hint2:
+                       hint = get_next_hint();
+
+                       if (!hint) {
+                               /* no more hints */
+                               if (mm_slot->mm_node.next != &khugepaged_scan.mm_head)
+                                       goto get_next_slot2;
+                               else
+                                       goto end_loop;
+                       }
+
+                       next_slot = hint->slot;
+                       kfree(hint);
+
+                       if (next_slot == mm_slot)
+                               goto get_next_hint2;
+
+                       if (khugepaged_test_exit(next_slot->mm))
+                               goto get_next_hint2;
+
+                       list_move(&next_slot->mm_node, &mm_slot->mm_node);
+                       clear_hint_list(next_slot);
+
+                       khugepaged_scan.mm_slot = next_slot;
+                       khugepaged_scan.address = 0;
+               } else if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+get_next_slot2:
+                       khugepaged_scan.mm_slot = list_entry(
+                               mm_slot->mm_node.next,
+                               struct mm_slot, mm_node);
+                       clear_hint_list(khugepaged_scan.mm_slot);
+                       khugepaged_scan.address = 0;
+               } else {
+end_loop:
+                       khugepaged_scan.mm_slot = NULL;
+                       khugepaged_full_scans++;
+               }
+#else /* CONFIG_FINEGRAINED_THP */
                if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
                        khugepaged_scan.mm_slot = list_entry(
                                mm_slot->mm_node.next,
@@ -2159,7 +3081,7 @@ breakouterloop_mmap_lock:
                        khugepaged_scan.mm_slot = NULL;
                        khugepaged_full_scans++;
                }
-
+#endif /* CONFIG_FINEGRAINED_THP */
                collect_mm_slot(mm_slot);
        }
 
@@ -2240,6 +3162,9 @@ static void khugepaged_wait_work(void)
                wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
 }
 
+#include <linux/delay.h>
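+/*
+ * Global toggle referenced outside this file; presumably lets the fault
+ * path allocate hugepages eagerly.  Off by default.
+ */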
+bool eager_allocation = false;
+
 static int khugepaged(void *none)
 {
        struct mm_slot *mm_slot;