X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=mm%2Fkhugepaged.c;h=f88ad1b74ed6036b07b25007c2d134427cf9a2f4;hb=refs%2Fheads%2Faccepted%2Ftizen_7.0_unified;hp=4e3dff13eb70c5a85d6a91a3b2dd446fe1513b2c;hpb=9313f8026328d0309d093f6774be4b8f5340c0e5;p=platform%2Fkernel%2Flinux-rpi.git diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4e3dff1..f88ad1b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -21,6 +21,13 @@ #include #include +#ifdef CONFIG_FINEGRAINED_THP +#include +#include +#else +#include +#include +#endif #include "internal.h" enum scan_result { @@ -78,6 +85,32 @@ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; +#ifdef CONFIG_FINEGRAINED_THP +/* + * thp_scan_hint: + * it used for providing hints to khugepaged + * which address space is changed recently. + */ +struct thp_scan_hint { + struct mm_slot *slot; + struct vm_area_struct *vma; + unsigned long diff; /* memory difference */ + unsigned long jiffies; /* time stamp for profiling purpose */ + struct list_head hint_list; +}; + +/* THP type descriptor */ +enum { + THP_TYPE_FAIL, /* cannot make hugepage */ + THP_TYPE_64KB, /* 64KB hugepage can be made, use CONT_PTE */ + THP_TYPE_2MB, /* 2MB hugepage can be made, use PMD */ +}; + +static unsigned int khugepaged_max_ptes_none_64kb __read_mostly; +static unsigned int khugepaged_max_ptes_swap_64kb __read_mostly; +static unsigned int khugepaged_max_ptes_shared_64kb __read_mostly; +#endif /* CONFIG_FINEGRAINED_THP */ + #define MM_SLOTS_HASH_BITS 10 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -113,10 +146,18 @@ struct khugepaged_scan { struct list_head mm_head; struct mm_slot *mm_slot; unsigned long address; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_type; + int nr_hint; + struct list_head hint_list; +#endif /* CONFIG_FINEGRAINED_THP */ }; static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), +#ifdef CONFIG_FINEGRAINED_THP + .hint_list = LIST_HEAD_INIT(khugepaged_scan.hint_list), +#endif }; #ifdef CONFIG_SYSFS @@ -394,6 +435,11 @@ int __init khugepaged_init(void) khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; +#ifdef CONFIG_FINEGRAINED_THP + khugepaged_max_ptes_none_64kb = HPAGE_CONT_PTE_NR - 1; + khugepaged_max_ptes_swap_64kb = HPAGE_CONT_PTE_NR / 8; + khugepaged_max_ptes_shared_64kb = HPAGE_CONT_PTE_NR / 2; +#endif return 0; } @@ -437,21 +483,42 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +#ifdef CONFIG_FINEGRAINED_THP +static void clear_hint_list(struct mm_slot *slot); +#endif /* CONFIG_FINEGRAINED_THP */ + static bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags) { - if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + if (!transhuge_vma_enabled(vma, vm_flags)) + return false; + + if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - + vma->vm_pgoff, HPAGE_PMD_NR)) return false; - if (shmem_file(vma->vm_file) || - (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && - vma->vm_file && - (vm_flags & VM_DENYWRITE))) { - return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, - HPAGE_PMD_NR); + /* Check arch-dependent shmem hugepage available */ + if (arch_hugepage_vma_shmem_check(vma, vm_flags)) + return true; + /* Enabled via shmem 
mount options or sysfs settings. */ + if (shmem_file(vma->vm_file)) + return shmem_huge_enabled(vma); + + /* THP settings require madvise. */ + if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + return false; + + /* Check arch-dependent file hugepage available */ + if (arch_hugepage_vma_file_check(vma, vm_flags)) + return true; + /* Only regular file is valid */ + else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && + (vm_flags & VM_DENYWRITE)) { + struct inode *inode = vma->vm_file->f_inode; + + return S_ISREG(inode->i_mode); } + if (!vma->anon_vma || vma->vm_ops) return false; if (vma_is_temporary_stack(vma)) @@ -509,6 +576,12 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) return khugepaged_enter(vma, vm_flags); +#ifdef CONFIG_FINEGRAINED_THP + hstart = (vma->vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK; + hend = vma->vm_end & HPAGE_CONT_PTE_MASK; + if (hstart < hend) + return khugepaged_enter(vma, vm_flags); +#endif /* CONFIG_FINEGRAINED_THP */ return 0; } @@ -520,6 +593,9 @@ void __khugepaged_exit(struct mm_struct *mm) spin_lock(&khugepaged_mm_lock); mm_slot = get_mm_slot(mm); if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { +#ifdef CONFIG_FINEGRAINED_THP + clear_hint_list(mm_slot); +#endif hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); free = 1; @@ -584,23 +660,56 @@ static bool is_refcount_suitable(struct page *page) return page_count(page) == expected_refcount; } +#ifdef CONFIG_FINEGRAINED_THP +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte, + struct list_head *compound_pagelist, + int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, struct list_head *compound_pagelist) +#endif /* CONFIG_FINEGRAINED_THP */ { struct page *page = NULL; pte_t *_pte; int none_or_zero = 0, shared = 0, result = 0, referenced = 0; bool writable = false; +#ifdef CONFIG_FINEGRAINED_THP + int max_ptes_shared, max_ptes_none; + int hpage_nr; + + if (hpage_type == THP_TYPE_64KB) { + hpage_nr = HPAGE_CONT_PTE_NR; + max_ptes_shared = khugepaged_max_ptes_shared_64kb; + max_ptes_none = khugepaged_max_ptes_none_64kb; + } else { + hpage_nr = HPAGE_PMD_NR; + max_ptes_shared = khugepaged_max_ptes_shared; + max_ptes_none = khugepaged_max_ptes_none; + } +#endif /* CONFIG_FINEGRAINED_THP */ - for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + for (_pte = pte; +#ifdef CONFIG_FINEGRAINED_THP + _pte < pte + hpage_nr; +#else + _pte < pte+HPAGE_PMD_NR; +#endif _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { +#ifdef CONFIG_FINEGRAINED_THP if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + ++none_or_zero <= max_ptes_none) +#else /* CONFIG_FINEGRAINED_THP */ + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) +#endif /* CONFIG_FINEGRAINED_THP */ + { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -619,8 +728,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); +#ifdef CONFIG_FINEGRAINED_THP + if (page_mapcount(page) > 1 && + ++shared > max_ptes_shared) +#else /* CONFIG_FINEGRAINED_THP */ if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { + ++shared > khugepaged_max_ptes_shared) +#endif /* CONFIG_FINEGRAINED_THP */ + { result = 
SCAN_EXCEED_SHARED_PTE; goto out; } @@ -704,17 +819,17 @@ next: if (pte_write(pteval)) writable = true; } - if (likely(writable)) { - if (likely(referenced)) { - result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(page, none_or_zero, - referenced, writable, result); - return 1; - } - } else { + + if (unlikely(!writable)) { result = SCAN_PAGE_RO; + } else if (unlikely(!referenced)) { + result = SCAN_LACK_REFERENCED_PAGE; + } else { + result = SCAN_SUCCEED; + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); + return 1; } - out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, @@ -722,15 +837,34 @@ out: return 0; } +#ifdef CONFIG_FINEGRAINED_THP +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl, + struct list_head *compound_pagelist, + int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) +#endif /* CONFIG_FINEGRAINED_THP */ { struct page *src_page, *tmp; pte_t *_pte; - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_nr = (hpage_type == THP_TYPE_64KB ? + HPAGE_CONT_PTE_NR : HPAGE_PMD_NR); +#endif + + for (_pte = pte; +#ifdef CONFIG_FINEGRAINED_THP + _pte < pte + hpage_nr; +#else + _pte < pte + HPAGE_PMD_NR; +#endif _pte++, page++, address += PAGE_SIZE) { pte_t pteval = *_pte; @@ -884,12 +1018,21 @@ static int khugepaged_find_target_node(void) return 0; } +#ifdef CONFIG_FINEGRAINED_THP +static inline struct page *alloc_khugepaged_hugepage(int hpage_order) +#else static inline struct page *alloc_khugepaged_hugepage(void) +#endif { struct page *page; +#ifdef CONFIG_FINEGRAINED_THP + page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), + hpage_order); +#else page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), HPAGE_PMD_ORDER); +#endif if (page) prep_transhuge_page(page); return page; @@ -900,7 +1043,11 @@ static struct page *khugepaged_alloc_hugepage(bool *wait) struct page *hpage; do { +#ifdef CONFIG_FINEGRAINED_THP + hpage = alloc_khugepaged_hugepage(HPAGE_PMD_ORDER); +#else hpage = alloc_khugepaged_hugepage(); +#endif if (!hpage) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); if (!*wait) @@ -938,6 +1085,21 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) return true; } +#ifdef CONFIG_FINEGRAINED_THP +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node, int hpage_type) +{ + struct page *page; + + if (hpage_type == THP_TYPE_64KB) + page = alloc_khugepaged_hugepage(HPAGE_CONT_PTE_ORDER); + else { + VM_BUG_ON(!*hpage); + page = *hpage; + } + return page; +} +#else /* CONFIG_FINEGRAINED_THP */ static struct page * khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) { @@ -945,6 +1107,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) return *hpage; } +#endif /* CONFIG_FINEGRAINED_THP */ #endif /* @@ -954,8 +1117,13 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) * value (scan code). 
*/ +#ifdef CONFIG_FINEGRAINED_THP +static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, + struct vm_area_struct **vmap, int hpage_type) +#else static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct vm_area_struct **vmap) +#endif { struct vm_area_struct *vma; unsigned long hstart, hend; @@ -967,6 +1135,17 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!vma) return SCAN_VMA_NULL; +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) { + hstart = (vma->vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK; + hend = vma->vm_end & HPAGE_CONT_PTE_MASK; + if (address < hstart || address + HPAGE_CONT_PTE_SIZE > hend) + return SCAN_ADDRESS_RANGE; + if (!hugepage_vma_check(vma, vma->vm_flags)) + return SCAN_VMA_CHECK; + return 0; + } +#endif /* CONFIG_FINEGRAINED_THP */ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (address < hstart || address + HPAGE_PMD_SIZE > hend) @@ -987,10 +1166,17 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, * but with mmap_lock held to protect against vma changes. */ +#ifdef CONFIG_FINEGRAINED_THP +static bool __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + int referenced, int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static bool __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, int referenced) +#endif /* CONFIG_FINEGRAINED_THP */ { int swapped_in = 0; vm_fault_t ret = 0; @@ -1001,9 +1187,18 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, .pmd = pmd, .pgoff = linear_page_index(vma, address), }; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_size = (hpage_type == THP_TYPE_64KB) ? 
+ HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE; +#endif vmf.pte = pte_offset_map(pmd, address); - for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; + for (; +#ifdef CONFIG_FINEGRAINED_THP + vmf.address < address + hpage_size; +#else + vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; +#endif vmf.pte++, vmf.address += PAGE_SIZE) { vmf.orig_pte = *vmf.pte; if (!is_swap_pte(vmf.orig_pte)) @@ -1014,7 +1209,12 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */ if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); - if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { +#ifdef CONFIG_FINEGRAINED_THP + if (hugepage_vma_revalidate(mm, address, &vmf.vma, hpage_type)) +#else + if (hugepage_vma_revalidate(mm, address, &vmf.vma)) +#endif + { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; @@ -1043,10 +1243,18 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, return true; } +#ifdef CONFIG_FINEGRAINED_THP +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + int node, int referenced, int unmapped, + int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, int node, int referenced, int unmapped) +#endif /* CONFIG_FINEGRAINED_THP */ { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; @@ -1059,7 +1267,14 @@ static void collapse_huge_page(struct mm_struct *mm, struct mmu_notifier_range range; gfp_t gfp; +#ifdef CONFIG_FINEGRAINED_THP + pte_t _pte; + + VM_BUG_ON(address & (hpage_type == THP_TYPE_64KB ? + ~HPAGE_CONT_PTE_MASK : ~HPAGE_PMD_MASK)); +#else VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#endif /* Only allocate from the target node */ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; @@ -1071,7 +1286,11 @@ static void collapse_huge_page(struct mm_struct *mm, * that. We will recheck the vma after taking it again in write mode. */ mmap_read_unlock(mm); +#ifdef CONFIG_FINEGRAINED_THP + new_page = khugepaged_alloc_page(hpage, gfp, node, hpage_type); +#else new_page = khugepaged_alloc_page(hpage, gfp, node); +#endif if (!new_page) { result = SCAN_ALLOC_HUGE_PAGE_FAIL; goto out_nolock; @@ -1084,7 +1303,11 @@ static void collapse_huge_page(struct mm_struct *mm, count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); mmap_read_lock(mm); +#ifdef CONFIG_FINEGRAINED_THP + result = hugepage_vma_revalidate(mm, address, &vma, hpage_type); +#else result = hugepage_vma_revalidate(mm, address, &vma); +#endif if (result) { mmap_read_unlock(mm); goto out_nolock; @@ -1102,11 +1325,19 @@ static void collapse_huge_page(struct mm_struct *mm, * If it fails, we release mmap_lock and jump out_nolock. * Continuing to collapse causes inconsistency. */ +#ifdef CONFIG_FINEGRAINED_THP + if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, + pmd, referenced, hpage_type)) { + mmap_read_unlock(mm); + goto out_nolock; + } +#else /* CONFIG_FINEGRAINED_THP */ if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { mmap_read_unlock(mm); goto out_nolock; } +#endif /* CONFIG_FINEGRAINED_THP*/ mmap_read_unlock(mm); /* @@ -1115,7 +1346,11 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. 
*/ mmap_write_lock(mm); +#ifdef CONFIG_FINEGRAINED_THP + result = hugepage_vma_revalidate(mm, address, &vma, hpage_type); +#else result = hugepage_vma_revalidate(mm, address, &vma); +#endif if (result) goto out; /* check if the pmd is still valid */ @@ -1124,8 +1359,14 @@ static void collapse_huge_page(struct mm_struct *mm, anon_vma_lock_write(vma->anon_vma); +#ifdef CONFIG_FINEGRAINED_THP + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, + address, address + (hpage_type == THP_TYPE_64KB ? + HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE)); +#else mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, address, address + HPAGE_PMD_SIZE); +#endif mmu_notifier_invalidate_range_start(&range); pte = pte_offset_map(pmd, address); @@ -1138,16 +1379,38 @@ static void collapse_huge_page(struct mm_struct *mm, * huge and small TLB entries for the same virtual address * to avoid the risk of CPU bugs in that area. */ - _pmd = pmdp_collapse_flush(vma, address, pmd); +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) + /* FIXME: clearing ptes here causes + * __collapse_huge_page_isolate and __collapse_huge_page_copy + * to fail, __collapse_huge_page_copy also clears ptes + */ + flush_tlb_range(vma, address, address + HPAGE_CONT_PTE_SIZE); + else +#endif /* CONFIG_FINEGRAINED_THP */ + _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); +#ifdef CONFIG_FINEGRAINED_THP + isolated = __collapse_huge_page_isolate(vma, address, pte, + &compound_pagelist, hpage_type); +#else /* CONFIG_FINEGRAINED_THP */ isolated = __collapse_huge_page_isolate(vma, address, pte, &compound_pagelist); +#endif /* CONFIG_FINEGRAINED_THP */ spin_unlock(pte_ptl); if (unlikely(!isolated)) { +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) { + pte_unmap(pte); + anon_vma_unlock_write(vma->anon_vma); + result = SCAN_FAIL; + goto out; + } +#endif /* CONFIG_FINEGRAINED_THP */ pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); @@ -1169,15 +1432,34 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); +#ifdef CONFIG_FINEGRAINED_THP + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, + &compound_pagelist, hpage_type); +#else /* CONFIG_FINEGRAINED_THP */ __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, &compound_pagelist); +#endif /* CONFIG_FINEGRAINED_THP */ pte_unmap(pte); __SetPageUptodate(new_page); + +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) { + /* 64KB hugepage */ + _pte = arch_make_huge_pte(new_page, vma); + _pte = maybe_mkwrite(pte_mkdirty(_pte), vma); + } else { + /* 2MB hugepage */ + pgtable = pmd_pgtable(_pmd); + + _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + } +#else /* CONFIG_FINEGRAINED_THP */ pgtable = pmd_pgtable(_pmd); _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - +#endif /* CONFIG_FINEGRAINED_THP */ /* * spin_lock() below is not the equivalent of smp_wmb(), so * this is needed to avoid the copy_huge_page writes to become @@ -1186,15 +1468,32 @@ static void collapse_huge_page(struct mm_struct *mm, smp_wmb(); spin_lock(pmd_ptl); - BUG_ON(!pmd_none(*pmd)); +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_2MB) +#endif + BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); lru_cache_add_inactive_or_unevictable(new_page, vma); + +#ifdef CONFIG_FINEGRAINED_THP + 
if (hpage_type == THP_TYPE_64KB) + arch_set_huge_pte_at(mm, address, pte, _pte, 0); + else { + pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, address, pmd, _pmd); + } + update_mmu_cache_pmd(vma, address, pmd); +#else /* CONFIG_FINEGRAINED_THP */ pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); +#endif /* CONFIG_FINEGRAINED_THP */ spin_unlock(pmd_ptl); - *hpage = NULL; +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_2MB) +#endif + *hpage = NULL; khugepaged_pages_collapsed++; result = SCAN_SUCCEED; @@ -1203,16 +1502,27 @@ out_up_write: out_nolock: if (!IS_ERR_OR_NULL(*hpage)) mem_cgroup_uncharge(*hpage); +#ifdef CONFIG_FINEGRAINED_THP + if (result != SCAN_SUCCEED && new_page && hpage_type == THP_TYPE_64KB) + put_page(new_page); +#endif trace_mm_collapse_huge_page(mm, isolated, result); return; out: goto out_up_write; } +#ifdef CONFIG_FINEGRAINED_THP +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage, int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, struct page **hpage) +#endif /* CONFIG_FINEGRAINED_THP */ { pmd_t *pmd; pte_t *pte, *_pte; @@ -1224,7 +1534,26 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, int node = NUMA_NO_NODE, unmapped = 0; bool writable = false; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_nr; + int max_ptes_swap, max_ptes_none, max_ptes_shared; + + if (hpage_type == THP_TYPE_64KB) { + VM_BUG_ON(address & ~HPAGE_CONT_PTE_MASK); + hpage_nr = HPAGE_CONT_PTE_NR; + max_ptes_swap = khugepaged_max_ptes_swap_64kb; + max_ptes_none = khugepaged_max_ptes_none_64kb; + max_ptes_shared = khugepaged_max_ptes_shared_64kb; + } else { + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + hpage_nr = HPAGE_PMD_NR; + max_ptes_swap = khugepaged_max_ptes_swap; + max_ptes_none = khugepaged_max_ptes_none; + max_ptes_shared = khugepaged_max_ptes_shared; + } +#else /* CONFIG_FINEGRAINED_THP */ VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#endif /* CONFIG_FINEGRAINED_THP */ pmd = mm_find_pmd(mm, address); if (!pmd) { @@ -1234,11 +1563,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); - for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + for (_address = address, _pte = pte; +#ifdef CONFIG_FINEGRAINED_THP + _pte < pte + hpage_nr; +#else + _pte < pte+HPAGE_PMD_NR; +#endif _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (is_swap_pte(pteval)) { - if (++unmapped <= khugepaged_max_ptes_swap) { +#ifdef CONFIG_FINEGRAINED_THP + if (++unmapped <= max_ptes_swap) +#else + if (++unmapped <= khugepaged_max_ptes_swap) +#endif + { /* * Always be strict with uffd-wp * enabled swap entries. 
Please see @@ -1256,7 +1595,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { +#ifdef CONFIG_FINEGRAINED_THP + ++none_or_zero <= max_ptes_none +#else + ++none_or_zero <= khugepaged_max_ptes_none +#endif + ) + { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -1289,8 +1634,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } +#ifdef CONFIG_FINEGRAINED_THP + if (PageCompound(page) && PageTransHuge(compound_head(page))) { + result = SCAN_PAGE_COMPOUND; + goto out_unmap; + } + if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { + ++shared > max_ptes_shared) +#else + if (page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) +#endif + { result = SCAN_EXCEED_SHARED_PTE; goto out_unmap; } @@ -1361,8 +1717,13 @@ out_unmap: if (ret) { node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_lock released */ +#ifdef CONFIG_FINEGRAINED_THP + collapse_huge_page(mm, address, hpage, node, + referenced, unmapped, hpage_type); +#else collapse_huge_page(mm, address, hpage, node, referenced, unmapped); +#endif } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, @@ -1377,6 +1738,9 @@ static void collect_mm_slot(struct mm_slot *mm_slot) lockdep_assert_held(&khugepaged_mm_lock); if (khugepaged_test_exit(mm)) { +#ifdef CONFIG_FINEGRAINED_THP + clear_hint_list(mm_slot); +#endif /* free mm_slot */ hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); @@ -1398,15 +1762,29 @@ static void collect_mm_slot(struct mm_slot *mm_slot) * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. */ +#ifdef CONFIG_FINEGRAINED_THP +static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, int hpage_type) +#else static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) +#endif { struct mm_slot *mm_slot; +#ifdef CONFIG_FINEGRAINED_THP + VM_BUG_ON(addr & (hpage_type == THP_TYPE_64KB ? + ~HPAGE_CONT_PTE_MASK :~HPAGE_PMD_MASK)); +#else VM_BUG_ON(addr & ~HPAGE_PMD_MASK); +#endif spin_lock(&khugepaged_mm_lock); mm_slot = get_mm_slot(mm); +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) + addr |= 0x01; +#endif if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; spin_unlock(&khugepaged_mm_lock); @@ -1430,10 +1808,26 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) spinlock_t *ptl; int count = 0; int i; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_type = (addr & 0x01) ? THP_TYPE_64KB : THP_TYPE_2MB; + int hpage_nr = (hpage_type == THP_TYPE_64KB) ? + HPAGE_CONT_PTE_NR : HPAGE_PMD_NR; + int hpage_size = (hpage_type == THP_TYPE_64KB) ? 
+ HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE; + + if (hpage_type == THP_TYPE_64KB) + haddr = addr & HPAGE_CONT_PTE_MASK; +#endif +#ifdef CONFIG_FINEGRAINED_THP + if (!vma || !vma->vm_file || + vma->vm_start > haddr || vma->vm_end < haddr + hpage_size) + return; +#else /* CONFIG_FINEGRAINED_THP */ if (!vma || !vma->vm_file || vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE) return; +#endif /* CONFIG_FINEGRAINED_THP */ /* * This vm_flags may not have VM_HUGEPAGE if the page was not @@ -1457,10 +1851,21 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) goto drop_hpage; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); +#ifdef CONFIG_FINEGRAINED_THP + if (pte_cont(*start_pte)) { + pte_unmap_unlock(start_pte, ptl); + goto drop_hpage; + } +#endif /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; - i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { +#ifdef CONFIG_FINEGRAINED_THP + i < hpage_nr; +#else + i < HPAGE_PMD_NR; +#endif + i++, addr += PAGE_SIZE, pte++) { struct page *page; /* empty pte, skip */ @@ -1484,7 +1889,12 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) /* step 2: adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; - i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { +#ifdef CONFIG_FINEGRAINED_THP + i < hpage_nr; +#else + i < HPAGE_PMD_NR; +#endif + i++, addr += PAGE_SIZE, pte++) { struct page *page; if (pte_none(*pte)) @@ -1503,10 +1913,23 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) /* step 4: collapse pmd */ ptl = pmd_lock(vma->vm_mm, pmd); +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) { + pte_t *ptep = pte_offset_map(pmd, haddr); + arch_clear_huge_pte_range(vma->vm_mm, haddr, ptep); + spin_unlock(ptl); + } else { + _pmd = pmdp_collapse_flush(vma, haddr, pmd); + spin_unlock(ptl); + mm_dec_nr_ptes(mm); + pte_free(mm, pmd_pgtable(_pmd)); + } +#else /* CONFIG_FINEGRAINED_THP*/ _pmd = pmdp_collapse_flush(vma, haddr, pmd); spin_unlock(ptl); mm_dec_nr_ptes(mm); pte_free(mm, pmd_pgtable(_pmd)); +#endif /* CONFIG_FINEGRAINED_THP */ drop_hpage: unlock_page(hpage); @@ -1541,12 +1964,22 @@ out: return 0; } +#ifdef CONFIG_FINEGRAINED_THP +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff, + int hpage_type) +#else static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) +#endif { struct vm_area_struct *vma; struct mm_struct *mm; unsigned long addr; pmd_t *pmd, _pmd; +#ifdef CONFIG_FINEGRAINED_THP + pte_t *ptep; + int hpage_size = (hpage_type == THP_TYPE_64KB) ? 
+ HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE; +#endif /* CONFIG_FINEGRAINED_THP */ i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1569,6 +2002,45 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (vma->anon_vma) continue; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB && addr & ~HPAGE_CONT_PTE_MASK) + continue; + else if (hpage_type == THP_TYPE_2MB && addr & ~HPAGE_PMD_MASK) + continue; + if (vma->vm_end < addr + hpage_size) + continue; + + mm = vma->vm_mm; + pmd = mm_find_pmd(mm, addr); + if (!pmd) + continue; + if (mmap_write_trylock(mm)) { + spinlock_t *ptl = pmd_lock(mm, pmd); + if (hpage_type == THP_TYPE_64KB) { + /* 64KB hugepage */ + ptep = pte_offset_map(pmd, addr); + /* pte maps are established on page fault handling */ + arch_clear_huge_pte_range(mm, addr, ptep); + spin_unlock(ptl); + } else { + /* 2MB hugepage */ + /* + * We need exclusive mmap_sem to retract page table. + * + * We use trylock due to lock inversion: we need to acquire + * mmap_sem while holding page lock. Fault path does it in + * reverse order. Trylock is a way to avoid deadlock. + */ + _pmd = pmdp_collapse_flush(vma, addr, pmd); + spin_unlock(ptl); + + mm_dec_nr_ptes(mm); + pte_free(mm, pmd_pgtable(_pmd)); + } + mmap_write_unlock(mm); + } else + khugepaged_add_pte_mapped_thp(vma->vm_mm, addr, hpage_type); +#else /* CONFIG_FINEGRAINED_THP */ if (addr & ~HPAGE_PMD_MASK) continue; if (vma->vm_end < addr + HPAGE_PMD_SIZE) @@ -1598,6 +2070,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) /* Try again later */ khugepaged_add_pte_mapped_thp(mm, addr); } +#endif /* CONFIG_FINEGRAINED_THP */ } i_mmap_unlock_write(mapping); } @@ -1620,26 +2093,52 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ +#ifdef CONFIG_FINEGRAINED_THP +static void collapse_file(struct mm_struct *mm, + struct file *file, pgoff_t start, + struct page **hpage, int node, int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static void collapse_file(struct mm_struct *mm, struct file *file, pgoff_t start, struct page **hpage, int node) +#endif /* CONFIG_FINEGRAINED_THP */ { struct address_space *mapping = file->f_mapping; gfp_t gfp; struct page *new_page; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_nr = (hpage_type == THP_TYPE_64KB ? + HPAGE_CONT_PTE_NR : HPAGE_PMD_NR); + int hpage_order = (hpage_type == THP_TYPE_64KB ? 
+ HPAGE_CONT_PTE_ORDER : HPAGE_PMD_ORDER); + pgoff_t index, end = start + hpage_nr; +#else /* CONFIG_FINEGRAINED_THP */ pgoff_t index, end = start + HPAGE_PMD_NR; +#endif /* CONFIG_FINEGRAINED_THP */ LIST_HEAD(pagelist); +#ifdef CONFIG_FINEGRAINED_THP + XA_STATE_ORDER(xas, &mapping->i_pages, start, hpage_order); +#else XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); +#endif int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); +#ifdef CONFIG_FINEGRAINED_THP + VM_BUG_ON(start & (hpage_nr - 1)); +#else VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); +#endif /* Only allocate from the target node */ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; +#ifdef CONFIG_FINEGRAINED_THP + new_page = khugepaged_alloc_page(hpage, gfp, node, hpage_type); +#else new_page = khugepaged_alloc_page(hpage, gfp, node); +#endif if (!new_page) { result = SCAN_ALLOC_HUGE_PAGE_FAIL; goto out; @@ -1750,6 +2249,10 @@ static void collapse_file(struct mm_struct *mm, filemap_flush(mapping); result = SCAN_FAIL; goto xa_unlocked; + } else if (PageWriteback(page)) { + xas_unlock_irq(&xas); + result = SCAN_FAIL; + goto xa_unlocked; } else if (trylock_page(page)) { get_page(page); xas_unlock_irq(&xas); @@ -1785,7 +2288,8 @@ static void collapse_file(struct mm_struct *mm, goto out_unlock; } - if (!is_shmem && PageDirty(page)) { + if (!is_shmem && (PageDirty(page) || + PageWriteback(page))) { /* * khugepaged only works on read-only fd, so this * page is dirty because it hasn't been flushed @@ -1845,9 +2349,23 @@ out_unlock: } if (is_shmem) +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) + __inc_node_page_state(new_page, NR_SHMEM_64KB_THPS); + else + __inc_node_page_state(new_page, NR_SHMEM_THPS); +#else /* CONFIG_FINEGRAINED_THP */ __inc_node_page_state(new_page, NR_SHMEM_THPS); +#endif /* CONFIG_FINEGRAINED_THP */ else { +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) + __inc_node_page_state(new_page, NR_FILE_64KB_THPS); + else + __inc_node_page_state(new_page, NR_FILE_THPS); +#else /* CONFIG_FINEGRAINED_THP */ __inc_node_page_state(new_page, NR_FILE_THPS); +#endif /* CONFIG_FINEGRAINED_THP */ filemap_nr_thps_inc(mapping); } @@ -1863,6 +2381,9 @@ xa_unlocked: if (result == SCAN_SUCCEED) { struct page *page, *tmp; +#ifdef CONFIG_FINEGRAINED_THP + int offset = 0; +#endif /* * Replacing old pages with new one has succeeded, now we @@ -1870,12 +2391,28 @@ xa_unlocked: */ index = start; list_for_each_entry_safe(page, tmp, &pagelist, lru) { +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type != THP_TYPE_64KB) { + while (index < page->index) { + clear_highpage(new_page + (index % HPAGE_PMD_NR)); + index++; + } + } + + if (hpage_type == THP_TYPE_64KB) { + copy_highpage(new_page + offset, page); + offset++; + } else + copy_highpage(new_page + (page->index % HPAGE_PMD_NR), + page); +#else /* CONFIG_FINEGRAINED_THP */ while (index < page->index) { clear_highpage(new_page + (index % HPAGE_PMD_NR)); index++; } copy_highpage(new_page + (page->index % HPAGE_PMD_NR), page); +#endif /* CONFIG_FINEGRAINED_THP */ list_del(&page->lru); page->mapping = NULL; page_ref_unfreeze(page, 1); @@ -1885,13 +2422,32 @@ xa_unlocked: put_page(page); index++; } +#ifdef CONFIG_FINEGRAINED_THP + if (hpage_type == THP_TYPE_64KB) { + while (index < end) { + clear_highpage(new_page + offset); + offset++; + index++; + } + } else { + while (index < end) { + clear_highpage(new_page + (index % HPAGE_PMD_NR)); + index++; + } + } +#else 
/* CONFIG_FINEGRAINED_THP */ while (index < end) { clear_highpage(new_page + (index % HPAGE_PMD_NR)); index++; } +#endif /* CONFIG_FINEGRAINED_THP */ SetPageUptodate(new_page); +#ifdef CONFIG_FINEGRAINED_THP + page_ref_add(new_page, hpage_nr - 1); +#else page_ref_add(new_page, HPAGE_PMD_NR - 1); +#endif if (is_shmem) set_page_dirty(new_page); lru_cache_add(new_page); @@ -1899,9 +2455,14 @@ xa_unlocked: /* * Remove pte page tables, so we can re-fault the page as huge. */ +#ifdef CONFIG_FINEGRAINED_THP + retract_page_tables(mapping, start, hpage_type); + if (hpage_type == THP_TYPE_2MB) + *hpage = NULL; +#else /* CONFIG_FINEGRAINED_THP */ retract_page_tables(mapping, start); *hpage = NULL; - +#endif /* CONFIG_FINEGRAINED_THP */ khugepaged_pages_collapsed++; } else { struct page *page; @@ -1946,14 +2507,24 @@ xa_unlocked: unlock_page(new_page); out: +#ifdef CONFIG_FINEGRAINED_THP + if (result != SCAN_SUCCEED && new_page && hpage_type == THP_TYPE_64KB) + put_page(new_page); +#endif VM_BUG_ON(!list_empty(&pagelist)); if (!IS_ERR_OR_NULL(*hpage)) mem_cgroup_uncharge(*hpage); /* TODO: tracepoints */ } +#ifdef CONFIG_FINEGRAINED_THP +static void khugepaged_scan_file(struct mm_struct *mm, + struct file *file, pgoff_t start, struct page **hpage, + int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, pgoff_t start, struct page **hpage) +#endif /* CONFIG_FINEGRAINED_THP */ { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -1961,17 +2532,43 @@ static void khugepaged_scan_file(struct mm_struct *mm, int present, swap; int node = NUMA_NO_NODE; int result = SCAN_SUCCEED; +#ifdef CONFIG_FINEGRAINED_THP + int hpage_nr; + int max_ptes_swap, max_ptes_none, max_ptes_shared; + + if (hpage_type == THP_TYPE_64KB) { + hpage_nr = HPAGE_CONT_PTE_NR; /* 64KB */ + max_ptes_swap = khugepaged_max_ptes_swap_64kb; + max_ptes_none = khugepaged_max_ptes_none_64kb; + max_ptes_shared = khugepaged_max_ptes_shared_64kb; + } else { + hpage_nr = HPAGE_PMD_NR; /* 2MB */ + max_ptes_swap = khugepaged_max_ptes_swap; + max_ptes_none = khugepaged_max_ptes_none; + max_ptes_shared = khugepaged_max_ptes_shared; + } +#endif /* CONFIG_FINEGRAINED_THP */ present = 0; swap = 0; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); rcu_read_lock(); - xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { +#ifdef CONFIG_FINEGRAINED_THP + xas_for_each(&xas, page, start + hpage_nr - 1) +#else + xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) +#endif + { if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) { - if (++swap > khugepaged_max_ptes_swap) { +#ifdef CONFIG_FINEGRAINED_THP + if (++swap > max_ptes_swap) +#else + if (++swap > khugepaged_max_ptes_swap) +#endif + { result = SCAN_EXCEED_SWAP_PTE; break; } @@ -2017,19 +2614,34 @@ static void khugepaged_scan_file(struct mm_struct *mm, rcu_read_unlock(); if (result == SCAN_SUCCEED) { - if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { +#ifdef CONFIG_FINEGRAINED_THP + if (present < hpage_nr - max_ptes_none) +#else + if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) +#endif + { result = SCAN_EXCEED_NONE_PTE; } else { node = khugepaged_find_target_node(); +#ifdef CONFIG_FINEGRAINED_THP + collapse_file(mm, file, start, hpage, node, hpage_type); +#else collapse_file(mm, file, start, hpage, node); +#endif } } /* TODO: tracepoints */ } #else +#ifdef CONFIG_FINEGRAINED_THP +static void khugepaged_scan_file(struct mm_struct *mm, + struct file *file, pgoff_t start, struct page 
**hpage, + int hpage_type) +#else /* CONFIG_FINEGRAINED_THP */ static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, pgoff_t start, struct page **hpage) +#endif /* CONFIG_FINEGRAINED_THP */ { BUILD_BUG(); } @@ -2040,6 +2652,220 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) } #endif +#ifdef CONFIG_FINEGRAINED_THP +/* + * if return value > 0 -> vma can make hugepage + * calculated hugepage start and hugepage end are stored in pointers + * otherwise -> vma cannot make hugepage + */ +static inline int hugepage_determine_htype(unsigned long vm_start, + unsigned long vm_end, unsigned long *hstart, unsigned long *hend) { + unsigned long start, end; + + /* determine 2MB hugepage */ + start = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + end = vm_end & HPAGE_PMD_MASK; + if (start >= end) { + /* determine 64KB hugepage */ + start = (vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK; + end = vm_end & HPAGE_CONT_PTE_MASK; + if (start >= end) + return THP_TYPE_FAIL; + *hstart = start; + *hend = end; + return THP_TYPE_64KB; + } + *hstart = start; + *hend = end; + return THP_TYPE_2MB; +} + +enum { + KHUGEPAGE_SCAN_CONTINUE, + KHUGEPAGE_SCAN_BREAK, + KHUGEPAGE_SCAN_BREAK_MMAP_LOCK, +}; + +static unsigned int khugepaged_scan_vma(struct mm_struct *mm, + struct vm_area_struct *vma, struct page **hpage, + unsigned int pages, int *progress) +{ + unsigned long hstart, hend; + int hpage_type, ret; + int hpage_size, hpage_nr; + + if (!hugepage_vma_check(vma, vma->vm_flags)) + return KHUGEPAGE_SCAN_CONTINUE; + + hpage_type = hugepage_determine_htype( + (vma->vm_start > khugepaged_scan.address) ? + vma->vm_start : khugepaged_scan.address, + vma->vm_end, &hstart, &hend); + + if (hpage_type == THP_TYPE_FAIL) + return KHUGEPAGE_SCAN_CONTINUE; + if (khugepaged_scan.address > hend) + return KHUGEPAGE_SCAN_CONTINUE; + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + + if (hpage_type == THP_TYPE_64KB) { + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_CONT_PTE_MASK); + hpage_size = HPAGE_CONT_PTE_SIZE; /* 64KB */ + hpage_nr = HPAGE_CONT_PTE_NR; + } else if (hpage_type == THP_TYPE_2MB) { + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + hpage_size = HPAGE_PMD_SIZE; /* 2MB */ + hpage_nr = HPAGE_PMD_NR; + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && + !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + HPAGE_PMD_NR)) { + /* fallback, vma or file not aligned to 2MB */ + hpage_size = HPAGE_CONT_PTE_SIZE; /* 64KB */ + hpage_nr = HPAGE_CONT_PTE_NR; + hpage_type = THP_TYPE_64KB; + } + } else + BUG(); + + while (khugepaged_scan.address < hend) { + if (khugepaged_scan.address + hpage_size > hend) { + if (khugepaged_scan.address + HPAGE_CONT_PTE_SIZE < hend) { + hpage_size = HPAGE_CONT_PTE_SIZE; + hpage_nr = HPAGE_CONT_PTE_NR; + hpage_type = THP_TYPE_64KB; + } + } + ret = 0; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + return KHUGEPAGE_SCAN_BREAK; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + hpage_size > + hend); + if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + struct file *file = get_file(vma->vm_file); + pgoff_t pgoff = linear_page_index(vma, + khugepaged_scan.address); + + mmap_read_unlock(mm); + ret = 1; + khugepaged_scan_file(mm, file, pgoff, hpage, hpage_type); + fput(file); + } else { + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage, hpage_type); + } + /* move to next address */ + khugepaged_scan.address += hpage_size; + *progress += 
hpage_nr; + if (ret) + /* we released mmap_sem so break loop */ + return KHUGEPAGE_SCAN_BREAK_MMAP_LOCK; + if (*progress >= pages) + return KHUGEPAGE_SCAN_BREAK; + } + return KHUGEPAGE_SCAN_CONTINUE; +} + +static struct thp_scan_hint *find_scan_hint(struct mm_slot *slot, + unsigned long addr) +{ + struct thp_scan_hint *hint; + + list_for_each_entry(hint, &khugepaged_scan.hint_list, hint_list) { + if (hint->slot == slot) + return hint; + } + return NULL; +} + +#ifdef CONFIG_THP_CONSERVATIVE +/* caller must hold a proper mmap_lock */ +void khugepaged_mem_hook(struct mm_struct *mm, unsigned long addr, + long diff, const char *debug) +{ + struct mm_slot *slot; + struct vm_area_struct *vma; + struct thp_scan_hint *hint; + bool wakeup = false; + bool retry = false; + + vma = find_vma(mm, addr); + if (!hugepage_vma_check(vma, vma->vm_flags)) + return; + +again: + spin_lock(&khugepaged_mm_lock); + slot = get_mm_slot(mm); + if (!slot) { + /* make a new slot or go out */ + spin_unlock(&khugepaged_mm_lock); + if (retry) + return; + if (__khugepaged_enter(mm)) + return; + retry = true; + goto again; + } + + hint = find_scan_hint(slot, addr); + if (!hint) { + spin_unlock(&khugepaged_mm_lock); + hint = kzalloc(sizeof(struct thp_scan_hint), GFP_KERNEL); + hint->vma = vma; + hint->slot = slot; + hint->diff = 0; + hint->jiffies = jiffies; + spin_lock(&khugepaged_mm_lock); + list_add(&hint->hint_list, &khugepaged_scan.hint_list); + khugepaged_scan.nr_hint++; + } + hint->diff += diff; + if (hint->diff >= HPAGE_CONT_PTE_SIZE) { + wakeup = true; + //list_move(&hint->hint_list, &khugepaged_scan.hint_list); + } + spin_unlock(&khugepaged_mm_lock); + + /* if possible, wake khugepaged up for starting a scan */ + if (wakeup) { + wake_up_interruptible(&khugepaged_wait); + } +} +#else /* CONFIG_THP_CONSERVATIVE */ +void khugepaged_mem_hook(struct mm_struct *mm, + unsigned long addr, long diff, const char *debug) +{} +#endif /* CONFIG_THP_CONSERVATIVE */ + +static void clear_hint_list(struct mm_slot *slot) +{ + struct thp_scan_hint *hint; + hint = find_scan_hint(slot, 0); + if (hint) { + list_del(&hint->hint_list); + kfree(hint); + khugepaged_scan.nr_hint--; + } +} + +static struct thp_scan_hint *get_next_hint(void) +{ + if (!list_empty(&khugepaged_scan.hint_list)) { + struct thp_scan_hint *hint = list_first_entry( + &khugepaged_scan.hint_list, + struct thp_scan_hint, hint_list); + list_del(&hint->hint_list); + khugepaged_scan.nr_hint--; + return hint; + } + return NULL; +} +#endif /* CONFIG_FINEGRAINED_THP */ + static unsigned int khugepaged_scan_mm_slot(unsigned int pages, struct page **hpage) __releases(&khugepaged_mm_lock) @@ -2053,6 +2879,38 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); +#ifdef CONFIG_FINEGRAINED_THP + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else if (!list_empty(&khugepaged_scan.hint_list)) { + struct thp_scan_hint *hint; + long mem_diff; + unsigned long jiffies_diff; + +get_next_hint: + hint = get_next_hint(); + if (!hint) + goto get_next_slot; + + mm_slot = hint->slot; + mem_diff = hint->diff; + jiffies_diff = jiffies - hint->jiffies; + kfree(hint); + clear_hint_list(mm_slot); + + if (khugepaged_test_exit(mm_slot->mm)) + goto get_next_hint; + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } else { +get_next_slot: + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + clear_hint_list(mm_slot); + khugepaged_scan.address = 0; + 
khugepaged_scan.mm_slot = mm_slot; + } +#else /* CONFIG_FINEGRAINED_THP */ if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; else { @@ -2061,6 +2919,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, khugepaged_scan.address = 0; khugepaged_scan.mm_slot = mm_slot; } +#endif /* CONFIG_FINEGRAINED_THP */ spin_unlock(&khugepaged_mm_lock); khugepaged_collapse_pte_mapped_thps(mm_slot); @@ -2077,13 +2936,28 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; for (; vma; vma = vma->vm_next) { +#ifdef CONFIG_FINEGRAINED_THP + int ret; +#else unsigned long hstart, hend; +#endif cond_resched(); if (unlikely(khugepaged_test_exit(mm))) { progress++; break; } +#ifdef CONFIG_FINEGRAINED_THP + ret = khugepaged_scan_vma(mm, vma, hpage, pages, &progress); + + if (ret == KHUGEPAGE_SCAN_CONTINUE) { + progress++; + continue; + } else if (ret == KHUGEPAGE_SCAN_BREAK) + goto breakouterloop; + else if (ret == KHUGEPAGE_SCAN_BREAK_MMAP_LOCK) + goto breakouterloop_mmap_lock; +#else /* CONFIG_FINEGRAINED_THP */ if (!hugepage_vma_check(vma, vma->vm_flags)) { skip: progress++; @@ -2133,6 +3007,7 @@ skip: if (progress >= pages) goto breakouterloop; } +#endif /* CONFIG_FINEGRAINED_THP */ } breakouterloop: mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ @@ -2150,6 +3025,53 @@ breakouterloop_mmap_lock: * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ +#ifdef CONFIG_FINEGRAINED_THP + if (!list_empty(&khugepaged_scan.hint_list)) { + unsigned long jiffies_diff; + long mem_diff; + struct thp_scan_hint *hint; + struct mm_slot *next_slot; + +get_next_hint2: + hint = get_next_hint(); + + if (!hint) { + /* no more hint */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) + goto get_next_slot2; + else + goto end_loop; + } + + mem_diff = hint->diff; + jiffies_diff = jiffies - hint->jiffies; + next_slot = hint->slot; + kfree(hint); + + if (next_slot == mm_slot) + goto get_next_hint2; + + if (!khugepaged_test_exit(next_slot->mm)) { + list_move(&next_slot->mm_node, &mm_slot->mm_node); + clear_hint_list(next_slot); + } else + goto get_next_hint2; + + khugepaged_scan.mm_slot = next_slot; + khugepaged_scan.address = 0; + } else if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { +get_next_slot2: + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + clear_hint_list(khugepaged_scan.mm_slot); + khugepaged_scan.address = 0; + } else { +end_loop: + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } +#else /* CONFIG_FINEGRAINED_THP */ if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { khugepaged_scan.mm_slot = list_entry( mm_slot->mm_node.next, @@ -2159,7 +3081,7 @@ breakouterloop_mmap_lock: khugepaged_scan.mm_slot = NULL; khugepaged_full_scans++; } - +#endif /* CONFIG_FINEGRAINED_THP */ collect_mm_slot(mm_slot); } @@ -2240,6 +3162,9 @@ static void khugepaged_wait_work(void) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } +#include +bool eager_allocation = false; + static int khugepaged(void *none) { struct mm_slot *mm_slot;
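
Illustrative sketch (not part of the patch): the fine-grained THP path above decides between a 2MB PMD collapse and a 64KB contiguous-PTE collapse by rounding the candidate VMA range, preferring the larger size and falling back to the smaller one (see hugepage_determine_htype() in the khugepaged_scan_vma() hunk). The minimal user-space program below reproduces only that rounding logic under assumed constants for a 4KB-granule arm64 build (2MB PMD huge pages, 64KB contiguous-PTE groups); the determine_htype() name, the constants, and the example addresses are hypothetical stand-ins for the kernel's HPAGE_PMD_MASK/HPAGE_CONT_PTE_MASK plumbing.

	#include <stdio.h>

	#define PMD_HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed: 2MB PMD huge page */
	#define CONT_HPAGE_SIZE	(64UL * 1024)		/* assumed: 64KB contiguous-PTE group */
	#define PMD_HPAGE_MASK	(~(PMD_HPAGE_SIZE - 1))
	#define CONT_HPAGE_MASK	(~(CONT_HPAGE_SIZE - 1))

	enum thp_type { THP_TYPE_FAIL, THP_TYPE_64KB, THP_TYPE_2MB };

	static enum thp_type determine_htype(unsigned long vm_start,
					     unsigned long vm_end,
					     unsigned long *hstart,
					     unsigned long *hend)
	{
		unsigned long start, end;

		/* Try a 2MB window first: round vm_start up, vm_end down. */
		start = (vm_start + ~PMD_HPAGE_MASK) & PMD_HPAGE_MASK;
		end = vm_end & PMD_HPAGE_MASK;
		if (start < end) {
			*hstart = start;
			*hend = end;
			return THP_TYPE_2MB;
		}

		/* Otherwise fall back to a 64KB (contiguous-PTE) window. */
		start = (vm_start + ~CONT_HPAGE_MASK) & CONT_HPAGE_MASK;
		end = vm_end & CONT_HPAGE_MASK;
		if (start < end) {
			*hstart = start;
			*hend = end;
			return THP_TYPE_64KB;
		}

		return THP_TYPE_FAIL;
	}

	int main(void)
	{
		unsigned long hstart = 0, hend = 0;
		/* A 1MB VMA: too small for a 2MB collapse, still eligible for 64KB. */
		enum thp_type t = determine_htype(0x7f0000010000UL, 0x7f0000110000UL,
						  &hstart, &hend);

		printf("type=%d range=[%#lx, %#lx)\n", (int)t, hstart, hend);
		return 0;
	}

The prefer-2MB-then-fall-back ordering mirrors the design choice visible elsewhere in the patch: a VMA that is too small or too poorly aligned for a 2MB collapse can still be collapsed into 64KB blocks, which is also the fallback khugepaged_scan_vma() takes for file-backed VMAs whose vm_pgoff is not 2MB-aligned.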