#include <asm/tlb.h>
#include <asm/pgalloc.h>
+#ifdef CONFIG_FINEGRAINED_THP
+#include <asm/finegrained_thp.h>
+#include <asm/huge_mm.h>
+#else
+#include <asm-generic/finegrained_thp.h>
+#include <asm-generic/huge_mm.h>
+#endif
#include "internal.h"
enum scan_result {
#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+
/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;
+#ifdef CONFIG_FINEGRAINED_THP
+/*
+ * thp_scan_hint:
+ * used to tell khugepaged which address spaces have changed
+ * recently and should therefore be scanned first.
+ */
+struct thp_scan_hint {
+ struct mm_slot *slot; /* mm_slot this hint refers to */
+ struct vm_area_struct *vma; /* vma that recently changed */
+ unsigned long diff; /* memory difference */
+ unsigned long jiffies; /* time stamp for profiling purpose */
+ struct list_head hint_list;
+};
+
+/* THP type descriptor */
+enum {
+ THP_TYPE_FAIL, /* cannot make hugepage */
+ THP_TYPE_64KB, /* 64KB hugepage can be made, use CONT_PTE */
+ THP_TYPE_2MB, /* 2MB hugepage can be made, use PMD */
+};
+
+static unsigned int khugepaged_max_ptes_none_64kb __read_mostly;
+static unsigned int khugepaged_max_ptes_swap_64kb __read_mostly;
+static unsigned int khugepaged_max_ptes_shared_64kb __read_mostly;
+#endif /* CONFIG_FINEGRAINED_THP */
+
#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
struct list_head mm_head;
struct mm_slot *mm_slot;
unsigned long address;
+#ifdef CONFIG_FINEGRAINED_THP
+ int hpage_type;
+ int nr_hint;
+ struct list_head hint_list;
+#endif /* CONFIG_FINEGRAINED_THP */
};
static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+#ifdef CONFIG_FINEGRAINED_THP
+ .hint_list = LIST_HEAD_INIT(khugepaged_scan.hint_list),
+#endif
};
#ifdef CONFIG_SYSFS
khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
+#ifdef CONFIG_FINEGRAINED_THP
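+ /* 64KB (CONT_PTE) collapse limits use the same ratios as the 2MB defaults above */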
+ khugepaged_max_ptes_none_64kb = HPAGE_CONT_PTE_NR - 1;
+ khugepaged_max_ptes_swap_64kb = HPAGE_CONT_PTE_NR / 8;
+ khugepaged_max_ptes_shared_64kb = HPAGE_CONT_PTE_NR / 2;
+#endif
return 0;
}
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
+ return atomic_read(&mm->mm_users) == 0;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static void clear_hint_list(struct mm_slot *slot);
+#endif /* CONFIG_FINEGRAINED_THP */
+
static bool hugepage_vma_check(struct vm_area_struct *vma,
unsigned long vm_flags)
{
- if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
- (vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ if (!transhuge_vma_enabled(vma, vm_flags))
+ return false;
+
+ if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
+ vma->vm_pgoff, HPAGE_PMD_NR))
return false;
- if (shmem_file(vma->vm_file) ||
- (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
- vma->vm_file &&
- (vm_flags & VM_DENYWRITE))) {
- return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
- HPAGE_PMD_NR);
+ /* Check whether an arch-specific shmem hugepage can be used */
+ if (arch_hugepage_vma_shmem_check(vma, vm_flags))
+ return true;
+ /* Enabled via shmem mount options or sysfs settings. */
+ if (shmem_file(vma->vm_file))
+ return shmem_huge_enabled(vma);
+
+ /* THP settings require madvise. */
+ if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+ return false;
+
+ /* Check whether an arch-specific file hugepage can be used */
+ if (arch_hugepage_vma_file_check(vma, vm_flags))
+ return true;
+ /* Only regular files are valid */
+ else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+ (vm_flags & VM_DENYWRITE)) {
+ struct inode *inode = vma->vm_file->f_inode;
+
+ return S_ISREG(inode->i_mode);
}
+
if (!vma->anon_vma || vma->vm_ops)
return false;
if (vma_is_temporary_stack(vma))
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
return khugepaged_enter(vma, vm_flags);
+#ifdef CONFIG_FINEGRAINED_THP
+ hstart = (vma->vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK;
+ hend = vma->vm_end & HPAGE_CONT_PTE_MASK;
+ if (hstart < hend)
+ return khugepaged_enter(vma, vm_flags);
+#endif /* CONFIG_FINEGRAINED_THP */
return 0;
}
spin_lock(&khugepaged_mm_lock);
mm_slot = get_mm_slot(mm);
if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+#ifdef CONFIG_FINEGRAINED_THP
+ clear_hint_list(mm_slot);
+#endif
hash_del(&mm_slot->hash);
list_del(&mm_slot->mm_node);
free = 1;
return page_count(page) == expected_refcount;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *pte,
+ struct list_head *compound_pagelist,
+ int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
struct list_head *compound_pagelist)
+#endif /* CONFIG_FINEGRAINED_THP */
{
struct page *page = NULL;
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
bool writable = false;
+#ifdef CONFIG_FINEGRAINED_THP
+ int max_ptes_shared, max_ptes_none;
+ int hpage_nr;
+
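+ /* pick the pte count and scan limits for the requested hugepage type */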
+ if (hpage_type == THP_TYPE_64KB) {
+ hpage_nr = HPAGE_CONT_PTE_NR;
+ max_ptes_shared = khugepaged_max_ptes_shared_64kb;
+ max_ptes_none = khugepaged_max_ptes_none_64kb;
+ } else {
+ hpage_nr = HPAGE_PMD_NR;
+ max_ptes_shared = khugepaged_max_ptes_shared;
+ max_ptes_none = khugepaged_max_ptes_none;
+ }
+#endif /* CONFIG_FINEGRAINED_THP */
- for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ for (_pte = pte;
+#ifdef CONFIG_FINEGRAINED_THP
+ _pte < pte + hpage_nr;
+#else
+ _pte < pte+HPAGE_PMD_NR;
+#endif
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
+#ifdef CONFIG_FINEGRAINED_THP
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
+ ++none_or_zero <= max_ptes_none)
+#else /* CONFIG_FINEGRAINED_THP */
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none)
+#endif /* CONFIG_FINEGRAINED_THP */
+ {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
VM_BUG_ON_PAGE(!PageAnon(page), page);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (page_mapcount(page) > 1 &&
+ ++shared > max_ptes_shared)
+#else /* CONFIG_FINEGRAINED_THP */
if (page_mapcount(page) > 1 &&
- ++shared > khugepaged_max_ptes_shared) {
+ ++shared > khugepaged_max_ptes_shared)
+#endif /* CONFIG_FINEGRAINED_THP */
+ {
result = SCAN_EXCEED_SHARED_PTE;
goto out;
}
if (pte_write(pteval))
writable = true;
}
- if (likely(writable)) {
- if (likely(referenced)) {
- result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(page, none_or_zero,
- referenced, writable, result);
- return 1;
- }
- } else {
+
+ if (unlikely(!writable)) {
result = SCAN_PAGE_RO;
+ } else if (unlikely(!referenced)) {
+ result = SCAN_LACK_REFERENCED_PAGE;
+ } else {
+ result = SCAN_SUCCEED;
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, result);
+ return 1;
}
-
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
return 0;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl,
+ struct list_head *compound_pagelist,
+ int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
struct vm_area_struct *vma,
unsigned long address,
spinlock_t *ptl,
struct list_head *compound_pagelist)
+#endif /* CONFIG_FINEGRAINED_THP */
{
struct page *src_page, *tmp;
pte_t *_pte;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+#ifdef CONFIG_FINEGRAINED_THP
+ int hpage_nr = (hpage_type == THP_TYPE_64KB ?
+ HPAGE_CONT_PTE_NR : HPAGE_PMD_NR);
+#endif
+
+ for (_pte = pte;
+#ifdef CONFIG_FINEGRAINED_THP
+ _pte < pte + hpage_nr;
+#else
+ _pte < pte + HPAGE_PMD_NR;
+#endif
_pte++, page++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
return 0;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static inline struct page *alloc_khugepaged_hugepage(int hpage_order)
+#else
static inline struct page *alloc_khugepaged_hugepage(void)
+#endif
{
struct page *page;
+#ifdef CONFIG_FINEGRAINED_THP
+ page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
+ hpage_order);
+#else
page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
HPAGE_PMD_ORDER);
+#endif
if (page)
prep_transhuge_page(page);
return page;
struct page *hpage;
do {
+#ifdef CONFIG_FINEGRAINED_THP
+ hpage = alloc_khugepaged_hugepage(HPAGE_PMD_ORDER);
+#else
hpage = alloc_khugepaged_hugepage();
+#endif
if (!hpage) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
if (!*wait)
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
+ /*
+ * If the hpage allocated earlier was briefly exposed in page cache
+ * before collapse_file() failed, it is possible that racing lookups
+ * have not yet completed, and would then be unpleasantly surprised by
+ * finding the hpage reused for the same mapping at a different offset.
+ * Just release the previous allocation if there is any danger of that.
+ */
+ if (*hpage && page_count(*hpage) > 1) {
+ put_page(*hpage);
+ *hpage = NULL;
+ }
+
if (!*hpage)
*hpage = khugepaged_alloc_hugepage(wait);
return true;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node, int hpage_type)
+{
+ struct page *page;
+
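+ /* 64KB pages are allocated on demand; 2MB collapse reuses the preallocated *hpage */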
+ if (hpage_type == THP_TYPE_64KB)
+ page = alloc_khugepaged_hugepage(HPAGE_CONT_PTE_ORDER);
+ else {
+ VM_BUG_ON(!*hpage);
+ page = *hpage;
+ }
+ return page;
+}
+#else /* CONFIG_FINEGRAINED_THP */
static struct page *
khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
{
return *hpage;
}
+#endif /* CONFIG_FINEGRAINED_THP */
#endif
/*
* value (scan code).
*/
+#ifdef CONFIG_FINEGRAINED_THP
+static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
+ struct vm_area_struct **vmap, int hpage_type)
+#else
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
struct vm_area_struct **vmap)
+#endif
{
struct vm_area_struct *vma;
unsigned long hstart, hend;
if (!vma)
return SCAN_VMA_NULL;
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB) {
+ hstart = (vma->vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK;
+ hend = vma->vm_end & HPAGE_CONT_PTE_MASK;
+ if (address < hstart || address + HPAGE_CONT_PTE_SIZE > hend)
+ return SCAN_ADDRESS_RANGE;
+ if (!hugepage_vma_check(vma, vma->vm_flags))
+ return SCAN_VMA_CHECK;
+ return 0;
+ }
+#endif /* CONFIG_FINEGRAINED_THP */
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
* but with mmap_lock held to protect against vma changes.
*/
+#ifdef CONFIG_FINEGRAINED_THP
+static bool __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ int referenced, int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static bool __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
int referenced)
+#endif /* CONFIG_FINEGRAINED_THP */
{
int swapped_in = 0;
vm_fault_t ret = 0;
.pmd = pmd,
.pgoff = linear_page_index(vma, address),
};
+#ifdef CONFIG_FINEGRAINED_THP
+ int hpage_size = (hpage_type == THP_TYPE_64KB) ?
+ HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE;
+#endif
vmf.pte = pte_offset_map(pmd, address);
- for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ for (;
+#ifdef CONFIG_FINEGRAINED_THP
+ vmf.address < address + hpage_size;
+#else
+ vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+#endif
vmf.pte++, vmf.address += PAGE_SIZE) {
vmf.orig_pte = *vmf.pte;
if (!is_swap_pte(vmf.orig_pte))
/* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
if (ret & VM_FAULT_RETRY) {
mmap_read_lock(mm);
- if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hugepage_vma_revalidate(mm, address, &vmf.vma, hpage_type))
+#else
+ if (hugepage_vma_revalidate(mm, address, &vmf.vma))
+#endif
+ {
/* vma is no longer available, don't continue to swapin */
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
return true;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static void collapse_huge_page(struct mm_struct *mm,
+ unsigned long address,
+ struct page **hpage,
+ int node, int referenced, int unmapped,
+ int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
int node, int referenced, int unmapped)
+#endif /* CONFIG_FINEGRAINED_THP */
{
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
struct mmu_notifier_range range;
gfp_t gfp;
+#ifdef CONFIG_FINEGRAINED_THP
+ pte_t _pte;
+
+ VM_BUG_ON(address & (hpage_type == THP_TYPE_64KB ?
+ ~HPAGE_CONT_PTE_MASK : ~HPAGE_PMD_MASK));
+#else
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#endif
/* Only allocate from the target node */
gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
* that. We will recheck the vma after taking it again in write mode.
*/
mmap_read_unlock(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+ new_page = khugepaged_alloc_page(hpage, gfp, node, hpage_type);
+#else
new_page = khugepaged_alloc_page(hpage, gfp, node);
+#endif
if (!new_page) {
result = SCAN_ALLOC_HUGE_PAGE_FAIL;
goto out_nolock;
count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
mmap_read_lock(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+ result = hugepage_vma_revalidate(mm, address, &vma, hpage_type);
+#else
result = hugepage_vma_revalidate(mm, address, &vma);
+#endif
if (result) {
mmap_read_unlock(mm);
goto out_nolock;
* If it fails, we release mmap_lock and jump out_nolock.
* Continuing to collapse causes inconsistency.
*/
+#ifdef CONFIG_FINEGRAINED_THP
+ if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
+ pmd, referenced, hpage_type)) {
+ mmap_read_unlock(mm);
+ goto out_nolock;
+ }
+#else /* CONFIG_FINEGRAINED_THP */
if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
pmd, referenced)) {
mmap_read_unlock(mm);
goto out_nolock;
}
+#endif /* CONFIG_FINEGRAINED_THP*/
mmap_read_unlock(mm);
/*
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+ result = hugepage_vma_revalidate(mm, address, &vma, hpage_type);
+#else
result = hugepage_vma_revalidate(mm, address, &vma);
+#endif
if (result)
goto out;
/* check if the pmd is still valid */
anon_vma_lock_write(vma->anon_vma);
+#ifdef CONFIG_FINEGRAINED_THP
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+ address, address + (hpage_type == THP_TYPE_64KB ?
+ HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE));
+#else
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
address, address + HPAGE_PMD_SIZE);
+#endif
mmu_notifier_invalidate_range_start(&range);
pte = pte_offset_map(pmd, address);
* huge and small TLB entries for the same virtual address
* to avoid the risk of CPU bugs in that area.
*/
- _pmd = pmdp_collapse_flush(vma, address, pmd);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB)
+ /* FIXME: clearing the ptes here would make
+ * __collapse_huge_page_isolate and __collapse_huge_page_copy
+ * fail, because __collapse_huge_page_copy clears the ptes
+ * itself; so only flush the TLB here for now
+ */
+ flush_tlb_range(vma, address, address + HPAGE_CONT_PTE_SIZE);
+ else
+#endif /* CONFIG_FINEGRAINED_THP */
+ _pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
spin_lock(pte_ptl);
+#ifdef CONFIG_FINEGRAINED_THP
+ isolated = __collapse_huge_page_isolate(vma, address, pte,
+ &compound_pagelist, hpage_type);
+#else /* CONFIG_FINEGRAINED_THP */
isolated = __collapse_huge_page_isolate(vma, address, pte,
&compound_pagelist);
+#endif /* CONFIG_FINEGRAINED_THP */
spin_unlock(pte_ptl);
if (unlikely(!isolated)) {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB) {
+ pte_unmap(pte);
+ anon_vma_unlock_write(vma->anon_vma);
+ result = SCAN_FAIL;
+ goto out;
+ }
+#endif /* CONFIG_FINEGRAINED_THP */
pte_unmap(pte);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
*/
anon_vma_unlock_write(vma->anon_vma);
+#ifdef CONFIG_FINEGRAINED_THP
+ __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
+ &compound_pagelist, hpage_type);
+#else /* CONFIG_FINEGRAINED_THP */
__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
&compound_pagelist);
+#endif /* CONFIG_FINEGRAINED_THP */
pte_unmap(pte);
__SetPageUptodate(new_page);
+
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB) {
+ /* 64KB hugepage */
+ _pte = arch_make_huge_pte(new_page, vma);
+ _pte = maybe_mkwrite(pte_mkdirty(_pte), vma);
+ } else {
+ /* 2MB hugepage */
+ pgtable = pmd_pgtable(_pmd);
+
+ _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+ _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+ }
+#else /* CONFIG_FINEGRAINED_THP */
pgtable = pmd_pgtable(_pmd);
_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
-
+#endif /* CONFIG_FINEGRAINED_THP */
/*
* spin_lock() below is not the equivalent of smp_wmb(), so
* this is needed to avoid the copy_huge_page writes to become
smp_wmb();
spin_lock(pmd_ptl);
- BUG_ON(!pmd_none(*pmd));
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_2MB)
+#endif
+ BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
lru_cache_add_inactive_or_unevictable(new_page, vma);
+
+#ifdef CONFIG_FINEGRAINED_THP
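+ /* 64KB: install the contiguous ptes directly; 2MB: deposit the pgtable and set the pmd */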
+ if (hpage_type == THP_TYPE_64KB)
+ arch_set_huge_pte_at(mm, address, pte, _pte, 0);
+ else {
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ set_pmd_at(mm, address, pmd, _pmd);
+ }
+ update_mmu_cache_pmd(vma, address, pmd);
+#else /* CONFIG_FINEGRAINED_THP */
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
+#endif /* CONFIG_FINEGRAINED_THP */
spin_unlock(pmd_ptl);
- *hpage = NULL;
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_2MB)
+#endif
+ *hpage = NULL;
khugepaged_pages_collapsed++;
result = SCAN_SUCCEED;
out_nolock:
if (!IS_ERR_OR_NULL(*hpage))
mem_cgroup_uncharge(*hpage);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (result != SCAN_SUCCEED && new_page && hpage_type == THP_TYPE_64KB)
+ put_page(new_page);
+#endif
trace_mm_collapse_huge_page(mm, isolated, result);
return;
out:
goto out_up_write;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ struct page **hpage, int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static int khugepaged_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
struct page **hpage)
+#endif /* CONFIG_FINEGRAINED_THP */
{
pmd_t *pmd;
pte_t *pte, *_pte;
int node = NUMA_NO_NODE, unmapped = 0;
bool writable = false;
+#ifdef CONFIG_FINEGRAINED_THP
+ int hpage_nr;
+ int max_ptes_swap, max_ptes_none, max_ptes_shared;
+
+ if (hpage_type == THP_TYPE_64KB) {
+ VM_BUG_ON(address & ~HPAGE_CONT_PTE_MASK);
+ hpage_nr = HPAGE_CONT_PTE_NR;
+ max_ptes_swap = khugepaged_max_ptes_swap_64kb;
+ max_ptes_none = khugepaged_max_ptes_none_64kb;
+ max_ptes_shared = khugepaged_max_ptes_shared_64kb;
+ } else {
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ hpage_nr = HPAGE_PMD_NR;
+ max_ptes_swap = khugepaged_max_ptes_swap;
+ max_ptes_none = khugepaged_max_ptes_none;
+ max_ptes_shared = khugepaged_max_ptes_shared;
+ }
+#else /* CONFIG_FINEGRAINED_THP */
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#endif /* CONFIG_FINEGRAINED_THP */
pmd = mm_find_pmd(mm, address);
if (!pmd) {
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ for (_address = address, _pte = pte;
+#ifdef CONFIG_FINEGRAINED_THP
+ _pte < pte + hpage_nr;
+#else
+ _pte < pte+HPAGE_PMD_NR;
+#endif
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (is_swap_pte(pteval)) {
- if (++unmapped <= khugepaged_max_ptes_swap) {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (++unmapped <= max_ptes_swap)
+#else
+ if (++unmapped <= khugepaged_max_ptes_swap)
+#endif
+ {
/*
* Always be strict with uffd-wp
* enabled swap entries. Please see
}
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
+#ifdef CONFIG_FINEGRAINED_THP
+ ++none_or_zero <= max_ptes_none
+#else
+ ++none_or_zero <= khugepaged_max_ptes_none
+#endif
+ )
+ {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
goto out_unmap;
}
+#ifdef CONFIG_FINEGRAINED_THP
+ if (PageCompound(page) && PageTransHuge(compound_head(page))) {
+ result = SCAN_PAGE_COMPOUND;
+ goto out_unmap;
+ }
+
+ if (page_mapcount(page) > 1 &&
+ ++shared > max_ptes_shared)
+#else
if (page_mapcount(page) > 1 &&
- ++shared > khugepaged_max_ptes_shared) {
+ ++shared > khugepaged_max_ptes_shared)
+#endif
+ {
result = SCAN_EXCEED_SHARED_PTE;
goto out_unmap;
}
if (ret) {
node = khugepaged_find_target_node();
/* collapse_huge_page will return with the mmap_lock released */
+#ifdef CONFIG_FINEGRAINED_THP
+ collapse_huge_page(mm, address, hpage, node,
+ referenced, unmapped, hpage_type);
+#else
collapse_huge_page(mm, address, hpage, node,
referenced, unmapped);
+#endif
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
lockdep_assert_held(&khugepaged_mm_lock);
if (khugepaged_test_exit(mm)) {
+#ifdef CONFIG_FINEGRAINED_THP
+ clear_hint_list(mm_slot);
+#endif
/* free mm_slot */
hash_del(&mm_slot->hash);
list_del(&mm_slot->mm_node);
* Notify khugepaged that given addr of the mm is pte-mapped THP. Then
* khugepaged should try to collapse the page table.
*/
+#ifdef CONFIG_FINEGRAINED_THP
+static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr, int hpage_type)
+#else
static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
unsigned long addr)
+#endif
{
struct mm_slot *mm_slot;
+#ifdef CONFIG_FINEGRAINED_THP
+ VM_BUG_ON(addr & (hpage_type == THP_TYPE_64KB ?
+ ~HPAGE_CONT_PTE_MASK : ~HPAGE_PMD_MASK));
+#else
VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+#endif
spin_lock(&khugepaged_mm_lock);
mm_slot = get_mm_slot(mm);
+#ifdef CONFIG_FINEGRAINED_THP
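+ /*
+ * Encode the hugepage type in bit 0 of the (page-aligned) address;
+ * the pte-mapped THP collapse path decodes it again.
+ */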
+ if (hpage_type == THP_TYPE_64KB)
+ addr |= 0x01;
+#endif
if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
spin_unlock(&khugepaged_mm_lock);
spinlock_t *ptl;
int count = 0;
int i;
+#ifdef CONFIG_FINEGRAINED_THP
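+ /* bit 0 of addr carries the hugepage type, see khugepaged_add_pte_mapped_thp() */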
+ int hpage_type = (addr & 0x01) ? THP_TYPE_64KB : THP_TYPE_2MB;
+ int hpage_nr = (hpage_type == THP_TYPE_64KB) ?
+ HPAGE_CONT_PTE_NR : HPAGE_PMD_NR;
+ int hpage_size = (hpage_type == THP_TYPE_64KB) ?
+ HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE;
+
+ if (hpage_type == THP_TYPE_64KB)
+ haddr = addr & HPAGE_CONT_PTE_MASK;
+#endif
+#ifdef CONFIG_FINEGRAINED_THP
+ if (!vma || !vma->vm_file ||
+ vma->vm_start > haddr || vma->vm_end < haddr + hpage_size)
+ return;
+#else /* CONFIG_FINEGRAINED_THP */
if (!vma || !vma->vm_file ||
vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
return;
+#endif /* CONFIG_FINEGRAINED_THP */
/*
* This vm_flags may not have VM_HUGEPAGE if the page was not
goto drop_hpage;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+#ifdef CONFIG_FINEGRAINED_THP
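+ /* range is already mapped with contiguous ptes, nothing to collapse */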
+ if (pte_cont(*start_pte)) {
+ pte_unmap_unlock(start_pte, ptl);
+ goto drop_hpage;
+ }
+#endif
/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
- i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+#ifdef CONFIG_FINEGRAINED_THP
+ i < hpage_nr;
+#else
+ i < HPAGE_PMD_NR;
+#endif
+ i++, addr += PAGE_SIZE, pte++) {
struct page *page;
/* empty pte, skip */
/* step 2: adjust rmap */
for (i = 0, addr = haddr, pte = start_pte;
- i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+#ifdef CONFIG_FINEGRAINED_THP
+ i < hpage_nr;
+#else
+ i < HPAGE_PMD_NR;
+#endif
+ i++, addr += PAGE_SIZE, pte++) {
struct page *page;
if (pte_none(*pte))
/* step 4: collapse pmd */
ptl = pmd_lock(vma->vm_mm, pmd);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB) {
+ pte_t *ptep = pte_offset_map(pmd, haddr);
+ arch_clear_huge_pte_range(vma->vm_mm, haddr, ptep);
+ spin_unlock(ptl);
+ } else {
+ _pmd = pmdp_collapse_flush(vma, haddr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ }
+#else /* CONFIG_FINEGRAINED_THP*/
_pmd = pmdp_collapse_flush(vma, haddr, pmd);
spin_unlock(ptl);
mm_dec_nr_ptes(mm);
pte_free(mm, pmd_pgtable(_pmd));
+#endif /* CONFIG_FINEGRAINED_THP */
drop_hpage:
unlock_page(hpage);
return 0;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
+ int hpage_type)
+#else
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+#endif
{
struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long addr;
pmd_t *pmd, _pmd;
+#ifdef CONFIG_FINEGRAINED_THP
+ pte_t *ptep;
+ int hpage_size = (hpage_type == THP_TYPE_64KB) ?
+ HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE;
+#endif /* CONFIG_FINEGRAINED_THP */
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (vma->anon_vma)
continue;
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB && addr & ~HPAGE_CONT_PTE_MASK)
+ continue;
+ else if (hpage_type == THP_TYPE_2MB && addr & ~HPAGE_PMD_MASK)
+ continue;
+ if (vma->vm_end < addr + hpage_size)
+ continue;
+
+ mm = vma->vm_mm;
+ pmd = mm_find_pmd(mm, addr);
+ if (!pmd)
+ continue;
+ if (mmap_write_trylock(mm)) {
+ spinlock_t *ptl = pmd_lock(mm, pmd);
+ if (hpage_type == THP_TYPE_64KB) {
+ /* 64KB hugepage */
+ ptep = pte_offset_map(pmd, addr);
+ /* pte mappings will be re-established by the page fault handler */
+ arch_clear_huge_pte_range(mm, addr, ptep);
+ spin_unlock(ptl);
+ } else {
+ /* 2MB hugepage */
+ /*
+ * We need exclusive mmap_lock to retract the page table.
+ *
+ * We use trylock due to lock inversion: we need to acquire
+ * mmap_lock while holding the page lock. The fault path does it
+ * in the reverse order. Trylock is a way to avoid deadlock.
+ */
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ }
+ mmap_write_unlock(mm);
+ } else {
+ /* Try again later */
+ khugepaged_add_pte_mapped_thp(vma->vm_mm, addr, hpage_type);
+ }
+#else /* CONFIG_FINEGRAINED_THP */
if (addr & ~HPAGE_PMD_MASK)
continue;
if (vma->vm_end < addr + HPAGE_PMD_SIZE)
/* Try again later */
khugepaged_add_pte_mapped_thp(mm, addr);
}
+#endif /* CONFIG_FINEGRAINED_THP */
}
i_mmap_unlock_write(mapping);
}
* + restore gaps in the page cache;
* + unlock and free huge page;
*/
+#ifdef CONFIG_FINEGRAINED_THP
+static void collapse_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start,
+ struct page **hpage, int node, int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static void collapse_file(struct mm_struct *mm,
struct file *file, pgoff_t start,
struct page **hpage, int node)
+#endif /* CONFIG_FINEGRAINED_THP */
{
struct address_space *mapping = file->f_mapping;
gfp_t gfp;
struct page *new_page;
+#ifdef CONFIG_FINEGRAINED_THP
+ int hpage_nr = (hpage_type == THP_TYPE_64KB ?
+ HPAGE_CONT_PTE_NR : HPAGE_PMD_NR);
+ int hpage_order = (hpage_type == THP_TYPE_64KB ?
+ HPAGE_CONT_PTE_ORDER : HPAGE_PMD_ORDER);
+ pgoff_t index, end = start + hpage_nr;
+#else /* CONFIG_FINEGRAINED_THP */
pgoff_t index, end = start + HPAGE_PMD_NR;
+#endif /* CONFIG_FINEGRAINED_THP */
LIST_HEAD(pagelist);
+#ifdef CONFIG_FINEGRAINED_THP
+ XA_STATE_ORDER(xas, &mapping->i_pages, start, hpage_order);
+#else
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
+#endif
int nr_none = 0, result = SCAN_SUCCEED;
bool is_shmem = shmem_file(file);
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
+#ifdef CONFIG_FINEGRAINED_THP
+ VM_BUG_ON(start & (hpage_nr - 1));
+#else
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
+#endif
/* Only allocate from the target node */
gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
+#ifdef CONFIG_FINEGRAINED_THP
+ new_page = khugepaged_alloc_page(hpage, gfp, node, hpage_type);
+#else
new_page = khugepaged_alloc_page(hpage, gfp, node);
+#endif
if (!new_page) {
result = SCAN_ALLOC_HUGE_PAGE_FAIL;
goto out;
filemap_flush(mapping);
result = SCAN_FAIL;
goto xa_unlocked;
+ } else if (PageWriteback(page)) {
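+ /* page is still under writeback: give up on this collapse attempt */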
+ xas_unlock_irq(&xas);
+ result = SCAN_FAIL;
+ goto xa_unlocked;
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
goto out_unlock;
}
- if (!is_shmem && PageDirty(page)) {
+ if (!is_shmem && (PageDirty(page) ||
+ PageWriteback(page))) {
/*
* khugepaged only works on read-only fd, so this
* page is dirty because it hasn't been flushed
}
if (is_shmem)
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB)
+ __inc_node_page_state(new_page, NR_SHMEM_64KB_THPS);
+ else
+ __inc_node_page_state(new_page, NR_SHMEM_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
__inc_node_page_state(new_page, NR_SHMEM_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
else {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB)
+ __inc_node_page_state(new_page, NR_FILE_64KB_THPS);
+ else
+ __inc_node_page_state(new_page, NR_FILE_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
__inc_node_page_state(new_page, NR_FILE_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
filemap_nr_thps_inc(mapping);
}
if (result == SCAN_SUCCEED) {
struct page *page, *tmp;
+#ifdef CONFIG_FINEGRAINED_THP
+ int offset = 0;
+#endif
/*
* Replacing old pages with new one has succeeded, now we
*/
index = start;
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type != THP_TYPE_64KB) {
+ while (index < page->index) {
+ clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ index++;
+ }
+ }
+
+ if (hpage_type == THP_TYPE_64KB) {
+ copy_highpage(new_page + offset, page);
+ offset++;
+ } else
+ copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
+ page);
+#else /* CONFIG_FINEGRAINED_THP */
while (index < page->index) {
clear_highpage(new_page + (index % HPAGE_PMD_NR));
index++;
}
copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
page);
+#endif /* CONFIG_FINEGRAINED_THP */
list_del(&page->lru);
page->mapping = NULL;
page_ref_unfreeze(page, 1);
put_page(page);
index++;
}
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB) {
+ while (index < end) {
+ clear_highpage(new_page + offset);
+ offset++;
+ index++;
+ }
+ } else {
+ while (index < end) {
+ clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ index++;
+ }
+ }
+#else /* CONFIG_FINEGRAINED_THP */
while (index < end) {
clear_highpage(new_page + (index % HPAGE_PMD_NR));
index++;
}
+#endif /* CONFIG_FINEGRAINED_THP */
SetPageUptodate(new_page);
+#ifdef CONFIG_FINEGRAINED_THP
+ page_ref_add(new_page, hpage_nr - 1);
+#else
page_ref_add(new_page, HPAGE_PMD_NR - 1);
+#endif
if (is_shmem)
set_page_dirty(new_page);
lru_cache_add(new_page);
/*
* Remove pte page tables, so we can re-fault the page as huge.
*/
+#ifdef CONFIG_FINEGRAINED_THP
+ retract_page_tables(mapping, start, hpage_type);
+ if (hpage_type == THP_TYPE_2MB)
+ *hpage = NULL;
+#else /* CONFIG_FINEGRAINED_THP */
retract_page_tables(mapping, start);
*hpage = NULL;
-
+#endif /* CONFIG_FINEGRAINED_THP */
khugepaged_pages_collapsed++;
} else {
struct page *page;
unlock_page(new_page);
out:
+#ifdef CONFIG_FINEGRAINED_THP
+ if (result != SCAN_SUCCEED && new_page && hpage_type == THP_TYPE_64KB)
+ put_page(new_page);
+#endif
VM_BUG_ON(!list_empty(&pagelist));
if (!IS_ERR_OR_NULL(*hpage))
mem_cgroup_uncharge(*hpage);
/* TODO: tracepoints */
}
+#ifdef CONFIG_FINEGRAINED_THP
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage,
+ int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static void khugepaged_scan_file(struct mm_struct *mm,
struct file *file, pgoff_t start, struct page **hpage)
+#endif /* CONFIG_FINEGRAINED_THP */
{
struct page *page = NULL;
struct address_space *mapping = file->f_mapping;
int present, swap;
int node = NUMA_NO_NODE;
int result = SCAN_SUCCEED;
+#ifdef CONFIG_FINEGRAINED_THP
+ int hpage_nr;
+ int max_ptes_swap, max_ptes_none, max_ptes_shared;
+
+ if (hpage_type == THP_TYPE_64KB) {
+ hpage_nr = HPAGE_CONT_PTE_NR; /* 64KB */
+ max_ptes_swap = khugepaged_max_ptes_swap_64kb;
+ max_ptes_none = khugepaged_max_ptes_none_64kb;
+ max_ptes_shared = khugepaged_max_ptes_shared_64kb;
+ } else {
+ hpage_nr = HPAGE_PMD_NR; /* 2MB */
+ max_ptes_swap = khugepaged_max_ptes_swap;
+ max_ptes_none = khugepaged_max_ptes_none;
+ max_ptes_shared = khugepaged_max_ptes_shared;
+ }
+#endif /* CONFIG_FINEGRAINED_THP */
present = 0;
swap = 0;
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
rcu_read_lock();
- xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
+#ifdef CONFIG_FINEGRAINED_THP
+ xas_for_each(&xas, page, start + hpage_nr - 1)
+#else
+ xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1)
+#endif
+ {
if (xas_retry(&xas, page))
continue;
if (xa_is_value(page)) {
- if (++swap > khugepaged_max_ptes_swap) {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (++swap > max_ptes_swap)
+#else
+ if (++swap > khugepaged_max_ptes_swap)
+#endif
+ {
result = SCAN_EXCEED_SWAP_PTE;
break;
}
rcu_read_unlock();
if (result == SCAN_SUCCEED) {
- if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (present < hpage_nr - max_ptes_none)
+#else
+ if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none)
+#endif
+ {
result = SCAN_EXCEED_NONE_PTE;
} else {
node = khugepaged_find_target_node();
+#ifdef CONFIG_FINEGRAINED_THP
+ collapse_file(mm, file, start, hpage, node, hpage_type);
+#else
collapse_file(mm, file, start, hpage, node);
+#endif
}
}
/* TODO: tracepoints */
}
#else
+#ifdef CONFIG_FINEGRAINED_THP
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage,
+ int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static void khugepaged_scan_file(struct mm_struct *mm,
struct file *file, pgoff_t start, struct page **hpage)
+#endif /* CONFIG_FINEGRAINED_THP */
{
BUILD_BUG();
}
}
#endif
+#ifdef CONFIG_FINEGRAINED_THP
+/*
+ * Decide which hugepage type the range [vm_start, vm_end) can hold.
+ * Returns THP_TYPE_2MB or THP_TYPE_64KB and stores the aligned hugepage
+ * start/end in *hstart/*hend, or THP_TYPE_FAIL if no hugepage fits.
+ */
+static inline int hugepage_determine_htype(unsigned long vm_start,
+ unsigned long vm_end, unsigned long *hstart, unsigned long *hend)
+{
+ unsigned long start, end;
+
+ /* determine 2MB hugepage */
+ start = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ end = vm_end & HPAGE_PMD_MASK;
+ if (start >= end) {
+ /* determine 64KB hugepage */
+ start = (vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK;
+ end = vm_end & HPAGE_CONT_PTE_MASK;
+ if (start >= end)
+ return THP_TYPE_FAIL;
+ *hstart = start;
+ *hend = end;
+ return THP_TYPE_64KB;
+ }
+ *hstart = start;
+ *hend = end;
+ return THP_TYPE_2MB;
+}
+
+enum {
+ KHUGEPAGE_SCAN_CONTINUE, /* move on to the next vma */
+ KHUGEPAGE_SCAN_BREAK, /* stop scanning; mmap_lock is still held */
+ KHUGEPAGE_SCAN_BREAK_MMAP_LOCK, /* stop scanning; mmap_lock was released */
+};
+
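+/*
+ * Scan one vma, choosing a 64KB or 2MB collapse for each candidate region.
+ * Returns a KHUGEPAGE_SCAN_* code telling the caller how to proceed.
+ */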
+static unsigned int khugepaged_scan_vma(struct mm_struct *mm,
+ struct vm_area_struct *vma, struct page **hpage,
+ unsigned int pages, int *progress)
+{
+ unsigned long hstart, hend;
+ int hpage_type, ret;
+ int hpage_size, hpage_nr;
+
+ if (!hugepage_vma_check(vma, vma->vm_flags))
+ return KHUGEPAGE_SCAN_CONTINUE;
+
+ hpage_type = hugepage_determine_htype(
+ (vma->vm_start > khugepaged_scan.address) ?
+ vma->vm_start : khugepaged_scan.address,
+ vma->vm_end, &hstart, &hend);
+
+ if (hpage_type == THP_TYPE_FAIL)
+ return KHUGEPAGE_SCAN_CONTINUE;
+ if (khugepaged_scan.address > hend)
+ return KHUGEPAGE_SCAN_CONTINUE;
+ if (khugepaged_scan.address < hstart)
+ khugepaged_scan.address = hstart;
+
+ if (hpage_type == THP_TYPE_64KB) {
+ VM_BUG_ON(khugepaged_scan.address & ~HPAGE_CONT_PTE_MASK);
+ hpage_size = HPAGE_CONT_PTE_SIZE; /* 64KB */
+ hpage_nr = HPAGE_CONT_PTE_NR;
+ } else if (hpage_type == THP_TYPE_2MB) {
+ VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+ hpage_size = HPAGE_PMD_SIZE; /* 2MB */
+ hpage_nr = HPAGE_PMD_NR;
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+ !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+ HPAGE_PMD_NR)) {
+ /* fallback, vma or file not aligned to 2MB */
+ hpage_size = HPAGE_CONT_PTE_SIZE; /* 64KB */
+ hpage_nr = HPAGE_CONT_PTE_NR;
+ hpage_type = THP_TYPE_64KB;
+ }
+ } else {
+ BUG();
+ }
+
+ while (khugepaged_scan.address < hend) {
+ if (khugepaged_scan.address + hpage_size > hend) {
+ if (khugepaged_scan.address + HPAGE_CONT_PTE_SIZE <= hend) {
+ hpage_size = HPAGE_CONT_PTE_SIZE;
+ hpage_nr = HPAGE_CONT_PTE_NR;
+ hpage_type = THP_TYPE_64KB;
+ }
+ }
+ ret = 0;
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm)))
+ return KHUGEPAGE_SCAN_BREAK;
+
+ VM_BUG_ON(khugepaged_scan.address < hstart ||
+ khugepaged_scan.address + hpage_size >
+ hend);
+ if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ struct file *file = get_file(vma->vm_file);
+ pgoff_t pgoff = linear_page_index(vma,
+ khugepaged_scan.address);
+
+ mmap_read_unlock(mm);
+ ret = 1;
+ khugepaged_scan_file(mm, file, pgoff, hpage, hpage_type);
+ fput(file);
+ } else {
+ ret = khugepaged_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ hpage, hpage_type);
+ }
+ /* move to next address */
+ khugepaged_scan.address += hpage_size;
+ *progress += hpage_nr;
+ if (ret)
+ /* we released mmap_lock, so break out of the loop */
+ return KHUGEPAGE_SCAN_BREAK_MMAP_LOCK;
+ if (*progress >= pages)
+ return KHUGEPAGE_SCAN_BREAK;
+ }
+ return KHUGEPAGE_SCAN_CONTINUE;
+}
+
+static struct thp_scan_hint *find_scan_hint(struct mm_slot *slot,
+ unsigned long addr)
+{
+ struct thp_scan_hint *hint;
+
+ list_for_each_entry(hint, &khugepaged_scan.hint_list, hint_list) {
+ if (hint->slot == slot)
+ return hint;
+ }
+ return NULL;
+}
+
+#ifdef CONFIG_THP_CONSERVATIVE
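+/*
+ * Record that roughly @diff bytes were recently mapped around @addr in @mm,
+ * so khugepaged can prioritize scanning that mm. Once a slot accumulates at
+ * least HPAGE_CONT_PTE_SIZE worth of change, khugepaged is woken up.
+ */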
+/* caller must hold a proper mmap_lock */
+void khugepaged_mem_hook(struct mm_struct *mm, unsigned long addr,
+ long diff, const char *debug)
+{
+ struct mm_slot *slot;
+ struct vm_area_struct *vma;
+ struct thp_scan_hint *hint;
+ bool wakeup = false;
+ bool retry = false;
+
+ vma = find_vma(mm, addr);
+ if (!vma || !hugepage_vma_check(vma, vma->vm_flags))
+ return;
+
+again:
+ spin_lock(&khugepaged_mm_lock);
+ slot = get_mm_slot(mm);
+ if (!slot) {
+ /* make a new slot or go out */
+ spin_unlock(&khugepaged_mm_lock);
+ if (retry)
+ return;
+ if (__khugepaged_enter(mm))
+ return;
+ retry = true;
+ goto again;
+ }
+
+ hint = find_scan_hint(slot, addr);
+ if (!hint) {
+ spin_unlock(&khugepaged_mm_lock);
+ hint = kzalloc(sizeof(struct thp_scan_hint), GFP_KERNEL);
+ if (!hint)
+ return;
+ hint->vma = vma;
+ hint->slot = slot;
+ hint->diff = 0;
+ hint->jiffies = jiffies;
+ spin_lock(&khugepaged_mm_lock);
+ list_add(&hint->hint_list, &khugepaged_scan.hint_list);
+ khugepaged_scan.nr_hint++;
+ }
+ hint->diff += diff;
+ if (hint->diff >= HPAGE_CONT_PTE_SIZE) {
+ wakeup = true;
+ //list_move(&hint->hint_list, &khugepaged_scan.hint_list);
+ }
+ spin_unlock(&khugepaged_mm_lock);
+
+ /* if possible, wake khugepaged up for starting a scan */
+ if (wakeup) {
+ wake_up_interruptible(&khugepaged_wait);
+ }
+}
+#else /* CONFIG_THP_CONSERVATIVE */
+void khugepaged_mem_hook(struct mm_struct *mm,
+ unsigned long addr, long diff, const char *debug)
+{}
+#endif /* CONFIG_THP_CONSERVATIVE */
+
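+/* drop the pending scan hint that refers to @slot, if there is one */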
+static void clear_hint_list(struct mm_slot *slot)
+{
+ struct thp_scan_hint *hint;
+ hint = find_scan_hint(slot, 0);
+ if (hint) {
+ list_del(&hint->hint_list);
+ kfree(hint);
+ khugepaged_scan.nr_hint--;
+ }
+}
+
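+/* detach and return the next pending scan hint, or NULL if the list is empty */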
+static struct thp_scan_hint *get_next_hint(void)
+{
+ if (!list_empty(&khugepaged_scan.hint_list)) {
+ struct thp_scan_hint *hint = list_first_entry(
+ &khugepaged_scan.hint_list,
+ struct thp_scan_hint, hint_list);
+ list_del(&hint->hint_list);
+ khugepaged_scan.nr_hint--;
+ return hint;
+ }
+ return NULL;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
struct page **hpage)
__releases(&khugepaged_mm_lock)
VM_BUG_ON(!pages);
lockdep_assert_held(&khugepaged_mm_lock);
+#ifdef CONFIG_FINEGRAINED_THP
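+ /*
+ * Resume the slot already being scanned if there is one; otherwise
+ * prefer a hinted mm over the plain round-robin mm_head order.
+ */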
+ if (khugepaged_scan.mm_slot)
+ mm_slot = khugepaged_scan.mm_slot;
+ else if (!list_empty(&khugepaged_scan.hint_list)) {
+ struct thp_scan_hint *hint;
+ long mem_diff;
+ unsigned long jiffies_diff;
+
+get_next_hint:
+ hint = get_next_hint();
+ if (!hint)
+ goto get_next_slot;
+
+ mm_slot = hint->slot;
+ mem_diff = hint->diff;
+ jiffies_diff = jiffies - hint->jiffies;
+ kfree(hint);
+ clear_hint_list(mm_slot);
+
+ if (khugepaged_test_exit(mm_slot->mm))
+ goto get_next_hint;
+ khugepaged_scan.address = 0;
+ khugepaged_scan.mm_slot = mm_slot;
+ } else {
+get_next_slot:
+ mm_slot = list_entry(khugepaged_scan.mm_head.next,
+ struct mm_slot, mm_node);
+ clear_hint_list(mm_slot);
+ khugepaged_scan.address = 0;
+ khugepaged_scan.mm_slot = mm_slot;
+ }
+#else /* CONFIG_FINEGRAINED_THP */
if (khugepaged_scan.mm_slot)
mm_slot = khugepaged_scan.mm_slot;
else {
khugepaged_scan.address = 0;
khugepaged_scan.mm_slot = mm_slot;
}
+#endif /* CONFIG_FINEGRAINED_THP */
spin_unlock(&khugepaged_mm_lock);
khugepaged_collapse_pte_mapped_thps(mm_slot);
progress++;
for (; vma; vma = vma->vm_next) {
+#ifdef CONFIG_FINEGRAINED_THP
+ int ret;
+#else
unsigned long hstart, hend;
+#endif
cond_resched();
if (unlikely(khugepaged_test_exit(mm))) {
progress++;
break;
}
+#ifdef CONFIG_FINEGRAINED_THP
+ ret = khugepaged_scan_vma(mm, vma, hpage, pages, &progress);
+
+ if (ret == KHUGEPAGE_SCAN_CONTINUE) {
+ progress++;
+ continue;
+ } else if (ret == KHUGEPAGE_SCAN_BREAK)
+ goto breakouterloop;
+ else if (ret == KHUGEPAGE_SCAN_BREAK_MMAP_LOCK)
+ goto breakouterloop_mmap_lock;
+#else /* CONFIG_FINEGRAINED_THP */
if (!hugepage_vma_check(vma, vma->vm_flags)) {
skip:
progress++;
if (progress >= pages)
goto breakouterloop;
}
+#endif /* CONFIG_FINEGRAINED_THP */
}
breakouterloop:
mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
* khugepaged runs here, khugepaged_exit will find
* mm_slot not pointing to the exiting mm.
*/
+#ifdef CONFIG_FINEGRAINED_THP
+ if (!list_empty(&khugepaged_scan.hint_list)) {
+ unsigned long jiffies_diff;
+ long mem_diff;
+ struct thp_scan_hint *hint;
+ struct mm_slot *next_slot;
+
+get_next_hint2:
+ hint = get_next_hint();
+
+ if (!hint) {
+ /* no more hint */
+ if (mm_slot->mm_node.next != &khugepaged_scan.mm_head)
+ goto get_next_slot2;
+ else
+ goto end_loop;
+ }
+
+ mem_diff = hint->diff;
+ jiffies_diff = jiffies - hint->jiffies;
+ next_slot = hint->slot;
+ kfree(hint);
+
+ if (next_slot == mm_slot)
+ goto get_next_hint2;
+
+ if (!khugepaged_test_exit(next_slot->mm)) {
+ list_move(&next_slot->mm_node, &mm_slot->mm_node);
+ clear_hint_list(next_slot);
+ } else
+ goto get_next_hint2;
+
+ khugepaged_scan.mm_slot = next_slot;
+ khugepaged_scan.address = 0;
+ } else if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+get_next_slot2:
+ khugepaged_scan.mm_slot = list_entry(
+ mm_slot->mm_node.next,
+ struct mm_slot, mm_node);
+ clear_hint_list(khugepaged_scan.mm_slot);
+ khugepaged_scan.address = 0;
+ } else {
+end_loop:
+ khugepaged_scan.mm_slot = NULL;
+ khugepaged_full_scans++;
+ }
+#else /* CONFIG_FINEGRAINED_THP */
if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
khugepaged_scan.mm_slot = list_entry(
mm_slot->mm_node.next,
khugepaged_scan.mm_slot = NULL;
khugepaged_full_scans++;
}
-
+#endif /* CONFIG_FINEGRAINED_THP */
collect_mm_slot(mm_slot);
}
wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
+#include <linux/delay.h>
+bool eager_allocation = false;
+
static int khugepaged(void *none)
{
struct mm_slot *mm_slot;
int start_stop_khugepaged(void)
{
- static struct task_struct *khugepaged_thread __read_mostly;
- static DEFINE_MUTEX(khugepaged_mutex);
int err = 0;
mutex_lock(&khugepaged_mutex);
mutex_unlock(&khugepaged_mutex);
return err;
}
+
+void khugepaged_min_free_kbytes_update(void)
+{
+ mutex_lock(&khugepaged_mutex);
+ if (khugepaged_enabled() && khugepaged_thread)
+ set_recommended_min_free_kbytes();
+ mutex_unlock(&khugepaged_mutex);
+}