--- /dev/null
+#ifndef __ASM_FINEGRAINED_THP_H
+#define __ASM_FINEGRAINED_THP_H
+#ifdef CONFIG_FINEGRAINED_THP
+extern void khugepaged_mem_hook(struct mm_struct *mm,
+ unsigned long addr, long diff, const char *debug);
+#else /* CONFIG_FINEGRAINED_THP */
+static inline void khugepaged_mem_hook(struct mm_struct *mm,
+ unsigned long addr, long diff, const char *debug)
+{}
+#endif /* CONFIG_FINEGRAINED_THP */
+#endif /* __ASM_FINEGRAINED_THP_H */
--- /dev/null
+#ifndef __ASM_HUGE_MM_H
+#define __ASM_HUGE_MM_H
+
+#ifdef CONFIG_FINEGRAINED_THP
+#include <linux/mm.h> /* for compound_order/compound_nr */
+#include <asm/pgtable.h>
+
+#define HPAGE_CONT_PTE_MASK CONT_PTE_MASK
+#define HPAGE_CONT_PTE_SIZE CONT_PTE_SIZE
+#define HPAGE_CONT_PTE_ORDER (CONT_PTE_SHIFT-PAGE_SHIFT)
+#define HPAGE_CONT_PTE_NR (1 << HPAGE_CONT_PTE_ORDER)
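+
+/*
+ * With 4KB base pages (the configuration this feature targets), a
+ * contiguous-PTE hugepage spans 16 PTEs, i.e. 64KB.  The macros above are
+ * the CONT_PTE analogues of HPAGE_PMD_{MASK,SIZE,ORDER,NR}.
+ */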
+
+extern int copy_huge_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pte_t *dst_pte, pte_t *src_pte, unsigned long addr,
+ struct vm_area_struct *vma, int *rss);
+
+extern vm_fault_t arm64_do_huge_pte_anonymous_page(struct vm_fault *vmf);
+
+static inline vm_fault_t arch_do_huge_pte_anonymous_page(
+ struct vm_fault *vmf)
+{
+ return arm64_do_huge_pte_anonymous_page(vmf);
+}
+
+extern void huge_cont_pte_set_accessed(struct vm_fault *vmf, pte_t orig_pte);
+extern int change_huge_pte(struct vm_area_struct *vma, pte_t *pte,
+ unsigned long addr, pgprot_t newprot,
+ unsigned long cp_flags);
+
+extern pte_t ptep_huge_clear_flush(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
+
+/*
+ * The code below should eventually move into arm64-specific source files.
+ * Most of it is borrowed from arch/arm64/mm/hugetlbpage.c.
+ */
+
+#define HPAGE_CONT_PTE_CACHE_INDEX_MASK (HPAGE_CONT_PTE_NR - 1)
+
+static inline bool transhuge_adv_vma_suitable(struct vm_area_struct *vma,
+ unsigned long haddr)
+{
+ /* Don't have to check pgoff for anonymous vma */
+ if (!vma_is_anonymous(vma)) {
+ if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CONT_PTE_CACHE_INDEX_MASK)
+ != (vma->vm_pgoff & HPAGE_CONT_PTE_CACHE_INDEX_MASK))
+ return false;
+ }
+
+ if (haddr < vma->vm_start || haddr + HPAGE_CONT_PTE_SIZE >= vma->vm_end)
+ return false;
+ return true;
+}
+
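+/*
+ * Extract only the protection bits of a PTE by XORing it with a PTE built
+ * from the same pfn and empty permissions.
+ */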
+static inline pgprot_t thp_pte_pgprot(pte_t pte)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
+}
+
+static inline pte_t arm64_make_huge_pte(struct page *hpage,
+ struct vm_area_struct *vma)
+{
+ return pte_mkcont(pte_mkhuge(mk_pte(hpage, vma->vm_page_prot)));
+}
+
+static inline pte_t arch_make_huge_pte(struct page *hpage,
+ struct vm_area_struct *vma)
+{
+ return arm64_make_huge_pte(hpage, vma);
+}
+
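+/*
+ * Clear 'ncontig' PTEs starting at 'ptep' and flush the TLB for the whole
+ * range; the stack-local dummy VMA only carries the mm for flush_tlb_range(),
+ * as in arch/arm64/mm/hugetlbpage.c.
+ */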
+static inline void arm64_clear_and_flush(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep,
+ unsigned long pgsize,
+ unsigned long ncontig)
+{
+ int i;
+ struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+ unsigned long saddr = addr;
+
+ for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
+ pte_clear(mm, addr, ptep);
+
+ flush_tlb_range(&vma, saddr, addr);
+}
+
+extern int memcmp_pages(struct page *page1, struct page *page2);
+
+static inline void arm64_set_huge_pte_at(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, pte_t pte, unsigned long headoff)
+{
+ int i;
+ unsigned long pfn;
+ pgprot_t hugeprot;
+
+ pfn = pte_pfn(pte);
+ hugeprot = thp_pte_pgprot(pte);
+
+ arm64_clear_and_flush(mm, addr, ptep, PAGE_SIZE, HPAGE_CONT_PTE_NR);
+
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++, ptep++, addr += PAGE_SIZE, pfn += 1)
+ set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
+}
+
+static inline void arch_set_huge_pte_at(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, pte_t pte, unsigned long headoff)
+{
+ arm64_set_huge_pte_at(mm, addr, ptep, pte, headoff);
+}
+
+static inline void arch_clear_huge_pte_range(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ arm64_clear_and_flush(mm, addr, ptep, PAGE_SIZE, HPAGE_CONT_PTE_NR);
+}
+
+extern vm_fault_t arm64_do_set_huge_pte(struct vm_fault *vmf, struct page *page);
+
+static inline vm_fault_t arch_do_set_huge_pte(struct vm_fault *vmf,
+ struct page *page)
+{
+ return arm64_do_set_huge_pte(vmf, page);
+}
+
+extern vm_fault_t arm64_wp_huge_pte(struct vm_fault *vmf, pte_t orig_pte);
+
+static inline int arch_do_wp_page(struct vm_fault *vmf, pte_t entry)
+{
+ int ret = VM_FAULT_FALLBACK;
+
+ if (pte_cont(entry))
+ ret = arm64_wp_huge_pte(vmf, entry);
+ return ret;
+}
+
+extern void huge_cont_pte_set_accessed(struct vm_fault *vmf, pte_t orig_pte);
+
+static inline bool arch_huge_pte_set_accessed(struct vm_fault *vmf, pte_t entry)
+{
+ if (pte_cont(entry)) {
+ huge_cont_pte_set_accessed(vmf, entry);
+ return true;
+ }
+ return false;
+}
+
+static inline pte_t arch_pte_clearhuge(pte_t pte)
+{
+ if (pte_cont(pte))
+ return pte_clearhuge(pte);
+ return pte;
+}
+
+extern int arm64_remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot);
+
+static inline int arch_remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ return arm64_remap_pte_range(mm, pmd, addr, end, pfn, prot);
+}
+
+void __split_huge_pte(struct vm_area_struct *vma, pmd_t *pmd,
+ pte_t *pte, unsigned long address,
+ bool freeze, struct page *page);
+
+#define split_huge_pte(__vma, __pmd, __pte, __address) \
+ do { \
+ pte_t *____pte = (__pte); \
+ if (is_swap_pte(*____pte) || pte_cont(*____pte) \
+ || pte_devmap(*____pte)) \
+ __split_huge_pte(__vma, __pmd, __pte, __address, \
+ false, NULL); \
+ } while (0)
+
+void split_huge_pte_address(struct vm_area_struct *vma, unsigned long address,
+ bool freeze, struct page *page);
+extern bool arm64_hugepage_vma_shmem_check(struct vm_area_struct *vma,
+ unsigned long vm_flags, int nr_pages);
+extern bool arm64_hugepage_vma_file_check(struct vm_area_struct *vma,
+ unsigned long vm_flags, int nr_pages);
+
+static inline bool arch_hugepage_vma_shmem_check(
+ struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ return arm64_hugepage_vma_shmem_check(vma, vm_flags,
+ HPAGE_CONT_PTE_NR);
+}
+
+static inline bool arch_hugepage_vma_file_check(
+ struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ return arm64_hugepage_vma_file_check(vma, vm_flags,
+ HPAGE_CONT_PTE_NR);
+}
+
+#else /* CONFIG_FINEGRAINED_THP */
+
+static inline int arch_do_wp_page(struct vm_fault *vmf, pte_t entry)
+{
+ return VM_FAULT_FALLBACK;
+}
+
+static inline bool arch_huge_pte_set_accessed(struct vm_fault *vmf,
+ pte_t entry)
+{
+ return false;
+}
+
+static inline pte_t arch_pte_clearhuge(pte_t pte)
+{
+ return pte;
+}
+
+static inline pte_t arch_make_huge_pte(struct page *hpage,
+ struct vm_area_struct *vma)
+{
+ return mk_pte(hpage, vma->vm_page_prot);
+}
+
+static inline vm_fault_t arch_do_huge_pte_anonymous_page(struct vm_fault *vmf)
+{
+ return VM_FAULT_FALLBACK;
+}
+
+static inline vm_fault_t arch_do_set_huge_pte(struct vm_fault *vmf,
+ struct page *page)
+{
+ return VM_FAULT_FALLBACK;
+}
+
+static inline void arch_set_huge_pte_at(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, pte_t pte, unsigned long headoff)
+{}
+
+static inline void arch_clear_huge_pte_range(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{}
+
+static inline bool arch_hugepage_vma_shmem_check(
+ struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ return false;
+}
+
+static inline bool arch_hugepage_vma_file_check(
+ struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ return false;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+#endif /* __ASM_HUGE_MM_H */
return pte;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static inline pte_t pte_clearhuge(pte_t pte)
+{
+ pte = clear_pte_bit(pte, __pgprot(PTE_CONT));
+ return pte;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot)
{
pmd_val(pmd) &= ~pgprot_val(prot);
*/
#define pte_mkhuge(pte) (__pte(pte_val(pte) & ~PTE_TABLE_BIT))
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_FINEGRAINED_THP
+/* 64KB hugepage definition for THP */
+#define pte_trans_huge(pte) (pte_val(pte) && !(pte_val(pte) & PTE_TABLE_BIT))
+#endif /* CONFIG_FINEGRAINED_THP */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+
/*
* Hugetlb definitions.
*/
obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o
obj-$(CONFIG_ARM64_MTE) += mteswap.o
+obj-$(CONFIG_FINEGRAINED_THP) += huge_memory.o
+obj-$(CONFIG_FINEGRAINED_THP) += finegrained_thp.o
KASAN_SANITIZE_physaddr.o += n
obj-$(CONFIG_KASAN) += kasan_init.o
--- /dev/null
+#include <linux/shmem_fs.h>
+#include <asm/huge_mm.h>
+
+bool arm64_hugepage_vma_shmem_check(struct vm_area_struct *vma,
+ unsigned long vm_flags, int nr_pages)
+{
+ /* Enabled via shmem mount options or sysfs settings. */
+ if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
+ return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+ nr_pages);
+ }
+ return false;
+}
+
+bool arm64_hugepage_vma_file_check(struct vm_area_struct *vma,
+ unsigned long vm_flags, int nr_pages)
+{
+ /* Read-only file mappings need to be aligned for THP to work. */
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+ (vm_flags & VM_DENYWRITE)) {
+ return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+ nr_pages);
+ }
+ return false;
+}
+
--- /dev/null
+/*
+ * Hugepage support for arm64 architecture
+ *
+ * 21.08.07.
+ *
+ */
+
+#include <linux/huge_mm.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/khugepaged.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/oom.h>
+
+#include <asm/huge_mm.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+#ifdef CONFIG_FINEGRAINED_THP
+pte_t ptep_huge_clear_flush(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ pte_t pte;
+ int i;
+
+ VM_BUG_ON(address & ~HPAGE_CONT_PTE_MASK);
+ VM_BUG_ON(!pte_cont(*ptep));
+ pte = ptep_get_and_clear(vma->vm_mm, address, ptep);
+
+ for (i = 1; i < HPAGE_CONT_PTE_NR; i++)
+ ptep_get_and_clear(vma->vm_mm, address + PAGE_SIZE * i, ptep + i);
+
+ flush_tlb_range(vma, address, address + HPAGE_CONT_PTE_SIZE);
+ return pte;
+}
+
+#define USE_THP_PRINT_CONT_TABLE
+#ifdef USE_THP_PRINT_CONT_TABLE
+void thp_print_cont_pte_table(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, unsigned long line)
+{
+ int i, pid = 0;
+
+ if (mm->owner) {
+ pr_info("THP: %s from %lu proc-%d(%s)\n", __func__, line,
+ task_pid_nr(mm->owner), mm->owner->comm);
+ pid = task_pid_nr(mm->owner);
+ } else
+ pr_info("THP: %s from %lu\n", __func__, line);
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++, ptep++, addr += PAGE_SIZE) {
+ pr_info("%lx: %llx pid(%d)\n", addr, pte_val(*ptep), pid);
+ }
+}
+#else
+void thp_print_cont_pte_table(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, unsigned long line)
+{}
+#endif /* USE_THP_PRINT_CONT_TABLE */
+
+/*
+ * always: directly stall for all thp allocations
+ * defer: wake kswapd and fail if not immediately available
+ * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
+ * fail if not immediately available
+ * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
+ * available
+ * never: never stall for any thp allocation
+ */
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+{
+ const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+
+ /* Always do synchronous compaction */
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+ /* Kick kcompactd and fail quickly */
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+ /* Synchronous compaction if madvised, otherwise kick kcompactd */
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT |
+ (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ __GFP_KSWAPD_RECLAIM);
+
+ /* Only do synchronous compaction if madvised */
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT |
+ (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
+ return GFP_TRANSHUGE_LIGHT;
+}
+
+/*
+ * The caller must hold the page table locks of both dst and src.
+ */
+int copy_huge_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pte_t *dst_pte, pte_t *src_pte, unsigned long haddr,
+ struct vm_area_struct *vma, int *rss)
+{
+ struct page *src_page;
+ unsigned long addr = haddr;
+ pte_t pte, *_pte;
+
+ pte = *src_pte;
+
+ src_page = vm_normal_page(vma, addr, pte);
+ if (!src_page)
+ return -EAGAIN;
+
+ VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+ get_page(src_page);
+ page_dup_rmap(src_page, true);
+ if (rss)
+ rss[MM_ANONPAGES] += HPAGE_CONT_PTE_NR;
+ else
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR);
+
+ _pte = src_pte;
+ while (addr < haddr + HPAGE_CONT_PTE_SIZE) {
+ ptep_set_wrprotect(src_mm, addr, _pte);
+ addr += PAGE_SIZE;
+ }
+ pte = pte_mkold(pte_wrprotect(pte));
+ arm64_set_huge_pte_at(dst_mm, haddr, dst_pte, pte, 0);
+
+ return 0;
+}
+
+vm_fault_t arm64_do_set_huge_pte(struct vm_fault *vmf, struct page *page)
+{
+ int i;
+ pte_t entry;
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+ pgoff_t index, pgoff, addroff, headoff;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
+
+ if (!transhuge_adv_vma_suitable(vma, haddr))
+ return VM_FAULT_FALLBACK;
+
+ page = compound_head(page);
+ index = page->index;
+ pgoff = vmf->pgoff;
+ addroff = (vmf->address - haddr) >> PAGE_SHIFT;
+
+ if (pgoff - index != addroff)
+ return VM_FAULT_FALLBACK;
+
+ /*
+ * Archs like ppc64 need additional space to store information
+ * related to pte entry. Use the preallocated table for that.
+ */
+ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
+ if (!vmf->prealloc_pte)
+ return VM_FAULT_OOM;
+ smp_wmb(); /* See comment in __pte_alloc() */
+ }
+
+ if (unlikely(pmd_none(*vmf->pmd))) {
+ if (pte_alloc(vma->vm_mm, vmf->pmd))
+ return VM_FAULT_OOM;
+ smp_wmb();
+ }
+
+ /* The head offset indicates the position of the first page in the hugepage */
+ headoff = (addroff + (HPAGE_CONT_PTE_NR - pgoff)) % HPAGE_CONT_PTE_NR;
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, haddr, &vmf->ptl);
+ if (!vmf->pte || unlikely(!pte_none(*vmf->pte))) {
+ spin_unlock(vmf->ptl);
+ vmf->pte = NULL;
+ return ret;
+ }
+
+ entry = arm64_make_huge_pte(compound_head(page), vma);
+ if (write)
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
+ flush_icache_page(vma, page + i);
+ if (write && !(vma->vm_flags & VM_SHARED)) {
+ add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_CONT_PTE_NR);
+ if (PageAnon(page))
+ page_add_new_anon_rmap(page, vma, haddr, true);
+ } else {
+ add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_CONT_PTE_NR);
+ page_add_file_rmap(page, true);
+ }
+
+ arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte, entry, headoff);
+ update_mmu_cache_pmd(vma, haddr, vmf->pmd);
+ count_vm_event(THP_FILE_MAPPED);
+ return 0;
+}
+
+static vm_fault_t arm64_do_huge_pte_wp_page_fallback(struct vm_fault *vmf,
+ pte_t orig_pte, struct page *page)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+ int i;
+ vm_fault_t ret = 0;
+ struct page **pages;
+ struct mmu_notifier_range range;
+
+ pages = kmalloc_array(HPAGE_CONT_PTE_NR, sizeof(struct page *),
+ GFP_KERNEL);
+ if (unlikely(!pages)) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
+ pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
+ vmf->address);
+ if (unlikely(!pages[i] ||
+ mem_cgroup_charge(pages[i], vma->vm_mm,
+ GFP_KERNEL))) {
+ if (pages[i])
+ put_page(pages[i]);
+ while (--i >= 0) {
+ put_page(pages[i]);
+ }
+ kfree(pages);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
+ copy_user_highpage(pages[i], page + i,
+ haddr + PAGE_SIZE * i, vma);
+ __SetPageUptodate(pages[i]);
+ cond_resched();
+ }
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ haddr, haddr + HPAGE_CONT_PTE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pte_same(*vmf->pte, orig_pte)))
+ goto out_free_pages;
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+
+ /*
+	 * Leave the PTEs empty until they are refilled below.  Note we must
+	 * notify here, as a concurrent CPU thread might write to the new
+	 * pages before the call to mmu_notifier_invalidate_range_end()
+	 * happens, which can lead to a device seeing memory writes in a
+	 * different order than the CPU.
+ *
+ * See Documentation/vm/mmu_notifier.rst
+ */
+ vmf->pte = pte_offset_map(vmf->pmd, haddr);
+ ptep_huge_clear_flush_notify(vma, haddr, vmf->pte);
+
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++, haddr += PAGE_SIZE) {
+ pte_t entry;
+ entry = mk_pte(pages[i], vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ set_page_private(pages[i], 0);
+
+ page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
+ lru_cache_add_inactive_or_unevictable(pages[i], vma);
+ vmf->pte = pte_offset_map(vmf->pmd, haddr);
+ VM_BUG_ON(!pte_none(*vmf->pte));
+ set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
+ pte_unmap(vmf->pte);
+ }
+ kfree(pages);
+
+ smp_wmb(); /* make pte visible before pmd */
+ page_remove_rmap(page, true);
+ spin_unlock(vmf->ptl);
+
+ /*
+	 * No need to call the mmu_notifier->invalidate_range() callback again,
+	 * as the ptep_huge_clear_flush_notify() above already did so.
+ */
+ mmu_notifier_invalidate_range_only_end(&range);
+
+ ret |= VM_FAULT_WRITE;
+ put_page(page);
+
+out:
+ return ret;
+
+out_free_pages:
+ spin_unlock(vmf->ptl);
+ mmu_notifier_invalidate_range_end(&range);
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
+ set_page_private(pages[i], 0);
+ put_page(pages[i]);
+ }
+ kfree(pages);
+ goto out;
+}
+
+vm_fault_t arm64_do_huge_pte_wp_page(struct vm_fault *vmf, pte_t orig_pte)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page = NULL, *new_page;
+ unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+ struct mmu_notifier_range range;
+ gfp_t huge_gfp; /* for allocation and charge */
+ vm_fault_t ret = 0;
+
+ vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
+ VM_BUG_ON_VMA(!vma->anon_vma, vma);
+
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, orig_pte))) {
+ spin_unlock(vmf->ptl);
+ return ret;
+ }
+
+ page = pte_page(orig_pte);
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
+ page = compound_head(page);
+ /*
+ * We can only reuse the page if nobody else maps the huge page or it's
+ * part.
+ */
+ if (!trylock_page(page)) {
+ get_page(page);
+ spin_unlock(vmf->ptl);
+ lock_page(page);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, orig_pte))) {
+ spin_unlock(vmf->ptl);
+ unlock_page(page);
+ put_page(page);
+ return 0;
+ }
+ put_page(page);
+ }
+
+ if (reuse_swap_page(page, NULL)) {
+ huge_cont_pte_set_accessed(vmf, orig_pte);
+ unlock_page(page);
+ spin_unlock(vmf->ptl);
+ return VM_FAULT_WRITE;
+ }
+ unlock_page(page);
+ get_page(page);
+ spin_unlock(vmf->ptl);
+
+ /*
+	 * For a 2MB hugepage, the kernel just splits it into standard-sized
+	 * pages and falls back to the normal page fault handling path.
+	 *
+	 * For a 64KB hugepage, allocate-on-COW is expected to pay off:
+	 * copying the contents of a 2MB page takes significant time, but a
+	 * 64KB page is much smaller, so the copying overhead should be
+	 * negligible.
+	 *
+	 * TODO: account the time overhead of the procedure below.
+ */
+#ifdef CONFIG_THP_CONSERVATIVE
+ goto fallback;
+#endif
+ if (__transparent_hugepage_enabled(vma)) {
+ huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+ new_page = alloc_hugepage_vma(huge_gfp, vma, haddr,
+ HPAGE_CONT_PTE_ORDER);
+ } else
+ new_page = NULL;
+
+ if (likely(new_page)) {
+ prep_transhuge_page(new_page);
+ } else {
+ if (!page) {
+ split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address);
+ ret |= VM_FAULT_FALLBACK;
+ } else {
+ ret = arm64_do_huge_pte_wp_page_fallback(vmf, orig_pte, page);
+ if (ret & VM_FAULT_OOM) {
+ split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address);
+ ret |= VM_FAULT_FALLBACK;
+ }
+ put_page(page);
+ }
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
+
+ if (unlikely(mem_cgroup_charge(new_page, vma->vm_mm,
+ huge_gfp))) {
+ put_page(new_page);
+ split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address);
+ if (page)
+ put_page(page);
+ ret |= VM_FAULT_FALLBACK;
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
+
+ count_vm_event(THP_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+
+ if (!page)
+ clear_huge_page(new_page, vmf->address, HPAGE_CONT_PTE_NR);
+ else
+ copy_user_huge_page(new_page, page, vmf->address,
+ vma, HPAGE_CONT_PTE_NR);
+ __SetPageUptodate(new_page);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ haddr, haddr + HPAGE_CONT_PTE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ spin_lock(vmf->ptl);
+ if (page)
+ put_page(page);
+ if (unlikely(!pte_same(*vmf->pte, orig_pte))) {
+ spin_unlock(vmf->ptl);
+ mem_cgroup_uncharge(new_page);
+ put_page(new_page);
+ goto out_mn;
+ } else {
+ pte_t entry;
+
+ entry = arm64_make_huge_pte(new_page, vma);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+
+ vmf->pte = pte_offset_map(vmf->pmd, haddr);
+
+ page_add_new_anon_rmap(new_page, vma, haddr, true);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
+
+ arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte, entry, 0);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+
+ if (!page) {
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR);
+ } else {
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ page_remove_rmap(page, true);
+ put_page(page);
+ }
+ ret |= VM_FAULT_WRITE;
+ }
+ spin_unlock(vmf->ptl);
+out_mn:
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above pmdp_huge_clear_flush_notify() did already call it.
+ */
+ mmu_notifier_invalidate_range_only_end(&range);
+out:
+ return ret;
+#ifdef CONFIG_THP_CONSERVATIVE
+fallback:
+ __split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address, false, NULL);
+ return VM_FAULT_FALLBACK;
+#endif /* CONFIG_THP_CONSERVATIVE */
+}
+
+/* the caller must hold the page table lock */
+vm_fault_t arm64_wp_huge_pte(struct vm_fault *vmf, pte_t orig_pte)
+{
+ unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+ pte_t *hpte_p;
+
+ if (vma_is_anonymous(vmf->vma)) {
+ spin_unlock(vmf->ptl);
+ return arm64_do_huge_pte_wp_page(vmf, orig_pte);
+ }
+
+ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
+
+ hpte_p = pte_offset_map(vmf->pmd, haddr);
+ spin_unlock(vmf->ptl);
+ __split_huge_pte(vmf->vma, vmf->pmd, hpte_p, haddr, false, NULL);
+ spin_lock(vmf->ptl);
+
+ return VM_FAULT_FALLBACK;
+}
+
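+/*
+ * Return nonzero if any of the HPAGE_CONT_PTE_NR entries starting at 'head'
+ * is already populated, i.e. the range cannot hold a 64KB mapping.
+ */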
+static inline int check_huge_pte_range(pte_t *head)
+{
+ int i;
+
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++, head++) {
+ if (!pte_none(*head))
+ return 1;
+ }
+ return 0;
+}
+
+void thp_print_cont_pte_table(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, unsigned long line);
+
+static vm_fault_t __do_huge_pte_anonymous_page(struct vm_fault *vmf,
+ struct page *page, gfp_t gfp)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long offset, haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+ pte_t entry;
+ vm_fault_t ret = 0;
+
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+ if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_vm_event(THP_FAULT_FALLBACK_CHARGE);
+ return VM_FAULT_FALLBACK;
+ }
+ cgroup_throttle_swaprate(page, gfp);
+
+ clear_huge_page(compound_head(page), haddr, HPAGE_CONT_PTE_NR);
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * clear_huge_page writes become visible before the set_pmd_at()
+ * write.
+ */
+ __SetPageUptodate(page);
+
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto unlock_release;
+
+ if (userfaultfd_missing(vma)) {
+ spin_unlock(vmf->ptl);
+ put_page(page);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
+ VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+ return ret;
+ }
+
+ entry = arm64_make_huge_pte(page, vma);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ offset = (vmf->address - haddr) >> PAGE_SHIFT;
+ vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ if (!pte_none(*vmf->pte)) {
+ ret = VM_FAULT_FALLBACK;
+ goto unlock_release;
+ }
+ if (check_huge_pte_range(vmf->pte - offset)) {
+ /* recheck */
+ /* TODO: COPY? */
+ ret = VM_FAULT_FALLBACK;
+ goto unlock_release;
+ }
+
+ page_add_new_anon_rmap(page, vma, haddr, true);
+ lru_cache_add_inactive_or_unevictable(page, vma);
+ arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte - offset, entry, 0);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR);
+
+ spin_unlock(vmf->ptl);
+
+ count_vm_event(THP_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+
+ return 0;
+
+unlock_release:
+ spin_unlock(vmf->ptl);
+ put_page(page);
+
+ return ret;
+}
+
+vm_fault_t arm64_do_huge_pte_anonymous_page(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page;
+ unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+ spinlock_t *ptl;
+ gfp_t gfp;
+
+ if (!transhuge_adv_vma_suitable(vma, haddr))
+ return VM_FAULT_FALLBACK;
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
+ return VM_FAULT_OOM;
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
+ !mm_forbids_zeropage(vma->vm_mm) &&
+ transparent_hugepage_use_zero_page()) {
+ return VM_FAULT_FALLBACK;
+ }
+ ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ vmf->pte = pte_offset_map(vmf->pmd, haddr);
+ if (check_huge_pte_range(vmf->pte)) {
+ pte_unmap(vmf->pte);
+ spin_unlock(ptl);
+ return VM_FAULT_FALLBACK;
+ }
+ pte_unmap(vmf->pte);
+ spin_unlock(ptl);
+
+ gfp = alloc_hugepage_direct_gfpmask(vma);
+ page = alloc_hugepage_vma(gfp, vma,
+ haddr,
+ HPAGE_CONT_PTE_ORDER);
+ if (unlikely(!page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+ prep_transhuge_page(page);
+ return __do_huge_pte_anonymous_page(vmf, page, gfp);
+}
+
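+/*
+ * Tear down the contiguous-PTE mapping covering *addr.  When the zap starts
+ * at the 64KB boundary and covers the whole block, the PTEs are cleared and
+ * flushed as a unit and the caller's addr/pte cursors are advanced; otherwise
+ * the huge mapping is split so the caller can zap ordinary PTEs.  The return
+ * value tells whether the block lies entirely inside the zap range.
+ */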
+bool zap_cont_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, pte_t **ptep, unsigned long *addr,
+ unsigned long end, struct page *page,
+ int *rss, spinlock_t *ptl)
+{
+ struct mm_struct *mm = tlb->mm;
+ unsigned long haddr = (*addr) & HPAGE_CONT_PTE_MASK;
+ unsigned long range_end =
+ ((haddr + HPAGE_CONT_PTE_SIZE) > end) ? end :
+ haddr + HPAGE_CONT_PTE_SIZE;
+ size_t size = range_end - haddr;
+ unsigned long map_count = size >> PAGE_SHIFT;
+ pte_t *pte;
+
+ pte = pte_offset_map(pmd, haddr);
+
+ if ((*addr) == haddr && haddr + HPAGE_CONT_PTE_SIZE <= range_end) {
+ arm64_clear_and_flush(mm, *addr, pte, PAGE_SIZE, map_count);
+ page_remove_rmap(compound_head(page), true);
+ rss[mm_counter(page)] -= map_count;
+ __tlb_adjust_range(tlb, *addr, size);
+ __tlb_remove_tlb_entry(tlb, pte, *addr);
+ tlb_remove_page_size(tlb, page, size);
+
+ *addr += size;
+ pte += map_count;
+
+ if (*addr >= end)
+ *addr = end - PAGE_SIZE;
+
+ *ptep = pte;
+ } else {
+ if (haddr < vma->vm_start) {
+ pr_err("haddr(%lx) is less than vm start(%lx)\n",
+ haddr, vma->vm_start);
+ thp_print_cont_pte_table(mm, haddr, pte, __LINE__);
+ }
+
+ spin_unlock(ptl);
+ __split_huge_pte(vma, pmd, pte, haddr, false, NULL);
+ spin_lock(ptl);
+ }
+
+ pte_unmap(pte);
+
+ return map_count == HPAGE_CONT_PTE_NR;
+}
+
+/* the caller must hold the page table lock */
+void huge_cont_pte_set_accessed(struct vm_fault *vmf, pte_t orig_pte)
+{
+ int i;
+ pte_t entry, *pte;
+ unsigned long haddr;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ haddr = vmf->address & HPAGE_CONT_PTE_MASK;
+ pte = pte_offset_map(vmf->pmd, haddr);
+
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++, pte++, haddr += PAGE_SIZE) {
+ entry = pte_mkyoung(*pte);
+ if (write)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ ptep_set_access_flags(vmf->vma, haddr, pte, entry, write);
+ }
+ update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
+}
+
+/*
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
+ */
+static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+{
+ return pmd_write(pmd) ||
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+}
+
+extern void mlock_vma_page(struct page *page);
+extern void clear_page_mlock(struct page *page);
+
+struct page *follow_trans_huge_pte(struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page = NULL;
+ pte_t *pte;
+
+ assert_spin_locked(pmd_lockptr(mm, pmd));
+
+ if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
+ goto out;
+
+ /* Avoid dumping huge zero page */
+ if ((flags & FOLL_DUMP))
+ return ERR_PTR(-EFAULT);
+
+ /* Full NUMA hinting faults to serialise migration in fault paths */
+ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
+ goto out;
+
+ pte = pte_offset_map(pmd, addr);
+ page = pte_page(*pte);
+ VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+
+ if (!try_grab_page(page, flags))
+ return ERR_PTR(-ENOMEM);
+
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /*
+ * We don't mlock() pte-mapped THPs. This way we can avoid
+ * leaking mlocked pages into non-VM_LOCKED VMAs.
+ *
+ * For anon THP:
+ *
+ * In most cases the pmd is the only mapping of the page as we
+ * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+ * writable private mappings in populate_vma_page_range().
+ *
+ * The only scenario when we have the page shared here is if we
+ * mlocking read-only mapping shared over fork(). We skip
+ * mlocking such pages.
+ *
+ * For file THP:
+ *
+ * We can expect PageDoubleMap() to be stable under page lock:
+ * for file pages we set it in page_add_file_rmap(), which
+ * requires page to be locked.
+ */
+
+ if (PageAnon(page) && compound_mapcount(page) != 1)
+ goto skip_mlock;
+ if (PageDoubleMap(page) || !page->mapping)
+ goto skip_mlock;
+ if (!trylock_page(page))
+ goto skip_mlock;
+ if (page->mapping && !PageDoubleMap(page))
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+skip_mlock:
+ page += (addr & ~HPAGE_CONT_PTE_MASK) >> PAGE_SHIFT;
+ VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
+
+out:
+ return page;
+}
+
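+/*
+ * Clear PTE_VALID in *ptep and return the previous entry; the PTE-level
+ * analogue of pmdp_invalidate(), used by change_huge_pte() below.
+ */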
+static inline pte_t ptep_invalidate(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ return __pte(xchg_relaxed(&pte_val(*ptep), (pte_val(*ptep) & ~PTE_VALID)));
+}
+
+extern atomic_long_t nr_phys_cont_pte_pages;
+
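+/*
+ * Try to map one CONT_PTE_SIZE chunk of a pfn remapping with a single
+ * contiguous-PTE block.  The virtual range and the physical address must
+ * both be 64KB sized and aligned; returns 1 on success so the caller can
+ * skip the individual PTEs.
+ */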
+static int remap_try_huge_pte(struct mm_struct *mm, pte_t *pte, unsigned long addr,
+ unsigned long end, unsigned long pfn,
+ pgprot_t prot)
+{
+ phys_addr_t phys_addr = __pfn_to_phys(pfn);
+ pte_t entry;
+
+ if ((end - addr) != CONT_PTE_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, CONT_PTE_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, CONT_PTE_SIZE))
+ return 0;
+
+ entry = pte_mkspecial(pte_mkcont(pte_mkhuge(pfn_pte(pfn, prot))));
+ arch_set_huge_pte_at(mm, addr, pte, entry, 0);
+
+ atomic_long_add(HPAGE_CONT_PTE_NR, &nr_phys_cont_pte_pages);
+
+ return 1;
+}
+
+int arm64_remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ pte_t *pte, *mapped_pte;
+ unsigned long next;
+ spinlock_t *ptl;
+ int err = 0;
+
+ mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return -ENOMEM;
+ arch_enter_lazy_mmu_mode();
+ do {
+ BUG_ON(!pte_none(*pte));
+ if (!pfn_modify_allowed(pfn, prot)) {
+ err = -EACCES;
+ break;
+ }
+
+ next = pte_cont_addr_end(addr, end);
+ if (remap_try_huge_pte(mm, pte, addr, next, pfn, prot)) {
+ pte += HPAGE_CONT_PTE_NR;
+ pfn += HPAGE_CONT_PTE_NR;
+ addr += HPAGE_CONT_PTE_SIZE;
+ } else {
+ set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
+ pfn++;
+ pte++;
+ addr += PAGE_SIZE;
+ }
+ } while (addr != end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(mapped_pte, ptl);
+ return err;
+}
+
+/* caller must hold appropriate lock (pmd lock) */
+int change_huge_pte(struct vm_area_struct *vma, pte_t *pte,
+ unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t entry;
+ bool preserve_write;
+ bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ int i, ret;
+
+ preserve_write = prot_numa && pte_write(*pte);
+ ret = 1;
+
+	/*
+	 * Currently we don't handle the NUMA cases; they are kept here for
+	 * future work.
+	 */
+ if (prot_numa && is_huge_zero_page(pte_page(*pte)))
+ goto out;
+
+ if (prot_numa && pte_protnone(*pte))
+ goto out;
+
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
+ entry = ptep_invalidate(vma, addr, pte);
+ entry = pte_modify(entry, newprot);
+ if (preserve_write)
+ entry = pte_mk_savedwrite(entry);
+ entry = pte_mkcont(entry);
+
+ set_pte_at(mm, addr, pte, entry);
+ pte++;
+ addr += PAGE_SIZE;
+ }
+
+ flush_tlb_range(vma, addr, addr + HPAGE_CONT_PTE_SIZE);
+ ret = HPAGE_CONT_PTE_NR;
+out:
+ return ret;
+}
+
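+/*
+ * Split a contiguous-PTE (64KB) mapping into individual PTEs with the page
+ * table lock held, modelled on __split_huge_pmd_locked().  File mappings are
+ * simply unmapped; anonymous mappings are rewritten as per-page (or
+ * migration) entries.
+ */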
+static void __split_huge_pte_locked(struct vm_area_struct *vma, pte_t *pte,
+ unsigned long haddr, bool freeze)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page;
+ pte_t old_pte, _pte;
+ bool young, write, soft_dirty, pte_migration = false, uffd_wp = false;
+ unsigned long addr;
+ int i;
+
+ VM_BUG_ON(haddr & ~HPAGE_CONT_PTE_MASK);
+ VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+ VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_CONT_PTE_SIZE, vma);
+
+ count_vm_event(THP_SPLIT_CONT_PTE);
+
+ if (!vma_is_anonymous(vma)) {
+ _pte = ptep_huge_clear_flush_notify(vma, haddr, pte);
+ if (vma_is_dax(vma))
+ return;
+ page = pte_page(_pte);
+ if (!PageDirty(page) && pte_dirty(_pte))
+ set_page_dirty(page);
+ if (!PageReferenced(page) && pte_young(_pte))
+ SetPageReferenced(page);
+ page_remove_rmap(page, true);
+ put_page(page);
+ add_mm_counter(mm, mm_counter_file(page), -HPAGE_CONT_PTE_NR);
+ return;
+ } else if (is_huge_zero_page(pte_page(*pte))) {
+ pr_err("contiguous pte mapping for zero anon pages are not supported yet");
+ BUG();
+ }
+
+ old_pte = ptep_huge_clear_flush_notify(vma, haddr, pte);
+
+ pte_migration = is_pte_migration_entry(old_pte);
+ if (unlikely(pte_migration)) {
+ swp_entry_t entry;
+
+ entry = pte_to_swp_entry(old_pte);
+ page = pfn_to_page(swp_offset(entry));
+ write = is_write_migration_entry(entry);
+ young = false;
+ soft_dirty = pte_swp_soft_dirty(old_pte);
+ uffd_wp = pte_swp_uffd_wp(old_pte);
+ } else {
+ page = pte_page(old_pte);
+ if (pte_dirty(old_pte))
+ SetPageDirty(page);
+ write = pte_write(old_pte);
+ young = pte_young(old_pte);
+ soft_dirty = pte_soft_dirty(old_pte);
+ uffd_wp = pte_uffd_wp(old_pte);
+ }
+
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ page_ref_add(page, HPAGE_CONT_PTE_NR - 1);
+
+ for (i = 0, addr = haddr; i < HPAGE_CONT_PTE_NR;
+ i++, addr += PAGE_SIZE, pte++) {
+ pte_t entry;
+
+ if (freeze || pte_migration) {
+ swp_entry_t swp_entry;
+ swp_entry = make_migration_entry(page + i, write);
+ entry = swp_entry_to_pte(swp_entry);
+ if (soft_dirty)
+ entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
+ } else {
+ entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
+ entry = maybe_mkwrite(entry, vma);
+ if (!write)
+ entry = pte_wrprotect(entry);
+ if (!young)
+ entry = pte_mkold(entry);
+ if (soft_dirty)
+ entry = pte_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_mkuffd_wp(entry);
+ }
+ //BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, addr, pte, entry);
+ if (!pte_migration)
+ atomic_inc(&page[i]._mapcount);
+ pte_unmap(pte);
+ }
+
+ if (!pte_migration) {
+ /*
+ * Set PG_double_map before dropping compound_mapcount to avoid
+ * false-negative page_mapped().
+ */
+ if (compound_mapcount(page) > 1 &&
+ !TestSetPageDoubleMap(page)) {
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
+ atomic_inc(&page[i]._mapcount);
+ }
+
+ lock_page_memcg(page);
+ if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+ /* Last compound_mapcount is gone. */
+ __dec_lruvec_page_state(page, NR_ANON_64KB_THPS);
+ if (TestClearPageDoubleMap(page)) {
+ /* No need in mapcount reference anymore */
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
+ atomic_dec(&page[i]._mapcount);
+ }
+ }
+ unlock_page_memcg(page);
+ }
+
+ smp_wmb();
+
+ if (freeze) {
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
+ page_remove_rmap(page + i, false);
+ put_page(page + i);
+ }
+ }
+}
+
+void __split_huge_pte(struct vm_area_struct *vma, pmd_t *pmd,
+ pte_t *pte, unsigned long address,
+ bool freeze, struct page *page)
+{
+ spinlock_t *ptl;
+ struct mmu_notifier_range range;
+ pte_t _pte;
+ bool locked = false;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address & HPAGE_CONT_PTE_MASK,
+ (address & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ ptl = pmd_lock(vma->vm_mm, pmd);
+
+ if (page) {
+ VM_WARN_ON_ONCE(!PageLocked(page));
+ if (page != pte_page(*pte))
+ goto out;
+ }
+repeat:
+ if (pte_cont(*pte)) {
+ if (!page) {
+ page = pte_page(*pte);
+ /*
+ * An anonymous page must be locked, to ensure that a
+ * concurrent reuse_swap_page() sees stable mapcount;
+ * but reuse_swap_page() is not used on shmem or file,
+ * and page lock must not be taken when zap_pte_range()
+ * calls __split_huge_pte() while i_mmap_lock is held.
+ */
+ if (PageAnon(page)) {
+ if (unlikely(!trylock_page(page))) {
+ _pte = *pte;
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ spin_lock(ptl);
+ if (unlikely(!pte_same(*pte, _pte))) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ goto repeat;
+ }
+ put_page(page);
+ }
+ locked = true;
+ }
+ }
+ if (PageMlocked(page))
+ clear_page_mlock(page);
+ } else if (!(pte_devmap(*pte) || is_pte_migration_entry(*pte)))
+ goto out;
+ __split_huge_pte_locked(vma, pte, range.start, freeze);
+out:
+ spin_unlock(ptl);
+ if (locked && page)
+ unlock_page(page);
+ mmu_notifier_invalidate_range_only_end(&range);
+}
+
+void split_huge_pte_address(struct vm_area_struct *vma, unsigned long address,
+ bool freeze, struct page *page)
+{
+ unsigned long haddr = address & HPAGE_CONT_PTE_MASK;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(vma->vm_mm, haddr);
+ if (!pgd_present(*pgd))
+ return;
+
+ p4d = p4d_offset(pgd, haddr);
+ if (!p4d_present(*p4d))
+ return;
+
+ pud = pud_offset(p4d, haddr);
+ if (!pud_present(*pud))
+ return;
+
+ pmd = pmd_offset(pud, haddr);
+ if (!pmd_present(*pmd))
+ return;
+
+ pte = pte_offset_map(pmd, haddr);
+ if (!pte_present(*pte))
+ return;
+
+ __split_huge_pte(vma, pmd, pte, haddr, freeze, page);
+}
+#endif /* CONFIG_FINEGRAINED_THP */
return 1;
}
+#ifdef CONFIG_FINEGRAINED_THP
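+/*
+ * Install a 64KB block mapping by writing HPAGE_CONT_PTE_NR PTEs that all
+ * carry the contiguous bit; the CONT_PTE counterpart of pmd_set_huge().
+ */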
+int cont_pte_set_huge(pte_t *ptep, phys_addr_t phys, pgprot_t prot)
+{
+ int i;
+ pte_t new_pte;
+
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++, phys += PAGE_SIZE, ptep++) {
+ new_pte = pfn_pte(__phys_to_pfn(phys), prot);
+ new_pte = pte_mkcont(new_pte);
+ set_pte(ptep, new_pte);
+ }
+
+ return 1;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
int pud_clear_huge(pud_t *pudp)
{
if (!pud_sect(READ_ONCE(*pudp)))
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
show_val_kb(m, "AnonHugePages: ",
global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);
+#ifdef CONFIG_FINEGRAINED_THP
+ show_val_kb(m, "Anon64KBPages: ",
+ global_node_page_state(NR_ANON_64KB_THPS) * HPAGE_CONT_PTE_NR);
+#endif /* CONFIG_FINEGRAINED_THP */
show_val_kb(m, "ShmemHugePages: ",
global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
show_val_kb(m, "ShmemPmdMapped: ",
global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
+#ifdef CONFIG_FINEGRAINED_THP
+ show_val_kb(m, "ShmemPteMapped: ",
+ global_node_page_state(NR_SHMEM_PTEMAPPED) * HPAGE_CONT_PTE_NR);
+ show_val_kb(m, "File64KBPages: ",
+ global_node_page_state(NR_FILE_64KB_THPS) * HPAGE_CONT_PTE_NR);
+#endif /* CONFIG_FINEGRAINED_THP */
show_val_kb(m, "FileHugePages: ",
global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
show_val_kb(m, "FilePmdMapped: ",
global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
+#ifdef CONFIG_FINEGRAINED_THP
+ show_val_kb(m, "FileCPteMapped: ",
+ global_node_page_state(NR_FILE_PTEMAPPED) * HPAGE_CONT_PTE_NR);
+ show_val_kb(m, "PhysCPteMapped: ",
+ phys_cont_pte_pages());
+#endif /* CONFIG_FINEGRAINED_THP */
+ show_val_kb(m, "PhysPmdMapped: ",
+ phys_huge_pmd_pages());
#endif
-
#ifdef CONFIG_CMA
show_val_kb(m, "CmaTotal: ", totalcma_pages);
show_val_kb(m, "CmaFree: ",
--- /dev/null
+/* a generic header for fine-grained thp */
+#ifndef __ASM_FINEGRAINED_THP_H
+#define __ASM_FINEGRAINED_THP_H
+#ifndef CONFIG_FINEGRAINED_THP
+static inline void khugepaged_mem_hook(struct mm_struct *mm,
+ unsigned long addr, long diff, const char *debug)
+{}
+#endif /* CONFIG_FINEGRAINED_THP */
+#endif /* __ASM_FINEGRAINED_THP_H */
--- /dev/null
+/* a generic header for architecture-dependent hugepage */
+#ifndef __ASM_HUGE_MM_H
+#define __ASM_HUGE_MM_H
+#ifndef CONFIG_FINEGRAINED_THP
+static inline int arch_do_wp_page(struct vm_fault *vmf, pte_t entry)
+{
+ return VM_FAULT_FALLBACK;
+}
+
+static inline bool arch_huge_pte_set_accessed(struct vm_fault *vmf,
+ pte_t entry)
+{
+ return false;
+}
+
+static inline pte_t arch_pte_clearhuge(pte_t pte)
+{
+ return pte;
+}
+
+static inline pte_t arch_make_huge_pte(struct page *hpage,
+ struct vm_area_struct *vma)
+{
+ return mk_pte(hpage, vma->vm_page_prot);
+}
+
+static inline vm_fault_t arch_do_set_huge_pte(struct vm_fault *vmf,
+ struct page *page)
+{
+ return VM_FAULT_FALLBACK;
+}
+
+static inline void arch_set_huge_pte_at(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, pte_t pte, unsigned long headoff)
+{}
+
+static inline void arch_clear_huge_pte_range(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{}
+
+static inline bool arch_hugepage_vma_shmem_check(
+ struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ return false;
+}
+
+static inline bool arch_hugepage_vma_file_check(
+ struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ return false;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+#endif /* __ASM_HUGE_MM_H */
#include <linux/mm_types.h>
#include <linux/fs.h> /* only for vma_is_dax() */
+#ifdef CONFIG_FINEGRAINED_THP
+#include <asm/huge_mm.h> /* for compound_order/compound_nr */
+#endif
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
static inline unsigned int thp_order(struct page *page)
{
VM_BUG_ON_PGFLAGS(PageTail(page), page);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (PageHead(page))
+ return page[1].compound_order;
+#else
if (PageHead(page))
return HPAGE_PMD_ORDER;
+#endif
return 0;
}
{
VM_BUG_ON_PGFLAGS(PageTail(page), page);
if (PageHead(page))
+#ifdef CONFIG_FINEGRAINED_THP
+ return page[1].compound_nr;
+#else
return HPAGE_PMD_NR;
+#endif
return 1;
}
#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */
#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */
+#define FOLL_SPLIT_PTE 0x100000 /* split huge pte before returning */
+
/*
* FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
* other. Here is what they mean, and how to use them:
return !memcmp_pages(page1, page2);
}
+extern unsigned long phys_cont_pte_pages(void);
+extern unsigned long phys_huge_pmd_pages(void);
+
#ifdef CONFIG_MAPPING_DIRTY_HELPERS
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
pgoff_t first_index, pgoff_t nr,
___pte; \
})
+#ifdef CONFIG_FINEGRAINED_THP
+#define ptep_huge_clear_flush_notify(__vma, __address, __ptep) \
+({ \
+ unsigned long ___addr = __address & HPAGE_CONT_PTE_MASK; \
+ struct mm_struct *___mm = (__vma)->vm_mm; \
+ pte_t ___pte; \
+ \
+ ___pte = ptep_huge_clear_flush(__vma, __address, __ptep); \
+ mmu_notifier_invalidate_range(___mm, ___addr, \
+ ___addr + HPAGE_CONT_PTE_SIZE); \
+ \
+ ___pte; \
+})
+#endif /* CONFIG_FINEGRAINED_THP */
+
#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \
({ \
unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
#define set_pte_at_notify set_pte_at
+#ifdef CONFIG_FINEGRAINED_THP
+#define ptep_huge_clear_flush_notify ptep_huge_clear_flush
+#endif
+
static inline void mmu_notifier_synchronize(void)
{
}
NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
NR_SHMEM_THPS,
NR_SHMEM_PMDMAPPED,
+#ifdef CONFIG_FINEGRAINED_THP
+ NR_SHMEM_PTEMAPPED,
+ NR_FILE_64KB_THPS,
+#endif /* CONFIG_FINEGRAINED_THP */
NR_FILE_THPS,
+#ifdef CONFIG_FINEGRAINED_THP
+ NR_FILE_PTEMAPPED,
+#endif /* CONFIG_FINEGRAINED_THP */
NR_FILE_PMDMAPPED,
NR_ANON_THPS,
+#ifdef CONFIG_FINEGRAINED_THP
+ NR_ANON_64KB_THPS,
+#endif
NR_VMSCAN_WRITE,
NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
NR_DIRTIED, /* page dirtyings since bootup */
return 0;
}
#endif /* !__PAGETABLE_P4D_FOLDED */
-
+#ifdef CONFIG_FINEGRAINED_THP
+int cont_pte_set_huge(pte_t *pte, phys_addr_t addr, pgprot_t prot);
+#endif /* CONFIG_FINEGRAINED_THP */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
int pud_clear_huge(pud_t *pud);
int pud_free_pmd_page(pud_t *pud, unsigned long addr);
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+#ifdef CONFIG_FINEGRAINED_THP
+static inline int cont_pte_set_huge(pte_t *pte, phys_addr_t addr, pgprot_t prot)
+{
+ return 0;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
return 0;
TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock:
* caller holds it */
TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */
+#ifdef CONFIG_FINEGRAINED_THP
+ TTU_SPLIT_HUGE_PTE = 0x200, /* split huge PTE if any */
+#endif
};
#ifdef CONFIG_MMU
{
return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
}
+#ifdef CONFIG_FINEGRAINED_THP
+static inline int is_pte_migration_entry(pte_t pte)
+{
+ return !pte_present(pte) && is_migration_entry(pte_to_swp_entry(pte));
+}
+#endif
#else
static inline void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
struct page *page)
{
return 0;
}
+
+#ifdef CONFIG_FINEGRAINED_THP
+static inline int is_pte_migration_entry(pte_t pte)
+{
+ return 0;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
#endif
#ifdef CONFIG_MEMORY_FAILURE
THP_SPLIT_PAGE,
THP_SPLIT_PAGE_FAILED,
THP_DEFERRED_SPLIT_PAGE,
+#ifdef CONFIG_FINEGRAINED_THP
+ THP_SPLIT_CONT_PTE,
+#endif
THP_SPLIT_PMD,
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
THP_SPLIT_PUD,
#define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */
#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
+#define MAP_FILE_THP 0x200000 /* back the file mapping with transparent hugepages */
+
#define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be
* uninitialized */
config CMA_ALIGNMENT
int "Maximum PAGE_SIZE order of alignment for contiguous buffers"
+ range 9 12 if FINEGRAINED_THP
range 2 12
default 8
help
retry:
if (is_register)
+#ifdef CONFIG_FINEGRAINED_THP
+ {
+ gup_flags |= FOLL_SPLIT_PMD | FOLL_SPLIT_PTE;
+ pr_info("THP-%s: FOLL_SPLIT_PTE called comm(%s)\n", __func__, current->comm);
+ }
+#else /* CONFIG_FINEGRAINED_THP */
gup_flags |= FOLL_SPLIT_PMD;
+#endif /* CONFIG_FINEGRAINED_THP */
/* Read the page with vaddr into memory */
ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
&old_page, &vma, NULL);
support of file THPs will be developed in the next few release
cycles.
+config FINEGRAINED_THP
+ bool "Fine-grained THP support (EXPERIMENTAL)"
+ depends on TRANSPARENT_HUGEPAGE
+
+ help
+	  Allow khugepaged to create 64KB hugepages, and allow 64KB hugepages
+	  to be allocated directly at page fault time.
+
+	  This is currently supported only on the ARM64 architecture.
+
+config THP_CONSERVATIVE
+ bool "A conservative policy for fTHP (EXPERIMENTAL)"
+ depends on FINEGRAINED_THP
+
+ help
+	  With the conservative policy, only khugepaged can create hugepages.
+
config ARCH_HAS_PTE_SPECIAL
bool
if (PageTransHuge(page))
__dec_node_page_state(page, NR_SHMEM_THPS);
} else if (PageTransHuge(page)) {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (thp_nr_pages(page) == HPAGE_PMD_NR)
+ __dec_node_page_state(page, NR_FILE_THPS);
+ else
+ __dec_node_page_state(page, NR_FILE_64KB_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
__dec_node_page_state(page, NR_FILE_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
filemap_nr_thps_dec(mapping);
}
return ERR_PTR(ret);
goto retry;
}
+#ifdef CONFIG_FINEGRAINED_THP
+ else if (flags & FOLL_SPLIT_PTE && pte_cont(pte))
+ split_huge_pte(vma, pmd, ptep, address);
+#endif /* CONFIG_FINEGRAINED_THP */
/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
if (unlikely(!try_grab_page(page, flags))) {
return __do_huge_pmd_anonymous_page(vmf, page, gfp);
}
+#ifndef CONFIG_FINEGRAINED_THP
+vm_fault_t do_huge_pte_anonymous_page(struct vm_fault *vmf)
+{
+ return VM_FAULT_FALLBACK;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
pgtable_t pgtable)
return ret;
}
+#ifdef CONFIG_FINEGRAINED_THP
+#endif /* CONFIG_FINEGRAINED_THP */
+
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
pud_t *pud, int flags)
if (vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
+ atomic_long_add(-HPAGE_PMD_NR, &nr_phys_huge_pmd_pages);
spin_unlock(ptl);
if (is_huge_zero_pmd(orig_pmd))
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
}
}
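+
+/*
+ * Allocate and populate a PTE table under 'pmd' if none is present.  The
+ * "_locked" suffix indicates the caller already holds the page table lock,
+ * hence the plain pmd_none() recheck.
+ */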
+static int thp_pte_alloc_locked(struct mm_struct *mm, pmd_t *pmd)
+{
+ pgtable_t new = pte_alloc_one(mm);
+ if (!new)
+ return -ENOMEM;
+
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
+ mm_inc_nr_ptes(mm);
+ pmd_populate(mm, pmd, new);
+ new = NULL;
+ }
+ if (new)
+ pte_free(mm, new);
+ return 0;
+}
+
+static int thp_remap_pte_range_locked(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ pte_t *pte;
+ int err = 0;
+
+ err = thp_pte_alloc_locked(mm, pmd);
+ if (err)
+ return err;
+
+ pte = pte_offset_map(pmd, addr);
+ if (!pte)
+ return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+ do {
+ BUG_ON(!pte_none(*pte));
+ if (!pfn_modify_allowed(pfn, prot)) {
+ err = -EACCES;
+ break;
+ }
+
+ set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
+ pfn++;
+ pte++;
+ addr += PAGE_SIZE;
+ } while (addr != end);
+ arch_leave_lazy_mmu_mode();
+ return err;
+}
+
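+/*
+ * Extract only the protection bits of a PMD by XORing it with a PMD built
+ * from the same pfn and empty permissions; used when demoting a physical
+ * PMD block mapping to PTEs in __split_huge_pmd() below.
+ */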
+static inline pgprot_t thp_pmd_pgprot(pmd_t pmd)
+{
+ unsigned long pfn = pmd_pfn(pmd);
+
+ return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd));
+}
+
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze, struct page *page)
{
}
repeat:
- if (pmd_trans_huge(*pmd)) {
+ if (pmd_trans_huge(*pmd) && !vm_normal_page_pmd(vma, address, *pmd)) {
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ pmd_t orig_pmd;
+
+ orig_pmd = pmdp_huge_get_and_clear_full(vma, haddr, pmd, 0);
+ atomic_long_add(-HPAGE_PMD_NR, &nr_phys_huge_pmd_pages);
+ thp_remap_pte_range_locked(mm, pmd, haddr,
+ haddr + HPAGE_PMD_SIZE,
+ pmd_pfn(orig_pmd),
+ thp_pmd_pgprot(orig_pmd));
+ goto out;
+ } else if (pmd_trans_huge(*pmd) && vm_normal_page_pmd(vma, address, *pmd)) {
if (!page) {
page = pmd_page(*pmd);
/*
(start & HPAGE_PMD_MASK) >= vma->vm_start &&
(start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
split_huge_pmd_address(vma, start, false, NULL);
-
+#ifdef CONFIG_FINEGRAINED_THP
+ if (start & ~HPAGE_CONT_PTE_MASK &&
+ (start & HPAGE_CONT_PTE_MASK) >= vma->vm_start &&
+ (start & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE <= vma->vm_end)
+ split_huge_pte_address(vma, start, false, NULL);
+#endif
/*
* If the new end address isn't hpage aligned and it could
* previously contain an hugepage: check if we need to split
(end & HPAGE_PMD_MASK) >= vma->vm_start &&
(end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
split_huge_pmd_address(vma, end, false, NULL);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (end & ~HPAGE_CONT_PTE_MASK &&
+ (end & HPAGE_CONT_PTE_MASK) >= vma->vm_start &&
+ (end & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE <= vma->vm_end)
+ split_huge_pte_address(vma, end, false, NULL);
+#endif
/*
* If we're also updating the vma->vm_next->vm_start, if the new
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
split_huge_pmd_address(next, nstart, false, NULL);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (nstart & ~HPAGE_CONT_PTE_MASK &&
+ (nstart & HPAGE_CONT_PTE_MASK) >= next->vm_start &&
+ (nstart & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE <= next->vm_end)
+ split_huge_pte_address(next, nstart, false, NULL);
+#endif
}
}
static void unmap_page(struct page *page)
{
+#ifdef CONFIG_FINEGRAINED_THP
+ enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
+ TTU_RMAP_LOCKED;
+#else
enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
+#endif
VM_BUG_ON_PAGE(!PageHead(page), page);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (compound_order(page) == HPAGE_PMD_ORDER)
+ ttu_flags |= TTU_SPLIT_HUGE_PMD;
+ else
+ ttu_flags |= TTU_SPLIT_HUGE_PTE;
+#endif /* CONFIG_FINEGRAINED_THP */
if (PageAnon(page))
ttu_flags |= TTU_SPLIT_FREEZE;
if (mapping) {
if (PageSwapBacked(head))
__dec_node_page_state(head, NR_SHMEM_THPS);
- else
+ else {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (thp_nr_pages(head) == HPAGE_CONT_PTE_NR)
+ __dec_node_page_state(head, NR_FILE_64KB_THPS);
+ else
+#endif /* CONFIG_FINEGRAINED_THP */
__dec_node_page_state(head, NR_FILE_THPS);
+ }
}
__split_huge_page(page, list, end, flags);
void setup_zone_pageset(struct zone *zone);
+extern atomic_long_t nr_phys_cont_pte_pages;
+extern atomic_long_t nr_phys_huge_pmd_pages;
+
struct migration_target_control {
int nid; /* preferred node id */
nodemask_t *nmask;
static int __read_mostly ioremap_pmd_capable;
static int __read_mostly ioremap_huge_disabled;
+#ifdef CONFIG_FINEGRAINED_THP
+static int __read_mostly ioremap_cont_pte_capable;
+#endif
+
static int __init set_nohugeiomap(char *str)
{
ioremap_huge_disabled = 1;
return ioremap_pmd_capable;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static inline int ioremap_cont_pte_enabled(void)
+{
+ return ioremap_cont_pte_capable;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
static inline int ioremap_p4d_enabled(void) { return 0; }
static inline int ioremap_pud_enabled(void) { return 0; }
static inline int ioremap_pmd_enabled(void) { return 0; }
+#ifdef CONFIG_FINEGRAINED_THP
+static inline int ioremap_cont_pte_enabled(void) { return 0; }
+#endif /* CONFIG_FINEGRAINED_THP */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+#ifdef CONFIG_FINEGRAINED_THP
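+/*
+ * Use a contiguous-PTE (64KB) mapping for an ioremap chunk when the virtual
+ * range, the physical address and the remaining size all line up on
+ * CONT_PTE_SIZE and none of the PTEs in the block are populated yet.
+ */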
+static int ioremap_try_huge_pte(pte_t *pte, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr,
+ pgprot_t prot)
+{
+ int i;
+
+ if (!ioremap_cont_pte_enabled())
+ return 0;
+ if ((end - addr) != CONT_PTE_SIZE)
+ return 0;
+ if (!IS_ALIGNED(addr, CONT_PTE_SIZE))
+ return 0;
+ if (!IS_ALIGNED(phys_addr, CONT_PTE_SIZE))
+ return 0;
+
+ for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
+ if (pte_present(*(pte + i)))
+ return 0;
+ return cont_pte_set_huge(pte, phys_addr, prot);
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
pgtbl_mod_mask *mask)
if (!pte)
return -ENOMEM;
do {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (addr + HPAGE_CONT_PTE_SIZE < end &&
+ ioremap_try_huge_pte(pte, addr, end, phys_addr, prot)) {
+ pte += HPAGE_CONT_PTE_NR - 1;
+ pfn += HPAGE_CONT_PTE_NR;
+ addr += HPAGE_CONT_PTE_SIZE - PAGE_SIZE;
+ phys_addr += HPAGE_CONT_PTE_SIZE;
+ continue;
+ }
+
+#endif /* CONFIG_FINEGRAINED_THP */
BUG_ON(!pte_none(*pte));
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
+#ifdef CONFIG_FINEGRAINED_THP
+ phys_addr += PAGE_SIZE;
+#endif /* CONFIG_FINEGRAINED_THP */
} while (pte++, addr += PAGE_SIZE, addr != end);
*mask |= PGTBL_PTE_MODIFIED;
return 0;
#include <asm/tlb.h>
#include <asm/pgalloc.h>
+#ifdef CONFIG_FINEGRAINED_THP
+#include <asm/finegrained_thp.h>
+#include <asm/huge_mm.h>
+#else
+#include <asm-generic/finegrained_thp.h>
+#include <asm-generic/huge_mm.h>
+#endif
#include "internal.h"
enum scan_result {
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;
+#ifdef CONFIG_FINEGRAINED_THP
+/*
+ * thp_scan_hint:
+ * used to give khugepaged hints about which address ranges
+ * have changed recently.
+ */
+struct thp_scan_hint {
+ struct mm_slot *slot;
+ struct vm_area_struct *vma;
+ unsigned long diff; /* memory difference */
+ unsigned long jiffies; /* time stamp for profiling purpose */
+ struct list_head hint_list;
+};
+
+/* THP type descriptor */
+enum {
+ THP_TYPE_FAIL, /* cannot make hugepage */
+ THP_TYPE_64KB, /* 64KB hugepage can be made, use CONT_PTE */
+ THP_TYPE_2MB, /* 2MB hugepage can be made, use PMD */
+};
+
+static unsigned int khugepaged_max_ptes_none_64kb __read_mostly;
+static unsigned int khugepaged_max_ptes_swap_64kb __read_mostly;
+static unsigned int khugepaged_max_ptes_shared_64kb __read_mostly;
+#endif /* CONFIG_FINEGRAINED_THP */
+
#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
struct list_head mm_head;
struct mm_slot *mm_slot;
unsigned long address;
+#ifdef CONFIG_FINEGRAINED_THP
+ int hpage_type;
+ int nr_hint;
+ struct list_head hint_list;
+#endif /* CONFIG_FINEGRAINED_THP */
};
static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+#ifdef CONFIG_FINEGRAINED_THP
+ .hint_list = LIST_HEAD_INIT(khugepaged_scan.hint_list),
+#endif
};
#ifdef CONFIG_SYSFS
khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
+#ifdef CONFIG_FINEGRAINED_THP
+ khugepaged_max_ptes_none_64kb = HPAGE_CONT_PTE_NR - 1;
+ khugepaged_max_ptes_swap_64kb = HPAGE_CONT_PTE_NR / 8;
+ khugepaged_max_ptes_shared_64kb = HPAGE_CONT_PTE_NR / 2;
+#endif
return 0;
}
return atomic_read(&mm->mm_users) == 0;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static void clear_hint_list(struct mm_slot *slot);
+#endif /* CONFIG_FINEGRAINED_THP */
+
static bool hugepage_vma_check(struct vm_area_struct *vma,
unsigned long vm_flags)
{
vma->vm_pgoff, HPAGE_PMD_NR))
return false;
+ /* Check whether the arch allows a shmem hugepage for this vma */
+ if (arch_hugepage_vma_shmem_check(vma, vm_flags))
+ return true;
/* Enabled via shmem mount options or sysfs settings. */
if (shmem_file(vma->vm_file))
return shmem_huge_enabled(vma);
if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
return false;
+ /* Check whether the arch allows a file-backed hugepage for this vma */
+ if (arch_hugepage_vma_file_check(vma, vm_flags))
+ return true;
/* Only regular file is valid */
- if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+ else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
(vm_flags & VM_DENYWRITE)) {
struct inode *inode = vma->vm_file->f_inode;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
return khugepaged_enter(vma, vm_flags);
+#ifdef CONFIG_FINEGRAINED_THP
+ hstart = (vma->vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK;
+ hend = vma->vm_end & HPAGE_CONT_PTE_MASK;
+ if (hstart < hend)
+ return khugepaged_enter(vma, vm_flags);
+#endif /* CONFIG_FINEGRAINED_THP */
return 0;
}
spin_lock(&khugepaged_mm_lock);
mm_slot = get_mm_slot(mm);
if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+#ifdef CONFIG_FINEGRAINED_THP
+ clear_hint_list(mm_slot);
+#endif
hash_del(&mm_slot->hash);
list_del(&mm_slot->mm_node);
free = 1;
return page_count(page) == expected_refcount;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *pte,
+ struct list_head *compound_pagelist,
+ int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
struct list_head *compound_pagelist)
+#endif /* CONFIG_FINEGRAINED_THP */
{
struct page *page = NULL;
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
bool writable = false;
+#ifdef CONFIG_FINEGRAINED_THP
+ int max_ptes_shared, max_ptes_none;
+ int hpage_nr;
+
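+ /*
+ * Pick thresholds matching the target hugepage size: 64KB
+ * collapse uses the *_64kb limits, 2MB collapse keeps the
+ * existing PMD-sized limits.
+ */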
+ if (hpage_type == THP_TYPE_64KB) {
+ hpage_nr = HPAGE_CONT_PTE_NR;
+ max_ptes_shared = khugepaged_max_ptes_shared_64kb;
+ max_ptes_none = khugepaged_max_ptes_none_64kb;
+ } else {
+ hpage_nr = HPAGE_PMD_NR;
+ max_ptes_shared = khugepaged_max_ptes_shared;
+ max_ptes_none = khugepaged_max_ptes_none;
+ }
+#endif /* CONFIG_FINEGRAINED_THP */
- for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ for (_pte = pte;
+#ifdef CONFIG_FINEGRAINED_THP
+ _pte < pte + hpage_nr;
+#else
+ _pte < pte+HPAGE_PMD_NR;
+#endif
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
+#ifdef CONFIG_FINEGRAINED_THP
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
+ ++none_or_zero <= max_ptes_none)
+#else /* CONFIG_FINEGRAINED_THP */
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none)
+#endif /* CONFIG_FINEGRAINED_THP */
+ {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
VM_BUG_ON_PAGE(!PageAnon(page), page);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (page_mapcount(page) > 1 &&
+ ++shared > max_ptes_shared)
+#else /* CONFIG_FINEGRAINED_THP */
if (page_mapcount(page) > 1 &&
- ++shared > khugepaged_max_ptes_shared) {
+ ++shared > khugepaged_max_ptes_shared)
+#endif /* CONFIG_FINEGRAINED_THP */
+ {
result = SCAN_EXCEED_SHARED_PTE;
goto out;
}
return 0;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl,
+ struct list_head *compound_pagelist,
+ int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
struct vm_area_struct *vma,
unsigned long address,
spinlock_t *ptl,
struct list_head *compound_pagelist)
+#endif /* CONFIG_FINEGRAINED_THP */
{
struct page *src_page, *tmp;
pte_t *_pte;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+#ifdef CONFIG_FINEGRAINED_THP
+ int hpage_nr = (hpage_type == THP_TYPE_64KB ?
+ HPAGE_CONT_PTE_NR : HPAGE_PMD_NR);
+#endif
+
+ for (_pte = pte;
+#ifdef CONFIG_FINEGRAINED_THP
+ _pte < pte + hpage_nr;
+#else
+ _pte < pte + HPAGE_PMD_NR;
+#endif
_pte++, page++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
return 0;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static inline struct page *alloc_khugepaged_hugepage(int hpage_order)
+#else
static inline struct page *alloc_khugepaged_hugepage(void)
+#endif
{
struct page *page;
+#ifdef CONFIG_FINEGRAINED_THP
+ page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
+ hpage_order);
+#else
page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
HPAGE_PMD_ORDER);
+#endif
if (page)
prep_transhuge_page(page);
return page;
struct page *hpage;
do {
+#ifdef CONFIG_FINEGRAINED_THP
+ hpage = alloc_khugepaged_hugepage(HPAGE_PMD_ORDER);
+#else
hpage = alloc_khugepaged_hugepage();
+#endif
if (!hpage) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
if (!*wait)
return true;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node, int hpage_type)
+{
+ struct page *page;
+
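+ /*
+ * 64KB hugepages are allocated here on demand; the preallocated
+ * *hpage is only used for the 2MB (PMD-mapped) case.
+ */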
+ if (hpage_type == THP_TYPE_64KB)
+ page = alloc_khugepaged_hugepage(HPAGE_CONT_PTE_ORDER);
+ else {
+ VM_BUG_ON(!*hpage);
+ page = *hpage;
+ }
+ return page;
+}
+#else /* CONFIG_FINEGRAINED_THP */
static struct page *
khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
{
return *hpage;
}
+#endif /* CONFIG_FINEGRAINED_THP */
#endif
/*
* value (scan code).
*/
+#ifdef CONFIG_FINEGRAINED_THP
+static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
+ struct vm_area_struct **vmap, int hpage_type)
+#else
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
struct vm_area_struct **vmap)
+#endif
{
struct vm_area_struct *vma;
unsigned long hstart, hend;
if (!vma)
return SCAN_VMA_NULL;
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB) {
+ hstart = (vma->vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK;
+ hend = vma->vm_end & HPAGE_CONT_PTE_MASK;
+ if (address < hstart || address + HPAGE_CONT_PTE_SIZE > hend)
+ return SCAN_ADDRESS_RANGE;
+ if (!hugepage_vma_check(vma, vma->vm_flags))
+ return SCAN_VMA_CHECK;
+ return 0;
+ }
+#endif /* CONFIG_FINEGRAINED_THP */
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
* but with mmap_lock held to protect against vma changes.
*/
+#ifdef CONFIG_FINEGRAINED_THP
+static bool __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ int referenced, int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static bool __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
int referenced)
+#endif /* CONFIG_FINEGRAINED_THP */
{
int swapped_in = 0;
vm_fault_t ret = 0;
.pmd = pmd,
.pgoff = linear_page_index(vma, address),
};
+#ifdef CONFIG_FINEGRAINED_THP
+ int hpage_size = (hpage_type == THP_TYPE_64KB) ?
+ HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE;
+#endif
vmf.pte = pte_offset_map(pmd, address);
- for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ for (;
+#ifdef CONFIG_FINEGRAINED_THP
+ vmf.address < address + hpage_size;
+#else
+ vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+#endif
vmf.pte++, vmf.address += PAGE_SIZE) {
vmf.orig_pte = *vmf.pte;
if (!is_swap_pte(vmf.orig_pte))
/* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
if (ret & VM_FAULT_RETRY) {
mmap_read_lock(mm);
- if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hugepage_vma_revalidate(mm, address, &vmf.vma, hpage_type))
+#else
+ if (hugepage_vma_revalidate(mm, address, &vmf.vma))
+#endif
+ {
/* vma is no longer available, don't continue to swapin */
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
return true;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static void collapse_huge_page(struct mm_struct *mm,
+ unsigned long address,
+ struct page **hpage,
+ int node, int referenced, int unmapped,
+ int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
int node, int referenced, int unmapped)
+#endif /* CONFIG_FINEGRAINED_THP */
{
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
struct mmu_notifier_range range;
gfp_t gfp;
+#ifdef CONFIG_FINEGRAINED_THP
+ pte_t _pte;
+
+ VM_BUG_ON(address & (hpage_type == THP_TYPE_64KB ?
+ ~HPAGE_CONT_PTE_MASK : ~HPAGE_PMD_MASK));
+#else
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#endif
/* Only allocate from the target node */
gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
* that. We will recheck the vma after taking it again in write mode.
*/
mmap_read_unlock(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+ new_page = khugepaged_alloc_page(hpage, gfp, node, hpage_type);
+#else
new_page = khugepaged_alloc_page(hpage, gfp, node);
+#endif
if (!new_page) {
result = SCAN_ALLOC_HUGE_PAGE_FAIL;
goto out_nolock;
count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
mmap_read_lock(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+ result = hugepage_vma_revalidate(mm, address, &vma, hpage_type);
+#else
result = hugepage_vma_revalidate(mm, address, &vma);
+#endif
if (result) {
mmap_read_unlock(mm);
goto out_nolock;
* If it fails, we release mmap_lock and jump out_nolock.
* Continuing to collapse causes inconsistency.
*/
+#ifdef CONFIG_FINEGRAINED_THP
+ if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
+ pmd, referenced, hpage_type)) {
+ mmap_read_unlock(mm);
+ goto out_nolock;
+ }
+#else /* CONFIG_FINEGRAINED_THP */
if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
pmd, referenced)) {
mmap_read_unlock(mm);
goto out_nolock;
}
+#endif /* CONFIG_FINEGRAINED_THP*/
mmap_read_unlock(mm);
/*
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
+#ifdef CONFIG_FINEGRAINED_THP
+ result = hugepage_vma_revalidate(mm, address, &vma, hpage_type);
+#else
result = hugepage_vma_revalidate(mm, address, &vma);
+#endif
if (result)
goto out;
/* check if the pmd is still valid */
anon_vma_lock_write(vma->anon_vma);
+#ifdef CONFIG_FINEGRAINED_THP
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+ address, address + (hpage_type == THP_TYPE_64KB ?
+ HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE));
+#else
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
address, address + HPAGE_PMD_SIZE);
+#endif
mmu_notifier_invalidate_range_start(&range);
pte = pte_offset_map(pmd, address);
* huge and small TLB entries for the same virtual address
* to avoid the risk of CPU bugs in that area.
*/
- _pmd = pmdp_collapse_flush(vma, address, pmd);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB)
+ /*
+ * FIXME: do not clear the ptes here; doing so makes
+ * __collapse_huge_page_isolate() and __collapse_huge_page_copy()
+ * fail, and __collapse_huge_page_copy() clears the ptes itself.
+ */
+ ;
+ else
+#endif /* CONFIG_FINEGRAINED_THP */
+ _pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
spin_lock(pte_ptl);
+#ifdef CONFIG_FINEGRAINED_THP
+ isolated = __collapse_huge_page_isolate(vma, address, pte,
+ &compound_pagelist, hpage_type);
+#else /* CONFIG_FINEGRAINED_THP */
isolated = __collapse_huge_page_isolate(vma, address, pte,
&compound_pagelist);
+#endif /* CONFIG_FINEGRAINED_THP */
spin_unlock(pte_ptl);
if (unlikely(!isolated)) {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB) {
+ pte_unmap(pte);
+ anon_vma_unlock_write(vma->anon_vma);
+ result = SCAN_FAIL;
+ goto out;
+ }
+#endif /* CONFIG_FINEGRAINED_THP */
pte_unmap(pte);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
*/
anon_vma_unlock_write(vma->anon_vma);
+#ifdef CONFIG_FINEGRAINED_THP
+ __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
+ &compound_pagelist, hpage_type);
+#else /* CONFIG_FINEGRAINED_THP */
__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
&compound_pagelist);
+#endif /* CONFIG_FINEGRAINED_THP */
pte_unmap(pte);
__SetPageUptodate(new_page);
+
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB) {
+ /* 64KB hugepage */
+ _pte = arch_make_huge_pte(new_page, vma);
+ _pte = maybe_mkwrite(pte_mkdirty(_pte), vma);
+ } else {
+ /* 2MB hugepage */
+ pgtable = pmd_pgtable(_pmd);
+
+ _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+ _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+ }
+#else /* CONFIG_FINEGRAINED_THP */
pgtable = pmd_pgtable(_pmd);
_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
-
+#endif /* CONFIG_FINEGRAINED_THP */
/*
* spin_lock() below is not the equivalent of smp_wmb(), so
* this is needed to avoid the copy_huge_page writes to become
smp_wmb();
spin_lock(pmd_ptl);
- BUG_ON(!pmd_none(*pmd));
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_2MB)
+#endif
+ BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
lru_cache_add_inactive_or_unevictable(new_page, vma);
+
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB)
+ arch_set_huge_pte_at(mm, address, pte, _pte, 0);
+ else {
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ set_pmd_at(mm, address, pmd, _pmd);
+ }
+ update_mmu_cache_pmd(vma, address, pmd);
+#else /* CONFIG_FINEGRAINED_THP */
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
+#endif /* CONFIG_FINEGRAINED_THP */
spin_unlock(pmd_ptl);
- *hpage = NULL;
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_2MB)
+#endif
+ *hpage = NULL;
khugepaged_pages_collapsed++;
result = SCAN_SUCCEED;
out_nolock:
if (!IS_ERR_OR_NULL(*hpage))
mem_cgroup_uncharge(*hpage);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (result != SCAN_SUCCEED && new_page && hpage_type == THP_TYPE_64KB)
+ put_page(new_page);
+#endif
trace_mm_collapse_huge_page(mm, isolated, result);
return;
out:
goto out_up_write;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ struct page **hpage, int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static int khugepaged_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
struct page **hpage)
+#endif /* CONFIG_FINEGRAINED_THP */
{
pmd_t *pmd;
pte_t *pte, *_pte;
int node = NUMA_NO_NODE, unmapped = 0;
bool writable = false;
+#ifdef CONFIG_FINEGRAINED_THP
+ int hpage_nr;
+ int max_ptes_swap, max_ptes_none, max_ptes_shared;
+
+ if (hpage_type == THP_TYPE_64KB) {
+ VM_BUG_ON(address & ~HPAGE_CONT_PTE_MASK);
+ hpage_nr = HPAGE_CONT_PTE_NR;
+ max_ptes_swap = khugepaged_max_ptes_swap_64kb;
+ max_ptes_none = khugepaged_max_ptes_none_64kb;
+ max_ptes_shared = khugepaged_max_ptes_shared_64kb;
+ } else {
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ hpage_nr = HPAGE_PMD_NR;
+ max_ptes_swap = khugepaged_max_ptes_swap;
+ max_ptes_none = khugepaged_max_ptes_none;
+ max_ptes_shared = khugepaged_max_ptes_shared;
+ }
+#else /* CONFIG_FINEGRAINED_THP */
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#endif /* CONFIG_FINEGRAINED_THP */
pmd = mm_find_pmd(mm, address);
if (!pmd) {
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ for (_address = address, _pte = pte;
+#ifdef CONFIG_FINEGRAINED_THP
+ _pte < pte + hpage_nr;
+#else
+ _pte < pte+HPAGE_PMD_NR;
+#endif
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (is_swap_pte(pteval)) {
- if (++unmapped <= khugepaged_max_ptes_swap) {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (++unmapped <= max_ptes_swap)
+#else
+ if (++unmapped <= khugepaged_max_ptes_swap)
+#endif
+ {
/*
* Always be strict with uffd-wp
* enabled swap entries. Please see
}
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
+#ifdef CONFIG_FINEGRAINED_THP
+ ++none_or_zero <= max_ptes_none
+#else
+ ++none_or_zero <= khugepaged_max_ptes_none
+#endif
+ )
+ {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
goto out_unmap;
}
+#ifdef CONFIG_FINEGRAINED_THP
+ if (PageCompound(page) && PageTransHuge(compound_head(page))) {
+ result = SCAN_PAGE_COMPOUND;
+ goto out_unmap;
+ }
+
+ if (page_mapcount(page) > 1 &&
+ ++shared > max_ptes_shared)
+#else
if (page_mapcount(page) > 1 &&
- ++shared > khugepaged_max_ptes_shared) {
+ ++shared > khugepaged_max_ptes_shared)
+#endif
+ {
result = SCAN_EXCEED_SHARED_PTE;
goto out_unmap;
}
if (ret) {
node = khugepaged_find_target_node();
/* collapse_huge_page will return with the mmap_lock released */
+#ifdef CONFIG_FINEGRAINED_THP
+ collapse_huge_page(mm, address, hpage, node,
+ referenced, unmapped, hpage_type);
+#else
collapse_huge_page(mm, address, hpage, node,
referenced, unmapped);
+#endif
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
lockdep_assert_held(&khugepaged_mm_lock);
if (khugepaged_test_exit(mm)) {
+#ifdef CONFIG_FINEGRAINED_THP
+ clear_hint_list(mm_slot);
+#endif
/* free mm_slot */
hash_del(&mm_slot->hash);
list_del(&mm_slot->mm_node);
* Notify khugepaged that given addr of the mm is pte-mapped THP. Then
* khugepaged should try to collapse the page table.
*/
+#ifdef CONFIG_FINEGRAINED_THP
+static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr, int hpage_type)
+#else
static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
unsigned long addr)
+#endif
{
struct mm_slot *mm_slot;
+#ifdef CONFIG_FINEGRAINED_THP
+ VM_BUG_ON(addr & (hpage_type == THP_TYPE_64KB ?
+ ~HPAGE_CONT_PTE_MASK : ~HPAGE_PMD_MASK));
+#else
VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+#endif
spin_lock(&khugepaged_mm_lock);
mm_slot = get_mm_slot(mm);
+#ifdef CONFIG_FINEGRAINED_THP
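+ /*
+ * Record the hugepage type in bit 0 of the stored address; both
+ * hugepage masks clear this bit, and it is decoded again when the
+ * deferred collapse runs.
+ */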
+ if (hpage_type == THP_TYPE_64KB)
+ addr |= 0x01;
+#endif
if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
spin_unlock(&khugepaged_mm_lock);
spinlock_t *ptl;
int count = 0;
int i;
+#ifdef CONFIG_FINEGRAINED_THP
+ int hpage_type = (addr & 0x01) ? THP_TYPE_64KB : THP_TYPE_2MB;
+ int hpage_nr = (hpage_type == THP_TYPE_64KB) ?
+ HPAGE_CONT_PTE_NR : HPAGE_PMD_NR;
+ int hpage_size = (hpage_type == THP_TYPE_64KB) ?
+ HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE;
+
+ if (hpage_type == THP_TYPE_64KB)
+ haddr = addr & HPAGE_CONT_PTE_MASK;
+#endif
+#ifdef CONFIG_FINEGRAINED_THP
+ if (!vma || !vma->vm_file ||
+ vma->vm_start > haddr || vma->vm_end < haddr + hpage_size)
+ return;
+#else /* CONFIG_FINEGRAINED_THP */
if (!vma || !vma->vm_file ||
vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
return;
+#endif /* CONFIG_FINEGRAINED_THP */
/*
* This vm_flags may not have VM_HUGEPAGE if the page was not
/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
- i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+#ifdef CONFIG_FINEGRAINED_THP
+ i < hpage_nr;
+#else
+ i < HPAGE_PMD_NR;
+#endif
+ i++, addr += PAGE_SIZE, pte++) {
struct page *page;
/* empty pte, skip */
/* step 2: adjust rmap */
for (i = 0, addr = haddr, pte = start_pte;
- i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+#ifdef CONFIG_FINEGRAINED_THP
+ i < hpage_nr;
+#else
+ i < HPAGE_PMD_NR;
+#endif
+ i++, addr += PAGE_SIZE, pte++) {
struct page *page;
if (pte_none(*pte))
/* step 4: collapse pmd */
ptl = pmd_lock(vma->vm_mm, pmd);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB) {
+ pte_t *ptep = pte_offset_map(pmd, haddr);
+ arch_clear_huge_pte_range(vma->vm_mm, haddr, ptep);
+ spin_unlock(ptl);
+ } else {
+ _pmd = pmdp_collapse_flush(vma, haddr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ }
+#else /* CONFIG_FINEGRAINED_THP*/
_pmd = pmdp_collapse_flush(vma, haddr, pmd);
spin_unlock(ptl);
mm_dec_nr_ptes(mm);
pte_free(mm, pmd_pgtable(_pmd));
+#endif /* CONFIG_FINEGRAINED_THP */
drop_hpage:
unlock_page(hpage);
return 0;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
+ int hpage_type)
+#else
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+#endif
{
struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long addr;
pmd_t *pmd, _pmd;
+#ifdef CONFIG_FINEGRAINED_THP
+ pte_t *ptep;
+ int hpage_size = (hpage_type == THP_TYPE_64KB) ?
+ HPAGE_CONT_PTE_SIZE : HPAGE_PMD_SIZE;
+#endif /* CONFIG_FINEGRAINED_THP */
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (vma->anon_vma)
continue;
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB && addr & ~HPAGE_CONT_PTE_MASK)
+ continue;
+ else if (hpage_type == THP_TYPE_2MB && addr & ~HPAGE_PMD_MASK)
+ continue;
+ if (vma->vm_end < addr + hpage_size)
+ continue;
+
+ mm = vma->vm_mm;
+ pmd = mm_find_pmd(mm, addr);
+ if (!pmd)
+ continue;
+ if (mmap_write_trylock(mm)) {
+ spinlock_t *ptl = pmd_lock(mm, pmd);
+ if (hpage_type == THP_TYPE_64KB) {
+ /* 64KB hugepage */
+ ptep = pte_offset_map(pmd, addr);
+ /* pte mappings are established during page fault handling */
+ arch_clear_huge_pte_range(mm, addr, ptep);
+ spin_unlock(ptl);
+ } else {
+ /* 2MB hugepage */
+ /*
+ * We need exclusive mmap_sem to retract page table.
+ *
+ * We use trylock due to lock inversion: we need to acquire
+ * mmap_sem while holding page lock. Fault path does it in
+ * reverse order. Trylock is a way to avoid deadlock.
+ */
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ }
+ mmap_write_unlock(mm);
+ } else
+ khugepaged_add_pte_mapped_thp(vma->vm_mm, addr, hpage_type);
+#else /* CONFIG_FINEGRAINED_THP */
if (addr & ~HPAGE_PMD_MASK)
continue;
if (vma->vm_end < addr + HPAGE_PMD_SIZE)
/* Try again later */
khugepaged_add_pte_mapped_thp(mm, addr);
}
+#endif /* CONFIG_FINEGRAINED_THP */
}
i_mmap_unlock_write(mapping);
}
* + restore gaps in the page cache;
* + unlock and free huge page;
*/
+#ifdef CONFIG_FINEGRAINED_THP
+static void collapse_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start,
+ struct page **hpage, int node, int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static void collapse_file(struct mm_struct *mm,
struct file *file, pgoff_t start,
struct page **hpage, int node)
+#endif /* CONFIG_FINEGRAINED_THP */
{
struct address_space *mapping = file->f_mapping;
gfp_t gfp;
struct page *new_page;
+#ifdef CONFIG_FINEGRAINED_THP
+ int hpage_nr = (hpage_type == THP_TYPE_64KB ?
+ HPAGE_CONT_PTE_NR : HPAGE_PMD_NR);
+ int hpage_order = (hpage_type == THP_TYPE_64KB ?
+ HPAGE_CONT_PTE_ORDER : HPAGE_PMD_ORDER);
+ pgoff_t index, end = start + hpage_nr;
+#else /* CONFIG_FINEGRAINED_THP */
pgoff_t index, end = start + HPAGE_PMD_NR;
+#endif /* CONFIG_FINEGRAINED_THP */
LIST_HEAD(pagelist);
+#ifdef CONFIG_FINEGRAINED_THP
+ XA_STATE_ORDER(xas, &mapping->i_pages, start, hpage_order);
+#else
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
+#endif
int nr_none = 0, result = SCAN_SUCCEED;
bool is_shmem = shmem_file(file);
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
+#ifdef CONFIG_FINEGRAINED_THP
+ VM_BUG_ON(start & (hpage_nr - 1));
+#else
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
+#endif
/* Only allocate from the target node */
gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
+#ifdef CONFIG_FINEGRAINED_THP
+ new_page = khugepaged_alloc_page(hpage, gfp, node, hpage_type);
+#else
new_page = khugepaged_alloc_page(hpage, gfp, node);
+#endif
if (!new_page) {
result = SCAN_ALLOC_HUGE_PAGE_FAIL;
goto out;
if (is_shmem)
__inc_node_page_state(new_page, NR_SHMEM_THPS);
else {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB)
+ __inc_node_page_state(new_page, NR_FILE_64KB_THPS);
+ else
+ __inc_node_page_state(new_page, NR_FILE_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
__inc_node_page_state(new_page, NR_FILE_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
filemap_nr_thps_inc(mapping);
}
if (result == SCAN_SUCCEED) {
struct page *page, *tmp;
+#ifdef CONFIG_FINEGRAINED_THP
+ int offset = 0;
+#endif
/*
* Replacing old pages with new one has succeeded, now we
*/
index = start;
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type != THP_TYPE_64KB) {
+ while (index < page->index) {
+ clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ index++;
+ }
+ }
+
+ if (hpage_type == THP_TYPE_64KB) {
+ copy_highpage(new_page + offset, page);
+ offset++;
+ } else
+ copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
+ page);
+#else /* CONFIG_FINEGRAINED_THP */
while (index < page->index) {
clear_highpage(new_page + (index % HPAGE_PMD_NR));
index++;
}
copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
page);
+#endif /* CONFIG_FINEGRAINED_THP */
list_del(&page->lru);
page->mapping = NULL;
page_ref_unfreeze(page, 1);
put_page(page);
index++;
}
+#ifdef CONFIG_FINEGRAINED_THP
+ if (hpage_type == THP_TYPE_64KB) {
+ while (index < end) {
+ clear_highpage(new_page + offset);
+ offset++;
+ index++;
+ }
+ } else {
+ while (index < end) {
+ clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ index++;
+ }
+ }
+#else /* CONFIG_FINEGRAINED_THP */
while (index < end) {
clear_highpage(new_page + (index % HPAGE_PMD_NR));
index++;
}
+#endif /* CONFIG_FINEGRAINED_THP */
SetPageUptodate(new_page);
+#ifdef CONFIG_FINEGRAINED_THP
+ page_ref_add(new_page, hpage_nr - 1);
+#else
page_ref_add(new_page, HPAGE_PMD_NR - 1);
+#endif
if (is_shmem)
set_page_dirty(new_page);
lru_cache_add(new_page);
/*
* Remove pte page tables, so we can re-fault the page as huge.
*/
+#ifdef CONFIG_FINEGRAINED_THP
+ retract_page_tables(mapping, start, hpage_type);
+ if (hpage_type == THP_TYPE_2MB)
+ *hpage = NULL;
+#else /* CONFIG_FINEGRAINED_THP */
retract_page_tables(mapping, start);
*hpage = NULL;
-
+#endif /* CONFIG_FINEGRAINED_THP */
khugepaged_pages_collapsed++;
} else {
struct page *page;
unlock_page(new_page);
out:
+#ifdef CONFIG_FINEGRAINED_THP
+ if (result != SCAN_SUCCEED && new_page && hpage_type == THP_TYPE_64KB)
+ put_page(new_page);
+#endif
VM_BUG_ON(!list_empty(&pagelist));
if (!IS_ERR_OR_NULL(*hpage))
mem_cgroup_uncharge(*hpage);
/* TODO: tracepoints */
}
+#ifdef CONFIG_FINEGRAINED_THP
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage,
+ int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static void khugepaged_scan_file(struct mm_struct *mm,
struct file *file, pgoff_t start, struct page **hpage)
+#endif /* CONFIG_FINEGRAINED_THP */
{
struct page *page = NULL;
struct address_space *mapping = file->f_mapping;
int present, swap;
int node = NUMA_NO_NODE;
int result = SCAN_SUCCEED;
+#ifdef CONFIG_FINEGRAINED_THP
+ int hpage_nr;
+ int max_ptes_swap, max_ptes_none, max_ptes_shared;
+
+ if (hpage_type == THP_TYPE_64KB) {
+ hpage_nr = HPAGE_CONT_PTE_NR; /* 64KB */
+ max_ptes_swap = khugepaged_max_ptes_swap_64kb;
+ max_ptes_none = khugepaged_max_ptes_none_64kb;
+ max_ptes_shared = khugepaged_max_ptes_shared_64kb;
+ } else {
+ hpage_nr = HPAGE_PMD_NR; /* 2MB */
+ max_ptes_swap = khugepaged_max_ptes_swap;
+ max_ptes_none = khugepaged_max_ptes_none;
+ max_ptes_shared = khugepaged_max_ptes_shared;
+ }
+#endif /* CONFIG_FINEGRAINED_THP */
present = 0;
swap = 0;
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
rcu_read_lock();
- xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
+#ifdef CONFIG_FINEGRAINED_THP
+ xas_for_each(&xas, page, start + hpage_nr - 1)
+#else
+ xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1)
+#endif
+ {
if (xas_retry(&xas, page))
continue;
if (xa_is_value(page)) {
- if (++swap > khugepaged_max_ptes_swap) {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (++swap > max_ptes_swap)
+#else
+ if (++swap > khugepaged_max_ptes_swap)
+#endif
+ {
result = SCAN_EXCEED_SWAP_PTE;
break;
}
rcu_read_unlock();
if (result == SCAN_SUCCEED) {
- if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+#ifdef CONFIG_FINEGRAINED_THP
+ if (present < hpage_nr - max_ptes_none)
+#else
+ if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none)
+#endif
+ {
result = SCAN_EXCEED_NONE_PTE;
} else {
node = khugepaged_find_target_node();
+#ifdef CONFIG_FINEGRAINED_THP
+ collapse_file(mm, file, start, hpage, node, hpage_type);
+#else
collapse_file(mm, file, start, hpage, node);
+#endif
}
}
/* TODO: tracepoints */
}
#else
+#ifdef CONFIG_FINEGRAINED_THP
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage,
+ int hpage_type)
+#else /* CONFIG_FINEGRAINED_THP */
static void khugepaged_scan_file(struct mm_struct *mm,
struct file *file, pgoff_t start, struct page **hpage)
+#endif /* CONFIG_FINEGRAINED_THP */
{
BUILD_BUG();
}
}
#endif
+#ifdef CONFIG_FINEGRAINED_THP
+/*
+ * Returns the hugepage type (THP_TYPE_2MB or THP_TYPE_64KB) the range
+ * can host, storing the hugepage-aligned start and end through
+ * @hstart/@hend, or THP_TYPE_FAIL if no hugepage fits.
+ */
+static inline int hugepage_determine_htype(unsigned long vm_start,
+ unsigned long vm_end, unsigned long *hstart, unsigned long *hend)
+{
+ unsigned long start, end;
+
+ /* determine 2MB hugepage */
+ start = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ end = vm_end & HPAGE_PMD_MASK;
+ if (start >= end) {
+ /* determine 64KB hugepage */
+ start = (vm_start + ~HPAGE_CONT_PTE_MASK) & HPAGE_CONT_PTE_MASK;
+ end = vm_end & HPAGE_CONT_PTE_MASK;
+ if (start >= end)
+ return THP_TYPE_FAIL;
+ *hstart = start;
+ *hend = end;
+ return THP_TYPE_64KB;
+ }
+ *hstart = start;
+ *hend = end;
+ return THP_TYPE_2MB;
+}
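+
+/*
+ * Illustrative example: for a vma spanning [0x200000, 0x3f0000) no
+ * 2MB-aligned unit fits (start and end both round to 0x200000), but
+ * the 64KB check yields [0x200000, 0x3f0000), so THP_TYPE_64KB is
+ * returned.
+ */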
+
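+/*
+ * Scan step results for khugepaged_scan_vma():
+ *  KHUGEPAGE_SCAN_CONTINUE        - move on to the next vma
+ *  KHUGEPAGE_SCAN_BREAK           - stop the current scan pass
+ *  KHUGEPAGE_SCAN_BREAK_MMAP_LOCK - stop; mmap_lock has been released
+ */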
+enum {
+ KHUGEPAGE_SCAN_CONTINUE,
+ KHUGEPAGE_SCAN_BREAK,
+ KHUGEPAGE_SCAN_BREAK_MMAP_LOCK,
+};
+
+static unsigned int khugepaged_scan_vma(struct mm_struct *mm,
+ struct vm_area_struct *vma, struct page **hpage,
+ unsigned int pages, int *progress)
+{
+ unsigned long hstart, hend;
+ int hpage_type, ret;
+ int hpage_size, hpage_nr;
+
+ if (!hugepage_vma_check(vma, vma->vm_flags))
+ return KHUGEPAGE_SCAN_CONTINUE;
+
+ hpage_type = hugepage_determine_htype(
+ (vma->vm_start > khugepaged_scan.address) ?
+ vma->vm_start : khugepaged_scan.address,
+ vma->vm_end, &hstart, &hend);
+
+ if (hpage_type == THP_TYPE_FAIL)
+ return KHUGEPAGE_SCAN_CONTINUE;
+ if (khugepaged_scan.address > hend)
+ return KHUGEPAGE_SCAN_CONTINUE;
+ if (khugepaged_scan.address < hstart)
+ khugepaged_scan.address = hstart;
+
+ if (hpage_type == THP_TYPE_64KB) {
+ VM_BUG_ON(khugepaged_scan.address & ~HPAGE_CONT_PTE_MASK);
+ hpage_size = HPAGE_CONT_PTE_SIZE; /* 64KB */
+ hpage_nr = HPAGE_CONT_PTE_NR;
+ } else if (hpage_type == THP_TYPE_2MB) {
+ VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+ hpage_size = HPAGE_PMD_SIZE; /* 2MB */
+ hpage_nr = HPAGE_PMD_NR;
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+ !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+ HPAGE_PMD_NR)) {
+ /* fallback, vma or file not aligned to 2MB */
+ hpage_size = HPAGE_CONT_PTE_SIZE; /* 64KB */
+ hpage_nr = HPAGE_CONT_PTE_NR;
+ hpage_type = THP_TYPE_64KB;
+ }
+ } else
+ BUG();
+
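+ /*
+ * Walk the range one hugepage at a time; when less than a full
+ * unit of the chosen size remains, fall back to 64KB if at least
+ * one 64KB unit still fits.
+ */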
+ while (khugepaged_scan.address < hend) {
+ if (khugepaged_scan.address + hpage_size >= hend) {
+ if (khugepaged_scan.address + HPAGE_CONT_PTE_SIZE < hend) {
+ hpage_size = HPAGE_CONT_PTE_SIZE;
+ hpage_nr = HPAGE_CONT_PTE_NR;
+ hpage_type = THP_TYPE_64KB;
+ }
+ }
+ ret = 0;
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm)))
+ return KHUGEPAGE_SCAN_BREAK;
+
+ VM_BUG_ON(khugepaged_scan.address < hstart ||
+ khugepaged_scan.address + hpage_size >
+ hend);
+ if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ struct file *file = get_file(vma->vm_file);
+ pgoff_t pgoff = linear_page_index(vma,
+ khugepaged_scan.address);
+
+ mmap_read_unlock(mm);
+ ret = 1;
+ khugepaged_scan_file(mm, file, pgoff, hpage, hpage_type);
+ fput(file);
+ } else {
+ ret = khugepaged_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ hpage, hpage_type);
+ }
+ /* move to next address */
+ khugepaged_scan.address += hpage_size;
+ *progress += hpage_nr;
+ if (ret)
+ /* we released mmap_lock so break the loop */
+ return KHUGEPAGE_SCAN_BREAK_MMAP_LOCK;
+ if (*progress >= pages)
+ return KHUGEPAGE_SCAN_BREAK;
+ }
+ return KHUGEPAGE_SCAN_CONTINUE;
+}
+
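+/*
+ * Find the pending scan hint for @slot, if any. @addr is currently
+ * unused; hints are matched per mm_slot.
+ */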
+static struct thp_scan_hint *find_scan_hint(struct mm_slot *slot,
+ unsigned long addr)
+{
+ struct thp_scan_hint *hint;
+
+ list_for_each_entry(hint, &khugepaged_scan.hint_list, hint_list) {
+ if (hint->slot == slot)
+ return hint;
+ }
+ return NULL;
+}
+
+#ifdef CONFIG_THP_CONSERVATIVE
+/* caller must hold a proper mmap_lock */
+void khugepaged_mem_hook(struct mm_struct *mm, unsigned long addr,
+ long diff, const char *debug)
+{
+ struct mm_slot *slot;
+ struct vm_area_struct *vma;
+ struct thp_scan_hint *hint;
+ bool wakeup = false;
+ bool retry = false;
+
+ vma = find_vma(mm, addr);
+ if (!vma || !hugepage_vma_check(vma, vma->vm_flags))
+ return;
+
+again:
+ spin_lock(&khugepaged_mm_lock);
+ slot = get_mm_slot(mm);
+ if (!slot) {
+ /* make a new slot or go out */
+ spin_unlock(&khugepaged_mm_lock);
+ if (retry)
+ return;
+ if (__khugepaged_enter(mm))
+ return;
+ retry = true;
+ goto again;
+ }
+
+ hint = find_scan_hint(slot, addr);
+ if (!hint) {
+ spin_unlock(&khugepaged_mm_lock);
+ hint = kzalloc(sizeof(struct thp_scan_hint), GFP_KERNEL);
+ if (!hint)
+ return;
+ hint->vma = vma;
+ hint->slot = slot;
+ hint->diff = 0;
+ hint->jiffies = jiffies;
+ spin_lock(&khugepaged_mm_lock);
+ list_add(&hint->hint_list, &khugepaged_scan.hint_list);
+ khugepaged_scan.nr_hint++;
+ }
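+ /*
+ * Accumulate the reported change and wake khugepaged once at
+ * least one 64KB hugepage worth of memory has changed.
+ */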
+ hint->diff += diff;
+ if (hint->diff >= HPAGE_CONT_PTE_SIZE)
+ wakeup = true;
+ spin_unlock(&khugepaged_mm_lock);
+
+ /* if possible, wake khugepaged up for starting a scan */
+ if (wakeup)
+ wake_up_interruptible(&khugepaged_wait);
+}
+#else /* CONFIG_THP_CONSERVATIVE */
+void khugepaged_mem_hook(struct mm_struct *mm,
+ unsigned long addr, long diff, const char *debug)
+{}
+#endif /* CONFIG_THP_CONSERVATIVE */
+
+static void clear_hint_list(struct mm_slot *slot)
+{
+ struct thp_scan_hint *hint;
+
+ hint = find_scan_hint(slot, 0);
+ if (hint) {
+ list_del(&hint->hint_list);
+ kfree(hint);
+ khugepaged_scan.nr_hint--;
+ }
+}
+
+static struct thp_scan_hint *get_next_hint(void)
+{
+ if (!list_empty(&khugepaged_scan.hint_list)) {
+ struct thp_scan_hint *hint = list_first_entry(
+ &khugepaged_scan.hint_list,
+ struct thp_scan_hint, hint_list);
+ list_del(&hint->hint_list);
+ khugepaged_scan.nr_hint--;
+ return hint;
+ }
+ return NULL;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
struct page **hpage)
__releases(&khugepaged_mm_lock)
VM_BUG_ON(!pages);
lockdep_assert_held(&khugepaged_mm_lock);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (khugepaged_scan.mm_slot)
+ mm_slot = khugepaged_scan.mm_slot;
+ else if (!list_empty(&khugepaged_scan.hint_list)) {
+ struct thp_scan_hint *hint;
+ long mem_diff;
+ unsigned long jiffies_diff;
+
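+ /*
+ * Prefer an mm with a pending hint over plain round-robin order;
+ * fall back to the next slot when no usable hint remains.
+ */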
+get_next_hint:
+ hint = get_next_hint();
+ if (!hint)
+ goto get_next_slot;
+
+ mm_slot = hint->slot;
+ mem_diff = hint->diff;
+ jiffies_diff = jiffies - hint->jiffies;
+ kfree(hint);
+ clear_hint_list(mm_slot);
+
+ if (khugepaged_test_exit(mm_slot->mm))
+ goto get_next_hint;
+ khugepaged_scan.address = 0;
+ khugepaged_scan.mm_slot = mm_slot;
+ } else {
+get_next_slot:
+ mm_slot = list_entry(khugepaged_scan.mm_head.next,
+ struct mm_slot, mm_node);
+ clear_hint_list(mm_slot);
+ khugepaged_scan.address = 0;
+ khugepaged_scan.mm_slot = mm_slot;
+ }
+#else /* CONFIG_FINEGRAINED_THP */
if (khugepaged_scan.mm_slot)
mm_slot = khugepaged_scan.mm_slot;
else {
khugepaged_scan.address = 0;
khugepaged_scan.mm_slot = mm_slot;
}
+#endif /* CONFIG_FINEGRAINED_THP */
spin_unlock(&khugepaged_mm_lock);
khugepaged_collapse_pte_mapped_thps(mm_slot);
progress++;
for (; vma; vma = vma->vm_next) {
+#ifdef CONFIG_FINEGRAINED_THP
+ int ret;
+#else
unsigned long hstart, hend;
+#endif
cond_resched();
if (unlikely(khugepaged_test_exit(mm))) {
progress++;
break;
}
+#ifdef CONFIG_FINEGRAINED_THP
+ ret = khugepaged_scan_vma(mm, vma, hpage, pages, &progress);
+
+ if (ret == KHUGEPAGE_SCAN_CONTINUE) {
+ progress++;
+ continue;
+ } else if (ret == KHUGEPAGE_SCAN_BREAK)
+ goto breakouterloop;
+ else if (ret == KHUGEPAGE_SCAN_BREAK_MMAP_LOCK)
+ goto breakouterloop_mmap_lock;
+#else /* CONFIG_FINEGRAINED_THP */
if (!hugepage_vma_check(vma, vma->vm_flags)) {
skip:
progress++;
if (progress >= pages)
goto breakouterloop;
}
+#endif /* CONFIG_FINEGRAINED_THP */
}
breakouterloop:
mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
* khugepaged runs here, khugepaged_exit will find
* mm_slot not pointing to the exiting mm.
*/
+#ifdef CONFIG_FINEGRAINED_THP
+ if (!list_empty(&khugepaged_scan.hint_list)) {
+ unsigned long jiffies_diff;
+ long mem_diff;
+ struct thp_scan_hint *hint;
+ struct mm_slot *next_slot;
+
+get_next_hint2:
+ hint = get_next_hint();
+
+ if (!hint) {
+ /* no more hint */
+ if (mm_slot->mm_node.next != &khugepaged_scan.mm_head)
+ goto get_next_slot2;
+ else
+ goto end_loop;
+ }
+
+ mem_diff = hint->diff;
+ jiffies_diff = jiffies - hint->jiffies;
+ next_slot = hint->slot;
+ kfree(hint);
+
+ if (next_slot == mm_slot)
+ goto get_next_hint2;
+
+ if (!khugepaged_test_exit(next_slot->mm)) {
+ list_move(&next_slot->mm_node, &mm_slot->mm_node);
+ clear_hint_list(next_slot);
+ } else
+ goto get_next_hint2;
+
+ khugepaged_scan.mm_slot = next_slot;
+ khugepaged_scan.address = 0;
+ } else if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+get_next_slot2:
+ khugepaged_scan.mm_slot = list_entry(
+ mm_slot->mm_node.next,
+ struct mm_slot, mm_node);
+ clear_hint_list(khugepaged_scan.mm_slot);
+ khugepaged_scan.address = 0;
+ } else {
+end_loop:
+ khugepaged_scan.mm_slot = NULL;
+ khugepaged_full_scans++;
+ }
+#else /* CONFIG_FINEGRAINED_THP */
if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
khugepaged_scan.mm_slot = list_entry(
mm_slot->mm_node.next,
khugepaged_scan.mm_slot = NULL;
khugepaged_full_scans++;
}
-
+#endif /* CONFIG_FINEGRAINED_THP */
collect_mm_slot(mm_slot);
}
wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
+#include <linux/delay.h>
+bool eager_allocation = false;
+
static int khugepaged(void *none)
{
struct mm_slot *mm_slot;
if (!page)
continue;
+#ifdef CONFIG_FINEGRAINED_THP
+ if (pte_cont(ptent))
+ split_huge_pte_address(vma, addr, false, NULL);
+#endif
+
/*
* Creating a THP page is expensive so split it only if we
* are sure it's worth. Split it if we are only owner.
if (!page)
continue;
+#ifdef CONFIG_FINEGRAINED_THP
+ if (pte_cont(ptent))
+ split_huge_pte_address(vma, addr, false, NULL);
+#endif /* CONFIG_FINEGRAINED_THP */
+
/*
* If pmd isn't transhuge but the page is THP and
* is owned by only this process, split it and
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+#ifdef CONFIG_FINEGRAINED_THP
+#include <asm/huge_mm.h>
+#include <asm/finegrained_thp.h>
+#else
+#include <asm-generic/huge_mm.h>
+#include <asm-generic/finegrained_thp.h>
+#endif
#include "pgalloc-track.h"
#include "internal.h"
unsigned long highest_memmap_pfn __read_mostly;
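+/*
+ * Number of base pages currently mapped through contiguous-pte and
+ * pmd-level huge mappings of physical ranges (see remap_try_huge_pmd()
+ * and zap_pte_range()).
+ */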
+atomic_long_t nr_phys_cont_pte_pages;
+atomic_long_t nr_phys_huge_pmd_pages;
+
+unsigned long phys_cont_pte_pages(void)
+{
+ return atomic_long_read(&nr_phys_cont_pte_pages);
+}
+
+unsigned long phys_huge_pmd_pages(void)
+{
+ return atomic_long_read(&nr_phys_huge_pmd_pages);
+}
+
/*
* CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
*/
#endif /* SPLIT_RSS_COUNTING */
+#ifdef CONFIG_FINEGRAINED_THP
+void thp_print_cont_pte_table(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, unsigned long line);
+#endif /* CONFIG_FINEGRAINED_THP */
+
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
make_migration_entry_read(&entry);
pte = swp_entry_to_pte(entry);
+ pte = arch_pte_clearhuge(pte);
if (pte_swp_soft_dirty(*src_pte))
pte = pte_swp_mksoft_dirty(pte);
if (pte_swp_uffd_wp(*src_pte))
is_cow_mapping(vm_flags)) {
make_device_private_entry_read(&entry);
pte = swp_entry_to_pte(entry);
+ pte = arch_pte_clearhuge(pte);
if (pte_swp_uffd_wp(*src_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
if (!userfaultfd_wp(dst_vma))
pte = pte_swp_clear_uffd_wp(pte);
+ pte = arch_pte_clearhuge(pte);
set_pte_at(dst_mm, addr, dst_pte, pte);
return 0;
}
page = vm_normal_page(src_vma, addr, pte);
if (page) {
int retval;
+ /*
+ * When a 64KB hugepage mapping is copied,
+ * clear the contiguous bit so the child starts
+ * with ordinary base-page ptes.
+ */
+ pte = arch_pte_clearhuge(pte);
retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
addr, rss, prealloc, pte, page);
if (!userfaultfd_wp(dst_vma))
pte = pte_clear_uffd_wp(pte);
+ pte = arch_pte_clearhuge(pte);
+
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
progress++;
continue;
}
+
if (unlikely(!pte_present(*src_pte))) {
entry.val = copy_nonpresent_pte(dst_mm, src_mm,
dst_pte, src_pte,
progress += 8;
continue;
}
+
/* copy_present_pte() will clear `*prealloc' if consumed */
ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
addr, rss, &prealloc);
return 0;
}
+#ifdef CONFIG_FINEGRAINED_THP
+bool zap_cont_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, pte_t **ptep, unsigned long *addr,
+ unsigned long end, struct page *page,
+ int *rss, spinlock_t *ptl);
+#else /* CONFIG_FINEGRAINED_THP */
+bool zap_cont_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, pte_t **ptep, unsigned long *addr,
+ unsigned long end, struct page *page,
+ int *rss, spinlock_t *ptl)
+{
+ return false;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
details->check_mapping != page_rmapping(page))
continue;
}
+#ifdef CONFIG_FINEGRAINED_THP
+ if (page && pte_cont(ptent) && PageTransHuge(compound_head(page))) {
+ if (zap_cont_pte_range(tlb, vma, pmd, &pte,
+ &addr, end, page, rss, ptl)) {
+ force_flush = 1;
+ break;
+ }
+ } else if (pte_cont(ptent))
+ atomic_long_dec(&nr_phys_cont_pte_pages);
+#endif /* CONFIG_FINEGRAINED_THP */
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
+
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
* in null mappings (currently treated as "copy-on-access")
*/
+#ifdef CONFIG_FINEGRAINED_THP
+static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ return arch_remap_pte_range(mm, pmd, addr, end, pfn, prot);
+}
+#else /* CONFIG_FINEGRAINED_THP */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
err = -EACCES;
break;
}
+
set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
pfn++;
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ pte++;
+ addr += PAGE_SIZE;
+ } while (addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(mapped_pte, ptl);
return err;
}
+#endif /* CONFIG_FINEGRAINED_THP */
+
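+/*
+ * Map a pmd-sized, pmd-aligned chunk of a remap_pfn_range() request
+ * with a huge pmd when the physical address is also aligned; returns
+ * nonzero on success so the caller can skip the pte level.
+ */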
+static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+ unsigned long end, unsigned long pfn,
+ pgprot_t prot)
+{
+ phys_addr_t phys_addr = __pfn_to_phys(pfn);
+ spinlock_t *ptl;
+ int ret;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+ return 0;
+
+ /* FIXME: is clearing an already-populated pmd correct here? */
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) {
+ pr_info("%s %d - failed to free pmd page\n", __func__, __LINE__);
+ return 0;
+ }
+
+ ptl = pmd_lock(mm, pmd);
+ ret = pmd_set_huge(pmd, phys_addr, prot);
+ spin_unlock(ptl);
+
+ if (ret)
+ atomic_long_add(HPAGE_PMD_NR, &nr_phys_huge_pmd_pages);
+
+ return ret;
+}
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end,
VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
+
+ if (remap_try_huge_pmd(mm, pmd, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot))
+ continue;
+
err = remap_pte_range(mm, pmd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
return ret;
}
+extern bool eager_allocation;
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
+#ifdef CONFIG_FINEGRAINED_THP
+#ifndef CONFIG_THP_CONSERVATIVE
+ /*
+ * 64KB hugepage creation on page fault is only allowed
+ * in an aggressive policy or a near-conservative policy
+ */
+ if (__transparent_hugepage_enabled(vma)) {
+ ret = arch_do_huge_pte_anonymous_page(vmf);
+ if (!(ret & VM_FAULT_FALLBACK)) {
+ return ret;
+ }
+ ret = 0;
+ }
+#endif /* CONFIG_THP_CONSERVATIVE */
+#endif /* CONFIG_FINEGRAINED_THP */
+
page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
if (!page)
goto oom;
BUILD_BUG();
return 0;
}
+
+#ifdef CONFIG_FINEGRAINED_THP
+static vm_fault_t arch_do_set_huge_pte(struct vm_fault *vmf, struct page *page)
+{
+ BUILD_BUG();
+ return 0;
+}
+#endif
#endif
/**
pte_t entry;
vm_fault_t ret;
- if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
+ if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
+ compound_nr(compound_head(page)) == HPAGE_PMD_NR) {
ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
}
+#ifdef CONFIG_FINEGRAINED_THP
+ /* PageTransHuge() cannot identify a hugepage from a tail page, so check the size via compound_head() */
+ if (PageTransCompound(page) &&
+ compound_nr(compound_head(page)) == HPAGE_CONT_PTE_NR) {
+ ret = arch_do_set_huge_pte(vmf, page);
+ if (ret != VM_FAULT_FALLBACK)
+ return ret;
+ }
+#endif /* CONFIG_FINEGRAINED_THP */
+
if (!vmf->pte) {
ret = pte_alloc_one_map(vmf);
if (ret)
update_mmu_tlb(vma, vmf->address, vmf->pte);
return VM_FAULT_NOPAGE;
}
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
goto uncharge_out;
if (ret & VM_FAULT_DONE_COW)
return ret;
-
copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
__SetPageUptodate(vmf->cow_page);
return 0;
}
+#ifdef CONFIG_FINEGRAINED_THP
+static inline vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
+{
+ /*
+ * With finegrained THP, 2MB anonymous hugepages are not created at
+ * fault time; always report fallback so base pages (or, where the
+ * policy allows, 64KB hugepages) are used instead.
+ */
+ return VM_FAULT_FALLBACK;
+}
+#endif /* CONFIG_FINEGRAINED_THP */
+
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
if (vma_is_anonymous(vmf->vma))
+#ifdef CONFIG_FINEGRAINED_THP
+ return __do_huge_pmd_anonymous_page(vmf);
+#else
return do_huge_pmd_anonymous_page(vmf);
+#endif
if (vmf->vma->vm_ops->huge_fault)
return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
return VM_FAULT_FALLBACK;
return VM_FAULT_FALLBACK;
}
+#ifdef CONFIG_FINEGRAINED_THP
+vm_fault_t wp_huge_pte(struct vm_fault *vmf, pte_t orig_pte);
+#endif /* CONFIG_FINEGRAINED_THP */
+
static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
goto unlock;
}
if (vmf->flags & FAULT_FLAG_WRITE) {
- if (!pte_write(entry))
+ if (!pte_write(entry)) {
+ int ret = arch_do_wp_page(vmf, entry);
+
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
return do_wp_page(vmf);
+ }
+ if (arch_huge_pte_set_accessed(vmf, entry))
+ goto unlock;
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
page_dup_rmap(new, true);
} else
#endif
+#ifdef CONFIG_FINEGRAINED_THP
+ if (PageTransHuge(new)) {
+ pte = pte_mkcont(pte_mkhuge(pte));
+ arch_set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte, 0);
+ if (PageAnon(new))
+ page_add_anon_rmap(new, vma, pvmw.address, true);
+ else
+ page_dup_rmap(new, true);
+ } else
+#endif /* CONFIG_FINEGRAINED_THP */
{
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
+#ifdef CONFIG_FINEGRAINED_THP
+#include <asm/finegrained_thp.h>
+#else
+#include <asm-generic/finegrained_thp.h>
+#endif
#define CREATE_TRACE_POINTS
#include <trace/events/mmap.h>
success:
populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
+ if (newbrk > oldbrk)
+ khugepaged_mem_hook(mm, origbrk, newbrk - oldbrk, __func__);
if (downgraded)
mmap_read_unlock(mm);
else
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
+#ifdef CONFIG_FINEGRAINED_THP
+ if ((len >> PAGE_SHIFT) >= HPAGE_CONT_PTE_NR &&
+ file && addr == 0)
+ flags |= MAP_FILE_THP;
+#endif
+
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
allow_write_access(file);
}
file = vma->vm_file;
+ if (file && (vm_flags & VM_DENYWRITE))
+ /* read-only file pages */
+ khugepaged_mem_hook(mm, addr, len, __func__);
+ else if (!file && !vma->vm_ops)
+ /* anonymous pages */
+ khugepaged_mem_hook(mm, addr, len, __func__);
out:
perf_event_mmap(vma);
info.high_limit = mmap_end;
info.align_mask = 0;
info.align_offset = 0;
+
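+ /*
+ * For sufficiently large requests with no address hint, align the
+ * search so the mapping can later be backed by 2MB or 64KB
+ * hugepages without extra splitting.
+ */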
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (!addr && len >= HPAGE_PMD_SIZE) {
+ info.align_mask = HPAGE_PMD_SIZE - 1;
+ info.align_offset = HPAGE_PMD_SIZE;
+#ifdef CONFIG_FINEGRAINED_THP
+ } else if (!addr && len >= HPAGE_CONT_PTE_SIZE) {
+ info.align_mask = HPAGE_CONT_PTE_SIZE - 1;
+ info.align_offset = HPAGE_CONT_PTE_SIZE;
+#endif
+ }
+#endif
+
return vm_unmapped_area(&info);
}
#endif
info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
info.align_mask = 0;
info.align_offset = 0;
+
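+ /* Same hugepage-alignment bias for the top-down case. */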
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (!addr && len >= HPAGE_PMD_SIZE) {
+ info.align_mask = HPAGE_PMD_SIZE - 1;
+ info.align_offset = HPAGE_PMD_SIZE;
+#ifdef CONFIG_FINEGRAINED_THP
+ } else if (!addr && len >= HPAGE_CONT_PTE_SIZE) {
+ info.align_mask = HPAGE_CONT_PTE_SIZE - 1;
+ info.align_offset = HPAGE_CONT_PTE_SIZE;
+#endif
+ }
+#endif
+
addr = vm_unmapped_area(&info);
/*
pte_t ptent;
bool preserve_write = prot_numa && pte_write(oldpte);
+#ifdef CONFIG_FINEGRAINED_THP
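+ /*
+ * Split 64KB contiguous mappings back to base ptes before
+ * applying per-pte protection changes.
+ */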
+ if (pte_cont(oldpte)) {
+ spin_unlock(ptl);
+ __split_huge_pte(vma, pmd, pte, addr, false, NULL);
+ spin_lock(ptl);
+ }
+#endif /* CONFIG_FINEGRAINED_THP */
/*
* Avoid trapping faults against the zero or KSM
* pages. See similar comment in change_huge_pmd.
if (pte_none(*old_pte))
continue;
+#ifdef CONFIG_FINEGRAINED_THP
+ if (pte_cont(*old_pte)) {
+ /*
+ * The contiguous ptes are about to be moved and their new
+ * alignment cannot be guaranteed, so simply split them first.
+ */
+ split_huge_pte_address(vma, old_addr, false, NULL);
+ }
+#endif /* CONFIG_FINEGRAINED_THP */
+
pte = ptep_get_and_clear(mm, old_addr, old_pte);
/*
* If we are remapping a valid PTE, make sure
* disabled.
*/
if (compound)
+#ifdef CONFIG_FINEGRAINED_THP
+ {
+ if (nr == HPAGE_PMD_NR)
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
+ else
+ __inc_lruvec_page_state(page, NR_ANON_64KB_THPS);
+ }
+#else /* CONFIG_FINEGRAINED_THP */
__inc_lruvec_page_state(page, NR_ANON_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
}
atomic_set(compound_mapcount_ptr(page), 0);
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
-
+#ifdef CONFIG_FINEGRAINED_THP
+ if (nr == HPAGE_PMD_NR)
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
+ else
+ __inc_lruvec_page_state(page, NR_ANON_64KB_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
__inc_lruvec_page_state(page, NR_ANON_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
} else {
/* Anon THP always mapped first with PMD */
VM_BUG_ON_PAGE(PageTransCompound(page), page);
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out;
if (PageSwapBacked(page))
+#ifdef CONFIG_FINEGRAINED_THP
+ __inc_node_page_state(page, nr == HPAGE_PMD_NR ?
+ NR_SHMEM_PMDMAPPED : NR_SHMEM_PTEMAPPED);
+#else
__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+#endif
else
+#ifdef CONFIG_FINEGRAINED_THP
+ __inc_node_page_state(page, nr == HPAGE_PMD_NR ?
+ NR_FILE_PMDMAPPED : NR_FILE_PTEMAPPED);
+#else
__inc_node_page_state(page, NR_FILE_PMDMAPPED);
+#endif
} else {
if (PageTransCompound(page) && page_mapping(page)) {
VM_WARN_ON_ONCE(!PageLocked(page));
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
return;
if (PageSwapBacked(page))
+#ifdef CONFIG_FINEGRAINED_THP
+ __dec_node_page_state(page, nr == HPAGE_PMD_NR ?
+ NR_SHMEM_PMDMAPPED : NR_SHMEM_PTEMAPPED);
+#else
__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+#endif
else
+#ifdef CONFIG_FINEGRAINED_THP
+ __dec_node_page_state(page, nr == HPAGE_PMD_NR ?
+ NR_FILE_PMDMAPPED : NR_FILE_PTEMAPPED);
+#else
__dec_node_page_state(page, NR_FILE_PMDMAPPED);
+#endif
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
return;
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return;
+#ifdef CONFIG_FINEGRAINED_THP
+ if (thp_nr_pages(page) == HPAGE_PMD_NR)
+ __dec_lruvec_page_state(page, NR_ANON_THPS);
+ else
+ __dec_lruvec_page_state(page, NR_ANON_64KB_THPS);
+#else /* CONFIG_FINEGRAINED_THP */
__dec_lruvec_page_state(page, NR_ANON_THPS);
+#endif /* CONFIG_FINEGRAINED_THP */
if (TestClearPageDoubleMap(page)) {
/*
*/
__dec_lruvec_page_state(page, NR_ANON_MAPPED);
- if (unlikely(PageMlocked(page)))
- clear_page_mlock(page);
+ if (unlikely(PageMlocked(page))) {
+ if (unlikely(PageTransCompound(page)))
+ clear_page_mlock(compound_head(page));
+ else
+ clear_page_mlock(page);
+ }
if (PageTransCompound(page))
deferred_split_huge_page(compound_head(page));
flags & TTU_SPLIT_FREEZE, page);
}
+#ifdef CONFIG_FINEGRAINED_THP
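+ /* 64KB (contiguous-pte) mappings are split at the pte level before unmapping */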
+ if (flags & TTU_SPLIT_HUGE_PTE)
+ split_huge_pte_address(vma, address,
+ flags & TTU_SPLIT_FREEZE, page);
+#endif
+
/*
* For THP, we have to assume the worse case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
* do this outside rmap routines.
*/
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+#ifdef CONFIG_FINEGRAINED_THP
+ if (thp_nr_pages(page) == HPAGE_PMD_NR &&
+ huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+ /*
+ * huge_pmd_unshare unmapped an entire PMD
+ * page. There is no way of knowing exactly
+ * which PMDs may be cached for this mm, so
+ * we must flush them all. start/end were
+ * already adjusted above to cover this range.
+ */
+ flush_cache_range(vma, range.start, range.end);
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
+ /*
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+#else /* CONFIG_FINEGRAINED_THP */
if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
/*
* huge_pmd_unshare unmapped an entire PMD
page_vma_mapped_walk_done(&pvmw);
break;
}
+#endif /* CONFIG_FINEGRAINED_THP */
}
if (IS_ENABLED(CONFIG_MIGRATION) &&
return true;
/* Just proceed to delete a huge page wholly within the range punched */
+#ifdef CONFIG_FINEGRAINED_THP
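+ /* the huge page may be a 64KB THP, so use its actual compound size */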
+ if (PageHead(page) &&
+ page->index >= start && page->index + thp_nr_pages(page) <= end)
+ return true;
+#else
if (PageHead(page) &&
page->index >= start && page->index + HPAGE_PMD_NR <= end)
return true;
+#endif /* CONFIG_FINEGRAINED_THP */
/* Try to split huge page, so we can truly punch the hole or truncate */
return split_huge_page(page) >= 0;
clear_highpage(page);
flush_dcache_page(page);
set_page_dirty(page);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (index <
+ round_up(start, thp_nr_pages(page)))
+ start = index + 1;
+#else /* CONFIG_FINEGRAINED_THP */
if (index <
round_up(start, HPAGE_PMD_NR))
start = index + 1;
+#endif /* CONFIG_FINEGRAINED_THP */
}
}
unlock_page(page);
return page;
}
+#ifdef CONFIG_FINEGRAINED_THP
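+ /* page_nr selects the THP size: HPAGE_PMD_NR or HPAGE_CONT_PTE_NR */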
+static struct page *shmem_alloc_hugepage(gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index, int page_nr)
+#else /* CONFIG_FINEGRAINED_THP */
static struct page *shmem_alloc_hugepage(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
+#endif /* CONFIG_FINEGRAINED_THP */
{
struct vm_area_struct pvma;
struct address_space *mapping = info->vfs_inode.i_mapping;
pgoff_t hindex;
struct page *page;
+#ifdef CONFIG_FINEGRAINED_THP
+ hindex = round_down(index, page_nr);
+ if (xa_find(&mapping->i_pages, &hindex, hindex + page_nr - 1,
+ XA_PRESENT))
+ return NULL;
+#else /* CONFIG_FINEGRAINED_THP */
hindex = round_down(index, HPAGE_PMD_NR);
if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
XA_PRESENT))
return NULL;
+#endif /* CONFIG_FINEGRAINED_THP */
shmem_pseudo_vma_init(&pvma, info, hindex);
+#ifdef CONFIG_FINEGRAINED_THP
+ page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
+ page_nr == HPAGE_PMD_NR ? HPAGE_PMD_ORDER : HPAGE_CONT_PTE_ORDER,
+ &pvma, 0, numa_node_id(), true);
+#else /* CONFIG_FINEGRAINED_THP */
page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+#endif /* CONFIG_FINEGRAINED_THP */
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
return page;
}
+#ifdef CONFIG_FINEGRAINED_THP
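+ /* page_nr: number of base pages to charge and allocate when huge is true */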
+static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
+ struct inode *inode,
+ pgoff_t index, bool huge, int page_nr)
+#else /* CONFIG_FINEGRAINED_THP */
static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
struct inode *inode,
pgoff_t index, bool huge)
+#endif /* CONFIG_FINEGRAINED_THP */
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct page *page;
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
huge = false;
+#ifdef CONFIG_FINEGRAINED_THP
+ nr = huge ? page_nr : 1;
+#else
nr = huge ? HPAGE_PMD_NR : 1;
+#endif
if (!shmem_inode_acct_block(inode, nr))
goto failed;
if (huge)
+#ifdef CONFIG_FINEGRAINED_THP
+ page = shmem_alloc_hugepage(gfp, info, index, nr);
+#else
page = shmem_alloc_hugepage(gfp, info, index);
+#endif
else
page = shmem_alloc_page(gfp, info, index);
if (page) {
int error;
int once = 0;
int alloced = 0;
+#ifdef CONFIG_FINEGRAINED_THP
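+ /* assume a PMD-sized THP until an existing page reveals its actual size */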
+ int nr_pages = HPAGE_PMD_NR;
+#endif
if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
return -EFBIG;
if (page && sgp == SGP_WRITE)
mark_page_accessed(page);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (page)
+ nr_pages = thp_nr_pages(page);
+#endif
+
/* fallocated page? */
if (page && !PageUptodate(page)) {
if (sgp != SGP_READ)
case SHMEM_HUGE_WITHIN_SIZE: {
loff_t i_size;
pgoff_t off;
-
+#ifdef CONFIG_FINEGRAINED_THP
+ off = round_up(index, nr_pages);
+#else
off = round_up(index, HPAGE_PMD_NR);
+#endif
i_size = round_up(i_size_read(inode), PAGE_SIZE);
+#ifdef CONFIG_FINEGRAINED_THP
+ if (i_size >= nr_pages * PAGE_SIZE &&
+ i_size >> PAGE_SHIFT >= off)
+ goto alloc_huge;
+#else
if (i_size >= HPAGE_PMD_SIZE &&
i_size >> PAGE_SHIFT >= off)
goto alloc_huge;
+#endif
fallthrough;
}
}
alloc_huge:
+#ifdef CONFIG_FINEGRAINED_THP
+ page = shmem_alloc_and_acct_page(gfp, inode, index, true, nr_pages);
+#else
page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+#endif
if (IS_ERR(page)) {
alloc_nohuge:
+#ifdef CONFIG_FINEGRAINED_THP
+ page = shmem_alloc_and_acct_page(gfp, inode,
+ index, false, 1);
+#else
page = shmem_alloc_and_acct_page(gfp, inode,
index, false);
+#endif
}
if (IS_ERR(page)) {
int retry = 5;
}
if (PageTransHuge(page))
+#ifdef CONFIG_FINEGRAINED_THP
+ hindex = round_down(index, nr_pages);
+#else
hindex = round_down(index, HPAGE_PMD_NR);
+#endif
else
hindex = index;
spin_unlock_irq(&info->lock);
alloced = true;
+#ifdef CONFIG_FINEGRAINED_THP
+ if (PageTransHuge(page) &&
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
+ hindex + nr_pages - 1) {
+ /*
+ * Part of the huge page is beyond i_size: subject
+ * to shrink under memory pressure.
+ */
+ spin_lock(&sbinfo->shrinklist_lock);
+ /*
+ * _careful to defend against unlocked access to
+ * ->shrink_list in shmem_unused_huge_shrink()
+ */
+ if (list_empty_careful(&info->shrinklist)) {
+ list_add_tail(&info->shrinklist,
+ &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ }
+ spin_unlock(&sbinfo->shrinklist_lock);
+ }
+#else /* CONFIG_FINEGRAINED_THP */
if (PageTransHuge(page) &&
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
hindex + HPAGE_PMD_NR - 1) {
}
spin_unlock(&sbinfo->shrinklist_lock);
}
-
+#endif /* CONFIG_FINEGRAINED_THP */
/*
* Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
*/
struct page *head = compound_head(page);
if (PageTransCompound(page)) {
int i;
-
+#ifdef CONFIG_FINEGRAINED_THP
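+ /* clear every subpage of the compound page except the one being written */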
+ for (i = 0; i < thp_nr_pages(page); i++) {
+ if (head + i == page)
+ continue;
+ clear_highpage(head + i);
+ flush_dcache_page(head + i);
+ }
+#else /* CONFIG_FINEGRAINED_THP */
for (i = 0; i < HPAGE_PMD_NR; i++) {
if (head + i == page)
continue;
clear_highpage(head + i);
flush_dcache_page(head + i);
}
+#endif /* CONFIG_FINEGRAINED_THP */
}
if (copied < PAGE_SIZE) {
unsigned from = pos & (PAGE_SIZE - 1);
if (i_size >= HPAGE_PMD_SIZE &&
i_size >> PAGE_SHIFT >= off)
return true;
+#ifdef CONFIG_FINEGRAINED_THP
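+ /* a file too small for a PMD-sized THP may still fit a 64KB one */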
+ off = round_up(vma->vm_pgoff, HPAGE_CONT_PTE_NR);
+ if (i_size >= HPAGE_CONT_PTE_SIZE &&
+ i_size >> PAGE_SHIFT >= off)
+ return true;
+#endif /* CONFIG_FINEGRAINED_THP */
fallthrough;
case SHMEM_HUGE_ADVISE:
/* TODO: implement fadvise() hints */
if (PageTransHuge(page)) {
if (IS_ENABLED(CONFIG_THP_SWAP))
+#ifdef CONFIG_FINEGRAINED_THP
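+ /* size the swap allocation to the actual compound page */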
+ get_swap_pages(1, &entry, thp_nr_pages(page));
+#else
get_swap_pages(1, &entry, HPAGE_PMD_NR);
+#endif
goto out;
}
}
if (map)
ci = lock_cluster(si, offset);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+#ifdef CONFIG_FINEGRAINED_THP
+ for (i = 0; i < thp_nr_pages(page); i++)
+#else
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+#endif
+ {
mapcount = atomic_read(&page[i]._mapcount) + 1;
_total_mapcount += mapcount;
if (map) {
unlock_cluster(ci);
if (PageDoubleMap(page)) {
map_swapcount -= 1;
+#ifdef CONFIG_FINEGRAINED_THP
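+ /* PageDoubleMap adds one to each subpage mapcount; back that out here */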
+ _total_mapcount -= thp_nr_pages(page);
+#else
_total_mapcount -= HPAGE_PMD_NR;
+#endif
}
mapcount = compound_mapcount(page);
map_swapcount += mapcount;
unlock_page(page);
continue;
} else if (PageTransHuge(page)) {
+#ifdef CONFIG_FINEGRAINED_THP
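+ /* skip over the remaining subpages of this compound page */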
+ index += thp_nr_pages(page) - 1;
+ i += thp_nr_pages(page) - 1;
+#else /* CONFIG_FINEGRAINED_THP */
index += HPAGE_PMD_NR - 1;
i += HPAGE_PMD_NR - 1;
+#endif /* CONFIG_FINEGRAINED_THP */
/*
* 'end' is in the middle of THP. Don't
* invalidate the page as the part outside of
bool was_swapbacked = PageSwapBacked(page);
if (unlikely(PageTransHuge(page)))
+#ifdef CONFIG_FINEGRAINED_THP
+ {
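+ /* pick the split flag matching the THP size */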
+ if (nr_pages == HPAGE_PMD_NR)
+ flags |= TTU_SPLIT_HUGE_PMD;
+ else
+ flags |= TTU_SPLIT_HUGE_PTE;
+ }
+#else /* CONFIG_FINEGRAINED_THP */
flags |= TTU_SPLIT_HUGE_PMD;
+#endif /* CONFIG_FINEGRAINED_THP */
if (!try_to_unmap(page, flags)) {
stat->nr_unmap_fail += nr_pages;
"nr_shmem",
"nr_shmem_hugepages",
"nr_shmem_pmdmapped",
+#ifdef CONFIG_FINEGRAINED_THP
+ "nr_shmem_ptemapped",
+ "nr_file_64kb_hugepages",
+#endif
"nr_file_hugepages",
"nr_file_pmdmapped",
+#ifdef CONFIG_FINEGRAINED_THP
+ "nr_file_ptemapped",
+#endif
"nr_anon_transparent_hugepages",
+#ifdef CONFIG_FINEGRAINED_THP
+ "nr_anon_64KB_transparent_hugepages",
+#endif
"nr_vmscan_write",
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"thp_split_page",
"thp_split_page_failed",
"thp_deferred_split_page",
+#ifdef CONFIG_FINEGRAINED_THP
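+ /* splits of cont-PTE (64KB) huge mappings */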
+ "thp_split_cont_pte",
+#endif
"thp_split_pmd",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
"thp_split_pud",