/*
 * Hugepage support for arm64 architecture
 */
/* core headers assumed by this file; a few clearly-needed ones restored */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/mmu_notifier.h>
#include <linux/memcontrol.h>
#include <linux/huge_mm.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/khugepaged.h>
#include <linux/userfaultfd_k.h>
#include <linux/oom.h>

#include <asm/huge_mm.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#ifdef CONFIG_FINEGRAINED_THP
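/*
 * ptep_huge_clear_flush - clear a contiguous-PTE (cont-pte) hugepage
 * mapping and flush the TLB for the whole range. Returns the head PTE
 * value, mirroring pmdp_huge_clear_flush() for the cont-pte case.
 */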
pte_t ptep_huge_clear_flush(struct vm_area_struct *vma,
			    unsigned long address, pte_t *ptep)
{
	pte_t pte;
	int i;

	VM_BUG_ON(address & ~HPAGE_CONT_PTE_MASK);
	VM_BUG_ON(!pte_cont(*ptep));
	pte = ptep_get_and_clear(vma->vm_mm, address, ptep);

	for (i = 1; i < HPAGE_CONT_PTE_NR; i++)
		ptep_get_and_clear(vma->vm_mm, address + PAGE_SIZE * i, ptep + i);

	flush_tlb_range(vma, address, address + HPAGE_CONT_PTE_SIZE);

	return pte;
}
#define USE_THP_PRINT_CONT_TABLE
#ifdef USE_THP_PRINT_CONT_TABLE
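/*
 * Debugging aid: dump all HPAGE_CONT_PTE_NR entries of a cont-pte table,
 * tagged with the calling line number and the owning task, if any.
 */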
void thp_print_cont_pte_table(struct mm_struct *mm,
			      unsigned long addr, pte_t *ptep, unsigned long line)
{
	int i;
	int pid = 0;

	if (mm->owner) {
		pr_info("THP: %s from %lu proc-%d(%s)\n", __func__, line,
			task_pid_nr(mm->owner), mm->owner->comm);
		pid = task_pid_nr(mm->owner);
	} else
		pr_info("THP: %s from %lu\n", __func__, line);

	for (i = 0; i < HPAGE_CONT_PTE_NR; i++, ptep++, addr += PAGE_SIZE)
		pr_info("%lx: %llx pid(%d)\n", addr, pte_val(*ptep), pid);
}
#else
void thp_print_cont_pte_table(struct mm_struct *mm,
			      unsigned long addr, pte_t *ptep, unsigned long line)
{
}
#endif /* USE_THP_PRINT_CONT_TABLE */
/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	return GFP_TRANSHUGE_LIGHT;
}
/*
 * The caller must hold the page table locks of both dst and src.
 */
int copy_huge_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pte_t *dst_pte, pte_t *src_pte, unsigned long haddr,
		  struct vm_area_struct *vma, int *rss)
{
	pte_t pte = *src_pte;
	pte_t *_pte = src_pte;
	struct page *src_page;
	unsigned long addr = haddr;

	src_page = vm_normal_page(vma, addr, pte);
	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);

	get_page(src_page);
	page_dup_rmap(src_page, true);
	rss[MM_ANONPAGES] += HPAGE_CONT_PTE_NR;
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR);

	/* Write-protect every source pte so both sides fault on write (COW) */
	while (addr < haddr + HPAGE_CONT_PTE_SIZE) {
		ptep_set_wrprotect(src_mm, addr, _pte);
		addr += PAGE_SIZE;
		_pte++;
	}

	pte = pte_mkold(pte_wrprotect(pte));
	arm64_set_huge_pte_at(dst_mm, haddr, dst_pte, pte, 0);

	return 0;
}
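/*
 * Map a compound page at a fault address with a single contiguous-PTE
 * hugepage mapping. Falls back (VM_FAULT_FALLBACK) when the VMA, the file
 * offset, or the existing page table contents do not allow a hugepage.
 */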
vm_fault_t arm64_do_set_huge_pte(struct vm_fault *vmf, struct page *page)
{
	pte_t entry;
	int i;
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
	pgoff_t index, pgoff, addroff, headoff;
	vm_fault_t ret = VM_FAULT_FALLBACK;

	if (!transhuge_adv_vma_suitable(vma, haddr))
		return VM_FAULT_FALLBACK;

	page = compound_head(page);
	/* file offsets: the compound head vs. the faulting page */
	index = page->index;
	pgoff = vmf->pgoff;
	addroff = (vmf->address - haddr) >> PAGE_SHIFT;

	if (pgoff - index != addroff)
		return VM_FAULT_FALLBACK;

	/*
	 * Archs like ppc64 need additional space to store information
	 * related to pte entry. Use the preallocated table for that.
	 */
	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
		smp_wmb(); /* See comment in __pte_alloc() */
	}

	if (unlikely(pmd_none(*vmf->pmd))) {
		if (pte_alloc(vma->vm_mm, vmf->pmd))
			return VM_FAULT_OOM;
	}

	/* The head offset indicates the position of the first page in the hugepage */
	headoff = (addroff + (HPAGE_CONT_PTE_NR - pgoff)) % HPAGE_CONT_PTE_NR;
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, haddr, &vmf->ptl);
	if (!vmf->pte || unlikely(!pte_none(*vmf->pte))) {
		spin_unlock(vmf->ptl);
		return ret;
	}

	entry = arm64_make_huge_pte(compound_head(page), vma);
	if (write)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);

	for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
		flush_icache_page(vma, page + i);

	if (write && !(vma->vm_flags & VM_SHARED)) {
		add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_CONT_PTE_NR);
		page_add_new_anon_rmap(page, vma, haddr, true);
	} else {
		add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_CONT_PTE_NR);
		page_add_file_rmap(page, true);
	}

	arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte, entry, headoff);
	update_mmu_cache_pmd(vma, haddr, vmf->pmd);
	count_vm_event(THP_FILE_MAPPED);

	ret = 0;
	spin_unlock(vmf->ptl);
	return ret;
}
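/*
 * COW fallback: the hugepage could not be reused or replaced by another
 * hugepage, so copy it into HPAGE_CONT_PTE_NR freshly allocated small
 * pages and map those individually.
 */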
static vm_fault_t arm64_do_huge_pte_wp_page_fallback(struct vm_fault *vmf,
			pte_t orig_pte, struct page *page)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
	int i;
	vm_fault_t ret = 0;
	struct page **pages;
	struct mmu_notifier_range range;

	pages = kmalloc_array(HPAGE_CONT_PTE_NR, sizeof(struct page *),
			      GFP_KERNEL);
	if (unlikely(!pages)) {
		ret |= VM_FAULT_OOM;
		goto out;
	}

	for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
		pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
					  haddr + PAGE_SIZE * i);
		if (unlikely(!pages[i] ||
			     mem_cgroup_charge(pages[i], vma->vm_mm,
					       GFP_KERNEL))) {
			if (pages[i])
				put_page(pages[i]);
			while (--i >= 0)
				put_page(pages[i]);
			kfree(pages);
			ret |= VM_FAULT_OOM;
			goto out;
		}
	}

	for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
		copy_user_highpage(pages[i], page + i,
				   haddr + PAGE_SIZE * i, vma);
		__SetPageUptodate(pages[i]);
		cond_resched();
	}

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				haddr, haddr + HPAGE_CONT_PTE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pte_same(*vmf->pte, orig_pte)))
		goto out_free_pages;
	VM_BUG_ON_PAGE(!PageHead(page), page);

	/*
	 * Leave pmd empty until pte is filled. Note we must notify here as
	 * a concurrent CPU thread might write to the new page before the
	 * call to mmu_notifier_invalidate_range_end() happens, which can
	 * lead to a device seeing memory writes in a different order than
	 * the CPU.
	 *
	 * See Documentation/vm/mmu_notifier.rst
	 */
	vmf->pte = pte_offset_map(vmf->pmd, haddr);
	ptep_huge_clear_flush_notify(vma, haddr, vmf->pte);

	for (i = 0; i < HPAGE_CONT_PTE_NR; i++, haddr += PAGE_SIZE) {
		pte_t entry;

		entry = mk_pte(pages[i], vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		set_page_private(pages[i], 0);
		page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
		lru_cache_add_inactive_or_unevictable(pages[i], vma);
		vmf->pte = pte_offset_map(vmf->pmd, haddr);
		VM_BUG_ON(!pte_none(*vmf->pte));
		set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
	}
	kfree(pages);

	smp_wmb(); /* make pte visible before pmd */
	page_remove_rmap(page, true);
	spin_unlock(vmf->ptl);

	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above ptep_huge_clear_flush_notify() did already call it.
	 */
	mmu_notifier_invalidate_range_only_end(&range);

	ret |= VM_FAULT_WRITE;
	put_page(page);

out:
	return ret;

out_free_pages:
	spin_unlock(vmf->ptl);
	mmu_notifier_invalidate_range_end(&range);
	for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
		set_page_private(pages[i], 0);
		put_page(pages[i]);
	}
	kfree(pages);
	goto out;
}
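/*
 * Handle a write-protect fault on a cont-pte hugepage: reuse the page if
 * we are the only mapper, otherwise allocate a new hugepage and copy, and
 * as a last resort fall back to small pages or split the mapping.
 */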
vm_fault_t arm64_do_huge_pte_wp_page(struct vm_fault *vmf, pte_t orig_pte)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = NULL, *new_page;
	unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
	struct mmu_notifier_range range;
	gfp_t huge_gfp;			/* for allocation and charge */
	vm_fault_t ret = 0;

	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);

	spin_lock(vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, orig_pte))) {
		spin_unlock(vmf->ptl);
		return 0;
	}

	page = pte_page(orig_pte);
	VM_BUG_ON_PAGE(!PageCompound(page), page);
	page = compound_head(page);
	/*
	 * We can only reuse the page if nobody else maps the huge page or
	 * its part.
	 */
	if (!trylock_page(page)) {
		get_page(page);
		spin_unlock(vmf->ptl);
		lock_page(page);
		spin_lock(vmf->ptl);
		if (unlikely(!pte_same(*vmf->pte, orig_pte))) {
			spin_unlock(vmf->ptl);
			unlock_page(page);
			put_page(page);
			return 0;
		}
		put_page(page);
	}
	if (reuse_swap_page(page, NULL)) {
		huge_cont_pte_set_accessed(vmf, orig_pte);
		unlock_page(page);
		spin_unlock(vmf->ptl);
		return VM_FAULT_WRITE;
	}
	unlock_page(page);
	get_page(page);
	spin_unlock(vmf->ptl);

	/*
	 * For a 2MB hugepage, the kernel simply splits it into
	 * standard-sized pages and falls back to the normal page fault
	 * handling path.
	 *
	 * For a 64KB hugepage, allocate-on-COW can be a performance win:
	 * copying the contents of a 2MB page is expensive, but a 64KB page
	 * is much smaller, so the copying overhead should be tolerable.
	 *
	 * TODO: account the time overhead of the procedure below.
	 */
#ifdef CONFIG_THP_CONSERVATIVE
	goto fallback;
#endif
	if (__transparent_hugepage_enabled(vma)) {
		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr,
					      HPAGE_CONT_PTE_ORDER);
	} else
		new_page = NULL;

	if (likely(new_page)) {
		prep_transhuge_page(new_page);
	} else {
		if (!page) {
			split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address);
			ret |= VM_FAULT_FALLBACK;
		} else {
			ret = arm64_do_huge_pte_wp_page_fallback(vmf, orig_pte, page);
			if (ret & VM_FAULT_OOM) {
				split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address);
				ret |= VM_FAULT_FALLBACK;
			}
			put_page(page);
		}
		count_vm_event(THP_FAULT_FALLBACK);
		goto out;
	}

	if (unlikely(mem_cgroup_charge(new_page, vma->vm_mm, huge_gfp))) {
		put_page(new_page);
		split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address);
		if (page)
			put_page(page);
		ret |= VM_FAULT_FALLBACK;
		count_vm_event(THP_FAULT_FALLBACK);
		goto out;
	}

	count_vm_event(THP_FAULT_ALLOC);
	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);

	if (!page)
		clear_huge_page(new_page, vmf->address, HPAGE_CONT_PTE_NR);
	else
		copy_user_huge_page(new_page, page, vmf->address,
				    vma, HPAGE_CONT_PTE_NR);
	__SetPageUptodate(new_page);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				haddr, haddr + HPAGE_CONT_PTE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	spin_lock(vmf->ptl);
	if (page)
		put_page(page);
	if (unlikely(!pte_same(*vmf->pte, orig_pte))) {
		spin_unlock(vmf->ptl);
		mem_cgroup_uncharge(new_page);
		put_page(new_page);
		goto out_mn;
	} else {
		pte_t entry;

		entry = arm64_make_huge_pte(new_page, vma);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);

		vmf->pte = pte_offset_map(vmf->pmd, haddr);
		ptep_huge_clear_flush_notify(vma, haddr, vmf->pte);
		page_add_new_anon_rmap(new_page, vma, haddr, true);
		lru_cache_add_inactive_or_unevictable(new_page, vma);

		arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte, entry, 0);
		update_mmu_cache(vma, vmf->address, vmf->pte);
		if (!page) {
			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR);
		} else {
			VM_BUG_ON_PAGE(!PageHead(page), page);
			page_remove_rmap(page, true);
			put_page(page);
		}
		ret |= VM_FAULT_WRITE;
	}
	spin_unlock(vmf->ptl);
out_mn:
	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above ptep_huge_clear_flush_notify() did already call it.
	 */
	mmu_notifier_invalidate_range_only_end(&range);
out:
	return ret;
#ifdef CONFIG_THP_CONSERVATIVE
fallback:
	__split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address, false, NULL);
	return VM_FAULT_FALLBACK;
#endif /* CONFIG_THP_CONSERVATIVE */
}
/* The caller must hold the page table lock. */
vm_fault_t arm64_wp_huge_pte(struct vm_fault *vmf, pte_t orig_pte)
{
	unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
	pte_t *hpte_p;

	if (vma_is_anonymous(vmf->vma)) {
		spin_unlock(vmf->ptl);
		return arm64_do_huge_pte_wp_page(vmf, orig_pte);
	}

	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);

	hpte_p = pte_offset_map(vmf->pmd, haddr);
	spin_unlock(vmf->ptl);
	__split_huge_pte(vmf->vma, vmf->pmd, hpte_p, haddr, false, NULL);

	return VM_FAULT_FALLBACK;
}
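/* Return nonzero if any of the HPAGE_CONT_PTE_NR entries is already in use. */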
static inline int check_huge_pte_range(pte_t *head)
{
	int i;

	for (i = 0; i < HPAGE_CONT_PTE_NR; i++, head++) {
		if (!pte_none(*head))
			return 1;
	}
	return 0;
}
void thp_print_cont_pte_table(struct mm_struct *mm,
			      unsigned long addr, pte_t *ptep, unsigned long line);
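/*
 * Charge, clear and map a freshly allocated anonymous hugepage at the
 * faulting address using a contiguous-PTE mapping.
 */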
static vm_fault_t __do_huge_pte_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long offset, haddr = vmf->address & HPAGE_CONT_PTE_MASK;
	vm_fault_t ret = 0;
	pte_t entry;

	VM_BUG_ON_PAGE(!PageCompound(page), page);

	if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
	}
	cgroup_throttle_swaprate(page, gfp);

	clear_huge_page(compound_head(page), haddr, HPAGE_CONT_PTE_NR);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__SetPageUptodate(page);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	ret = check_stable_address_space(vma->vm_mm);
	if (ret)
		goto unlock_release;

	if (userfaultfd_missing(vma)) {
		spin_unlock(vmf->ptl);
		put_page(page);
		ret = handle_userfault(vmf, VM_UFFD_MISSING);
		VM_BUG_ON(ret & VM_FAULT_FALLBACK);
		return ret;
	}

	entry = arm64_make_huge_pte(page, vma);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	offset = (vmf->address - haddr) >> PAGE_SHIFT;
	vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
	if (!pte_none(*vmf->pte)) {
		ret = VM_FAULT_FALLBACK;
		goto unlock_release;
	}
	if (check_huge_pte_range(vmf->pte - offset)) {
		ret = VM_FAULT_FALLBACK;
		goto unlock_release;
	}

	page_add_new_anon_rmap(page, vma, haddr, true);
	lru_cache_add_inactive_or_unevictable(page, vma);
	arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte - offset, entry, 0);
	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR);
	spin_unlock(vmf->ptl);

	count_vm_event(THP_FAULT_ALLOC);
	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
	return 0;

unlock_release:
	spin_unlock(vmf->ptl);
	put_page(page);
	return ret;
}
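/*
 * Entry point for anonymous cont-pte hugepage faults: validate the VMA,
 * allocate a hugepage and hand it to __do_huge_pte_anonymous_page().
 */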
vm_fault_t arm64_do_huge_pte_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
	spinlock_t *ptl;
	gfp_t gfp;

	if (!transhuge_adv_vma_suitable(vma, haddr))
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
		return VM_FAULT_OOM;
	/* Cont-pte mappings of the huge zero page are not supported */
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		return VM_FAULT_FALLBACK;
	}

	ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	vmf->pte = pte_offset_map(vmf->pmd, haddr);
	if (check_huge_pte_range(vmf->pte)) {
		spin_unlock(ptl);
		return VM_FAULT_FALLBACK;
	}
	spin_unlock(ptl);

	gfp = alloc_hugepage_direct_gfpmask(vma);
	page = alloc_hugepage_vma(gfp, vma, haddr,
				  HPAGE_CONT_PTE_ORDER);
	if (unlikely(!page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	prep_transhuge_page(page);
	return __do_huge_pte_anonymous_page(vmf, page, gfp);
}
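/*
 * Zap a cont-pte mapping found while unmapping a range. If the hugepage
 * lies entirely inside the range it is torn down in one go; otherwise it
 * is split so the caller can zap the remaining small ptes. Returns true
 * if a whole hugepage worth of ptes was covered.
 */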
bool zap_cont_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
			pmd_t *pmd, pte_t **ptep, unsigned long *addr,
			unsigned long end, struct page *page,
			int *rss, spinlock_t *ptl)
{
	struct mm_struct *mm = tlb->mm;
	unsigned long haddr = (*addr) & HPAGE_CONT_PTE_MASK;
	unsigned long range_end =
		((haddr + HPAGE_CONT_PTE_SIZE) > end) ? end :
					haddr + HPAGE_CONT_PTE_SIZE;
	size_t size = range_end - haddr;
	unsigned long map_count = size >> PAGE_SHIFT;
	pte_t *pte;

	pte = pte_offset_map(pmd, haddr);

	if ((*addr) == haddr && haddr + HPAGE_CONT_PTE_SIZE <= range_end) {
		/* The whole hugepage lies inside the zapped range */
		arm64_clear_and_flush(mm, *addr, pte, PAGE_SIZE, map_count);
		page_remove_rmap(compound_head(page), true);
		rss[mm_counter(page)] -= map_count;
		__tlb_adjust_range(tlb, *addr, size);
		__tlb_remove_tlb_entry(tlb, pte, *addr);
		tlb_remove_page_size(tlb, page, size);
		*addr = end - PAGE_SIZE;
	} else {
		/* Partial overlap: split and let the caller zap small ptes */
		if (haddr < vma->vm_start) {
			pr_err("haddr(%lx) is less than vm start(%lx)\n",
			       haddr, vma->vm_start);
			thp_print_cont_pte_table(mm, haddr, pte, __LINE__);
		}
		__split_huge_pte(vma, pmd, pte, haddr, false, NULL);
	}

	return map_count == HPAGE_CONT_PTE_NR;
}
/* The caller must hold the appropriate page table lock. */
void huge_cont_pte_set_accessed(struct vm_fault *vmf, pte_t orig_pte)
{
	pte_t entry, *pte;
	unsigned long haddr;
	int i;
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	haddr = vmf->address & HPAGE_CONT_PTE_MASK;
	pte = pte_offset_map(vmf->pmd, haddr);

	for (i = 0; i < HPAGE_CONT_PTE_NR; i++, pte++, haddr += PAGE_SIZE) {
		entry = pte_mkyoung(*pte);
		if (write)
			entry = pte_mkwrite(pte_mkdirty(entry));
		ptep_set_access_flags(vmf->vma, haddr, pte, entry, write);
	}
	update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
}
/*
 * FOLL_FORCE can write to even unwritable pmd's, but only
 * after we've gone through a COW cycle and they are dirty.
 */
static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
{
	return pmd_write(pmd) ||
	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
}
extern void mlock_vma_page(struct page *page);
extern void clear_page_mlock(struct page *page);
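/*
 * GUP helper for cont-pte mappings, modelled on follow_trans_huge_pmd():
 * returns the subpage for @addr, honouring the FOLL_* flags.
 */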
struct page *follow_trans_huge_pte(struct vm_area_struct *vma,
				   unsigned long addr,
				   pmd_t *pmd,
				   unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page = NULL;
	pte_t *pte;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
		goto out;

	/* Avoid dumping huge zero page */
	if ((flags & FOLL_DUMP))
		return ERR_PTR(-EFAULT);

	/* Full NUMA hinting faults to serialise migration in fault paths */
	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
		goto out;

	pte = pte_offset_map(pmd, addr);
	page = pte_page(*pte);
	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);

	if (!try_grab_page(page, flags))
		return ERR_PTR(-ENOMEM);

	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/*
		 * We don't mlock() pte-mapped THPs. This way we can avoid
		 * leaking mlocked pages into non-VM_LOCKED VMAs.
		 *
		 * In most cases the pmd is the only mapping of the page as we
		 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
		 * writable private mappings in populate_vma_page_range().
		 *
		 * The only scenario when we have the page shared here is if we
		 * mlocking read-only mapping shared over fork(). We skip
		 * mlocking such pages.
		 *
		 * We can expect PageDoubleMap() to be stable under page lock:
		 * for file pages we set it in page_add_file_rmap(), which
		 * requires page to be locked.
		 */

		if (PageAnon(page) && compound_mapcount(page) != 1)
			goto skip_mlock;
		if (PageDoubleMap(page) || !page->mapping)
			goto skip_mlock;
		if (!trylock_page(page))
			goto skip_mlock;
		if (page->mapping && !PageDoubleMap(page))
			mlock_vma_page(page);
		unlock_page(page);
	}
skip_mlock:
	page += (addr & ~HPAGE_CONT_PTE_MASK) >> PAGE_SHIFT;
	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);

out:
	return page;
}
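/* Atomically clear PTE_VALID and return the old pte value. */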
static inline pte_t ptep_invalidate(struct vm_area_struct *vma,
				    unsigned long address, pte_t *ptep)
{
	return __pte(xchg_relaxed(&pte_val(*ptep),
				  pte_val(*ptep) & ~PTE_VALID));
}
extern atomic_long_t nr_phys_cont_pte_pages;
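/*
 * Try to remap a CONT_PTE_SIZE-aligned block with a single contiguous-PTE
 * special mapping; returns 1 on success, 0 if the range or the physical
 * address is unsuitable.
 */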
static int remap_try_huge_pte(struct mm_struct *mm, pte_t *pte, unsigned long addr,
			      unsigned long end, unsigned long pfn,
			      pgprot_t prot)
{
	phys_addr_t phys_addr = __pfn_to_phys(pfn);
	pte_t entry;

	if ((end - addr) != CONT_PTE_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, CONT_PTE_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, CONT_PTE_SIZE))
		return 0;

	entry = pte_mkspecial(pte_mkcont(pte_mkhuge(pfn_pte(pfn, prot))));
	arch_set_huge_pte_at(mm, addr, pte, entry, 0);

	atomic_long_add(HPAGE_CONT_PTE_NR, &nr_phys_cont_pte_pages);

	return 1;
}
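/*
 * remap_pte_range() counterpart that opportunistically uses cont-pte
 * mappings for suitably aligned CONT_PTE_SIZE blocks.
 */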
int arm64_remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			  unsigned long addr, unsigned long end,
			  unsigned long pfn, pgprot_t prot)
{
	pte_t *pte, *mapped_pte;
	spinlock_t *ptl;
	unsigned long next;
	int err = 0;

	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(*pte));
		if (!pfn_modify_allowed(pfn, prot)) {
			err = -EACCES;
			break;
		}
		next = pte_cont_addr_end(addr, end);
		if (remap_try_huge_pte(mm, pte, addr, next, pfn, prot)) {
			pte += HPAGE_CONT_PTE_NR;
			pfn += HPAGE_CONT_PTE_NR;
			addr += HPAGE_CONT_PTE_SIZE;
		} else {
			set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
			pte++;
			pfn++;
			addr += PAGE_SIZE;
		}
	} while (addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(mapped_pte, ptl);
	return err;
}
/* The caller must hold the appropriate lock (pmd lock). */
int change_huge_pte(struct vm_area_struct *vma, pte_t *pte,
		    unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = addr;
	pte_t entry;
	bool preserve_write;
	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
	int i, ret = 0;

	preserve_write = prot_numa && pte_write(*pte);

	/*
	 * NUMA cases are not handled yet; the checks below are kept
	 * for future work.
	 */
	if (prot_numa && is_huge_zero_page(pte_page(*pte)))
		return ret;

	if (prot_numa && pte_protnone(*pte))
		return ret;

	for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
		entry = ptep_invalidate(vma, addr, pte);
		entry = pte_modify(entry, newprot);
		if (preserve_write)
			entry = pte_mk_savedwrite(entry);
		entry = pte_mkcont(entry);
		set_pte_at(mm, addr, pte, entry);
		pte++;
		addr += PAGE_SIZE;
	}

	/* flush the whole contiguous range, not the post-loop address */
	flush_tlb_range(vma, start, start + HPAGE_CONT_PTE_SIZE);
	ret = HPAGE_CONT_PTE_NR;
	return ret;
}
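/*
 * Demote one cont-pte hugepage mapping to HPAGE_CONT_PTE_NR individual
 * ptes (or migration entries when freezing), adjusting mapcounts and
 * statistics accordingly. Called with the page table lock held.
 */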
static void __split_huge_pte_locked(struct vm_area_struct *vma, pte_t *pte,
				    unsigned long haddr, bool freeze)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	pte_t old_pte, _pte;
	bool young, write, soft_dirty, pte_migration = false, uffd_wp = false;
	unsigned long addr;
	int i;

	VM_BUG_ON(haddr & ~HPAGE_CONT_PTE_MASK);
	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_CONT_PTE_SIZE, vma);

	count_vm_event(THP_SPLIT_CONT_PTE);

	if (!vma_is_anonymous(vma)) {
		_pte = ptep_huge_clear_flush_notify(vma, haddr, pte);
		page = pte_page(_pte);
		if (!PageDirty(page) && pte_dirty(_pte))
			set_page_dirty(page);
		if (!PageReferenced(page) && pte_young(_pte))
			SetPageReferenced(page);
		page_remove_rmap(page, true);
		put_page(page);
		add_mm_counter(mm, mm_counter_file(page), -HPAGE_CONT_PTE_NR);
		return;
	} else if (is_huge_zero_page(pte_page(*pte))) {
		pr_err("contiguous pte mapping for zero anon pages is not supported yet\n");
		return;
	}

	old_pte = ptep_huge_clear_flush_notify(vma, haddr, pte);

	pte_migration = is_pte_migration_entry(old_pte);
	if (unlikely(pte_migration)) {
		swp_entry_t entry;

		entry = pte_to_swp_entry(old_pte);
		page = pfn_to_page(swp_offset(entry));
		write = is_write_migration_entry(entry);
		young = false;
		soft_dirty = pte_swp_soft_dirty(old_pte);
		uffd_wp = pte_swp_uffd_wp(old_pte);
	} else {
		page = pte_page(old_pte);
		if (pte_dirty(old_pte))
			set_page_dirty(page);
		write = pte_write(old_pte);
		young = pte_young(old_pte);
		soft_dirty = pte_soft_dirty(old_pte);
		uffd_wp = pte_uffd_wp(old_pte);
	}
	VM_BUG_ON_PAGE(!page_count(page), page);
	page_ref_add(page, HPAGE_CONT_PTE_NR - 1);

	for (i = 0, addr = haddr; i < HPAGE_CONT_PTE_NR;
	     i++, addr += PAGE_SIZE, pte++) {
		pte_t entry;

		if (freeze || pte_migration) {
			swp_entry_t swp_entry;

			swp_entry = make_migration_entry(page + i, write);
			entry = swp_entry_to_pte(swp_entry);
			if (soft_dirty)
				entry = pte_swp_mksoft_dirty(entry);
			if (uffd_wp)
				entry = pte_swp_mkuffd_wp(entry);
		} else {
			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
			entry = maybe_mkwrite(entry, vma);
			if (!write)
				entry = pte_wrprotect(entry);
			if (!young)
				entry = pte_mkold(entry);
			if (soft_dirty)
				entry = pte_mksoft_dirty(entry);
			if (uffd_wp)
				entry = pte_mkuffd_wp(entry);
		}
		set_pte_at(mm, addr, pte, entry);
		if (!pte_migration)
			atomic_inc(&page[i]._mapcount);
	}

	if (!pte_migration) {
		/*
		 * Set PG_double_map before dropping compound_mapcount to avoid
		 * false-negative page_mapped().
		 */
		if (compound_mapcount(page) > 1 &&
		    !TestSetPageDoubleMap(page)) {
			for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
				atomic_inc(&page[i]._mapcount);
		}

		lock_page_memcg(page);
		if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
			/* Last compound_mapcount is gone. */
			__dec_lruvec_page_state(page, NR_ANON_64KB_THPS);
			if (TestClearPageDoubleMap(page)) {
				/* No need in mapcount reference anymore */
				for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
					atomic_dec(&page[i]._mapcount);
			}
		}
		unlock_page_memcg(page);
	}

	if (freeze) {
		for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
			page_remove_rmap(page + i, false);
			put_page(page + i);
		}
	}
}
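/*
 * Lock and split a cont-pte hugepage mapping at @address, taking care of
 * page locking for anonymous pages and of mmu notifiers.
 */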
void __split_huge_pte(struct vm_area_struct *vma, pmd_t *pmd,
		      pte_t *pte, unsigned long address,
		      bool freeze, struct page *page)
{
	spinlock_t *ptl;
	struct mmu_notifier_range range;
	pte_t _pte;
	bool locked = false;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				address & HPAGE_CONT_PTE_MASK,
				(address & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	ptl = pmd_lock(vma->vm_mm, pmd);

	if (page) {
		VM_WARN_ON_ONCE(!PageLocked(page));
		if (page != pte_page(*pte))
			goto out;
	}

	if (pte_cont(*pte)) {
		if (!page) {
			page = pte_page(*pte);
			/*
			 * An anonymous page must be locked, to ensure that a
			 * concurrent reuse_swap_page() sees stable mapcount;
			 * but reuse_swap_page() is not used on shmem or file,
			 * and page lock must not be taken when zap_pte_range()
			 * calls __split_huge_pte() while i_mmap_lock is held.
			 */
			if (PageAnon(page)) {
				if (unlikely(!trylock_page(page))) {
					get_page(page);
					_pte = *pte;
					spin_unlock(ptl);
					lock_page(page);
					spin_lock(ptl);
					if (unlikely(!pte_same(*pte, _pte))) {
						unlock_page(page);
						put_page(page);
						page = NULL;
						goto out;
					}
					put_page(page);
				}
				locked = true;
			}
		}
		if (PageMlocked(page))
			clear_page_mlock(page);
	} else if (!(pte_devmap(*pte) || is_pte_migration_entry(*pte)))
		goto out;

	__split_huge_pte_locked(vma, pte, range.start, freeze);

out:
	spin_unlock(ptl);
	if (locked && page)
		unlock_page(page);
	mmu_notifier_invalidate_range_only_end(&range);
}
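/* Walk the page tables and split the cont-pte mapping covering @address. */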
void split_huge_pte_address(struct vm_area_struct *vma, unsigned long address,
			    bool freeze, struct page *page)
{
	unsigned long haddr = address & HPAGE_CONT_PTE_MASK;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset(vma->vm_mm, haddr);
	if (!pgd_present(*pgd))
		return;

	p4d = p4d_offset(pgd, haddr);
	if (!p4d_present(*p4d))
		return;

	pud = pud_offset(p4d, haddr);
	if (!pud_present(*pud))
		return;

	pmd = pmd_offset(pud, haddr);
	if (!pmd_present(*pmd))
		return;

	pte = pte_offset_map(pmd, haddr);
	if (!pte_present(*pte))
		return;

	__split_huge_pte(vma, pmd, pte, haddr, freeze, page);
}
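/*
 * Replace a cont-pte mapping with migration entries, one per pte slot,
 * as the cont-pte analogue of set_pmd_migration_entry().
 */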
void set_huge_pte_migration_entry(
		struct page_vma_mapped_walk *pvmw,
		struct page *page)
{
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	pte_t pteval, pteswp, *pte;
	swp_entry_t entry;
	int i;
	struct page *_page = page;

	if (!(pvmw->pmd && pvmw->pte))
		return;

	flush_cache_range(vma, address, address + HPAGE_CONT_PTE_SIZE);

	for (i = 0, pte = pvmw->pte; i < HPAGE_CONT_PTE_NR; i++, pte++) {
		pteval = ptep_invalidate(vma, address, pte);
		if (pte_dirty(pteval))
			set_page_dirty(_page);
		entry = make_migration_entry(page, pte_write(pteval));
		pteswp = swp_entry_to_pte(entry);
		if (pte_soft_dirty(pteval))
			pteswp = pte_swp_mksoft_dirty(pteswp);
		set_pte_at(mm, address, pte, pteswp);
		address += PAGE_SIZE;
	}
	pvmw->address = address;

	page_remove_rmap(page, true);
	put_page(page);
}
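/*
 * Restore a cont-pte mapping once migration has finished, replacing the
 * migration entries with a mapping of @new.
 */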
void remove_migration_huge_pte(
		struct page_vma_mapped_walk *pvmw, struct page *new)
{
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	unsigned long mmun_start = address & HPAGE_CONT_PTE_MASK;
	pte_t ptee;
	swp_entry_t entry;

	if (!(pvmw->pmd && pvmw->pte))
		return;

	entry = pte_to_swp_entry(*pvmw->pte);
	get_page(new);
	ptee = pte_mkold(arch_make_huge_pte(new, vma));
	if (pte_swp_soft_dirty(*pvmw->pte))
		ptee = pte_mksoft_dirty(ptee);
	if (is_write_migration_entry(entry))
		ptee = maybe_mkwrite(ptee, vma);

	flush_cache_range(vma, mmun_start, mmun_start + HPAGE_CONT_PTE_SIZE);
	if (PageAnon(new))
		page_add_anon_rmap(new, vma, mmun_start, true);
	else
		page_add_file_rmap(new, true);

	arch_set_huge_pte_at(mm, mmun_start, pvmw->pte, ptee, 0);
	if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
		mlock_vma_page(new);
	pvmw->address = address + HPAGE_CONT_PTE_SIZE;
	pvmw->pte = pvmw->pte + HPAGE_CONT_PTE_NR;
	update_mmu_cache_pmd(vma, address, pvmw->pmd);
}
#endif /* CONFIG_FINEGRAINED_THP */