mm, thp, migrate: handling migration of 64KB hugepages
[platform/kernel/linux-rpi.git] / arch/arm64/mm/huge_memory.c
1 /*
2  * Hugepage support for arm64 architecture
3  *
4  * 21.08.07.
5  *
6  */
7
8 #include <linux/huge_mm.h>
9 #include <linux/rmap.h>
10 #include <linux/swap.h>
11 #include <linux/swapops.h>
12 #include <linux/khugepaged.h>
13 #include <linux/userfaultfd_k.h>
14 #include <linux/oom.h>
15
16 #include <asm/huge_mm.h>
17 #include <asm/pgalloc.h>
18 #include <asm/tlb.h>
19 #include <asm/tlbflush.h>
20
21 #ifdef CONFIG_FINEGRAINED_THP
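/*
 * For reference (sketch only; the real definitions are assumed to come
 * from asm/huge_mm.h, which is not part of this file): with 4KB base
 * pages an arm64 contiguous-PTE run covers 16 entries, i.e. one 64KB
 * hugepage:
 *
 *   HPAGE_CONT_PTE_NR    = 16
 *   HPAGE_CONT_PTE_SIZE  = HPAGE_CONT_PTE_NR * PAGE_SIZE    (64KB)
 *   HPAGE_CONT_PTE_MASK  = ~(HPAGE_CONT_PTE_SIZE - 1)
 *   HPAGE_CONT_PTE_ORDER = 4
 */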
22 pte_t ptep_huge_clear_flush(struct vm_area_struct *vma,
23                                 unsigned long address, pte_t *ptep)
24 {
25         pte_t pte;
26         int i;
27
28         VM_BUG_ON(address & ~HPAGE_CONT_PTE_MASK);
29         VM_BUG_ON(!pte_cont(*ptep));
30         pte = ptep_get_and_clear(vma->vm_mm, address, ptep);
31
32         for (i = 1; i < HPAGE_CONT_PTE_NR; i++)
33                 ptep_get_and_clear(vma->vm_mm, address + PAGE_SIZE * i, ptep + i);
34
35         flush_tlb_range(vma, address, address + HPAGE_CONT_PTE_SIZE);
36         return pte;
37 }
38
39 #define USE_THP_PRINT_CONT_TABLE
40 #ifdef USE_THP_PRINT_CONT_TABLE
41 void thp_print_cont_pte_table(struct mm_struct *mm,
42                         unsigned long addr, pte_t *ptep, unsigned long line)
43 {
44         int i, pid = 0;
45
46         if (mm->owner) {
47                 pr_info("THP: %s from %lu proc-%d(%s)\n", __func__, line,
48                                 task_pid_nr(mm->owner), mm->owner->comm);
49                 pid = task_pid_nr(mm->owner);
50         } else
51                 pr_info("THP: %s from %lu\n", __func__, line);
52         for (i = 0; i < HPAGE_CONT_PTE_NR; i++, ptep++, addr += PAGE_SIZE) {
53                 pr_info("%lx: %llx pid(%d)\n", addr, pte_val(*ptep), pid);
54         }
55 }
56 #else
57 void thp_print_cont_pte_table(struct mm_struct *mm,
58                         unsigned long addr, pte_t *ptep, unsigned long line)
59 {}
60 #endif /* USE_THP_PRINT_CONT_TABLE */
61
62 /*
63  * always: directly stall for all thp allocations
64  * defer: wake kswapd and fail if not immediately available
65  * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
66  *                fail if not immediately available
67  * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
68  *          available
69  * never: never stall for any thp allocation
70  */
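/* Policy selected via, e.g.: echo defer+madvise > /sys/kernel/mm/transparent_hugepage/defrag */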
71 static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
72 {
73         const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
74
75         /* Always do synchronous compaction */
76         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
77                 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
78
79         /* Kick kcompactd and fail quickly */
80         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
81                 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
82
83         /* Synchronous compaction if madvised, otherwise kick kcompactd */
84         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
85                 return GFP_TRANSHUGE_LIGHT |
86                         (vma_madvised ? __GFP_DIRECT_RECLAIM :
87                                         __GFP_KSWAPD_RECLAIM);
88
89         /* Only do synchronous compaction if madvised */
90         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
91                 return GFP_TRANSHUGE_LIGHT |
92                        (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
93
94         return GFP_TRANSHUGE_LIGHT;
95 }
96
97 /*
98  * The caller must hold the page table locks of both dst and src.
99  */
100 int copy_huge_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
101                   pte_t *dst_pte, pte_t *src_pte, unsigned long haddr,
102                   struct vm_area_struct *vma, int *rss)
103 {
104         struct page *src_page;
105         unsigned long addr = haddr;
106         pte_t pte, *_pte;
107
108         pte = *src_pte;
109
110         src_page = vm_normal_page(vma, addr, pte);
111         if (!src_page)
112                 return -EAGAIN;
113
114         VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
115         get_page(src_page);
116         page_dup_rmap(src_page, true);
117         if (rss)
118                 rss[MM_ANONPAGES] += HPAGE_CONT_PTE_NR;
119         else
120                 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR);
121
122         _pte = src_pte;
123         while (addr < haddr + HPAGE_CONT_PTE_SIZE) {
124                 ptep_set_wrprotect(src_mm, addr, _pte++);
125                 addr += PAGE_SIZE;
126         }
127         pte = pte_mkold(pte_wrprotect(pte));
128         arm64_set_huge_pte_at(dst_mm, haddr, dst_pte, pte, 0);
129
130         return 0;
131 }
132
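/*
 * Presumably reached from the generic file-backed fault path (e.g. via
 * finish_fault()) when the page found for the fault belongs to a 64KB
 * compound page; the exact caller is an assumption, it is not in this
 * file.  Returns VM_FAULT_FALLBACK when a 64KB mapping cannot be made.
 */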
133 vm_fault_t arm64_do_set_huge_pte(struct vm_fault *vmf, struct page *page)
134 {
135         int i;
136         pte_t entry;
137         struct vm_area_struct *vma = vmf->vma;
138         bool write = vmf->flags & FAULT_FLAG_WRITE;
139         unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
140         pgoff_t index, pgoff, addroff, headoff;
141         vm_fault_t ret = VM_FAULT_FALLBACK;
142
143         if (!transhuge_adv_vma_suitable(vma, haddr))
144                 return VM_FAULT_FALLBACK;
145
146         page = compound_head(page);
147         index = page->index;
148         pgoff = vmf->pgoff;
149         addroff = (vmf->address - haddr) >> PAGE_SHIFT;
150
151         if (pgoff - index != addroff)
152                 return VM_FAULT_FALLBACK;
153
154         /*
155          * Archs like ppc64 need additional space to store information
156          * related to the pte entry. Use the preallocated table for that.
157          */
158         if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
159                 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
160                 if (!vmf->prealloc_pte)
161                         return VM_FAULT_OOM;
162                 smp_wmb(); /* See comment in __pte_alloc() */
163         }
164
165         if (unlikely(pmd_none(*vmf->pmd))) {
166                 if (pte_alloc(vma->vm_mm, vmf->pmd))
167                         return VM_FAULT_OOM;
168                 smp_wmb();
169         }
170
171         /* The head offset indicates the position of the first page in the hugepage */
172         headoff = (addroff + (HPAGE_CONT_PTE_NR - pgoff)) % HPAGE_CONT_PTE_NR;
173         vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, haddr, &vmf->ptl);
174         if (!vmf->pte || unlikely(!pte_none(*vmf->pte))) {
175                 spin_unlock(vmf->ptl);
176                 vmf->pte = NULL;
177                 return ret;
178         }
179
180         entry = arm64_make_huge_pte(compound_head(page), vma);
181         if (write)
182                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
183         for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
184                 flush_icache_page(vma, page + i);
185         if (write && !(vma->vm_flags & VM_SHARED)) {
186                 add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_CONT_PTE_NR);
187                 if (PageAnon(page))
188                         page_add_new_anon_rmap(page, vma, haddr, true);
189         } else {
190                 add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_CONT_PTE_NR);
191                 page_add_file_rmap(page, true);
192         }
193
194         arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte, entry, headoff);
195         update_mmu_cache_pmd(vma, haddr, vmf->pmd);
196         count_vm_event(THP_FILE_MAPPED);
197         return 0;
198 }
199
200 static vm_fault_t arm64_do_huge_pte_wp_page_fallback(struct vm_fault *vmf,
201                         pte_t orig_pte, struct page *page)
202 {
203         struct vm_area_struct *vma = vmf->vma;
204         unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
205         int i;
206         vm_fault_t ret = 0;
207         struct page **pages;
208         struct mmu_notifier_range range;
209
210         pages = kmalloc_array(HPAGE_CONT_PTE_NR, sizeof(struct page *),
211                               GFP_KERNEL);
212         if (unlikely(!pages)) {
213                 ret |= VM_FAULT_OOM;
214                 goto out;
215         }
216
217         for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
218                 pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
219                                                vmf->address);
220                 if (unlikely(!pages[i] ||
221                              mem_cgroup_charge(pages[i], vma->vm_mm,
222                                      GFP_KERNEL))) {
223                         if (pages[i])
224                                 put_page(pages[i]);
225                         while (--i >= 0) {
226                                 put_page(pages[i]);
227                         }
228                         kfree(pages);
229                         ret |= VM_FAULT_OOM;
230                         goto out;
231                 }
232         }
233
234         for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
235                 copy_user_highpage(pages[i], page + i,
236                                    haddr + PAGE_SIZE * i, vma);
237                 __SetPageUptodate(pages[i]);
238                 cond_resched();
239         }
240
241         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
242                                 haddr, haddr + HPAGE_CONT_PTE_SIZE);
243         mmu_notifier_invalidate_range_start(&range);
244
245         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
246         if (unlikely(!pte_same(*vmf->pte, orig_pte)))
247                 goto out_free_pages;
248         VM_BUG_ON_PAGE(!PageHead(page), page);
249
250         /*
251          * Leave the huge mapping empty until the ptes are filled. Note we must
252          * notify here, as a concurrent CPU thread might write to the new pages
253          * before mmu_notifier_invalidate_range_end() is called, which can lead
254          * to a device seeing memory writes in a different order than the CPU.
255          *
256          * See Documentation/vm/mmu_notifier.rst
257          */
258         vmf->pte = pte_offset_map(vmf->pmd, haddr);
259         ptep_huge_clear_flush_notify(vma, haddr, vmf->pte);
260
261         for (i = 0; i < HPAGE_CONT_PTE_NR; i++, haddr += PAGE_SIZE) {
262                 pte_t entry;
263                 entry = mk_pte(pages[i], vma->vm_page_prot);
264                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
265                 set_page_private(pages[i], 0);
266
267                 page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
268                 lru_cache_add_inactive_or_unevictable(pages[i], vma);
269                 vmf->pte = pte_offset_map(vmf->pmd, haddr);
270                 VM_BUG_ON(!pte_none(*vmf->pte));
271                 set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
272                 pte_unmap(vmf->pte);
273         }
274         kfree(pages);
275
276         smp_wmb(); /* make pte visible before pmd */
277         page_remove_rmap(page, true);
278         spin_unlock(vmf->ptl);
279
280         /*
281          * No need to double call mmu_notifier->invalidate_range() callback as
282          * the above ptep_huge_clear_flush_notify() already called it.
283          */
284         mmu_notifier_invalidate_range_only_end(&range);
285
286         ret |= VM_FAULT_WRITE;
287         put_page(page);
288
289 out:
290         return ret;
291
292 out_free_pages:
293         spin_unlock(vmf->ptl);
294         mmu_notifier_invalidate_range_end(&range);
295         for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
296                 set_page_private(pages[i], 0);
297                 put_page(pages[i]);
298         }
299         kfree(pages);
300         goto out;
301 }
302
303 vm_fault_t arm64_do_huge_pte_wp_page(struct vm_fault *vmf, pte_t orig_pte)
304 {
305         struct vm_area_struct *vma = vmf->vma;
306         struct page *page = NULL, *new_page;
307         unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
308         struct mmu_notifier_range range;
309         gfp_t huge_gfp;                 /* for allocation and charge */
310         vm_fault_t ret = 0;
311
312         vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
313         VM_BUG_ON_VMA(!vma->anon_vma, vma);
314
315         spin_lock(vmf->ptl);
316         if (unlikely(!pte_same(*vmf->pte, orig_pte))) {
317                 spin_unlock(vmf->ptl);
318                 return ret;
319         }
320
321         page = pte_page(orig_pte);
322         VM_BUG_ON_PAGE(!PageCompound(page), page);
323         page = compound_head(page);
324         /*
325          * We can only reuse the page if nobody else maps the huge page or it's
326          * part.
327          */
328         if (!trylock_page(page)) {
329                 get_page(page);
330                 spin_unlock(vmf->ptl);
331                 lock_page(page);
332                 spin_lock(vmf->ptl);
333                 if (unlikely(!pte_same(*vmf->pte, orig_pte))) {
334                         spin_unlock(vmf->ptl);
335                         unlock_page(page);
336                         put_page(page);
337                         return 0;
338                 }
339                 put_page(page);
340         }
341
342         if (reuse_swap_page(page, NULL)) {
343                 huge_cont_pte_set_accessed(vmf, orig_pte);
344                 unlock_page(page);
345                 spin_unlock(vmf->ptl);
346                 return VM_FAULT_WRITE;
347         }
348         unlock_page(page);
349         get_page(page);
350         spin_unlock(vmf->ptl);
351
352         /*
353          * For a 2MB hugepage, the kernel simply splits it
354          * into standard-sized pages and falls back to the
355          * normal page fault handling path.
356          *
357          * For a 64KB hugepage, allocating a new hugepage on
358          * CoW should give a performance benefit: copying the
359          * contents of a 2MB page takes significant time, but
360          * a 64KB page is much smaller, so the copy overhead
361          * should be negligible.
362          *
363          * TODO: account for the time overhead of the
364          * procedure below.
365          */
366 #ifdef CONFIG_THP_CONSERVATIVE
367          goto fallback;
368 #endif
369         if (__transparent_hugepage_enabled(vma)) {
370                 huge_gfp = alloc_hugepage_direct_gfpmask(vma);
371                 new_page = alloc_hugepage_vma(huge_gfp, vma, haddr,
372                                 HPAGE_CONT_PTE_ORDER);
373         } else
374                 new_page = NULL;
375
376         if (likely(new_page)) {
377                 prep_transhuge_page(new_page);
378         } else {
379                 if (!page) {
380                         split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address);
381                         ret |= VM_FAULT_FALLBACK;
382                 } else {
383                         ret = arm64_do_huge_pte_wp_page_fallback(vmf, orig_pte, page);
384                         if (ret & VM_FAULT_OOM) {
385                                 split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address);
386                                 ret |= VM_FAULT_FALLBACK;
387                         }
388                         put_page(page);
389                 }
390                 count_vm_event(THP_FAULT_FALLBACK);
391                 goto out;
392         }
393
394         if (unlikely(mem_cgroup_charge(new_page, vma->vm_mm,
395                                         huge_gfp))) {
396                 put_page(new_page);
397                 split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address);
398                 if (page)
399                         put_page(page);
400                 ret |= VM_FAULT_FALLBACK;
401                 count_vm_event(THP_FAULT_FALLBACK);
402                 goto out;
403         }
404
405         count_vm_event(THP_FAULT_ALLOC);
406         count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
407
408         if (!page)
409                 clear_huge_page(new_page, vmf->address, HPAGE_CONT_PTE_NR);
410         else
411                 copy_user_huge_page(new_page, page, vmf->address,
412                                     vma, HPAGE_CONT_PTE_NR);
413         __SetPageUptodate(new_page);
414
415         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
416                                 haddr, haddr + HPAGE_CONT_PTE_SIZE);
417         mmu_notifier_invalidate_range_start(&range);
418
419         spin_lock(vmf->ptl);
420         if (page)
421                 put_page(page);
422         if (unlikely(!pte_same(*vmf->pte, orig_pte))) {
423                 spin_unlock(vmf->ptl);
424                 mem_cgroup_uncharge(new_page);
425                 put_page(new_page);
426                 goto out_mn;
427         } else {
428                 pte_t entry;
429
430                 entry = arm64_make_huge_pte(new_page, vma);
431                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
432
433                 vmf->pte = pte_offset_map(vmf->pmd, haddr);
434
435                 page_add_new_anon_rmap(new_page, vma, haddr, true);
436                 lru_cache_add_inactive_or_unevictable(new_page, vma);
437
438                 arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte, entry, 0);
439                 update_mmu_cache(vma, vmf->address, vmf->pte);
440
441                 if (!page) {
442                         add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR);
443                 } else {
444                         VM_BUG_ON_PAGE(!PageHead(page), page);
445                         page_remove_rmap(page, true);
446                         put_page(page);
447                 }
448                 ret |= VM_FAULT_WRITE;
449         }
450         spin_unlock(vmf->ptl);
451 out_mn:
452         /*
453          * No need to double call mmu_notifier->invalidate_range() callback as
454          * the above pmdp_huge_clear_flush_notify() did already call it.
455          */
456         mmu_notifier_invalidate_range_only_end(&range);
457 out:
458         return ret;
459 #ifdef CONFIG_THP_CONSERVATIVE
460 fallback:
461         __split_huge_pte(vma, vmf->pmd, vmf->pte, vmf->address, false, NULL);
462         return VM_FAULT_FALLBACK;
463 #endif /* CONFIG_THP_CONSERVATIVE */
464 }
465
466 /* The caller must hold the page table lock (vmf->ptl). */
467 vm_fault_t arm64_wp_huge_pte(struct vm_fault *vmf, pte_t orig_pte)
468 {
469         unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
470         pte_t *hpte_p;
471
472         if (vma_is_anonymous(vmf->vma)) {
473                 spin_unlock(vmf->ptl);
474                 return arm64_do_huge_pte_wp_page(vmf, orig_pte);
475         }
476
477         VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
478
479         hpte_p = pte_offset_map(vmf->pmd, haddr);
480         spin_unlock(vmf->ptl);
481         __split_huge_pte(vmf->vma, vmf->pmd, hpte_p, haddr, false, NULL);
482         spin_lock(vmf->ptl);
483
484         return VM_FAULT_FALLBACK;
485 }
486
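/*
 * Return 1 if any of the HPAGE_CONT_PTE_NR entries starting at @head is
 * already populated, 0 if the whole 64KB range is still empty.
 */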
487 static inline int check_huge_pte_range(pte_t *head)
488 {
489         int i;
490
491         for (i = 0; i < HPAGE_CONT_PTE_NR; i++, head++) {
492                 if (!pte_none(*head))
493                         return 1;
494         }
495         return 0;
496 }
497
498 void thp_print_cont_pte_table(struct mm_struct *mm,
499                         unsigned long addr, pte_t *ptep, unsigned long line);
500
501 static vm_fault_t __do_huge_pte_anonymous_page(struct vm_fault *vmf,
502                         struct page *page, gfp_t gfp)
503 {
504         struct vm_area_struct *vma = vmf->vma;
505         unsigned long offset, haddr = vmf->address & HPAGE_CONT_PTE_MASK;
506         pte_t entry;
507         vm_fault_t ret = 0;
508
509         VM_BUG_ON_PAGE(!PageCompound(page), page);
510
511         if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
512                 put_page(page);
513                 count_vm_event(THP_FAULT_FALLBACK);
514                 count_vm_event(THP_FAULT_FALLBACK_CHARGE);
515                 return VM_FAULT_FALLBACK;
516         }
517         cgroup_throttle_swaprate(page, gfp);
518
519         clear_huge_page(compound_head(page), haddr, HPAGE_CONT_PTE_NR);
520         /*
521          * The memory barrier inside __SetPageUptodate makes sure that
522          * clear_huge_page writes become visible before the set_pmd_at()
523          * write.
524          */
525         __SetPageUptodate(page);
526
527         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
528         ret = check_stable_address_space(vma->vm_mm);
529         if (ret)
530                 goto unlock_release;
531
532         if (userfaultfd_missing(vma)) {
533                 spin_unlock(vmf->ptl);
534                 put_page(page);
535                 ret = handle_userfault(vmf, VM_UFFD_MISSING);
536                 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
537                 return ret;
538         }
539
540         entry = arm64_make_huge_pte(page, vma);
541         entry = maybe_mkwrite(pte_mkdirty(entry), vma);
542         offset = (vmf->address - haddr) >> PAGE_SHIFT;
543         vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
544         if (!pte_none(*vmf->pte)) {
545                 ret = VM_FAULT_FALLBACK;
546                 goto unlock_release;
547         }
548         if (check_huge_pte_range(vmf->pte - offset)) {
549                 /* Recheck the whole 64KB range under the lock. */
550                 /* TODO: copy instead of falling back? */
551                 ret = VM_FAULT_FALLBACK;
552                 goto unlock_release;
553         }
554
555         page_add_new_anon_rmap(page, vma, haddr, true);
556         lru_cache_add_inactive_or_unevictable(page, vma);
557         arm64_set_huge_pte_at(vma->vm_mm, haddr, vmf->pte - offset, entry, 0);
558         add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_CONT_PTE_NR);
559
560         spin_unlock(vmf->ptl);
561
562         count_vm_event(THP_FAULT_ALLOC);
563         count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
564
565         return 0;
566
567 unlock_release:
568         spin_unlock(vmf->ptl);
569         put_page(page);
570
571         return ret;
572 }
573
574 vm_fault_t arm64_do_huge_pte_anonymous_page(struct vm_fault *vmf)
575 {
576         struct vm_area_struct *vma = vmf->vma;
577         struct page *page;
578         unsigned long haddr = vmf->address & HPAGE_CONT_PTE_MASK;
579         spinlock_t *ptl;
580         gfp_t gfp;
581
582         if (!transhuge_adv_vma_suitable(vma, haddr))
583                 return VM_FAULT_FALLBACK;
584         if (unlikely(anon_vma_prepare(vma)))
585                 return VM_FAULT_OOM;
586         if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
587                 return VM_FAULT_OOM;
588         if (!(vmf->flags & FAULT_FLAG_WRITE) &&
589                         !mm_forbids_zeropage(vma->vm_mm) &&
590                         transparent_hugepage_use_zero_page()) {
591                 return VM_FAULT_FALLBACK;
592         }
593         ptl = pmd_lock(vma->vm_mm, vmf->pmd);
594         vmf->pte = pte_offset_map(vmf->pmd, haddr);
595         if (check_huge_pte_range(vmf->pte)) {
596                 pte_unmap(vmf->pte);
597                 spin_unlock(ptl);
598                 return VM_FAULT_FALLBACK;
599         }
600         pte_unmap(vmf->pte);
601         spin_unlock(ptl);
602
603         gfp = alloc_hugepage_direct_gfpmask(vma);
604         page = alloc_hugepage_vma(gfp, vma,
605                                 haddr,
606                                 HPAGE_CONT_PTE_ORDER);
607         if (unlikely(!page)) {
608                 count_vm_event(THP_FAULT_FALLBACK);
609                 return VM_FAULT_FALLBACK;
610         }
611         prep_transhuge_page(page);
612         return __do_huge_pte_anonymous_page(vmf, page, gfp);
613 }
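/*
 * Sketch only (assumption, not part of this patch): how the generic
 * anonymous fault path is expected to consume the helper above, trying
 * the 64KB mapping first and treating VM_FAULT_FALLBACK as "map a
 * single 4KB page instead".  The function name is hypothetical.
 */
static vm_fault_t example_handle_anon_fault(struct vm_fault *vmf)
{
        vm_fault_t ret;

        /* Try a 64KB contiguous-PTE mapping first. */
        ret = arm64_do_huge_pte_anonymous_page(vmf);
        if (!(ret & VM_FAULT_FALLBACK))
                return ret;

        /* Let the caller map a single base page via do_anonymous_page(). */
        return VM_FAULT_FALLBACK;
}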
614
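/*
 * Tear down one contiguous-PTE mapping from the zap (munmap/exit) path.
 * When the zap range covers the whole 64KB block, all 16 entries are
 * cleared and *addr/*ptep are advanced past the block so the caller's
 * loop continues behind it; a partially covered block is split back to
 * base pages instead.
 */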
615 bool zap_cont_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
616                         pmd_t *pmd, pte_t **ptep, unsigned long *addr,
617                         unsigned long end, struct page *page,
618                         int *rss, spinlock_t *ptl)
619 {
620         struct mm_struct *mm = tlb->mm;
621         unsigned long haddr = (*addr) & HPAGE_CONT_PTE_MASK;
622         unsigned long range_end =
623                 ((haddr + HPAGE_CONT_PTE_SIZE) > end) ? end :
624                 haddr + HPAGE_CONT_PTE_SIZE;
625         size_t size = range_end - haddr;
626         unsigned long map_count = size >> PAGE_SHIFT;
627         pte_t *pte;
628
629         pte = pte_offset_map(pmd, haddr);
630
631         if ((*addr) == haddr && haddr + HPAGE_CONT_PTE_SIZE <= range_end) {
632                 arm64_clear_and_flush(mm, *addr, pte, PAGE_SIZE, map_count);
633                 page_remove_rmap(compound_head(page), true);
634                 rss[mm_counter(page)] -= map_count;
635                 __tlb_adjust_range(tlb, *addr, size);
636                 __tlb_remove_tlb_entry(tlb, pte, *addr);
637                 tlb_remove_page_size(tlb, page, size);
638
639                 *addr += size;
640                 pte += map_count;
641
642                 if (*addr >= end)
643                         *addr = end - PAGE_SIZE;
644
645                 *ptep = pte;
646         } else {
647                 if (haddr < vma->vm_start) {
648                         pr_err("haddr(%lx) is less than vm start(%lx)\n",
649                                         haddr, vma->vm_start);
650                         thp_print_cont_pte_table(mm, haddr, pte, __LINE__);
651                 }
652
653                 spin_unlock(ptl);
654                 __split_huge_pte(vma, pmd, pte, haddr, false, NULL);
655                 spin_lock(ptl);
656         }
657
658         pte_unmap(pte);
659
660         return map_count == HPAGE_CONT_PTE_NR;
661 }
662
663 /* caller must hold a proper lock */
664 void huge_cont_pte_set_accessed(struct vm_fault *vmf, pte_t orig_pte)
665 {
666         int i;
667         pte_t entry, *pte;
668         unsigned long haddr;
669         bool write = vmf->flags & FAULT_FLAG_WRITE;
670
671         haddr = vmf->address & HPAGE_CONT_PTE_MASK;
672         pte = pte_offset_map(vmf->pmd, haddr);
673
674         for (i = 0; i < HPAGE_CONT_PTE_NR; i++, pte++, haddr += PAGE_SIZE) {
675                 entry = pte_mkyoung(*pte);
676                 if (write)
677                         entry = pte_mkwrite(pte_mkdirty(entry));
678                 ptep_set_access_flags(vmf->vma, haddr, pte, entry, write);
679         }
680         update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
681 }
682
683 /*
684  * FOLL_FORCE can write to even unwritable pmd's, but only
685  * after we've gone through a COW cycle and they are dirty.
686  */
687 static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
688 {
689         return pmd_write(pmd) ||
690                ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
691 }
692
693 extern void mlock_vma_page(struct page *page);
694 extern void clear_page_mlock(struct page *page);
695
696 struct page *follow_trans_huge_pte(struct vm_area_struct *vma,
697                                    unsigned long addr,
698                                    pmd_t *pmd,
699                                    unsigned int flags)
700 {
701         struct mm_struct *mm = vma->vm_mm;
702         struct page *page = NULL;
703         pte_t *pte;
704
705         assert_spin_locked(pmd_lockptr(mm, pmd));
706
707         if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
708                 goto out;
709
710         /* Avoid dumping huge zero page */
711         if ((flags & FOLL_DUMP))
712                 return ERR_PTR(-EFAULT);
713
714         /* Full NUMA hinting faults to serialise migration in fault paths */
715         if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
716                 goto out;
717
718         pte = pte_offset_map(pmd, addr);
719         page = pte_page(*pte);
720         VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
721
722         if (!try_grab_page(page, flags))
723                 return ERR_PTR(-ENOMEM);
724
725         if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
726                 /*
727                  * We don't mlock() pte-mapped THPs. This way we can avoid
728                  * leaking mlocked pages into non-VM_LOCKED VMAs.
729                  *
730                  * For anon THP:
731                  *
732                  * In most cases the pmd is the only mapping of the page as we
733                  * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
734                  * writable private mappings in populate_vma_page_range().
735                  *
736                  * The only scenario when we have the page shared here is if we are
737                  * mlocking a read-only mapping shared over fork(). We skip
738                  * mlocking such pages.
739                  *
740                  * For file THP:
741                  *
742                  * We can expect PageDoubleMap() to be stable under page lock:
743                  * for file pages we set it in page_add_file_rmap(), which
744                  * requires page to be locked.
745                  */
746
747                 if (PageAnon(page) && compound_mapcount(page) != 1)
748                         goto skip_mlock;
749                 if (PageDoubleMap(page) || !page->mapping)
750                         goto skip_mlock;
751                 if (!trylock_page(page))
752                         goto skip_mlock;
753                 if (page->mapping && !PageDoubleMap(page))
754                         mlock_vma_page(page);
755                 unlock_page(page);
756         }
757 skip_mlock:
758         page += (addr & ~HPAGE_CONT_PTE_MASK) >> PAGE_SHIFT;
759         VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
760
761 out:
762         return page;
763 }
764
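/*
 * Atomically clear PTE_VALID in *ptep and return the old entry.  Used
 * below when live contiguous-PTE entries are rewritten (protection
 * changes, migration-entry installation) so the hardware never observes
 * a half-updated valid entry.
 */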
765 static inline pte_t ptep_invalidate(struct vm_area_struct *vma,
766                                 unsigned long address, pte_t *ptep)
767 {
768         return __pte(xchg_relaxed(&pte_val(*ptep), (pte_val(*ptep) & ~PTE_VALID)));
769 }
770
771 extern atomic_long_t nr_phys_cont_pte_pages;
772
773 static int remap_try_huge_pte(struct mm_struct *mm, pte_t *pte, unsigned long addr,
774                                 unsigned long end, unsigned long pfn,
775                                 pgprot_t prot)
776 {
777         phys_addr_t phys_addr = __pfn_to_phys(pfn);
778         pte_t entry;
779
780         if ((end - addr) != CONT_PTE_SIZE)
781                 return 0;
782
783         if (!IS_ALIGNED(addr, CONT_PTE_SIZE))
784                 return 0;
785
786         if (!IS_ALIGNED(phys_addr, CONT_PTE_SIZE))
787                 return 0;
788
789         entry = pte_mkspecial(pte_mkcont(pte_mkhuge(pfn_pte(pfn, prot))));
790         arch_set_huge_pte_at(mm, addr, pte, entry, 0);
791
792         atomic_long_add(HPAGE_CONT_PTE_NR, &nr_phys_cont_pte_pages);
793
794         return 1;
795 }
796
797 int arm64_remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
798                         unsigned long addr, unsigned long end,
799                         unsigned long pfn, pgprot_t prot)
800 {
801         pte_t *pte, *mapped_pte;
802         unsigned long next;
803         spinlock_t *ptl;
804         int err = 0;
805
806         mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
807         if (!pte)
808                 return -ENOMEM;
809         arch_enter_lazy_mmu_mode();
810         do {
811                 BUG_ON(!pte_none(*pte));
812                 if (!pfn_modify_allowed(pfn, prot)) {
813                         err = -EACCES;
814                         break;
815                 }
816
817                 next = pte_cont_addr_end(addr, end);
818                 if (remap_try_huge_pte(mm, pte, addr, next, pfn, prot)) {
819                         pte += HPAGE_CONT_PTE_NR;
820                         pfn += HPAGE_CONT_PTE_NR;
821                         addr += HPAGE_CONT_PTE_SIZE;
822                 } else {
823                         set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
824                         pfn++;
825                         pte++;
826                         addr += PAGE_SIZE;
827                 }
828         } while (addr != end);
829         arch_leave_lazy_mmu_mode();
830         pte_unmap_unlock(mapped_pte, ptl);
831         return err;
832 }
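/*
 * Note: arm64_remap_pte_range() is presumably called from the pmd-level
 * loop of remap_pfn_range() (an assumption; the caller is not in this
 * file), so that 64KB-aligned, physically contiguous chunks of a pfn
 * mapping receive the contiguous hint via remap_try_huge_pte().
 */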
833
834 /* caller must hold appropriate lock (pmd lock) */
835 int change_huge_pte(struct vm_area_struct *vma, pte_t *pte,
836                 unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
837 {
838         struct mm_struct *mm = vma->vm_mm;
839         pte_t entry;
840         bool preserve_write;
841         bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
842         int i, ret;
843
844         preserve_write = prot_numa && pte_write(*pte);
845         ret = 1;
846
847         /* Currently we don't handle NUMA cases; the checks below are
848          * kept for future work. */
849         if (prot_numa && is_huge_zero_page(pte_page(*pte)))
850                 goto out;
851
852         if (prot_numa && pte_protnone(*pte))
853                 goto out;
854
855         for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
856                 entry = ptep_invalidate(vma, addr, pte);
857                 entry = pte_modify(entry, newprot);
858                 if (preserve_write)
859                         entry = pte_mk_savedwrite(entry);
860                 entry = pte_mkcont(entry);
861
862                 set_pte_at(mm, addr, pte, entry);
863                 pte++;
864                 addr += PAGE_SIZE;
865         }
866
867         flush_tlb_range(vma, addr - HPAGE_CONT_PTE_SIZE, addr);
868         ret = HPAGE_CONT_PTE_NR;
869 out:
870         return ret;
871 }
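/*
 * Sketch only (assumption, not part of this patch): a hypothetical
 * change_protection()-style walker that lets change_huge_pte() update a
 * whole 64KB block in one call and then skips past it.
 */
static unsigned long example_change_prot_range(struct vm_area_struct *vma,
                pte_t *pte, unsigned long addr, unsigned long end,
                pgprot_t newprot, unsigned long cp_flags)
{
        unsigned long pages = 0;

        while (addr < end) {
                if (pte_cont(*pte) && IS_ALIGNED(addr, HPAGE_CONT_PTE_SIZE) &&
                    end - addr >= HPAGE_CONT_PTE_SIZE) {
                        /* Updates all 16 entries under the caller's lock. */
                        pages += change_huge_pte(vma, pte, addr, newprot,
                                                 cp_flags);
                        pte += HPAGE_CONT_PTE_NR;
                        addr += HPAGE_CONT_PTE_SIZE;
                        continue;
                }
                /* Base-page entries would be handled by the regular path. */
                pte++;
                addr += PAGE_SIZE;
        }
        return pages;
}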
872
873 static void __split_huge_pte_locked(struct vm_area_struct *vma, pte_t *pte,
874                 unsigned long haddr, bool freeze)
875 {
876         struct mm_struct *mm = vma->vm_mm;
877         struct page *page;
878         pte_t old_pte, _pte;
879         bool young, write, soft_dirty, pte_migration = false, uffd_wp = false;
880         unsigned long addr;
881         int i;
882
883         VM_BUG_ON(haddr & ~HPAGE_CONT_PTE_MASK);
884         VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
885         VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_CONT_PTE_SIZE, vma);
886
887         count_vm_event(THP_SPLIT_CONT_PTE);
888
889         if (!vma_is_anonymous(vma)) {
890                 _pte = ptep_huge_clear_flush_notify(vma, haddr, pte);
891                 if (vma_is_dax(vma))
892                         return;
893                 page = pte_page(_pte);
894                 if (!PageDirty(page) && pte_dirty(_pte))
895                         set_page_dirty(page);
896                 if (!PageReferenced(page) && pte_young(_pte))
897                         SetPageReferenced(page);
898                 page_remove_rmap(page, true);
899                 put_page(page);
900                 add_mm_counter(mm, mm_counter_file(page), -HPAGE_CONT_PTE_NR);
901                 return;
902         } else if (is_huge_zero_page(pte_page(*pte))) {
903                 pr_err("contiguous pte mappings of the huge zero page are not supported yet\n");
904                 BUG();
905         }
906
907         old_pte = ptep_huge_clear_flush_notify(vma, haddr, pte);
908
909         pte_migration = is_pte_migration_entry(old_pte);
910         if (unlikely(pte_migration)) {
911                 swp_entry_t entry;
912
913                 entry = pte_to_swp_entry(old_pte);
914                 page = pfn_to_page(swp_offset(entry));
915                 write = is_write_migration_entry(entry);
916                 young = false;
917                 soft_dirty = pte_swp_soft_dirty(old_pte);
918                 uffd_wp = pte_swp_uffd_wp(old_pte);
919         } else {
920                 page = pte_page(old_pte);
921                 if (pte_dirty(old_pte))
922                         SetPageDirty(page);
923                 write = pte_write(old_pte);
924                 young = pte_young(old_pte);
925                 soft_dirty = pte_soft_dirty(old_pte);
926                 uffd_wp = pte_uffd_wp(old_pte);
927         }
928
929         VM_BUG_ON_PAGE(!page_count(page), page);
930         page_ref_add(page, HPAGE_CONT_PTE_NR - 1);
931
932         for (i = 0, addr = haddr; i < HPAGE_CONT_PTE_NR;
933                                 i++, addr += PAGE_SIZE, pte++) {
934                 pte_t entry;
935
936                 if (freeze || pte_migration) {
937                         swp_entry_t swp_entry;
938                         swp_entry = make_migration_entry(page + i, write);
939                         entry = swp_entry_to_pte(swp_entry);
940                         if (soft_dirty)
941                                 entry = pte_swp_mksoft_dirty(entry);
942                         if (uffd_wp)
943                                 entry = pte_swp_mkuffd_wp(entry);
944                 } else {
945                         entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
946                         entry = maybe_mkwrite(entry, vma);
947                         if (!write)
948                                 entry = pte_wrprotect(entry);
949                         if (!young)
950                                 entry = pte_mkold(entry);
951                         if (soft_dirty)
952                                 entry = pte_mksoft_dirty(entry);
953                         if (uffd_wp)
954                                 entry = pte_mkuffd_wp(entry);
955                 }
956                 //BUG_ON(!pte_none(*pte));
957                 set_pte_at(mm, addr, pte, entry);
958                 if (!pte_migration)
959                         atomic_inc(&page[i]._mapcount);
960                 pte_unmap(pte);
961         }
962
963         if (!pte_migration) {
964                 /*
965                  * Set PG_double_map before dropping compound_mapcount to avoid
966                  * false-negative page_mapped().
967                  */
968                 if (compound_mapcount(page) > 1 &&
969                                 !TestSetPageDoubleMap(page)) {
970                         for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
971                                 atomic_inc(&page[i]._mapcount);
972                 }
973
974                 lock_page_memcg(page);
975                 if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
976                         /* Last compound_mapcount is gone. */
977                         __dec_lruvec_page_state(page, NR_ANON_64KB_THPS);
978                         if (TestClearPageDoubleMap(page)) {
979                                 /* No need in mapcount reference anymore */
980                                 for (i = 0; i < HPAGE_CONT_PTE_NR; i++)
981                                         atomic_dec(&page[i]._mapcount);
982                         }
983                 }
984                 unlock_page_memcg(page);
985         }
986
987         smp_wmb();
988
989         if (freeze) {
990                 for (i = 0; i < HPAGE_CONT_PTE_NR; i++) {
991                         page_remove_rmap(page + i, false);
992                         put_page(page + i);
993                 }
994         }
995 }
996
997 void __split_huge_pte(struct vm_area_struct *vma, pmd_t *pmd,
998                 pte_t *pte, unsigned long address,
999                 bool freeze, struct page *page)
1000 {
1001         spinlock_t *ptl;
1002         struct mmu_notifier_range range;
1003         pte_t _pte;
1004         bool locked = false;
1005
1006         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1007                                 address & HPAGE_CONT_PTE_MASK,
1008                                 (address & HPAGE_CONT_PTE_MASK) + HPAGE_CONT_PTE_SIZE);
1009         mmu_notifier_invalidate_range_start(&range);
1010         ptl = pmd_lock(vma->vm_mm, pmd);
1011
1012         if (page) {
1013                 VM_WARN_ON_ONCE(!PageLocked(page));
1014                 if (page != pte_page(*pte))
1015                         goto out;
1016         }
1017 repeat:
1018         if (pte_cont(*pte)) {
1019                 if (!page) {
1020                         page = pte_page(*pte);
1021                         /*
1022                          * An anonymous page must be locked, to ensure that a
1023                          * concurrent reuse_swap_page() sees stable mapcount;
1024                          * but reuse_swap_page() is not used on shmem or file,
1025                          * and page lock must not be taken when zap_pte_range()
1026                          * calls __split_huge_pte() while i_mmap_lock is held.
1027                          */
1028                         if (PageAnon(page)) {
1029                                 if (unlikely(!trylock_page(page))) {
1030                                         _pte = *pte;
1031                                         get_page(page);
1032                                         spin_unlock(ptl);
1033                                         lock_page(page);
1034                                         spin_lock(ptl);
1035                                         if (unlikely(!pte_same(*pte, _pte))) {
1036                                                 unlock_page(page);
1037                                                 put_page(page);
1038                                                 page = NULL;
1039                                                 goto repeat;
1040                                         }
1041                                         put_page(page);
1042                                 }
1043                                 locked = true;
1044                         }
1045                 }
1046                 if (PageMlocked(page))
1047                         clear_page_mlock(page);
1048         } else if (!(pte_devmap(*pte) || is_pte_migration_entry(*pte)))
1049                 goto out;
1050         __split_huge_pte_locked(vma, pte, range.start, freeze);
1051 out:
1052         spin_unlock(ptl);
1053         if (locked && page)
1054                 unlock_page(page);
1055         mmu_notifier_invalidate_range_only_end(&range);
1056 }
1057
1058 void split_huge_pte_address(struct vm_area_struct *vma, unsigned long address,
1059                 bool freeze, struct page *page)
1060 {
1061         unsigned long haddr = address & HPAGE_CONT_PTE_MASK;
1062         pgd_t *pgd;
1063         p4d_t *p4d;
1064         pud_t *pud;
1065         pmd_t *pmd;
1066         pte_t *pte;
1067
1068         pgd = pgd_offset(vma->vm_mm, haddr);
1069         if (!pgd_present(*pgd))
1070                 return;
1071
1072         p4d = p4d_offset(pgd, haddr);
1073         if (!p4d_present(*p4d))
1074                 return;
1075
1076         pud = pud_offset(p4d, haddr);
1077         if (!pud_present(*pud))
1078                 return;
1079
1080         pmd = pmd_offset(pud, haddr);
1081         if (!pmd_present(*pmd))
1082                 return;
1083
1084         pte = pte_offset_map(pmd, haddr);
1085         if (!pte_present(*pte))
1086                 return;
1087
1088         __split_huge_pte(vma, pmd, pte, haddr, freeze, page);
1089 }
1090
1091 void set_huge_pte_migration_entry(
1092                 struct page_vma_mapped_walk *pvmw,
1093                 struct page *page)
1094 {
1095         int i;
1096         struct vm_area_struct *vma = pvmw->vma;
1097         struct mm_struct *mm = vma->vm_mm;
1098         unsigned long address = pvmw->address;
1099         pte_t pteval, *pte;
1100         swp_entry_t entry;
1101         pte_t pteswp;
1102         struct page *_page = page;
1103
1104         if (!(pvmw->pmd && pvmw->pte))
1105                 return;
1106
1107         flush_cache_range(vma, address, address + HPAGE_CONT_PTE_SIZE);
1108         pte = pvmw->pte;
1109
1110         //arch_set_huge_pte_at(mm, address, pvmw->pte, ptee);
1111         for (i = 0, pte = pvmw->pte; i < HPAGE_CONT_PTE_NR; i++, pte++) {
1112                 pteval = ptep_invalidate(vma, address, pte);
1113                 if (pte_dirty(pteval))
1114                         set_page_dirty(_page);
1115                 entry = make_migration_entry(page, pte_write(pteval));
1116                 pteswp = swp_entry_to_pte(entry);
1117                 if (pte_soft_dirty(pteval))
1118                         pteswp = pte_swp_mksoft_dirty(pteswp);
1119                 set_pte_at(mm, address, pte, pteswp);
1120                 _page++;
1121                 address += PAGE_SIZE;
1122         }
1123
1124         pvmw->pte = pte;
1125         pvmw->address = address;
1126
1127         page_remove_rmap(page, true);
1128         put_page(page);
1129 }
1130
1131 void remove_migration_huge_pte(
1132                 struct page_vma_mapped_walk *pvmw, struct page *new)
1133 {
1134         struct vm_area_struct *vma = pvmw->vma;
1135         struct mm_struct *mm = vma->vm_mm;
1136         unsigned long address = pvmw->address;
1137         unsigned long mmun_start = address & HPAGE_CONT_PTE_MASK;
1138         pte_t ptee;
1139         swp_entry_t entry;
1140
1141         if (!(pvmw->pmd && pvmw->pte))
1142                 return;
1143
1144         entry = pte_to_swp_entry(*pvmw->pte);
1145         get_page(new);
1146         ptee = pte_mkold(arm64_make_huge_pte(new, vma));
1147         if (pte_swp_soft_dirty(*pvmw->pte))
1148                 ptee = pte_mksoft_dirty(ptee);
1149         if (is_write_migration_entry(entry))
1150                 ptee = maybe_mkwrite(ptee, vma);
1151
1152         flush_cache_range(vma, mmun_start, mmun_start + HPAGE_CONT_PTE_SIZE);
1153         if (PageAnon(new))
1154                 page_add_anon_rmap(new, vma, mmun_start, true);
1155         else
1156                 page_add_file_rmap(new, true);
1157
1158         arch_set_huge_pte_at(mm, mmun_start, pvmw->pte, ptee, 0);
1159         if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
1160                 mlock_vma_page(new);
1161         pvmw->address = address + HPAGE_CONT_PTE_SIZE;
1162         pvmw->pte = pvmw->pte + HPAGE_CONT_PTE_NR;
1163         update_mmu_cache_pmd(vma, address, pvmw->pmd);
1164 }
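/*
 * Sketch only (assumption, not part of this patch): how a migration
 * rmap walk might use the two helpers above.  The unmap side installs
 * per-page migration entries with set_huge_pte_migration_entry(); once
 * the 64KB page has been copied, remove_migration_huge_pte() is
 * expected to be called from the remove_migration_pte() walk to map the
 * new page.  Function name and hook points are hypothetical.
 */
static bool example_unmap_huge_for_migration(struct page *page,
                struct vm_area_struct *vma, unsigned long addr, void *arg)
{
        struct page_vma_mapped_walk pvmw = {
                .page = page,
                .vma = vma,
                .address = addr,
        };

        while (page_vma_mapped_walk(&pvmw)) {
                if (pvmw.pte && pte_cont(*pvmw.pte)) {
                        /* Replaces all 16 PTEs with migration swap entries. */
                        set_huge_pte_migration_entry(&pvmw, page);
                        continue;
                }
                /* Base-page mappings keep using the generic migration path. */
        }
        return true;
}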
1165 #endif /* CONFIG_FINEGRAINED_THP */