Merge tag 'fbdev-for-6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/deller...
[platform/kernel/linux-starfive.git] / mm / huge_memory.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Copyright (C) 2009  Red Hat, Inc.
4  */
5
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8 #include <linux/mm.h>
9 #include <linux/sched.h>
10 #include <linux/sched/mm.h>
11 #include <linux/sched/coredump.h>
12 #include <linux/sched/numa_balancing.h>
13 #include <linux/highmem.h>
14 #include <linux/hugetlb.h>
15 #include <linux/mmu_notifier.h>
16 #include <linux/rmap.h>
17 #include <linux/swap.h>
18 #include <linux/shrinker.h>
19 #include <linux/mm_inline.h>
20 #include <linux/swapops.h>
21 #include <linux/backing-dev.h>
22 #include <linux/dax.h>
23 #include <linux/khugepaged.h>
24 #include <linux/freezer.h>
25 #include <linux/pfn_t.h>
26 #include <linux/mman.h>
27 #include <linux/memremap.h>
28 #include <linux/pagemap.h>
29 #include <linux/debugfs.h>
30 #include <linux/migrate.h>
31 #include <linux/hashtable.h>
32 #include <linux/userfaultfd_k.h>
33 #include <linux/page_idle.h>
34 #include <linux/shmem_fs.h>
35 #include <linux/oom.h>
36 #include <linux/numa.h>
37 #include <linux/page_owner.h>
38 #include <linux/sched/sysctl.h>
39 #include <linux/memory-tiers.h>
40
41 #include <asm/tlb.h>
42 #include <asm/pgalloc.h>
43 #include "internal.h"
44 #include "swap.h"
45
46 #define CREATE_TRACE_POINTS
47 #include <trace/events/thp.h>
48
49 /*
50  * By default, transparent hugepage support is disabled in order to avoid
51  * risking an increased memory footprint for applications that are not
52  * guaranteed to benefit from it. When transparent hugepage support is
53  * enabled, it is for all mappings, and khugepaged scans all mappings.
54  * Defrag is invoked by khugepaged hugepage allocations and by page faults
55  * for all hugepage allocations.
56  */
57 unsigned long transparent_hugepage_flags __read_mostly =
58 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
59         (1<<TRANSPARENT_HUGEPAGE_FLAG)|
60 #endif
61 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
62         (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
63 #endif
64         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
65         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
66         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
67
/*
 * Forward declaration only; hugepage_init() registers this shrinker under
 * the name "thp-deferred_split".
 */
68 static struct shrinker deferred_split_shrinker;
69
/*
 * State of the shared huge zero page.
 *
 * huge_zero_refcount: set to 2 when the page is first published (one
 *   reference for the caller, one that only the shrinker drops).
 * huge_zero_page: the published page, or NULL while none is allocated.
 * huge_zero_pfn: pfn of that page; ~0UL while no page is published.
 */
70 static atomic_t huge_zero_refcount;
71 struct page *huge_zero_page __read_mostly;
72 unsigned long huge_zero_pfn __read_mostly = ~0UL;
73
74 bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
75                         bool smaps, bool in_pf, bool enforce_sysfs)
76 {
77         if (!vma->vm_mm)                /* vdso */
78                 return false;
79
80         /*
81          * Explicitly disabled through madvise or prctl, or some
82          * architectures may disable THP for some mappings, for
83          * example, s390 kvm.
84          * */
85         if ((vm_flags & VM_NOHUGEPAGE) ||
86             test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
87                 return false;
88         /*
89          * If the hardware/firmware marked hugepage support disabled.
90          */
91         if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
92                 return false;
93
94         /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
95         if (vma_is_dax(vma))
96                 return in_pf;
97
98         /*
99          * Special VMA and hugetlb VMA.
100          * Must be checked after dax since some dax mappings may have
101          * VM_MIXEDMAP set.
102          */
103         if (vm_flags & VM_NO_KHUGEPAGED)
104                 return false;
105
106         /*
107          * Check alignment for file vma and size for both file and anon vma.
108          *
109          * Skip the check for page fault. Huge fault does the check in fault
110          * handlers. And this check is not suitable for huge PUD fault.
111          */
112         if (!in_pf &&
113             !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
114                 return false;
115
116         /*
117          * Enabled via shmem mount options or sysfs settings.
118          * Must be done before hugepage flags check since shmem has its
119          * own flags.
120          */
121         if (!in_pf && shmem_file(vma->vm_file))
122                 return shmem_huge_enabled(vma, !enforce_sysfs);
123
124         /* Enforce sysfs THP requirements as necessary */
125         if (enforce_sysfs &&
126             (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
127                                            !hugepage_flags_always())))
128                 return false;
129
130         /* Only regular file is valid */
131         if (!in_pf && file_thp_enabled(vma))
132                 return true;
133
134         if (!vma_is_anonymous(vma))
135                 return false;
136
137         if (vma_is_temporary_stack(vma))
138                 return false;
139
140         /*
141          * THPeligible bit of smaps should show 1 for proper VMAs even
142          * though anon_vma is not initialized yet.
143          *
144          * Allow page fault since anon_vma may be not initialized until
145          * the first page fault.
146          */
147         if (!vma->anon_vma)
148                 return (smaps || in_pf);
149
150         return true;
151 }
152
/*
 * get_huge_zero_page - take a reference on the shared huge zero page,
 * allocating and publishing it first if none exists.
 *
 * Return: true with huge_zero_refcount raised on behalf of the caller,
 * false if a new page was needed and could not be allocated.
 */
153 static bool get_huge_zero_page(void)
154 {
155         struct page *zero_page;
156 retry:
/* Fast path: a page is already published and its refcount is non-zero. */
157         if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
158                 return true;
159
160         zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
161                         HPAGE_PMD_ORDER);
162         if (!zero_page) {
163                 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
164                 return false;
165         }
/*
 * NOTE(review): the preempt-disabled section presumably keeps the window
 * between publishing the pointer and setting the refcount short; confirm.
 */
166         preempt_disable();
/* Lost the race: someone else published a page, so drop ours and retry. */
167         if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
168                 preempt_enable();
169                 __free_pages(zero_page, compound_order(zero_page));
170                 goto retry;
171         }
172         WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
173
174         /* We take additional reference here. It will be put back by shrinker */
175         atomic_set(&huge_zero_refcount, 2);
176         preempt_enable();
177         count_vm_event(THP_ZERO_PAGE_ALLOC);
178         return true;
179 }
180
181 static void put_huge_zero_page(void)
182 {
183         /*
184          * Counter should never go to zero here. Only shrinker can put
185          * last reference.
186          */
187         BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
188 }
189
190 struct page *mm_get_huge_zero_page(struct mm_struct *mm)
191 {
192         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
193                 return READ_ONCE(huge_zero_page);
194
195         if (!get_huge_zero_page())
196                 return NULL;
197
198         if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
199                 put_huge_zero_page();
200
201         return READ_ONCE(huge_zero_page);
202 }
203
204 void mm_put_huge_zero_page(struct mm_struct *mm)
205 {
206         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
207                 put_huge_zero_page();
208 }
209
210 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
211                                         struct shrink_control *sc)
212 {
213         /* we can free zero page only if last reference remains */
214         return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
215 }
216
217 static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
218                                        struct shrink_control *sc)
219 {
220         if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
221                 struct page *zero_page = xchg(&huge_zero_page, NULL);
222                 BUG_ON(zero_page == NULL);
223                 WRITE_ONCE(huge_zero_pfn, ~0UL);
224                 __free_pages(zero_page, compound_order(zero_page));
225                 return HPAGE_PMD_NR;
226         }
227
228         return 0;
229 }
230
231 static struct shrinker huge_zero_page_shrinker = {
232         .count_objects = shrink_huge_zero_page_count,
233         .scan_objects = shrink_huge_zero_page_scan,
234         .seeks = DEFAULT_SEEKS,
235 };
236
237 #ifdef CONFIG_SYSFS
238 static ssize_t enabled_show(struct kobject *kobj,
239                             struct kobj_attribute *attr, char *buf)
240 {
241         const char *output;
242
243         if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
244                 output = "[always] madvise never";
245         else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
246                           &transparent_hugepage_flags))
247                 output = "always [madvise] never";
248         else
249                 output = "always madvise [never]";
250
251         return sysfs_emit(buf, "%s\n", output);
252 }
253
254 static ssize_t enabled_store(struct kobject *kobj,
255                              struct kobj_attribute *attr,
256                              const char *buf, size_t count)
257 {
258         ssize_t ret = count;
259
260         if (sysfs_streq(buf, "always")) {
261                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
262                 set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
263         } else if (sysfs_streq(buf, "madvise")) {
264                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
265                 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
266         } else if (sysfs_streq(buf, "never")) {
267                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
268                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
269         } else
270                 ret = -EINVAL;
271
272         if (ret > 0) {
273                 int err = start_stop_khugepaged();
274                 if (err)
275                         ret = err;
276         }
277         return ret;
278 }
279
280 static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
281
282 ssize_t single_hugepage_flag_show(struct kobject *kobj,
283                                   struct kobj_attribute *attr, char *buf,
284                                   enum transparent_hugepage_flag flag)
285 {
286         return sysfs_emit(buf, "%d\n",
287                           !!test_bit(flag, &transparent_hugepage_flags));
288 }
289
290 ssize_t single_hugepage_flag_store(struct kobject *kobj,
291                                  struct kobj_attribute *attr,
292                                  const char *buf, size_t count,
293                                  enum transparent_hugepage_flag flag)
294 {
295         unsigned long value;
296         int ret;
297
298         ret = kstrtoul(buf, 10, &value);
299         if (ret < 0)
300                 return ret;
301         if (value > 1)
302                 return -EINVAL;
303
304         if (value)
305                 set_bit(flag, &transparent_hugepage_flags);
306         else
307                 clear_bit(flag, &transparent_hugepage_flags);
308
309         return count;
310 }
311
312 static ssize_t defrag_show(struct kobject *kobj,
313                            struct kobj_attribute *attr, char *buf)
314 {
315         const char *output;
316
317         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
318                      &transparent_hugepage_flags))
319                 output = "[always] defer defer+madvise madvise never";
320         else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
321                           &transparent_hugepage_flags))
322                 output = "always [defer] defer+madvise madvise never";
323         else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
324                           &transparent_hugepage_flags))
325                 output = "always defer [defer+madvise] madvise never";
326         else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
327                           &transparent_hugepage_flags))
328                 output = "always defer defer+madvise [madvise] never";
329         else
330                 output = "always defer defer+madvise madvise [never]";
331
332         return sysfs_emit(buf, "%s\n", output);
333 }
334
335 static ssize_t defrag_store(struct kobject *kobj,
336                             struct kobj_attribute *attr,
337                             const char *buf, size_t count)
338 {
339         if (sysfs_streq(buf, "always")) {
340                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
341                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
342                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
343                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
344         } else if (sysfs_streq(buf, "defer+madvise")) {
345                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
346                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
347                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
348                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
349         } else if (sysfs_streq(buf, "defer")) {
350                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
351                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
352                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
353                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
354         } else if (sysfs_streq(buf, "madvise")) {
355                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
356                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
357                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
358                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
359         } else if (sysfs_streq(buf, "never")) {
360                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
361                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
362                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
363                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
364         } else
365                 return -EINVAL;
366
367         return count;
368 }
369 static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
370
371 static ssize_t use_zero_page_show(struct kobject *kobj,
372                                   struct kobj_attribute *attr, char *buf)
373 {
374         return single_hugepage_flag_show(kobj, attr, buf,
375                                          TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
376 }
377 static ssize_t use_zero_page_store(struct kobject *kobj,
378                 struct kobj_attribute *attr, const char *buf, size_t count)
379 {
380         return single_hugepage_flag_store(kobj, attr, buf, count,
381                                  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
382 }
383 static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
384
385 static ssize_t hpage_pmd_size_show(struct kobject *kobj,
386                                    struct kobj_attribute *attr, char *buf)
387 {
388         return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
389 }
390 static struct kobj_attribute hpage_pmd_size_attr =
391         __ATTR_RO(hpage_pmd_size);
392
/*
 * NULL-terminated list of attributes placed in the "transparent_hugepage"
 * sysfs directory; the shmem entry is present only with CONFIG_SHMEM.
 */
393 static struct attribute *hugepage_attr[] = {
394         &enabled_attr.attr,
395         &defrag_attr.attr,
396         &use_zero_page_attr.attr,
397         &hpage_pmd_size_attr.attr,
398 #ifdef CONFIG_SHMEM
399         &shmem_enabled_attr.attr,
400 #endif
401         NULL,
402 };
403
/* Unnamed group: the attributes are created directly in the kobject's directory. */
404 static const struct attribute_group hugepage_attr_group = {
405         .attrs = hugepage_attr,
406 };
407
408 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
409 {
410         int err;
411
412         *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
413         if (unlikely(!*hugepage_kobj)) {
414                 pr_err("failed to create transparent hugepage kobject\n");
415                 return -ENOMEM;
416         }
417
418         err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
419         if (err) {
420                 pr_err("failed to register transparent hugepage group\n");
421                 goto delete_obj;
422         }
423
424         err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
425         if (err) {
426                 pr_err("failed to register transparent hugepage group\n");
427                 goto remove_hp_group;
428         }
429
430         return 0;
431
432 remove_hp_group:
433         sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
434 delete_obj:
435         kobject_put(*hugepage_kobj);
436         return err;
437 }
438
/*
 * hugepage_exit_sysfs - undo hugepage_init_sysfs(): remove both attribute
 * groups in reverse order of creation, then drop the kobject reference.
 */
439 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
440 {
441         sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
442         sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
443         kobject_put(hugepage_kobj);
444 }
445 #else
/*
 * !CONFIG_SYSFS stub: nothing to create, always succeeds. Note that
 * *hugepage_kobj is not written in this configuration.
 */
446 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
447 {
448         return 0;
449 }
450
/* !CONFIG_SYSFS stub: nothing to tear down. */
451 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
452 {
453 }
454 #endif /* CONFIG_SYSFS */
455
/*
 * hugepage_init - subsys initcall that brings up transparent hugepage
 * support: sysfs files, khugepaged, the two THP shrinkers, and finally the
 * khugepaged start/stop decision.
 *
 * Return: 0 on success; -EINVAL if has_transparent_hugepage() is false;
 * otherwise the error of the step that failed, after the earlier steps
 * have been unwound in reverse order.
 */
456 static int __init hugepage_init(void)
457 {
458         int err;
459         struct kobject *hugepage_kobj;
460
461         if (!has_transparent_hugepage()) {
462                 /*
463                  * Hardware doesn't support hugepages, hence disable
464                  * DAX PMD support.
465                  */
466                 transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
467                 return -EINVAL;
468         }
469
470         /*
471          * hugepages can't be allocated by the buddy allocator
472          */
473         MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
474         /*
475          * we use page->mapping and page->index in second tail page
476          * as list_head: assuming THP order >= 2
477          */
478         MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
479
480         err = hugepage_init_sysfs(&hugepage_kobj);
481         if (err)
482                 goto err_sysfs;
483
484         err = khugepaged_init();
485         if (err)
486                 goto err_slab;
487
488         err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
489         if (err)
490                 goto err_hzp_shrinker;
491         err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
492         if (err)
493                 goto err_split_shrinker;
494
495         /*
496          * By default disable transparent hugepages on smaller systems,
497          * where the extra memory used could hurt more than TLB overhead
498          * is likely to save.  The admin can still enable it through /sys.
499          */
/*
 * The threshold is 512 MiB expressed in pages. On this path every flag is
 * cleared and khugepaged is not started, but sysfs and both shrinkers stay
 * registered.
 */
500         if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
501                 transparent_hugepage_flags = 0;
502                 return 0;
503         }
504
505         err = start_stop_khugepaged();
506         if (err)
507                 goto err_khugepaged;
508
509         return 0;
/* Error unwinding: each label undoes the step set up just before its goto. */
510 err_khugepaged:
511         unregister_shrinker(&deferred_split_shrinker);
512 err_split_shrinker:
513         unregister_shrinker(&huge_zero_page_shrinker);
514 err_hzp_shrinker:
515         khugepaged_destroy();
516 err_slab:
517         hugepage_exit_sysfs(hugepage_kobj);
518 err_sysfs:
519         return err;
520 }
521 subsys_initcall(hugepage_init);
522
523 static int __init setup_transparent_hugepage(char *str)
524 {
525         int ret = 0;
526         if (!str)
527                 goto out;
528         if (!strcmp(str, "always")) {
529                 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
530                         &transparent_hugepage_flags);
531                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
532                           &transparent_hugepage_flags);
533                 ret = 1;
534         } else if (!strcmp(str, "madvise")) {
535                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
536                           &transparent_hugepage_flags);
537                 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
538                         &transparent_hugepage_flags);
539                 ret = 1;
540         } else if (!strcmp(str, "never")) {
541                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
542                           &transparent_hugepage_flags);
543                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
544                           &transparent_hugepage_flags);
545                 ret = 1;
546         }
547 out:
548         if (!ret)
549                 pr_warn("transparent_hugepage= cannot parse, ignored\n");
550         return ret;
551 }
552 __setup("transparent_hugepage=", setup_transparent_hugepage);
553
554 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
555 {
556         if (likely(vma->vm_flags & VM_WRITE))
557                 pmd = pmd_mkwrite(pmd);
558         return pmd;
559 }
560
561 #ifdef CONFIG_MEMCG
562 static inline struct deferred_split *get_deferred_split_queue(struct page *page)
563 {
564         struct mem_cgroup *memcg = page_memcg(compound_head(page));
565         struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
566
567         if (memcg)
568                 return &memcg->deferred_split_queue;
569         else
570                 return &pgdat->deferred_split_queue;
571 }
572 #else
573 static inline struct deferred_split *get_deferred_split_queue(struct page *page)
574 {
575         struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
576
577         return &pgdat->deferred_split_queue;
578 }
579 #endif
580
581 void prep_transhuge_page(struct page *page)
582 {
583         /*
584          * we use page->mapping and page->index in second tail page
585          * as list_head: assuming THP order >= 2
586          */
587
588         INIT_LIST_HEAD(page_deferred_list(page));
589         set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
590 }
591
592 static inline bool is_transparent_hugepage(struct page *page)
593 {
594         if (!PageCompound(page))
595                 return false;
596
597         page = compound_head(page);
598         return is_huge_zero_page(page) ||
599                page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
600 }
601
602 static unsigned long __thp_get_unmapped_area(struct file *filp,
603                 unsigned long addr, unsigned long len,
604                 loff_t off, unsigned long flags, unsigned long size)
605 {
606         loff_t off_end = off + len;
607         loff_t off_align = round_up(off, size);
608         unsigned long len_pad, ret;
609
610         if (off_end <= off_align || (off_end - off_align) < size)
611                 return 0;
612
613         len_pad = len + size;
614         if (len_pad < len || (off + len_pad) < off)
615                 return 0;
616
617         ret = current->mm->get_unmapped_area(filp, addr, len_pad,
618                                               off >> PAGE_SHIFT, flags);
619
620         /*
621          * The failure might be due to length padding. The caller will retry
622          * without the padding.
623          */
624         if (IS_ERR_VALUE(ret))
625                 return 0;
626
627         /*
628          * Do not try to align to THP boundary if allocation at the address
629          * hint succeeds.
630          */
631         if (ret == addr)
632                 return addr;
633
634         ret += (off - ret) & (size - 1);
635         return ret;
636 }
637
638 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
639                 unsigned long len, unsigned long pgoff, unsigned long flags)
640 {
641         unsigned long ret;
642         loff_t off = (loff_t)pgoff << PAGE_SHIFT;
643
644         ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
645         if (ret)
646                 return ret;
647
648         return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
649 }
650 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
651
/*
 * __do_huge_pmd_anonymous_page - charge, zero and map a freshly allocated
 * huge page at the faulting PMD.
 * @vmf:  fault descriptor; vmf->ptl is set to the PMD lock taken here
 * @page: the newly allocated compound page (reference owned by the caller
 *        on entry, consumed here on every path that does not map it)
 * @gfp:  allocation mask, reused for the memcg charge
 *
 * Return: 0 when the page was mapped, and also when the PMD turned out to
 * be populated already (the page is released); VM_FAULT_FALLBACK if the
 * memcg charge failed; VM_FAULT_OOM if no page table could be allocated;
 * otherwise the result of check_stable_address_space() or
 * handle_userfault().
 */
652 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
653                         struct page *page, gfp_t gfp)
654 {
655         struct vm_area_struct *vma = vmf->vma;
656         pgtable_t pgtable;
657         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
658         vm_fault_t ret = 0;
659
660         VM_BUG_ON_PAGE(!PageCompound(page), page);
661
662         if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) {
663                 put_page(page);
664                 count_vm_event(THP_FAULT_FALLBACK);
665                 count_vm_event(THP_FAULT_FALLBACK_CHARGE);
666                 return VM_FAULT_FALLBACK;
667         }
668         cgroup_throttle_swaprate(page, gfp);
669
/* Page table to deposit alongside the huge PMD (see the deposit call below). */
670         pgtable = pte_alloc_one(vma->vm_mm);
671         if (unlikely(!pgtable)) {
672                 ret = VM_FAULT_OOM;
673                 goto release;
674         }
675
676         clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
677         /*
678          * The memory barrier inside __SetPageUptodate makes sure that
679          * clear_huge_page writes become visible before the set_pmd_at()
680          * write.
681          */
682         __SetPageUptodate(page);
683
684         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
/* The PMD got populated meanwhile: drop our page and return ret == 0. */
685         if (unlikely(!pmd_none(*vmf->pmd))) {
686                 goto unlock_release;
687         } else {
688                 pmd_t entry;
689
690                 ret = check_stable_address_space(vma->vm_mm);
691                 if (ret)
692                         goto unlock_release;
693
694                 /* Deliver the page fault to userland */
695                 if (userfaultfd_missing(vma)) {
696                         spin_unlock(vmf->ptl);
697                         put_page(page);
698                         pte_free(vma->vm_mm, pgtable);
699                         ret = handle_userfault(vmf, VM_UFFD_MISSING);
700                         VM_BUG_ON(ret & VM_FAULT_FALLBACK);
701                         return ret;
702                 }
703
704                 entry = mk_huge_pmd(page, vma->vm_page_prot);
705                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
706                 page_add_new_anon_rmap(page, vma, haddr);
707                 lru_cache_add_inactive_or_unevictable(page, vma);
708                 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
709                 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
710                 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
711                 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
712                 mm_inc_nr_ptes(vma->vm_mm);
713                 spin_unlock(vmf->ptl);
714                 count_vm_event(THP_FAULT_ALLOC);
715                 count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
716         }
717
718         return 0;
719 unlock_release:
720         spin_unlock(vmf->ptl);
721 release:
/* pgtable is NULL here only when pte_alloc_one() itself failed. */
722         if (pgtable)
723                 pte_free(vma->vm_mm, pgtable);
724         put_page(page);
725         return ret;
726
727 }
728
729 /*
730  * always: directly stall for all thp allocations
731  * defer: wake kswapd and fail if not immediately available
732  * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
733  *                fail if not immediately available
734  * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
735  *          available
736  * never: never stall for any thp allocation
737  */
738 gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
739 {
740         const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
741
742         /* Always do synchronous compaction */
743         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
744                 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
745
746         /* Kick kcompactd and fail quickly */
747         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
748                 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
749
750         /* Synchronous compaction if madvised, otherwise kick kcompactd */
751         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
752                 return GFP_TRANSHUGE_LIGHT |
753                         (vma_madvised ? __GFP_DIRECT_RECLAIM :
754                                         __GFP_KSWAPD_RECLAIM);
755
756         /* Only do synchronous compaction if madvised */
757         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
758                 return GFP_TRANSHUGE_LIGHT |
759                        (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
760
761         return GFP_TRANSHUGE_LIGHT;
762 }
763
764 /* Caller must hold page table lock. */
765 static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
766                 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
767                 struct page *zero_page)
768 {
769         pmd_t entry;
770         if (!pmd_none(*pmd))
771                 return;
772         entry = mk_pmd(zero_page, vma->vm_page_prot);
773         entry = pmd_mkhuge(entry);
774         pgtable_trans_huge_deposit(mm, pmd, pgtable);
775         set_pmd_at(mm, haddr, pmd, entry);
776         mm_inc_nr_ptes(mm);
777 }
778
/*
 * do_huge_pmd_anonymous_page - handle an anonymous fault at PMD granularity.
 * @vmf: fault descriptor
 *
 * A read fault is served by mapping the shared huge zero page when that is
 * permitted; any other fault allocates a new huge folio and hands it to
 * __do_huge_pmd_anonymous_page().
 *
 * Return: VM_FAULT_FALLBACK when the VMA is unsuitable at this address or
 * no huge (zero) page is available; VM_FAULT_OOM when anon_vma or page
 * table allocation fails; otherwise 0 or the code produced by
 * check_stable_address_space(), handle_userfault() or
 * __do_huge_pmd_anonymous_page().
 */
779 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
780 {
781         struct vm_area_struct *vma = vmf->vma;
782         gfp_t gfp;
783         struct folio *folio;
784         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
785
786         if (!transhuge_vma_suitable(vma, haddr))
787                 return VM_FAULT_FALLBACK;
788         if (unlikely(anon_vma_prepare(vma)))
789                 return VM_FAULT_OOM;
790         khugepaged_enter_vma(vma, vma->vm_flags);
791
/* Read fault, zero page allowed for this mm and enabled globally. */
792         if (!(vmf->flags & FAULT_FLAG_WRITE) &&
793                         !mm_forbids_zeropage(vma->vm_mm) &&
794                         transparent_hugepage_use_zero_page()) {
795                 pgtable_t pgtable;
796                 struct page *zero_page;
797                 vm_fault_t ret;
798                 pgtable = pte_alloc_one(vma->vm_mm);
799                 if (unlikely(!pgtable))
800                         return VM_FAULT_OOM;
801                 zero_page = mm_get_huge_zero_page(vma->vm_mm);
802                 if (unlikely(!zero_page)) {
803                         pte_free(vma->vm_mm, pgtable);
804                         count_vm_event(THP_FAULT_FALLBACK);
805                         return VM_FAULT_FALLBACK;
806                 }
807                 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
808                 ret = 0;
/* Every branch below either deposits pgtable with the mapping or frees it. */
809                 if (pmd_none(*vmf->pmd)) {
810                         ret = check_stable_address_space(vma->vm_mm);
811                         if (ret) {
812                                 spin_unlock(vmf->ptl);
813                                 pte_free(vma->vm_mm, pgtable);
814                         } else if (userfaultfd_missing(vma)) {
815                                 spin_unlock(vmf->ptl);
816                                 pte_free(vma->vm_mm, pgtable);
817                                 ret = handle_userfault(vmf, VM_UFFD_MISSING);
818                                 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
819                         } else {
820                                 set_huge_zero_page(pgtable, vma->vm_mm, vma,
821                                                    haddr, vmf->pmd, zero_page);
822                                 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
823                                 spin_unlock(vmf->ptl);
824                         }
825                 } else {
/* PMD already populated: nothing to map, ret stays 0. */
826                         spin_unlock(vmf->ptl);
827                         pte_free(vma->vm_mm, pgtable);
828                 }
829                 return ret;
830         }
/* All other faults: allocate a real huge folio using the defrag-derived mask. */
831         gfp = vma_thp_gfp_mask(vma);
832         folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
833         if (unlikely(!folio)) {
834                 count_vm_event(THP_FAULT_FALLBACK);
835                 return VM_FAULT_FALLBACK;
836         }
837         return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
838 }
839
840 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
841                 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
842                 pgtable_t pgtable)
843 {
844         struct mm_struct *mm = vma->vm_mm;
845         pmd_t entry;
846         spinlock_t *ptl;
847
848         ptl = pmd_lock(mm, pmd);
849         if (!pmd_none(*pmd)) {
850                 if (write) {
851                         if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
852                                 WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
853                                 goto out_unlock;
854                         }
855                         entry = pmd_mkyoung(*pmd);
856                         entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
857                         if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
858                                 update_mmu_cache_pmd(vma, addr, pmd);
859                 }
860
861                 goto out_unlock;
862         }
863
864         entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
865         if (pfn_t_devmap(pfn))
866                 entry = pmd_mkdevmap(entry);
867         if (write) {
868                 entry = pmd_mkyoung(pmd_mkdirty(entry));
869                 entry = maybe_pmd_mkwrite(entry, vma);
870         }
871
872         if (pgtable) {
873                 pgtable_trans_huge_deposit(mm, pmd, pgtable);
874                 mm_inc_nr_ptes(mm);
875                 pgtable = NULL;
876         }
877
878         set_pmd_at(mm, addr, pmd, entry);
879         update_mmu_cache_pmd(vma, addr, pmd);
880
881 out_unlock:
882         spin_unlock(ptl);
883         if (pgtable)
884                 pte_free(mm, pgtable);
885 }
886
887 /**
888  * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
889  * @vmf: Structure describing the fault
890  * @pfn: pfn to insert
891  * @pgprot: page protection to use
892  * @write: whether it's a write fault
893  *
894  * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
895  * also consult the vmf_insert_mixed_prot() documentation when
896  * @pgprot != @vmf->vma->vm_page_prot.
897  *
898  * Return: vm_fault_t value.
899  */
900 vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
901                                    pgprot_t pgprot, bool write)
902 {
903         unsigned long addr = vmf->address & PMD_MASK;
904         struct vm_area_struct *vma = vmf->vma;
905         pgtable_t pgtable = NULL;
906
907         /*
908          * If we had pmd_special, we could avoid all these restrictions,
909          * but we need to be consistent with PTEs and architectures that
910          * can't support a 'special' bit.
911          */
912         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
913                         !pfn_t_devmap(pfn));
914         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
915                                                 (VM_PFNMAP|VM_MIXEDMAP));
916         BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
917
918         if (addr < vma->vm_start || addr >= vma->vm_end)
919                 return VM_FAULT_SIGBUS;
920
921         if (arch_needs_pgtable_deposit()) {
922                 pgtable = pte_alloc_one(vma->vm_mm);
923                 if (!pgtable)
924                         return VM_FAULT_OOM;
925         }
926
927         track_pfn_insert(vma, &pgprot, pfn);
928
929         insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
930         return VM_FAULT_NOPAGE;
931 }
932 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);
933
934 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
935 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
936 {
937         if (likely(vma->vm_flags & VM_WRITE))
938                 pud = pud_mkwrite(pud);
939         return pud;
940 }
941
942 static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
943                 pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
944 {
945         struct mm_struct *mm = vma->vm_mm;
946         pud_t entry;
947         spinlock_t *ptl;
948
949         ptl = pud_lock(mm, pud);
950         if (!pud_none(*pud)) {
951                 if (write) {
952                         if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
953                                 WARN_ON_ONCE(!is_huge_zero_pud(*pud));
954                                 goto out_unlock;
955                         }
956                         entry = pud_mkyoung(*pud);
957                         entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
958                         if (pudp_set_access_flags(vma, addr, pud, entry, 1))
959                                 update_mmu_cache_pud(vma, addr, pud);
960                 }
961                 goto out_unlock;
962         }
963
964         entry = pud_mkhuge(pfn_t_pud(pfn, prot));
965         if (pfn_t_devmap(pfn))
966                 entry = pud_mkdevmap(entry);
967         if (write) {
968                 entry = pud_mkyoung(pud_mkdirty(entry));
969                 entry = maybe_pud_mkwrite(entry, vma);
970         }
971         set_pud_at(mm, addr, pud, entry);
972         update_mmu_cache_pud(vma, addr, pud);
973
974 out_unlock:
975         spin_unlock(ptl);
976 }
977
978 /**
979  * vmf_insert_pfn_pud_prot - insert a pud size pfn
980  * @vmf: Structure describing the fault
981  * @pfn: pfn to insert
982  * @pgprot: page protection to use
983  * @write: whether it's a write fault
984  *
985  * Insert a pud size pfn. See vmf_insert_pfn() for additional info and
986  * also consult the vmf_insert_mixed_prot() documentation when
987  * @pgprot != @vmf->vma->vm_page_prot.
988  *
989  * Return: vm_fault_t value.
990  */
991 vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
992                                    pgprot_t pgprot, bool write)
993 {
994         unsigned long addr = vmf->address & PUD_MASK;
995         struct vm_area_struct *vma = vmf->vma;
996
997         /*
998          * If we had pud_special, we could avoid all these restrictions,
999          * but we need to be consistent with PTEs and architectures that
1000          * can't support a 'special' bit.
1001          */
1002         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
1003                         !pfn_t_devmap(pfn));
1004         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1005                                                 (VM_PFNMAP|VM_MIXEDMAP));
1006         BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1007
1008         if (addr < vma->vm_start || addr >= vma->vm_end)
1009                 return VM_FAULT_SIGBUS;
1010
1011         track_pfn_insert(vma, &pgprot, pfn);
1012
1013         insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
1014         return VM_FAULT_NOPAGE;
1015 }
1016 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
1017 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1018
1019 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1020                       pmd_t *pmd, bool write)
1021 {
1022         pmd_t _pmd;
1023
1024         _pmd = pmd_mkyoung(*pmd);
1025         if (write)
1026                 _pmd = pmd_mkdirty(_pmd);
1027         if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
1028                                   pmd, _pmd, write))
1029                 update_mmu_cache_pmd(vma, addr, pmd);
1030 }
1031
1032 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
1033                 pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
1034 {
1035         unsigned long pfn = pmd_pfn(*pmd);
1036         struct mm_struct *mm = vma->vm_mm;
1037         struct page *page;
1038         int ret;
1039
1040         assert_spin_locked(pmd_lockptr(mm, pmd));
1041
1042         /* FOLL_GET and FOLL_PIN are mutually exclusive. */
1043         if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
1044                          (FOLL_PIN | FOLL_GET)))
1045                 return NULL;
1046
1047         if (flags & FOLL_WRITE && !pmd_write(*pmd))
1048                 return NULL;
1049
1050         if (pmd_present(*pmd) && pmd_devmap(*pmd))
1051                 /* pass */;
1052         else
1053                 return NULL;
1054
1055         if (flags & FOLL_TOUCH)
1056                 touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
1057
1058         /*
1059          * device mapped pages can only be returned if the
1060          * caller will manage the page reference count.
1061          */
1062         if (!(flags & (FOLL_GET | FOLL_PIN)))
1063                 return ERR_PTR(-EEXIST);
1064
1065         pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
1066         *pgmap = get_dev_pagemap(pfn, *pgmap);
1067         if (!*pgmap)
1068                 return ERR_PTR(-EFAULT);
1069         page = pfn_to_page(pfn);
1070         ret = try_grab_page(page, flags);
1071         if (ret)
1072                 page = ERR_PTR(ret);
1073
1074         return page;
1075 }
1076
1077 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1078                   pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
1079                   struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1080 {
1081         spinlock_t *dst_ptl, *src_ptl;
1082         struct page *src_page;
1083         pmd_t pmd;
1084         pgtable_t pgtable = NULL;
1085         int ret = -ENOMEM;
1086
1087         /* Skip if can be re-fill on fault */
1088         if (!vma_is_anonymous(dst_vma))
1089                 return 0;
1090
1091         pgtable = pte_alloc_one(dst_mm);
1092         if (unlikely(!pgtable))
1093                 goto out;
1094
1095         dst_ptl = pmd_lock(dst_mm, dst_pmd);
1096         src_ptl = pmd_lockptr(src_mm, src_pmd);
1097         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1098
1099         ret = -EAGAIN;
1100         pmd = *src_pmd;
1101
1102 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1103         if (unlikely(is_swap_pmd(pmd))) {
1104                 swp_entry_t entry = pmd_to_swp_entry(pmd);
1105
1106                 VM_BUG_ON(!is_pmd_migration_entry(pmd));
1107                 if (!is_readable_migration_entry(entry)) {
1108                         entry = make_readable_migration_entry(
1109                                                         swp_offset(entry));
1110                         pmd = swp_entry_to_pmd(entry);
1111                         if (pmd_swp_soft_dirty(*src_pmd))
1112                                 pmd = pmd_swp_mksoft_dirty(pmd);
1113                         if (pmd_swp_uffd_wp(*src_pmd))
1114                                 pmd = pmd_swp_mkuffd_wp(pmd);
1115                         set_pmd_at(src_mm, addr, src_pmd, pmd);
1116                 }
1117                 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1118                 mm_inc_nr_ptes(dst_mm);
1119                 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1120                 if (!userfaultfd_wp(dst_vma))
1121                         pmd = pmd_swp_clear_uffd_wp(pmd);
1122                 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1123                 ret = 0;
1124                 goto out_unlock;
1125         }
1126 #endif
1127
1128         if (unlikely(!pmd_trans_huge(pmd))) {
1129                 pte_free(dst_mm, pgtable);
1130                 goto out_unlock;
1131         }
1132         /*
1133          * When page table lock is held, the huge zero pmd should not be
1134          * under splitting since we don't split the page itself, only pmd to
1135          * a page table.
1136          */
1137         if (is_huge_zero_pmd(pmd)) {
1138                 /*
1139                  * get_huge_zero_page() will never allocate a new page here,
1140                  * since we already have a zero page to copy. It just takes a
1141                  * reference.
1142                  */
1143                 mm_get_huge_zero_page(dst_mm);
1144                 goto out_zero_page;
1145         }
1146
1147         src_page = pmd_page(pmd);
1148         VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1149
1150         get_page(src_page);
1151         if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
1152                 /* Page maybe pinned: split and retry the fault on PTEs. */
1153                 put_page(src_page);
1154                 pte_free(dst_mm, pgtable);
1155                 spin_unlock(src_ptl);
1156                 spin_unlock(dst_ptl);
1157                 __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
1158                 return -EAGAIN;
1159         }
1160         add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1161 out_zero_page:
1162         mm_inc_nr_ptes(dst_mm);
1163         pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1164         pmdp_set_wrprotect(src_mm, addr, src_pmd);
1165         if (!userfaultfd_wp(dst_vma))
1166                 pmd = pmd_clear_uffd_wp(pmd);
1167         pmd = pmd_mkold(pmd_wrprotect(pmd));
1168         set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1169
1170         ret = 0;
1171 out_unlock:
1172         spin_unlock(src_ptl);
1173         spin_unlock(dst_ptl);
1174 out:
1175         return ret;
1176 }
1177
1178 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1179 static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1180                       pud_t *pud, bool write)
1181 {
1182         pud_t _pud;
1183
1184         _pud = pud_mkyoung(*pud);
1185         if (write)
1186                 _pud = pud_mkdirty(_pud);
1187         if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
1188                                   pud, _pud, write))
1189                 update_mmu_cache_pud(vma, addr, pud);
1190 }
1191
1192 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
1193                 pud_t *pud, int flags, struct dev_pagemap **pgmap)
1194 {
1195         unsigned long pfn = pud_pfn(*pud);
1196         struct mm_struct *mm = vma->vm_mm;
1197         struct page *page;
1198         int ret;
1199
1200         assert_spin_locked(pud_lockptr(mm, pud));
1201
1202         if (flags & FOLL_WRITE && !pud_write(*pud))
1203                 return NULL;
1204
1205         /* FOLL_GET and FOLL_PIN are mutually exclusive. */
1206         if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
1207                          (FOLL_PIN | FOLL_GET)))
1208                 return NULL;
1209
1210         if (pud_present(*pud) && pud_devmap(*pud))
1211                 /* pass */;
1212         else
1213                 return NULL;
1214
1215         if (flags & FOLL_TOUCH)
1216                 touch_pud(vma, addr, pud, flags & FOLL_WRITE);
1217
1218         /*
1219          * device mapped pages can only be returned if the
1220          * caller will manage the page reference count.
1221          *
1222          * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
1223          */
1224         if (!(flags & (FOLL_GET | FOLL_PIN)))
1225                 return ERR_PTR(-EEXIST);
1226
1227         pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
1228         *pgmap = get_dev_pagemap(pfn, *pgmap);
1229         if (!*pgmap)
1230                 return ERR_PTR(-EFAULT);
1231         page = pfn_to_page(pfn);
1232
1233         ret = try_grab_page(page, flags);
1234         if (ret)
1235                 page = ERR_PTR(ret);
1236
1237         return page;
1238 }
1239
1240 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1241                   pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1242                   struct vm_area_struct *vma)
1243 {
1244         spinlock_t *dst_ptl, *src_ptl;
1245         pud_t pud;
1246         int ret;
1247
1248         dst_ptl = pud_lock(dst_mm, dst_pud);
1249         src_ptl = pud_lockptr(src_mm, src_pud);
1250         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1251
1252         ret = -EAGAIN;
1253         pud = *src_pud;
1254         if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
1255                 goto out_unlock;
1256
1257         /*
1258          * When page table lock is held, the huge zero pud should not be
1259          * under splitting since we don't split the page itself, only pud to
1260          * a page table.
1261          */
1262         if (is_huge_zero_pud(pud)) {
1263                 /* No huge zero pud yet */
1264         }
1265
1266         /*
1267          * TODO: once we support anonymous pages, use page_try_dup_anon_rmap()
1268          * and split if duplicating fails.
1269          */
1270         pudp_set_wrprotect(src_mm, addr, src_pud);
1271         pud = pud_mkold(pud_wrprotect(pud));
1272         set_pud_at(dst_mm, addr, dst_pud, pud);
1273
1274         ret = 0;
1275 out_unlock:
1276         spin_unlock(src_ptl);
1277         spin_unlock(dst_ptl);
1278         return ret;
1279 }
1280
1281 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1282 {
1283         bool write = vmf->flags & FAULT_FLAG_WRITE;
1284
1285         vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1286         if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1287                 goto unlock;
1288
1289         touch_pud(vmf->vma, vmf->address, vmf->pud, write);
1290 unlock:
1291         spin_unlock(vmf->ptl);
1292 }
1293 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1294
1295 void huge_pmd_set_accessed(struct vm_fault *vmf)
1296 {
1297         bool write = vmf->flags & FAULT_FLAG_WRITE;
1298
1299         vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1300         if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
1301                 goto unlock;
1302
1303         touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
1304
1305 unlock:
1306         spin_unlock(vmf->ptl);
1307 }
1308
1309 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
1310 {
1311         const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
1312         struct vm_area_struct *vma = vmf->vma;
1313         struct folio *folio;
1314         struct page *page;
1315         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1316         pmd_t orig_pmd = vmf->orig_pmd;
1317
1318         vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
1319         VM_BUG_ON_VMA(!vma->anon_vma, vma);
1320
1321         if (is_huge_zero_pmd(orig_pmd))
1322                 goto fallback;
1323
1324         spin_lock(vmf->ptl);
1325
1326         if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1327                 spin_unlock(vmf->ptl);
1328                 return 0;
1329         }
1330
1331         page = pmd_page(orig_pmd);
1332         folio = page_folio(page);
1333         VM_BUG_ON_PAGE(!PageHead(page), page);
1334
1335         /* Early check when only holding the PT lock. */
1336         if (PageAnonExclusive(page))
1337                 goto reuse;
1338
1339         if (!folio_trylock(folio)) {
1340                 folio_get(folio);
1341                 spin_unlock(vmf->ptl);
1342                 folio_lock(folio);
1343                 spin_lock(vmf->ptl);
1344                 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1345                         spin_unlock(vmf->ptl);
1346                         folio_unlock(folio);
1347                         folio_put(folio);
1348                         return 0;
1349                 }
1350                 folio_put(folio);
1351         }
1352
1353         /* Recheck after temporarily dropping the PT lock. */
1354         if (PageAnonExclusive(page)) {
1355                 folio_unlock(folio);
1356                 goto reuse;
1357         }
1358
1359         /*
1360          * See do_wp_page(): we can only reuse the folio exclusively if
1361          * there are no additional references. Note that we always drain
1362          * the LRU pagevecs immediately after adding a THP.
1363          */
1364         if (folio_ref_count(folio) >
1365                         1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
1366                 goto unlock_fallback;
1367         if (folio_test_swapcache(folio))
1368                 folio_free_swap(folio);
1369         if (folio_ref_count(folio) == 1) {
1370                 pmd_t entry;
1371
1372                 page_move_anon_rmap(page, vma);
1373                 folio_unlock(folio);
1374 reuse:
1375                 if (unlikely(unshare)) {
1376                         spin_unlock(vmf->ptl);
1377                         return 0;
1378                 }
1379                 entry = pmd_mkyoung(orig_pmd);
1380                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1381                 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
1382                         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1383                 spin_unlock(vmf->ptl);
1384                 return 0;
1385         }
1386
1387 unlock_fallback:
1388         folio_unlock(folio);
1389         spin_unlock(vmf->ptl);
1390 fallback:
1391         __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
1392         return VM_FAULT_FALLBACK;
1393 }
1394
1395 static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
1396                                            unsigned long addr, pmd_t pmd)
1397 {
1398         struct page *page;
1399
1400         if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
1401                 return false;
1402
1403         /* Don't touch entries that are not even readable (NUMA hinting). */
1404         if (pmd_protnone(pmd))
1405                 return false;
1406
1407         /* Do we need write faults for softdirty tracking? */
1408         if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
1409                 return false;
1410
1411         /* Do we need write faults for uffd-wp tracking? */
1412         if (userfaultfd_huge_pmd_wp(vma, pmd))
1413                 return false;
1414
1415         if (!(vma->vm_flags & VM_SHARED)) {
1416                 /* See can_change_pte_writable(). */
1417                 page = vm_normal_page_pmd(vma, addr, pmd);
1418                 return page && PageAnon(page) && PageAnonExclusive(page);
1419         }
1420
1421         /* See can_change_pte_writable(). */
1422         return pmd_dirty(pmd);
1423 }
1424
1425 /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
1426 static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
1427                                         struct vm_area_struct *vma,
1428                                         unsigned int flags)
1429 {
1430         /* If the pmd is writable, we can write to the page. */
1431         if (pmd_write(pmd))
1432                 return true;
1433
1434         /* Maybe FOLL_FORCE is set to override it? */
1435         if (!(flags & FOLL_FORCE))
1436                 return false;
1437
1438         /* But FOLL_FORCE has no effect on shared mappings */
1439         if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
1440                 return false;
1441
1442         /* ... or read-only private ones */
1443         if (!(vma->vm_flags & VM_MAYWRITE))
1444                 return false;
1445
1446         /* ... or already writable ones that just need to take a write fault */
1447         if (vma->vm_flags & VM_WRITE)
1448                 return false;
1449
1450         /*
1451          * See can_change_pte_writable(): we broke COW and could map the page
1452          * writable if we have an exclusive anonymous page ...
1453          */
1454         if (!page || !PageAnon(page) || !PageAnonExclusive(page))
1455                 return false;
1456
1457         /* ... and a write-fault isn't required for other reasons. */
1458         if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
1459                 return false;
1460         return !userfaultfd_huge_pmd_wp(vma, pmd);
1461 }
1462
1463 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1464                                    unsigned long addr,
1465                                    pmd_t *pmd,
1466                                    unsigned int flags)
1467 {
1468         struct mm_struct *mm = vma->vm_mm;
1469         struct page *page;
1470         int ret;
1471
1472         assert_spin_locked(pmd_lockptr(mm, pmd));
1473
1474         page = pmd_page(*pmd);
1475         VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
1476
1477         if ((flags & FOLL_WRITE) &&
1478             !can_follow_write_pmd(*pmd, page, vma, flags))
1479                 return NULL;
1480
1481         /* Avoid dumping huge zero page */
1482         if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1483                 return ERR_PTR(-EFAULT);
1484
1485         /* Full NUMA hinting faults to serialise migration in fault paths */
1486         if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags))
1487                 return NULL;
1488
1489         if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
1490                 return ERR_PTR(-EMLINK);
1491
1492         VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
1493                         !PageAnonExclusive(page), page);
1494
1495         ret = try_grab_page(page, flags);
1496         if (ret)
1497                 return ERR_PTR(ret);
1498
1499         if (flags & FOLL_TOUCH)
1500                 touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
1501
1502         page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1503         VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
1504
1505         return page;
1506 }
1507
1508 /* NUMA hinting page fault entry point for trans huge pmds */
1509 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
1510 {
1511         struct vm_area_struct *vma = vmf->vma;
1512         pmd_t oldpmd = vmf->orig_pmd;
1513         pmd_t pmd;
1514         struct page *page;
1515         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1516         int page_nid = NUMA_NO_NODE;
1517         int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
1518         bool migrated = false, writable = false;
1519         int flags = 0;
1520
1521         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1522         if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
1523                 spin_unlock(vmf->ptl);
1524                 goto out;
1525         }
1526
1527         pmd = pmd_modify(oldpmd, vma->vm_page_prot);
1528
1529         /*
1530          * Detect now whether the PMD could be writable; this information
1531          * is only valid while holding the PT lock.
1532          */
1533         writable = pmd_write(pmd);
1534         if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
1535             can_change_pmd_writable(vma, vmf->address, pmd))
1536                 writable = true;
1537
1538         page = vm_normal_page_pmd(vma, haddr, pmd);
1539         if (!page)
1540                 goto out_map;
1541
1542         /* See similar comment in do_numa_page for explanation */
1543         if (!writable)
1544                 flags |= TNF_NO_GROUP;
1545
1546         page_nid = page_to_nid(page);
1547         /*
1548          * For memory tiering mode, cpupid of slow memory page is used
1549          * to record page access time.  So use default value.
1550          */
1551         if (node_is_toptier(page_nid))
1552                 last_cpupid = page_cpupid_last(page);
1553         target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
1554                                        &flags);
1555
1556         if (target_nid == NUMA_NO_NODE) {
1557                 put_page(page);
1558                 goto out_map;
1559         }
1560
1561         spin_unlock(vmf->ptl);
1562         writable = false;
1563
1564         migrated = migrate_misplaced_page(page, vma, target_nid);
1565         if (migrated) {
1566                 flags |= TNF_MIGRATED;
1567                 page_nid = target_nid;
1568         } else {
1569                 flags |= TNF_MIGRATE_FAIL;
1570                 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1571                 if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
1572                         spin_unlock(vmf->ptl);
1573                         goto out;
1574                 }
1575                 goto out_map;
1576         }
1577
1578 out:
1579         if (page_nid != NUMA_NO_NODE)
1580                 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
1581                                 flags);
1582
1583         return 0;
1584
1585 out_map:
1586         /* Restore the PMD */
1587         pmd = pmd_modify(oldpmd, vma->vm_page_prot);
1588         pmd = pmd_mkyoung(pmd);
1589         if (writable)
1590                 pmd = pmd_mkwrite(pmd);
1591         set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
1592         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1593         spin_unlock(vmf->ptl);
1594         goto out;
1595 }
1596
1597 /*
1598  * Return true if we do MADV_FREE successfully on entire pmd page.
1599  * Otherwise, return false.
1600  */
1601 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1602                 pmd_t *pmd, unsigned long addr, unsigned long next)
1603 {
1604         spinlock_t *ptl;
1605         pmd_t orig_pmd;
1606         struct page *page;
1607         struct mm_struct *mm = tlb->mm;
1608         bool ret = false;
1609
1610         tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1611
1612         ptl = pmd_trans_huge_lock(pmd, vma);
1613         if (!ptl)
1614                 goto out_unlocked;
1615
1616         orig_pmd = *pmd;
1617         if (is_huge_zero_pmd(orig_pmd))
1618                 goto out;
1619
1620         if (unlikely(!pmd_present(orig_pmd))) {
1621                 VM_BUG_ON(thp_migration_supported() &&
1622                                   !is_pmd_migration_entry(orig_pmd));
1623                 goto out;
1624         }
1625
1626         page = pmd_page(orig_pmd);
1627         /*
1628          * If other processes are mapping this page, we couldn't discard
1629          * the page unless they all do MADV_FREE so let's skip the page.
1630          */
1631         if (total_mapcount(page) != 1)
1632                 goto out;
1633
1634         if (!trylock_page(page))
1635                 goto out;
1636
1637         /*
1638          * If user want to discard part-pages of THP, split it so MADV_FREE
1639          * will deactivate only them.
1640          */
1641         if (next - addr != HPAGE_PMD_SIZE) {
1642                 get_page(page);
1643                 spin_unlock(ptl);
1644                 split_huge_page(page);
1645                 unlock_page(page);
1646                 put_page(page);
1647                 goto out_unlocked;
1648         }
1649
1650         if (PageDirty(page))
1651                 ClearPageDirty(page);
1652         unlock_page(page);
1653
1654         if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
1655                 pmdp_invalidate(vma, addr, pmd);
1656                 orig_pmd = pmd_mkold(orig_pmd);
1657                 orig_pmd = pmd_mkclean(orig_pmd);
1658
1659                 set_pmd_at(mm, addr, pmd, orig_pmd);
1660                 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1661         }
1662
1663         mark_page_lazyfree(page);
1664         ret = true;
1665 out:
1666         spin_unlock(ptl);
1667 out_unlocked:
1668         return ret;
1669 }
1670
1671 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
1672 {
1673         pgtable_t pgtable;
1674
1675         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1676         pte_free(mm, pgtable);
1677         mm_dec_nr_ptes(mm);
1678 }
1679
1680 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1681                  pmd_t *pmd, unsigned long addr)
1682 {
1683         pmd_t orig_pmd;
1684         spinlock_t *ptl;
1685
1686         tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1687
1688         ptl = __pmd_trans_huge_lock(pmd, vma);
1689         if (!ptl)
1690                 return 0;
1691         /*
1692          * For architectures like ppc64 we look at deposited pgtable
1693          * when calling pmdp_huge_get_and_clear. So do the
1694          * pgtable_trans_huge_withdraw after finishing pmdp related
1695          * operations.
1696          */
1697         orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
1698                                                 tlb->fullmm);
1699         tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1700         if (vma_is_special_huge(vma)) {
1701                 if (arch_needs_pgtable_deposit())
1702                         zap_deposited_table(tlb->mm, pmd);
1703                 spin_unlock(ptl);
1704         } else if (is_huge_zero_pmd(orig_pmd)) {
1705                 zap_deposited_table(tlb->mm, pmd);
1706                 spin_unlock(ptl);
1707         } else {
1708                 struct page *page = NULL;
1709                 int flush_needed = 1;
1710
1711                 if (pmd_present(orig_pmd)) {
1712                         page = pmd_page(orig_pmd);
1713                         page_remove_rmap(page, vma, true);
1714                         VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1715                         VM_BUG_ON_PAGE(!PageHead(page), page);
1716                 } else if (thp_migration_supported()) {
1717                         swp_entry_t entry;
1718
1719                         VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
1720                         entry = pmd_to_swp_entry(orig_pmd);
1721                         page = pfn_swap_entry_to_page(entry);
1722                         flush_needed = 0;
1723                 } else
1724                         WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
1725
1726                 if (PageAnon(page)) {
1727                         zap_deposited_table(tlb->mm, pmd);
1728                         add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1729                 } else {
1730                         if (arch_needs_pgtable_deposit())
1731                                 zap_deposited_table(tlb->mm, pmd);
1732                         add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
1733                 }
1734
1735                 spin_unlock(ptl);
1736                 if (flush_needed)
1737                         tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
1738         }
1739         return 1;
1740 }
1741
1742 #ifndef pmd_move_must_withdraw
1743 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
1744                                          spinlock_t *old_pmd_ptl,
1745                                          struct vm_area_struct *vma)
1746 {
1747         /*
1748          * With split pmd lock we also need to move preallocated
1749          * PTE page table if new_pmd is on different PMD page table.
1750          *
1751          * We also don't deposit and withdraw tables for file pages.
1752          */
1753         return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
1754 }
1755 #endif
1756
1757 static pmd_t move_soft_dirty_pmd(pmd_t pmd)
1758 {
1759 #ifdef CONFIG_MEM_SOFT_DIRTY
1760         if (unlikely(is_pmd_migration_entry(pmd)))
1761                 pmd = pmd_swp_mksoft_dirty(pmd);
1762         else if (pmd_present(pmd))
1763                 pmd = pmd_mksoft_dirty(pmd);
1764 #endif
1765         return pmd;
1766 }
1767
1768 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1769                   unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
1770 {
1771         spinlock_t *old_ptl, *new_ptl;
1772         pmd_t pmd;
1773         struct mm_struct *mm = vma->vm_mm;
1774         bool force_flush = false;
1775
1776         /*
1777          * The destination pmd shouldn't be established, free_pgtables()
1778          * should have release it.
1779          */
1780         if (WARN_ON(!pmd_none(*new_pmd))) {
1781                 VM_BUG_ON(pmd_trans_huge(*new_pmd));
1782                 return false;
1783         }
1784
1785         /*
1786          * We don't have to worry about the ordering of src and dst
1787          * ptlocks because exclusive mmap_lock prevents deadlock.
1788          */
1789         old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
1790         if (old_ptl) {
1791                 new_ptl = pmd_lockptr(mm, new_pmd);
1792                 if (new_ptl != old_ptl)
1793                         spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1794                 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
1795                 if (pmd_present(pmd))
1796                         force_flush = true;
1797                 VM_BUG_ON(!pmd_none(*new_pmd));
1798
1799                 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
1800                         pgtable_t pgtable;
1801                         pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
1802                         pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
1803                 }
1804                 pmd = move_soft_dirty_pmd(pmd);
1805                 set_pmd_at(mm, new_addr, new_pmd, pmd);
1806                 if (force_flush)
1807                         flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
1808                 if (new_ptl != old_ptl)
1809                         spin_unlock(new_ptl);
1810                 spin_unlock(old_ptl);
1811                 return true;
1812         }
1813         return false;
1814 }
1815
1816 /*
1817  * Returns
1818  *  - 0 if PMD could not be locked
1819  *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
1820  *      or if prot_numa but THP migration is not supported
1821  *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
1822  */
1823 int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1824                     pmd_t *pmd, unsigned long addr, pgprot_t newprot,
1825                     unsigned long cp_flags)
1826 {
1827         struct mm_struct *mm = vma->vm_mm;
1828         spinlock_t *ptl;
1829         pmd_t oldpmd, entry;
1830         bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
1831         bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
1832         bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
1833         int ret = 1;
1834
1835         tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1836
1837         if (prot_numa && !thp_migration_supported())
1838                 return 1;
1839
1840         ptl = __pmd_trans_huge_lock(pmd, vma);
1841         if (!ptl)
1842                 return 0;
1843
1844 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1845         if (is_swap_pmd(*pmd)) {
1846                 swp_entry_t entry = pmd_to_swp_entry(*pmd);
1847                 struct page *page = pfn_swap_entry_to_page(entry);
1848
1849                 VM_BUG_ON(!is_pmd_migration_entry(*pmd));
1850                 if (is_writable_migration_entry(entry)) {
1851                         pmd_t newpmd;
1852                         /*
1853                          * A protection check is difficult so
1854                          * just be safe and disable write
1855                          */
1856                         if (PageAnon(page))
1857                                 entry = make_readable_exclusive_migration_entry(swp_offset(entry));
1858                         else
1859                                 entry = make_readable_migration_entry(swp_offset(entry));
1860                         newpmd = swp_entry_to_pmd(entry);
1861                         if (pmd_swp_soft_dirty(*pmd))
1862                                 newpmd = pmd_swp_mksoft_dirty(newpmd);
1863                         if (pmd_swp_uffd_wp(*pmd))
1864                                 newpmd = pmd_swp_mkuffd_wp(newpmd);
1865                         set_pmd_at(mm, addr, pmd, newpmd);
1866                 }
1867                 goto unlock;
1868         }
1869 #endif
1870
1871         if (prot_numa) {
1872                 struct page *page;
1873                 bool toptier;
1874                 /*
1875                  * Avoid trapping faults against the zero page. The read-only
1876                  * data is likely to be read-cached on the local CPU and
1877                  * local/remote hits to the zero page are not interesting.
1878                  */
1879                 if (is_huge_zero_pmd(*pmd))
1880                         goto unlock;
1881
1882                 if (pmd_protnone(*pmd))
1883                         goto unlock;
1884
1885                 page = pmd_page(*pmd);
1886                 toptier = node_is_toptier(page_to_nid(page));
1887                 /*
1888                  * Skip scanning top tier node if normal numa
1889                  * balancing is disabled
1890                  */
1891                 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
1892                     toptier)
1893                         goto unlock;
1894
1895                 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
1896                     !toptier)
1897                         xchg_page_access_time(page, jiffies_to_msecs(jiffies));
1898         }
1899         /*
1900          * In case prot_numa, we are under mmap_read_lock(mm). It's critical
1901          * to not clear pmd intermittently to avoid race with MADV_DONTNEED
1902          * which is also under mmap_read_lock(mm):
1903          *
1904          *      CPU0:                           CPU1:
1905          *                              change_huge_pmd(prot_numa=1)
1906          *                               pmdp_huge_get_and_clear_notify()
1907          * madvise_dontneed()
1908          *  zap_pmd_range()
1909          *   pmd_trans_huge(*pmd) == 0 (without ptl)
1910          *   // skip the pmd
1911          *                               set_pmd_at();
1912          *                               // pmd is re-established
1913          *
1914          * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
1915          * which may break userspace.
1916          *
1917          * pmdp_invalidate_ad() is required to make sure we don't miss
1918          * dirty/young flags set by hardware.
1919          */
1920         oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
1921
1922         entry = pmd_modify(oldpmd, newprot);
1923         if (uffd_wp) {
1924                 entry = pmd_wrprotect(entry);
1925                 entry = pmd_mkuffd_wp(entry);
1926         } else if (uffd_wp_resolve) {
1927                 /*
1928                  * Leave the write bit to be handled by PF interrupt
1929                  * handler, then things like COW could be properly
1930                  * handled.
1931                  */
1932                 entry = pmd_clear_uffd_wp(entry);
1933         }
1934
1935         /* See change_pte_range(). */
1936         if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
1937             can_change_pmd_writable(vma, addr, entry))
1938                 entry = pmd_mkwrite(entry);
1939
1940         ret = HPAGE_PMD_NR;
1941         set_pmd_at(mm, addr, pmd, entry);
1942
1943         if (huge_pmd_needs_flush(oldpmd, entry))
1944                 tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
1945 unlock:
1946         spin_unlock(ptl);
1947         return ret;
1948 }
1949
1950 /*
1951  * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
1952  *
1953  * Note that if it returns page table lock pointer, this routine returns without
1954  * unlocking page table lock. So callers must unlock it.
1955  */
1956 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1957 {
1958         spinlock_t *ptl;
1959         ptl = pmd_lock(vma->vm_mm, pmd);
1960         if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
1961                         pmd_devmap(*pmd)))
1962                 return ptl;
1963         spin_unlock(ptl);
1964         return NULL;
1965 }
1966
1967 /*
1968  * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
1969  *
1970  * Note that if it returns page table lock pointer, this routine returns without
1971  * unlocking page table lock. So callers must unlock it.
1972  */
1973 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
1974 {
1975         spinlock_t *ptl;
1976
1977         ptl = pud_lock(vma->vm_mm, pud);
1978         if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
1979                 return ptl;
1980         spin_unlock(ptl);
1981         return NULL;
1982 }
1983
1984 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1985 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
1986                  pud_t *pud, unsigned long addr)
1987 {
1988         spinlock_t *ptl;
1989
1990         ptl = __pud_trans_huge_lock(pud, vma);
1991         if (!ptl)
1992                 return 0;
1993
1994         pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
1995         tlb_remove_pud_tlb_entry(tlb, pud, addr);
1996         if (vma_is_special_huge(vma)) {
1997                 spin_unlock(ptl);
1998                 /* No zero page support yet */
1999         } else {
2000                 /* No support for anonymous PUD pages yet */
2001                 BUG();
2002         }
2003         return 1;
2004 }
2005
2006 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2007                 unsigned long haddr)
2008 {
2009         VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2010         VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2011         VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2012         VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
2013
2014         count_vm_event(THP_SPLIT_PUD);
2015
2016         pudp_huge_clear_flush_notify(vma, haddr, pud);
2017 }
2018
2019 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2020                 unsigned long address)
2021 {
2022         spinlock_t *ptl;
2023         struct mmu_notifier_range range;
2024
2025         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
2026                                 address & HPAGE_PUD_MASK,
2027                                 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2028         mmu_notifier_invalidate_range_start(&range);
2029         ptl = pud_lock(vma->vm_mm, pud);
2030         if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2031                 goto out;
2032         __split_huge_pud_locked(vma, pud, range.start);
2033
2034 out:
2035         spin_unlock(ptl);
2036         /*
2037          * No need to double call mmu_notifier->invalidate_range() callback as
2038          * the above pudp_huge_clear_flush_notify() did already call it.
2039          */
2040         mmu_notifier_invalidate_range_only_end(&range);
2041 }
2042 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2043
2044 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2045                 unsigned long haddr, pmd_t *pmd)
2046 {
2047         struct mm_struct *mm = vma->vm_mm;
2048         pgtable_t pgtable;
2049         pmd_t _pmd;
2050         int i;
2051
2052         /*
2053          * Leave pmd empty until pte is filled note that it is fine to delay
2054          * notification until mmu_notifier_invalidate_range_end() as we are
2055          * replacing a zero pmd write protected page with a zero pte write
2056          * protected page.
2057          *
2058          * See Documentation/mm/mmu_notifier.rst
2059          */
2060         pmdp_huge_clear_flush(vma, haddr, pmd);
2061
2062         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2063         pmd_populate(mm, &_pmd, pgtable);
2064
2065         for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2066                 pte_t *pte, entry;
2067                 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2068                 entry = pte_mkspecial(entry);
2069                 pte = pte_offset_map(&_pmd, haddr);
2070                 VM_BUG_ON(!pte_none(*pte));
2071                 set_pte_at(mm, haddr, pte, entry);
2072                 pte_unmap(pte);
2073         }
2074         smp_wmb(); /* make pte visible before pmd */
2075         pmd_populate(mm, pmd, pgtable);
2076 }
2077
2078 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2079                 unsigned long haddr, bool freeze)
2080 {
2081         struct mm_struct *mm = vma->vm_mm;
2082         struct page *page;
2083         pgtable_t pgtable;
2084         pmd_t old_pmd, _pmd;
2085         bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
2086         bool anon_exclusive = false, dirty = false;
2087         unsigned long addr;
2088         int i;
2089
2090         VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2091         VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2092         VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2093         VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
2094                                 && !pmd_devmap(*pmd));
2095
2096         count_vm_event(THP_SPLIT_PMD);
2097
2098         if (!vma_is_anonymous(vma)) {
2099                 old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
2100                 /*
2101                  * We are going to unmap this huge page. So
2102                  * just go ahead and zap it
2103                  */
2104                 if (arch_needs_pgtable_deposit())
2105                         zap_deposited_table(mm, pmd);
2106                 if (vma_is_special_huge(vma))
2107                         return;
2108                 if (unlikely(is_pmd_migration_entry(old_pmd))) {
2109                         swp_entry_t entry;
2110
2111                         entry = pmd_to_swp_entry(old_pmd);
2112                         page = pfn_swap_entry_to_page(entry);
2113                 } else {
2114                         page = pmd_page(old_pmd);
2115                         if (!PageDirty(page) && pmd_dirty(old_pmd))
2116                                 set_page_dirty(page);
2117                         if (!PageReferenced(page) && pmd_young(old_pmd))
2118                                 SetPageReferenced(page);
2119                         page_remove_rmap(page, vma, true);
2120                         put_page(page);
2121                 }
2122                 add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
2123                 return;
2124         }
2125
2126         if (is_huge_zero_pmd(*pmd)) {
2127                 /*
2128                  * FIXME: Do we want to invalidate secondary mmu by calling
2129                  * mmu_notifier_invalidate_range() see comments below inside
2130                  * __split_huge_pmd() ?
2131                  *
2132                  * We are going from a zero huge page write protected to zero
2133                  * small page also write protected so it does not seems useful
2134                  * to invalidate secondary mmu at this time.
2135                  */
2136                 return __split_huge_zero_page_pmd(vma, haddr, pmd);
2137         }
2138
2139         /*
2140          * Up to this point the pmd is present and huge and userland has the
2141          * whole access to the hugepage during the split (which happens in
2142          * place). If we overwrite the pmd with the not-huge version pointing
2143          * to the pte here (which of course we could if all CPUs were bug
2144          * free), userland could trigger a small page size TLB miss on the
2145          * small sized TLB while the hugepage TLB entry is still established in
2146          * the huge TLB. Some CPU doesn't like that.
2147          * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
2148          * 383 on page 105. Intel should be safe but is also warns that it's
2149          * only safe if the permission and cache attributes of the two entries
2150          * loaded in the two TLB is identical (which should be the case here).
2151          * But it is generally safer to never allow small and huge TLB entries
2152          * for the same virtual address to be loaded simultaneously. So instead
2153          * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
2154          * current pmd notpresent (atomically because here the pmd_trans_huge
2155          * must remain set at all times on the pmd until the split is complete
2156          * for this pmd), then we flush the SMP TLB and finally we write the
2157          * non-huge version of the pmd entry with pmd_populate.
2158          */
2159         old_pmd = pmdp_invalidate(vma, haddr, pmd);
2160
2161         pmd_migration = is_pmd_migration_entry(old_pmd);
2162         if (unlikely(pmd_migration)) {
2163                 swp_entry_t entry;
2164
2165                 entry = pmd_to_swp_entry(old_pmd);
2166                 page = pfn_swap_entry_to_page(entry);
2167                 write = is_writable_migration_entry(entry);
2168                 if (PageAnon(page))
2169                         anon_exclusive = is_readable_exclusive_migration_entry(entry);
2170                 young = is_migration_entry_young(entry);
2171                 dirty = is_migration_entry_dirty(entry);
2172                 soft_dirty = pmd_swp_soft_dirty(old_pmd);
2173                 uffd_wp = pmd_swp_uffd_wp(old_pmd);
2174         } else {
2175                 page = pmd_page(old_pmd);
2176                 if (pmd_dirty(old_pmd)) {
2177                         dirty = true;
2178                         SetPageDirty(page);
2179                 }
2180                 write = pmd_write(old_pmd);
2181                 young = pmd_young(old_pmd);
2182                 soft_dirty = pmd_soft_dirty(old_pmd);
2183                 uffd_wp = pmd_uffd_wp(old_pmd);
2184
2185                 VM_BUG_ON_PAGE(!page_count(page), page);
2186
2187                 /*
2188                  * Without "freeze", we'll simply split the PMD, propagating the
2189                  * PageAnonExclusive() flag for each PTE by setting it for
2190                  * each subpage -- no need to (temporarily) clear.
2191                  *
2192                  * With "freeze" we want to replace mapped pages by
2193                  * migration entries right away. This is only possible if we
2194                  * managed to clear PageAnonExclusive() -- see
2195                  * set_pmd_migration_entry().
2196                  *
2197                  * In case we cannot clear PageAnonExclusive(), split the PMD
2198                  * only and let try_to_migrate_one() fail later.
2199                  *
2200                  * See page_try_share_anon_rmap(): invalidate PMD first.
2201                  */
2202                 anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
2203                 if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
2204                         freeze = false;
2205                 if (!freeze)
2206                         page_ref_add(page, HPAGE_PMD_NR - 1);
2207         }
2208
2209         /*
2210          * Withdraw the table only after we mark the pmd entry invalid.
2211          * This's critical for some architectures (Power).
2212          */
2213         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2214         pmd_populate(mm, &_pmd, pgtable);
2215
2216         for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2217                 pte_t entry, *pte;
2218                 /*
2219                  * Note that NUMA hinting access restrictions are not
2220                  * transferred to avoid any possibility of altering
2221                  * permissions across VMAs.
2222                  */
2223                 if (freeze || pmd_migration) {
2224                         swp_entry_t swp_entry;
2225                         if (write)
2226                                 swp_entry = make_writable_migration_entry(
2227                                                         page_to_pfn(page + i));
2228                         else if (anon_exclusive)
2229                                 swp_entry = make_readable_exclusive_migration_entry(
2230                                                         page_to_pfn(page + i));
2231                         else
2232                                 swp_entry = make_readable_migration_entry(
2233                                                         page_to_pfn(page + i));
2234                         if (young)
2235                                 swp_entry = make_migration_entry_young(swp_entry);
2236                         if (dirty)
2237                                 swp_entry = make_migration_entry_dirty(swp_entry);
2238                         entry = swp_entry_to_pte(swp_entry);
2239                         if (soft_dirty)
2240                                 entry = pte_swp_mksoft_dirty(entry);
2241                         if (uffd_wp)
2242                                 entry = pte_swp_mkuffd_wp(entry);
2243                 } else {
2244                         entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
2245                         entry = maybe_mkwrite(entry, vma);
2246                         if (anon_exclusive)
2247                                 SetPageAnonExclusive(page + i);
2248                         if (!young)
2249                                 entry = pte_mkold(entry);
2250                         /* NOTE: this may set soft-dirty too on some archs */
2251                         if (dirty)
2252                                 entry = pte_mkdirty(entry);
2253                         /*
2254                          * NOTE: this needs to happen after pte_mkdirty,
2255                          * because some archs (sparc64, loongarch) could
2256                          * set hw write bit when mkdirty.
2257                          */
2258                         if (!write)
2259                                 entry = pte_wrprotect(entry);
2260                         if (soft_dirty)
2261                                 entry = pte_mksoft_dirty(entry);
2262                         if (uffd_wp)
2263                                 entry = pte_mkuffd_wp(entry);
2264                         page_add_anon_rmap(page + i, vma, addr, false);
2265                 }
2266                 pte = pte_offset_map(&_pmd, addr);
2267                 BUG_ON(!pte_none(*pte));
2268                 set_pte_at(mm, addr, pte, entry);
2269                 pte_unmap(pte);
2270         }
2271
2272         if (!pmd_migration)
2273                 page_remove_rmap(page, vma, true);
2274         if (freeze)
2275                 put_page(page);
2276
2277         smp_wmb(); /* make pte visible before pmd */
2278         pmd_populate(mm, pmd, pgtable);
2279 }
2280
2281 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2282                 unsigned long address, bool freeze, struct folio *folio)
2283 {
2284         spinlock_t *ptl;
2285         struct mmu_notifier_range range;
2286
2287         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
2288                                 address & HPAGE_PMD_MASK,
2289                                 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
2290         mmu_notifier_invalidate_range_start(&range);
2291         ptl = pmd_lock(vma->vm_mm, pmd);
2292
2293         /*
2294          * If caller asks to setup a migration entry, we need a folio to check
2295          * pmd against. Otherwise we can end up replacing wrong folio.
2296          */
2297         VM_BUG_ON(freeze && !folio);
2298         VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
2299
2300         if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
2301             is_pmd_migration_entry(*pmd)) {
2302                 /*
2303                  * It's safe to call pmd_page when folio is set because it's
2304                  * guaranteed that pmd is present.
2305                  */
2306                 if (folio && folio != page_folio(pmd_page(*pmd)))
2307                         goto out;
2308                 __split_huge_pmd_locked(vma, pmd, range.start, freeze);
2309         }
2310
2311 out:
2312         spin_unlock(ptl);
2313         /*
2314          * No need to double call mmu_notifier->invalidate_range() callback.
2315          * They are 3 cases to consider inside __split_huge_pmd_locked():
2316          *  1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
2317          *  2) __split_huge_zero_page_pmd() read only zero page and any write
2318          *    fault will trigger a flush_notify before pointing to a new page
2319          *    (it is fine if the secondary mmu keeps pointing to the old zero
2320          *    page in the meantime)
2321          *  3) Split a huge pmd into pte pointing to the same page. No need
2322          *     to invalidate secondary tlb entry they are all still valid.
2323          *     any further changes to individual pte will notify. So no need
2324          *     to call mmu_notifier->invalidate_range()
2325          */
2326         mmu_notifier_invalidate_range_only_end(&range);
2327 }
2328
2329 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
2330                 bool freeze, struct folio *folio)
2331 {
2332         pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
2333
2334         if (!pmd)
2335                 return;
2336
2337         __split_huge_pmd(vma, pmd, address, freeze, folio);
2338 }
2339
2340 static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
2341 {
2342         /*
2343          * If the new address isn't hpage aligned and it could previously
2344          * contain an hugepage: check if we need to split an huge pmd.
2345          */
2346         if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
2347             range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
2348                          ALIGN(address, HPAGE_PMD_SIZE)))
2349                 split_huge_pmd_address(vma, address, false, NULL);
2350 }
2351
2352 void vma_adjust_trans_huge(struct vm_area_struct *vma,
2353                              unsigned long start,
2354                              unsigned long end,
2355                              long adjust_next)
2356 {
2357         /* Check if we need to split start first. */
2358         split_huge_pmd_if_needed(vma, start);
2359
2360         /* Check if we need to split end next. */
2361         split_huge_pmd_if_needed(vma, end);
2362
2363         /*
2364          * If we're also updating the next vma vm_start,
2365          * check if we need to split it.
2366          */
2367         if (adjust_next > 0) {
2368                 struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
2369                 unsigned long nstart = next->vm_start;
2370                 nstart += adjust_next;
2371                 split_huge_pmd_if_needed(next, nstart);
2372         }
2373 }
2374
2375 static void unmap_folio(struct folio *folio)
2376 {
2377         enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
2378                 TTU_SYNC;
2379
2380         VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
2381
2382         /*
2383          * Anon pages need migration entries to preserve them, but file
2384          * pages can simply be left unmapped, then faulted back on demand.
2385          * If that is ever changed (perhaps for mlock), update remap_page().
2386          */
2387         if (folio_test_anon(folio))
2388                 try_to_migrate(folio, ttu_flags);
2389         else
2390                 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
2391 }
2392
2393 static void remap_page(struct folio *folio, unsigned long nr)
2394 {
2395         int i = 0;
2396
2397         /* If unmap_folio() uses try_to_migrate() on file, remove this check */
2398         if (!folio_test_anon(folio))
2399                 return;
2400         for (;;) {
2401                 remove_migration_ptes(folio, folio, true);
2402                 i += folio_nr_pages(folio);
2403                 if (i >= nr)
2404                         break;
2405                 folio = folio_next(folio);
2406         }
2407 }
2408
2409 static void lru_add_page_tail(struct page *head, struct page *tail,
2410                 struct lruvec *lruvec, struct list_head *list)
2411 {
2412         VM_BUG_ON_PAGE(!PageHead(head), head);
2413         VM_BUG_ON_PAGE(PageCompound(tail), head);
2414         VM_BUG_ON_PAGE(PageLRU(tail), head);
2415         lockdep_assert_held(&lruvec->lru_lock);
2416
2417         if (list) {
2418                 /* page reclaim is reclaiming a huge page */
2419                 VM_WARN_ON(PageLRU(head));
2420                 get_page(tail);
2421                 list_add_tail(&tail->lru, list);
2422         } else {
2423                 /* head is still on lru (and we have it frozen) */
2424                 VM_WARN_ON(!PageLRU(head));
2425                 if (PageUnevictable(tail))
2426                         tail->mlock_count = 0;
2427                 else
2428                         list_add_tail(&tail->lru, &head->lru);
2429                 SetPageLRU(tail);
2430         }
2431 }
2432
2433 static void __split_huge_page_tail(struct page *head, int tail,
2434                 struct lruvec *lruvec, struct list_head *list)
2435 {
2436         struct page *page_tail = head + tail;
2437
2438         VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
2439
2440         /*
2441          * Clone page flags before unfreezing refcount.
2442          *
2443          * After successful get_page_unless_zero() might follow flags change,
2444          * for example lock_page() which set PG_waiters.
2445          *
2446          * Note that for mapped sub-pages of an anonymous THP,
2447          * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
2448          * the migration entry instead from where remap_page() will restore it.
2449          * We can still have PG_anon_exclusive set on effectively unmapped and
2450          * unreferenced sub-pages of an anonymous THP: we can simply drop
2451          * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
2452          */
2453         page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
2454         page_tail->flags |= (head->flags &
2455                         ((1L << PG_referenced) |
2456                          (1L << PG_swapbacked) |
2457                          (1L << PG_swapcache) |
2458                          (1L << PG_mlocked) |
2459                          (1L << PG_uptodate) |
2460                          (1L << PG_active) |
2461                          (1L << PG_workingset) |
2462                          (1L << PG_locked) |
2463                          (1L << PG_unevictable) |
2464 #ifdef CONFIG_64BIT
2465                          (1L << PG_arch_2) |
2466 #endif
2467                          (1L << PG_dirty) |
2468                          LRU_GEN_MASK | LRU_REFS_MASK));
2469
2470         /* ->mapping in first and second tail page is replaced by other uses */
2471         VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
2472                         page_tail);
2473         page_tail->mapping = head->mapping;
2474         page_tail->index = head->index + tail;
2475
2476         /*
2477          * page->private should not be set in tail pages with the exception
2478          * of swap cache pages that store the swp_entry_t in tail pages.
2479          * Fix up and warn once if private is unexpectedly set.
2480          *
2481          * What of 32-bit systems, on which head[1].compound_pincount overlays
2482          * head[1].private?  No problem: THP_SWAP is not enabled on 32-bit, and
2483          * compound_pincount must be 0 for folio_ref_freeze() to have succeeded.
2484          */
2485         if (!folio_test_swapcache(page_folio(head))) {
2486                 VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail);
2487                 page_tail->private = 0;
2488         }
2489
2490         /* Page flags must be visible before we make the page non-compound. */
2491         smp_wmb();
2492
2493         /*
2494          * Clear PageTail before unfreezing page refcount.
2495          *
2496          * After successful get_page_unless_zero() might follow put_page()
2497          * which needs correct compound_head().
2498          */
2499         clear_compound_head(page_tail);
2500
2501         /* Finally unfreeze refcount. Additional reference from page cache. */
2502         page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
2503                                           PageSwapCache(head)));
2504
2505         if (page_is_young(head))
2506                 set_page_young(page_tail);
2507         if (page_is_idle(head))
2508                 set_page_idle(page_tail);
2509
2510         page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
2511
2512         /*
2513          * always add to the tail because some iterators expect new
2514          * pages to show after the currently processed elements - e.g.
2515          * migrate_pages
2516          */
2517         lru_add_page_tail(head, page_tail, lruvec, list);
2518 }
2519
2520 static void __split_huge_page(struct page *page, struct list_head *list,
2521                 pgoff_t end)
2522 {
2523         struct folio *folio = page_folio(page);
2524         struct page *head = &folio->page;
2525         struct lruvec *lruvec;
2526         struct address_space *swap_cache = NULL;
2527         unsigned long offset = 0;
2528         unsigned int nr = thp_nr_pages(head);
2529         int i;
2530
2531         /* complete memcg works before add pages to LRU */
2532         split_page_memcg(head, nr);
2533
2534         if (PageAnon(head) && PageSwapCache(head)) {
2535                 swp_entry_t entry = { .val = page_private(head) };
2536
2537                 offset = swp_offset(entry);
2538                 swap_cache = swap_address_space(entry);
2539                 xa_lock(&swap_cache->i_pages);
2540         }
2541
2542         /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
2543         lruvec = folio_lruvec_lock(folio);
2544
2545         ClearPageHasHWPoisoned(head);
2546
2547         for (i = nr - 1; i >= 1; i--) {
2548                 __split_huge_page_tail(head, i, lruvec, list);
2549                 /* Some pages can be beyond EOF: drop them from page cache */
2550                 if (head[i].index >= end) {
2551                         struct folio *tail = page_folio(head + i);
2552
2553                         if (shmem_mapping(head->mapping))
2554                                 shmem_uncharge(head->mapping->host, 1);
2555                         else if (folio_test_clear_dirty(tail))
2556                                 folio_account_cleaned(tail,
2557                                         inode_to_wb(folio->mapping->host));
2558                         __filemap_remove_folio(tail, NULL);
2559                         folio_put(tail);
2560                 } else if (!PageAnon(page)) {
2561                         __xa_store(&head->mapping->i_pages, head[i].index,
2562                                         head + i, 0);
2563                 } else if (swap_cache) {
2564                         __xa_store(&swap_cache->i_pages, offset + i,
2565                                         head + i, 0);
2566                 }
2567         }
2568
2569         ClearPageCompound(head);
2570         unlock_page_lruvec(lruvec);
2571         /* Caller disabled irqs, so they are still disabled here */
2572
2573         split_page_owner(head, nr);
2574
2575         /* See comment in __split_huge_page_tail() */
2576         if (PageAnon(head)) {
2577                 /* Additional pin to swap cache */
2578                 if (PageSwapCache(head)) {
2579                         page_ref_add(head, 2);
2580                         xa_unlock(&swap_cache->i_pages);
2581                 } else {
2582                         page_ref_inc(head);
2583                 }
2584         } else {
2585                 /* Additional pin to page cache */
2586                 page_ref_add(head, 2);
2587                 xa_unlock(&head->mapping->i_pages);
2588         }
2589         local_irq_enable();
2590
2591         remap_page(folio, nr);
2592
2593         if (PageSwapCache(head)) {
2594                 swp_entry_t entry = { .val = page_private(head) };
2595
2596                 split_swap_cluster(entry);
2597         }
2598
2599         for (i = 0; i < nr; i++) {
2600                 struct page *subpage = head + i;
2601                 if (subpage == page)
2602                         continue;
2603                 unlock_page(subpage);
2604
2605                 /*
2606                  * Subpages may be freed if there wasn't any mapping
2607                  * like if add_to_swap() is running on a lru page that
2608                  * had its mapping zapped. And freeing these pages
2609                  * requires taking the lru_lock so we do the put_page
2610                  * of the tail pages after the split is complete.
2611                  */
2612                 free_page_and_swap_cache(subpage);
2613         }
2614 }
2615
2616 /* Racy check whether the huge page can be split */
2617 bool can_split_folio(struct folio *folio, int *pextra_pins)
2618 {
2619         int extra_pins;
2620
2621         /* Additional pins from page cache */
2622         if (folio_test_anon(folio))
2623                 extra_pins = folio_test_swapcache(folio) ?
2624                                 folio_nr_pages(folio) : 0;
2625         else
2626                 extra_pins = folio_nr_pages(folio);
2627         if (pextra_pins)
2628                 *pextra_pins = extra_pins;
2629         return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
2630 }
2631
2632 /*
2633  * This function splits huge page into normal pages. @page can point to any
2634  * subpage of huge page to split. Split doesn't change the position of @page.
2635  *
2636  * Only caller must hold pin on the @page, otherwise split fails with -EBUSY.
2637  * The huge page must be locked.
2638  *
2639  * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
2640  *
2641  * Both head page and tail pages will inherit mapping, flags, and so on from
2642  * the hugepage.
2643  *
2644  * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
2645  * they are not mapped.
2646  *
2647  * Returns 0 if the hugepage is split successfully.
2648  * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
2649  * us.
2650  */
2651 int split_huge_page_to_list(struct page *page, struct list_head *list)
2652 {
2653         struct folio *folio = page_folio(page);
2654         struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page);
2655         XA_STATE(xas, &folio->mapping->i_pages, folio->index);
2656         struct anon_vma *anon_vma = NULL;
2657         struct address_space *mapping = NULL;
2658         int extra_pins, ret;
2659         pgoff_t end;
2660         bool is_hzp;
2661
2662         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
2663         VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
2664
2665         is_hzp = is_huge_zero_page(&folio->page);
2666         VM_WARN_ON_ONCE_FOLIO(is_hzp, folio);
2667         if (is_hzp)
2668                 return -EBUSY;
2669
2670         if (folio_test_writeback(folio))
2671                 return -EBUSY;
2672
2673         if (folio_test_anon(folio)) {
2674                 /*
2675                  * The caller does not necessarily hold an mmap_lock that would
2676                  * prevent the anon_vma disappearing so we first we take a
2677                  * reference to it and then lock the anon_vma for write. This
2678                  * is similar to folio_lock_anon_vma_read except the write lock
2679                  * is taken to serialise against parallel split or collapse
2680                  * operations.
2681                  */
2682                 anon_vma = folio_get_anon_vma(folio);
2683                 if (!anon_vma) {
2684                         ret = -EBUSY;
2685                         goto out;
2686                 }
2687                 end = -1;
2688                 mapping = NULL;
2689                 anon_vma_lock_write(anon_vma);
2690         } else {
2691                 gfp_t gfp;
2692
2693                 mapping = folio->mapping;
2694
2695                 /* Truncated ? */
2696                 if (!mapping) {
2697                         ret = -EBUSY;
2698                         goto out;
2699                 }
2700
2701                 gfp = current_gfp_context(mapping_gfp_mask(mapping) &
2702                                                         GFP_RECLAIM_MASK);
2703
2704                 if (folio_test_private(folio) &&
2705                                 !filemap_release_folio(folio, gfp)) {
2706                         ret = -EBUSY;
2707                         goto out;
2708                 }
2709
2710                 xas_split_alloc(&xas, folio, folio_order(folio), gfp);
2711                 if (xas_error(&xas)) {
2712                         ret = xas_error(&xas);
2713                         goto out;
2714                 }
2715
2716                 anon_vma = NULL;
2717                 i_mmap_lock_read(mapping);
2718
2719                 /*
2720                  *__split_huge_page() may need to trim off pages beyond EOF:
2721                  * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
2722                  * which cannot be nested inside the page tree lock. So note
2723                  * end now: i_size itself may be changed at any moment, but
2724                  * folio lock is good enough to serialize the trimming.
2725                  */
2726                 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
2727                 if (shmem_mapping(mapping))
2728                         end = shmem_fallocend(mapping->host, end);
2729         }
2730
2731         /*
2732          * Racy check if we can split the page, before unmap_folio() will
2733          * split PMDs
2734          */
2735         if (!can_split_folio(folio, &extra_pins)) {
2736                 ret = -EAGAIN;
2737                 goto out_unlock;
2738         }
2739
2740         unmap_folio(folio);
2741
2742         /* block interrupt reentry in xa_lock and spinlock */
2743         local_irq_disable();
2744         if (mapping) {
2745                 /*
2746                  * Check if the folio is present in page cache.
2747                  * We assume all tail are present too, if folio is there.
2748                  */
2749                 xas_lock(&xas);
2750                 xas_reset(&xas);
2751                 if (xas_load(&xas) != folio)
2752                         goto fail;
2753         }
2754
2755         /* Prevent deferred_split_scan() touching ->_refcount */
2756         spin_lock(&ds_queue->split_queue_lock);
2757         if (folio_ref_freeze(folio, 1 + extra_pins)) {
2758                 if (!list_empty(page_deferred_list(&folio->page))) {
2759                         ds_queue->split_queue_len--;
2760                         list_del(page_deferred_list(&folio->page));
2761                 }
2762                 spin_unlock(&ds_queue->split_queue_lock);
2763                 if (mapping) {
2764                         int nr = folio_nr_pages(folio);
2765
2766                         xas_split(&xas, folio, folio_order(folio));
2767                         if (folio_test_swapbacked(folio)) {
2768                                 __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS,
2769                                                         -nr);
2770                         } else {
2771                                 __lruvec_stat_mod_folio(folio, NR_FILE_THPS,
2772                                                         -nr);
2773                                 filemap_nr_thps_dec(mapping);
2774                         }
2775                 }
2776
2777                 __split_huge_page(page, list, end);
2778                 ret = 0;
2779         } else {
2780                 spin_unlock(&ds_queue->split_queue_lock);
2781 fail:
2782                 if (mapping)
2783                         xas_unlock(&xas);
2784                 local_irq_enable();
2785                 remap_page(folio, folio_nr_pages(folio));
2786                 ret = -EAGAIN;
2787         }
2788
2789 out_unlock:
2790         if (anon_vma) {
2791                 anon_vma_unlock_write(anon_vma);
2792                 put_anon_vma(anon_vma);
2793         }
2794         if (mapping)
2795                 i_mmap_unlock_read(mapping);
2796 out:
2797         xas_destroy(&xas);
2798         count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
2799         return ret;
2800 }
2801
2802 void free_transhuge_page(struct page *page)
2803 {
2804         struct deferred_split *ds_queue = get_deferred_split_queue(page);
2805         unsigned long flags;
2806
2807         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2808         if (!list_empty(page_deferred_list(page))) {
2809                 ds_queue->split_queue_len--;
2810                 list_del(page_deferred_list(page));
2811         }
2812         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
2813         free_compound_page(page);
2814 }
2815
2816 void deferred_split_huge_page(struct page *page)
2817 {
2818         struct deferred_split *ds_queue = get_deferred_split_queue(page);
2819 #ifdef CONFIG_MEMCG
2820         struct mem_cgroup *memcg = page_memcg(compound_head(page));
2821 #endif
2822         unsigned long flags;
2823
2824         VM_BUG_ON_PAGE(!PageTransHuge(page), page);
2825
2826         /*
2827          * The try_to_unmap() in page reclaim path might reach here too,
2828          * this may cause a race condition to corrupt deferred split queue.
2829          * And, if page reclaim is already handling the same page, it is
2830          * unnecessary to handle it again in shrinker.
2831          *
2832          * Check PageSwapCache to determine if the page is being
2833          * handled by page reclaim since THP swap would add the page into
2834          * swap cache before calling try_to_unmap().
2835          */
2836         if (PageSwapCache(page))
2837                 return;
2838
2839         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2840         if (list_empty(page_deferred_list(page))) {
2841                 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
2842                 list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
2843                 ds_queue->split_queue_len++;
2844 #ifdef CONFIG_MEMCG
2845                 if (memcg)
2846                         set_shrinker_bit(memcg, page_to_nid(page),
2847                                          deferred_split_shrinker.id);
2848 #endif
2849         }
2850         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
2851 }
2852
2853 static unsigned long deferred_split_count(struct shrinker *shrink,
2854                 struct shrink_control *sc)
2855 {
2856         struct pglist_data *pgdata = NODE_DATA(sc->nid);
2857         struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
2858
2859 #ifdef CONFIG_MEMCG
2860         if (sc->memcg)
2861                 ds_queue = &sc->memcg->deferred_split_queue;
2862 #endif
2863         return READ_ONCE(ds_queue->split_queue_len);
2864 }
2865
2866 static unsigned long deferred_split_scan(struct shrinker *shrink,
2867                 struct shrink_control *sc)
2868 {
2869         struct pglist_data *pgdata = NODE_DATA(sc->nid);
2870         struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
2871         unsigned long flags;
2872         LIST_HEAD(list), *pos, *next;
2873         struct page *page;
2874         int split = 0;
2875
2876 #ifdef CONFIG_MEMCG
2877         if (sc->memcg)
2878                 ds_queue = &sc->memcg->deferred_split_queue;
2879 #endif
2880
2881         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2882         /* Take pin on all head pages to avoid freeing them under us */
2883         list_for_each_safe(pos, next, &ds_queue->split_queue) {
2884                 page = list_entry((void *)pos, struct page, deferred_list);
2885                 page = compound_head(page);
2886                 if (get_page_unless_zero(page)) {
2887                         list_move(page_deferred_list(page), &list);
2888                 } else {
2889                         /* We lost race with put_compound_page() */
2890                         list_del_init(page_deferred_list(page));
2891                         ds_queue->split_queue_len--;
2892                 }
2893                 if (!--sc->nr_to_scan)
2894                         break;
2895         }
2896         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
2897
2898         list_for_each_safe(pos, next, &list) {
2899                 page = list_entry((void *)pos, struct page, deferred_list);
2900                 if (!trylock_page(page))
2901                         goto next;
2902                 /* split_huge_page() removes page from list on success */
2903                 if (!split_huge_page(page))
2904                         split++;
2905                 unlock_page(page);
2906 next:
2907                 put_page(page);
2908         }
2909
2910         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2911         list_splice_tail(&list, &ds_queue->split_queue);
2912         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
2913
2914         /*
2915          * Stop shrinker if we didn't split any page, but the queue is empty.
2916          * This can happen if pages were freed under us.
2917          */
2918         if (!split && list_empty(&ds_queue->split_queue))
2919                 return SHRINK_STOP;
2920         return split;
2921 }
2922
2923 static struct shrinker deferred_split_shrinker = {
2924         .count_objects = deferred_split_count,
2925         .scan_objects = deferred_split_scan,
2926         .seeks = DEFAULT_SEEKS,
2927         .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
2928                  SHRINKER_NONSLAB,
2929 };
2930
2931 #ifdef CONFIG_DEBUG_FS
2932 static void split_huge_pages_all(void)
2933 {
2934         struct zone *zone;
2935         struct page *page;
2936         unsigned long pfn, max_zone_pfn;
2937         unsigned long total = 0, split = 0;
2938
2939         pr_debug("Split all THPs\n");
2940         for_each_zone(zone) {
2941                 if (!managed_zone(zone))
2942                         continue;
2943                 max_zone_pfn = zone_end_pfn(zone);
2944                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
2945                         int nr_pages;
2946
2947                         page = pfn_to_online_page(pfn);
2948                         if (!page || !get_page_unless_zero(page))
2949                                 continue;
2950
2951                         if (zone != page_zone(page))
2952                                 goto next;
2953
2954                         if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
2955                                 goto next;
2956
2957                         total++;
2958                         lock_page(page);
2959                         nr_pages = thp_nr_pages(page);
2960                         if (!split_huge_page(page))
2961                                 split++;
2962                         pfn += nr_pages - 1;
2963                         unlock_page(page);
2964 next:
2965                         put_page(page);
2966                         cond_resched();
2967                 }
2968         }
2969
2970         pr_debug("%lu of %lu THP split\n", split, total);
2971 }
2972
2973 static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
2974 {
2975         return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
2976                     is_vm_hugetlb_page(vma);
2977 }
2978
2979 static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
2980                                 unsigned long vaddr_end)
2981 {
2982         int ret = 0;
2983         struct task_struct *task;
2984         struct mm_struct *mm;
2985         unsigned long total = 0, split = 0;
2986         unsigned long addr;
2987
2988         vaddr_start &= PAGE_MASK;
2989         vaddr_end &= PAGE_MASK;
2990
2991         /* Find the task_struct from pid */
2992         rcu_read_lock();
2993         task = find_task_by_vpid(pid);
2994         if (!task) {
2995                 rcu_read_unlock();
2996                 ret = -ESRCH;
2997                 goto out;
2998         }
2999         get_task_struct(task);
3000         rcu_read_unlock();
3001
3002         /* Find the mm_struct */
3003         mm = get_task_mm(task);
3004         put_task_struct(task);
3005
3006         if (!mm) {
3007                 ret = -EINVAL;
3008                 goto out;
3009         }
3010
3011         pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
3012                  pid, vaddr_start, vaddr_end);
3013
3014         mmap_read_lock(mm);
3015         /*
3016          * always increase addr by PAGE_SIZE, since we could have a PTE page
3017          * table filled with PTE-mapped THPs, each of which is distinct.
3018          */
3019         for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
3020                 struct vm_area_struct *vma = vma_lookup(mm, addr);
3021                 struct page *page;
3022
3023                 if (!vma)
3024                         break;
3025
3026                 /* skip special VMA and hugetlb VMA */
3027                 if (vma_not_suitable_for_thp_split(vma)) {
3028                         addr = vma->vm_end;
3029                         continue;
3030                 }
3031
3032                 /* FOLL_DUMP to ignore special (like zero) pages */
3033                 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
3034
3035                 if (IS_ERR_OR_NULL(page))
3036                         continue;
3037
3038                 if (!is_transparent_hugepage(page))
3039                         goto next;
3040
3041                 total++;
3042                 if (!can_split_folio(page_folio(page), NULL))
3043                         goto next;
3044
3045                 if (!trylock_page(page))
3046                         goto next;
3047
3048                 if (!split_huge_page(page))
3049                         split++;
3050
3051                 unlock_page(page);
3052 next:
3053                 put_page(page);
3054                 cond_resched();
3055         }
3056         mmap_read_unlock(mm);
3057         mmput(mm);
3058
3059         pr_debug("%lu of %lu THP split\n", split, total);
3060
3061 out:
3062         return ret;
3063 }
3064
3065 static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
3066                                 pgoff_t off_end)
3067 {
3068         struct filename *file;
3069         struct file *candidate;
3070         struct address_space *mapping;
3071         int ret = -EINVAL;
3072         pgoff_t index;
3073         int nr_pages = 1;
3074         unsigned long total = 0, split = 0;
3075
3076         file = getname_kernel(file_path);
3077         if (IS_ERR(file))
3078                 return ret;
3079
3080         candidate = file_open_name(file, O_RDONLY, 0);
3081         if (IS_ERR(candidate))
3082                 goto out;
3083
3084         pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
3085                  file_path, off_start, off_end);
3086
3087         mapping = candidate->f_mapping;
3088
3089         for (index = off_start; index < off_end; index += nr_pages) {
3090                 struct folio *folio = __filemap_get_folio(mapping, index,
3091                                                 FGP_ENTRY, 0);
3092
3093                 nr_pages = 1;
3094                 if (xa_is_value(folio) || !folio)
3095                         continue;
3096
3097                 if (!folio_test_large(folio))
3098                         goto next;
3099
3100                 total++;
3101                 nr_pages = folio_nr_pages(folio);
3102
3103                 if (!folio_trylock(folio))
3104                         goto next;
3105
3106                 if (!split_folio(folio))
3107                         split++;
3108
3109                 folio_unlock(folio);
3110 next:
3111                 folio_put(folio);
3112                 cond_resched();
3113         }
3114
3115         filp_close(candidate, NULL);
3116         ret = 0;
3117
3118         pr_debug("%lu of %lu file-backed THP split\n", split, total);
3119 out:
3120         putname(file);
3121         return ret;
3122 }
3123
3124 #define MAX_INPUT_BUF_SZ 255
3125
3126 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
3127                                 size_t count, loff_t *ppops)
3128 {
3129         static DEFINE_MUTEX(split_debug_mutex);
3130         ssize_t ret;
3131         /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
3132         char input_buf[MAX_INPUT_BUF_SZ];
3133         int pid;
3134         unsigned long vaddr_start, vaddr_end;
3135
3136         ret = mutex_lock_interruptible(&split_debug_mutex);
3137         if (ret)
3138                 return ret;
3139
3140         ret = -EFAULT;
3141
3142         memset(input_buf, 0, MAX_INPUT_BUF_SZ);
3143         if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
3144                 goto out;
3145
3146         input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
3147
3148         if (input_buf[0] == '/') {
3149                 char *tok;
3150                 char *buf = input_buf;
3151                 char file_path[MAX_INPUT_BUF_SZ];
3152                 pgoff_t off_start = 0, off_end = 0;
3153                 size_t input_len = strlen(input_buf);
3154
3155                 tok = strsep(&buf, ",");
3156                 if (tok) {
3157                         strcpy(file_path, tok);
3158                 } else {
3159                         ret = -EINVAL;
3160                         goto out;
3161                 }
3162
3163                 ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
3164                 if (ret != 2) {
3165                         ret = -EINVAL;
3166                         goto out;
3167                 }
3168                 ret = split_huge_pages_in_file(file_path, off_start, off_end);
3169                 if (!ret)
3170                         ret = input_len;
3171
3172                 goto out;
3173         }
3174
3175         ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
3176         if (ret == 1 && pid == 1) {
3177                 split_huge_pages_all();
3178                 ret = strlen(input_buf);
3179                 goto out;
3180         } else if (ret != 3) {
3181                 ret = -EINVAL;
3182                 goto out;
3183         }
3184
3185         ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
3186         if (!ret)
3187                 ret = strlen(input_buf);
3188 out:
3189         mutex_unlock(&split_debug_mutex);
3190         return ret;
3191
3192 }
3193
3194 static const struct file_operations split_huge_pages_fops = {
3195         .owner   = THIS_MODULE,
3196         .write   = split_huge_pages_write,
3197         .llseek  = no_llseek,
3198 };
3199
3200 static int __init split_huge_pages_debugfs(void)
3201 {
3202         debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
3203                             &split_huge_pages_fops);
3204         return 0;
3205 }
3206 late_initcall(split_huge_pages_debugfs);
3207 #endif
3208
3209 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
3210 int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
3211                 struct page *page)
3212 {
3213         struct vm_area_struct *vma = pvmw->vma;
3214         struct mm_struct *mm = vma->vm_mm;
3215         unsigned long address = pvmw->address;
3216         bool anon_exclusive;
3217         pmd_t pmdval;
3218         swp_entry_t entry;
3219         pmd_t pmdswp;
3220
3221         if (!(pvmw->pmd && !pvmw->pte))
3222                 return 0;
3223
3224         flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
3225         pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
3226
3227         /* See page_try_share_anon_rmap(): invalidate PMD first. */
3228         anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
3229         if (anon_exclusive && page_try_share_anon_rmap(page)) {
3230                 set_pmd_at(mm, address, pvmw->pmd, pmdval);
3231                 return -EBUSY;
3232         }
3233
3234         if (pmd_dirty(pmdval))
3235                 set_page_dirty(page);
3236         if (pmd_write(pmdval))
3237                 entry = make_writable_migration_entry(page_to_pfn(page));
3238         else if (anon_exclusive)
3239                 entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
3240         else
3241                 entry = make_readable_migration_entry(page_to_pfn(page));
3242         if (pmd_young(pmdval))
3243                 entry = make_migration_entry_young(entry);
3244         if (pmd_dirty(pmdval))
3245                 entry = make_migration_entry_dirty(entry);
3246         pmdswp = swp_entry_to_pmd(entry);
3247         if (pmd_soft_dirty(pmdval))
3248                 pmdswp = pmd_swp_mksoft_dirty(pmdswp);
3249         set_pmd_at(mm, address, pvmw->pmd, pmdswp);
3250         page_remove_rmap(page, vma, true);
3251         put_page(page);
3252         trace_set_migration_pmd(address, pmd_val(pmdswp));
3253
3254         return 0;
3255 }
3256
3257 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
3258 {
3259         struct vm_area_struct *vma = pvmw->vma;
3260         struct mm_struct *mm = vma->vm_mm;
3261         unsigned long address = pvmw->address;
3262         unsigned long haddr = address & HPAGE_PMD_MASK;
3263         pmd_t pmde;
3264         swp_entry_t entry;
3265
3266         if (!(pvmw->pmd && !pvmw->pte))
3267                 return;
3268
3269         entry = pmd_to_swp_entry(*pvmw->pmd);
3270         get_page(new);
3271         pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
3272         if (pmd_swp_soft_dirty(*pvmw->pmd))
3273                 pmde = pmd_mksoft_dirty(pmde);
3274         if (is_writable_migration_entry(entry))
3275                 pmde = maybe_pmd_mkwrite(pmde, vma);
3276         if (pmd_swp_uffd_wp(*pvmw->pmd))
3277                 pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
3278         if (!is_migration_entry_young(entry))
3279                 pmde = pmd_mkold(pmde);
3280         /* NOTE: this may contain setting soft-dirty on some archs */
3281         if (PageDirty(new) && is_migration_entry_dirty(entry))
3282                 pmde = pmd_mkdirty(pmde);
3283
3284         if (PageAnon(new)) {
3285                 rmap_t rmap_flags = RMAP_COMPOUND;
3286
3287                 if (!is_readable_migration_entry(entry))
3288                         rmap_flags |= RMAP_EXCLUSIVE;
3289
3290                 page_add_anon_rmap(new, vma, haddr, rmap_flags);
3291         } else {
3292                 page_add_file_rmap(new, vma, true);
3293         }
3294         VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
3295         set_pmd_at(mm, haddr, pvmw->pmd, pmde);
3296
3297         /* No need to invalidate - it was non-present before */
3298         update_mmu_cache_pmd(vma, address, pvmw->pmd);
3299         trace_remove_migration_pmd(address, pmd_val(pmde));
3300 }
3301 #endif