mm/hugetlb: handle uffd-wp during fork()
author     Peter Xu <peterx@redhat.com>
           Fri, 13 May 2022 03:22:55 +0000 (20:22 -0700)
committer  Andrew Morton <akpm@linux-foundation.org>
           Fri, 13 May 2022 14:20:11 +0000 (07:20 -0700)
First, we need to pass dst_vma into copy_hugetlb_page_range(), because for
uffd-wp it is the dst vma that matters when deciding how uffd-wp protected
ptes should be treated.
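Concretely, the copy path keys off the child's vma.  A condensed sketch of
the mm/hugetlb.c hunk below (not a verbatim quote of the patch):

	/*
	 * Keep the uffd-wp bit across fork() only if the destination
	 * (child) vma is still registered for uffd-wp; drop it
	 * otherwise so the child does not take spurious wp faults.
	 */
	if (!userfaultfd_wp(dst_vma) && huge_pte_uffd_wp(entry))
		entry = huge_pte_clear_uffd_wp(entry);
	set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);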

Second, we should recognize pte markers during fork and copy them over when
needed, i.e., only when the dst vma has uffd-wp enabled.
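From userspace, the behavior this preserves can be seen with a uffd-wp
monitor that forks.  The sketch below is illustrative only: it assumes a
kernel advertising hugetlb uffd-wp support (UFFD_FEATURE_WP_HUGETLBFS_SHMEM)
and one free 2MB hugepage, and it omits error handling and the monitor
thread that services fault and fork events:

	#include <fcntl.h>
	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		size_t sz = 2UL << 20;  /* assumes one free 2MB hugepage */
		int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

		/*
		 * EVENT_FORK keeps the child's vma registered across fork();
		 * without it the child's vma loses VM_UFFD_WP, and with this
		 * patch the copied ptes then lose their uffd-wp bits too.
		 */
		struct uffdio_api api = {
			.api = UFFD_API,
			.features = UFFD_FEATURE_EVENT_FORK |
				    UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
		};
		ioctl(uffd, UFFDIO_API, &api);

		char *buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
				 -1, 0);
		buf[0] = 1;             /* populate before write-protecting */

		struct uffdio_register reg = {
			.range = { .start = (unsigned long)buf, .len = sz },
			.mode = UFFDIO_REGISTER_MODE_WP,
		};
		ioctl(uffd, UFFDIO_REGISTER, &reg);

		struct uffdio_writeprotect wp = {
			.range = { .start = (unsigned long)buf, .len = sz },
			.mode = UFFDIO_WRITEPROTECT_MODE_WP,
		};
		ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);

		if (fork() == 0) {
			/*
			 * A write to buf[0] here would block until the
			 * (omitted) monitor thread resolves the wp fault via
			 * the child uffd delivered in the UFFD_EVENT_FORK
			 * message -- proof the uffd-wp bit survived the
			 * fork() copy.
			 */
			_exit(0);
		}
		return 0;
	}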

[lkp@intel.com: vma_needs_copy can be static]
Link: https://lkml.kernel.org/r/Ylb0CGeFJlc4EzLk@7ec4ff11d4ae
Link: https://lkml.kernel.org/r/20220405014918.14932-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/hugetlb.h
mm/hugetlb.c
mm/memory.c

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 19cec41..04f0186 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -137,7 +137,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
                             struct vm_area_struct *new_vma,
                             unsigned long old_addr, unsigned long new_addr,
                             unsigned long len);
-int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
+int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
+                           struct vm_area_struct *, struct vm_area_struct *);
 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
                         struct page **, struct vm_area_struct **,
                         unsigned long *, unsigned long *, long, unsigned int,
@@ -269,7 +270,9 @@ static inline struct page *follow_huge_addr(struct mm_struct *mm,
 }
 
 static inline int copy_hugetlb_page_range(struct mm_struct *dst,
-                       struct mm_struct *src, struct vm_area_struct *vma)
+                                         struct mm_struct *src,
+                                         struct vm_area_struct *dst_vma,
+                                         struct vm_area_struct *src_vma)
 {
        BUG();
        return 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 99281ae..01f0e2e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4719,23 +4719,24 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
 }
 
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
-                           struct vm_area_struct *vma)
+                           struct vm_area_struct *dst_vma,
+                           struct vm_area_struct *src_vma)
 {
        pte_t *src_pte, *dst_pte, entry, dst_entry;
        struct page *ptepage;
        unsigned long addr;
-       bool cow = is_cow_mapping(vma->vm_flags);
-       struct hstate *h = hstate_vma(vma);
+       bool cow = is_cow_mapping(src_vma->vm_flags);
+       struct hstate *h = hstate_vma(src_vma);
        unsigned long sz = huge_page_size(h);
        unsigned long npages = pages_per_huge_page(h);
-       struct address_space *mapping = vma->vm_file->f_mapping;
+       struct address_space *mapping = src_vma->vm_file->f_mapping;
        struct mmu_notifier_range range;
        int ret = 0;
 
        if (cow) {
-               mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
-                                       vma->vm_start,
-                                       vma->vm_end);
+               mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
+                                       src_vma->vm_start,
+                                       src_vma->vm_end);
                mmu_notifier_invalidate_range_start(&range);
                mmap_assert_write_locked(src);
                raw_write_seqcount_begin(&src->write_protect_seq);
@@ -4749,12 +4750,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                i_mmap_lock_read(mapping);
        }
 
-       for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+       for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
                spinlock_t *src_ptl, *dst_ptl;
                src_pte = huge_pte_offset(src, addr, sz);
                if (!src_pte)
                        continue;
-               dst_pte = huge_pte_alloc(dst, vma, addr, sz);
+               dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
                if (!dst_pte) {
                        ret = -ENOMEM;
                        break;
@@ -4789,6 +4790,7 @@ again:
                } else if (unlikely(is_hugetlb_entry_migration(entry) ||
                                    is_hugetlb_entry_hwpoisoned(entry))) {
                        swp_entry_t swp_entry = pte_to_swp_entry(entry);
+                       bool uffd_wp = huge_pte_uffd_wp(entry);
 
                        if (!is_readable_migration_entry(swp_entry) && cow) {
                                /*
@@ -4798,10 +4800,21 @@ again:
                                swp_entry = make_readable_migration_entry(
                                                        swp_offset(swp_entry));
                                entry = swp_entry_to_pte(swp_entry);
+                               if (userfaultfd_wp(src_vma) && uffd_wp)
+                                       entry = huge_pte_mkuffd_wp(entry);
                                set_huge_swap_pte_at(src, addr, src_pte,
                                                     entry, sz);
                        }
+                       if (!userfaultfd_wp(dst_vma) && uffd_wp)
+                               entry = huge_pte_clear_uffd_wp(entry);
                        set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+               } else if (unlikely(is_pte_marker(entry))) {
+                       /*
+                        * We copy the pte marker only if the dst vma has
+                        * uffd-wp enabled.
+                        */
+                       if (userfaultfd_wp(dst_vma))
+                               set_huge_pte_at(dst, addr, dst_pte, entry);
                } else {
                        entry = huge_ptep_get(src_pte);
                        ptepage = pte_page(entry);
@@ -4819,20 +4832,21 @@ again:
                         */
                        if (!PageAnon(ptepage)) {
                                page_dup_file_rmap(ptepage, true);
-                       } else if (page_try_dup_anon_rmap(ptepage, true, vma)) {
+                       } else if (page_try_dup_anon_rmap(ptepage, true,
+                                                         src_vma)) {
                                pte_t src_pte_old = entry;
                                struct page *new;
 
                                spin_unlock(src_ptl);
                                spin_unlock(dst_ptl);
                                /* Do not use reserve as it's private owned */
-                               new = alloc_huge_page(vma, addr, 1);
+                               new = alloc_huge_page(dst_vma, addr, 1);
                                if (IS_ERR(new)) {
                                        put_page(ptepage);
                                        ret = PTR_ERR(new);
                                        break;
                                }
-                               copy_user_huge_page(new, ptepage, addr, vma,
+                               copy_user_huge_page(new, ptepage, addr, dst_vma,
                                                    npages);
                                put_page(ptepage);
 
@@ -4842,13 +4856,13 @@ again:
                                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                                entry = huge_ptep_get(src_pte);
                                if (!pte_same(src_pte_old, entry)) {
-                                       restore_reserve_on_error(h, vma, addr,
+                                       restore_reserve_on_error(h, dst_vma, addr,
                                                                new);
                                        put_page(new);
                                        /* dst_entry won't change as in child */
                                        goto again;
                                }
-                               hugetlb_install_page(vma, dst_pte, addr, new);
+                               hugetlb_install_page(dst_vma, dst_pte, addr, new);
                                spin_unlock(src_ptl);
                                spin_unlock(dst_ptl);
                                continue;
diff --git a/mm/memory.c b/mm/memory.c
index 82adda8..f4161fb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1234,7 +1234,7 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
  * false when we can speed up fork() by allowing lazy page faults later until
  * when the child accesses the memory range.
  */
-bool
+static bool
 vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
        /*
@@ -1278,7 +1278,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
                return 0;
 
        if (is_vm_hugetlb_page(src_vma))
-               return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
+               return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
 
        if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
                /*