mm/hugetlb: fix race condition of uffd missing/minor handling
authorPeter Xu <peterx@redhat.com>
Tue, 4 Oct 2022 19:33:58 +0000 (15:33 -0400)
committerAndrew Morton <akpm@linux-foundation.org>
Thu, 13 Oct 2022 01:51:50 +0000 (18:51 -0700)
Patch series "mm/hugetlb: Fix selftest failures with write check", v3.

Currently akpm mm-unstable fails with uffd hugetlb private mapping test
randomly on a write check.

The initial bisection of that points to the recent pmd unshare series, but
it turns out there's no direction relationship with the series but only
some timing change caused the race to start trigger.

The race should be fixed in patch 1.  Patch 2 is a trivial cleanup on the
similar race with hugetlb migrations, patch 3 comment on the write check
so when anyone read it again it'll be clear why it's there.

This patch (of 3):

After the recent rework patchset of hugetlb locking on pmd sharing,
kselftest for userfaultfd sometimes fails on hugetlb private tests with
unexpected write fault checks.

It turns out there's nothing wrong within the locking series regarding
this matter, but it could have changed the timing of threads so it can
trigger an old bug.

The real bug is when we call hugetlb_no_page() we're not with the pgtable
lock.  It means we're reading the pte values lockless.  It's perfectly
fine in most cases because before we do normal page allocations we'll take
the lock and check pte_same() again.  However before that, there are
actually two paths on userfaultfd missing/minor handling that may directly
move on with the fault process without checking the pte values.

It means for these two paths we may be generating an uffd message based on
an unstable pte, while an unstable pte can legally be anything as long as
the modifier holds the pgtable lock.

One example, which is also what happened in the failing kselftest and
caused the test failure, is that for private mappings wr-protection
changes can happen on one page.  While hugetlb_change_protection()
generally requires pte being cleared before being changed, then there can
be a race condition like:

        thread 1                              thread 2
        --------                              --------

      UFFDIO_WRITEPROTECT                     hugetlb_fault
        hugetlb_change_protection
          pgtable_lock()
          huge_ptep_modify_prot_start
                                              pte==NULL
                                              hugetlb_no_page
                                                generate uffd missing event
                                                even if page existed!!
          huge_ptep_modify_prot_commit
          pgtable_unlock()

Fix this by rechecking the pte after pgtable lock for both userfaultfd
missing & minor fault paths.

This bug should have been around starting from uffd hugetlb introduced, so
attaching a Fixes to the commit.  Also attach another Fixes to the minor
support commit for easier tracking.

Note that userfaultfd is actually fine with false positives (e.g.  caused
by pte changed), but not wrong logical events (e.g.  caused by reading a
pte during changing).  The latter can confuse the userspace, so the
strictness is very much preferred.  E.g., MISSING event should never
happen on the page after UFFDIO_COPY has correctly installed the page and
returned.

Link: https://lkml.kernel.org/r/20221004193400.110155-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20221004193400.110155-2-peterx@redhat.com
Fixes: 1a1aad8a9b7b ("userfaultfd: hugetlbfs: add userfaultfd hugetlb hook")
Fixes: 7677f7fd8be7 ("userfaultfd: add minor fault registration mode")
Signed-off-by: Peter Xu <peterx@redhat.com>
Co-developed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
mm/hugetlb.c

index 9a910612336dad400ac604432b0979d5ce40ff52..bf9d8d04bf4f82760173765fe6d2da856de154a5 100644 (file)
@@ -5535,6 +5535,23 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
        return handle_userfault(&vmf, reason);
 }
 
+/*
+ * Recheck pte with pgtable lock.  Returns true if pte didn't change, or
+ * false if pte changed or is changing.
+ */
+static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
+                              pte_t *ptep, pte_t old_pte)
+{
+       spinlock_t *ptl;
+       bool same;
+
+       ptl = huge_pte_lock(h, mm, ptep);
+       same = pte_same(huge_ptep_get(ptep), old_pte);
+       spin_unlock(ptl);
+
+       return same;
+}
+
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                        struct vm_area_struct *vma,
                        struct address_space *mapping, pgoff_t idx,
@@ -5575,10 +5592,33 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                if (idx >= size)
                        goto out;
                /* Check for page in userfault range */
-               if (userfaultfd_missing(vma))
-                       return hugetlb_handle_userfault(vma, mapping, idx,
-                                                      flags, haddr, address,
-                                                      VM_UFFD_MISSING);
+               if (userfaultfd_missing(vma)) {
+                       /*
+                        * Since hugetlb_no_page() was examining pte
+                        * without pgtable lock, we need to re-test under
+                        * lock because the pte may not be stable and could
+                        * have changed from under us.  Try to detect
+                        * either changed or during-changing ptes and retry
+                        * properly when needed.
+                        *
+                        * Note that userfaultfd is actually fine with
+                        * false positives (e.g. caused by pte changed),
+                        * but not wrong logical events (e.g. caused by
+                        * reading a pte during changing).  The latter can
+                        * confuse the userspace, so the strictness is very
+                        * much preferred.  E.g., MISSING event should
+                        * never happen on the page after UFFDIO_COPY has
+                        * correctly installed the page and returned.
+                        */
+                       if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+                               ret = 0;
+                               goto out;
+                       }
+
+                       return hugetlb_handle_userfault(vma, mapping, idx, flags,
+                                                       haddr, address,
+                                                       VM_UFFD_MISSING);
+               }
 
                page = alloc_huge_page(vma, haddr, 0);
                if (IS_ERR(page)) {
@@ -5644,9 +5684,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                if (userfaultfd_minor(vma)) {
                        unlock_page(page);
                        put_page(page);
-                       return hugetlb_handle_userfault(vma, mapping, idx,
-                                                      flags, haddr, address,
-                                                      VM_UFFD_MINOR);
+                       /* See comment in userfaultfd_missing() block above */
+                       if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+                               ret = 0;
+                               goto out;
+                       }
+                       return hugetlb_handle_userfault(vma, mapping, idx, flags,
+                                                       haddr, address,
+                                                       VM_UFFD_MINOR);
                }
        }