mm, gup: prevent pmd checking race in follow_pmd_mask()
author: Huang Ying <ying.huang@intel.com>
Fri, 8 Jun 2018 00:06:34 +0000 (17:06 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Fri, 8 Jun 2018 00:34:35 +0000 (17:34 -0700)
mmap_sem will be read locked when calling follow_pmd_mask().  But this
cannot prevent PMD from being changed for all cases when PTL is
unlocked, for example, from pmd_trans_huge() to pmd_none() via
MADV_DONTNEED.  So it is possible for the pmd_present() check in
follow_pmd_mask() to encounter an invalid PMD.  This may cause an
incorrect VM_BUG_ON() or an infinite loop.  Fix this by reading the PMD
entry into a local variable with READ_ONCE() and checking the local
variable and pmd_none() in the retry loop.

As Kirill pointed out, with PTL unlocked, the *pmd may be changed under
us, so reading it directly again and again may incur weird bugs.  So
although using *pmd directly other than for pmd_present() checking may
be safe, it is still better to replace them to read *pmd once and check
the local variable multiple times.

Replacing all uses of *pmd with the local variable while the PTL is
unlocked was suggested by Kirill.

Link: http://lkml.kernel.org/r/20180419083514.1365-1-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Zi Yan <zi.yan@cs.rutgers.edu>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
mm/gup.c

index 0101539..1020c7f 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -212,53 +212,69 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
                                    unsigned long address, pud_t *pudp,
                                    unsigned int flags, unsigned int *page_mask)
 {
-       pmd_t *pmd;
+       pmd_t *pmd, pmdval;
        spinlock_t *ptl;
        struct page *page;
        struct mm_struct *mm = vma->vm_mm;
 
        pmd = pmd_offset(pudp, address);
-       if (pmd_none(*pmd))
+       /*
+        * The READ_ONCE() will stabilize the pmdval in a register or
+        * on the stack so that it will stop changing under the code.
+        */
+       pmdval = READ_ONCE(*pmd);
+       if (pmd_none(pmdval))
                return no_page_table(vma, flags);
-       if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
+       if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
                page = follow_huge_pmd(mm, address, pmd, flags);
                if (page)
                        return page;
                return no_page_table(vma, flags);
        }
-       if (is_hugepd(__hugepd(pmd_val(*pmd)))) {
+       if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
                page = follow_huge_pd(vma, address,
-                                     __hugepd(pmd_val(*pmd)), flags,
+                                     __hugepd(pmd_val(pmdval)), flags,
                                      PMD_SHIFT);
                if (page)
                        return page;
                return no_page_table(vma, flags);
        }
 retry:
-       if (!pmd_present(*pmd)) {
+       if (!pmd_present(pmdval)) {
                if (likely(!(flags & FOLL_MIGRATION)))
                        return no_page_table(vma, flags);
                VM_BUG_ON(thp_migration_supported() &&
-                                 !is_pmd_migration_entry(*pmd));
-               if (is_pmd_migration_entry(*pmd))
+                                 !is_pmd_migration_entry(pmdval));
+               if (is_pmd_migration_entry(pmdval))
                        pmd_migration_entry_wait(mm, pmd);
+               pmdval = READ_ONCE(*pmd);
+               /*
+                * MADV_DONTNEED may convert the pmd to null because
+                * mmap_sem is held in read mode
+                */
+               if (pmd_none(pmdval))
+                       return no_page_table(vma, flags);
                goto retry;
        }
-       if (pmd_devmap(*pmd)) {
+       if (pmd_devmap(pmdval)) {
                ptl = pmd_lock(mm, pmd);
                page = follow_devmap_pmd(vma, address, pmd, flags);
                spin_unlock(ptl);
                if (page)
                        return page;
        }
-       if (likely(!pmd_trans_huge(*pmd)))
+       if (likely(!pmd_trans_huge(pmdval)))
                return follow_page_pte(vma, address, pmd, flags);
 
-       if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
+       if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
                return no_page_table(vma, flags);
 
 retry_locked:
        ptl = pmd_lock(mm, pmd);
+       if (unlikely(pmd_none(*pmd))) {
+               spin_unlock(ptl);
+               return no_page_table(vma, flags);
+       }
        if (unlikely(!pmd_present(*pmd))) {
                spin_unlock(ptl);
                if (likely(!(flags & FOLL_MIGRATION)))