userfaultfd: wp: support swap and page migration
author Peter Xu <peterx@redhat.com>
Tue, 7 Apr 2020 03:06:01 +0000 (20:06 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 7 Apr 2020 17:43:39 +0000 (10:43 -0700)
For both swap and page migration, bit 2 of the entry is used to mark
whether the entry is uffd write-protected.  It plays a role similar to
the existing soft-dirty bit in swap entries, but it only keeps the
uffd-wp tracking for a specific PTE/PMD.
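
For reference, a rough sketch of how the x86 helpers from earlier in
this series carry that bit (assuming _PAGE_SWP_UFFD_WP aliases bit 2 of
the swap entry; the authoritative definitions live in the x86 pgtable
headers):

    /* Sketch: set/test/clear the uffd-wp bit while the PTE holds a swap entry. */
    static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
    {
            return pte_set_flags(pte, _PAGE_SWP_UFFD_WP);
    }

    static inline int pte_swp_uffd_wp(pte_t pte)
    {
            return pte_flags(pte) & _PAGE_SWP_UFFD_WP;
    }

    static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
    {
            return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);
    }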

One special thing here is that when we recover the uffd-wp bit from a
swap/migration entry back into the PTE, we also need to take care of
the _PAGE_RW bit and make sure it is cleared; otherwise, even with the
_PAGE_UFFD_WP bit set, the write cannot be trapped at all.
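
Concretely, the swap-in path needs to do roughly the following when it
rebuilds the present PTE (this mirrors the do_swap_page() hunk below;
"pte" is the new present PTE being built from vmf->orig_pte):

    if (pte_swp_uffd_wp(vmf->orig_pte)) {
            /* Re-arm the uffd-wp trap on the new present PTE... */
            pte = pte_mkuffd_wp(pte);
            /* ...and drop write permission, or the next write won't fault. */
            pte = pte_wrprotect(pte);
    }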

Previously, change_pte_range() did nothing for uffd when the PTE was a
swap entry.  That can lead to data mismatch if the page we are about to
write protect is swapped out when UFFDIO_WRITEPROTECT is sent.  This
patch also applies/removes the uffd-wp bit for swap entries.
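
From userspace, write-protecting a range therefore takes effect even
when parts of it are swapped out.  A minimal sketch of such a request
(hypothetical helper name; assumes the region was registered with
UFFDIO_REGISTER_MODE_WP, error handling trimmed):

    #include <err.h>
    #include <sys/ioctl.h>
    #include <linux/userfaultfd.h>

    /* Ask the kernel to uffd-write-protect [addr, addr + len). */
    static void wp_range(int uffd, void *addr, unsigned long len)
    {
            struct uffdio_writeprotect wp = {
                    .range = { .start = (unsigned long)addr, .len = len },
                    .mode  = UFFDIO_WRITEPROTECT_MODE_WP,
            };

            if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp))
                    err(1, "UFFDIO_WRITEPROTECT");
    }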

Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Bobby Powers <bobbypowers@gmail.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Denis Plotnikov <dplotnikov@virtuozzo.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
Cc: Martin Cracauer <cracauer@cons.org>
Cc: Marty McFadden <mcfadden8@llnl.gov>
Cc: Maya Gokhale <gokhale2@llnl.gov>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@fb.com>
Link: http://lkml.kernel.org/r/20200220163112.11409-11-peterx@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/swapops.h
mm/huge_memory.c
mm/memory.c
mm/migrate.c
mm/mprotect.c
mm/rmap.c

index 877fd239b6fff261c74bfcb5a5fd125de3d7e741..9a6f06de183bec1065402870470635ca0983b60e 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -68,6 +68,8 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte)
 
        if (pte_swp_soft_dirty(pte))
                pte = pte_swp_clear_soft_dirty(pte);
+       if (pte_swp_uffd_wp(pte))
+               pte = pte_swp_clear_uffd_wp(pte);
        arch_entry = __pte_to_swp_entry(pte);
        return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
 }
index 8164787cd51f29d568ad3202544b126dc8dbed07..6ecd1045113b538586e87a00aec4022cf500c1f1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2297,6 +2297,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                write = is_write_migration_entry(entry);
                young = false;
                soft_dirty = pmd_swp_soft_dirty(old_pmd);
+               uffd_wp = pmd_swp_uffd_wp(old_pmd);
        } else {
                page = pmd_page(old_pmd);
                if (pmd_dirty(old_pmd))
@@ -2329,6 +2330,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                        entry = swp_entry_to_pte(swp_entry);
                        if (soft_dirty)
                                entry = pte_swp_mksoft_dirty(entry);
+                       if (uffd_wp)
+                               entry = pte_swp_mkuffd_wp(entry);
                } else {
                        entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
                        entry = maybe_mkwrite(entry, vma);
index f8b1969669b7bdf1bc5847fc6cfdc22b7f0da404..8ac9af73e9d2356c1a5f5f675a38de36bf9cb6b1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -733,6 +733,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                                pte = swp_entry_to_pte(entry);
                                if (pte_swp_soft_dirty(*src_pte))
                                        pte = pte_swp_mksoft_dirty(pte);
+                               if (pte_swp_uffd_wp(*src_pte))
+                                       pte = pte_swp_mkuffd_wp(pte);
                                set_pte_at(src_mm, addr, src_pte, pte);
                        }
                } else if (is_device_private_entry(entry)) {
@@ -762,6 +764,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                            is_cow_mapping(vm_flags)) {
                                make_device_private_entry_read(&entry);
                                pte = swp_entry_to_pte(entry);
+                               if (pte_swp_uffd_wp(*src_pte))
+                                       pte = pte_swp_mkuffd_wp(pte);
                                set_pte_at(src_mm, addr, src_pte, pte);
                        }
                }
@@ -3098,6 +3102,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        flush_icache_page(vma, page);
        if (pte_swp_soft_dirty(vmf->orig_pte))
                pte = pte_mksoft_dirty(pte);
+       if (pte_swp_uffd_wp(vmf->orig_pte)) {
+               pte = pte_mkuffd_wp(pte);
+               pte = pte_wrprotect(pte);
+       }
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
        arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
        vmf->orig_pte = pte;
index c1412e04975e1bb77ae7f1eaf1ccb97e020622d1..7160c1556f797fa961b77065e47e57e628ebf3de 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -243,11 +243,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
                entry = pte_to_swp_entry(*pvmw.pte);
                if (is_write_migration_entry(entry))
                        pte = maybe_mkwrite(pte, vma);
+               else if (pte_swp_uffd_wp(*pvmw.pte))
+                       pte = pte_mkuffd_wp(pte);
 
                if (unlikely(is_zone_device_page(new))) {
                        if (is_device_private_page(new)) {
                                entry = make_device_private_entry(new, pte_write(pte));
                                pte = swp_entry_to_pte(entry);
+                               if (pte_swp_uffd_wp(*pvmw.pte))
+                                       pte = pte_mkuffd_wp(pte);
                        }
                }
 
@@ -2338,6 +2342,8 @@ again:
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pte))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                       if (pte_uffd_wp(pte))
+                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, addr, ptep, swp_pte);
 
                        /*
index e4fa41a24bec91daee6868d894ff6ea1d536d666..1d823b0503299b79ea5ac602796994c335bb5520 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -139,11 +139,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        }
                        ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
                        pages++;
-               } else if (IS_ENABLED(CONFIG_MIGRATION)) {
+               } else if (is_swap_pte(oldpte)) {
                        swp_entry_t entry = pte_to_swp_entry(oldpte);
+                       pte_t newpte;
 
                        if (is_write_migration_entry(entry)) {
-                               pte_t newpte;
                                /*
                                 * A protection check is difficult so
                                 * just be safe and disable write
@@ -152,22 +152,28 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                newpte = swp_entry_to_pte(entry);
                                if (pte_swp_soft_dirty(oldpte))
                                        newpte = pte_swp_mksoft_dirty(newpte);
-                               set_pte_at(vma->vm_mm, addr, pte, newpte);
-
-                               pages++;
-                       }
-
-                       if (is_write_device_private_entry(entry)) {
-                               pte_t newpte;
-
+                               if (pte_swp_uffd_wp(oldpte))
+                                       newpte = pte_swp_mkuffd_wp(newpte);
+                       } else if (is_write_device_private_entry(entry)) {
                                /*
                                 * We do not preserve soft-dirtiness. See
                                 * copy_one_pte() for explanation.
                                 */
                                make_device_private_entry_read(&entry);
                                newpte = swp_entry_to_pte(entry);
-                               set_pte_at(vma->vm_mm, addr, pte, newpte);
+                               if (pte_swp_uffd_wp(oldpte))
+                                       newpte = pte_swp_mkuffd_wp(newpte);
+                       } else {
+                               newpte = oldpte;
+                       }
 
+                       if (uffd_wp)
+                               newpte = pte_swp_mkuffd_wp(newpte);
+                       else if (uffd_wp_resolve)
+                               newpte = pte_swp_clear_uffd_wp(newpte);
+
+                       if (!pte_same(oldpte, newpte)) {
+                               set_pte_at(vma->vm_mm, addr, pte, newpte);
                                pages++;
                        }
                }
index 374a9bfdbffa0e54b44251c0e24d49a14f5d21c3..ed8889bf4ede842b7515ffaa64384d48a620419d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1502,6 +1502,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                       if (pte_uffd_wp(pteval))
+                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
                        /*
                         * No need to invalidate here it will synchronize on
@@ -1601,6 +1603,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                       if (pte_uffd_wp(pteval))
+                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                        /*
                         * No need to invalidate here it will synchronize on
@@ -1667,6 +1671,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                       if (pte_uffd_wp(pteval))
+                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                        /* Invalidate as we cleared the pte */
                        mmu_notifier_invalidate_range(mm, address,