mm/mprotect: do not flush when not required architecturally
author Nadav Amit <namit@vmware.com>
Tue, 10 May 2022 01:20:50 +0000 (18:20 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Fri, 13 May 2022 14:20:05 +0000 (07:20 -0700)
Currently, using mprotect() or uffd to unprotect a memory region causes a
TLB flush.  However, in such cases the PTE is often not modified (i.e.,
it remains RO) and therefore no TLB flush is needed.

Add an arch-specific pte_needs_flush() which tells whether a TLB flush is
needed based on the old PTE and the new one.  Implement an x86
pte_needs_flush().

Always flush the TLB when it is architecturally needed, even when skipping
the flush would only result in spurious page faults.

Even with this conservative approach, we can in the future further refine
the checks to test whether a PTE is present by only considering the
architectural _PAGE_PRESENT flag instead of {pte|pmd}_present().  For now,
be careful and use the latter.

Link: https://lkml.kernel.org/r/20220401180821.1986781-3-namit@vmware.com
Signed-off-by: Nadav Amit <namit@vmware.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Nick Piggin <npiggin@gmail.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/tlbflush.h
include/asm-generic/tlb.h
mm/huge_memory.c
mm/mprotect.c

index 40497a9..8668bc6 100644 (file)
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX       (_AT(pteval_t, 1) << _PAGE_BIT_NX)
 #define _PAGE_DEVMAP   (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
+#define _PAGE_SOFTW4   (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4)
 #else
 #define _PAGE_NX       (_AT(pteval_t, 0))
 #define _PAGE_DEVMAP   (_AT(pteval_t, 0))
+#define _PAGE_SOFTW4   (_AT(pteval_t, 0))
 #endif
 
 #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
index 98fa0a1..4af5579 100644 (file)
@@ -259,6 +259,103 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
 
 extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 
+/*
+ * pte_flags_need_flush() - true if changing PTE/PMD flags from @oldflags to
+ * @newflags requires a TLB flush. Flags are pteval_t so that bits above bit 31
+ * (e.g. _PAGE_NX with PAE) are not lost to truncation.
+ */
+static inline bool pte_flags_need_flush(pteval_t oldflags, pteval_t newflags,
+                                       bool ignore_access)
+{
+       /*
+        * Flags that require a flush when cleared but not when they are set.
+        * Only include flags that would not trigger spurious page-faults.
+        * Non-present entries are not cached. Hardware would set the
+        * dirty/access bit if needed without a fault.
+        */
+       const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT |
+                                       _PAGE_ACCESSED;
+       const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 |
+                                       _PAGE_SOFTW3 | _PAGE_SOFTW4;
+       const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT |
+                         _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT |
+                         _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 |
+                         _PAGE_PKEY_BIT2 | _PAGE_PKEY_BIT3 | _PAGE_NX;
+       pteval_t diff = oldflags ^ newflags;
+
+       BUILD_BUG_ON(flush_on_clear & software_flags);
+       BUILD_BUG_ON(flush_on_clear & flush_on_change);
+       BUILD_BUG_ON(flush_on_change & software_flags);
+
+       /* Software flags never matter; @ignore_access also masks the A bit. */
+       diff &= ~software_flags;
+       if (ignore_access)
+               diff &= ~_PAGE_ACCESSED;
+
+       /* Was a 'flush_on_clear' flag set in 'oldflags' but not 'newflags'? */
+       if (diff & oldflags & flush_on_clear)
+               return true;
+
+       /* Flush on modified flags. */
+       if (diff & flush_on_change)
+               return true;
+
+       /* Ensure there are no flags that were left behind */
+       if (IS_ENABLED(CONFIG_DEBUG_VM) &&
+           (diff & ~(flush_on_clear | software_flags | flush_on_change))) {
+               VM_WARN_ON_ONCE(1);
+               return true;
+       }
+
+       return false;
+}
+
+/*
+ * pte_needs_flush() checks whether permissions were demoted and require a
+ * flush. It should only be used for userspace PTEs.
+ */
+static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
+{
+       /* !PRESENT -> * ; no need for flush */
+       if (!(pte_flags(oldpte) & _PAGE_PRESENT))
+               return false;
+
+       /* PFN changed ; needs flush */
+       if (pte_pfn(oldpte) != pte_pfn(newpte))
+               return true;
+
+       /*
+        * check PTE flags; ignore access-bit; see comment in
+        * ptep_clear_flush_young().
+        */
+       return pte_flags_need_flush(pte_flags(oldpte), pte_flags(newpte),
+                                   true);
+}
+#define pte_needs_flush pte_needs_flush
+
+/*
+ * huge_pmd_needs_flush() checks whether permissions were demoted and require a
+ * flush. It should only be used for userspace huge PMDs.
+ */
+static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
+{
+       /* !PRESENT -> * ; no need for flush */
+       if (!(pmd_flags(oldpmd) & _PAGE_PRESENT))
+               return false;
+
+       /* PFN changed ; needs flush */
+       if (pmd_pfn(oldpmd) != pmd_pfn(newpmd))
+               return true;
+
+       /*
+        * check PMD flags; do not ignore access-bit; see
+        * pmdp_clear_flush_young().
+        */
+       return pte_flags_need_flush(pmd_flags(oldpmd), pmd_flags(newpmd),
+                                   false);
+}
+#define huge_pmd_needs_flush huge_pmd_needs_flush
+
 #endif /* !MODULE */
 
 static inline void __native_tlb_flush_global(unsigned long cr4)
index eee6f77..ff3e825 100644 (file)
@@ -658,6 +658,20 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
        } while (0)
 #endif
 
+#ifndef pte_needs_flush /* fallback when no pte_needs_flush macro is defined */
+static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
+{
+       return true;    /* unconditionally report that a flush is needed */
+}
+#endif /* !pte_needs_flush */
+
+#ifndef huge_pmd_needs_flush /* fallback when no such macro is defined */
+static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
+{
+       return true;    /* unconditionally report that a flush is needed */
+}
+#endif /* !huge_pmd_needs_flush */
+
 #endif /* CONFIG_MMU */
 
 #endif /* _ASM_GENERIC__TLB_H */
index 8db17c0..2befa9c 100644 (file)
@@ -1715,7 +1715,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 {
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
-       pmd_t entry;
+       pmd_t oldpmd, entry;
        bool preserve_write;
        int ret;
        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
@@ -1804,9 +1804,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
         * pmdp_invalidate() is required to make sure we don't miss
         * dirty/young flags set by hardware.
         */
-       entry = pmdp_invalidate(vma, addr, pmd);
+       oldpmd = pmdp_invalidate(vma, addr, pmd);
 
-       entry = pmd_modify(entry, newprot);
+       entry = pmd_modify(oldpmd, newprot);
        if (preserve_write)
                entry = pmd_mk_savedwrite(entry);
        if (uffd_wp) {
@@ -1823,7 +1823,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        ret = HPAGE_PMD_NR;
        set_pmd_at(mm, addr, pmd, entry);
 
-       tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
+       if (huge_pmd_needs_flush(oldpmd, entry))
+               tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
 
        BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
 unlock:
index 420be02..20a46f2 100644 (file)
@@ -152,7 +152,8 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
                                ptent = pte_mkwrite(ptent);
                        }
                        ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
-                       tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
+                       if (pte_needs_flush(oldpte, ptent))
+                               tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
                        pages++;
                } else if (is_swap_pte(oldpte)) {
                        swp_entry_t entry = pte_to_swp_entry(oldpte);