mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries

author Mel Gorman <mgorman@suse.de>

Wed, 2 Aug 2017 20:31:52 +0000 (13:31 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 2 Aug 2017 23:34:46 +0000 (16:34 -0700)
author Mel Gorman <mgorman@suse.de>
Wed, 2 Aug 2017 20:31:52 +0000 (13:31 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 2 Aug 2017 23:34:46 +0000 (16:34 -0700)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index ff15181..7f384bb 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -495,6 +495,10 @@ struct mm_struct {
          */
         bool tlb_flush_pending;
  #endif
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+       /* See flush_tlb_batched_pending() */
+       bool tlb_flush_batched;
+#endif
         struct uprobes_state uprobes_state;
  #ifdef CONFIG_HUGETLB_PAGE
         atomic_long_t hugetlb_usage;
diff --git a/mm/internal.h b/mm/internal.h

index 24d88f0..4ef49fc 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -498,6 +498,7 @@ extern struct workqueue_struct *mm_percpu_wq;
  #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
  void try_to_unmap_flush(void);
  void try_to_unmap_flush_dirty(void);
+void flush_tlb_batched_pending(struct mm_struct *mm);
  #else
  static inline void try_to_unmap_flush(void)
  {
@@ -505,7 +506,9 @@ static inline void try_to_unmap_flush(void)
  static inline void try_to_unmap_flush_dirty(void)
  {
  }
-
+static inline void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+}
  #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
  
  extern const struct trace_print_flags pageflag_names[];
diff --git a/mm/madvise.c b/mm/madvise.c

index 9976852..47d8d8a 100644 (file)
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -320,6 +320,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
  
         tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
         orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+       flush_tlb_batched_pending(mm);
         arch_enter_lazy_mmu_mode();
         for (; addr != end; pte++, addr += PAGE_SIZE) {
                 ptent = *pte;
diff --git a/mm/memory.c b/mm/memory.c

index 0e517be..f65beaa 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1197,6 +1197,7 @@ again:
         init_rss_vec(rss);
         start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
         pte = start_pte;
+       flush_tlb_batched_pending(mm);
         arch_enter_lazy_mmu_mode();
         do {
                 pte_t ptent = *pte;
diff --git a/mm/mprotect.c b/mm/mprotect.c

index 1a8c9ca..4180ad8 100644 (file)
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -64,6 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
             atomic_read(&vma->vm_mm->mm_users) == 1)
                 target_node = numa_node_id();
  
+       flush_tlb_batched_pending(vma->vm_mm);
         arch_enter_lazy_mmu_mode();
         do {
                 oldpte = *pte;
diff --git a/mm/mremap.c b/mm/mremap.c

index cd8a1b1..6e3d857 100644 (file)
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -152,6 +152,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
         new_ptl = pte_lockptr(mm, new_pmd);
         if (new_ptl != old_ptl)
                 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+       flush_tlb_batched_pending(vma->vm_mm);
         arch_enter_lazy_mmu_mode();
  
         for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
diff --git a/mm/rmap.c b/mm/rmap.c

index ced14f1..c8993c6 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -605,6 +605,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
         tlb_ubc->flush_required = true;
  
         /*
+        * Ensure compiler does not re-order the setting of tlb_flush_batched
+        * before the PTE is cleared.
+        */
+       barrier();
+       mm->tlb_flush_batched = true;
+
+       /*
          * If the PTE was dirty then it's best to assume it's writable. The
          * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
          * before the page is queued for IO.
@@ -631,6 +638,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
  
         return should_defer;
  }
+
+/*
+ * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the page. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and munmap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+       if (mm->tlb_flush_batched) {
+               flush_tlb_mm(mm);
+
+               /*
+                * Do not allow the compiler to re-order the clearing of
+                * tlb_flush_batched before the tlb is flushed.
+                */
+               barrier();
+               mm->tlb_flush_batched = false;
+       }
+}
  #else
  static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
  {
author	Mel Gorman <mgorman@suse.de>
	Wed, 2 Aug 2017 20:31:52 +0000 (13:31 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 2 Aug 2017 23:34:46 +0000 (16:34 -0700)
include/linux/mm_types.h		patch \| blob \| history
mm/internal.h		patch \| blob \| history
mm/madvise.c		patch \| blob \| history
mm/memory.c		patch \| blob \| history
mm/mprotect.c		patch \| blob \| history
mm/mremap.c		patch \| blob \| history
mm/rmap.c		patch \| blob \| history