/* mm's last user has gone, and it's about to be pulled down */
mmu_notifier_release(mm);
+ if (unlikely(mm_is_oom_victim(mm))) {
+ /*
+ * Manually reap the mm to free as much memory as possible.
+ * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
+ * this mm from further consideration. Taking mm->mmap_sem for
+ * write after setting MMF_OOM_SKIP will guarantee that the oom
+ * reaper will not run on this mm again after mmap_sem is
+ * dropped.
+ *
+ * Nothing can be holding mm->mmap_sem here and the above call
+ * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
+ * __oom_reap_task_mm() will not block.
+ *
+ * This needs to be done before calling munlock_vma_pages_all(),
+ * which clears VM_LOCKED, otherwise the oom reaper cannot
+ * reliably test it.
+ */
+ mutex_lock(&oom_lock);
+ __oom_reap_task_mm(mm);
+ mutex_unlock(&oom_lock);
+
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ }
+
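For reference, the handshake above pairs with the reaper side roughly as follows. This is a condensed sketch of oom_reap_task_mm() under the assumptions of this patch (oom_lock, the blockable mmu-notifier check, tracing and the rss reporting are omitted); the "_sketch" name is made up and it is not the verbatim function:

static bool oom_reap_task_mm_sketch(struct mm_struct *mm)
{
	/* the reaper only touches the mm with mmap_sem held for read */
	if (!down_read_trylock(&mm->mmap_sem))
		return false;	/* exit path holds it; oom_reap_task() retries */

	/*
	 * MMF_OOM_SKIP is tested under mmap_sem, so it serializes against
	 * the set_bit() + down_write()/up_write() cycle in exit_mmap():
	 * either the reaper got in before the bit was set (and exit_mmap()
	 * waits for it at down_write()), or it sees the bit and backs off.
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
		up_read(&mm->mmap_sem);
		return true;
	}

	__oom_reap_task_mm(mm);
	up_read(&mm->mmap_sem);
	return true;
}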
if (mm->locked_vm) {
vma = mm->mmap;
while (vma) {
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
-
- if (unlikely(mm_is_oom_victim(mm))) {
- /*
- * Wait for oom_reap_task() to stop working on this
- * mm. Because MMF_OOM_SKIP is already set before
- * calling down_read(), oom_reap_task() will not run
- * on this "mm" post up_write().
- *
- * mm_is_oom_victim() cannot be set from under us
- * either because victim->mm is already set to NULL
- * under task_lock before calling mmput and oom_mm is
- * set not NULL by the OOM killer only if victim->mm
- * is found not NULL while holding the task_lock.
- */
- set_bit(MMF_OOM_SKIP, &mm->flags);
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
- }
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
return false;
}
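The VM_LOCKED ordering called out in the exit_mmap() comment above matters because __oom_reap_task_mm() (below) skips any VMA rejected by can_madv_dontneed_vma(). For context only, and paraphrased from mm/internal.h of this era (not changed by this patch), that helper is approximately:

static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
{
	/* mlocked, hugetlb and PFN-mapped VMAs are never reaped */
	return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
}

If munlock_vma_pages_all() had already cleared VM_LOCKED, the reaper could start unmapping an mlocked VMA while its pages were still being munlocked, which is the race this reordering avoids.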
-
#ifdef CONFIG_MMU
/*
* OOM Reaper kernel thread which tries to reap the memory used by the OOM
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+void __oom_reap_task_mm(struct mm_struct *mm)
{
- struct mmu_gather tlb;
struct vm_area_struct *vma;
+
+ /*
+ * Tell all users of get_user/copy_from_user etc... that the content
+ * is no longer stable. No barriers really needed because unmapping
+ * should imply barriers already and the reader would hit a page fault
+ * if it stumbled over a reaped memory.
+ */
+ set_bit(MMF_UNSTABLE, &mm->flags);
+
+ for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+ if (!can_madv_dontneed_vma(vma))
+ continue;
+
+ /*
+ * Only anonymous pages have a good chance to be dropped
+ * without additional steps which we cannot afford as we
+ * are OOM already.
+ *
+ * We do not even care about fs backed pages because all
+ * which are reclaimable have already been reclaimed and
+ * we do not want to block exit_mmap by keeping mm ref
+ * count elevated without a good reason.
+ */
+ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
+ unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
+ NULL);
+ tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+ }
+ }
+}
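MMF_UNSTABLE set above is consumed on the page-fault side. A hedged sketch of that consumer, paraphrased from the tail of handle_mm_fault() in kernels of this era (the exact condition may differ, and check_stable_sketch() is a made-up wrapper name):

static int check_stable_sketch(struct vm_area_struct *vma, int ret)
{
	/*
	 * After the mm has been reaped a refault cannot be trusted: an
	 * anonymous refault would silently hand back a zero page. A regular
	 * task is about to die anyway, but a kthread using the mm via
	 * use_mm() would outlive the victim and propagate corrupted data,
	 * so it gets SIGBUS instead.
	 */
	if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) &&
		     test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
		ret = VM_FAULT_SIGBUS;
	return ret;
}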
+
+static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+{
bool ret = true;
/*
* We have to make sure to not race with the victim exit path
* and cause premature new oom victim selection:
- * __oom_reap_task_mm          exit_mm
+ * oom_reap_task_mm            exit_mm
*   mmget_not_zero
*                                 mmput
*                                   atomic_dec_and_test
trace_start_task_reaping(tsk->pid);
- /*
- * Tell all users of get_user/copy_from_user etc... that the content
- * is no longer stable. No barriers really needed because unmapping
- * should imply barriers already and the reader would hit a page fault
- * if it stumbled over a reaped memory.
- */
- set_bit(MMF_UNSTABLE, &mm->flags);
-
- for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (!can_madv_dontneed_vma(vma))
- continue;
+ __oom_reap_task_mm(mm);
- /*
- * Only anonymous pages have a good chance to be dropped
- * without additional steps which we cannot afford as we
- * are OOM already.
- *
- * We do not even care about fs backed pages because all
- * which are reclaimable have already been reclaimed and
- * we do not want to block exit_mmap by keeping mm ref
- * count elevated without a good reason.
- */
- if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
- tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
- unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
- NULL);
- tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
- }
- }
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(tsk), tsk->comm,
K(get_mm_counter(mm, MM_ANONPAGES)),
struct mm_struct *mm = tsk->signal->oom_mm;
/* Retry the down_read_trylock(mmap_sem) a few times */
- while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
+ while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
schedule_timeout_idle(HZ/10);
if (attempts <= MAX_OOM_REAP_RETRIES)
goto done;
-
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
debug_show_all_locks();
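Putting the two call sites together, the serialization can be summarized as below; this is a condensed view under the same assumptions as the sketches above, not verbatim code:

/*
 *   exit_mmap()                         oom_reap_task_mm()
 *   -----------                         ------------------
 *   mutex_lock(&oom_lock)               mutex_lock(&oom_lock)
 *   __oom_reap_task_mm(mm)              down_read_trylock(&mm->mmap_sem)?
 *   mutex_unlock(&oom_lock)               -> no: retry later
 *   set_bit(MMF_OOM_SKIP, &mm->flags)   test_bit(MMF_OOM_SKIP, &mm->flags)?
 *   down_write(&mm->mmap_sem)             -> yes: already reaped, bail out
 *   up_write(&mm->mmap_sem)             __oom_reap_task_mm(mm)
 *   munlock/unmap_vmas/free_pgtables    up_read(&mm->mmap_sem)
 *                                       mutex_unlock(&oom_lock)
 */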