thp: allocate memory in khugepaged outside of mmap_sem write mode
author		Andrea Arcangeli <aarcange@redhat.com>	Thu, 13 Jan 2011 23:47:06 +0000 (15:47 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>	Fri, 14 Jan 2011 01:32:45 +0000 (17:32 -0800)
This tries to be more friendly to filesystems in userland, i.e. userland
backends that allocate memory in their I/O paths and that could deadlock
if khugepaged holds the userland backend's mmap_sem in write mode while
allocating memory.  The memory allocation may wait for writeback I/O
completion from the daemon, and the daemon in turn may be blocked in the
mmap_sem read mode if it takes a page fault and wasn't using mlock for
the memory required for the I/O submission and completion.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
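
For illustration only (not part of the commit): a minimal userspace model of
the lock ordering described above, using a pthread rwlock in place of
mmap_sem and a semaphore in place of the writeback completion; all names
here are hypothetical.  The "khugepaged" thread takes the lock for writing
and then waits for the daemon to complete "writeback", while the daemon
takes a page fault in its I/O path and blocks on the lock for reading, so
neither side can make progress.

/* deadlock-model.c: build with "gcc -pthread deadlock-model.c" */
#include <pthread.h>
#include <semaphore.h>
#include <unistd.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
static sem_t writeback_done;		/* "I/O completion" owed by the daemon */

/* Models khugepaged: allocates while holding mmap_sem for write. */
static void *khugepaged_thread(void *arg)
{
	pthread_rwlock_wrlock(&mmap_sem);
	/* The allocation waits for writeback the daemon will never finish. */
	sem_wait(&writeback_done);
	pthread_rwlock_unlock(&mmap_sem);
	return arg;
}

/* Models the userland I/O daemon: faults and needs mmap_sem for read. */
static void *daemon_thread(void *arg)
{
	sleep(1);				/* let khugepaged take the write lock first */
	pthread_rwlock_rdlock(&mmap_sem);	/* blocks behind the writer */
	sem_post(&writeback_done);		/* never reached */
	pthread_rwlock_unlock(&mmap_sem);
	return arg;
}

int main(void)
{
	pthread_t a, b;

	sem_init(&writeback_done, 0, 0);
	pthread_create(&a, NULL, khugepaged_thread, NULL);
	pthread_create(&b, NULL, daemon_thread, NULL);
	pthread_join(a, NULL);			/* never returns: both threads are stuck */
	pthread_join(b, NULL);
	return 0;
}
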
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f6559e7..bce6e12 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1664,9 +1664,9 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 
 static void collapse_huge_page(struct mm_struct *mm,
                               unsigned long address,
-                              struct page **hpage)
+                              struct page **hpage,
+                              struct vm_area_struct *vma)
 {
-       struct vm_area_struct *vma;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd, _pmd;
@@ -1680,9 +1680,34 @@ static void collapse_huge_page(struct mm_struct *mm,
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 #ifndef CONFIG_NUMA
        VM_BUG_ON(!*hpage);
+       new_page = *hpage;
 #else
        VM_BUG_ON(*hpage);
+       /*
+        * Allocate the page while the vma is still valid and under
+        * the mmap_sem read mode so there is no memory allocation
+        * later when we take the mmap_sem in write mode. This is more
+        * friendly behavior (OTOH it may actually hide bugs) to
+        * filesystems in userland with daemons allocating memory in
+        * the userland I/O paths.  Allocating memory with the
+        * mmap_sem in read mode is also a good idea to allow greater
+        * scalability.
+        */
+       new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+       if (unlikely(!new_page)) {
+               up_read(&mm->mmap_sem);
+               *hpage = ERR_PTR(-ENOMEM);
+               return;
+       }
 #endif
+       if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+               up_read(&mm->mmap_sem);
+               put_page(new_page);
+               return;
+       }
+
+       /* after allocating the hugepage upgrade to mmap_sem write mode */
+       up_read(&mm->mmap_sem);
 
        /*
         * Prevent all access to pagetables with the exception of
@@ -1720,18 +1745,6 @@ static void collapse_huge_page(struct mm_struct *mm,
        if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
                goto out;
 
-#ifndef CONFIG_NUMA
-       new_page = *hpage;
-#else
-       new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
-       if (unlikely(!new_page)) {
-               *hpage = ERR_PTR(-ENOMEM);
-               goto out;
-       }
-#endif
-       if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
-               goto out_put_page;
-
        anon_vma_lock(vma->anon_vma);
 
        pte = pte_offset_map(pmd, address);
@@ -1759,7 +1772,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                spin_unlock(&mm->page_table_lock);
                anon_vma_unlock(vma->anon_vma);
                mem_cgroup_uncharge_page(new_page);
-               goto out_put_page;
+               goto out;
        }
 
        /*
@@ -1798,15 +1811,15 @@ static void collapse_huge_page(struct mm_struct *mm,
        *hpage = NULL;
 #endif
        khugepaged_pages_collapsed++;
-out:
+out_up_write:
        up_write(&mm->mmap_sem);
        return;
 
-out_put_page:
+out:
 #ifdef CONFIG_NUMA
        put_page(new_page);
 #endif
-       goto out;
+       goto out_up_write;
 }
 
 static int khugepaged_scan_pmd(struct mm_struct *mm,
@@ -1865,10 +1878,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                ret = 1;
 out_unmap:
        pte_unmap_unlock(pte, ptl);
-       if (ret) {
-               up_read(&mm->mmap_sem);
-               collapse_huge_page(mm, address, hpage);
-       }
+       if (ret)
+               /* collapse_huge_page will return with the mmap_sem released */
+               collapse_huge_page(mm, address, hpage, vma);
 out:
        return ret;
 }
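
Again for illustration only (not part of the commit): the pattern the patch
adopts in collapse_huge_page(), modeled in plain userspace C with a pthread
rwlock; the names here (struct region, region_still_valid(), collapse()) are
hypothetical.  The expensive allocation happens while only the read lock is
held, the lock is then dropped and retaken for writing, and because the
protected state may have changed in between, it is revalidated before use.

/* collapse-pattern.c: build with "gcc -pthread collapse-pattern.c" */
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct region {
	pthread_rwlock_t lock;	/* stands in for mmap_sem */
	bool valid;		/* stands in for the vma/pmd re-checks */
	void *hugepage;
};

/* In the kernel this is the re-find and re-check of the vma and pmd. */
static bool region_still_valid(struct region *r)
{
	return r->valid;
}

static int collapse(struct region *r, size_t size)
{
	void *new_page;

	/* Allocate while only the read lock is held. */
	pthread_rwlock_rdlock(&r->lock);
	new_page = malloc(size);
	pthread_rwlock_unlock(&r->lock);
	if (!new_page)
		return -1;

	/* "Upgrade" by dropping the read lock and retaking it for write ... */
	pthread_rwlock_wrlock(&r->lock);
	/* ... then revalidate: the region may have changed in between. */
	if (!region_still_valid(r)) {
		pthread_rwlock_unlock(&r->lock);
		free(new_page);
		return -1;
	}
	r->hugepage = new_page;
	pthread_rwlock_unlock(&r->lock);
	return 0;
}

int main(void)
{
	struct region r = {
		.lock = PTHREAD_RWLOCK_INITIALIZER,
		.valid = true,
	};

	return collapse(&r, 2 * 1024 * 1024) ? 1 : 0;
}
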