mm: Prevent parallel splits during THP migration
[platform/adaptation/renesas_rcar/renesas_kernel.git] mm/huge_memory.c
index f60c4eb..d8534b3 100644
@@ -211,24 +211,29 @@ static void put_huge_zero_page(void)
        BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
-static int shrink_huge_zero_page(struct shrinker *shrink,
-               struct shrink_control *sc)
+static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
+                                       struct shrink_control *sc)
 {
-       if (!sc->nr_to_scan)
-               /* we can free zero page only if last reference remains */
-               return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+       /* we can free zero page only if last reference remains */
+       return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+}
 
+static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
+                                      struct shrink_control *sc)
+{
        if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
                struct page *zero_page = xchg(&huge_zero_page, NULL);
                BUG_ON(zero_page == NULL);
                __free_page(zero_page);
+               return HPAGE_PMD_NR;
        }
 
        return 0;
 }
 
 static struct shrinker huge_zero_page_shrinker = {
-       .shrink = shrink_huge_zero_page,
+       .count_objects = shrink_huge_zero_page_count,
+       .scan_objects = shrink_huge_zero_page_scan,
        .seeks = DEFAULT_SEEKS,
 };
 
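Background on the shrinker conversion above: the old ->shrink() callback overloaded one function for both counting (sc->nr_to_scan == 0) and reclaiming; the split API uses ->count_objects() to report an estimate of freeable objects and ->scan_objects() to do the work and return how many were freed. A minimal sketch of the same pattern for a hypothetical cache (the my_cache_* names are illustrative, not from this patch):

static atomic_long_t my_cache_nr_objects;	/* hypothetical object counter */
static unsigned long my_cache_evict(unsigned long nr);	/* hypothetical reclaim */

static unsigned long my_cache_count(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	/* Only an estimate of what scan_objects() could free. */
	return atomic_long_read(&my_cache_nr_objects);
}

static unsigned long my_cache_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	/* Free up to sc->nr_to_scan objects; report how many went away. */
	return my_cache_evict(sc->nr_to_scan);
}

static struct shrinker my_cache_shrinker = {
	.count_objects	= my_cache_count,
	.scan_objects	= my_cache_scan,
	.seeks		= DEFAULT_SEEKS,
};

/* registered once at init: register_shrinker(&my_cache_shrinker); */

The huge zero page case above is unusual in that count_objects() can only ever report HPAGE_PMD_NR or 0, since there is at most one zero huge page to free.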
@@ -1273,32 +1278,50 @@ out:
 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long addr, pmd_t pmd, pmd_t *pmdp)
 {
+       struct anon_vma *anon_vma = NULL;
        struct page *page;
        unsigned long haddr = addr & HPAGE_PMD_MASK;
        int target_nid;
        int current_nid = -1;
-       bool migrated;
+       bool migrated, page_locked;
 
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(pmd, *pmdp)))
                goto out_unlock;
 
        page = pmd_page(pmd);
-       get_page(page);
        current_nid = page_to_nid(page);
        count_vm_numa_event(NUMA_HINT_FAULTS);
        if (current_nid == numa_node_id())
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 
+       /*
+        * Acquire the page lock to serialise THP migrations but avoid dropping
+        * page_table_lock if at all possible
+        */
+       page_locked = trylock_page(page);
        target_nid = mpol_misplaced(page, vma, haddr);
        if (target_nid == -1) {
-               put_page(page);
-               goto clear_pmdnuma;
+               /* If the page was locked, there are no parallel migrations */
+               if (page_locked) {
+                       unlock_page(page);
+                       goto clear_pmdnuma;
+               }
+
+               /* Otherwise wait for potential migrations and retry fault */
+               spin_unlock(&mm->page_table_lock);
+               wait_on_page_locked(page);
+               goto out;
        }
 
-       /* Acquire the page lock to serialise THP migrations */
+       /* Page is misplaced, serialise migrations and parallel THP splits */
+       get_page(page);
        spin_unlock(&mm->page_table_lock);
-       lock_page(page);
+       if (!page_locked) {
+               lock_page(page);
+               page_locked = true;
+       }
+       anon_vma = page_lock_anon_vma_read(page);
 
        /* Confirm the PTE did not change while page_table_lock was released */
        spin_lock(&mm->page_table_lock);
@@ -1307,21 +1330,26 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                put_page(page);
                goto out_unlock;
        }
-       spin_unlock(&mm->page_table_lock);
 
        /* Migrate the THP to the requested node */
+       spin_unlock(&mm->page_table_lock);
        migrated = migrate_misplaced_transhuge_page(mm, vma,
                                pmdp, pmd, addr, page, target_nid);
        if (!migrated)
                goto check_same;
 
        task_numa_fault(target_nid, HPAGE_PMD_NR, true);
+       if (anon_vma)
+               page_unlock_anon_vma_read(anon_vma);
        return 0;
 
 check_same:
        spin_lock(&mm->page_table_lock);
-       if (unlikely(!pmd_same(pmd, *pmdp)))
+       if (unlikely(!pmd_same(pmd, *pmdp))) {
+               /* Someone else took our fault */
+               current_nid = -1;
                goto out_unlock;
+       }
 clear_pmdnuma:
        pmd = pmd_mknonnuma(pmd);
        set_pmd_at(mm, haddr, pmdp, pmd);
@@ -1329,6 +1357,11 @@ clear_pmdnuma:
        update_mmu_cache_pmd(vma, addr, pmdp);
 out_unlock:
        spin_unlock(&mm->page_table_lock);
+
+out:
+       if (anon_vma)
+               page_unlock_anon_vma_read(anon_vma);
+
        if (current_nid != -1)
                task_numa_fault(current_nid, HPAGE_PMD_NR, false);
        return 0;
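The locking dance in this hunk follows one rule: lock_page() sleeps, so while page_table_lock is held the page lock may only be attempted with trylock_page(); any real wait must happen after the spinlock is dropped, and the pmd must then be re-validated with pmd_same(). A sketch of the idiom in isolation (thp_serialise_migration() is a hypothetical helper, simplified from the hunk above):

/*
 * Returns true if we took the page lock without sleeping. Returns
 * false after sleeping, in which case *pmdp may have changed and the
 * caller must recheck it under page_table_lock before proceeding.
 */
static bool thp_serialise_migration(struct mm_struct *mm, struct page *page)
{
	if (trylock_page(page))
		return true;		/* no migration in flight */

	get_page(page);			/* pin the page across the sleep */
	spin_unlock(&mm->page_table_lock);
	lock_page(page);		/* sleeps until any migration completes */
	spin_lock(&mm->page_table_lock);
	put_page(page);
	return false;
}

The additional page_lock_anon_vma_read() taken in the hunk is the point of this patch: split_huge_page() takes the anon_vma lock for write, so holding it for read here prevents a parallel THP split while the migration decision is made.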
@@ -2692,6 +2725,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
 
        mmun_start = haddr;
        mmun_end   = haddr + HPAGE_PMD_SIZE;
+again:
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_trans_huge(*pmd))) {
@@ -2714,7 +2748,14 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
        split_huge_page(page);
 
        put_page(page);
-       BUG_ON(pmd_trans_huge(*pmd));
+
+       /*
+        * We don't always have down_write of mmap_sem here: a racing
+        * do_huge_pmd_wp_page() might have copied-on-write to another
+        * huge page before our split_huge_page() got the anon_vma lock.
+        */
+       if (unlikely(pmd_trans_huge(*pmd)))
+               goto again;
 }
 
 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
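The new goto turns __split_huge_page_pmd() into a retry loop. Without mmap_sem held for write, a racing do_huge_pmd_wp_page() can copy-on-write a fresh huge page into the pmd after our split_huge_page() ran but before the recheck, so the old BUG_ON(pmd_trans_huge(*pmd)) could fire on a legitimate race; re-walking is the correct response. The control flow, schematically (a sketch only; the mmu notifier calls and pmd_trans_splitting handling of the real function are omitted, and split_huge_pmd_sketch() is a hypothetical name):

static void split_huge_pmd_sketch(struct mm_struct *mm, pmd_t *pmd)
{
	struct page *page;

	for (;;) {
		spin_lock(&mm->page_table_lock);
		if (!pmd_trans_huge(*pmd)) {
			spin_unlock(&mm->page_table_lock);
			return;		/* never huge, or already split */
		}
		page = pmd_page(*pmd);
		get_page(page);
		spin_unlock(&mm->page_table_lock);

		split_huge_page(page);	/* takes the anon_vma lock internally */
		put_page(page);

		if (!pmd_trans_huge(*pmd))
			return;		/* the split took effect */
		/* a racing COW installed a new huge page: go round again */
	}
}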