powerpc/mm/book3s64: Avoid sending IPI on clearing PMD
author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Tue, 5 May 2020 07:17:27 +0000 (12:47 +0530)
committer Michael Ellerman <mpe@ellerman.id.au>
Tue, 5 May 2020 11:20:16 +0000 (21:20 +1000)
Now that all the lockless page table walks are careful w.r.t. the PTE
address returned, we can now revert
commit: 13bd817bb884 ("powerpc/thp: Serialize pmd clear against a linux page table walk.")

We also drop the equivalent IPI from other pte update routines. We still keep
the IPI in hash pmdp collapse, and that is to take care of parallel hash page
table insert. The radix pmdp collapse flush can possibly be removed once I am
sure generic code doesn't have any expectations around parallel gup walks.

This speeds up Qemu guest RAM del/unplug time as below

128 core, 496GB guest:

Without patch:
munmap start: timer = 13162 ms, PID=7684
munmap finish: timer = 95312 ms, PID=7684 - delta = 82150 ms

With patch:
munmap start: timer = 196449 ms, PID=6681
munmap finish: timer = 196488 ms, PID=6681 - delta = 39ms

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200505071729.54912-21-aneesh.kumar@linux.ibm.com
arch/powerpc/mm/book3s64/hash_pgtable.c
arch/powerpc/mm/book3s64/pgtable.c
arch/powerpc/mm/book3s64/radix_pgtable.c

index 64733b9cb20a87ea48e245bf1bbd901d6e8c0f1d..64ca375278dcc832531955d3544e87b68c8c4d40 100644 (file)
@@ -363,17 +363,6 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
         * hash fault look at them.
         */
        memset(pgtable, 0, PTE_FRAG_SIZE);
-       /*
-        * Serialize against find_current_mm_pte variants which does lock-less
-        * lookup in page tables with local interrupts disabled. For huge pages
-        * it casts pmd_t to pte_t. Since format of pte_t is different from
-        * pmd_t we want to prevent transit from pmd pointing to page table
-        * to pmd pointing to huge page (and back) while interrupts are disabled.
-        * We clear pmd to possibly replace it with page table pointer in
-        * different code paths. So make sure we wait for the parallel
-        * find_curren_mm_pte to finish.
-        */
-       serialize_against_pte_lookup(mm);
        return old_pmd;
 }
 
index e0bb69c616e4a2267dddb9da5e59574d06ea8b52..127325ead50538e99a8b4ca1e723563db6b31def 100644 (file)
@@ -109,14 +109,6 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 
        old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
-       /*
-        * This ensures that generic code that rely on IRQ disabling
-        * to prevent a parallel THP split work as expected.
-        *
-        * Marking the entry with _PAGE_INVALID && ~_PAGE_PRESENT requires
-        * a special case check in pmd_access_permitted.
-        */
-       serialize_against_pte_lookup(vma->vm_mm);
        return __pmd(old_pmd);
 }
 
index 8f9edf07063ad53914d59b451cedcc0a99bb6417..dfb9fe92aea8c0c740f2b4d11cee668d06a4188f 100644 (file)
@@ -962,7 +962,13 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre
        pmd = *pmdp;
        pmd_clear(pmdp);
 
-       /*FIXME!!  Verify whether we need this kick below */
+       /*
+        * pmdp collapse_flush need to ensure that there are no parallel gup
+        * walk after this call. This is needed so that we can have stable
+        * page ref count when collapsing a page. We don't allow a collapse page
+        * if we have gup taken on the page. We can ensure that by sending IPI
+        * because gup walk happens with IRQ disabled.
+        */
        serialize_against_pte_lookup(vma->vm_mm);
 
        radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
@@ -1023,17 +1029,6 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
 
        old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
        old_pmd = __pmd(old);
-       /*
-        * Serialize against find_current_mm_pte which does lock-less
-        * lookup in page tables with local interrupts disabled. For huge pages
-        * it casts pmd_t to pte_t. Since format of pte_t is different from
-        * pmd_t we want to prevent transit from pmd pointing to page table
-        * to pmd pointing to huge page (and back) while interrupts are disabled.
-        * We clear pmd to possibly replace it with page table pointer in
-        * different code paths. So make sure we wait for the parallel
-        * find_current_mm_pte to finish.
-        */
-       serialize_against_pte_lookup(mm);
        return old_pmd;
 }