From 4064b982706375025628094e51d11cf1a958a5d3 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 1 Apr 2020 21:08:45 -0700 Subject: [PATCH] mm: allow VM_FAULT_RETRY for multiple times The idea comes from a discussion between Linus and Andrea [1]. Before this patch we only allow a page fault to retry once. We achieved this by clearing the FAULT_FLAG_ALLOW_RETRY flag when doing handle_mm_fault() the second time. This was majorly used to avoid unexpected starvation of the system by looping over forever to handle the page fault on a single page. However that should hardly happen, and after all for each code path to return a VM_FAULT_RETRY we'll first wait for a condition (during which time we should possibly yield the cpu) to happen before VM_FAULT_RETRY is really returned. This patch removes the restriction by keeping the FAULT_FLAG_ALLOW_RETRY flag when we receive VM_FAULT_RETRY. It means that the page fault handler now can retry the page fault for multiple times if necessary without the need to generate another page fault event. Meanwhile we still keep the FAULT_FLAG_TRIED flag so page fault handler can still identify whether a page fault is the first attempt or not. Then we'll have these combinations of fault flags (only considering ALLOW_RETRY flag and TRIED flag): - ALLOW_RETRY and !TRIED: this means the page fault allows to retry, and this is the first try - ALLOW_RETRY and TRIED: this means the page fault allows to retry, and this is not the first try - !ALLOW_RETRY and !TRIED: this means the page fault does not allow to retry at all - !ALLOW_RETRY and TRIED: this is forbidden and should never be used In existing code we have multiple places that has taken special care of the first condition above by checking against (fault_flags & FAULT_FLAG_ALLOW_RETRY). This patch introduces a simple helper to detect the first retry of a page fault by checking against both (fault_flags & FAULT_FLAG_ALLOW_RETRY) and !(fault_flag & FAULT_FLAG_TRIED) because now even the 2nd try will have the ALLOW_RETRY set, then use that helper in all existing special paths. One example is in __lock_page_or_retry(), now we'll drop the mmap_sem only in the first attempt of page fault and we'll keep it in follow up retries, so old locking behavior will be retained. This will be a nice enhancement for current code [2] at the same time a supporting material for the future userfaultfd-writeprotect work, since in that work there will always be an explicit userfault writeprotect retry for protected pages, and if that cannot resolve the page fault (e.g., when userfaultfd-writeprotect is used in conjunction with swapped pages) then we'll possibly need a 3rd retry of the page fault. It might also benefit other potential users who will have similar requirement like userfault write-protection. GUP code is not touched yet and will be covered in follow up patch. Please read the thread below for more information. [1] https://lore.kernel.org/lkml/20171102193644.GB22686@redhat.com/ [2] https://lore.kernel.org/lkml/20181230154648.GB9832@redhat.com/ Suggested-by: Linus Torvalds Suggested-by: Andrea Arcangeli Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Tested-by: Brian Geffon Cc: Bobby Powers Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr . David Alan Gilbert" Cc: Hugh Dickins Cc: Jerome Glisse Cc: Johannes Weiner Cc: "Kirill A . Shutemov" Cc: Martin Cracauer Cc: Marty McFadden Cc: Matthew Wilcox Cc: Maya Gokhale Cc: Mel Gorman Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Link: http://lkml.kernel.org/r/20200220160246.9790-1-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/alpha/mm/fault.c | 2 +- arch/arc/mm/fault.c | 1 - arch/arm/mm/fault.c | 3 --- arch/arm64/mm/fault.c | 5 ----- arch/hexagon/mm/vm_fault.c | 1 - arch/ia64/mm/fault.c | 1 - arch/m68k/mm/fault.c | 3 --- arch/microblaze/mm/fault.c | 1 - arch/mips/mm/fault.c | 1 - arch/nds32/mm/fault.c | 1 - arch/nios2/mm/fault.c | 3 --- arch/openrisc/mm/fault.c | 1 - arch/parisc/mm/fault.c | 4 +--- arch/powerpc/mm/fault.c | 6 ------ arch/riscv/mm/fault.c | 5 ----- arch/s390/mm/fault.c | 5 +---- arch/sh/mm/fault.c | 1 - arch/sparc/mm/fault_32.c | 1 - arch/sparc/mm/fault_64.c | 1 - arch/um/kernel/trap.c | 1 - arch/unicore32/mm/fault.c | 4 +--- arch/x86/mm/fault.c | 2 -- arch/xtensa/mm/fault.c | 1 - drivers/gpu/drm/ttm/ttm_bo_vm.c | 12 +++++++++--- include/linux/mm.h | 37 +++++++++++++++++++++++++++++++++++++ mm/filemap.c | 2 +- mm/internal.h | 6 +++--- 27 files changed, 54 insertions(+), 57 deletions(-) diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index fcfa229..c2d7b6d 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -169,7 +169,7 @@ retry: else current->min_flt++; if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 643fad7..92b339c 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -145,7 +145,6 @@ retry: */ if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 18ef0b1..b598e69 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -319,9 +319,6 @@ retry: regs, addr); } if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index cbb29a4..1027851 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -521,12 +521,7 @@ retry: } if (fault & VM_FAULT_RETRY) { - /* - * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of - * starvation. - */ if (mm_flags & FAULT_FLAG_ALLOW_RETRY) { - mm_flags &= ~FAULT_FLAG_ALLOW_RETRY; mm_flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index d9e15d9..72334b2 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -102,7 +102,6 @@ good_area: else current->min_flt++; if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index b5aa4e8..30d0c1f 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -167,7 +167,6 @@ retry: else current->min_flt++; if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 182799f..f7afb98 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -162,9 +162,6 @@ good_area: else current->min_flt++; if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 32da027..3248141 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -236,7 +236,6 @@ good_area: else current->min_flt++; if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index ec464da..4a0eafe 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -178,7 +178,6 @@ good_area: tsk->min_flt++; } if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c index 2810a4e..0cf0c08 100644 --- a/arch/nds32/mm/fault.c +++ b/arch/nds32/mm/fault.c @@ -246,7 +246,6 @@ good_area: 1, regs, addr); } if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index c38bea4..ec9d8a9 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -157,9 +157,6 @@ good_area: else current->min_flt++; if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 30d5c51..8af1cc7 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -181,7 +181,6 @@ good_area: else tsk->min_flt++; if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 8e88e5c..86e8c84 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -328,14 +328,12 @@ good_area: else current->min_flt++; if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; - /* * No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry * in mm/filemap.c. */ - + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index d7e1f8d..d15f0f0 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -590,13 +590,7 @@ good_area: * case. */ if (unlikely(fault & VM_FAULT_RETRY)) { - /* We retry only once */ if (flags & FAULT_FLAG_ALLOW_RETRY) { - /* - * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. - */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index a252d9e..be84e32 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -144,11 +144,6 @@ good_area: 1, regs, addr); } if (fault & VM_FAULT_RETRY) { - /* - * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. - */ - flags &= ~(FAULT_FLAG_ALLOW_RETRY); flags |= FAULT_FLAG_TRIED; /* diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 551ac31..aeccdb3 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -513,10 +513,7 @@ retry: fault = VM_FAULT_PFAULT; goto out_up; } - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~(FAULT_FLAG_ALLOW_RETRY | - FAULT_FLAG_RETRY_NOWAIT); + flags &= ~FAULT_FLAG_RETRY_NOWAIT; flags |= FAULT_FLAG_TRIED; down_read(&mm->mmap_sem); goto retry; diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index d9c8f2d..13ee4d2 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -481,7 +481,6 @@ good_area: regs, address); } if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index a91b0c2d..f6e0e60 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -261,7 +261,6 @@ good_area: 1, regs, address); } if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 30653418..c0c0dd4 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -449,7 +449,6 @@ good_area: 1, regs, address); } if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index c59ad37..8f18cf5 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -97,7 +97,6 @@ good_area: else current->min_flt++; if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; goto retry; diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c index 34a9045..a9bd08f 100644 --- a/arch/unicore32/mm/fault.c +++ b/arch/unicore32/mm/fault.c @@ -259,9 +259,7 @@ retry: else tsk->min_flt++; if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f70a08e..859519f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1479,8 +1479,6 @@ good_area: */ if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) { - /* Retry at most once */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 7d196dc..e7172bd 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -128,7 +128,6 @@ good_area: else current->min_flt++; if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 389128b..cb8829c 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -59,9 +59,10 @@ static vm_fault_t ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo, /* * If possible, avoid waiting for GPU with mmap_sem - * held. + * held. We only do this if the fault allows retry and this + * is the first attempt. */ - if (vmf->flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault_flag_allow_retry_first(vmf->flags)) { ret = VM_FAULT_RETRY; if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out_unlock; @@ -135,7 +136,12 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo, * for the buffer to become unreserved. */ if (unlikely(!dma_resv_trylock(bo->base.resv))) { - if (vmf->flags & FAULT_FLAG_ALLOW_RETRY) { + /* + * If the fault allows retry and this is the first + * fault attempt, we try to release the mmap_sem + * before waiting + */ + if (fault_flag_allow_retry_first(vmf->flags)) { if (!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { ttm_bo_get(bo); up_read(&vmf->vma->vm_mm->mmap_sem); diff --git a/include/linux/mm.h b/include/linux/mm.h index 7eeabc3..e8e1afa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -394,6 +394,25 @@ extern pgprot_t protection_map[16]; * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two + * fault flags correctly. Currently there can be three legal combinations: + * + * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and + * this is the first try + * + * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and + * we've already tried at least once + * + * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry + * + * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never + * be used. Note that page faults can be allowed to retry for multiple times, + * in which case we'll have an initial fault with flags (a) then later on + * continuous faults with flags (b). We should always try to detect pending + * signals before a retry to make sure the continuous page faults can still be + * interrupted if necessary. */ #define FAULT_FLAG_WRITE 0x01 #define FAULT_FLAG_MKWRITE 0x02 @@ -414,6 +433,24 @@ extern pgprot_t protection_map[16]; FAULT_FLAG_KILLABLE | \ FAULT_FLAG_INTERRUPTIBLE) +/** + * fault_flag_allow_retry_first - check ALLOW_RETRY the first time + * + * This is mostly used for places where we want to try to avoid taking + * the mmap_sem for too long a time when waiting for another condition + * to change, in which case we can try to be polite to release the + * mmap_sem in the first round to avoid potential starvation of other + * processes that would also want the mmap_sem. + * + * Return: true if the page fault allows retry and this is the first + * attempt of the fault handling; false otherwise. + */ +static inline bool fault_flag_allow_retry_first(unsigned int flags) +{ + return (flags & FAULT_FLAG_ALLOW_RETRY) && + (!(flags & FAULT_FLAG_TRIED)); +} + #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ diff --git a/mm/filemap.c b/mm/filemap.c index 1a58dd6..0fbdc8e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1386,7 +1386,7 @@ EXPORT_SYMBOL_GPL(__lock_page_killable); int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { - if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault_flag_allow_retry_first(flags)) { /* * CAUTION! In this case, mmap_sem is not released * even though return 0. diff --git a/mm/internal.h b/mm/internal.h index 91d1d38..9fb2b8c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -400,10 +400,10 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, /* * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or * anything, so we only pin the file and drop the mmap_sem if only - * FAULT_FLAG_ALLOW_RETRY is set. + * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt. */ - if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == - FAULT_FLAG_ALLOW_RETRY) { + if (fault_flag_allow_retry_first(flags) && + !(flags & FAULT_FLAG_RETRY_NOWAIT)) { fpin = get_file(vmf->vma->vm_file); up_read(&vmf->vma->vm_mm->mmap_sem); } -- 2.7.4