Merge tag 'zstd-linus-v6.2' of https://github.com/terrelln/linux
diff --git a/mm/memory.c b/mm/memory.c
index 8c84209..aad226d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -162,58 +162,11 @@ static int __init init_zero_pfn(void)
 }
 early_initcall(init_zero_pfn);
 
-void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
+void mm_trace_rss_stat(struct mm_struct *mm, int member)
 {
-       trace_rss_stat(mm, member, count);
+       trace_rss_stat(mm, member);
 }
 
-#if defined(SPLIT_RSS_COUNTING)
-
-void sync_mm_rss(struct mm_struct *mm)
-{
-       int i;
-
-       for (i = 0; i < NR_MM_COUNTERS; i++) {
-               if (current->rss_stat.count[i]) {
-                       add_mm_counter(mm, i, current->rss_stat.count[i]);
-                       current->rss_stat.count[i] = 0;
-               }
-       }
-       current->rss_stat.events = 0;
-}
-
-static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
-{
-       struct task_struct *task = current;
-
-       if (likely(task->mm == mm))
-               task->rss_stat.count[member] += val;
-       else
-               add_mm_counter(mm, member, val);
-}
-#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
-#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
-
-/* sync counter once per 64 page faults */
-#define TASK_RSS_EVENTS_THRESH (64)
-static void check_sync_rss_stat(struct task_struct *task)
-{
-       if (unlikely(task != current))
-               return;
-       if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
-               sync_mm_rss(task->mm);
-}
-#else /* SPLIT_RSS_COUNTING */
-
-#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
-#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
-
-static void check_sync_rss_stat(struct task_struct *task)
-{
-}
-
-#endif /* SPLIT_RSS_COUNTING */
-
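
/*
 * Editorial note (not part of the patch): a sketch of what replaces
 * SPLIT_RSS_COUNTING in v6.2. RSS stats became per-mm percpu counters,
 * so callers update them directly and no per-task cache or sync step is
 * needed. Simplified from include/linux/mm.h; details may differ:
 */
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
        percpu_counter_add(&mm->rss_stat[member], value);
        mm_trace_rss_stat(mm, member);
}

static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
        /* Readers tolerate slight lag; negative transients read as zero. */
        return percpu_counter_read_positive(&mm->rss_stat[member]);
}
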
 /*
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
@@ -1384,12 +1337,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *pte,
                              struct zap_details *details, pte_t pteval)
 {
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
        if (zap_drop_file_uffd_wp(details))
                return;
 
        pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
-#endif
 }
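
/*
 * Editorial note (not part of the patch): the #ifdef above could go away
 * because PTE markers are now always built in. For reference, the guard
 * deciding whether uffd-wp markers get dropped is roughly (simplified):
 */
static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
{
        /* Only drop uffd-wp markers when explicitly requested. */
        if (!details)
                return false;
        return details->zap_flags & ZAP_FLAG_DROP_MARKER;
}
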
 
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1423,6 +1374,8 @@ again:
                        break;
 
                if (pte_present(ptent)) {
+                       unsigned int delay_rmap;
+
                        page = vm_normal_page(vma, addr, ptent);
                        if (unlikely(!should_zap_page(details, page)))
                                continue;
@@ -1434,20 +1387,26 @@ again:
                        if (unlikely(!page))
                                continue;
 
+                       delay_rmap = 0;
                        if (!PageAnon(page)) {
                                if (pte_dirty(ptent)) {
-                                       force_flush = 1;
                                        set_page_dirty(page);
+                                       if (tlb_delay_rmap(tlb)) {
+                                               delay_rmap = 1;
+                                               force_flush = 1;
+                                       }
                                }
                                if (pte_young(ptent) &&
                                    likely(!(vma->vm_flags & VM_SEQ_READ)))
                                        mark_page_accessed(page);
                        }
                        rss[mm_counter(page)]--;
-                       page_remove_rmap(page, vma, false);
-                       if (unlikely(page_mapcount(page) < 0))
-                               print_bad_pte(vma, addr, ptent, page);
-                       if (unlikely(__tlb_remove_page(tlb, page))) {
+                       if (!delay_rmap) {
+                               page_remove_rmap(page, vma, false);
+                               if (unlikely(page_mapcount(page) < 0))
+                                       print_bad_pte(vma, addr, ptent, page);
+                       }
+                       if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
                                force_flush = 1;
                                addr += PAGE_SIZE;
                                break;
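
/*
 * Editorial note (not part of the patch): __tlb_remove_page() grew a flags
 * argument so the "delay rmap" bit can travel with the page pointer inside
 * the mmu_gather batch (pages are stored as encoded pointers with spare low
 * bits). After the TLB flush, the deferred rmap removals are replayed,
 * roughly as below (simplified from mm/mmu_gather.c):
 */
void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        struct mmu_gather_batch *batch = tlb->active;
        int i;

        for (i = 0; i < batch->nr; i++) {
                struct encoded_page *enc = batch->encoded_pages[i];

                if (encoded_page_flags(enc)) {
                        struct page *page = encoded_page_ptr(enc);

                        page_remove_rmap(page, vma, false);
                }
        }
}
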
@@ -1504,8 +1463,10 @@ again:
        arch_leave_lazy_mmu_mode();
 
        /* Do the actual TLB flush before dropping ptl */
-       if (force_flush)
+       if (force_flush) {
                tlb_flush_mmu_tlbonly(tlb);
+               tlb_flush_rmaps(tlb, vma);
+       }
        pte_unmap_unlock(start_pte, ptl);
 
        /*
@@ -1859,7 +1820,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
                return -EBUSY;
        /* Ok, finally just insert the thing.. */
        get_page(page);
-       inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+       inc_mm_counter(vma->vm_mm, mm_counter_file(page));
        page_add_file_rmap(page, vma, false);
        set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
        return 0;
@@ -2847,10 +2808,16 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
        return same;
 }
 
-static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
-                                      struct vm_fault *vmf)
+/*
+ * Return:
+ *     0:              copy succeeded
+ *     -EHWPOISON:     copy failed due to hwpoison in source page
+ *     -EAGAIN:        copy failed (some other reason)
+ */
+static inline int __wp_page_copy_user(struct page *dst, struct page *src,
+                                     struct vm_fault *vmf)
 {
-       bool ret;
+       int ret;
        void *kaddr;
        void __user *uaddr;
        bool locked = false;
@@ -2859,8 +2826,11 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
        unsigned long addr = vmf->address;
 
        if (likely(src)) {
-               copy_user_highpage(dst, src, addr, vma);
-               return true;
+               if (copy_mc_user_highpage(dst, src, addr, vma)) {
+                       memory_failure_queue(page_to_pfn(src), 0);
+                       return -EHWPOISON;
+               }
+               return 0;
        }
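
/*
 * Editorial note (not part of the patch): copy_mc_user_highpage() is the
 * machine-check-safe variant of copy_user_highpage(). It is built on
 * copy_mc_to_kernel(), which returns the number of bytes left uncopied,
 * so a non-zero result means the source page is poisoned. A simplified
 * sketch (cf. include/linux/highmem.h; details may differ):
 */
static inline int copy_mc_user_highpage(struct page *to, struct page *from,
                                        unsigned long vaddr,
                                        struct vm_area_struct *vma)
{
        unsigned long ret;
        char *vfrom, *vto;

        vfrom = kmap_local_page(from);
        vto = kmap_local_page(to);
        ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
        kunmap_local(vto);
        kunmap_local(vfrom);

        return ret;
}
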
 
        /*
@@ -2887,7 +2857,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
                         * and update local tlb only
                         */
                        update_mmu_tlb(vma, addr, vmf->pte);
-                       ret = false;
+                       ret = -EAGAIN;
                        goto pte_unlock;
                }
 
@@ -2912,7 +2882,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
                if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
                        /* The PTE changed under us, update local tlb */
                        update_mmu_tlb(vma, addr, vmf->pte);
-                       ret = false;
+                       ret = -EAGAIN;
                        goto pte_unlock;
                }
 
@@ -2931,7 +2901,7 @@ warn:
                }
        }
 
-       ret = true;
+       ret = 0;
 
 pte_unlock:
        if (locked)
@@ -3103,6 +3073,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
        pte_t entry;
        int page_copied = 0;
        struct mmu_notifier_range range;
+       int ret;
 
        delayacct_wpcopy_start();
 
@@ -3120,19 +3091,21 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                if (!new_page)
                        goto oom;
 
-               if (!__wp_page_copy_user(new_page, old_page, vmf)) {
+               ret = __wp_page_copy_user(new_page, old_page, vmf);
+               if (ret) {
                        /*
                         * COW failed; if the fault was resolved by another
                         * thread, that's fine. If not, userspace will
                         * re-fault on the same address and we will handle
                         * the fault from the second attempt.
+                        * The -EHWPOISON case will not be retried.
                         */
                        put_page(new_page);
                        if (old_page)
                                put_page(old_page);
 
                        delayacct_wpcopy_end();
-                       return 0;
+                       return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
                }
                kmsan_copy_page_meta(new_page, old_page);
        }
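
/*
 * Editorial note (not part of the patch): VM_FAULT_HWPOISON propagates out
 * of handle_mm_fault() to the arch fault handler, which typically raises an
 * action-required SIGBUS instead of retrying. A hypothetical sketch modeled
 * loosely on x86's fault error path; names here are illustrative:
 */
static void example_report_hwpoison(unsigned long address, vm_fault_t fault)
{
        if (fault & VM_FAULT_HWPOISON)
                force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address,
                                 PAGE_SHIFT);
}
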
@@ -3155,12 +3128,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
        if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
                if (old_page) {
                        if (!PageAnon(old_page)) {
-                               dec_mm_counter_fast(mm,
-                                               mm_counter_file(old_page));
-                               inc_mm_counter_fast(mm, MM_ANONPAGES);
+                               dec_mm_counter(mm, mm_counter_file(old_page));
+                               inc_mm_counter(mm, MM_ANONPAGES);
                        }
                } else {
-                       inc_mm_counter_fast(mm, MM_ANONPAGES);
+                       inc_mm_counter(mm, MM_ANONPAGES);
                }
                flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
                entry = mk_pte(new_page, vma->vm_page_prot);
@@ -3241,7 +3213,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
        }
 
        delayacct_wpcopy_end();
-       return (page_copied && !unshare) ? VM_FAULT_WRITE : 0;
+       return 0;
 oom_free_new:
        put_page(new_page);
 oom:
@@ -3305,14 +3277,14 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
                return finish_mkwrite_fault(vmf);
        }
        wp_page_reuse(vmf);
-       return VM_FAULT_WRITE;
+       return 0;
 }
 
 static vm_fault_t wp_page_shared(struct vm_fault *vmf)
        __releases(vmf->ptl)
 {
        struct vm_area_struct *vma = vmf->vma;
-       vm_fault_t ret = VM_FAULT_WRITE;
+       vm_fault_t ret = 0;
 
        get_page(vmf->page);
 
@@ -3369,10 +3341,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 {
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
-       struct folio *folio;
-
-       VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
-       VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
+       struct folio *folio = NULL;
 
        if (likely(!unshare)) {
                if (userfaultfd_pte_wp(vma, *vmf->pte)) {
@@ -3390,13 +3359,12 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
        }
 
        vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
-       if (!vmf->page) {
-               if (unlikely(unshare)) {
-                       /* No anonymous page -> nothing to do. */
-                       pte_unmap_unlock(vmf->pte, vmf->ptl);
-                       return 0;
-               }
 
+       /*
+        * Shared mapping: we are guaranteed to have VM_WRITE and
+        * FAULT_FLAG_WRITE set at this point.
+        */
+       if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
                /*
                 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
                 * VM_PFNMAP VMA.
@@ -3404,20 +3372,19 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
                 * We should not cow pages in a shared writeable mapping.
                 * Just mark the pages writable and/or call ops->pfn_mkwrite.
                 */
-               if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-                                    (VM_WRITE|VM_SHARED))
+               if (!vmf->page)
                        return wp_pfn_shared(vmf);
-
-               pte_unmap_unlock(vmf->pte, vmf->ptl);
-               return wp_page_copy(vmf);
+               return wp_page_shared(vmf);
        }
 
+       if (vmf->page)
+               folio = page_folio(vmf->page);
+
        /*
-        * Take out anonymous pages first, anonymous shared vmas are
-        * not dirty accountable.
+        * Private mapping: create an exclusive anonymous page copy if reuse
+        * is impossible. Note that VM_WRITE may be absent here, since
+        * FOLL_FORCE can trigger write faults on read-only private mappings.
         */
-       folio = page_folio(vmf->page);
-       if (folio_test_anon(folio)) {
+       if (folio && folio_test_anon(folio)) {
                /*
                 * If the page is exclusive to this process we must reuse the
                 * page without further checks.
@@ -3463,24 +3430,18 @@ reuse:
                        return 0;
                }
                wp_page_reuse(vmf);
-               return VM_FAULT_WRITE;
-       } else if (unshare) {
-               /* No anonymous page -> nothing to do. */
-               pte_unmap_unlock(vmf->pte, vmf->ptl);
                return 0;
-       } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-                                       (VM_WRITE|VM_SHARED))) {
-               return wp_page_shared(vmf);
        }
 copy:
        /*
         * Ok, we need to copy. Oh, well..
         */
-       get_page(vmf->page);
+       if (folio)
+               folio_get(folio);
 
        pte_unmap_unlock(vmf->pte, vmf->ptl);
 #ifdef CONFIG_KSM
-       if (PageKsm(vmf->page))
+       if (folio && folio_test_ksm(folio))
                count_vm_event(COW_KSM);
 #endif
        return wp_page_copy(vmf);
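
/*
 * Editorial note (not part of the patch): VM_FAULT_WRITE could be retired
 * because, with FOLL_COW gone from GUP, no caller needs to learn that a
 * write fault was resolved via COW; success is plain 0 now. Reuse itself
 * just upgrades the existing PTE in place, roughly (simplified):
 */
static inline void wp_page_reuse(struct vm_fault *vmf)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;
        pte_t entry;

        flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
        entry = pte_mkyoung(vmf->orig_pte);
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
                update_mmu_cache(vma, vmf->address, vmf->pte);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        count_vm_event(PGREUSE);
}
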
@@ -3700,11 +3661,14 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
        unsigned long marker = pte_marker_get(entry);
 
        /*
-        * PTE markers should always be with file-backed memories, and the
-        * marker should never be empty.  If anything weird happened, the best
-        * thing to do is to kill the process along with its mm.
+        * PTE markers should never be empty.  If anything weird happened,
+        * the best thing to do is to kill the process along with its mm.
         */
-       if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
+       if (WARN_ON_ONCE(!marker))
+               return VM_FAULT_SIGBUS;
+
+       /* Higher priority than uffd-wp when data corrupted */
+       if (marker & PTE_MARKER_SWAPIN_ERROR)
                return VM_FAULT_SIGBUS;
 
        if (pte_marker_entry_uffd_wp(entry))
@@ -3766,8 +3730,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                        put_page(vmf->page);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
-               } else if (is_swapin_error_entry(entry)) {
-                       ret = VM_FAULT_SIGBUS;
                } else if (is_pte_marker_entry(entry)) {
                        ret = handle_pte_marker(vmf);
                } else {
@@ -3967,8 +3929,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        if (should_try_to_free_swap(folio, vma, vmf->flags))
                folio_free_swap(folio);
 
-       inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-       dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
+       inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+       dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
        pte = mk_pte(page, vma->vm_page_prot);
 
        /*
@@ -3982,7 +3944,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                if (vmf->flags & FAULT_FLAG_WRITE) {
                        pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                        vmf->flags &= ~FAULT_FLAG_WRITE;
-                       ret |= VM_FAULT_WRITE;
                }
                rmap_flags |= RMAP_EXCLUSIVE;
        }
@@ -4148,7 +4109,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
                return handle_userfault(vmf, VM_UFFD_MISSING);
        }
 
-       inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+       inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
        page_add_new_anon_rmap(page, vma, vmf->address);
        lru_cache_add_inactive_or_unevictable(page, vma);
 setpte:
@@ -4338,11 +4299,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
                entry = pte_mkuffd_wp(pte_wrprotect(entry));
        /* copy-on-write page */
        if (write && !(vma->vm_flags & VM_SHARED)) {
-               inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+               inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
                page_add_new_anon_rmap(page, vma, addr);
                lru_cache_add_inactive_or_unevictable(page, vma);
        } else {
-               inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+               inc_mm_counter(vma->vm_mm, mm_counter_file(page));
                page_add_file_rmap(page, vma, false);
        }
        set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
@@ -4712,10 +4673,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
        struct vm_area_struct *vma = vmf->vma;
        struct page *page = NULL;
        int page_nid = NUMA_NO_NODE;
+       bool writable = false;
        int last_cpupid;
        int target_nid;
        pte_t pte, old_pte;
-       bool was_writable = pte_savedwrite(vmf->orig_pte);
        int flags = 0;
 
        /*
@@ -4734,6 +4695,15 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
        old_pte = ptep_get(vmf->pte);
        pte = pte_modify(old_pte, vma->vm_page_prot);
 
+       /*
+        * Detect now whether the PTE could be writable; this information
+        * is only valid while holding the PT lock.
+        */
+       writable = pte_write(pte);
+       if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
+           can_change_pte_writable(vma, vmf->address, pte))
+               writable = true;
+
        page = vm_normal_page(vma, vmf->address, pte);
        if (!page || is_zone_device_page(page))
                goto out_map;
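
/*
 * Editorial note (not part of the patch): pte_savedwrite() disappears here
 * because writability is now recomputed under the PT lock via
 * can_change_pte_writable() (mm/mprotect.c), which looks roughly like this
 * (simplified; details may differ):
 */
bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte)
{
        struct page *page;

        /* Don't touch entries that are not even readable. */
        if (pte_protnone(pte))
                return false;
        /* Soft-dirty and uffd-wp tracking still need the write fault. */
        if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
                return false;
        if (userfaultfd_pte_wp(vma, pte))
                return false;

        if (!(vma->vm_flags & VM_SHARED)) {
                /* Private: only exclusive anon pages may go writable. */
                page = vm_normal_page(vma, addr, pte);
                return page && PageAnon(page) && PageAnonExclusive(page);
        }

        /* Shared: writable only if writeback dirty accounting is settled. */
        return pte_dirty(pte);
}
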
@@ -4750,7 +4720,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
         * pte_dirty has unpredictable behaviour between PTE scan updates,
         * background writeback, dirty balancing and application behaviour.
         */
-       if (!was_writable)
+       if (!writable)
                flags |= TNF_NO_GROUP;
 
        /*
@@ -4777,6 +4747,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
                goto out_map;
        }
        pte_unmap_unlock(vmf->pte, vmf->ptl);
+       writable = false;
 
        /* Migrate to the requested node */
        if (migrate_misplaced_page(page, vma, target_nid)) {
@@ -4805,7 +4776,7 @@ out_map:
        old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
        pte = pte_modify(old_pte, vma->vm_page_prot);
        pte = pte_mkyoung(pte);
-       if (was_writable)
+       if (writable)
                pte = pte_mkwrite(pte);
        ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
        update_mmu_cache(vma, vmf->address, vmf->pte);
@@ -4826,6 +4797,7 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 {
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
+       vm_fault_t ret;
 
        if (vma_is_anonymous(vmf->vma)) {
                if (likely(!unshare) &&
@@ -4833,11 +4805,13 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
                        return handle_userfault(vmf, VM_UFFD_WP);
                return do_huge_pmd_wp_page(vmf);
        }
-       if (vmf->vma->vm_ops->huge_fault) {
-               vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
 
-               if (!(ret & VM_FAULT_FALLBACK))
-                       return ret;
+       if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+               if (vmf->vma->vm_ops->huge_fault) {
+                       ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+                       if (!(ret & VM_FAULT_FALLBACK))
+                               return ret;
+               }
        }
 
        /* COW or write-notify handled on pte level: split pmd. */
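
/*
 * Editorial note (not part of the patch): the same guard expressed as a
 * hypothetical helper. For MAP_PRIVATE file mappings a write fault must
 * COW at PTE granularity, so ->huge_fault() only helps when the mapping
 * is shared; otherwise the PMD gets split and handled per-PTE:
 */
static bool example_can_use_huge_fault(struct vm_area_struct *vma)
{
        return (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
               vma->vm_ops && vma->vm_ops->huge_fault;
}
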
@@ -4863,14 +4837,17 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
 {
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                    \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+       vm_fault_t ret;
+
        /* No support for anonymous transparent PUD pages yet */
        if (vma_is_anonymous(vmf->vma))
                goto split;
-       if (vmf->vma->vm_ops->huge_fault) {
-               vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
-
-               if (!(ret & VM_FAULT_FALLBACK))
-                       return ret;
+       if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+               if (vmf->vma->vm_ops->huge_fault) {
+                       ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+                       if (!(ret & VM_FAULT_FALLBACK))
+                               return ret;
+               }
        }
 split:
        /* COW or write-notify not handled on PUD level: split pud. */
@@ -5178,6 +5155,30 @@ static void lru_gen_exit_fault(void)
 }
 #endif /* CONFIG_LRU_GEN */
 
+static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
+                                      unsigned int *flags)
+{
+       if (unlikely(*flags & FAULT_FLAG_UNSHARE)) {
+               if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
+                       return VM_FAULT_SIGSEGV;
+               /*
+                * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
+                * just treat it like an ordinary read-fault otherwise.
+                */
+               if (!is_cow_mapping(vma->vm_flags))
+                       *flags &= ~FAULT_FLAG_UNSHARE;
+       } else if (*flags & FAULT_FLAG_WRITE) {
+               /* Write faults on read-only mappings are impossible ... */
+               if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
+                       return VM_FAULT_SIGSEGV;
+               /* ... and FOLL_FORCE only applies to COW mappings. */
+               if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
+                                !is_cow_mapping(vma->vm_flags)))
+                       return VM_FAULT_SIGSEGV;
+       }
+       return 0;
+}
+
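
/*
 * Editorial note (not part of the patch): sanitize_fault_flags() leans on
 * is_cow_mapping(); for reference its definition (include/linux/mm.h) is:
 */
static inline bool is_cow_mapping(vm_flags_t flags)
{
        return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
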
 /*
  * By the time we get here, we already hold the mm semaphore
  *
@@ -5194,8 +5195,9 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        count_vm_event(PGFAULT);
        count_memcg_event_mm(vma->vm_mm, PGFAULT);
 
-       /* do counter updates before entering really critical section. */
-       check_sync_rss_stat(current);
+       ret = sanitize_fault_flags(vma, &flags);
+       if (ret)
+               return ret;
 
        if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                                            flags & FAULT_FLAG_INSTRUCTION,