Merge tag 'x86_mm_for_6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
author	Linus Torvalds <torvalds@linux-foundation.org>	Fri, 28 Apr 2023 16:43:49 +0000 (09:43 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>	Fri, 28 Apr 2023 16:43:49 +0000 (09:43 -0700)
Pull x86 LAM (Linear Address Masking) support from Dave Hansen:
 "Add support for the new Linear Address Masking CPU feature.

  This is similar to ARM's Top Byte Ignore and allows userspace to store
  metadata in some bits of pointers without masking it out before use"
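
For context, a minimal userspace sketch of the interface this series adds (assuming an x86-64 CPU with LAM and a kernel built with CONFIG_ADDRESS_MASKING=y; the ARCH_* values are copied from the uapi header in the diff below, everything else is illustrative and not part of the merge):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Values from arch/x86/include/uapi/asm/prctl.h as added by this series. */
#define ARCH_GET_UNTAG_MASK	0x4001
#define ARCH_ENABLE_TAGGED_ADDR	0x4002

int main(void)
{
	unsigned long untag_mask = 0;
	uint64_t *p, *tagged;

	/* Request 6 tag bits; the kernel maps this to LAM_U57 (bits 62:57). */
	if (syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, 6)) {
		perror("ARCH_ENABLE_TAGGED_ADDR");
		return 1;
	}

	/* The same mask is exported as "untag_mask:" in /proc/$PID/status. */
	syscall(SYS_arch_prctl, ARCH_GET_UNTAG_MASK, &untag_mask);
	printf("untag mask: %#lx\n", untag_mask);

	p = malloc(sizeof(*p));
	*p = 42;

	/* Store metadata in the ignored bits and dereference without masking. */
	tagged = (uint64_t *)((uintptr_t)p | (0x2aULL << 57));
	printf("*tagged = %llu\n", (unsigned long long)*tagged);

	free(p);
	return 0;
}

Without LAM enabled for the process, dereferencing the tagged pointer would fault, since bits 62:57 make the address non-canonical.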

* tag 'x86_mm_for_6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm/iommu/sva: Do not allow to set FORCE_TAGGED_SVA bit from outside
  x86/mm/iommu/sva: Fix error code for LAM enabling failure due to SVA
  selftests/x86/lam: Add test cases for LAM vs thread creation
  selftests/x86/lam: Add ARCH_FORCE_TAGGED_SVA test cases for linear-address masking
  selftests/x86/lam: Add inherit test cases for linear-address masking
  selftests/x86/lam: Add io_uring test cases for linear-address masking
  selftests/x86/lam: Add mmap and SYSCALL test cases for linear-address masking
  selftests/x86/lam: Add malloc and tag-bits test cases for linear-address masking
  x86/mm/iommu/sva: Make LAM and SVA mutually exclusive
  iommu/sva: Replace pasid_valid() helper with mm_valid_pasid()
  mm: Expose untagging mask in /proc/$PID/status
  x86/mm: Provide arch_prctl() interface for LAM
  x86/mm: Reduce untagged_addr() overhead for systems without LAM
  x86/uaccess: Provide untagged_addr() and remove tags before address check
  mm: Introduce untagged_addr_remote()
  x86/mm: Handle LAM on context switch
  x86: CPUID and CR3/CR4 flags for Linear Address Masking
  x86: Allow atomic MM_CONTEXT flags setting
  x86/mm: Rework address range check in get_user() and put_user()
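
Several of the commits above rework untagged_addr(): tag bits are stripped with a per-mm untag mask while kernel addresses must pass through unchanged. A rough standalone sketch of that sign-extension trick (the mask value assumes LAM_U57, i.e. bits 62:57 are ignored; this is an illustration, not the kernel's exact code):

#include <stdint.h>
#include <stdio.h>

/* Assumed LAM_U57 untag mask: keep everything except bits 62:57. */
#define LAM_U57_UNTAG_MASK	(~(0x3fULL << 57))

static uint64_t untag(uint64_t addr, uint64_t untag_mask)
{
	/*
	 * 0 for user addresses (bit 63 clear), all-ones for kernel addresses
	 * (assumes signed >> is an arithmetic shift, as on GCC and Clang).
	 */
	uint64_t sign = (uint64_t)((int64_t)addr >> 63);

	/* Kernel addresses widen the mask to all-ones and pass through as-is. */
	return addr & (untag_mask | sign);
}

int main(void)
{
	uint64_t user = 0x2a007f1234567000ULL;	/* tag 0x15 in bits 62:57 */
	uint64_t kern = 0xffff888000001000ULL;	/* untouched by untagging */

	printf("%#llx -> %#llx\n", (unsigned long long)user,
	       (unsigned long long)untag(user, LAM_U57_UNTAG_MASK));
	printf("%#llx -> %#llx\n", (unsigned long long)kern,
	       (unsigned long long)untag(kern, LAM_U57_UNTAG_MASK));
	return 0;
}

This is also why "x86/mm: Reduce untagged_addr() overhead for systems without LAM" can keep the default mask at all-ones (see mm_reset_untag_mask() below): with an untag mask of -1UL the AND is a no-op for every address.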

12 files changed:
arch/x86/Kconfig
arch/x86/include/asm/mmu_context.h
arch/x86/include/uapi/asm/prctl.h
arch/x86/mm/init.c
arch/x86/mm/tlb.c
fs/proc/array.c
fs/proc/task_mmu.c
include/linux/mm.h
include/linux/sched/mm.h
mm/gup.c
mm/madvise.c
mm/migrate.c

diff --combined arch/x86/Kconfig
@@@ -27,7 -27,6 +27,7 @@@ config X86_6
        # Options that are inherently 64-bit kernel only:
        select ARCH_HAS_GIGANTIC_PAGE
        select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 +      select ARCH_SUPPORTS_PER_VMA_LOCK
        select ARCH_USE_CMPXCHG_LOCKREF
        select HAVE_ARCH_SOFT_DIRTY
        select MODULES_USE_ELF_RELA
@@@ -126,8 -125,8 +126,8 @@@ config X8
        select ARCH_WANTS_NO_INSTR
        select ARCH_WANT_GENERAL_HUGETLB
        select ARCH_WANT_HUGE_PMD_SHARE
 -      select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP  if X86_64
        select ARCH_WANT_LD_ORPHAN_WARN
 +      select ARCH_WANT_OPTIMIZE_VMEMMAP       if X86_64
        select ARCH_WANTS_THP_SWAP              if X86_64
        select ARCH_HAS_PARANOID_L1D_FLUSH
        select BUILDTIME_TABLE_SORT
        select GUP_GET_PXX_LOW_HIGH             if X86_PAE
        select HARDIRQS_SW_RESEND
        select HARDLOCKUP_CHECK_TIMESTAMP       if X86_64
 +      select HAS_IOPORT
        select HAVE_ACPI_APEI                   if ACPI
        select HAVE_ACPI_APEI_NMI               if ACPI
        select HAVE_ALIGNED_STRUCT_PAGE         if SLUB
        select RTC_LIB
        select RTC_MC146818_LIB
        select SPARSE_IRQ
 -      select SRCU
        select SYSCTL_EXCEPTION_TRACE
        select THREAD_INFO_IN_TASK
        select TRACE_IRQFLAGS_SUPPORT
@@@ -435,7 -434,7 +435,7 @@@ config SM
          Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
          Management" code will be disabled if you say Y here.
  
 -        See also <file:Documentation/x86/i386/IO-APIC.rst>,
 +        See also <file:Documentation/arch/x86/i386/IO-APIC.rst>,
          <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO available at
          <http://www.tldp.org/docs.html#howto>.
  
@@@ -1325,7 -1324,7 +1325,7 @@@ config MICROCOD
          the Linux kernel.
  
          The preferred method to load microcode from a detached initrd is described
 -        in Documentation/x86/microcode.rst. For that you need to enable
 +        in Documentation/arch/x86/microcode.rst. For that you need to enable
          CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the
          initrd for microcode blobs.
  
@@@ -1511,7 -1510,7 +1511,7 @@@ config X86_5LEVE
          A kernel with the option enabled can be booted on machines that
          support 4- or 5-level paging.
  
 -        See Documentation/x86/x86_64/5level-paging.rst for more
 +        See Documentation/arch/x86/x86_64/5level-paging.rst for more
          information.
  
          Say N if unsure.
@@@ -1775,7 -1774,7 +1775,7 @@@ config MTR
          You can safely say Y even if your machine doesn't have MTRRs, you'll
          just add about 9 KB to your kernel.
  
 -        See <file:Documentation/x86/mtrr.rst> for more information.
 +        See <file:Documentation/arch/x86/mtrr.rst> for more information.
  
  config MTRR_SANITIZER
        def_bool y
@@@ -1939,6 -1938,7 +1939,6 @@@ config X86_SG
        depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC
        depends on CRYPTO=y
        depends on CRYPTO_SHA256=y
 -      select SRCU
        select MMU_NOTIFIER
        select NUMA_KEEP_MEMINFO if NUMA
        select XARRAY_MULTI
@@@ -2290,6 -2290,17 +2290,17 @@@ config RANDOMIZE_MEMORY_PHYSICAL_PADDIN
  
          If unsure, leave at the default value.
  
+ config ADDRESS_MASKING
+       bool "Linear Address Masking support"
+       depends on X86_64
+       help
+         Linear Address Masking (LAM) modifies the checking that is applied
+         to 64-bit linear addresses, allowing software to use the
+         untranslated address bits for metadata.
+ 
+         The capability can be used for efficient address sanitizers (ASAN)
+         implementation and for optimizations in JITs.
+ 
  config HOTPLUG_CPU
        def_bool y
        depends on SMP
@@@ -2551,7 -2562,7 +2562,7 @@@ config PAGE_TABLE_ISOLATIO
          ensuring that the majority of kernel addresses are not mapped
          into userspace.
  
 -        See Documentation/x86/pti.rst for more details.
 +        See Documentation/arch/x86/pti.rst for more details.
  
  config RETPOLINE
        bool "Avoid speculative indirect branches in kernel"
  
  extern atomic64_t last_mm_ctx_id;
  
 -#ifndef CONFIG_PARAVIRT_XXL
 -static inline void paravirt_activate_mm(struct mm_struct *prev,
 -                                      struct mm_struct *next)
 -{
 -}
 -#endif        /* !CONFIG_PARAVIRT_XXL */
 -
  #ifdef CONFIG_PERF_EVENTS
  DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
  DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
@@@ -85,6 -92,51 +85,51 @@@ static inline void switch_ldt(struct mm
  }
  #endif
  
+ #ifdef CONFIG_ADDRESS_MASKING
+ static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
+ {
+       return mm->context.lam_cr3_mask;
+ }
+ static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+ {
+       mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
+       mm->context.untag_mask = oldmm->context.untag_mask;
+ }
+ #define mm_untag_mask mm_untag_mask
+ static inline unsigned long mm_untag_mask(struct mm_struct *mm)
+ {
+       return mm->context.untag_mask;
+ }
+ static inline void mm_reset_untag_mask(struct mm_struct *mm)
+ {
+       mm->context.untag_mask = -1UL;
+ }
+ #define arch_pgtable_dma_compat arch_pgtable_dma_compat
+ static inline bool arch_pgtable_dma_compat(struct mm_struct *mm)
+ {
+       return !mm_lam_cr3_mask(mm) ||
+               test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags);
+ }
+ #else
+ static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
+ {
+       return 0;
+ }
+ static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+ {
+ }
+ static inline void mm_reset_untag_mask(struct mm_struct *mm)
+ {
+ }
+ #endif
  #define enter_lazy_tlb enter_lazy_tlb
  extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
  
@@@ -109,6 -161,7 +154,7 @@@ static inline int init_new_context(stru
                mm->context.execute_only_pkey = -1;
        }
  #endif
+       mm_reset_untag_mask(mm);
        init_new_context_ldt(mm);
        return 0;
  }
@@@ -128,7 -181,7 +174,7 @@@ extern void switch_mm_irqs_off(struct m
  
  #define activate_mm(prev, next)                       \
  do {                                          \
 -      paravirt_activate_mm((prev), (next));   \
 +      paravirt_enter_mmap(next);              \
        switch_mm((prev), (next), NULL);        \
  } while (0);
  
@@@ -161,7 -214,8 +207,8 @@@ static inline void arch_dup_pkeys(struc
  static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
  {
        arch_dup_pkeys(oldmm, mm);
 -      paravirt_arch_dup_mmap(oldmm, mm);
 +      paravirt_enter_mmap(mm);
+       dup_lam(oldmm, mm);
        return ldt_dup_context(oldmm, mm);
  }
  
@@@ -175,7 -229,7 +222,7 @@@ static inline void arch_exit_mmap(struc
  static inline bool is_64bit_mm(struct mm_struct *mm)
  {
        return  !IS_ENABLED(CONFIG_IA32_EMULATION) ||
-               !(mm->context.flags & MM_CONTEXT_UPROBE_IA32);
+               !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags);
  }
  #else
  static inline bool is_64bit_mm(struct mm_struct *mm)
  #define ARCH_GET_XCOMP_GUEST_PERM     0x1024
  #define ARCH_REQ_XCOMP_GUEST_PERM     0x1025
  
 +#define ARCH_XCOMP_TILECFG            17
 +#define ARCH_XCOMP_TILEDATA           18
 +
  #define ARCH_MAP_VDSO_X32             0x2001
  #define ARCH_MAP_VDSO_32              0x2002
  #define ARCH_MAP_VDSO_64              0x2003
  
+ #define ARCH_GET_UNTAG_MASK           0x4001
+ #define ARCH_ENABLE_TAGGED_ADDR               0x4002
+ #define ARCH_GET_MAX_TAG_BITS         0x4003
+ #define ARCH_FORCE_TAGGED_SVA         0x4004
  #endif /* _ASM_X86_PRCTL_H */
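
The GET commands above return their result through a user pointer passed as the second arch_prctl() argument (as the selftests in this series do). A minimal probe for the number of available tag bits might look like this sketch (illustrative, not from the merge; 0 means no address tagging is available):

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define ARCH_GET_MAX_TAG_BITS	0x4003

int main(void)
{
	unsigned long bits = 0;

	if (syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &bits))
		perror("ARCH_GET_MAX_TAG_BITS");
	else
		printf("max tag bits: %lu\n", bits);

	return 0;
}
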
diff --combined arch/x86/mm/init.c
@@@ -806,7 -806,7 +806,7 @@@ void __init poking_init(void
        BUG_ON(!poking_mm);
  
        /* Xen PV guests need the PGD to be pinned. */
 -      paravirt_arch_dup_mmap(NULL, poking_mm);
 +      paravirt_enter_mmap(poking_mm);
  
        /*
         * Randomize the poking address, but make sure that the following page
@@@ -1048,6 -1048,11 +1048,11 @@@ __visible DEFINE_PER_CPU_ALIGNED(struc
        .cr4 = ~0UL,    /* fail hard if we screw up cr4 shadow initialization */
  };
  
+ #ifdef CONFIG_ADDRESS_MASKING
+ DEFINE_PER_CPU(u64, tlbstate_untag_mask);
+ EXPORT_PER_CPU_SYMBOL(tlbstate_untag_mask);
+ #endif
  void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
  {
        /* entry 0 MUST be WB (hardwired to speed up translations) */
diff --combined arch/x86/mm/tlb.c
@@@ -154,26 -154,30 +154,30 @@@ static inline u16 user_pcid(u16 asid
        return ret;
  }
  
- static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
  {
+       unsigned long cr3 = __sme_pa(pgd) | lam;
        if (static_cpu_has(X86_FEATURE_PCID)) {
-               return __sme_pa(pgd) | kern_pcid(asid);
+               VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+               cr3 |= kern_pcid(asid);
        } else {
                VM_WARN_ON_ONCE(asid != 0);
-               return __sme_pa(pgd);
        }
+       return cr3;
  }
  
- static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
+                                             unsigned long lam)
  {
-       VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
        /*
         * Use boot_cpu_has() instead of this_cpu_has() as this function
         * might be called during early boot. This should work even after
         * boot because all CPUs have the same capabilities:
         */
        VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
-       return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+       return build_cr3(pgd, asid, lam) | CR3_NOFLUSH;
  }
  
  /*
@@@ -274,15 -278,16 +278,16 @@@ static inline void invalidate_user_asid
                  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
  }
  
- static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
+                           bool need_flush)
  {
        unsigned long new_mm_cr3;
  
        if (need_flush) {
                invalidate_user_asid(new_asid);
-               new_mm_cr3 = build_cr3(pgdir, new_asid);
+               new_mm_cr3 = build_cr3(pgdir, new_asid, lam);
        } else {
-               new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+               new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam);
        }
  
        /*
@@@ -491,6 -496,7 +496,7 @@@ void switch_mm_irqs_off(struct mm_struc
  {
        struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
        u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+       unsigned long new_lam = mm_lam_cr3_mask(next);
        bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
        unsigned cpu = smp_processor_id();
        u64 next_tlb_gen;
         * isn't free.
         */
  #ifdef CONFIG_DEBUG_VM
-       if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
+       if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid,
+                                                  tlbstate_lam_cr3_mask()))) {
                /*
                 * If we were to BUG here, we'd be very likely to kill
                 * the system so hard that we don't see the call trace.
         * instruction.
         */
        if (real_prev == next) {
+               /* Not actually switching mm's */
                VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
                           next->context.ctx_id);
  
                /*
+                * If this races with another thread that enables lam, 'new_lam'
+                * might not match tlbstate_lam_cr3_mask().
+                */
+               /*
                 * Even in lazy TLB mode, the CPU should stay set in the
                 * mm_cpumask. The TLB shootdown code can figure out from
                 * cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
                barrier();
        }
  
+       set_tlbstate_lam_mode(next);
        if (need_flush) {
                this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
                this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-               load_new_mm_cr3(next->pgd, new_asid, true);
+               load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
  
                trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
        } else {
                /* The new ASID is already up to date. */
-               load_new_mm_cr3(next->pgd, new_asid, false);
+               load_new_mm_cr3(next->pgd, new_asid, new_lam, false);
  
                trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
        }
@@@ -691,6 -705,10 +705,10 @@@ void initialize_tlbstate_and_flush(void
        /* Assert that CR3 already references the right mm. */
        WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
  
+       /* LAM expected to be disabled */
+       WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
+       WARN_ON(mm_lam_cr3_mask(mm));
        /*
         * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
         * doesn't work like other CR4 bits because it can only be set from
        WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
                !(cr4_read_shadow() & X86_CR4_PCIDE));
  
-       /* Force ASID 0 and force a TLB flush. */
-       write_cr3(build_cr3(mm->pgd, 0));
+       /* Disable LAM, force ASID 0 and force a TLB flush. */
+       write_cr3(build_cr3(mm->pgd, 0, 0));
  
        /* Reinitialize tlbstate. */
        this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
        this_cpu_write(cpu_tlbstate.next_asid, 1);
        this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
        this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
+       set_tlbstate_lam_mode(mm);
  
        for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
                this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
@@@ -925,7 -944,7 +944,7 @@@ void flush_tlb_multi(const struct cpuma
  }
  
  /*
 - * See Documentation/x86/tlb.rst for details.  We choose 33
 + * See Documentation/arch/x86/tlb.rst for details.  We choose 33
   * because it is large enough to cover the vast majority (at
   * least 95%) of allocations, and is small enough that we are
   * confident it will not cause too much overhead.  Each single
@@@ -1071,8 -1090,10 +1090,10 @@@ void flush_tlb_kernel_range(unsigned lo
   */
  unsigned long __get_current_cr3_fast(void)
  {
-       unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
-               this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+       unsigned long cr3 =
+               build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+                         this_cpu_read(cpu_tlbstate.loaded_mm_asid),
+                         tlbstate_lam_cr3_mask());
  
        /* For now, be very restrictive about when this can be called. */
        VM_WARN_ON(in_nmi() || preemptible());
diff --combined fs/proc/array.c
@@@ -91,6 -91,7 +91,7 @@@
  #include <linux/user_namespace.h>
  #include <linux/fs_struct.h>
  #include <linux/kthread.h>
+ #include <linux/mmu_context.h>
  
  #include <asm/processor.h>
  #include "internal.h"
@@@ -219,8 -220,6 +220,8 @@@ static inline void task_state(struct se
                seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
  #endif
        seq_putc(m, '\n');
 +
 +      seq_printf(m, "Kthread:\t%c\n", p->flags & PF_KTHREAD ? '1' : '0');
  }
  
  void render_sigset_t(struct seq_file *m, const char *header,
@@@ -425,6 -424,11 +426,11 @@@ static inline void task_thp_status(stru
        seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
  }
  
+ static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm)
+ {
+       seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm));
+ }
  int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
  {
                task_mem(m, mm);
                task_core_dumping(m, task);
                task_thp_status(m, mm);
+               task_untag_mask(m, mm);
                mmput(mm);
        }
        task_sig(m, task);
diff --combined fs/proc/task_mmu.c
@@@ -782,6 -782,7 +782,6 @@@ static void smap_gather_stats(struct vm
        if (start >= vma->vm_end)
                return;
  
 -#ifdef CONFIG_SHMEM
        if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
                /*
                 * For shared or readonly shmem mappings we know that all
                        ops = &smaps_shmem_walk_ops;
                }
        }
 -#endif
 +
        /* mmap_lock is held in m_start */
        if (!start)
                walk_page_vma(vma, ops, mss);
@@@ -1688,8 -1689,13 +1688,13 @@@ static ssize_t pagemap_read(struct fil
  
        /* watch out for wraparound */
        start_vaddr = end_vaddr;
-       if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
-               start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
+       if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
+               ret = mmap_read_lock_killable(mm);
+               if (ret)
+                       goto out_free;
+               start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
+               mmap_read_unlock(mm);
+       }
  
        /* Ensure the address is inside the task */
        if (start_vaddr > mm->task_size)
diff --combined include/linux/mm.h
@@@ -29,7 -29,6 +29,7 @@@
  #include <linux/pgtable.h>
  #include <linux/kasan.h>
  #include <linux/memremap.h>
 +#include <linux/slab.h>
  
  struct mempolicy;
  struct anon_vma;
@@@ -39,7 -38,6 +39,7 @@@ struct pt_regs
  
  extern int sysctl_page_lock_unfairness;
  
 +void mm_core_init(void);
  void init_mm_internals(void);
  
  #ifndef CONFIG_NUMA           /* Don't use mapnrs, do it properly */
@@@ -98,17 -96,6 +98,6 @@@ extern int mmap_rnd_compat_bits __read_
  #include <asm/page.h>
  #include <asm/processor.h>
  
- /*
-  * Architectures that support memory tagging (assigning tags to memory regions,
-  * embedding these tags into addresses that point to these memory regions, and
-  * checking that the memory and the pointer tags match on memory accesses)
-  * redefine this macro to strip tags from pointers.
-  * It's defined as noop for architectures that don't support memory tagging.
-  */
- #ifndef untagged_addr
- #define untagged_addr(addr) (addr)
- #endif
  #ifndef __pa_symbol
  #define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
  #endif
@@@ -258,8 -245,6 +247,8 @@@ void setup_initial_init_mm(void *start_
  struct vm_area_struct *vm_area_alloc(struct mm_struct *);
  struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
  void vm_area_free(struct vm_area_struct *);
 +/* Use only if VMA has no other users */
 +void __vm_area_free(struct vm_area_struct *vma);
  
  #ifndef CONFIG_MMU
  extern struct rb_root nommu_region_tree;
@@@ -482,8 -467,7 +471,8 @@@ static inline bool fault_flag_allow_ret
        { FAULT_FLAG_USER,              "USER" }, \
        { FAULT_FLAG_REMOTE,            "REMOTE" }, \
        { FAULT_FLAG_INSTRUCTION,       "INSTRUCTION" }, \
 -      { FAULT_FLAG_INTERRUPTIBLE,     "INTERRUPTIBLE" }
 +      { FAULT_FLAG_INTERRUPTIBLE,     "INTERRUPTIBLE" }, \
 +      { FAULT_FLAG_VMA_LOCK,          "VMA_LOCK" }
  
  /*
   * vm_fault is filled by the pagefault handler and passed to the vma's
@@@ -628,131 -612,6 +617,131 @@@ struct vm_operations_struct 
                                          unsigned long addr);
  };
  
 +#ifdef CONFIG_NUMA_BALANCING
 +static inline void vma_numab_state_init(struct vm_area_struct *vma)
 +{
 +      vma->numab_state = NULL;
 +}
 +static inline void vma_numab_state_free(struct vm_area_struct *vma)
 +{
 +      kfree(vma->numab_state);
 +}
 +#else
 +static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
 +static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
 +#endif /* CONFIG_NUMA_BALANCING */
 +
 +#ifdef CONFIG_PER_VMA_LOCK
 +/*
 + * Try to read-lock a vma. The function is allowed to occasionally yield false
 + * locked result to avoid performance overhead, in which case we fall back to
 + * using mmap_lock. The function should never yield false unlocked result.
 + */
 +static inline bool vma_start_read(struct vm_area_struct *vma)
 +{
 +      /* Check before locking. A race might cause false locked result. */
 +      if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
 +              return false;
 +
 +      if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
 +              return false;
 +
 +      /*
 +       * Overflow might produce false locked result.
 +       * False unlocked result is impossible because we modify and check
 +       * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
 +       * modification invalidates all existing locks.
 +       */
 +      if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
 +              up_read(&vma->vm_lock->lock);
 +              return false;
 +      }
 +      return true;
 +}
 +
 +static inline void vma_end_read(struct vm_area_struct *vma)
 +{
 +      rcu_read_lock(); /* keeps vma alive till the end of up_read */
 +      up_read(&vma->vm_lock->lock);
 +      rcu_read_unlock();
 +}
 +
 +static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
 +{
 +      mmap_assert_write_locked(vma->vm_mm);
 +
 +      /*
 +       * current task is holding mmap_write_lock, both vma->vm_lock_seq and
 +       * mm->mm_lock_seq can't be concurrently modified.
 +       */
 +      *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
 +      return (vma->vm_lock_seq == *mm_lock_seq);
 +}
 +
 +static inline void vma_start_write(struct vm_area_struct *vma)
 +{
 +      int mm_lock_seq;
 +
 +      if (__is_vma_write_locked(vma, &mm_lock_seq))
 +              return;
 +
 +      down_write(&vma->vm_lock->lock);
 +      vma->vm_lock_seq = mm_lock_seq;
 +      up_write(&vma->vm_lock->lock);
 +}
 +
 +static inline bool vma_try_start_write(struct vm_area_struct *vma)
 +{
 +      int mm_lock_seq;
 +
 +      if (__is_vma_write_locked(vma, &mm_lock_seq))
 +              return true;
 +
 +      if (!down_write_trylock(&vma->vm_lock->lock))
 +              return false;
 +
 +      vma->vm_lock_seq = mm_lock_seq;
 +      up_write(&vma->vm_lock->lock);
 +      return true;
 +}
 +
 +static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 +{
 +      int mm_lock_seq;
 +
 +      VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
 +}
 +
 +static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
 +{
 +      /* When detaching vma should be write-locked */
 +      if (detached)
 +              vma_assert_write_locked(vma);
 +      vma->detached = detached;
 +}
 +
 +struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
 +                                        unsigned long address);
 +
 +#else /* CONFIG_PER_VMA_LOCK */
 +
 +static inline void vma_init_lock(struct vm_area_struct *vma) {}
 +static inline bool vma_start_read(struct vm_area_struct *vma)
 +              { return false; }
 +static inline void vma_end_read(struct vm_area_struct *vma) {}
 +static inline void vma_start_write(struct vm_area_struct *vma) {}
 +static inline bool vma_try_start_write(struct vm_area_struct *vma)
 +              { return true; }
 +static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
 +static inline void vma_mark_detached(struct vm_area_struct *vma,
 +                                   bool detached) {}
 +
 +#endif /* CONFIG_PER_VMA_LOCK */
 +
 +/*
 + * WARNING: vma_init does not initialize vma->vm_lock.
 + * Use vm_area_alloc()/vm_area_free() if vma needs locking.
 + */
  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
  {
        static const struct vm_operations_struct dummy_vm_ops = {};
        vma->vm_mm = mm;
        vma->vm_ops = &dummy_vm_ops;
        INIT_LIST_HEAD(&vma->anon_vma_chain);
 +      vma_mark_detached(vma, false);
 +      vma_numab_state_init(vma);
  }
  
  /* Use when VMA is not part of the VMA tree and needs no locking */
@@@ -776,28 -633,28 +765,28 @@@ static inline void vm_flags_init(struc
  static inline void vm_flags_reset(struct vm_area_struct *vma,
                                  vm_flags_t flags)
  {
 -      mmap_assert_write_locked(vma->vm_mm);
 +      vma_start_write(vma);
        vm_flags_init(vma, flags);
  }
  
  static inline void vm_flags_reset_once(struct vm_area_struct *vma,
                                       vm_flags_t flags)
  {
 -      mmap_assert_write_locked(vma->vm_mm);
 +      vma_start_write(vma);
        WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
  }
  
  static inline void vm_flags_set(struct vm_area_struct *vma,
                                vm_flags_t flags)
  {
 -      mmap_assert_write_locked(vma->vm_mm);
 +      vma_start_write(vma);
        ACCESS_PRIVATE(vma, __vm_flags) |= flags;
  }
  
  static inline void vm_flags_clear(struct vm_area_struct *vma,
                                  vm_flags_t flags)
  {
 -      mmap_assert_write_locked(vma->vm_mm);
 +      vma_start_write(vma);
        ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
  }
  
@@@ -818,7 -675,7 +807,7 @@@ static inline void __vm_flags_mod(struc
  static inline void vm_flags_mod(struct vm_area_struct *vma,
                                vm_flags_t set, vm_flags_t clear)
  {
 -      mmap_assert_write_locked(vma->vm_mm);
 +      vma_start_write(vma);
        __vm_flags_mod(vma, set, clear);
  }
  
@@@ -1686,16 -1543,6 +1675,16 @@@ static inline int xchg_page_access_time
        last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
        return last_time << PAGE_ACCESS_TIME_BUCKETS;
  }
 +
 +static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 +{
 +      unsigned int pid_bit;
 +
 +      pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
 +      if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
 +              __set_bit(pid_bit, &vma->numab_state->access_pids[1]);
 +      }
 +}
  #else /* !CONFIG_NUMA_BALANCING */
  static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
  {
@@@ -1745,10 -1592,6 +1734,10 @@@ static inline bool cpupid_match_pid(str
  {
        return false;
  }
 +
 +static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 +{
 +}
  #endif /* CONFIG_NUMA_BALANCING */
  
  #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
@@@ -2782,6 -2625,12 +2771,6 @@@ static inline bool ptlock_init(struct p
  static inline void ptlock_free(struct page *page) {}
  #endif /* USE_SPLIT_PTE_PTLOCKS */
  
 -static inline void pgtable_init(void)
 -{
 -      ptlock_cache_init();
 -      pgtable_cache_init();
 -}
 -
  static inline bool pgtable_pte_page_ctor(struct page *page)
  {
        if (!ptlock_init(page))
@@@ -2925,6 -2774,7 +2914,6 @@@ extern unsigned long free_reserved_area
                                        int poison, const char *s);
  
  extern void adjust_managed_page_count(struct page *page, long count);
 -extern void mem_init_print_info(void);
  
  extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
  
@@@ -3035,6 -2885,7 +3024,6 @@@ extern void setup_per_cpu_pageset(void)
  extern int min_free_kbytes;
  extern int watermark_boost_factor;
  extern int watermark_scale_factor;
 -extern bool arch_has_descending_max_zone_pfns(void);
  
  /* nommu.c */
  extern atomic_long_t mmap_pages_allocated;
@@@ -3323,6 -3174,8 +3312,6 @@@ vm_fault_t vmf_insert_pfn_prot(struct v
                        unsigned long pfn, pgprot_t pgprot);
  vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn);
 -vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
 -                      pfn_t pfn, pgprot_t pgprot);
  vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn);
  int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
@@@ -3392,6 -3245,7 +3381,6 @@@ extern int apply_to_existing_page_range
                                   unsigned long address, unsigned long size,
                                   pte_fn_t fn, void *data);
  
 -extern void __init init_mem_debugging_and_hardening(void);
  #ifdef CONFIG_PAGE_POISONING
  extern void __kernel_poison_pages(struct page *page, int numpages);
  extern void __kernel_unpoison_pages(struct page *page, int numpages);
@@@ -3560,22 -3414,6 +3549,22 @@@ void vmemmap_populate_print_last(void)
  void vmemmap_free(unsigned long start, unsigned long end,
                struct vmem_altmap *altmap);
  #endif
 +
 +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
 +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
 +                                         struct dev_pagemap *pgmap)
 +{
 +      return is_power_of_2(sizeof(struct page)) &&
 +              pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
 +}
 +#else
 +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
 +                                         struct dev_pagemap *pgmap)
 +{
 +      return false;
 +}
 +#endif
 +
  void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
                                  unsigned long nr_pages);
  
@@@ -3593,6 -3431,8 +3582,6 @@@ int mf_dax_kill_procs(struct address_sp
  extern int memory_failure(unsigned long pfn, int flags);
  extern void memory_failure_queue_kick(int cpu);
  extern int unpoison_memory(unsigned long pfn);
 -extern int sysctl_memory_failure_early_kill;
 -extern int sysctl_memory_failure_recovery;
  extern void shake_page(struct page *p);
  extern atomic_long_t num_poisoned_pages __read_mostly;
  extern int soft_offline_page(unsigned long pfn, int flags);
@@@ -3602,7 -3442,6 +3591,7 @@@ extern int __get_huge_page_for_hwpoison
                                        bool *migratable_cleared);
  void num_poisoned_pages_inc(unsigned long pfn);
  void num_poisoned_pages_sub(unsigned long pfn, long i);
 +struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
  #else
  static inline void memory_failure_queue(unsigned long pfn, int flags)
  {
@@@ -3623,12 -3462,6 +3612,12 @@@ static inline void num_poisoned_pages_s
  }
  #endif
  
 +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM)
 +void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
 +                   struct vm_area_struct *vma, struct list_head *to_kill,
 +                   unsigned long ksm_addr);
 +#endif
 +
  #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
  extern void memblk_nr_poison_inc(unsigned long pfn);
  extern void memblk_nr_poison_sub(unsigned long pfn, long i);
@@@ -3698,12 -3531,14 +3687,12 @@@ extern const struct attribute_group mem
  extern void clear_huge_page(struct page *page,
                            unsigned long addr_hint,
                            unsigned int pages_per_huge_page);
 -extern void copy_user_huge_page(struct page *dst, struct page *src,
 -                              unsigned long addr_hint,
 -                              struct vm_area_struct *vma,
 -                              unsigned int pages_per_huge_page);
 -extern long copy_huge_page_from_user(struct page *dst_page,
 -                              const void __user *usr_src,
 -                              unsigned int pages_per_huge_page,
 -                              bool allow_pagefault);
 +int copy_user_large_folio(struct folio *dst, struct folio *src,
 +                        unsigned long addr_hint,
 +                        struct vm_area_struct *vma);
 +long copy_folio_from_user(struct folio *dst_folio,
 +                         const void __user *usr_src,
 +                         bool allow_pagefault);
  
  /**
   * vma_is_special_huge - Are transhuge page-table entries considered special?
diff --combined include/linux/sched/mm.h
@@@ -79,34 -79,6 +79,34 @@@ static inline void mmdrop_sched(struct 
  }
  #endif
  
 +/* Helpers for lazy TLB mm refcounting */
 +static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
 +{
 +      if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
 +              mmgrab(mm);
 +}
 +
 +static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
 +{
 +      if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
 +              mmdrop(mm);
 +      } else {
 +              /*
 +               * mmdrop_lazy_tlb must provide a full memory barrier, see the
 +               * membarrier comment in finish_task_switch() which relies on this.
 +               */
 +              smp_mb();
 +      }
 +}
 +
 +static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
 +{
 +      if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
 +              mmdrop_sched(mm);
 +      else
 +              smp_mb(); /* see mmdrop_lazy_tlb() above */
 +}
 +
  /**
   * mmget() - Pin the address space associated with a &struct mm_struct.
   * @mm: The address space to pin.
@@@ -485,6 -457,11 +485,11 @@@ static inline void mm_pasid_init(struc
        mm->pasid = INVALID_IOASID;
  }
  
+ static inline bool mm_valid_pasid(struct mm_struct *mm)
+ {
+       return mm->pasid != INVALID_IOASID;
+ }
  /* Associate a PASID with an mm_struct: */
  static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid)
  {
  
  static inline void mm_pasid_drop(struct mm_struct *mm)
  {
-       if (pasid_valid(mm->pasid)) {
+       if (mm_valid_pasid(mm)) {
                ioasid_free(mm->pasid);
                mm->pasid = INVALID_IOASID;
        }
  }
  #else
  static inline void mm_pasid_init(struct mm_struct *mm) {}
+ static inline bool mm_valid_pasid(struct mm_struct *mm) { return false; }
  static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) {}
  static inline void mm_pasid_drop(struct mm_struct *mm) {}
  #endif
diff --combined mm/gup.c
@@@ -1085,7 -1085,7 +1085,7 @@@ static long __get_user_pages(struct mm_
        if (!nr_pages)
                return 0;
  
-       start = untagged_addr(start);
+       start = untagged_addr_remote(mm, start);
  
        VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
  
@@@ -1259,7 -1259,7 +1259,7 @@@ int fixup_user_fault(struct mm_struct *
        struct vm_area_struct *vma;
        vm_fault_t ret;
  
-       address = untagged_addr(address);
+       address = untagged_addr_remote(mm, address);
  
        if (unlocked)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@@ -2193,7 -2193,7 +2193,7 @@@ static bool is_valid_gup_args(struct pa
   * This does not guarantee that the page exists in the user mappings when
   * get_user_pages_remote returns, and there may even be a completely different
   * page there in some cases (eg. if mmapped pagecache has been invalidated
 - * and subsequently re faulted). However it does guarantee that the page
 + * and subsequently re-faulted). However it does guarantee that the page
   * won't be freed completely. And mostly callers simply care that the page
   * contains data that was valid *at some point in time*. Typically, an IO
   * or similar operation cannot guarantee anything stronger anyway because
diff --combined mm/madvise.c
@@@ -852,9 -852,21 +852,9 @@@ static long madvise_dontneed_free(struc
                *prev = NULL; /* mmap_lock has been dropped, prev is stale */
  
                mmap_read_lock(mm);
 -              vma = find_vma(mm, start);
 +              vma = vma_lookup(mm, start);
                if (!vma)
                        return -ENOMEM;
 -              if (start < vma->vm_start) {
 -                      /*
 -                       * This "vma" under revalidation is the one
 -                       * with the lowest vma->vm_start where start
 -                       * is also < vma->vm_end. If start <
 -                       * vma->vm_start it means an hole materialized
 -                       * in the user address space within the
 -                       * virtual range passed to MADV_DONTNEED
 -                       * or MADV_FREE.
 -                       */
 -                      return -ENOMEM;
 -              }
                /*
                 * Potential end adjustment for hugetlb vma is OK as
                 * the check below keeps end within vma.
@@@ -1390,8 -1402,6 +1390,6 @@@ int do_madvise(struct mm_struct *mm, un
        size_t len;
        struct blk_plug plug;
  
-       start = untagged_addr(start);
        if (!madvise_behavior_valid(behavior))
                return -EINVAL;
  
                mmap_read_lock(mm);
        }
  
+       start = untagged_addr_remote(mm, start);
+       end = start + len;
        blk_start_plug(&plug);
        error = madvise_walk_vmas(mm, start, end, behavior,
                        madvise_vma_behavior);
@@@ -1444,7 -1457,7 +1445,7 @@@ SYSCALL_DEFINE5(process_madvise, int, p
                size_t, vlen, int, behavior, unsigned int, flags)
  {
        ssize_t ret;
 -      struct iovec iovstack[UIO_FASTIOV], iovec;
 +      struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        struct task_struct *task;
        total_len = iov_iter_count(&iter);
  
        while (iov_iter_count(&iter)) {
 -              iovec = iov_iter_iovec(&iter);
 -              ret = do_madvise(mm, (unsigned long)iovec.iov_base,
 -                                      iovec.iov_len, behavior);
 +              ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
 +                                      iter_iov_len(&iter), behavior);
                if (ret < 0)
                        break;
 -              iov_iter_advance(&iter, iovec.iov_len);
 +              iov_iter_advance(&iter, iter_iov_len(&iter));
        }
  
        ret = (total_len - iov_iter_count(&iter)) ? : ret;
diff --combined mm/migrate.c
@@@ -213,15 -213,20 +213,15 @@@ static bool remove_migration_pte(struc
                if (pte_swp_soft_dirty(*pvmw.pte))
                        pte = pte_mksoft_dirty(pte);
  
 -              /*
 -               * Recheck VMA as permissions can change since migration started
 -               */
                entry = pte_to_swp_entry(*pvmw.pte);
                if (!is_migration_entry_young(entry))
                        pte = pte_mkold(pte);
                if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
                        pte = pte_mkdirty(pte);
                if (is_writable_migration_entry(entry))
 -                      pte = maybe_mkwrite(pte, vma);
 +                      pte = pte_mkwrite(pte);
                else if (pte_swp_uffd_wp(*pvmw.pte))
                        pte = pte_mkuffd_wp(pte);
 -              else
 -                      pte = pte_wrprotect(pte);
  
                if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
                        rmap_flags |= RMAP_EXCLUSIVE;
                if (folio_test_hugetlb(folio)) {
                        unsigned int shift = huge_page_shift(hstate_vma(vma));
  
 -                      pte = pte_mkhuge(pte);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
                        if (folio_test_anon(folio))
                                hugepage_add_anon_rmap(new, vma, pvmw.address,
@@@ -1106,8 -1112,9 +1106,8 @@@ static void migrate_folio_done(struct f
  /* Obtain the lock on page, remove all ptes. */
  static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
                               unsigned long private, struct folio *src,
 -                             struct folio **dstp, int force, bool avoid_force_lock,
 -                             enum migrate_mode mode, enum migrate_reason reason,
 -                             struct list_head *ret)
 +                             struct folio **dstp, enum migrate_mode mode,
 +                             enum migrate_reason reason, struct list_head *ret)
  {
        struct folio *dst;
        int rc = -EAGAIN;
        dst->private = NULL;
  
        if (!folio_trylock(src)) {
 -              if (!force || mode == MIGRATE_ASYNC)
 +              if (mode == MIGRATE_ASYNC)
                        goto out;
  
                /*
                if (current->flags & PF_MEMALLOC)
                        goto out;
  
 -              /*
 -               * We have locked some folios and are going to wait to lock
 -               * this folio.  To avoid a potential deadlock, let's bail
 -               * out and not do that. The locked folios will be moved and
 -               * unlocked, then we can wait to lock this folio.
 -               */
 -              if (avoid_force_lock) {
 -                      rc = -EDEADLOCK;
 -                      goto out;
 -              }
 -
                folio_lock(src);
        }
        locked = true;
                        rc = -EBUSY;
                        goto out;
                }
 -              if (!force)
 -                      goto out;
                folio_wait_writeback(src);
        }
  
                /* Establish migration ptes */
                VM_BUG_ON_FOLIO(folio_test_anon(src) &&
                               !folio_test_ksm(src) && !anon_vma, src);
 -              try_to_migrate(src, TTU_BATCH_FLUSH);
 +              try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
                page_was_mapped = 1;
        }
  
@@@ -1247,7 -1267,7 +1247,7 @@@ out
         * A folio that has not been unmapped will be restored to
         * right list unless we want to retry.
         */
 -      if (rc == -EAGAIN || rc == -EDEADLOCK)
 +      if (rc == -EAGAIN)
                ret = NULL;
  
        migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret);
@@@ -1488,9 -1508,6 +1488,9 @@@ static inline int try_split_folio(struc
  #define NR_MAX_BATCHED_MIGRATION      512
  #endif
  #define NR_MAX_MIGRATE_PAGES_RETRY    10
 +#define NR_MAX_MIGRATE_ASYNC_RETRY    3
 +#define NR_MAX_MIGRATE_SYNC_RETRY                                     \
 +      (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY)
  
  struct migrate_pages_stats {
        int nr_succeeded;       /* Normal and large folios migrated successfully, in
@@@ -1601,19 -1618,13 +1601,19 @@@ static int migrate_hugetlbs(struct list
  /*
   * migrate_pages_batch() first unmaps folios in the from list as many as
   * possible, then move the unmapped folios.
 + *
 + * We only batch migration if mode == MIGRATE_ASYNC, to avoid waiting on a
 + * lock or bit while we have locked more than one folio, which may cause
 + * deadlock (e.g., for loop device).  So, if mode != MIGRATE_ASYNC, the
 + * length of the from list must be <= 1.
   */
  static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
                free_page_t put_new_page, unsigned long private,
                enum migrate_mode mode, int reason, struct list_head *ret_folios,
 -              struct migrate_pages_stats *stats)
 +              struct list_head *split_folios, struct migrate_pages_stats *stats,
 +              int nr_pass)
  {
 -      int retry;
 +      int retry = 1;
        int large_retry = 1;
        int thp_retry = 1;
        int nr_failed = 0;
        bool is_large = false;
        bool is_thp = false;
        struct folio *folio, *folio2, *dst = NULL, *dst2;
 -      int rc, rc_saved, nr_pages;
 -      LIST_HEAD(split_folios);
 +      int rc, rc_saved = 0, nr_pages;
        LIST_HEAD(unmap_folios);
        LIST_HEAD(dst_folios);
        bool nosplit = (reason == MR_NUMA_MISPLACED);
 -      bool no_split_folio_counting = false;
 -      bool avoid_force_lock;
  
 -retry:
 -      rc_saved = 0;
 -      avoid_force_lock = false;
 -      retry = 1;
 -      for (pass = 0;
 -           pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
 -           pass++) {
 +      VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
 +                      !list_empty(from) && !list_is_singular(from));
 +
 +      for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
                retry = 0;
                large_retry = 0;
                thp_retry = 0;
                        if (!thp_migration_supported() && is_thp) {
                                nr_large_failed++;
                                stats->nr_thp_failed++;
 -                              if (!try_split_folio(folio, &split_folios)) {
 +                              if (!try_split_folio(folio, split_folios)) {
                                        stats->nr_thp_split++;
                                        continue;
                                }
                        }
  
                        rc = migrate_folio_unmap(get_new_page, put_new_page, private,
 -                                               folio, &dst, pass > 2, avoid_force_lock,
 -                                               mode, reason, ret_folios);
 +                                               folio, &dst, mode, reason, ret_folios);
                        /*
                         * The rules are:
                         *      Success: folio will be freed
                         *      Unmap: folio will be put on unmap_folios list,
                         *             dst folio put on dst_folios list
                         *      -EAGAIN: stay on the from list
 -                       *      -EDEADLOCK: stay on the from list
                         *      -ENOMEM: stay on the from list
                         *      Other errno: put on ret_folios list
                         */
                                        stats->nr_thp_failed += is_thp;
                                        /* Large folio NUMA faulting doesn't split to retry. */
                                        if (!nosplit) {
 -                                              int ret = try_split_folio(folio, &split_folios);
 +                                              int ret = try_split_folio(folio, split_folios);
  
                                                if (!ret) {
                                                        stats->nr_thp_split += is_thp;
                                                        large_retry++;
                                                        thp_retry += is_thp;
                                                        nr_retry_pages += nr_pages;
 +                                                      /* Undo duplicated failure counting. */
 +                                                      nr_large_failed--;
 +                                                      stats->nr_thp_failed -= is_thp;
                                                        break;
                                                }
                                        }
 -                              } else if (!no_split_folio_counting) {
 +                              } else {
                                        nr_failed++;
                                }
  
                                stats->nr_failed_pages += nr_pages + nr_retry_pages;
 -                              /*
 -                               * There might be some split folios of fail-to-migrate large
 -                               * folios left in split_folios list. Move them to ret_folios
 -                               * list so that they could be put back to the right list by
 -                               * the caller otherwise the folio refcnt will be leaked.
 -                               */
 -                              list_splice_init(&split_folios, ret_folios);
                                /* nr_failed isn't updated for not used */
                                nr_large_failed += large_retry;
                                stats->nr_thp_failed += thp_retry;
                                        goto out;
                                else
                                        goto move;
 -                      case -EDEADLOCK:
 -                              /*
 -                               * The folio cannot be locked for potential deadlock.
 -                               * Go move (and unlock) all locked folios.  Then we can
 -                               * try again.
 -                               */
 -                              rc_saved = rc;
 -                              goto move;
                        case -EAGAIN:
                                if (is_large) {
                                        large_retry++;
                                        thp_retry += is_thp;
 -                              } else if (!no_split_folio_counting) {
 +                              } else {
                                        retry++;
                                }
                                nr_retry_pages += nr_pages;
                                stats->nr_thp_succeeded += is_thp;
                                break;
                        case MIGRATEPAGE_UNMAP:
 -                              /*
 -                               * We have locked some folios, don't force lock
 -                               * to avoid deadlock.
 -                               */
 -                              avoid_force_lock = true;
                                list_move_tail(&folio->lru, &unmap_folios);
                                list_add_tail(&dst->lru, &dst_folios);
                                break;
                                if (is_large) {
                                        nr_large_failed++;
                                        stats->nr_thp_failed += is_thp;
 -                              } else if (!no_split_folio_counting) {
 +                              } else {
                                        nr_failed++;
                                }
  
@@@ -1771,7 -1807,9 +1771,7 @@@ move
        try_to_unmap_flush();
  
        retry = 1;
 -      for (pass = 0;
 -           pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
 -           pass++) {
 +      for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
                retry = 0;
                large_retry = 0;
                thp_retry = 0;
                                if (is_large) {
                                        large_retry++;
                                        thp_retry += is_thp;
 -                              } else if (!no_split_folio_counting) {
 +                              } else {
                                        retry++;
                                }
                                nr_retry_pages += nr_pages;
                                if (is_large) {
                                        nr_large_failed++;
                                        stats->nr_thp_failed += is_thp;
 -                              } else if (!no_split_folio_counting) {
 +                              } else {
                                        nr_failed++;
                                }
  
                dst2 = list_next_entry(dst, lru);
        }
  
 -      /*
 -       * Try to migrate split folios of fail-to-migrate large folios, no
 -       * nr_failed counting in this round, since all split folios of a
 -       * large folio is counted as 1 failure in the first round.
 -       */
 -      if (rc >= 0 && !list_empty(&split_folios)) {
 -              /*
 -               * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY
 -               * retries) to ret_folios to avoid migrating them again.
 -               */
 -              list_splice_init(from, ret_folios);
 -              list_splice_init(&split_folios, from);
 -              no_split_folio_counting = true;
 -              goto retry;
 -      }
 +      return rc;
 +}
  
 +static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
 +              free_page_t put_new_page, unsigned long private,
 +              enum migrate_mode mode, int reason, struct list_head *ret_folios,
 +              struct list_head *split_folios, struct migrate_pages_stats *stats)
 +{
 +      int rc, nr_failed = 0;
 +      LIST_HEAD(folios);
 +      struct migrate_pages_stats astats;
 +
 +      memset(&astats, 0, sizeof(astats));
 +      /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */
 +      rc = migrate_pages_batch(from, get_new_page, put_new_page, private, MIGRATE_ASYNC,
 +                               reason, &folios, split_folios, &astats,
 +                               NR_MAX_MIGRATE_ASYNC_RETRY);
 +      stats->nr_succeeded += astats.nr_succeeded;
 +      stats->nr_thp_succeeded += astats.nr_thp_succeeded;
 +      stats->nr_thp_split += astats.nr_thp_split;
 +      if (rc < 0) {
 +              stats->nr_failed_pages += astats.nr_failed_pages;
 +              stats->nr_thp_failed += astats.nr_thp_failed;
 +              list_splice_tail(&folios, ret_folios);
 +              return rc;
 +      }
 +      stats->nr_thp_failed += astats.nr_thp_split;
 +      nr_failed += astats.nr_thp_split;
        /*
 -       * We have unlocked all locked folios, so we can force lock now, let's
 -       * try again.
 +       * Fall back to migrate all failed folios one by one synchronously. All
 +       * failed folios except split THPs will be retried, so their failure
 +       * isn't counted
         */
 -      if (rc == -EDEADLOCK)
 -              goto retry;
 +      list_splice_tail_init(&folios, from);
 +      while (!list_empty(from)) {
 +              list_move(from->next, &folios);
 +              rc = migrate_pages_batch(&folios, get_new_page, put_new_page,
 +                                       private, mode, reason, ret_folios,
 +                                       split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
 +              list_splice_tail_init(&folios, ret_folios);
 +              if (rc < 0)
 +                      return rc;
 +              nr_failed += rc;
 +      }
  
 -      return rc;
 +      return nr_failed;
  }
  
  /*
@@@ -1933,7 -1949,6 +1933,7 @@@ int migrate_pages(struct list_head *fro
        struct folio *folio, *folio2;
        LIST_HEAD(folios);
        LIST_HEAD(ret_folios);
 +      LIST_HEAD(split_folios);
        struct migrate_pages_stats stats;
  
        trace_mm_migrate_pages_start(mode, reason);
                                     mode, reason, &stats, &ret_folios);
        if (rc_gather < 0)
                goto out;
 +
  again:
        nr_pages = 0;
        list_for_each_entry_safe(folio, folio2, from, lru) {
                }
  
                nr_pages += folio_nr_pages(folio);
 -              if (nr_pages > NR_MAX_BATCHED_MIGRATION)
 +              if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
                        break;
        }
 -      if (nr_pages > NR_MAX_BATCHED_MIGRATION)
 -              list_cut_before(&folios, from, &folio->lru);
 +      if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
 +              list_cut_before(&folios, from, &folio2->lru);
        else
                list_splice_init(from, &folios);
 -      rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private,
 -                               mode, reason, &ret_folios, &stats);
 +      if (mode == MIGRATE_ASYNC)
 +              rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private,
 +                                       mode, reason, &ret_folios, &split_folios, &stats,
 +                                       NR_MAX_MIGRATE_PAGES_RETRY);
 +      else
 +              rc = migrate_pages_sync(&folios, get_new_page, put_new_page, private,
 +                                      mode, reason, &ret_folios, &split_folios, &stats);
        list_splice_tail_init(&folios, &ret_folios);
        if (rc < 0) {
                rc_gather = rc;
 +              list_splice_tail(&split_folios, &ret_folios);
                goto out;
        }
 +      if (!list_empty(&split_folios)) {
 +              /*
 +               * Failure isn't counted since all split folios of a large folio
 +               * are counted as 1 failure already.  And, we only try to migrate
 +               * with minimal effort, force MIGRATE_ASYNC mode and retry once.
 +               */
 +              migrate_pages_batch(&split_folios, get_new_page, put_new_page, private,
 +                                  MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1);
 +              list_splice_tail_init(&split_folios, &ret_folios);
 +      }
        rc_gather += rc;
        if (!list_empty(from))
                goto again;
@@@ -2099,15 -2097,18 +2099,18 @@@ static int do_move_pages_to_node(struc
   *         target node
   *     1 - when it has been queued
   */
- static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
+ static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
                int node, struct list_head *pagelist, bool migrate_all)
  {
        struct vm_area_struct *vma;
+       unsigned long addr;
        struct page *page;
        int err;
        bool isolated;
  
        mmap_read_lock(mm);
+       addr = (unsigned long)untagged_addr_remote(mm, p);
        err = -EFAULT;
        vma = vma_lookup(mm, addr);
        if (!vma || !vma_migratable(vma))
@@@ -2213,7 -2214,6 +2216,6 @@@ static int do_pages_move(struct mm_stru
  
        for (i = start = 0; i < nr_pages; i++) {
                const void __user *p;
-               unsigned long addr;
                int node;
  
                err = -EFAULT;
                        goto out_flush;
                if (get_user(node, nodes + i))
                        goto out_flush;
-               addr = (unsigned long)untagged_addr(p);
  
                err = -ENODEV;
                if (node < 0 || node >= MAX_NUMNODES)
                 * Errors in the page lookup or isolation are not fatal and we simply
                 * report them via status
                 */
-               err = add_page_for_migration(mm, addr, current_node,
-                               &pagelist, flags & MPOL_MF_MOVE_ALL);
+               err = add_page_for_migration(mm, p, current_node, &pagelist,
+                                            flags & MPOL_MF_MOVE_ALL);
  
                if (err > 0) {
                        /* The page is successfully queued for migration */