Merge tag 'x86_mm_for_6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
author	Linus Torvalds <torvalds@linux-foundation.org>	Fri, 28 Apr 2023 16:43:49 +0000 (09:43 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>	Fri, 28 Apr 2023 16:43:49 +0000 (09:43 -0700)
Pull x86 LAM (Linear Address Masking) support from Dave Hansen:
 "Add support for the new Linear Address Masking CPU feature.

  This is similar to ARM's Top Byte Ignore and allows userspace to store
  metadata in some bits of pointers without masking it out before use"
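
For context, a minimal userspace sketch of the interface this series adds (assuming an x86-64 CPU with LAM and a kernel built with CONFIG_ADDRESS_MASKING=y; the ARCH_* values are copied from the uapi header in the diff below, everything else is illustrative and not part of the merge):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Values from arch/x86/include/uapi/asm/prctl.h as added by this series. */
#define ARCH_GET_UNTAG_MASK	0x4001
#define ARCH_ENABLE_TAGGED_ADDR	0x4002

int main(void)
{
	unsigned long untag_mask = 0;
	uint64_t *p, *tagged;

	/* Request 6 tag bits; the kernel maps this to LAM_U57 (bits 62:57). */
	if (syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, 6)) {
		perror("ARCH_ENABLE_TAGGED_ADDR");
		return 1;
	}

	/* The same mask is exported as "untag_mask:" in /proc/$PID/status. */
	syscall(SYS_arch_prctl, ARCH_GET_UNTAG_MASK, &untag_mask);
	printf("untag mask: %#lx\n", untag_mask);

	p = malloc(sizeof(*p));
	*p = 42;

	/* Store metadata in the ignored bits and dereference without masking. */
	tagged = (uint64_t *)((uintptr_t)p | (0x2aULL << 57));
	printf("*tagged = %llu\n", (unsigned long long)*tagged);

	free(p);
	return 0;
}

Without LAM enabled for the process, dereferencing the tagged pointer would fault, since bits 62:57 make the address non-canonical.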

* tag 'x86_mm_for_6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm/iommu/sva: Do not allow to set FORCE_TAGGED_SVA bit from outside
  x86/mm/iommu/sva: Fix error code for LAM enabling failure due to SVA
  selftests/x86/lam: Add test cases for LAM vs thread creation
  selftests/x86/lam: Add ARCH_FORCE_TAGGED_SVA test cases for linear-address masking
  selftests/x86/lam: Add inherit test cases for linear-address masking
  selftests/x86/lam: Add io_uring test cases for linear-address masking
  selftests/x86/lam: Add mmap and SYSCALL test cases for linear-address masking
  selftests/x86/lam: Add malloc and tag-bits test cases for linear-address masking
  x86/mm/iommu/sva: Make LAM and SVA mutually exclusive
  iommu/sva: Replace pasid_valid() helper with mm_valid_pasid()
  mm: Expose untagging mask in /proc/$PID/status
  x86/mm: Provide arch_prctl() interface for LAM
  x86/mm: Reduce untagged_addr() overhead for systems without LAM
  x86/uaccess: Provide untagged_addr() and remove tags before address check
  mm: Introduce untagged_addr_remote()
  x86/mm: Handle LAM on context switch
  x86: CPUID and CR3/CR4 flags for Linear Address Masking
  x86: Allow atomic MM_CONTEXT flags setting
  x86/mm: Rework address range check in get_user() and put_user()
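
Several of the commits above rework untagged_addr(): tag bits are stripped with a per-mm untag mask while kernel addresses must pass through unchanged. A rough standalone sketch of that sign-extension trick (the mask value assumes LAM_U57, i.e. bits 62:57 are ignored; this is an illustration, not the kernel's exact code):

#include <stdint.h>
#include <stdio.h>

/* Assumed LAM_U57 untag mask: keep everything except bits 62:57. */
#define LAM_U57_UNTAG_MASK	(~(0x3fULL << 57))

static uint64_t untag(uint64_t addr, uint64_t untag_mask)
{
	/*
	 * 0 for user addresses (bit 63 clear), all-ones for kernel addresses
	 * (assumes signed >> is an arithmetic shift, as on GCC and Clang).
	 */
	uint64_t sign = (uint64_t)((int64_t)addr >> 63);

	/* Kernel addresses widen the mask to all-ones and pass through as-is. */
	return addr & (untag_mask | sign);
}

int main(void)
{
	uint64_t user = 0x2a007f1234567000ULL;	/* tag 0x15 in bits 62:57 */
	uint64_t kern = 0xffff888000001000ULL;	/* untouched by untagging */

	printf("%#llx -> %#llx\n", (unsigned long long)user,
	       (unsigned long long)untag(user, LAM_U57_UNTAG_MASK));
	printf("%#llx -> %#llx\n", (unsigned long long)kern,
	       (unsigned long long)untag(kern, LAM_U57_UNTAG_MASK));
	return 0;
}

This is also why "x86/mm: Reduce untagged_addr() overhead for systems without LAM" can keep the default mask at all-ones (see mm_reset_untag_mask() below): with an untag mask of -1UL the AND is a no-op for every address.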

12 files changed:
arch/x86/Kconfig
arch/x86/include/asm/mmu_context.h
arch/x86/include/uapi/asm/prctl.h
arch/x86/mm/init.c
arch/x86/mm/tlb.c
fs/proc/array.c
fs/proc/task_mmu.c
include/linux/mm.h
include/linux/sched/mm.h
mm/gup.c
mm/madvise.c
mm/migrate.c

diff --combined arch/x86/Kconfig
@@@ -27,7 -27,6 +27,7 @@@ config X86_6
        # Options that are inherently 64-bit kernel only:
        select ARCH_HAS_GIGANTIC_PAGE
        select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 +      select ARCH_SUPPORTS_PER_VMA_LOCK
        select ARCH_USE_CMPXCHG_LOCKREF
        select HAVE_ARCH_SOFT_DIRTY
        select MODULES_USE_ELF_RELA
@@@ -126,8 -125,8 +126,8 @@@ config X8
        select ARCH_WANTS_NO_INSTR
        select ARCH_WANT_GENERAL_HUGETLB
        select ARCH_WANT_HUGE_PMD_SHARE
 -      select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP  if X86_64
        select ARCH_WANT_LD_ORPHAN_WARN
 +      select ARCH_WANT_OPTIMIZE_VMEMMAP       if X86_64
        select ARCH_WANTS_THP_SWAP              if X86_64
        select ARCH_HAS_PARANOID_L1D_FLUSH
        select BUILDTIME_TABLE_SORT
        select GUP_GET_PXX_LOW_HIGH             if X86_PAE
        select HARDIRQS_SW_RESEND
        select HARDLOCKUP_CHECK_TIMESTAMP       if X86_64
 +      select HAS_IOPORT
        select HAVE_ACPI_APEI                   if ACPI
        select HAVE_ACPI_APEI_NMI               if ACPI
        select HAVE_ALIGNED_STRUCT_PAGE         if SLUB
        select RTC_LIB
        select RTC_MC146818_LIB
        select SPARSE_IRQ
 -      select SRCU
        select SYSCTL_EXCEPTION_TRACE
        select THREAD_INFO_IN_TASK
        select TRACE_IRQFLAGS_SUPPORT
@@@ -435,7 -434,7 +435,7 @@@ config SM
          Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
          Management" code will be disabled if you say Y here.
  
 -        See also <file:Documentation/x86/i386/IO-APIC.rst>,
 +        See also <file:Documentation/arch/x86/i386/IO-APIC.rst>,
          <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO available at
          <http://www.tldp.org/docs.html#howto>.
  
@@@ -1325,7 -1324,7 +1325,7 @@@ config MICROCOD
          the Linux kernel.
  
          The preferred method to load microcode from a detached initrd is described
 -        in Documentation/x86/microcode.rst. For that you need to enable
 +        in Documentation/arch/x86/microcode.rst. For that you need to enable
          CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the
          initrd for microcode blobs.
  
@@@ -1511,7 -1510,7 +1511,7 @@@ config X86_5LEVE
          A kernel with the option enabled can be booted on machines that
          support 4- or 5-level paging.
  
 -        See Documentation/x86/x86_64/5level-paging.rst for more
 +        See Documentation/arch/x86/x86_64/5level-paging.rst for more
          information.
  
          Say N if unsure.
@@@ -1775,7 -1774,7 +1775,7 @@@ config MTR
          You can safely say Y even if your machine doesn't have MTRRs, you'll
          just add about 9 KB to your kernel.
  
 -        See <file:Documentation/x86/mtrr.rst> for more information.
 +        See <file:Documentation/arch/x86/mtrr.rst> for more information.
  
  config MTRR_SANITIZER
        def_bool y
@@@ -1939,6 -1938,7 +1939,6 @@@ config X86_SG
        depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC
        depends on CRYPTO=y
        depends on CRYPTO_SHA256=y
 -      select SRCU
        select MMU_NOTIFIER
        select NUMA_KEEP_MEMINFO if NUMA
        select XARRAY_MULTI
@@@ -2290,6 -2290,17 +2290,17 @@@ config RANDOMIZE_MEMORY_PHYSICAL_PADDIN
  
          If unsure, leave at the default value.
  
+ config ADDRESS_MASKING
+       bool "Linear Address Masking support"
+       depends on X86_64
+       help
+         Linear Address Masking (LAM) modifies the checking that is applied
+         to 64-bit linear addresses, allowing software to use the
+         untranslated address bits for metadata.
+ 
+         The capability can be used for efficient address sanitizers (ASAN)
+         implementation and for optimizations in JITs.
+ 
  config HOTPLUG_CPU
        def_bool y
        depends on SMP
@@@ -2551,7 -2562,7 +2562,7 @@@ config PAGE_TABLE_ISOLATIO
          ensuring that the majority of kernel addresses are not mapped
          into userspace.
  
 -        See Documentation/x86/pti.rst for more details.
 +        See Documentation/arch/x86/pti.rst for more details.
  
  config RETPOLINE
        bool "Avoid speculative indirect branches in kernel"
  
  extern atomic64_t last_mm_ctx_id;
  
 -#ifndef CONFIG_PARAVIRT_XXL
 -static inline void paravirt_activate_mm(struct mm_struct *prev,
 -                                      struct mm_struct *next)
 -{
 -}
 -#endif        /* !CONFIG_PARAVIRT_XXL */
 -
  #ifdef CONFIG_PERF_EVENTS
  DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
  DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
@@@ -85,6 -92,51 +85,51 @@@ static inline void switch_ldt(struct mm
  }
  #endif
  
+ #ifdef CONFIG_ADDRESS_MASKING
+ static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
+ {
+       return mm->context.lam_cr3_mask;
+ }
+ static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+ {
+       mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
+       mm->context.untag_mask = oldmm->context.untag_mask;
+ }
+ #define mm_untag_mask mm_untag_mask
+ static inline unsigned long mm_untag_mask(struct mm_struct *mm)
+ {
+       return mm->context.untag_mask;
+ }
+ static inline void mm_reset_untag_mask(struct mm_struct *mm)
+ {
+       mm->context.untag_mask = -1UL;
+ }
+ #define arch_pgtable_dma_compat arch_pgtable_dma_compat
+ static inline bool arch_pgtable_dma_compat(struct mm_struct *mm)
+ {
+       return !mm_lam_cr3_mask(mm) ||
+               test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags);
+ }
+ #else
+ static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
+ {
+       return 0;
+ }
+ static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+ {
+ }
+ static inline void mm_reset_untag_mask(struct mm_struct *mm)
+ {
+ }
+ #endif
  #define enter_lazy_tlb enter_lazy_tlb
  extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
  
@@@ -109,6 -161,7 +154,7 @@@ static inline int init_new_context(stru
                mm->context.execute_only_pkey = -1;
        }
  #endif
+       mm_reset_untag_mask(mm);
        init_new_context_ldt(mm);
        return 0;
  }
@@@ -128,7 -181,7 +174,7 @@@ extern void switch_mm_irqs_off(struct m
  
  #define activate_mm(prev, next)                       \
  do {                                          \
 -      paravirt_activate_mm((prev), (next));   \
 +      paravirt_enter_mmap(next);              \
        switch_mm((prev), (next), NULL);        \
  } while (0);
  
@@@ -161,7 -214,8 +207,8 @@@ static inline void arch_dup_pkeys(struc
  static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
  {
        arch_dup_pkeys(oldmm, mm);
 -      paravirt_arch_dup_mmap(oldmm, mm);
 +      paravirt_enter_mmap(mm);
+       dup_lam(oldmm, mm);
        return ldt_dup_context(oldmm, mm);
  }
  
@@@ -175,7 -229,7 +222,7 @@@ static inline void arch_exit_mmap(struc
  static inline bool is_64bit_mm(struct mm_struct *mm)
  {
        return  !IS_ENABLED(CONFIG_IA32_EMULATION) ||
-               !(mm->context.flags & MM_CONTEXT_UPROBE_IA32);
+               !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags);
  }
  #else
  static inline bool is_64bit_mm(struct mm_struct *mm)
  #define ARCH_GET_XCOMP_GUEST_PERM     0x1024
  #define ARCH_REQ_XCOMP_GUEST_PERM     0x1025
  
 +#define ARCH_XCOMP_TILECFG            17
 +#define ARCH_XCOMP_TILEDATA           18
 +
  #define ARCH_MAP_VDSO_X32             0x2001
  #define ARCH_MAP_VDSO_32              0x2002
  #define ARCH_MAP_VDSO_64              0x2003
  
+ #define ARCH_GET_UNTAG_MASK           0x4001
+ #define ARCH_ENABLE_TAGGED_ADDR               0x4002
+ #define ARCH_GET_MAX_TAG_BITS         0x4003
+ #define ARCH_FORCE_TAGGED_SVA         0x4004
  #endif /* _ASM_X86_PRCTL_H */
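
The GET commands above return their result through a user pointer passed as the second arch_prctl() argument (as the selftests in this series do). A minimal probe for the number of available tag bits might look like this sketch (illustrative, not from the merge; 0 means no address tagging is available):

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define ARCH_GET_MAX_TAG_BITS	0x4003

int main(void)
{
	unsigned long bits = 0;

	if (syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &bits))
		perror("ARCH_GET_MAX_TAG_BITS");
	else
		printf("max tag bits: %lu\n", bits);

	return 0;
}
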
diff --combined arch/x86/mm/init.c
@@@ -806,7 -806,7 +806,7 @@@ void __init poking_init(void
        BUG_ON(!poking_mm);
  
        /* Xen PV guests need the PGD to be pinned. */
 -      paravirt_arch_dup_mmap(NULL, poking_mm);
 +      paravirt_enter_mmap(poking_mm);
  
        /*
         * Randomize the poking address, but make sure that the following page
@@@ -1048,6 -1048,11 +1048,11 @@@ __visible DEFINE_PER_CPU_ALIGNED(struc
        .cr4 = ~0UL,    /* fail hard if we screw up cr4 shadow initialization */
  };
  
+ #ifdef CONFIG_ADDRESS_MASKING
+ DEFINE_PER_CPU(u64, tlbstate_untag_mask);
+ EXPORT_PER_CPU_SYMBOL(tlbstate_untag_mask);
+ #endif
  void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
  {
        /* entry 0 MUST be WB (hardwired to speed up translations) */
diff --combined arch/x86/mm/tlb.c
@@@ -154,26 -154,30 +154,30 @@@ static inline u16 user_pcid(u16 asid
        return ret;
  }
  
- static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
  {
+       unsigned long cr3 = __sme_pa(pgd) | lam;
        if (static_cpu_has(X86_FEATURE_PCID)) {
-               return __sme_pa(pgd) | kern_pcid(asid);
+               VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+               cr3 |= kern_pcid(asid);
        } else {
                VM_WARN_ON_ONCE(asid != 0);
-               return __sme_pa(pgd);
        }
+       return cr3;
  }
  
- static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
+                                             unsigned long lam)
  {
-       VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
        /*
         * Use boot_cpu_has() instead of this_cpu_has() as this function
         * might be called during early boot. This should work even after
         * boot because all CPUs have the same capabilities:
         */
        VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
-       return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+       return build_cr3(pgd, asid, lam) | CR3_NOFLUSH;
  }
  
  /*
@@@ -274,15 -278,16 +278,16 @@@ static inline void invalidate_user_asid
                  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
  }
  
- static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
+                           bool need_flush)
  {
        unsigned long new_mm_cr3;
  
        if (need_flush) {
                invalidate_user_asid(new_asid);
-               new_mm_cr3 = build_cr3(pgdir, new_asid);
+               new_mm_cr3 = build_cr3(pgdir, new_asid, lam);
        } else {
-               new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+               new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam);
        }
  
        /*
@@@ -491,6 -496,7 +496,7 @@@ void switch_mm_irqs_off(struct mm_struc
  {
        struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
        u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+       unsigned long new_lam = mm_lam_cr3_mask(next);
        bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
        unsigned cpu = smp_processor_id();
        u64 next_tlb_gen;
         * isn't free.
         */
  #ifdef CONFIG_DEBUG_VM
-       if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
+       if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid,
+                                                  tlbstate_lam_cr3_mask()))) {
                /*
                 * If we were to BUG here, we'd be very likely to kill
                 * the system so hard that we don't see the call trace.
         * instruction.
         */
        if (real_prev == next) {
+               /* Not actually switching mm's */
                VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
                           next->context.ctx_id);
  
                /*
+                * If this races with another thread that enables lam, 'new_lam'
+                * might not match tlbstate_lam_cr3_mask().
+                */
+               /*
                 * Even in lazy TLB mode, the CPU should stay set in the
                 * mm_cpumask. The TLB shootdown code can figure out from
                 * cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
                barrier();
        }
  
+       set_tlbstate_lam_mode(next);
        if (need_flush) {
                this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
                this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-               load_new_mm_cr3(next->pgd, new_asid, true);
+               load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
  
                trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
        } else {
                /* The new ASID is already up to date. */
-               load_new_mm_cr3(next->pgd, new_asid, false);
+               load_new_mm_cr3(next->pgd, new_asid, new_lam, false);
  
                trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
        }
@@@ -691,6 -705,10 +705,10 @@@ void initialize_tlbstate_and_flush(void
        /* Assert that CR3 already references the right mm. */
        WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
  
+       /* LAM expected to be disabled */
+       WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
+       WARN_ON(mm_lam_cr3_mask(mm));
        /*
         * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
         * doesn't work like other CR4 bits because it can only be set from
        WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
                !(cr4_read_shadow() & X86_CR4_PCIDE));
  
-       /* Force ASID 0 and force a TLB flush. */
-       write_cr3(build_cr3(mm->pgd, 0));
+       /* Disable LAM, force ASID 0 and force a TLB flush. */
+       write_cr3(build_cr3(mm->pgd, 0, 0));
  
        /* Reinitialize tlbstate. */
        this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
        this_cpu_write(cpu_tlbstate.next_asid, 1);
        this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
        this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
+       set_tlbstate_lam_mode(mm);
  
        for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
                this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
@@@ -925,7 -944,7 +944,7 @@@ void flush_tlb_multi(const struct cpuma
  }
  
  /*
 - * See Documentation/x86/tlb.rst for details.  We choose 33
 + * See Documentation/arch/x86/tlb.rst for details.  We choose 33
   * because it is large enough to cover the vast majority (at
   * least 95%) of allocations, and is small enough that we are
   * confident it will not cause too much overhead.  Each single
@@@ -1071,8 -1090,10 +1090,10 @@@ void flush_tlb_kernel_range(unsigned lo
   */
  unsigned long __get_current_cr3_fast(void)
  {
-       unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
-               this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+       unsigned long cr3 =
+               build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+                         this_cpu_read(cpu_tlbstate.loaded_mm_asid),
+                         tlbstate_lam_cr3_mask());
  
        /* For now, be very restrictive about when this can be called. */
        VM_WARN_ON(in_nmi() || preemptible());
diff --combined fs/proc/array.c
@@@ -91,6 -91,7 +91,7 @@@
  #include <linux/user_namespace.h>
  #include <linux/fs_struct.h>
  #include <linux/kthread.h>
+ #include <linux/mmu_context.h>
  
  #include <asm/processor.h>
  #include "internal.h"
@@@ -219,8 -220,6 +220,8 @@@ static inline void task_state(struct se
                seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
  #endif
        seq_putc(m, '\n');
 +
 +      seq_printf(m, "Kthread:\t%c\n", p->flags & PF_KTHREAD ? '1' : '0');
  }
  
  void render_sigset_t(struct seq_file *m, const char *header,
@@@ -425,6 -424,11 +426,11 @@@ static inline void task_thp_status(stru
        seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
  }
  
+ static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm)
+ {
+       seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm));
+ }
  int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
  {
                task_mem(m, mm);
                task_core_dumping(m, task);
                task_thp_status(m, mm);
+               task_untag_mask(m, mm);
                mmput(mm);
        }
        task_sig(m, task);
diff --combined fs/proc/task_mmu.c
@@@ -782,6 -782,7 +782,6 @@@ static void smap_gather_stats(struct vm
        if (start >= vma->vm_end)
                return;
  
 -#ifdef CONFIG_SHMEM
        if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
                /*
                 * For shared or readonly shmem mappings we know that all
                        ops = &smaps_shmem_walk_ops;
                }
        }
 -#endif
 +
        /* mmap_lock is held in m_start */
        if (!start)
                walk_page_vma(vma, ops, mss);
@@@ -1688,8 -1689,13 +1688,13 @@@ static ssize_t pagemap_read(struct fil
  
        /* watch out for wraparound */
        start_vaddr = end_vaddr;
-       if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
-               start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
+       if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
+               ret = mmap_read_lock_killable(mm);
+               if (ret)
+                       goto out_free;
+               start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
+               mmap_read_unlock(mm);
+       }
  
        /* Ensure the address is inside the task */
        if (start_vaddr > mm->task_size)
diff --combined include/linux/mm.h
@@@ -29,7 -29,6 +29,7 @@@
  #include <linux/pgtable.h>
  #include <linux/kasan.h>
  #include <linux/memremap.h>
 +#include <linux/slab.h>
  
  struct mempolicy;
  struct anon_vma;
@@@ -39,7 -38,6 +39,7 @@@ struct pt_regs
  
  extern int sysctl_page_lock_unfairness;
  
 +void mm_core_init(void);
  void init_mm_internals(void);
  
  #ifndef CONFIG_NUMA           /* Don't use mapnrs, do it properly */
@@@ -98,17 -96,6 +98,6 @@@ extern int mmap_rnd_compat_bits __read_
  #include <asm/page.h>
  #include <asm/processor.h>
  
- /*
-  * Architectures that support memory tagging (assigning tags to memory regions,
-  * embedding these tags into addresses that point to these memory regions, and
-  * checking that the memory and the pointer tags match on memory accesses)
-  * redefine this macro to strip tags from pointers.
-  * It's defined as noop for architectures that don't support memory tagging.
-  */
- #ifndef untagged_addr
- #define untagged_addr(addr) (addr)
- #endif
  #ifndef __pa_symbol
  #define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
  #endif
@@@ -258,8 -245,6 +247,8 @@@ void setup_initial_init_mm(void *start_
  struct vm_area_struct *vm_area_alloc(struct mm_struct *);
  struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
  void vm_area_free(struct vm_area_struct *);
 +/* Use only if VMA has no other users */
 +void __vm_area_free(struct vm_area_struct *vma);
  
  #ifndef CONFIG_MMU
  extern struct rb_root nommu_region_tree;
@@@ -482,8 -467,7 +471,8 @@@ static inline bool fault_flag_allow_ret
        { FAULT_FLAG_USER,              "USER" }, \
        { FAULT_FLAG_REMOTE,            "REMOTE" }, \
        { FAULT_FLAG_INSTRUCTION,       "INSTRUCTION" }, \
 -      { FAULT_FLAG_INTERRUPTIBLE,     "INTERRUPTIBLE" }
 +      { FAULT_FLAG_INTERRUPTIBLE,     "INTERRUPTIBLE" }, \
 +      { FAULT_FLAG_VMA_LOCK,          "VMA_LOCK" }
  
  /*
   * vm_fault is filled by the pagefault handler and passed to the vma's
@@@ -628,131 -612,6 +617,131 @@@ struct vm_operations_struct 
                                          unsigned long addr);
  };
  
 +#ifdef CONFIG_NUMA_BALANCING
 +static inline void vma_numab_state_init(struct vm_area_struct *vma)
 +{
 +      vma->numab_state = NULL;
 +}
 +static inline void vma_numab_state_free(struct vm_area_struct *vma)
 +{
 +      kfree(vma->numab_state);
 +}
 +#else
 +static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
 +static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
 +#endif /* CONFIG_NUMA_BALANCING */
 +
 +#ifdef CONFIG_PER_VMA_LOCK
 +/*
 + * Try to read-lock a vma. The function is allowed to occasionally yield false
 + * locked result to avoid performance overhead, in which case we fall back to
 + * using mmap_lock. The function should never yield false unlocked result.
 + */
 +static inline bool vma_start_read(struct vm_area_struct *vma)
 +{
 +      /* Check before locking. A race might cause false locked result. */
 +      if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
 +              return false;
 +
 +      if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
 +              return false;
 +
 +      /*
 +       * Overflow might produce false locked result.
 +       * False unlocked result is impossible because we modify and check
 +       * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
 +       * modification invalidates all existing locks.
 +       */
 +      if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
 +              up_read(&vma->vm_lock->lock);
 +              return false;
 +      }
 +      return true;
 +}
 +
 +static inline void vma_end_read(struct vm_area_struct *vma)
 +{
 +      rcu_read_lock(); /* keeps vma alive till the end of up_read */
 +      up_read(&vma->vm_lock->lock);
 +      rcu_read_unlock();
 +}
 +
 +static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
 +{
 +      mmap_assert_write_locked(vma->vm_mm);
 +
 +      /*
 +       * current task is holding mmap_write_lock, both vma->vm_lock_seq and
 +       * mm->mm_lock_seq can't be concurrently modified.
 +       */
 +      *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
 +      return (vma->vm_lock_seq == *mm_lock_seq);
 +}
 +
 +static inline void vma_start_write(struct vm_area_struct *vma)
 +{
 +      int mm_lock_seq;
 +
 +      if (__is_vma_write_locked(vma, &mm_lock_seq))
 +              return;
 +
 +      down_write(&vma->vm_lock->lock);
 +      vma->vm_lock_seq = mm_lock_seq;
 +      up_write(&vma->vm_lock->lock);
 +}
 +
 +static inline bool vma_try_start_write(struct vm_area_struct *vma)
 +{
 +      int mm_lock_seq;
 +
 +      if (__is_vma_write_locked(vma, &mm_lock_seq))
 +              return true;
 +
 +      if (!down_write_trylock(&vma->vm_lock->lock))
 +              return false;
 +
 +      vma->vm_lock_seq = mm_lock_seq;
 +      up_write(&vma->vm_lock->lock);
 +      return true;
 +}
 +
 +static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 +{
 +      int mm_lock_seq;
 +
 +      VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
 +}
 +
 +static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
 +{
 +      /* When detaching vma should be write-locked */
 +      if (detached)
 +              vma_assert_write_locked(vma);
 +      vma->detached = detached;
 +}
 +
 +struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
 +                                        unsigned long address);
 +
 +#else /* CONFIG_PER_VMA_LOCK */
 +
 +static inline void vma_init_lock(struct vm_area_struct *vma) {}
 +static inline bool vma_start_read(struct vm_area_struct *vma)
 +              { return false; }
 +static inline void vma_end_read(struct vm_area_struct *vma) {}
 +static inline void vma_start_write(struct vm_area_struct *vma) {}
 +static inline bool vma_try_start_write(struct vm_area_struct *vma)
 +              { return true; }
 +static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
 +static inline void vma_mark_detached(struct vm_area_struct *vma,
 +                                   bool detached) {}
 +
 +#endif /* CONFIG_PER_VMA_LOCK */
 +
 +/*
 + * WARNING: vma_init does not initialize vma->vm_lock.
 + * Use vm_area_alloc()/vm_area_free() if vma needs locking.
 + */
  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
  {
        static const struct vm_operations_struct dummy_vm_ops = {};
        vma->vm_mm = mm;
        vma->vm_ops = &dummy_vm_ops;
        INIT_LIST_HEAD(&vma->anon_vma_chain);
 +      vma_mark_detached(vma, false);
 +      vma_numab_state_init(vma);
  }
  
  /* Use when VMA is not part of the VMA tree and needs no locking */
@@@ -776,28 -633,28 +765,28 @@@ static inline void vm_flags_init(struc
  static inline void vm_flags_reset(struct vm_area_struct *vma,
                                  vm_flags_t flags)
  {
 -      mmap_assert_write_locked(vma->vm_mm);
 +      vma_start_write(vma);
        vm_flags_init(vma, flags);
  }
  
  static inline void vm_flags_reset_once(struct vm_area_struct *vma,
                                       vm_flags_t flags)
  {
 -      mmap_assert_write_locked(vma->vm_mm);
 +      vma_start_write(vma);
        WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
  }
  
  static inline void vm_flags_set(struct vm_area_struct *vma,
                                vm_flags_t flags)
  {
 -      mmap_assert_write_locked(vma->vm_mm);
 +      vma_start_write(vma);
        ACCESS_PRIVATE(vma, __vm_flags) |= flags;
  }
  
  static inline void vm_flags_clear(struct vm_area_struct *vma,
                                  vm_flags_t flags)
  {
 -      mmap_assert_write_locked(vma->vm_mm);
 +      vma_start_write(vma);
        ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
  }
  
@@@ -818,7 -675,7 +807,7 @@@ static inline void __vm_flags_mod(struc
  static inline void vm_flags_mod(struct vm_area_struct *vma,
                                vm_flags_t set, vm_flags_t clear)
  {
 -      mmap_assert_write_locked(vma->vm_mm);
 +      vma_start_write(vma);
        __vm_flags_mod(vma, set, clear);
  }
  
@@@ -1686,16 -1543,6 +1675,16 @@@ static inline int xchg_page_access_time
        last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
        return last_time << PAGE_ACCESS_TIME_BUCKETS;
  }
 +
 +static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 +{
 +      unsigned int pid_bit;
 +
 +      pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
 +      if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
 +              __set_bit(pid_bit, &vma->numab_state->access_pids[1]);
 +      }
 +}
  #else /* !CONFIG_NUMA_BALANCING */
  static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
  {
@@@ -1745,10 -1592,6 +1734,10 @@@ static inline bool cpupid_match_pid(str
  {
        return false;
  }
 +
 +static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 +{
 +}
  #endif /* CONFIG_NUMA_BALANCING */
  
  #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
@@@ -2782,6 -2625,12 +2771,6 @@@ static inline bool ptlock_init(struct p
  static inline void ptlock_free(struct page *page) {}
  #endif /* USE_SPLIT_PTE_PTLOCKS */
  
 -static inline void pgtable_init(void)
 -{
 -      ptlock_cache_init();
 -      pgtable_cache_init();
 -}
 -
  static inline bool pgtable_pte_page_ctor(struct page *page)
  {
        if (!ptlock_init(page))
@@@ -2925,6 -2774,7 +2914,6 @@@ extern unsigned long free_reserved_area
                                        int poison, const char *s);
  
  extern void adjust_managed_page_count(struct page *page, long count);
 -extern void mem_init_print_info(void);
  
  extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
  
@@@ -3035,6 -2885,7 +3024,6 @@@ extern void setup_per_cpu_pageset(void)
  extern int min_free_kbytes;
  extern int watermark_boost_factor;
  extern int watermark_scale_factor;
 -extern bool arch_has_descending_max_zone_pfns(void);
  
  /* nommu.c */
  extern atomic_long_t mmap_pages_allocated;
@@@ -3323,6 -3174,8 +3312,6 @@@ vm_fault_t vmf_insert_pfn_prot(struct v
                        unsigned long pfn, pgprot_t pgprot);
  vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn);
 -vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
 -                      pfn_t pfn, pgprot_t pgprot);
  vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn);
  int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
@@@ -3392,6 -3245,7 +3381,6 @@@ extern int apply_to_existing_page_range
                                   unsigned long address, unsigned long size,
                                   pte_fn_t fn, void *data);
  
 -extern void __init init_mem_debugging_and_hardening(void);
  #ifdef CONFIG_PAGE_POISONING
  extern void __kernel_poison_pages(struct page *page, int numpages);
  extern void __kernel_unpoison_pages(struct page *page, int numpages);
@@@ -3560,22 -3414,6 +3549,22 @@@ void vmemmap_populate_print_last(void)
  void vmemmap_free(unsigned long start, unsigned long end,
                struct vmem_altmap *altmap);
  #endif
 +
 +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
 +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
 +                                         struct dev_pagemap *pgmap)
 +{
 +      return is_power_of_2(sizeof(struct page)) &&
 +              pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
 +}
 +#else
 +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
 +                                         struct dev_pagemap *pgmap)
 +{
 +      return false;
 +}
 +#endif
 +
  void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
                                  unsigned long nr_pages);
  
@@@ -3593,6 -3431,8 +3582,6 @@@ int mf_dax_kill_procs(struct address_sp
  extern int memory_failure(unsigned long pfn, int flags);
  extern void memory_failure_queue_kick(int cpu);
  extern int unpoison_memory(unsigned long pfn);
 -extern int sysctl_memory_failure_early_kill;
 -extern int sysctl_memory_failure_recovery;
  extern void shake_page(struct page *p);
  extern atomic_long_t num_poisoned_pages __read_mostly;
  extern int soft_offline_page(unsigned long pfn, int flags);
@@@ -3602,7 -3442,6 +3591,7 @@@ extern int __get_huge_page_for_hwpoison
                                        bool *migratable_cleared);
  void num_poisoned_pages_inc(unsigned long pfn);
  void num_poisoned_pages_sub(unsigned long pfn, long i);
 +struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
  #else
  static inline void memory_failure_queue(unsigned long pfn, int flags)
  {
@@@ -3623,12 -3462,6 +3612,12 @@@ static inline void num_poisoned_pages_s
  }
  #endif
  
 +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM)
 +void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
 +                   struct vm_area_struct *vma, struct list_head *to_kill,
 +                   unsigned long ksm_addr);
 +#endif
 +
  #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
  extern void memblk_nr_poison_inc(unsigned long pfn);
  extern void memblk_nr_poison_sub(unsigned long pfn, long i);
@@@ -3698,12 -3531,14 +3687,12 @@@ extern const struct attribute_group mem
  extern void clear_huge_page(struct page *page,
                            unsigned long addr_hint,
                            unsigned int pages_per_huge_page);
 -extern void copy_user_huge_page(struct page *dst, struct page *src,
 -                              unsigned long addr_hint,
 -                              struct vm_area_struct *vma,
 -                              unsigned int pages_per_huge_page);
 -extern long copy_huge_page_from_user(struct page *dst_page,
 -                              const void __user *usr_src,
 -                              unsigned int pages_per_huge_page,
 -                              bool allow_pagefault);
 +int copy_user_large_folio(struct folio *dst, struct folio *src,
 +                        unsigned long addr_hint,
 +                        struct vm_area_struct *vma);
 +long copy_folio_from_user(struct folio *dst_folio,
 +                         const void __user *usr_src,
 +                         bool allow_pagefault);
  
  /**
   * vma_is_special_huge - Are transhuge page-table entries considered special?
diff --combined include/linux/sched/mm.h
@@@ -79,34 -79,6 +79,34 @@@ static inline void mmdrop_sched(struct 
  }
  #endif
  
 +/* Helpers for lazy TLB mm refcounting */
 +static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
 +{
 +      if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
 +              mmgrab(mm);
 +}
 +
 +static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
 +{
 +      if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
 +              mmdrop(mm);
 +      } else {
 +              /*
 +               * mmdrop_lazy_tlb must provide a full memory barrier, see the
 +               * membarrier comment in finish_task_switch() which relies on this.
 +               */
 +              smp_mb();
 +      }
 +}
 +
 +static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
 +{
 +      if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
 +              mmdrop_sched(mm);
 +      else
 +              smp_mb(); /* see mmdrop_lazy_tlb() above */
 +}
 +
  /**
   * mmget() - Pin the address space associated with a &struct mm_struct.
   * @mm: The address space to pin.
@@@ -485,6 -457,11 +485,11 @@@ static inline void mm_pasid_init(struc
        mm->pasid = INVALID_IOASID;
  }
  
+ static inline bool mm_valid_pasid(struct mm_struct *mm)
+ {
+       return mm->pasid != INVALID_IOASID;
+ }
  /* Associate a PASID with an mm_struct: */
  static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid)
  {
  
  static inline void mm_pasid_drop(struct mm_struct *mm)
  {
-       if (pasid_valid(mm->pasid)) {
+       if (mm_valid_pasid(mm)) {
                ioasid_free(mm->pasid);
                mm->pasid = INVALID_IOASID;
        }
  }
  #else
  static inline void mm_pasid_init(struct mm_struct *mm) {}
+ static inline bool mm_valid_pasid(struct mm_struct *mm) { return false; }
  static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) {}
  static inline void mm_pasid_drop(struct mm_struct *mm) {}
  #endif
diff --combined mm/gup.c
@@@ -1085,7 -1085,7 +1085,7 @@@ static long __get_user_pages(struct mm_
        if (!nr_pages)
                return 0;
  
-       start = untagged_addr(start);
+       start = untagged_addr_remote(mm, start);
  
        VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
  
@@@ -1259,7 -1259,7 +1259,7 @@@ int fixup_user_fault(struct mm_struct *
        struct vm_area_struct *vma;
        vm_fault_t ret;
  
-       address = untagged_addr(address);
+       address = untagged_addr_remote(mm, address);
  
        if (unlocked)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@@ -2193,7 -2193,7 +2193,7 @@@ static bool is_valid_gup_args(struct pa
   * This does not guarantee that the page exists in the user mappings when
   * get_user_pages_remote returns, and there may even be a completely different
   * page there in some cases (eg. if mmapped pagecache has been invalidated
 - * and subsequently re faulted). However it does guarantee that the page
 + * and subsequently re-faulted). However it does guarantee that the page
   * won't be freed completely. And mostly callers simply care that the page
   * contains data that was valid *at some point in time*. Typically, an IO
   * or similar operation cannot guarantee anything stronger anyway because
diff --combined mm/madvise.c
@@@ -852,9 -852,21 +852,9 @@@ static long madvise_dontneed_free(struc
                *prev = NULL; /* mmap_lock has been dropped, prev is stale */
  
                mmap_read_lock(mm);
 -              vma = find_vma(mm, start);
 +              vma = vma_lookup(mm, start);
                if (!vma)
                        return -ENOMEM;
 -              if (start < vma->vm_start) {
 -                      /*
 -                       * This "vma" under revalidation is the one
 -                       * with the lowest vma->vm_start where start
 -                       * is also < vma->vm_end. If start <
 -                       * vma->vm_start it means an hole materialized
 -                       * in the user address space within the
 -                       * virtual range passed to MADV_DONTNEED
 -                       * or MADV_FREE.
 -                       */
 -                      return -ENOMEM;
 -              }
                /*
                 * Potential end adjustment for hugetlb vma is OK as
                 * the check below keeps end within vma.
@@@ -1390,8 -1402,6 +1390,6 @@@ int do_madvise(struct mm_struct *mm, un
        size_t len;
        struct blk_plug plug;
  
-       start = untagged_addr(start);
        if (!madvise_behavior_valid(behavior))
                return -EINVAL;
  
                mmap_read_lock(mm);
        }
  
+       start = untagged_addr_remote(mm, start);
+       end = start + len;
        blk_start_plug(&plug);
        error = madvise_walk_vmas(mm, start, end, behavior,
                        madvise_vma_behavior);
@@@ -1444,7 -1457,7 +1445,7 @@@ SYSCALL_DEFINE5(process_madvise, int, p
                size_t, vlen, int, behavior, unsigned int, flags)
  {
        ssize_t ret;
 -      struct iovec iovstack[UIO_FASTIOV], iovec;
 +      struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        struct task_struct *task;
        total_len = iov_iter_count(&iter);
  
        while (iov_iter_count(&iter)) {
 -              iovec = iov_iter_iovec(&iter);
 -              ret = do_madvise(mm, (unsigned long)iovec.iov_base,
 -                                      iovec.iov_len, behavior);
 +              ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
 +                                      iter_iov_len(&iter), behavior);
                if (ret < 0)
                        break;
 -              iov_iter_advance(&iter, iovec.iov_len);
 +              iov_iter_advance(&iter, iter_iov_len(&iter));
        }
  
        ret = (total_len - iov_iter_count(&iter)) ? : ret;
diff --combined mm/migrate.c
@@@ -213,15 -213,20 +213,15 @@@ static bool remove_migration_pte(struc
                if (pte_swp_soft_dirty(*pvmw.pte))
                        pte = pte_mksoft_dirty(pte);
  
 -              /*
 -               * Recheck VMA as permissions can change since migration started
 -               */
                entry = pte_to_swp_entry(*pvmw.pte);
                if (!is_migration_entry_young(entry))
                        pte = pte_mkold(pte);
                if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
                        pte = pte_mkdirty(pte);
                if (is_writable_migration_entry(entry))
 -                      pte = maybe_mkwrite(pte, vma);
 +                      pte = pte_mkwrite(pte);
                else if (pte_swp_uffd_wp(*pvmw.pte))
                        pte = pte_mkuffd_wp(pte);
 -              else
 -                      pte = pte_wrprotect(pte);
  
                if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
                        rmap_flags |= RMAP_EXCLUSIVE;
                if (folio_test_hugetlb(folio)) {
                        unsigned int shift = huge_page_shift(hstate_vma(vma));
  
 -                      pte = pte_mkhuge(pte);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
                        if (folio_test_anon(folio))
                                hugepage_add_anon_rmap(new, vma, pvmw.address,
@@@ -1106,8 -1112,9 +1106,8 @@@ static void migrate_folio_done(struct f
  /* Obtain the lock on page, remove all ptes. */
  static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
                               unsigned long private, struct folio *src,
 -                             struct folio **dstp, int force, bool avoid_force_lock,
 -                             enum migrate_mode mode, enum migrate_reason reason,
 -                             struct list_head *ret)
 +                             struct folio **dstp, enum migrate_mode mode,
 +                             enum migrate_reason reason, struct list_head *ret)
  {
        struct folio *dst;
        int rc = -EAGAIN;
        dst->private = NULL;
  
        if (!folio_trylock(src)) {
 -              if (!force || mode == MIGRATE_ASYNC)
 +              if (mode == MIGRATE_ASYNC)
                        goto out;
  
                /*
                if (current->flags & PF_MEMALLOC)
                        goto out;
  
 -              /*
 -               * We have locked some folios and are going to wait to lock
 -               * this folio.  To avoid a potential deadlock, let's bail
 -               * out and not do that. The locked folios will be moved and
 -               * unlocked, then we can wait to lock this folio.
 -               */
 -              if (avoid_force_lock) {
 -                      rc = -EDEADLOCK;
 -                      goto out;
 -              }
 -
                folio_lock(src);
        }
        locked = true;
                        rc = -EBUSY;
                        goto out;
                }
 -              if (!force)
 -                      goto out;
                folio_wait_writeback(src);
        }
  
                /* Establish migration ptes */
                VM_BUG_ON_FOLIO(folio_test_anon(src) &&
                               !folio_test_ksm(src) && !anon_vma, src);
 -              try_to_migrate(src, TTU_BATCH_FLUSH);
 +              try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
                page_was_mapped = 1;
        }
  
@@@ -1247,7 -1267,7 +1247,7 @@@ out
         * A folio that has not been unmapped will be restored to
         * right list unless we want to retry.
         */
 -      if (rc == -EAGAIN || rc == -EDEADLOCK)
 +      if (rc == -EAGAIN)
                ret = NULL;
  
        migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret);
@@@ -1488,9 -1508,6 +1488,9 @@@ static inline int try_split_folio(struc
  #define NR_MAX_BATCHED_MIGRATION      512
  #endif
  #define NR_MAX_MIGRATE_PAGES_RETRY    10
 +#define NR_MAX_MIGRATE_ASYNC_RETRY    3
 +#define NR_MAX_MIGRATE_SYNC_RETRY                                     \
 +      (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY)
  
  struct migrate_pages_stats {
        int nr_succeeded;       /* Normal and large folios migrated successfully, in
@@@ -1601,19 -1618,13 +1601,19 @@@ static int migrate_hugetlbs(struct list
  /*
   * migrate_pages_batch() first unmaps folios in the from list as many as
   * possible, then move the unmapped folios.
 + *
 + * We only batch migration if mode == MIGRATE_ASYNC, to avoid waiting on a
 + * lock or bit while we have locked more than one folio, which may cause
 + * deadlock (e.g., for loop device).  So, if mode != MIGRATE_ASYNC, the
 + * length of the from list must be <= 1.
   */
  static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
                free_page_t put_new_page, unsigned long private,
                enum migrate_mode mode, int reason, struct list_head *ret_folios,
 -              struct migrate_pages_stats *stats)
 +              struct list_head *split_folios, struct migrate_pages_stats *stats,
 +              int nr_pass)
  {
 -      int retry;
 +      int retry = 1;
        int large_retry = 1;
        int thp_retry = 1;
        int nr_failed = 0;
        bool is_large = false;
        bool is_thp = false;
        struct folio *folio, *folio2, *dst = NULL, *dst2;
 -      int rc, rc_saved, nr_pages;
 -      LIST_HEAD(split_folios);
 +      int rc, rc_saved = 0, nr_pages;
        LIST_HEAD(unmap_folios);
        LIST_HEAD(dst_folios);
        bool nosplit = (reason == MR_NUMA_MISPLACED);
 -      bool no_split_folio_counting = false;
 -      bool avoid_force_lock;
  
 -retry:
 -      rc_saved = 0;
 -      avoid_force_lock = false;
 -      retry = 1;
 -      for (pass = 0;
 -           pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
 -           pass++) {
 +      VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
 +                      !list_empty(from) && !list_is_singular(from));
 +
 +      for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
                retry = 0;
                large_retry = 0;
                thp_retry = 0;
                        if (!thp_migration_supported() && is_thp) {
                                nr_large_failed++;
                                stats->nr_thp_failed++;
 -                              if (!try_split_folio(folio, &split_folios)) {
 +                              if (!try_split_folio(folio, split_folios)) {
                                        stats->nr_thp_split++;
                                        continue;
                                }
                        }
  
                        rc = migrate_folio_unmap(get_new_page, put_new_page, private,
 -                                               folio, &dst, pass > 2, avoid_force_lock,
 -                                               mode, reason, ret_folios);
 +                                               folio, &dst, mode, reason, ret_folios);
                        /*
                         * The rules are:
                         *      Success: folio will be freed
                         *      Unmap: folio will be put on unmap_folios list,
                         *             dst folio put on dst_folios list
                         *      -EAGAIN: stay on the from list
 -                       *      -EDEADLOCK: stay on the from list
                         *      -ENOMEM: stay on the from list
                         *      Other errno: put on ret_folios list
                         */
                                        stats->nr_thp_failed += is_thp;
                                        /* Large folio NUMA faulting doesn't split to retry. */
                                        if (!nosplit) {
 -                                              int ret = try_split_folio(folio, &split_folios);
 +                                              int ret = try_split_folio(folio, split_folios);
  
                                                if (!ret) {
                                                        stats->nr_thp_split += is_thp;
                                                        large_retry++;
                                                        thp_retry += is_thp;
                                                        nr_retry_pages += nr_pages;
 +                                                      /* Undo duplicated failure counting. */
 +                                                      nr_large_failed--;
 +                                                      stats->nr_thp_failed -= is_thp;
                                                        break;
                                                }
                                        }
 -                              } else if (!no_split_folio_counting) {
 +                              } else {
                                        nr_failed++;
                                }
  
                                stats->nr_failed_pages += nr_pages + nr_retry_pages;
 -                              /*
 -                               * There might be some split folios of fail-to-migrate large
 -                               * folios left in split_folios list. Move them to ret_folios
 -                               * list so that they could be put back to the right list by
 -                               * the caller otherwise the folio refcnt will be leaked.
 -                               */
 -                              list_splice_init(&split_folios, ret_folios);
                                /* nr_failed isn't updated for not used */
                                nr_large_failed += large_retry;
                                stats->nr_thp_failed += thp_retry;
                                        goto out;
                                else
                                        goto move;
 -                      case -EDEADLOCK:
 -                              /*
 -                               * The folio cannot be locked for potential deadlock.
 -                               * Go move (and unlock) all locked folios.  Then we can
 -                               * try again.
 -                               */
 -                              rc_saved = rc;
 -                              goto move;
                        case -EAGAIN:
                                if (is_large) {
                                        large_retry++;
                                        thp_retry += is_thp;
 -                              } else if (!no_split_folio_counting) {
 +                              } else {
                                        retry++;
                                }
                                nr_retry_pages += nr_pages;
                                stats->nr_thp_succeeded += is_thp;
                                break;
                        case MIGRATEPAGE_UNMAP:
 -                              /*
 -                               * We have locked some folios, don't force lock
 -                               * to avoid deadlock.
 -                               */
 -                              avoid_force_lock = true;
                                list_move_tail(&folio->lru, &unmap_folios);
                                list_add_tail(&dst->lru, &dst_folios);
                                break;
                                if (is_large) {
                                        nr_large_failed++;
                                        stats->nr_thp_failed += is_thp;
 -                              } else if (!no_split_folio_counting) {
 +                              } else {
                                        nr_failed++;
                                }
  
@@@ -1771,7 -1807,9 +1771,7 @@@ move
        try_to_unmap_flush();
  
        retry = 1;
 -      for (pass = 0;
 -           pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
 -           pass++) {
 +      for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
                retry = 0;
                large_retry = 0;
                thp_retry = 0;
                                if (is_large) {
                                        large_retry++;
                                        thp_retry += is_thp;
 -                              } else if (!no_split_folio_counting) {
 +                              } else {
                                        retry++;
                                }
                                nr_retry_pages += nr_pages;
                                if (is_large) {
                                        nr_large_failed++;
                                        stats->nr_thp_failed += is_thp;
 -                              } else if (!no_split_folio_counting) {
 +                              } else {
                                        nr_failed++;
                                }
  
                dst2 = list_next_entry(dst, lru);
        }
  
 -      /*
 -       * Try to migrate split folios of fail-to-migrate large folios, no
 -       * nr_failed counting in this round, since all split folios of a
 -       * large folio is counted as 1 failure in the first round.
 -       */
 -      if (rc >= 0 && !list_empty(&split_folios)) {
 -              /*
 -               * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY
 -               * retries) to ret_folios to avoid migrating them again.
 -               */
 -              list_splice_init(from, ret_folios);
 -              list_splice_init(&split_folios, from);
 -              no_split_folio_counting = true;
 -              goto retry;
 -      }
 +      return rc;
 +}
  
 +static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
 +              free_page_t put_new_page, unsigned long private,
 +              enum migrate_mode mode, int reason, struct list_head *ret_folios,
 +              struct list_head *split_folios, struct migrate_pages_stats *stats)
 +{
 +      int rc, nr_failed = 0;
 +      LIST_HEAD(folios);
 +      struct migrate_pages_stats astats;
 +
 +      memset(&astats, 0, sizeof(astats));
 +      /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */
 +      rc = migrate_pages_batch(from, get_new_page, put_new_page, private, MIGRATE_ASYNC,
 +                               reason, &folios, split_folios, &astats,
 +                               NR_MAX_MIGRATE_ASYNC_RETRY);
 +      stats->nr_succeeded += astats.nr_succeeded;
 +      stats->nr_thp_succeeded += astats.nr_thp_succeeded;
 +      stats->nr_thp_split += astats.nr_thp_split;
 +      if (rc < 0) {
 +              stats->nr_failed_pages += astats.nr_failed_pages;
 +              stats->nr_thp_failed += astats.nr_thp_failed;
 +              list_splice_tail(&folios, ret_folios);
 +              return rc;
 +      }
 +      stats->nr_thp_failed += astats.nr_thp_split;
 +      nr_failed += astats.nr_thp_split;
        /*
 -       * We have unlocked all locked folios, so we can force lock now, let's
 -       * try again.
 +       * Fall back to migrate all failed folios one by one synchronously. All
 +       * failed folios except split THPs will be retried, so their failure
 +       * isn't counted
         */
 -      if (rc == -EDEADLOCK)
 -              goto retry;
 +      list_splice_tail_init(&folios, from);
 +      while (!list_empty(from)) {
 +              list_move(from->next, &folios);
 +              rc = migrate_pages_batch(&folios, get_new_page, put_new_page,
 +                                       private, mode, reason, ret_folios,
 +                                       split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
 +              list_splice_tail_init(&folios, ret_folios);
 +              if (rc < 0)
 +                      return rc;
 +              nr_failed += rc;
 +      }
  
 -      return rc;
 +      return nr_failed;
  }
  
  /*
@@@ -1933,7 -1949,6 +1933,7 @@@ int migrate_pages(struct list_head *fro
        struct folio *folio, *folio2;
        LIST_HEAD(folios);
        LIST_HEAD(ret_folios);
 +      LIST_HEAD(split_folios);
        struct migrate_pages_stats stats;
  
        trace_mm_migrate_pages_start(mode, reason);
                                     mode, reason, &stats, &ret_folios);
        if (rc_gather < 0)
                goto out;
 +
  again:
        nr_pages = 0;
        list_for_each_entry_safe(folio, folio2, from, lru) {
                }
  
                nr_pages += folio_nr_pages(folio);
 -              if (nr_pages > NR_MAX_BATCHED_MIGRATION)
 +              if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
                        break;
        }
 -      if (nr_pages > NR_MAX_BATCHED_MIGRATION)
 -              list_cut_before(&folios, from, &folio->lru);
 +      if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
 +              list_cut_before(&folios, from, &folio2->lru);
        else
                list_splice_init(from, &folios);
 -      rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private,
 -                               mode, reason, &ret_folios, &stats);
 +      if (mode == MIGRATE_ASYNC)
 +              rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private,
 +                                       mode, reason, &ret_folios, &split_folios, &stats,
 +                                       NR_MAX_MIGRATE_PAGES_RETRY);
 +      else
 +              rc = migrate_pages_sync(&folios, get_new_page, put_new_page, private,
 +                                      mode, reason, &ret_folios, &split_folios, &stats);
        list_splice_tail_init(&folios, &ret_folios);
        if (rc < 0) {
                rc_gather = rc;
 +              list_splice_tail(&split_folios, &ret_folios);
                goto out;
        }
 +      if (!list_empty(&split_folios)) {
 +              /*
 +               * Failure isn't counted since all split folios of a large folio
 +               * are counted as 1 failure already.  And, we only try to migrate
 +               * with minimal effort, force MIGRATE_ASYNC mode and retry once.
 +               */
 +              migrate_pages_batch(&split_folios, get_new_page, put_new_page, private,
 +                                  MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1);
 +              list_splice_tail_init(&split_folios, &ret_folios);
 +      }
        rc_gather += rc;
        if (!list_empty(from))
                goto again;
@@@ -2099,15 -2097,18 +2099,18 @@@ static int do_move_pages_to_node(struc
   *         target node
   *     1 - when it has been queued
   */
- static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
+ static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
                int node, struct list_head *pagelist, bool migrate_all)
  {
        struct vm_area_struct *vma;
+       unsigned long addr;
        struct page *page;
        int err;
        bool isolated;
  
        mmap_read_lock(mm);
+       addr = (unsigned long)untagged_addr_remote(mm, p);
        err = -EFAULT;
        vma = vma_lookup(mm, addr);
        if (!vma || !vma_migratable(vma))
@@@ -2213,7 -2214,6 +2216,6 @@@ static int do_pages_move(struct mm_stru
  
        for (i = start = 0; i < nr_pages; i++) {
                const void __user *p;
-               unsigned long addr;
                int node;
  
                err = -EFAULT;
                        goto out_flush;
                if (get_user(node, nodes + i))
                        goto out_flush;
-               addr = (unsigned long)untagged_addr(p);
  
                err = -ENODEV;
                if (node < 0 || node >= MAX_NUMNODES)
                 * Errors in the page lookup or isolation are not fatal and we simply
                 * report them via status
                 */
-               err = add_page_for_migration(mm, addr, current_node,
-                               &pagelist, flags & MPOL_MF_MOVE_ALL);
+               err = add_page_for_migration(mm, p, current_node, &pagelist,
+                                            flags & MPOL_MF_MOVE_ALL);
  
                if (err > 0) {
                        /* The page is successfully queued for migration */