sync mm-stable with mm-hotfixes-stable to pick up depended-upon upstream changes

author Andrew Morton <akpm@linux-foundation.org>

Tue, 18 Apr 2023 21:53:49 +0000 (14:53 -0700)

committer Andrew Morton <akpm@linux-foundation.org>

Tue, 18 Apr 2023 21:53:49 +0000 (14:53 -0700)
author Andrew Morton <akpm@linux-foundation.org>
Tue, 18 Apr 2023 21:53:49 +0000 (14:53 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Tue, 18 Apr 2023 21:53:49 +0000 (14:53 -0700)
diff --combined kernel/fork.c

index 9051bc0,ea33231..639228b
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -451,49 -451,13 +451,49 @@@ static struct kmem_cache *vm_area_cache
   /* SLAB cache for mm_struct structures (tsk->mm) */
   static struct kmem_cache *mm_cachep;
   
+ +#ifdef CONFIG_PER_VMA_LOCK
+ +
+ +/* SLAB cache for vm_area_struct.lock */
+ +static struct kmem_cache *vma_lock_cachep;
+ +
+ +static bool vma_lock_alloc(struct vm_area_struct *vma)
+ +{
+ +      vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+ +      if (!vma->vm_lock)
+ +              return false;
+ +
+ +      init_rwsem(&vma->vm_lock->lock);
+ +      vma->vm_lock_seq = -1;
+ +
+ +      return true;
+ +}
+ +
+ +static inline void vma_lock_free(struct vm_area_struct *vma)
+ +{
+ +      kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+ +}
+ +
+ +#else /* CONFIG_PER_VMA_LOCK */
+ +
+ +static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+ +static inline void vma_lock_free(struct vm_area_struct *vma) {}
+ +
+ +#endif /* CONFIG_PER_VMA_LOCK */
+ +
   struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
   {
         struct vm_area_struct *vma;
   
         vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- -      if (vma)
- -              vma_init(vma, mm);
+ +      if (!vma)
+ +              return NULL;
+ +
+ +      vma_init(vma, mm);
+ +      if (!vma_lock_alloc(vma)) {
+ +              kmem_cache_free(vm_area_cachep, vma);
+ +              return NULL;
+ +      }
+ +
         return vma;
   }
   
@@@ -501,56 -465,26 +501,56 @@@ struct vm_area_struct *vm_area_dup(stru
   {
         struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
   
- -      if (new) {
- -              ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- -              ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- -              /*
- -               * orig->shared.rb may be modified concurrently, but the clone
- -               * will be reinitialized.
- -               */
- -              data_race(memcpy(new, orig, sizeof(*new)));
- -              INIT_LIST_HEAD(&new->anon_vma_chain);
- -              dup_anon_vma_name(orig, new);
+ +      if (!new)
+ +              return NULL;
+ +
+ +      ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ +      ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ +      /*
+ +       * orig->shared.rb may be modified concurrently, but the clone
+ +       * will be reinitialized.
+ +       */
+ +      data_race(memcpy(new, orig, sizeof(*new)));
+ +      if (!vma_lock_alloc(new)) {
+ +              kmem_cache_free(vm_area_cachep, new);
+ +              return NULL;
         }
+ +      INIT_LIST_HEAD(&new->anon_vma_chain);
+ +      vma_numab_state_init(new);
+ +      dup_anon_vma_name(orig, new);
+ +
         return new;
   }
   
- -void vm_area_free(struct vm_area_struct *vma)
+ +void __vm_area_free(struct vm_area_struct *vma)
   {
+ +      vma_numab_state_free(vma);
         free_anon_vma_name(vma);
+ +      vma_lock_free(vma);
         kmem_cache_free(vm_area_cachep, vma);
   }
   
+ +#ifdef CONFIG_PER_VMA_LOCK
+ +static void vm_area_free_rcu_cb(struct rcu_head *head)
+ +{
+ +      struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+ +                                                vm_rcu);
+ +
+ +      /* The vma should not be locked while being destroyed. */
+ +      VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
+ +      __vm_area_free(vma);
+ +}
+ +#endif
+ +
+ +void vm_area_free(struct vm_area_struct *vma)
+ +{
+ +#ifdef CONFIG_PER_VMA_LOCK
+ +      call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+ +#else
+ +      __vm_area_free(vma);
+ +#endif
+ +}
+ +
   static void account_kernel_stack(struct task_struct *tsk, int account)
   {
         if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@@ -841,67 -775,6 +841,67 @@@ static void check_mm(struct mm_struct *
   #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
   #define free_mm(mm)   (kmem_cache_free(mm_cachep, (mm)))
   
+ +static void do_check_lazy_tlb(void *arg)
+ +{
+ +      struct mm_struct *mm = arg;
+ +
+ +      WARN_ON_ONCE(current->active_mm == mm);
+ +}
+ +
+ +static void do_shoot_lazy_tlb(void *arg)
+ +{
+ +      struct mm_struct *mm = arg;
+ +
+ +      if (current->active_mm == mm) {
+ +              WARN_ON_ONCE(current->mm);
+ +              current->active_mm = &init_mm;
+ +              switch_mm(mm, &init_mm, current);
+ +      }
+ +}
+ +
+ +static void cleanup_lazy_tlbs(struct mm_struct *mm)
+ +{
+ +      if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ +              /*
+ +               * In this case, lazy tlb mms are refcounted and would not reach
+ +               * __mmdrop until all CPUs have switched away and mmdrop()ed.
+ +               */
+ +              return;
+ +      }
+ +
+ +      /*
+ +       * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
+ +       * requires lazy mm users to switch to another mm when the refcount
+ +       * drops to zero, before the mm is freed. This requires IPIs here to
+ +       * switch kernel threads to init_mm.
+ +       *
+ +       * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
+ +       * switch with the final userspace teardown TLB flush which leaves the
+ +       * mm lazy on this CPU but no others, reducing the need for additional
+ +       * IPIs here. There are cases where a final IPI is still required here,
+ +       * such as the final mmdrop being performed on a different CPU than the
+ +       * one exiting, or kernel threads using the mm when userspace exits.
+ +       *
+ +       * IPI overheads have not been found to be expensive, but they could be
+ +       * reduced in a number of possible ways, for example (roughly
+ +       * increasing order of complexity):
+ +       * - The last lazy reference created by exit_mm() could instead switch
+ +       *   to init_mm, however it's probable this will run on the same CPU
+ +       *   immediately afterwards, so this may not reduce IPIs much.
+ +       * - A batch of mms requiring IPIs could be gathered and freed at once.
+ +       * - CPUs store active_mm where it can be remotely checked without a
+ +       *   lock, to filter out false-positives in the cpumask.
+ +       * - After mm_users or mm_count reaches zero, switching away from the
+ +       *   mm could clear mm_cpumask to reduce some IPIs, perhaps together
+ +       *   with some batching or delaying of the final IPIs.
+ +       * - A delayed freeing and RCU-like quiescing sequence based on mm
+ +       *   switching to avoid IPIs completely.
+ +       */
+ +      on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ +      if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
+ +              on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+ +}
+ +
   /*
    * Called when the last reference to the mm
    * is dropped: either by a lazy thread or by
@@@ -913,10 -786,6 +913,10 @@@ void __mmdrop(struct mm_struct *mm
   
         BUG_ON(mm == &init_mm);
         WARN_ON_ONCE(mm == current->mm);
+ +
+ +      /* Ensure no CPUs are using this as their lazy tlb mm */
+ +      cleanup_lazy_tlbs(mm);
+ +
         WARN_ON_ONCE(mm == current->active_mm);
         mm_free_pgd(mm);
         destroy_context(mm);
@@@ -1259,9 -1128,6 +1259,9 @@@ static struct mm_struct *mm_init(struc
         seqcount_init(&mm->write_protect_seq);
         mmap_init_lock(mm);
         INIT_LIST_HEAD(&mm->mmlist);
+ +#ifdef CONFIG_PER_VMA_LOCK
+ +      mm->mm_lock_seq = 0;
+ +#endif
         mm_pgtables_bytes_init(mm);
         mm->map_count = 0;
         mm->locked_vm = 0;
@@@ -1308,6 -1174,7 +1308,7 @@@
   fail_pcpu:
         while (i > 0)
                 percpu_counter_destroy(&mm->rss_stat[--i]);
+       destroy_context(mm);
   fail_nocontext:
         mm_free_pgd(mm);
   fail_nopgd:
@@@ -3199,9 -3066,6 +3200,9 @@@ void __init proc_caches_init(void
                         NULL);
   
         vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+ +#ifdef CONFIG_PER_VMA_LOCK
+ +      vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+ +#endif
         mmap_init();
         nsproxy_cache_init();
   }
diff --combined lib/maple_tree.c

index 5577e6d,1281a40..4a6ecdb
--- 1/lib/maple_tree.c
--- 2/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@@ -4965,7 -4965,8 +4965,8 @@@ not_found
    * Return: True if found in a leaf, false otherwise.
    *
    */
- static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
+ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size,
+               unsigned long *gap_min, unsigned long *gap_max)
   {
         enum maple_type type = mte_node_type(mas->node);
         struct maple_node *node = mas_mn(mas);
@@@ -5030,8 -5031,8 +5031,8 @@@
   
         if (unlikely(ma_is_leaf(type))) {
                 mas->offset = offset;
-               mas->min = min;
-               mas->max = min + gap - 1;
+               *gap_min = min;
+               *gap_max = min + gap - 1;
                 return true;
         }
   
@@@ -5055,10 -5056,10 +5056,10 @@@ static inline bool mas_anode_descend(st
   {
         enum maple_type type = mte_node_type(mas->node);
         unsigned long pivot, min, gap = 0;
-       unsigned char offset;
-       unsigned long *gaps;
-       unsigned long *pivots = ma_pivots(mas_mn(mas), type);
-       void __rcu **slots = ma_slots(mas_mn(mas), type);
+       unsigned char offset, data_end;
+       unsigned long *gaps, *pivots;
+       void __rcu **slots;
+       struct maple_node *node;
         bool found = false;
   
         if (ma_is_dense(type)) {
@@@ -5066,13 -5067,15 +5067,15 @@@
                 return true;
         }
   
-       gaps = ma_gaps(mte_to_node(mas->node), type);
+       node = mas_mn(mas);
+       pivots = ma_pivots(node, type);
+       slots = ma_slots(node, type);
+       gaps = ma_gaps(node, type);
         offset = mas->offset;
         min = mas_safe_min(mas, pivots, offset);
-       for (; offset < mt_slots[type]; offset++) {
-               pivot = mas_safe_pivot(mas, pivots, offset, type);
-               if (offset && !pivot)
-                       break;
+       data_end = ma_data_end(node, type, pivots, mas->max);
+       for (; offset <= data_end; offset++) {
+               pivot = mas_logical_pivot(mas, pivots, offset, type);
   
                 /* Not within lower bounds */
                 if (mas->index > pivot)
@@@ -5307,6 -5310,9 +5310,9 @@@ int mas_empty_area(struct ma_state *mas
         unsigned long *pivots;
         enum maple_type mt;
   
+       if (min >= max)
+               return -EINVAL;
+ 
         if (mas_is_start(mas))
                 mas_start(mas);
         else if (mas->offset >= 2)
@@@ -5361,6 -5367,9 +5367,9 @@@ int mas_empty_area_rev(struct ma_state 
   {
         struct maple_enode *last = mas->node;
   
+       if (min >= max)
+               return -EINVAL;
+ 
         if (mas_is_start(mas)) {
                 mas_start(mas);
                 mas->offset = mas_data_end(mas);
@@@ -5380,7 -5389,7 +5389,7 @@@
         mas->index = min;
         mas->last = max;
   
-       while (!mas_rev_awalk(mas, size)) {
+       while (!mas_rev_awalk(mas, size, &min, &max)) {
                 if (last == mas->node) {
                         if (!mas_rewind_node(mas))
                                 return -EBUSY;
@@@ -5395,17 -5404,9 +5404,9 @@@
         if (unlikely(mas->offset == MAPLE_NODE_SLOTS))
                 return -EBUSY;
   
-       /*
-        * mas_rev_awalk() has set mas->min and mas->max to the gap values.  If
-        * the maximum is outside the window we are searching, then use the last
-        * location in the search.
-        * mas->max and mas->min is the range of the gap.
-        * mas->index and mas->last are currently set to the search range.
-        */
- 
         /* Trim the upper limit to the max. */
-       if (mas->max <= mas->last)
-               mas->last = mas->max;
+       if (max <= mas->last)
+               mas->last = max;
   
         mas->index = mas->last - size + 1;
         return 0;
@@@ -5814,7 -5815,6 +5815,7 @@@ int mas_preallocate(struct ma_state *ma
         mas_reset(mas);
         return ret;
   }
+ +EXPORT_SYMBOL_GPL(mas_preallocate);
   
   /*
    * mas_destroy() - destroy a maple state.
diff --combined mm/mmap.c

index 51cd747,d5475fb..790cc62
--- 1/mm/mmap.c
--- 2/mm/mmap.c
+++ b/mm/mmap.c
@@@ -133,7 -133,7 +133,7 @@@ void unlink_file_vma(struct vm_area_str
   /*
    * Close a vm structure and free it.
    */
- -static void remove_vma(struct vm_area_struct *vma)
+ +static void remove_vma(struct vm_area_struct *vma, bool unreachable)
   {
         might_sleep();
         if (vma->vm_ops && vma->vm_ops->close)
@@@ -141,10 -141,7 +141,10 @@@
         if (vma->vm_file)
                 fput(vma->vm_file);
         mpol_put(vma_policy(vma));
- -      vm_area_free(vma);
+ +      if (unreachable)
+ +              __vm_area_free(vma);
+ +      else
+ +              vm_area_free(vma);
   }
   
   static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
@@@ -505,15 -502,6 +505,15 @@@ static inline void init_vma_prep(struc
    */
   static inline void vma_prepare(struct vma_prepare *vp)
   {
+ +      vma_start_write(vp->vma);
+ +      if (vp->adj_next)
+ +              vma_start_write(vp->adj_next);
+ +      /* vp->insert is always a newly created VMA, no need for locking */
+ +      if (vp->remove)
+ +              vma_start_write(vp->remove);
+ +      if (vp->remove2)
+ +              vma_start_write(vp->remove2);
+ +
         if (vp->file) {
                 uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
   
@@@ -602,7 -590,6 +602,7 @@@ static inline void vma_complete(struct 
   
         if (vp->remove) {
   again:
+ +              vma_mark_detached(vp->remove, true);
                 if (vp->file) {
                         uprobe_munmap(vp->remove, vp->remove->vm_start,
                                       vp->remove->vm_end);
@@@ -618,7 -605,7 +618,7 @@@
   
                 /*
                  * In mprotect's case 6 (see comments on vma_merge),
- -               * we must remove the one after next as well.
+ +               * we are removing both curr and next vmas
                  */
                 if (vp->remove2) {
                         vp->remove = vp->remove2;
@@@ -696,12 -683,12 +696,12 @@@ int vma_expand(struct vma_iterator *vmi
         if (vma_iter_prealloc(vmi))
                 goto nomem;
   
+ +      vma_prepare(&vp);
         vma_adjust_trans_huge(vma, start, end, 0);
         /* VMA iterator points to previous, so set to start if necessary */
         if (vma_iter_addr(vmi) != start)
                 vma_iter_set(vmi, start);
   
- -      vma_prepare(&vp);
         vma->vm_start = start;
         vma->vm_end = end;
         vma->vm_pgoff = pgoff;
@@@ -736,8 -723,8 +736,8 @@@ int vma_shrink(struct vma_iterator *vmi
                 return -ENOMEM;
   
         init_vma_prep(&vp, vma);
- -      vma_adjust_trans_huge(vma, start, end, 0);
         vma_prepare(&vp);
+ +      vma_adjust_trans_huge(vma, start, end, 0);
   
         if (vma->vm_start < start)
                 vma_iter_clear(vmi, vma->vm_start, start);
@@@ -755,13 -742,12 +755,13 @@@
   
   /*
    * If the vma has a ->close operation then the driver probably needs to release
- - * per-vma resources, so we don't attempt to merge those.
+ + * per-vma resources, so we don't attempt to merge those if the caller indicates
+ + * the current vma may be removed as part of the merge.
    */
- -static inline int is_mergeable_vma(struct vm_area_struct *vma,
- -                                 struct file *file, unsigned long vm_flags,
- -                                 struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- -                                 struct anon_vma_name *anon_name)
+ +static inline bool is_mergeable_vma(struct vm_area_struct *vma,
+ +              struct file *file, unsigned long vm_flags,
+ +              struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ +              struct anon_vma_name *anon_name, bool may_remove_vma)
   {
         /*
          * VM_SOFTDIRTY should not prevent from VMA merging, if we
@@@ -772,20 -758,21 +772,20 @@@
          * extended instead.
          */
         if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
- -              return 0;
+ +              return false;
         if (vma->vm_file != file)
- -              return 0;
- -      if (vma->vm_ops && vma->vm_ops->close)
- -              return 0;
+ +              return false;
+ +      if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
+ +              return false;
         if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
- -              return 0;
+ +              return false;
         if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
- -              return 0;
- -      return 1;
+ +              return false;
+ +      return true;
   }
   
- -static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
- -                                      struct anon_vma *anon_vma2,
- -                                      struct vm_area_struct *vma)
+ +static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
+ +               struct anon_vma *anon_vma2, struct vm_area_struct *vma)
   {
         /*
          * The list_is_singular() test is to avoid merging VMA cloned from
@@@ -793,7 -780,7 +793,7 @@@
          */
         if ((!anon_vma1 || !anon_vma2) && (!vma ||
                 list_is_singular(&vma->anon_vma_chain)))
- -              return 1;
+ +              return true;
         return anon_vma1 == anon_vma2;
   }
   
@@@ -807,21 -794,20 +807,21 @@@
    * We don't check here for the merged mmap wrapping around the end of pagecache
    * indices (16TB on ia32) because do_mmap() does not permit mmap's which
    * wrap, nor mmaps which cover the final page at index -1UL.
+ + *
+ + * We assume the vma may be removed as part of the merge.
    */
- -static int
+ +static bool
   can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
- -                   struct anon_vma *anon_vma, struct file *file,
- -                   pgoff_t vm_pgoff,
- -                   struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- -                   struct anon_vma_name *anon_name)
+ +              struct anon_vma *anon_vma, struct file *file,
+ +              pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ +              struct anon_vma_name *anon_name)
   {
- -      if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
+ +      if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
             is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                 if (vma->vm_pgoff == vm_pgoff)
- -                      return 1;
+ +                      return true;
         }
- -      return 0;
+ +      return false;
   }
   
   /*
@@@ -830,23 -816,22 +830,23 @@@
    *
    * We cannot merge two vmas if they have differently assigned (non-NULL)
    * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
+ + *
+ + * We assume that vma is not removed as part of the merge.
    */
- -static int
+ +static bool
   can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
- -                  struct anon_vma *anon_vma, struct file *file,
- -                  pgoff_t vm_pgoff,
- -                  struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- -                  struct anon_vma_name *anon_name)
+ +              struct anon_vma *anon_vma, struct file *file,
+ +              pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ +              struct anon_vma_name *anon_name)
   {
- -      if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
+ +      if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
             is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                 pgoff_t vm_pglen;
                 vm_pglen = vma_pages(vma);
                 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
- -                      return 1;
+ +                      return true;
         }
- -      return 0;
+ +      return false;
   }
   
   /*
@@@ -861,45 -846,42 +861,45 @@@
    * this area are about to be changed to vm_flags - and the no-change
    * case has already been eliminated.
    *
- - * The following mprotect cases have to be considered, where AAAA is
+ + * The following mprotect cases have to be considered, where **** is
    * the area passed down from mprotect_fixup, never extending beyond one
- - * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
+ + * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
+ + * at the same address as **** and is of the same or larger span, and
+ + * NNNN the next vma after ****:
    *
- - *     AAAA             AAAA                   AAAA
- - *    PPPPPPNNNNNN    PPPPPPNNNNNN       PPPPPPNNNNNN
+ + *     ****             ****                   ****
+ + *    PPPPPPNNNNNN    PPPPPPNNNNNN       PPPPPPCCCCCC
    *    cannot merge    might become       might become
- - *                    PPNNNNNNNNNN       PPPPPPPPPPNN
+ + *                    PPNNNNNNNNNN       PPPPPPPPPPCC
    *    mmap, brk or    case 4 below       case 5 below
    *    mremap move:
- - *                        AAAA               AAAA
- - *                    PPPP    NNNN       PPPPNNNNXXXX
+ + *                        ****               ****
+ + *                    PPPP    NNNN       PPPPCCCCNNNN
    *                    might become       might become
    *                    PPPPPPPPPPPP 1 or  PPPPPPPPPPPP 6 or
- - *                    PPPPPPPPNNNN 2 or  PPPPPPPPXXXX 7 or
- - *                    PPPPNNNNNNNN 3     PPPPXXXXXXXX 8
+ + *                    PPPPPPPPNNNN 2 or  PPPPPPPPNNNN 7 or
+ + *                    PPPPNNNNNNNN 3     PPPPNNNNNNNN 8
    *
- - * It is important for case 8 that the vma NNNN overlapping the
- - * region AAAA is never going to extended over XXXX. Instead XXXX must
- - * be extended in region AAAA and NNNN must be removed. This way in
+ + * It is important for case 8 that the vma CCCC overlapping the
+ + * region **** is never going to be extended over NNNN. Instead NNNN must
+ + * be extended in region **** and CCCC must be removed. This way in
    * all cases where vma_merge succeeds, the moment vma_merge drops the
    * rmap_locks, the properties of the merged vma will be already
    * correct for the whole merged range. Some of those properties like
    * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
    * be correct for the whole merged range immediately after the
- - * rmap_locks are released. Otherwise if XXXX would be removed and
- - * NNNN would be extended over the XXXX range, remove_migration_ptes
+ + * rmap_locks are released. Otherwise if NNNN would be removed and
+ + * CCCC would be extended over the NNNN range, remove_migration_ptes
    * or other rmap walkers (if working on addresses beyond the "end"
- - * parameter) may establish ptes with the wrong permissions of NNNN
- - * instead of the right permissions of XXXX.
+ + * parameter) may establish ptes with the wrong permissions of CCCC
+ + * instead of the right permissions of NNNN.
    *
    * In the code below:
    * PPPP is represented by *prev
- - * NNNN is represented by *mid (and possibly equal to *next)
- - * XXXX is represented by *next or not represented at all.
- - * AAAA is not represented - it will be merged or the function will return NULL
+ + * CCCC is represented by *curr or not represented at all (NULL)
+ + * NNNN is represented by *next or not represented at all (NULL)
+ + * **** is not represented - it will be merged and the vma containing the
+ + *      area is returned, or the function will return NULL
    */
   struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
                         struct vm_area_struct *prev, unsigned long addr,
@@@ -909,18 -891,18 +909,18 @@@
                         struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
                         struct anon_vma_name *anon_name)
   {
- -      pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
- -      pgoff_t vma_pgoff;
- -      struct vm_area_struct *mid, *next, *res = NULL;
+ +      struct vm_area_struct *curr, *next, *res;
         struct vm_area_struct *vma, *adjust, *remove, *remove2;
- -      int err = -1;
+ +      struct vma_prepare vp;
+ +      pgoff_t vma_pgoff;
+ +      int err = 0;
         bool merge_prev = false;
         bool merge_next = false;
         bool vma_expanded = false;
- -      struct vma_prepare vp;
- -      unsigned long vma_end = end;
- -      long adj_next = 0;
         unsigned long vma_start = addr;
+ +      unsigned long vma_end = end;
+ +      pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
+ +      long adj_start = 0;
   
         validate_mm(mm);
         /*
@@@ -930,105 -912,94 +930,105 @@@
         if (vm_flags & VM_SPECIAL)
                 return NULL;
   
- -      next = find_vma(mm, prev ? prev->vm_end : 0);
- -      mid = next;
- -      if (next && next->vm_end == end)                /* cases 6, 7, 8 */
- -              next = find_vma(mm, next->vm_end);
+ +      /* Does the input range span an existing VMA? (cases 5 - 8) */
+ +      curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);
   
- -      /* verify some invariant that must be enforced by the caller */
- -      VM_WARN_ON(prev && addr <= prev->vm_start);
- -      VM_WARN_ON(mid && end > mid->vm_end);
- -      VM_WARN_ON(addr >= end);
+ +      if (!curr ||                    /* cases 1 - 4 */
+ +          end == curr->vm_end)        /* cases 6 - 8, adjacent VMA */
+ +              next = vma_lookup(mm, end);
+ +      else
+ +              next = NULL;            /* case 5 */
   
         if (prev) {
- -              res = prev;
- -              vma = prev;
                 vma_start = prev->vm_start;
                 vma_pgoff = prev->vm_pgoff;
+ +
                 /* Can we merge the predecessor? */
- -              if (prev->vm_end == addr && mpol_equal(vma_policy(prev), policy)
+ +              if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
                     && can_vma_merge_after(prev, vm_flags, anon_vma, file,
- -                                 pgoff, vm_userfaultfd_ctx, anon_name)) {
+ +                                         pgoff, vm_userfaultfd_ctx, anon_name)) {
                         merge_prev = true;
                         vma_prev(vmi);
                 }
         }
+ +
         /* Can we merge the successor? */
- -      if (next && end == next->vm_start &&
- -                      mpol_equal(policy, vma_policy(next)) &&
- -                      can_vma_merge_before(next, vm_flags,
- -                                           anon_vma, file, pgoff+pglen,
- -                                           vm_userfaultfd_ctx, anon_name)) {
+ +      if (next && mpol_equal(policy, vma_policy(next)) &&
+ +          can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
+ +                               vm_userfaultfd_ctx, anon_name)) {
                 merge_next = true;
         }
   
+ +      if (!merge_prev && !merge_next)
+ +              return NULL; /* Not mergeable. */
+ +
+ +      res = vma = prev;
         remove = remove2 = adjust = NULL;
+ +
+ +      /* Verify some invariants that must be enforced by the caller. */
+ +      VM_WARN_ON(prev && addr <= prev->vm_start);
+ +      VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
+ +      VM_WARN_ON(addr >= end);
+ +
         /* Can we merge both the predecessor and the successor? */
         if (merge_prev && merge_next &&
             is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
- -              remove = mid;                           /* case 1 */
+ +              remove = next;                          /* case 1 */
                 vma_end = next->vm_end;
- -              err = dup_anon_vma(res, remove);
- -              if (mid != next) {                      /* case 6 */
+ +              err = dup_anon_vma(prev, next);
+ +              if (curr) {                             /* case 6 */
+ +                      remove = curr;
                         remove2 = next;
- -                      if (!remove->anon_vma)
- -                              err = dup_anon_vma(res, remove2);
+ +                      if (!next->anon_vma)
+ +                              err = dup_anon_vma(prev, curr);
                 }
- -      } else if (merge_prev) {
- -              err = 0;                                /* case 2 */
- -              if (mid && end > mid->vm_start) {
- -                      err = dup_anon_vma(res, mid);
- -                      if (end == mid->vm_end) {       /* case 7 */
- -                              remove = mid;
+ +      } else if (merge_prev) {                        /* case 2 */
+ +              if (curr) {
+ +                      err = dup_anon_vma(prev, curr);
+ +                      if (end == curr->vm_end) {      /* case 7 */
+ +                              remove = curr;
                         } else {                        /* case 5 */
- -                              adjust = mid;
- -                              adj_next = (end - mid->vm_start);
+ +                              adjust = curr;
+ +                              adj_start = (end - curr->vm_start);
                         }
                 }
- -      } else if (merge_next) {
+ +      } else { /* merge_next */
                 res = next;
                 if (prev && addr < prev->vm_end) {      /* case 4 */
                         vma_end = addr;
- -                      adjust = mid;
- -                      adj_next = -(vma->vm_end - addr);
- -                      err = dup_anon_vma(adjust, prev);
+ +                      adjust = next;
+ +                      adj_start = -(prev->vm_end - addr);
+ +                      err = dup_anon_vma(next, prev);
                 } else {
+ +                      /*
+ +                       * Note that cases 3 and 8 are the ONLY ones where prev
+ +                       * is permitted to be (but is not necessarily) NULL.
+ +                       */
                         vma = next;                     /* case 3 */
                         vma_start = addr;
                         vma_end = next->vm_end;
- -                      vma_pgoff = mid->vm_pgoff;
- -                      err = 0;
- -                      if (mid != next) {              /* case 8 */
- -                              remove = mid;
- -                              err = dup_anon_vma(res, remove);
+ +                      vma_pgoff = next->vm_pgoff;
+ +                      if (curr) {                     /* case 8 */
+ +                              vma_pgoff = curr->vm_pgoff;
+ +                              remove = curr;
+ +                              err = dup_anon_vma(next, curr);
                         }
                 }
         }
   
- -      /* Cannot merge or error in anon_vma clone */
+ +      /* Error in anon_vma clone. */
         if (err)
                 return NULL;
   
         if (vma_iter_prealloc(vmi))
                 return NULL;
   
- -      vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next);
         init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
         VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
                    vp.anon_vma != adjust->anon_vma);
   
         vma_prepare(&vp);
+ +      vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
         if (vma_start < vma->vm_start || vma_end > vma->vm_end)
                 vma_expanded = true;
   
@@@ -1039,10 -1010,10 +1039,10 @@@
         if (vma_expanded)
                 vma_iter_store(vmi, vma);
   
- -      if (adj_next) {
- -              adjust->vm_start += adj_next;
- -              adjust->vm_pgoff += adj_next >> PAGE_SHIFT;
- -              if (adj_next < 0) {
+ +      if (adj_start) {
+ +              adjust->vm_start += adj_start;
+ +              adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
+ +              if (adj_start < 0) {
                         WARN_ON(vma_expanded);
                         vma_iter_store(vmi, next);
                 }
@@@ -1547,7 -1518,8 +1547,8 @@@ static inline int accountable_mapping(s
    */
   static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
   {
-       unsigned long length, gap;
+       unsigned long length, gap, low_limit;
+       struct vm_area_struct *tmp;
   
         MA_STATE(mas, &current->mm->mm_mt, 0, 0);
   
@@@ -1556,12 -1528,29 +1557,29 @@@
         if (length < info->length)
                 return -ENOMEM;
   
-       if (mas_empty_area(&mas, info->low_limit, info->high_limit - 1,
-                                 length))
+       low_limit = info->low_limit;
+ retry:
+       if (mas_empty_area(&mas, low_limit, info->high_limit - 1, length))
                 return -ENOMEM;
   
         gap = mas.index;
         gap += (info->align_offset - gap) & info->align_mask;
+       tmp = mas_next(&mas, ULONG_MAX);
+       if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */
+               if (vm_start_gap(tmp) < gap + length - 1) {
+                       low_limit = tmp->vm_end;
+                       mas_reset(&mas);
+                       goto retry;
+               }
+       } else {
+               tmp = mas_prev(&mas, 0);
+               if (tmp && vm_end_gap(tmp) > gap) {
+                       low_limit = vm_end_gap(tmp);
+                       mas_reset(&mas);
+                       goto retry;
+               }
+       }
+ 
         return gap;
   }
   
@@@ -1577,7 -1566,8 +1595,8 @@@
    */
   static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
   {
-       unsigned long length, gap;
+       unsigned long length, gap, high_limit, gap_end;
+       struct vm_area_struct *tmp;
   
         MA_STATE(mas, &current->mm->mm_mt, 0, 0);
         /* Adjust search length to account for worst case alignment overhead */
@@@ -1585,12 -1575,31 +1604,31 @@@
         if (length < info->length)
                 return -ENOMEM;
   
-       if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1,
+       high_limit = info->high_limit;
+ retry:
+       if (mas_empty_area_rev(&mas, info->low_limit, high_limit - 1,
                                 length))
                 return -ENOMEM;
   
         gap = mas.last + 1 - info->length;
         gap -= (gap - info->align_offset) & info->align_mask;
+       gap_end = mas.last;
+       tmp = mas_next(&mas, ULONG_MAX);
+       if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */
+               if (vm_start_gap(tmp) <= gap_end) {
+                       high_limit = vm_start_gap(tmp);
+                       mas_reset(&mas);
+                       goto retry;
+               }
+       } else {
+               tmp = mas_prev(&mas, 0);
+               if (tmp && vm_end_gap(tmp) > gap) {
+                       high_limit = tmp->vm_start;
+                       mas_reset(&mas);
+                       goto retry;
+               }
+       }
+ 
         return gap;
   }
   
@@@ -2148,7 -2157,7 +2186,7 @@@ static inline void remove_mt(struct mm_
                 if (vma->vm_flags & VM_ACCOUNT)
                         nr_accounted += nrpages;
                 vm_stat_account(mm, vma->vm_flags, -nrpages);
- -              remove_vma(vma);
+ +              remove_vma(vma, false);
         }
         vm_unacct_memory(nr_accounted);
         validate_mm(mm);
@@@ -2171,8 -2180,7 +2209,8 @@@ static void unmap_region(struct mm_stru
         update_hiwater_rss(mm);
         unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked);
         free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
- -                               next ? next->vm_start : USER_PGTABLES_CEILING);
+ +                               next ? next->vm_start : USER_PGTABLES_CEILING,
+ +                               mm_wr_locked);
         tlb_finish_mmu(&tlb);
   }
   
@@@ -2228,10 -2236,10 +2266,10 @@@ int __split_vma(struct vma_iterator *vm
         if (new->vm_ops && new->vm_ops->open)
                 new->vm_ops->open(new);
   
- -      vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
         init_vma_prep(&vp, vma);
         vp.insert = new;
         vma_prepare(&vp);
+ +      vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
   
         if (new_below) {
                 vma->vm_start = addr;
@@@ -2275,12 -2283,10 +2313,12 @@@ int split_vma(struct vma_iterator *vmi
   static inline int munmap_sidetree(struct vm_area_struct *vma,
                                    struct ma_state *mas_detach)
   {
+ +      vma_start_write(vma);
         mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
         if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
                 return -ENOMEM;
   
+ +      vma_mark_detached(vma, true);
         if (vma->vm_flags & VM_LOCKED)
                 vma->vm_mm->locked_vm -= vma_pages(vma);
   
@@@ -2936,9 -2942,9 +2974,9 @@@ static int do_brk_flags(struct vma_iter
                 if (vma_iter_prealloc(vmi))
                         goto unacct_fail;
   
- -              vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
                 init_vma_prep(&vp, vma);
                 vma_prepare(&vp);
+ +              vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
                 vma->vm_end = addr + len;
                 vm_flags_set(vma, VM_SOFTDIRTY);
                 vma_iter_store(vmi, vma);
@@@ -3071,7 -3077,7 +3109,7 @@@ void exit_mmap(struct mm_struct *mm
         mmap_write_lock(mm);
         mt_clear_in_rcu(&mm->mm_mt);
         free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
- -                    USER_PGTABLES_CEILING);
+ +                    USER_PGTABLES_CEILING, true);
         tlb_finish_mmu(&tlb);
   
         /*
@@@ -3082,7 -3088,7 +3120,7 @@@
         do {
                 if (vma->vm_flags & VM_ACCOUNT)
                         nr_accounted += vma_pages(vma);
- -              remove_vma(vma);
+ +              remove_vma(vma, true);
                 count++;
                 cond_resched();
         } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
@@@ -3205,7 -3211,6 +3243,7 @@@ struct vm_area_struct *copy_vma(struct 
                         get_file(new_vma->vm_file);
                 if (new_vma->vm_ops && new_vma->vm_ops->open)
                         new_vma->vm_ops->open(new_vma);
+ +              vma_start_write(new_vma);
                 if (vma_link(mm, new_vma))
                         goto out_vma_link;
                 *need_rmap_locks = false;
@@@ -3500,7 -3505,6 +3538,7 @@@ static void vm_lock_mapping(struct mm_s
    * of mm/rmap.c:
    *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
    *     hugetlb mapping);
+ + *   - all vmas marked locked
    *   - all i_mmap_rwsem locks;
    *   - all anon_vma->rwsem
    *
@@@ -3526,13 -3530,6 +3564,13 @@@ int mm_take_all_locks(struct mm_struct 
         mas_for_each(&mas, vma, ULONG_MAX) {
                 if (signal_pending(current))
                         goto out_unlock;
+ +              vma_start_write(vma);
+ +      }
+ +
+ +      mas_set(&mas, 0);
+ +      mas_for_each(&mas, vma, ULONG_MAX) {
+ +              if (signal_pending(current))
+ +                      goto out_unlock;
                 if (vma->vm_file && vma->vm_file->f_mapping &&
                                 is_vm_hugetlb_page(vma))
                         vm_lock_mapping(mm, vma->vm_file->f_mapping);
@@@ -3619,7 -3616,6 +3657,7 @@@ void mm_drop_all_locks(struct mm_struc
                 if (vma->vm_file && vma->vm_file->f_mapping)
                         vm_unlock_mapping(vma->vm_file->f_mapping);
         }
+ +      vma_end_write_all(mm);
   
         mutex_unlock(&mm_all_locks_mutex);
   }
diff --combined mm/page_alloc.c

index d0eb280,8e39705..9c325e5
--- 1/mm/page_alloc.c
--- 2/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@@ -72,7 -72,9 +72,7 @@@
   #include <linux/lockdep.h>
   #include <linux/nmi.h>
   #include <linux/psi.h>
- -#include <linux/padata.h>
   #include <linux/khugepaged.h>
- -#include <linux/buffer_head.h>
   #include <linux/delayacct.h>
   #include <asm/sections.h>
   #include <asm/tlbflush.h>
@@@ -110,6 -112,17 +110,6 @@@ typedef int __bitwise fpi_t
    */
   #define FPI_TO_TAIL           ((__force fpi_t)BIT(1))
   
- -/*
- - * Don't poison memory with KASAN (only for the tag-based modes).
- - * During boot, all non-reserved memblock memory is exposed to page_alloc.
- - * Poisoning all that memory lengthens boot time, especially on systems with
- - * large amount of RAM. This flag is used to skip that poisoning.
- - * This is only done for the tag-based KASAN modes, as those are able to
- - * detect memory corruptions with the memory tags assigned by default.
- - * All memory allocated normally after boot gets poisoned as usual.
- - */
- -#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
- -
   /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
   static DEFINE_MUTEX(pcp_batch_high_lock);
   #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@@ -240,6 -253,23 +240,6 @@@ EXPORT_SYMBOL(init_on_alloc)
   DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
   EXPORT_SYMBOL(init_on_free);
   
- -static bool _init_on_alloc_enabled_early __read_mostly
- -                              = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
- -static int __init early_init_on_alloc(char *buf)
- -{
- -
- -      return kstrtobool(buf, &_init_on_alloc_enabled_early);
- -}
- -early_param("init_on_alloc", early_init_on_alloc);
- -
- -static bool _init_on_free_enabled_early __read_mostly
- -                              = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
- -static int __init early_init_on_free(char *buf)
- -{
- -      return kstrtobool(buf, &_init_on_free_enabled_early);
- -}
- -early_param("init_on_free", early_init_on_free);
- -
   /*
    * A cached value of the page's pageblock's migratetype, used when the page is
    * put on a pcplist. Used to avoid the pageblock migratetype lookup when
@@@ -328,7 -358,7 +328,7 @@@ int sysctl_lowmem_reserve_ratio[MAX_NR_
         [ZONE_MOVABLE] = 0,
   };
   
- -static char * const zone_names[MAX_NR_ZONES] = {
+ +char * const zone_names[MAX_NR_ZONES] = {
   #ifdef CONFIG_ZONE_DMA
          "DMA",
   #endif
@@@ -374,6 -404,17 +374,6 @@@ int user_min_free_kbytes = -1
   int watermark_boost_factor __read_mostly = 15000;
   int watermark_scale_factor = 10;
   
- -static unsigned long nr_kernel_pages __initdata;
- -static unsigned long nr_all_pages __initdata;
- -static unsigned long dma_reserve __initdata;
- -
- -static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
- -static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
- -static unsigned long required_kernelcore __initdata;
- -static unsigned long required_kernelcore_percent __initdata;
- -static unsigned long required_movablecore __initdata;
- -static unsigned long required_movablecore_percent __initdata;
- -static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
   bool mirrored_kernelcore __initdata_memblock;
   
   /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@@ -389,36 -430,86 +389,36 @@@ EXPORT_SYMBOL(nr_online_nodes)
   
   int page_group_by_mobility_disabled __read_mostly;
   
- -bool deferred_struct_pages __meminitdata;
- -
   #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
   /*
    * During boot we initialize deferred pages on-demand, as needed, but once
    * page_alloc_init_late() has finished, the deferred pages are all initialized,
    * and we can permanently disable that path.
    */
- -static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+ +DEFINE_STATIC_KEY_TRUE(deferred_pages);
   
   static inline bool deferred_pages_enabled(void)
   {
         return static_branch_unlikely(&deferred_pages);
   }
   
- -/* Returns true if the struct page for the pfn is initialised */
- -static inline bool __meminit early_page_initialised(unsigned long pfn)
- -{
- -      int nid = early_pfn_to_nid(pfn);
- -
- -      if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
- -              return false;
- -
- -      return true;
- -}
- -
   /*
- - * Returns true when the remaining initialisation should be deferred until
- - * later in the boot cycle when it can be parallelised.
+ + * deferred_grow_zone() is __init, but it is called from
+ + * get_page_from_freelist() during early boot until deferred_pages permanently
+ + * disables this call. This is why we have a refdata wrapper to avoid a warning,
+ + * and to ensure that the function body gets unloaded.
    */
- -static bool __meminit
- -defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+ +static bool __ref
+ +_deferred_grow_zone(struct zone *zone, unsigned int order)
   {
- -      static unsigned long prev_end_pfn, nr_initialised;
- -
- -      if (early_page_ext_enabled())
- -              return false;
- -      /*
- -       * prev_end_pfn static that contains the end of previous zone
- -       * No need to protect because called very early in boot before smp_init.
- -       */
- -      if (prev_end_pfn != end_pfn) {
- -              prev_end_pfn = end_pfn;
- -              nr_initialised = 0;
- -      }
- -
- -      /* Always populate low zones for address-constrained allocations */
- -      if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
- -              return false;
- -
- -      if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
- -              return true;
- -      /*
- -       * We start only with one section of pages, more pages are added as
- -       * needed until the rest of deferred pages are initialized.
- -       */
- -      nr_initialised++;
- -      if ((nr_initialised > PAGES_PER_SECTION) &&
- -          (pfn & (PAGES_PER_SECTION - 1)) == 0) {
- -              NODE_DATA(nid)->first_deferred_pfn = pfn;
- -              return true;
- -      }
- -      return false;
+ +       return deferred_grow_zone(zone, order);
   }
   #else
   static inline bool deferred_pages_enabled(void)
   {
         return false;
   }
- -
- -static inline bool early_page_initialised(unsigned long pfn)
- -{
- -      return true;
- -}
- -
- -static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
- -{
- -      return false;
- -}
- -#endif
+ +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
   
   /* Return a pointer to the bitmap storing bits affecting a block of pages */
   static inline unsigned long *get_pageblock_bitmap(const struct page *page,
@@@ -684,6 -775,26 +684,6 @@@ void free_compound_page(struct page *pa
         free_the_page(page, compound_order(page));
   }
   
- -static void prep_compound_head(struct page *page, unsigned int order)
- -{
- -      struct folio *folio = (struct folio *)page;
- -
- -      set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
- -      set_compound_order(page, order);
- -      atomic_set(&folio->_entire_mapcount, -1);
- -      atomic_set(&folio->_nr_pages_mapped, 0);
- -      atomic_set(&folio->_pincount, 0);
- -}
- -
- -static void prep_compound_tail(struct page *head, int tail_idx)
- -{
- -      struct page *p = head + tail_idx;
- -
- -      p->mapping = TAIL_MAPPING;
- -      set_compound_head(p, head);
- -      set_page_private(p, 0);
- -}
- -
   void prep_compound_page(struct page *page, unsigned int order)
   {
         int i;
@@@ -773,6 -884,64 +773,6 @@@ static inline void clear_page_guard(str
                                 unsigned int order, int migratetype) {}
   #endif
   
- -/*
- - * Enable static keys related to various memory debugging and hardening options.
- - * Some override others, and depend on early params that are evaluated in the
- - * order of appearance. So we need to first gather the full picture of what was
- - * enabled, and then make decisions.
- - */
- -void __init init_mem_debugging_and_hardening(void)
- -{
- -      bool page_poisoning_requested = false;
- -
- -#ifdef CONFIG_PAGE_POISONING
- -      /*
- -       * Page poisoning is debug page alloc for some arches. If
- -       * either of those options are enabled, enable poisoning.
- -       */
- -      if (page_poisoning_enabled() ||
- -           (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
- -            debug_pagealloc_enabled())) {
- -              static_branch_enable(&_page_poisoning_enabled);
- -              page_poisoning_requested = true;
- -      }
- -#endif
- -
- -      if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
- -          page_poisoning_requested) {
- -              pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
- -                      "will take precedence over init_on_alloc and init_on_free\n");
- -              _init_on_alloc_enabled_early = false;
- -              _init_on_free_enabled_early = false;
- -      }
- -
- -      if (_init_on_alloc_enabled_early)
- -              static_branch_enable(&init_on_alloc);
- -      else
- -              static_branch_disable(&init_on_alloc);
- -
- -      if (_init_on_free_enabled_early)
- -              static_branch_enable(&init_on_free);
- -      else
- -              static_branch_disable(&init_on_free);
- -
- -      if (IS_ENABLED(CONFIG_KMSAN) &&
- -          (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
- -              pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
- -
- -#ifdef CONFIG_DEBUG_PAGEALLOC
- -      if (!debug_pagealloc_enabled())
- -              return;
- -
- -      static_branch_enable(&_debug_pagealloc_enabled);
- -
- -      if (!debug_guardpage_minorder())
- -              return;
- -
- -      static_branch_enable(&_debug_guardpage_enabled);
- -#endif
- -}
- -
   static inline void set_buddy_order(struct page *page, unsigned int order)
   {
         set_page_private(page, order);
@@@ -875,13 -1044,6 +875,13 @@@ static inline void del_page_from_free_l
         zone->free_area[order].nr_free--;
   }
   
+ +static inline struct page *get_page_from_free_area(struct free_area *area,
+ +                                          int migratetype)
+ +{
+ +      return list_first_entry_or_null(&area->free_list[migratetype],
+ +                                      struct page, lru);
+ +}
+ +
   /*
    * If this is not the largest possible page, check if the buddy
    * of the next-highest order is free. If it is, it's possible
@@@ -897,7 -1059,7 +897,7 @@@ buddy_merge_likely(unsigned long pfn, u
         unsigned long higher_page_pfn;
         struct page *higher_page;
   
- -      if (order >= MAX_ORDER - 2)
+ +      if (order >= MAX_ORDER - 1)
                 return false;
   
         higher_page_pfn = buddy_pfn & pfn;
@@@ -952,7 -1114,7 +952,7 @@@ static inline void __free_one_page(stru
         VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
         VM_BUG_ON_PAGE(bad_range(zone, page), page);
   
- -      while (order < MAX_ORDER - 1) {
+ +      while (order < MAX_ORDER) {
                 if (compaction_capture(capc, page, order, migratetype)) {
                         __mod_zone_freepage_state(zone, -(1 << order),
                                                                 migratetype);
@@@ -1193,19 -1355,13 +1193,19 @@@ out
   /*
    * Skip KASAN memory poisoning when either:
    *
- - * 1. Deferred memory initialization has not yet completed,
- - *    see the explanation below.
- - * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON,
- - *    see the comment next to it.
- - * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON,
- - *    see the comment next to it.
- - * 4. The allocation is excluded from being checked due to sampling,
+ + * 1. For generic KASAN: deferred memory initialization has not yet completed.
+ + *    Tag-based KASAN modes skip pages freed via deferred memory initialization
+ + *    using page tags instead (see below).
+ + * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
+ + *    that error detection is disabled for accesses via the page address.
+ + *
+ + * Pages will have match-all tags in the following circumstances:
+ + *
+ + * 1. Pages are being initialized for the first time, including during deferred
+ + *    memory init; see the call to page_kasan_tag_reset in __init_single_page.
+ + * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
+ + *    exception of pages unpoisoned by kasan_unpoison_vmalloc.
+ + * 3. The allocation was excluded from being checked due to sampling,
    *    see the call to kasan_unpoison_pages.
    *
    * Poisoning pages during deferred memory init will greatly lengthen the
@@@ -1221,10 -1377,10 +1221,10 @@@
    */
   static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
   {
- -      return deferred_pages_enabled() ||
- -             (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- -              (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
- -             PageSkipKASanPoison(page);
+ +      if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ +              return deferred_pages_enabled();
+ +
+ +      return page_kasan_tag(page) == 0xff;
   }
   
   static void kernel_init_pages(struct page *page, int numpages)
@@@ -1239,7 -1395,7 +1239,7 @@@
   }
   
   static __always_inline bool free_pages_prepare(struct page *page,
- -                      unsigned int order, bool check_free, fpi_t fpi_flags)
+ +                      unsigned int order, fpi_t fpi_flags)
   {
         int bad = 0;
         bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
@@@ -1277,11 -1433,9 +1277,11 @@@
                 for (i = 1; i < (1 << order); i++) {
                         if (compound)
                                 bad += free_tail_pages_check(page, page + i);
- -                      if (unlikely(free_page_is_bad(page + i))) {
- -                              bad++;
- -                              continue;
+ +                      if (is_check_pages_enabled()) {
+ +                              if (unlikely(free_page_is_bad(page + i))) {
+ +                                      bad++;
+ +                                      continue;
+ +                              }
                         }
                         (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
                 }
@@@ -1290,12 -1444,10 +1290,12 @@@
                 page->mapping = NULL;
         if (memcg_kmem_online() && PageMemcgKmem(page))
                 __memcg_kmem_uncharge_page(page, order);
- -      if (check_free && free_page_is_bad(page))
- -              bad++;
- -      if (bad)
- -              return false;
+ +      if (is_check_pages_enabled()) {
+ +              if (free_page_is_bad(page))
+ +                      bad++;
+ +              if (bad)
+ +                      return false;
+ +      }
   
         page_cpupid_reset_last(page);
         page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
@@@ -1341,6 -1493,46 +1341,6 @@@
         return true;
   }
   
- -#ifdef CONFIG_DEBUG_VM
- -/*
- - * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
- - * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
- - * moved from pcp lists to free lists.
- - */
- -static bool free_pcp_prepare(struct page *page, unsigned int order)
- -{
- -      return free_pages_prepare(page, order, true, FPI_NONE);
- -}
- -
- -/* return true if this page has an inappropriate state */
- -static bool bulkfree_pcp_prepare(struct page *page)
- -{
- -      if (debug_pagealloc_enabled_static())
- -              return free_page_is_bad(page);
- -      else
- -              return false;
- -}
- -#else
- -/*
- - * With DEBUG_VM disabled, order-0 pages being freed are checked only when
- - * moving from pcp lists to free list in order to reduce overhead. With
- - * debug_pagealloc enabled, they are checked also immediately when being freed
- - * to the pcp lists.
- - */
- -static bool free_pcp_prepare(struct page *page, unsigned int order)
- -{
- -      if (debug_pagealloc_enabled_static())
- -              return free_pages_prepare(page, order, true, FPI_NONE);
- -      else
- -              return free_pages_prepare(page, order, false, FPI_NONE);
- -}
- -
- -static bool bulkfree_pcp_prepare(struct page *page)
- -{
- -      return free_page_is_bad(page);
- -}
- -#endif /* CONFIG_DEBUG_VM */
- -
   /*
    * Frees a number of pages from the PCP lists
    * Assumes all pages on list are in same zone.
@@@ -1400,6 -1592,9 +1400,6 @@@ static void free_pcppages_bulk(struct z
                         count -= nr_pages;
                         pcp->count -= nr_pages;
   
- -                      if (bulkfree_pcp_prepare(page))
- -                              continue;
- -
                         /* MIGRATE_ISOLATE page should not go to pcplists */
                         VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
                         /* Pageblock could have been isolated meanwhile */
@@@ -1430,6 -1625,80 +1430,6 @@@ static void free_one_page(struct zone *
         spin_unlock_irqrestore(&zone->lock, flags);
   }
   
- -static void __meminit __init_single_page(struct page *page, unsigned long pfn,
- -                              unsigned long zone, int nid)
- -{
- -      mm_zero_struct_page(page);
- -      set_page_links(page, zone, nid, pfn);
- -      init_page_count(page);
- -      page_mapcount_reset(page);
- -      page_cpupid_reset_last(page);
- -      page_kasan_tag_reset(page);
- -
- -      INIT_LIST_HEAD(&page->lru);
- -#ifdef WANT_PAGE_VIRTUAL
- -      /* The shift won't overflow because ZONE_NORMAL is below 4G. */
- -      if (!is_highmem_idx(zone))
- -              set_page_address(page, __va(pfn << PAGE_SHIFT));
- -#endif
- -}
- -
- -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- -static void __meminit init_reserved_page(unsigned long pfn)
- -{
- -      pg_data_t *pgdat;
- -      int nid, zid;
- -
- -      if (early_page_initialised(pfn))
- -              return;
- -
- -      nid = early_pfn_to_nid(pfn);
- -      pgdat = NODE_DATA(nid);
- -
- -      for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- -              struct zone *zone = &pgdat->node_zones[zid];
- -
- -              if (zone_spans_pfn(zone, pfn))
- -                      break;
- -      }
- -      __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
- -}
- -#else
- -static inline void init_reserved_page(unsigned long pfn)
- -{
- -}
- -#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
- -
- -/*
- - * Initialised pages do not have PageReserved set. This function is
- - * called for each range allocated by the bootmem allocator and
- - * marks the pages PageReserved. The remaining valid pages are later
- - * sent to the buddy page allocator.
- - */
- -void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
- -{
- -      unsigned long start_pfn = PFN_DOWN(start);
- -      unsigned long end_pfn = PFN_UP(end);
- -
- -      for (; start_pfn < end_pfn; start_pfn++) {
- -              if (pfn_valid(start_pfn)) {
- -                      struct page *page = pfn_to_page(start_pfn);
- -
- -                      init_reserved_page(start_pfn);
- -
- -                      /* Avoid false-positive PageTail() */
- -                      INIT_LIST_HEAD(&page->lru);
- -
- -                      /*
- -                       * no need for atomic set_bit because the struct
- -                       * page is not visible yet so nobody should
- -                       * access it yet.
- -                       */
- -                      __SetPageReserved(page);
- -              }
- -      }
- -}
- -
   static void __free_pages_ok(struct page *page, unsigned int order,
                             fpi_t fpi_flags)
   {
@@@ -1438,7 -1707,7 +1438,7 @@@
         unsigned long pfn = page_to_pfn(page);
         struct zone *zone = page_zone(page);
   
- -      if (!free_pages_prepare(page, order, true, fpi_flags))
+ +      if (!free_pages_prepare(page, order, fpi_flags))
                 return;
   
         /*
@@@ -1485,7 -1754,71 +1485,7 @@@ void __free_pages_core(struct page *pag
          * Bypass PCP and place fresh pages right to the tail, primarily
          * relevant for memory onlining.
          */
- -      __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
- -}
- -
- -#ifdef CONFIG_NUMA
- -
- -/*
- - * During memory init memblocks map pfns to nids. The search is expensive and
- - * this caches recent lookups. The implementation of __early_pfn_to_nid
- - * treats start/end as pfns.
- - */
- -struct mminit_pfnnid_cache {
- -      unsigned long last_start;
- -      unsigned long last_end;
- -      int last_nid;
- -};
- -
- -static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
- -
- -/*
- - * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
- - */
- -static int __meminit __early_pfn_to_nid(unsigned long pfn,
- -                                      struct mminit_pfnnid_cache *state)
- -{
- -      unsigned long start_pfn, end_pfn;
- -      int nid;
- -
- -      if (state->last_start <= pfn && pfn < state->last_end)
- -              return state->last_nid;
- -
- -      nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- -      if (nid != NUMA_NO_NODE) {
- -              state->last_start = start_pfn;
- -              state->last_end = end_pfn;
- -              state->last_nid = nid;
- -      }
- -
- -      return nid;
- -}
- -
- -int __meminit early_pfn_to_nid(unsigned long pfn)
- -{
- -      static DEFINE_SPINLOCK(early_pfn_lock);
- -      int nid;
- -
- -      spin_lock(&early_pfn_lock);
- -      nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
- -      if (nid < 0)
- -              nid = first_online_node;
- -      spin_unlock(&early_pfn_lock);
- -
- -      return nid;
- -}
- -#endif /* CONFIG_NUMA */
- -
- -void __init memblock_free_pages(struct page *page, unsigned long pfn,
- -                                                      unsigned int order)
- -{
- -      if (!early_page_initialised(pfn))
- -              return;
- -      if (!kmsan_memblock_free_pages(page, order)) {
- -              /* KMSAN will take care of these pages. */
- -              return;
- -      }
- -      __free_pages_core(page, order);
+ +      __free_pages_ok(page, order, FPI_TO_TAIL);
   }
   
   /*
@@@ -1558,84 -1891,559 +1558,84 @@@ void clear_zone_contiguous(struct zone 
         zone->contiguous = false;
   }
   
- -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- -static void __init deferred_free_range(unsigned long pfn,
- -                                     unsigned long nr_pages)
+ +/*
+ + * The order of subdivision here is critical for the IO subsystem.
+ + * Please do not alter this order without good reasons and regression
+ + * testing. Specifically, as large blocks of memory are subdivided,
+ + * the order in which smaller blocks are delivered depends on the order
+ + * they're subdivided in this function. This is the primary factor
+ + * influencing the order in which pages are delivered to the IO
+ + * subsystem according to empirical testing, and this is also justified
+ + * by considering the behavior of a buddy system containing a single
+ + * large block of memory acted on by a series of small allocations.
+ + * This behavior is a critical factor in sglist merging's success.
+ + *
+ + * -- nyc
+ + */
+ +static inline void expand(struct zone *zone, struct page *page,
+ +      int low, int high, int migratetype)
   {
- -      struct page *page;
- -      unsigned long i;
- -
- -      if (!nr_pages)
- -              return;
+ +      unsigned long size = 1 << high;
   
- -      page = pfn_to_page(pfn);
+ +      while (high > low) {
+ +              high--;
+ +              size >>= 1;
+ +              VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
   
- -      /* Free a large naturally-aligned chunk if possible */
- -      if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) {
- -              set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- -              __free_pages_core(page, pageblock_order);
- -              return;
- -      }
+ +              /*
+ +               * Mark as guard pages (or page), that will allow to
+ +               * merge back to allocator when buddy will be freed.
+ +               * Corresponding page table entries will not be touched,
+ +               * pages will stay not present in virtual address space
+ +               */
+ +              if (set_page_guard(zone, &page[size], high, migratetype))
+ +                      continue;
   
- -      for (i = 0; i < nr_pages; i++, page++, pfn++) {
- -              if (pageblock_aligned(pfn))
- -                      set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- -              __free_pages_core(page, 0);
+ +              add_to_free_list(&page[size], zone, high, migratetype);
+ +              set_buddy_order(&page[size], high);
         }
   }
   
- -/* Completion tracking for deferred_init_memmap() threads */
- -static atomic_t pgdat_init_n_undone __initdata;
- -static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
- -
- -static inline void __init pgdat_init_report_one_done(void)
+ +static void check_new_page_bad(struct page *page)
   {
- -      if (atomic_dec_and_test(&pgdat_init_n_undone))
- -              complete(&pgdat_init_all_done_comp);
+ +      if (unlikely(page->flags & __PG_HWPOISON)) {
+ +              /* Don't complain about hwpoisoned pages */
+ +              page_mapcount_reset(page); /* remove PageBuddy */
+ +              return;
+ +      }
+ +
+ +      bad_page(page,
+ +               page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
   }
   
   /*
- - * Returns true if page needs to be initialized or freed to buddy allocator.
- - *
- - * We check if a current large page is valid by only checking the validity
- - * of the head pfn.
+ + * This page is about to be returned from the page allocator
    */
- -static inline bool __init deferred_pfn_valid(unsigned long pfn)
+ +static int check_new_page(struct page *page)
   {
- -      if (pageblock_aligned(pfn) && !pfn_valid(pfn))
- -              return false;
- -      return true;
- -}
+ +      if (likely(page_expected_state(page,
+ +                              PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
+ +              return 0;
   
- -/*
- - * Free pages to buddy allocator. Try to free aligned pages in
- - * pageblock_nr_pages sizes.
- - */
- -static void __init deferred_free_pages(unsigned long pfn,
- -                                     unsigned long end_pfn)
- -{
- -      unsigned long nr_free = 0;
- -
- -      for (; pfn < end_pfn; pfn++) {
- -              if (!deferred_pfn_valid(pfn)) {
- -                      deferred_free_range(pfn - nr_free, nr_free);
- -                      nr_free = 0;
- -              } else if (pageblock_aligned(pfn)) {
- -                      deferred_free_range(pfn - nr_free, nr_free);
- -                      nr_free = 1;
- -              } else {
- -                      nr_free++;
- -              }
- -      }
- -      /* Free the last block of pages to allocator */
- -      deferred_free_range(pfn - nr_free, nr_free);
+ +      check_new_page_bad(page);
+ +      return 1;
   }
   
- -/*
- - * Initialize struct pages.  We minimize pfn page lookups and scheduler checks
- - * by performing it only once every pageblock_nr_pages.
- - * Return number of pages initialized.
- - */
- -static unsigned long  __init deferred_init_pages(struct zone *zone,
- -                                               unsigned long pfn,
- -                                               unsigned long end_pfn)
+ +static inline bool check_new_pages(struct page *page, unsigned int order)
   {
- -      int nid = zone_to_nid(zone);
- -      unsigned long nr_pages = 0;
- -      int zid = zone_idx(zone);
- -      struct page *page = NULL;
+ +      if (is_check_pages_enabled()) {
+ +              for (int i = 0; i < (1 << order); i++) {
+ +                      struct page *p = page + i;
   
- -      for (; pfn < end_pfn; pfn++) {
- -              if (!deferred_pfn_valid(pfn)) {
- -                      page = NULL;
- -                      continue;
- -              } else if (!page || pageblock_aligned(pfn)) {
- -                      page = pfn_to_page(pfn);
- -              } else {
- -                      page++;
+ +                      if (unlikely(check_new_page(p)))
+ +                              return true;
                 }
- -              __init_single_page(page, pfn, zid, nid);
- -              nr_pages++;
         }
- -      return (nr_pages);
+ +
+ +      return false;
   }
   
- -/*
- - * This function is meant to pre-load the iterator for the zone init.
- - * Specifically it walks through the ranges until we are caught up to the
- - * first_init_pfn value and exits there. If we never encounter the value we
- - * return false indicating there are no valid ranges left.
- - */
- -static bool __init
- -deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
- -                                  unsigned long *spfn, unsigned long *epfn,
- -                                  unsigned long first_init_pfn)
- -{
- -      u64 j;
- -
- -      /*
- -       * Start out by walking through the ranges in this zone that have
- -       * already been initialized. We don't need to do anything with them
- -       * so we just need to flush them out of the system.
- -       */
- -      for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
- -              if (*epfn <= first_init_pfn)
- -                      continue;
- -              if (*spfn < first_init_pfn)
- -                      *spfn = first_init_pfn;
- -              *i = j;
- -              return true;
- -      }
- -
- -      return false;
- -}
- -
- -/*
- - * Initialize and free pages. We do it in two loops: first we initialize
- - * struct page, then free to buddy allocator, because while we are
- - * freeing pages we can access pages that are ahead (computing buddy
- - * page in __free_one_page()).
- - *
- - * In order to try and keep some memory in the cache we have the loop
- - * broken along max page order boundaries. This way we will not cause
- - * any issues with the buddy page computation.
- - */
- -static unsigned long __init
- -deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
- -                     unsigned long *end_pfn)
- -{
- -      unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
- -      unsigned long spfn = *start_pfn, epfn = *end_pfn;
- -      unsigned long nr_pages = 0;
- -      u64 j = *i;
- -
- -      /* First we loop through and initialize the page values */
- -      for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
- -              unsigned long t;
- -
- -              if (mo_pfn <= *start_pfn)
- -                      break;
- -
- -              t = min(mo_pfn, *end_pfn);
- -              nr_pages += deferred_init_pages(zone, *start_pfn, t);
- -
- -              if (mo_pfn < *end_pfn) {
- -                      *start_pfn = mo_pfn;
- -                      break;
- -              }
- -      }
- -
- -      /* Reset values and now loop through freeing pages as needed */
- -      swap(j, *i);
- -
- -      for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
- -              unsigned long t;
- -
- -              if (mo_pfn <= spfn)
- -                      break;
- -
- -              t = min(mo_pfn, epfn);
- -              deferred_free_pages(spfn, t);
- -
- -              if (mo_pfn <= epfn)
- -                      break;
- -      }
- -
- -      return nr_pages;
- -}
- -
- -static void __init
- -deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
- -                         void *arg)
- -{
- -      unsigned long spfn, epfn;
- -      struct zone *zone = arg;
- -      u64 i;
- -
- -      deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
- -
- -      /*
- -       * Initialize and free pages in MAX_ORDER sized increments so that we
- -       * can avoid introducing any issues with the buddy allocator.
- -       */
- -      while (spfn < end_pfn) {
- -              deferred_init_maxorder(&i, zone, &spfn, &epfn);
- -              cond_resched();
- -      }
- -}
- -
- -/* An arch may override for more concurrency. */
- -__weak int __init
- -deferred_page_init_max_threads(const struct cpumask *node_cpumask)
- -{
- -      return 1;
- -}
- -
- -/* Initialise remaining memory on a node */
- -static int __init deferred_init_memmap(void *data)
- -{
- -      pg_data_t *pgdat = data;
- -      const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
- -      unsigned long spfn = 0, epfn = 0;
- -      unsigned long first_init_pfn, flags;
- -      unsigned long start = jiffies;
- -      struct zone *zone;
- -      int zid, max_threads;
- -      u64 i;
- -
- -      /* Bind memory initialisation thread to a local node if possible */
- -      if (!cpumask_empty(cpumask))
- -              set_cpus_allowed_ptr(current, cpumask);
- -
- -      pgdat_resize_lock(pgdat, &flags);
- -      first_init_pfn = pgdat->first_deferred_pfn;
- -      if (first_init_pfn == ULONG_MAX) {
- -              pgdat_resize_unlock(pgdat, &flags);
- -              pgdat_init_report_one_done();
- -              return 0;
- -      }
- -
- -      /* Sanity check boundaries */
- -      BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
- -      BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
- -      pgdat->first_deferred_pfn = ULONG_MAX;
- -
- -      /*
- -       * Once we unlock here, the zone cannot be grown anymore, thus if an
- -       * interrupt thread must allocate this early in boot, zone must be
- -       * pre-grown prior to start of deferred page initialization.
- -       */
- -      pgdat_resize_unlock(pgdat, &flags);
- -
- -      /* Only the highest zone is deferred so find it */
- -      for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- -              zone = pgdat->node_zones + zid;
- -              if (first_init_pfn < zone_end_pfn(zone))
- -                      break;
- -      }
- -
- -      /* If the zone is empty somebody else may have cleared out the zone */
- -      if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
- -                                               first_init_pfn))
- -              goto zone_empty;
- -
- -      max_threads = deferred_page_init_max_threads(cpumask);
- -
- -      while (spfn < epfn) {
- -              unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
- -              struct padata_mt_job job = {
- -                      .thread_fn   = deferred_init_memmap_chunk,
- -                      .fn_arg      = zone,
- -                      .start       = spfn,
- -                      .size        = epfn_align - spfn,
- -                      .align       = PAGES_PER_SECTION,
- -                      .min_chunk   = PAGES_PER_SECTION,
- -                      .max_threads = max_threads,
- -              };
- -
- -              padata_do_multithreaded(&job);
- -              deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
- -                                                  epfn_align);
- -      }
- -zone_empty:
- -      /* Sanity check that the next zone really is unpopulated */
- -      WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
- -
- -      pr_info("node %d deferred pages initialised in %ums\n",
- -              pgdat->node_id, jiffies_to_msecs(jiffies - start));
- -
- -      pgdat_init_report_one_done();
- -      return 0;
- -}
- -
- -/*
- - * If this zone has deferred pages, try to grow it by initializing enough
- - * deferred pages to satisfy the allocation specified by order, rounded up to
- - * the nearest PAGES_PER_SECTION boundary.  So we're adding memory in increments
- - * of SECTION_SIZE bytes by initializing struct pages in increments of
- - * PAGES_PER_SECTION * sizeof(struct page) bytes.
- - *
- - * Return true when zone was grown, otherwise return false. We return true even
- - * when we grow less than requested, to let the caller decide if there are
- - * enough pages to satisfy the allocation.
- - *
- - * Note: We use noinline because this function is needed only during boot, and
- - * it is called from a __ref function _deferred_grow_zone. This way we are
- - * making sure that it is not inlined into permanent text section.
- - */
- -static noinline bool __init
- -deferred_grow_zone(struct zone *zone, unsigned int order)
- -{
- -      unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
- -      pg_data_t *pgdat = zone->zone_pgdat;
- -      unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
- -      unsigned long spfn, epfn, flags;
- -      unsigned long nr_pages = 0;
- -      u64 i;
- -
- -      /* Only the last zone may have deferred pages */
- -      if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
- -              return false;
- -
- -      pgdat_resize_lock(pgdat, &flags);
- -
- -      /*
- -       * If someone grew this zone while we were waiting for spinlock, return
- -       * true, as there might be enough pages already.
- -       */
- -      if (first_deferred_pfn != pgdat->first_deferred_pfn) {
- -              pgdat_resize_unlock(pgdat, &flags);
- -              return true;
- -      }
- -
- -      /* If the zone is empty somebody else may have cleared out the zone */
- -      if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
- -                                               first_deferred_pfn)) {
- -              pgdat->first_deferred_pfn = ULONG_MAX;
- -              pgdat_resize_unlock(pgdat, &flags);
- -              /* Retry only once. */
- -              return first_deferred_pfn != ULONG_MAX;
- -      }
- -
- -      /*
- -       * Initialize and free pages in MAX_ORDER sized increments so
- -       * that we can avoid introducing any issues with the buddy
- -       * allocator.
- -       */
- -      while (spfn < epfn) {
- -              /* update our first deferred PFN for this section */
- -              first_deferred_pfn = spfn;
- -
- -              nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
- -              touch_nmi_watchdog();
- -
- -              /* We should only stop along section boundaries */
- -              if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
- -                      continue;
- -
- -              /* If our quota has been met we can stop here */
- -              if (nr_pages >= nr_pages_needed)
- -                      break;
- -      }
- -
- -      pgdat->first_deferred_pfn = spfn;
- -      pgdat_resize_unlock(pgdat, &flags);
- -
- -      return nr_pages > 0;
- -}
- -
- -/*
- - * deferred_grow_zone() is __init, but it is called from
- - * get_page_from_freelist() during early boot until deferred_pages permanently
- - * disables this call. This is why we have refdata wrapper to avoid warning,
- - * and to ensure that the function body gets unloaded.
- - */
- -static bool __ref
- -_deferred_grow_zone(struct zone *zone, unsigned int order)
- -{
- -      return deferred_grow_zone(zone, order);
- -}
- -
- -#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
- -
- -void __init page_alloc_init_late(void)
- -{
- -      struct zone *zone;
- -      int nid;
- -
- -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- -
- -      /* There will be num_node_state(N_MEMORY) threads */
- -      atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
- -      for_each_node_state(nid, N_MEMORY) {
- -              kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
- -      }
- -
- -      /* Block until all are initialised */
- -      wait_for_completion(&pgdat_init_all_done_comp);
- -
- -      /*
- -       * We initialized the rest of the deferred pages.  Permanently disable
- -       * on-demand struct page initialization.
- -       */
- -      static_branch_disable(&deferred_pages);
- -
- -      /* Reinit limits that are based on free pages after the kernel is up */
- -      files_maxfiles_init();
- -#endif
- -
- -      buffer_init();
- -
- -      /* Discard memblock private memory */
- -      memblock_discard();
- -
- -      for_each_node_state(nid, N_MEMORY)
- -              shuffle_free_memory(NODE_DATA(nid));
- -
- -      for_each_populated_zone(zone)
- -              set_zone_contiguous(zone);
- -}
- -
- -#ifdef CONFIG_CMA
- -/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
- -void __init init_cma_reserved_pageblock(struct page *page)
- -{
- -      unsigned i = pageblock_nr_pages;
- -      struct page *p = page;
- -
- -      do {
- -              __ClearPageReserved(p);
- -              set_page_count(p, 0);
- -      } while (++p, --i);
- -
- -      set_pageblock_migratetype(page, MIGRATE_CMA);
- -      set_page_refcounted(page);
- -      __free_pages(page, pageblock_order);
- -
- -      adjust_managed_page_count(page, pageblock_nr_pages);
- -      page_zone(page)->cma_pages += pageblock_nr_pages;
- -}
- -#endif
- -
- -/*
- - * The order of subdivision here is critical for the IO subsystem.
- - * Please do not alter this order without good reasons and regression
- - * testing. Specifically, as large blocks of memory are subdivided,
- - * the order in which smaller blocks are delivered depends on the order
- - * they're subdivided in this function. This is the primary factor
- - * influencing the order in which pages are delivered to the IO
- - * subsystem according to empirical testing, and this is also justified
- - * by considering the behavior of a buddy system containing a single
- - * large block of memory acted on by a series of small allocations.
- - * This behavior is a critical factor in sglist merging's success.
- - *
- - * -- nyc
- - */
- -static inline void expand(struct zone *zone, struct page *page,
- -      int low, int high, int migratetype)
- -{
- -      unsigned long size = 1 << high;
- -
- -      while (high > low) {
- -              high--;
- -              size >>= 1;
- -              VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
- -
- -              /*
- -               * Mark as guard pages (or page), that will allow to
- -               * merge back to allocator when buddy will be freed.
- -               * Corresponding page table entries will not be touched,
- -               * pages will stay not present in virtual address space
- -               */
- -              if (set_page_guard(zone, &page[size], high, migratetype))
- -                      continue;
- -
- -              add_to_free_list(&page[size], zone, high, migratetype);
- -              set_buddy_order(&page[size], high);
- -      }
- -}
- -
- -static void check_new_page_bad(struct page *page)
- -{
- -      if (unlikely(page->flags & __PG_HWPOISON)) {
- -              /* Don't complain about hwpoisoned pages */
- -              page_mapcount_reset(page); /* remove PageBuddy */
- -              return;
- -      }
- -
- -      bad_page(page,
- -               page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
- -}
- -
- -/*
- - * This page is about to be returned from the page allocator
- - */
- -static inline int check_new_page(struct page *page)
- -{
- -      if (likely(page_expected_state(page,
- -                              PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
- -              return 0;
- -
- -      check_new_page_bad(page);
- -      return 1;
- -}
- -
- -static bool check_new_pages(struct page *page, unsigned int order)
- -{
- -      int i;
- -      for (i = 0; i < (1 << order); i++) {
- -              struct page *p = page + i;
- -
- -              if (unlikely(check_new_page(p)))
- -                      return true;
- -      }
- -
- -      return false;
- -}
- -
- -#ifdef CONFIG_DEBUG_VM
- -/*
- - * With DEBUG_VM enabled, order-0 pages are checked for expected state when
- - * being allocated from pcp lists. With debug_pagealloc also enabled, they are
- - * also checked when pcp lists are refilled from the free lists.
- - */
- -static inline bool check_pcp_refill(struct page *page, unsigned int order)
- -{
- -      if (debug_pagealloc_enabled_static())
- -              return check_new_pages(page, order);
- -      else
- -              return false;
- -}
- -
- -static inline bool check_new_pcp(struct page *page, unsigned int order)
- -{
- -      return check_new_pages(page, order);
- -}
- -#else
- -/*
- - * With DEBUG_VM disabled, free order-0 pages are checked for expected state
- - * when pcp lists are being refilled from the free lists. With debug_pagealloc
- - * enabled, they are also checked when being allocated from the pcp lists.
- - */
- -static inline bool check_pcp_refill(struct page *page, unsigned int order)
- -{
- -      return check_new_pages(page, order);
- -}
- -static inline bool check_new_pcp(struct page *page, unsigned int order)
- -{
- -      if (debug_pagealloc_enabled_static())
- -              return check_new_pages(page, order);
- -      else
- -              return false;
- -}
- -#endif /* CONFIG_DEBUG_VM */
- -
- -static inline bool should_skip_kasan_unpoison(gfp_t flags)
+ +static inline bool should_skip_kasan_unpoison(gfp_t flags)
   {
         /* Don't skip if a software KASAN mode is enabled. */
         if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
@@@ -1648,9 -2456,9 +1648,9 @@@
   
         /*
          * With hardware tag-based KASAN enabled, skip if this has been
- -       * requested via __GFP_SKIP_KASAN_UNPOISON.
+ +       * requested via __GFP_SKIP_KASAN.
          */
- -      return flags & __GFP_SKIP_KASAN_UNPOISON;
+ +      return flags & __GFP_SKIP_KASAN;
   }
   
   static inline bool should_skip_init(gfp_t flags)
@@@ -1669,6 -2477,7 +1669,6 @@@ inline void post_alloc_hook(struct pag
         bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
                         !should_skip_init(gfp_flags);
         bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
- -      bool reset_tags = true;
         int i;
   
         set_page_private(page, 0);
@@@ -1702,22 -2511,37 +1702,22 @@@
                 /* Take note that memory was initialized by the loop above. */
                 init = false;
         }
- -      if (!should_skip_kasan_unpoison(gfp_flags)) {
- -              /* Try unpoisoning (or setting tags) and initializing memory. */
- -              if (kasan_unpoison_pages(page, order, init)) {
- -                      /* Take note that memory was initialized by KASAN. */
- -                      if (kasan_has_integrated_init())
- -                              init = false;
- -                      /* Take note that memory tags were set by KASAN. */
- -                      reset_tags = false;
- -              } else {
- -                      /*
- -                       * KASAN decided to exclude this allocation from being
- -                       * (un)poisoned due to sampling. Make KASAN skip
- -                       * poisoning when the allocation is freed.
- -                       */
- -                      SetPageSkipKASanPoison(page);
- -              }
- -      }
- -      /*
- -       * If memory tags have not been set by KASAN, reset the page tags to
- -       * ensure page_address() dereferencing does not fault.
- -       */
- -      if (reset_tags) {
+ +      if (!should_skip_kasan_unpoison(gfp_flags) &&
+ +          kasan_unpoison_pages(page, order, init)) {
+ +              /* Take note that memory was initialized by KASAN. */
+ +              if (kasan_has_integrated_init())
+ +                      init = false;
+ +      } else {
+ +              /*
+ +               * If memory tags have not been set by KASAN, reset the page
+ +               * tags to ensure page_address() dereferencing does not fault.
+ +               */
                 for (i = 0; i != 1 << order; ++i)
                         page_kasan_tag_reset(page + i);
         }
         /* If memory is still not initialized, initialize it now. */
         if (init)
                 kernel_init_pages(page, 1 << order);
- -      /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
- -      if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
- -              SetPageSkipKASanPoison(page);
   
         set_page_owner(page, order, gfp_flags);
         page_table_check_alloc(page, order);
@@@ -1756,7 -2580,7 +1756,7 @@@ struct page *__rmqueue_smallest(struct 
         struct page *page;
   
         /* Find a page of the appropriate size in the preferred list */
- -      for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+ +      for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
                 area = &(zone->free_area[current_order]);
                 page = get_page_from_free_area(area, migratetype);
                 if (!page)
@@@ -2128,7 -2952,7 +2128,7 @@@ static bool unreserve_highatomic_pagebl
                         continue;
   
                 spin_lock_irqsave(&zone->lock, flags);
- -              for (order = 0; order < MAX_ORDER; order++) {
+ +              for (order = 0; order <= MAX_ORDER; order++) {
                         struct free_area *area = &(zone->free_area[order]);
   
                         page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
@@@ -2212,7 -3036,7 +2212,7 @@@ __rmqueue_fallback(struct zone *zone, i
          * approximates finding the pageblock with the most free pages, which
          * would be too costly to do exactly.
          */
- -      for (current_order = MAX_ORDER - 1; current_order >= min_order;
+ +      for (current_order = MAX_ORDER; current_order >= min_order;
                                 --current_order) {
                 area = &(zone->free_area[current_order]);
                 fallback_mt = find_suitable_fallback(area, current_order,
@@@ -2238,7 -3062,7 +2238,7 @@@
         return false;
   
   find_smallest:
- -      for (current_order = order; current_order < MAX_ORDER;
+ +      for (current_order = order; current_order <= MAX_ORDER;
                                                         current_order++) {
                 area = &(zone->free_area[current_order]);
                 fallback_mt = find_suitable_fallback(area, current_order,
@@@ -2251,7 -3075,7 +2251,7 @@@
          * This should not happen - we already found a suitable fallback
          * when looking for the largest page.
          */
- -      VM_BUG_ON(current_order == MAX_ORDER);
+ +      VM_BUG_ON(current_order > MAX_ORDER);
   
   do_steal:
         page = get_page_from_free_area(area, fallback_mt);
@@@ -2313,7 -3137,7 +2313,7 @@@ static int rmqueue_bulk(struct zone *zo
                         int migratetype, unsigned int alloc_flags)
   {
         unsigned long flags;
- -      int i, allocated = 0;
+ +      int i;
   
         spin_lock_irqsave(&zone->lock, flags);
         for (i = 0; i < count; ++i) {
@@@ -2322,6 -3146,9 +2322,6 @@@
                 if (unlikely(page == NULL))
                         break;
   
- -              if (unlikely(check_pcp_refill(page, order)))
- -                      continue;
- -
                 /*
                  * Split buddy pages returned by expand() are received here in
                  * physical page order. The page is added to the tail of
@@@ -2333,15 -3160,21 +2333,15 @@@
                  * pages are ordered properly.
                  */
                 list_add_tail(&page->pcp_list, list);
- -              allocated++;
                 if (is_migrate_cma(get_pcppage_migratetype(page)))
                         __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
                                               -(1 << order));
         }
   
- -      /*
- -       * i pages were removed from the buddy list even if some leak due
- -       * to check_pcp_refill failing so adjust NR_FREE_PAGES based
- -       * on i. Do not confuse with 'allocated' which is the number of
- -       * pages added to the pcp list.
- -       */
         __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
         spin_unlock_irqrestore(&zone->lock, flags);
- -      return allocated;
+ +
+ +      return i;
   }
   
   #ifdef CONFIG_NUMA
@@@ -2552,7 -3385,7 +2552,7 @@@ static bool free_unref_page_prepare(str
   {
         int migratetype;
   
- -      if (!free_pcp_prepare(page, order))
+ +      if (!free_pages_prepare(page, order, FPI_NONE))
                 return false;
   
         migratetype = get_pfnblock_migratetype(page, pfn);
@@@ -2958,7 -3791,7 +2958,7 @@@ struct page *__rmqueue_pcplist(struct z
                 page = list_first_entry(list, struct page, pcp_list);
                 list_del(&page->pcp_list);
                 pcp->count -= 1 << order;
- -      } while (check_new_pcp(page, order));
+ +      } while (check_new_pages(page, order));
   
         return page;
   }
@@@ -3212,7 -4045,7 +3212,7 @@@ bool __zone_watermark_ok(struct zone *z
                 return true;
   
         /* For a high-order request, check at least one suitable page is free */
- -      for (o = order; o < MAX_ORDER; o++) {
+ +      for (o = order; o <= MAX_ORDER; o++) {
                 struct free_area *area = &z->free_area[o];
                 int mt;
   
@@@ -4732,7 -5565,7 +4732,7 @@@ struct page *__alloc_pages(gfp_t gfp, u
          * There are several places where we assume that the order value is sane
          * so bail out early if the request is out of bound.
          */
- -      if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp))
+ +      if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
                 return NULL;
   
         gfp &= gfp_allowed_mask;
@@@ -4815,7 -5648,7 +4815,7 @@@ EXPORT_SYMBOL(__get_free_pages)
   
   unsigned long get_zeroed_page(gfp_t gfp_mask)
   {
- -      return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
+ +      return __get_free_page(gfp_mask | __GFP_ZERO);
   }
   EXPORT_SYMBOL(get_zeroed_page);
   
@@@ -5246,6 -6079,8 +5246,6 @@@ static bool show_mem_node_skip(unsigne
         return !node_isset(nid, *nodemask);
   }
   
- -#define K(x) ((x) << (PAGE_SHIFT-10))
- -
   static void show_migration_types(unsigned char type)
   {
         static const char types[MIGRATE_TYPES] = {
@@@ -5460,8 -6295,8 +5460,8 @@@ void __show_free_areas(unsigned int fil
   
         for_each_populated_zone(zone) {
                 unsigned int order;
- -              unsigned long nr[MAX_ORDER], flags, total = 0;
- -              unsigned char types[MAX_ORDER];
+ +              unsigned long nr[MAX_ORDER + 1], flags, total = 0;
+ +              unsigned char types[MAX_ORDER + 1];
   
                 if (zone_idx(zone) > max_zone_idx)
                         continue;
@@@ -5471,7 -6306,7 +5471,7 @@@
                 printk(KERN_CONT "%s: ", zone->name);
   
                 spin_lock_irqsave(&zone->lock, flags);
- -              for (order = 0; order < MAX_ORDER; order++) {
+ +              for (order = 0; order <= MAX_ORDER; order++) {
                         struct free_area *area = &zone->free_area[order];
                         int type;
   
@@@ -5485,7 -6320,7 +5485,7 @@@
                         }
                 }
                 spin_unlock_irqrestore(&zone->lock, flags);
- -              for (order = 0; order < MAX_ORDER; order++) {
+ +              for (order = 0; order <= MAX_ORDER; order++) {
                         printk(KERN_CONT "%lu*%lukB ",
                                nr[order], K(1UL) << order);
                         if (nr[order])
@@@ -5790,13 -6625,28 +5790,27 @@@ static void per_cpu_pages_init(struct p
   #define BOOT_PAGESET_BATCH    1
   static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
   static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
- -static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
   
   static void __build_all_zonelists(void *data)
   {
         int nid;
         int __maybe_unused cpu;
         pg_data_t *self = data;
+       unsigned long flags;
   
+       /*
+        * Explicitly disable this CPU's interrupts before taking seqlock
+        * to prevent any IRQ handler from calling into the page allocator
+        * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
+        */
+       local_irq_save(flags);
+       /*
+        * Explicitly disable this CPU's synchronous printk() before taking
+        * seqlock to prevent any printk() from trying to hold port->lock, for
+        * tty_insert_flip_string_and_push_buffer() on other CPU might be
+        * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
+        */
+       printk_deferred_enter();
         write_seqlock(&zonelist_update_seq);
   
   #ifdef CONFIG_NUMA
@@@ -5835,6 -6685,8 +5849,8 @@@
         }
   
         write_sequnlock(&zonelist_update_seq);
+       printk_deferred_exit();
+       local_irq_restore(flags);
   }
   
   static noinline void __init
@@@ -5903,10 -6755,370 +5919,10 @@@ void __ref build_all_zonelists(pg_data_
   #endif
   }
   
- -/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
- -static bool __meminit
- -overlap_memmap_init(unsigned long zone, unsigned long *pfn)
+ +static int zone_batchsize(struct zone *zone)
   {
- -      static struct memblock_region *r;
- -
- -      if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
- -              if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
- -                      for_each_mem_region(r) {
- -                              if (*pfn < memblock_region_memory_end_pfn(r))
- -                                      break;
- -                      }
- -              }
- -              if (*pfn >= memblock_region_memory_base_pfn(r) &&
- -                  memblock_is_mirror(r)) {
- -                      *pfn = memblock_region_memory_end_pfn(r);
- -                      return true;
- -              }
- -      }
- -      return false;
- -}
- -
- -/*
- - * Initially all pages are reserved - free ones are freed
- - * up by memblock_free_all() once the early boot process is
- - * done. Non-atomic initialization, single-pass.
- - *
- - * All aligned pageblocks are initialized to the specified migratetype
- - * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
- - * zone stats (e.g., nr_isolate_pageblock) are touched.
- - */
- -void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
- -              unsigned long start_pfn, unsigned long zone_end_pfn,
- -              enum meminit_context context,
- -              struct vmem_altmap *altmap, int migratetype)
- -{
- -      unsigned long pfn, end_pfn = start_pfn + size;
- -      struct page *page;
- -
- -      if (highest_memmap_pfn < end_pfn - 1)
- -              highest_memmap_pfn = end_pfn - 1;
- -
- -#ifdef CONFIG_ZONE_DEVICE
- -      /*
- -       * Honor reservation requested by the driver for this ZONE_DEVICE
- -       * memory. We limit the total number of pages to initialize to just
- -       * those that might contain the memory mapping. We will defer the
- -       * ZONE_DEVICE page initialization until after we have released
- -       * the hotplug lock.
- -       */
- -      if (zone == ZONE_DEVICE) {
- -              if (!altmap)
- -                      return;
- -
- -              if (start_pfn == altmap->base_pfn)
- -                      start_pfn += altmap->reserve;
- -              end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
- -      }
- -#endif
- -
- -      for (pfn = start_pfn; pfn < end_pfn; ) {
- -              /*
- -               * There can be holes in boot-time mem_map[]s handed to this
- -               * function.  They do not exist on hotplugged memory.
- -               */
- -              if (context == MEMINIT_EARLY) {
- -                      if (overlap_memmap_init(zone, &pfn))
- -                              continue;
- -                      if (defer_init(nid, pfn, zone_end_pfn)) {
- -                              deferred_struct_pages = true;
- -                              break;
- -                      }
- -              }
- -
- -              page = pfn_to_page(pfn);
- -              __init_single_page(page, pfn, zone, nid);
- -              if (context == MEMINIT_HOTPLUG)
- -                      __SetPageReserved(page);
- -
- -              /*
- -               * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
- -               * such that unmovable allocations won't be scattered all
- -               * over the place during system boot.
- -               */
- -              if (pageblock_aligned(pfn)) {
- -                      set_pageblock_migratetype(page, migratetype);
- -                      cond_resched();
- -              }
- -              pfn++;
- -      }
- -}
- -
- -#ifdef CONFIG_ZONE_DEVICE
- -static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
- -                                        unsigned long zone_idx, int nid,
- -                                        struct dev_pagemap *pgmap)
- -{
- -
- -      __init_single_page(page, pfn, zone_idx, nid);
- -
- -      /*
- -       * Mark page reserved as it will need to wait for onlining
- -       * phase for it to be fully associated with a zone.
- -       *
- -       * We can use the non-atomic __set_bit operation for setting
- -       * the flag as we are still initializing the pages.
- -       */
- -      __SetPageReserved(page);
- -
- -      /*
- -       * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
- -       * and zone_device_data.  It is a bug if a ZONE_DEVICE page is
- -       * ever freed or placed on a driver-private list.
- -       */
- -      page->pgmap = pgmap;
- -      page->zone_device_data = NULL;
- -
- -      /*
- -       * Mark the block movable so that blocks are reserved for
- -       * movable at startup. This will force kernel allocations
- -       * to reserve their blocks rather than leaking throughout
- -       * the address space during boot when many long-lived
- -       * kernel allocations are made.
- -       *
- -       * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
- -       * because this is done early in section_activate()
- -       */
- -      if (pageblock_aligned(pfn)) {
- -              set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- -              cond_resched();
- -      }
- -
- -      /*
- -       * ZONE_DEVICE pages are released directly to the driver page allocator
- -       * which will set the page count to 1 when allocating the page.
- -       */
- -      if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
- -          pgmap->type == MEMORY_DEVICE_COHERENT)
- -              set_page_count(page, 0);
- -}
- -
- -/*
- - * With compound page geometry and when struct pages are stored in ram most
- - * tail pages are reused. Consequently, the amount of unique struct pages to
- - * initialize is a lot smaller that the total amount of struct pages being
- - * mapped. This is a paired / mild layering violation with explicit knowledge
- - * of how the sparse_vmemmap internals handle compound pages in the lack
- - * of an altmap. See vmemmap_populate_compound_pages().
- - */
- -static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
- -                                            unsigned long nr_pages)
- -{
- -      return is_power_of_2(sizeof(struct page)) &&
- -              !altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
- -}
- -
- -static void __ref memmap_init_compound(struct page *head,
- -                                     unsigned long head_pfn,
- -                                     unsigned long zone_idx, int nid,
- -                                     struct dev_pagemap *pgmap,
- -                                     unsigned long nr_pages)
- -{
- -      unsigned long pfn, end_pfn = head_pfn + nr_pages;
- -      unsigned int order = pgmap->vmemmap_shift;
- -
- -      __SetPageHead(head);
- -      for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
- -              struct page *page = pfn_to_page(pfn);
- -
- -              __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
- -              prep_compound_tail(head, pfn - head_pfn);
- -              set_page_count(page, 0);
- -
- -              /*
- -               * The first tail page stores important compound page info.
- -               * Call prep_compound_head() after the first tail page has
- -               * been initialized, to not have the data overwritten.
- -               */
- -              if (pfn == head_pfn + 1)
- -                      prep_compound_head(head, order);
- -      }
- -}
- -
- -void __ref memmap_init_zone_device(struct zone *zone,
- -                                 unsigned long start_pfn,
- -                                 unsigned long nr_pages,
- -                                 struct dev_pagemap *pgmap)
- -{
- -      unsigned long pfn, end_pfn = start_pfn + nr_pages;
- -      struct pglist_data *pgdat = zone->zone_pgdat;
- -      struct vmem_altmap *altmap = pgmap_altmap(pgmap);
- -      unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
- -      unsigned long zone_idx = zone_idx(zone);
- -      unsigned long start = jiffies;
- -      int nid = pgdat->node_id;
- -
- -      if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
- -              return;
- -
- -      /*
- -       * The call to memmap_init should have already taken care
- -       * of the pages reserved for the memmap, so we can just jump to
- -       * the end of that region and start processing the device pages.
- -       */
- -      if (altmap) {
- -              start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
- -              nr_pages = end_pfn - start_pfn;
- -      }
- -
- -      for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
- -              struct page *page = pfn_to_page(pfn);
- -
- -              __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
- -
- -              if (pfns_per_compound == 1)
- -                      continue;
- -
- -              memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
- -                                   compound_nr_pages(altmap, pfns_per_compound));
- -      }
- -
- -      pr_info("%s initialised %lu pages in %ums\n", __func__,
- -              nr_pages, jiffies_to_msecs(jiffies - start));
- -}
- -
- -#endif
- -static void __meminit zone_init_free_lists(struct zone *zone)
- -{
- -      unsigned int order, t;
- -      for_each_migratetype_order(order, t) {
- -              INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
- -              zone->free_area[order].nr_free = 0;
- -      }
- -}
- -
- -/*
- - * Only struct pages that correspond to ranges defined by memblock.memory
- - * are zeroed and initialized by going through __init_single_page() during
- - * memmap_init_zone_range().
- - *
- - * But, there could be struct pages that correspond to holes in
- - * memblock.memory. This can happen because of the following reasons:
- - * - physical memory bank size is not necessarily the exact multiple of the
- - *   arbitrary section size
- - * - early reserved memory may not be listed in memblock.memory
- - * - memory layouts defined with memmap= kernel parameter may not align
- - *   nicely with memmap sections
- - *
- - * Explicitly initialize those struct pages so that:
- - * - PG_Reserved is set
- - * - zone and node links point to zone and node that span the page if the
- - *   hole is in the middle of a zone
- - * - zone and node links point to adjacent zone/node if the hole falls on
- - *   the zone boundary; the pages in such holes will be prepended to the
- - *   zone/node above the hole except for the trailing pages in the last
- - *   section that will be appended to the zone/node below.
- - */
- -static void __init init_unavailable_range(unsigned long spfn,
- -                                        unsigned long epfn,
- -                                        int zone, int node)
- -{
- -      unsigned long pfn;
- -      u64 pgcnt = 0;
- -
- -      for (pfn = spfn; pfn < epfn; pfn++) {
- -              if (!pfn_valid(pageblock_start_pfn(pfn))) {
- -                      pfn = pageblock_end_pfn(pfn) - 1;
- -                      continue;
- -              }
- -              __init_single_page(pfn_to_page(pfn), pfn, zone, node);
- -              __SetPageReserved(pfn_to_page(pfn));
- -              pgcnt++;
- -      }
- -
- -      if (pgcnt)
- -              pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
- -                      node, zone_names[zone], pgcnt);
- -}
- -
- -static void __init memmap_init_zone_range(struct zone *zone,
- -                                        unsigned long start_pfn,
- -                                        unsigned long end_pfn,
- -                                        unsigned long *hole_pfn)
- -{
- -      unsigned long zone_start_pfn = zone->zone_start_pfn;
- -      unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
- -      int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
- -
- -      start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
- -      end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
- -
- -      if (start_pfn >= end_pfn)
- -              return;
- -
- -      memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
- -                        zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
- -
- -      if (*hole_pfn < start_pfn)
- -              init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
- -
- -      *hole_pfn = end_pfn;
- -}
- -
- -static void __init memmap_init(void)
- -{
- -      unsigned long start_pfn, end_pfn;
- -      unsigned long hole_pfn = 0;
- -      int i, j, zone_id = 0, nid;
- -
- -      for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
- -              struct pglist_data *node = NODE_DATA(nid);
- -
- -              for (j = 0; j < MAX_NR_ZONES; j++) {
- -                      struct zone *zone = node->node_zones + j;
- -
- -                      if (!populated_zone(zone))
- -                              continue;
- -
- -                      memmap_init_zone_range(zone, start_pfn, end_pfn,
- -                                             &hole_pfn);
- -                      zone_id = j;
- -              }
- -      }
- -
- -#ifdef CONFIG_SPARSEMEM
- -      /*
- -       * Initialize the memory map for hole in the range [memory_end,
- -       * section_end].
- -       * Append the pages in this hole to the highest zone in the last
- -       * node.
- -       * The call to init_unavailable_range() is outside the ifdef to
- -       * silence the compiler warining about zone_id set but not used;
- -       * for FLATMEM it is a nop anyway
- -       */
- -      end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
- -      if (hole_pfn < end_pfn)
- -#endif
- -              init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
- -}
- -
- -void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
- -                        phys_addr_t min_addr, int nid, bool exact_nid)
- -{
- -      void *ptr;
- -
- -      if (exact_nid)
- -              ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
- -                                                 MEMBLOCK_ALLOC_ACCESSIBLE,
- -                                                 nid);
- -      else
- -              ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
- -                                               MEMBLOCK_ALLOC_ACCESSIBLE,
- -                                               nid);
- -
- -      if (ptr && size > 0)
- -              page_init_poison(ptr, size);
- -
- -      return ptr;
- -}
- -
- -static int zone_batchsize(struct zone *zone)
- -{
- -#ifdef CONFIG_MMU
- -      int batch;
+ +#ifdef CONFIG_MMU
+ +      int batch;
   
         /*
          * The number of pages to batch allocate is either ~0.1%
@@@ -5962,201 -7174,1343 +5978,201 @@@ static int zone_highsize(struct zone *z
                 /*
                  * By default, the high value of the pcp is based on the zone
                  * low watermark so that if they are full then background
- -               * reclaim will not be started prematurely.
- -               */
- -              total_pages = low_wmark_pages(zone);
- -      } else {
- -              /*
- -               * If percpu_pagelist_high_fraction is configured, the high
- -               * value is based on a fraction of the managed pages in the
- -               * zone.
- -               */
- -              total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
- -      }
- -
- -      /*
- -       * Split the high value across all online CPUs local to the zone. Note
- -       * that early in boot that CPUs may not be online yet and that during
- -       * CPU hotplug that the cpumask is not yet updated when a CPU is being
- -       * onlined. For memory nodes that have no CPUs, split pcp->high across
- -       * all online CPUs to mitigate the risk that reclaim is triggered
- -       * prematurely due to pages stored on pcp lists.
- -       */
- -      nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
- -      if (!nr_split_cpus)
- -              nr_split_cpus = num_online_cpus();
- -      high = total_pages / nr_split_cpus;
- -
- -      /*
- -       * Ensure high is at least batch*4. The multiple is based on the
- -       * historical relationship between high and batch.
- -       */
- -      high = max(high, batch << 2);
- -
- -      return high;
- -#else
- -      return 0;
- -#endif
- -}
- -
- -/*
- - * pcp->high and pcp->batch values are related and generally batch is lower
- - * than high. They are also related to pcp->count such that count is lower
- - * than high, and as soon as it reaches high, the pcplist is flushed.
- - *
- - * However, guaranteeing these relations at all times would require e.g. write
- - * barriers here but also careful usage of read barriers at the read side, and
- - * thus be prone to error and bad for performance. Thus the update only prevents
- - * store tearing. Any new users of pcp->batch and pcp->high should ensure they
- - * can cope with those fields changing asynchronously, and fully trust only the
- - * pcp->count field on the local CPU with interrupts disabled.
- - *
- - * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
- - * outside of boot time (or some other assurance that no concurrent updaters
- - * exist).
- - */
- -static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
- -              unsigned long batch)
- -{
- -      WRITE_ONCE(pcp->batch, batch);
- -      WRITE_ONCE(pcp->high, high);
- -}
- -
- -static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
- -{
- -      int pindex;
- -
- -      memset(pcp, 0, sizeof(*pcp));
- -      memset(pzstats, 0, sizeof(*pzstats));
- -
- -      spin_lock_init(&pcp->lock);
- -      for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
- -              INIT_LIST_HEAD(&pcp->lists[pindex]);
- -
- -      /*
- -       * Set batch and high values safe for a boot pageset. A true percpu
- -       * pageset's initialization will update them subsequently. Here we don't
- -       * need to be as careful as pageset_update() as nobody can access the
- -       * pageset yet.
- -       */
- -      pcp->high = BOOT_PAGESET_HIGH;
- -      pcp->batch = BOOT_PAGESET_BATCH;
- -      pcp->free_factor = 0;
- -}
- -
- -static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
- -              unsigned long batch)
- -{
- -      struct per_cpu_pages *pcp;
- -      int cpu;
- -
- -      for_each_possible_cpu(cpu) {
- -              pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- -              pageset_update(pcp, high, batch);
- -      }
- -}
- -
- -/*
- - * Calculate and set new high and batch values for all per-cpu pagesets of a
- - * zone based on the zone's size.
- - */
- -static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
- -{
- -      int new_high, new_batch;
- -
- -      new_batch = max(1, zone_batchsize(zone));
- -      new_high = zone_highsize(zone, new_batch, cpu_online);
- -
- -      if (zone->pageset_high == new_high &&
- -          zone->pageset_batch == new_batch)
- -              return;
- -
- -      zone->pageset_high = new_high;
- -      zone->pageset_batch = new_batch;
- -
- -      __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
- -}
- -
- -void __meminit setup_zone_pageset(struct zone *zone)
- -{
- -      int cpu;
- -
- -      /* Size may be 0 on !SMP && !NUMA */
- -      if (sizeof(struct per_cpu_zonestat) > 0)
- -              zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
- -
- -      zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
- -      for_each_possible_cpu(cpu) {
- -              struct per_cpu_pages *pcp;
- -              struct per_cpu_zonestat *pzstats;
- -
- -              pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- -              pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
- -              per_cpu_pages_init(pcp, pzstats);
- -      }
- -
- -      zone_set_pageset_high_and_batch(zone, 0);
- -}
- -
- -/*
- - * The zone indicated has a new number of managed_pages; batch sizes and percpu
- - * page high values need to be recalculated.
- - */
- -static void zone_pcp_update(struct zone *zone, int cpu_online)
- -{
- -      mutex_lock(&pcp_batch_high_lock);
- -      zone_set_pageset_high_and_batch(zone, cpu_online);
- -      mutex_unlock(&pcp_batch_high_lock);
- -}
- -
- -/*
- - * Allocate per cpu pagesets and initialize them.
- - * Before this call only boot pagesets were available.
- - */
- -void __init setup_per_cpu_pageset(void)
- -{
- -      struct pglist_data *pgdat;
- -      struct zone *zone;
- -      int __maybe_unused cpu;
- -
- -      for_each_populated_zone(zone)
- -              setup_zone_pageset(zone);
- -
- -#ifdef CONFIG_NUMA
- -      /*
- -       * Unpopulated zones continue using the boot pagesets.
- -       * The numa stats for these pagesets need to be reset.
- -       * Otherwise, they will end up skewing the stats of
- -       * the nodes these zones are associated with.
- -       */
- -      for_each_possible_cpu(cpu) {
- -              struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
- -              memset(pzstats->vm_numa_event, 0,
- -                     sizeof(pzstats->vm_numa_event));
- -      }
- -#endif
- -
- -      for_each_online_pgdat(pgdat)
- -              pgdat->per_cpu_nodestats =
- -                      alloc_percpu(struct per_cpu_nodestat);
- -}
- -
- -static __meminit void zone_pcp_init(struct zone *zone)
- -{
- -      /*
- -       * per cpu subsystem is not up at this point. The following code
- -       * relies on the ability of the linker to provide the
- -       * offset of a (static) per cpu variable into the per cpu area.
- -       */
- -      zone->per_cpu_pageset = &boot_pageset;
- -      zone->per_cpu_zonestats = &boot_zonestats;
- -      zone->pageset_high = BOOT_PAGESET_HIGH;
- -      zone->pageset_batch = BOOT_PAGESET_BATCH;
- -
- -      if (populated_zone(zone))
- -              pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
- -                       zone->present_pages, zone_batchsize(zone));
- -}
- -
- -void __meminit init_currently_empty_zone(struct zone *zone,
- -                                      unsigned long zone_start_pfn,
- -                                      unsigned long size)
- -{
- -      struct pglist_data *pgdat = zone->zone_pgdat;
- -      int zone_idx = zone_idx(zone) + 1;
- -
- -      if (zone_idx > pgdat->nr_zones)
- -              pgdat->nr_zones = zone_idx;
- -
- -      zone->zone_start_pfn = zone_start_pfn;
- -
- -      mminit_dprintk(MMINIT_TRACE, "memmap_init",
- -                      "Initialising map node %d zone %lu pfns %lu -> %lu\n",
- -                      pgdat->node_id,
- -                      (unsigned long)zone_idx(zone),
- -                      zone_start_pfn, (zone_start_pfn + size));
- -
- -      zone_init_free_lists(zone);
- -      zone->initialized = 1;
- -}
- -
- -/**
- - * get_pfn_range_for_nid - Return the start and end page frames for a node
- - * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
- - * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
- - * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
- - *
- - * It returns the start and end page frame of a node based on information
- - * provided by memblock_set_node(). If called for a node
- - * with no available memory, a warning is printed and the start and end
- - * PFNs will be 0.
- - */
- -void __init get_pfn_range_for_nid(unsigned int nid,
- -                      unsigned long *start_pfn, unsigned long *end_pfn)
- -{
- -      unsigned long this_start_pfn, this_end_pfn;
- -      int i;
- -
- -      *start_pfn = -1UL;
- -      *end_pfn = 0;
- -
- -      for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
- -              *start_pfn = min(*start_pfn, this_start_pfn);
- -              *end_pfn = max(*end_pfn, this_end_pfn);
- -      }
- -
- -      if (*start_pfn == -1UL)
- -              *start_pfn = 0;
- -}
- -
- -/*
- - * This finds a zone that can be used for ZONE_MOVABLE pages. The
- - * assumption is made that zones within a node are ordered in monotonic
- - * increasing memory addresses so that the "highest" populated zone is used
- - */
- -static void __init find_usable_zone_for_movable(void)
- -{
- -      int zone_index;
- -      for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
- -              if (zone_index == ZONE_MOVABLE)
- -                      continue;
- -
- -              if (arch_zone_highest_possible_pfn[zone_index] >
- -                              arch_zone_lowest_possible_pfn[zone_index])
- -                      break;
- -      }
- -
- -      VM_BUG_ON(zone_index == -1);
- -      movable_zone = zone_index;
- -}
- -
- -/*
- - * The zone ranges provided by the architecture do not include ZONE_MOVABLE
- - * because it is sized independent of architecture. Unlike the other zones,
- - * the starting point for ZONE_MOVABLE is not fixed. It may be different
- - * in each node depending on the size of each node and how evenly kernelcore
- - * is distributed. This helper function adjusts the zone ranges
- - * provided by the architecture for a given node by using the end of the
- - * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
- - * zones within a node are in order of monotonic increases memory addresses
- - */
- -static void __init adjust_zone_range_for_zone_movable(int nid,
- -                                      unsigned long zone_type,
- -                                      unsigned long node_start_pfn,
- -                                      unsigned long node_end_pfn,
- -                                      unsigned long *zone_start_pfn,
- -                                      unsigned long *zone_end_pfn)
- -{
- -      /* Only adjust if ZONE_MOVABLE is on this node */
- -      if (zone_movable_pfn[nid]) {
- -              /* Size ZONE_MOVABLE */
- -              if (zone_type == ZONE_MOVABLE) {
- -                      *zone_start_pfn = zone_movable_pfn[nid];
- -                      *zone_end_pfn = min(node_end_pfn,
- -                              arch_zone_highest_possible_pfn[movable_zone]);
- -
- -              /* Adjust for ZONE_MOVABLE starting within this range */
- -              } else if (!mirrored_kernelcore &&
- -                      *zone_start_pfn < zone_movable_pfn[nid] &&
- -                      *zone_end_pfn > zone_movable_pfn[nid]) {
- -                      *zone_end_pfn = zone_movable_pfn[nid];
- -
- -              /* Check if this whole range is within ZONE_MOVABLE */
- -              } else if (*zone_start_pfn >= zone_movable_pfn[nid])
- -                      *zone_start_pfn = *zone_end_pfn;
- -      }
- -}
- -
- -/*
- - * Return the number of pages a zone spans in a node, including holes
- - * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
- - */
- -static unsigned long __init zone_spanned_pages_in_node(int nid,
- -                                      unsigned long zone_type,
- -                                      unsigned long node_start_pfn,
- -                                      unsigned long node_end_pfn,
- -                                      unsigned long *zone_start_pfn,
- -                                      unsigned long *zone_end_pfn)
- -{
- -      unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
- -      unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
- -      /* When hotadd a new node from cpu_up(), the node should be empty */
- -      if (!node_start_pfn && !node_end_pfn)
- -              return 0;
- -
- -      /* Get the start and end of the zone */
- -      *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
- -      *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
- -      adjust_zone_range_for_zone_movable(nid, zone_type,
- -                              node_start_pfn, node_end_pfn,
- -                              zone_start_pfn, zone_end_pfn);
- -
- -      /* Check that this node has pages within the zone's required range */
- -      if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
- -              return 0;
- -
- -      /* Move the zone boundaries inside the node if necessary */
- -      *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
- -      *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
- -
- -      /* Return the spanned pages */
- -      return *zone_end_pfn - *zone_start_pfn;
- -}
- -
- -/*
- - * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
- - * then all holes in the requested range will be accounted for.
- - */
- -unsigned long __init __absent_pages_in_range(int nid,
- -                              unsigned long range_start_pfn,
- -                              unsigned long range_end_pfn)
- -{
- -      unsigned long nr_absent = range_end_pfn - range_start_pfn;
- -      unsigned long start_pfn, end_pfn;
- -      int i;
- -
- -      for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- -              start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
- -              end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
- -              nr_absent -= end_pfn - start_pfn;
- -      }
- -      return nr_absent;
- -}
- -
- -/**
- - * absent_pages_in_range - Return number of page frames in holes within a range
- - * @start_pfn: The start PFN to start searching for holes
- - * @end_pfn: The end PFN to stop searching for holes
- - *
- - * Return: the number of pages frames in memory holes within a range.
- - */
- -unsigned long __init absent_pages_in_range(unsigned long start_pfn,
- -                                                      unsigned long end_pfn)
- -{
- -      return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
- -}
- -
- -/* Return the number of page frames in holes in a zone on a node */
- -static unsigned long __init zone_absent_pages_in_node(int nid,
- -                                      unsigned long zone_type,
- -                                      unsigned long node_start_pfn,
- -                                      unsigned long node_end_pfn)
- -{
- -      unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
- -      unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
- -      unsigned long zone_start_pfn, zone_end_pfn;
- -      unsigned long nr_absent;
- -
- -      /* When hotadd a new node from cpu_up(), the node should be empty */
- -      if (!node_start_pfn && !node_end_pfn)
- -              return 0;
- -
- -      zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
- -      zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
- -
- -      adjust_zone_range_for_zone_movable(nid, zone_type,
- -                      node_start_pfn, node_end_pfn,
- -                      &zone_start_pfn, &zone_end_pfn);
- -      nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
- -
- -      /*
- -       * ZONE_MOVABLE handling.
- -       * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
- -       * and vice versa.
- -       */
- -      if (mirrored_kernelcore && zone_movable_pfn[nid]) {
- -              unsigned long start_pfn, end_pfn;
- -              struct memblock_region *r;
- -
- -              for_each_mem_region(r) {
- -                      start_pfn = clamp(memblock_region_memory_base_pfn(r),
- -                                        zone_start_pfn, zone_end_pfn);
- -                      end_pfn = clamp(memblock_region_memory_end_pfn(r),
- -                                      zone_start_pfn, zone_end_pfn);
- -
- -                      if (zone_type == ZONE_MOVABLE &&
- -                          memblock_is_mirror(r))
- -                              nr_absent += end_pfn - start_pfn;
- -
- -                      if (zone_type == ZONE_NORMAL &&
- -                          !memblock_is_mirror(r))
- -                              nr_absent += end_pfn - start_pfn;
- -              }
- -      }
- -
- -      return nr_absent;
- -}
- -
- -static void __init calculate_node_totalpages(struct pglist_data *pgdat,
- -                                              unsigned long node_start_pfn,
- -                                              unsigned long node_end_pfn)
- -{
- -      unsigned long realtotalpages = 0, totalpages = 0;
- -      enum zone_type i;
- -
- -      for (i = 0; i < MAX_NR_ZONES; i++) {
- -              struct zone *zone = pgdat->node_zones + i;
- -              unsigned long zone_start_pfn, zone_end_pfn;
- -              unsigned long spanned, absent;
- -              unsigned long size, real_size;
- -
- -              spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
- -                                                   node_start_pfn,
- -                                                   node_end_pfn,
- -                                                   &zone_start_pfn,
- -                                                   &zone_end_pfn);
- -              absent = zone_absent_pages_in_node(pgdat->node_id, i,
- -                                                 node_start_pfn,
- -                                                 node_end_pfn);
- -
- -              size = spanned;
- -              real_size = size - absent;
- -
- -              if (size)
- -                      zone->zone_start_pfn = zone_start_pfn;
- -              else
- -                      zone->zone_start_pfn = 0;
- -              zone->spanned_pages = size;
- -              zone->present_pages = real_size;
- -#if defined(CONFIG_MEMORY_HOTPLUG)
- -              zone->present_early_pages = real_size;
- -#endif
- -
- -              totalpages += size;
- -              realtotalpages += real_size;
- -      }
- -
- -      pgdat->node_spanned_pages = totalpages;
- -      pgdat->node_present_pages = realtotalpages;
- -      pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
- -}
- -
- -#ifndef CONFIG_SPARSEMEM
- -/*
- - * Calculate the size of the zone->blockflags rounded to an unsigned long
- - * Start by making sure zonesize is a multiple of pageblock_order by rounding
- - * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
- - * round what is now in bits to nearest long in bits, then return it in
- - * bytes.
- - */
- -static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
- -{
- -      unsigned long usemapsize;
- -
- -      zonesize += zone_start_pfn & (pageblock_nr_pages-1);
- -      usemapsize = roundup(zonesize, pageblock_nr_pages);
- -      usemapsize = usemapsize >> pageblock_order;
- -      usemapsize *= NR_PAGEBLOCK_BITS;
- -      usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
- -
- -      return usemapsize / 8;
- -}
- -
- -static void __ref setup_usemap(struct zone *zone)
- -{
- -      unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
- -                                             zone->spanned_pages);
- -      zone->pageblock_flags = NULL;
- -      if (usemapsize) {
- -              zone->pageblock_flags =
- -                      memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
- -                                          zone_to_nid(zone));
- -              if (!zone->pageblock_flags)
- -                      panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
- -                            usemapsize, zone->name, zone_to_nid(zone));
- -      }
- -}
- -#else
- -static inline void setup_usemap(struct zone *zone) {}
- -#endif /* CONFIG_SPARSEMEM */
- -
- -#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
- -
- -/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
- -void __init set_pageblock_order(void)
- -{
- -      unsigned int order = MAX_ORDER - 1;
- -
- -      /* Check that pageblock_nr_pages has not already been setup */
- -      if (pageblock_order)
- -              return;
- -
- -      /* Don't let pageblocks exceed the maximum allocation granularity. */
- -      if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
- -              order = HUGETLB_PAGE_ORDER;
- -
- -      /*
- -       * Assume the largest contiguous order of interest is a huge page.
- -       * This value may be variable depending on boot parameters on IA64 and
- -       * powerpc.
- -       */
- -      pageblock_order = order;
- -}
- -#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
- -
- -/*
- - * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
- - * is unused as pageblock_order is set at compile-time. See
- - * include/linux/pageblock-flags.h for the values of pageblock_order based on
- - * the kernel config
- - */
- -void __init set_pageblock_order(void)
- -{
- -}
- -
- -#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
- -
- -static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
- -                                              unsigned long present_pages)
- -{
- -      unsigned long pages = spanned_pages;
- -
- -      /*
- -       * Provide a more accurate estimation if there are holes within
- -       * the zone and SPARSEMEM is in use. If there are holes within the
- -       * zone, each populated memory region may cost us one or two extra
- -       * memmap pages due to alignment because memmap pages for each
- -       * populated regions may not be naturally aligned on page boundary.
- -       * So the (present_pages >> 4) heuristic is a tradeoff for that.
- -       */
- -      if (spanned_pages > present_pages + (present_pages >> 4) &&
- -          IS_ENABLED(CONFIG_SPARSEMEM))
- -              pages = present_pages;
- -
- -      return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
- -}
- -
- -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- -static void pgdat_init_split_queue(struct pglist_data *pgdat)
- -{
- -      struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
- -
- -      spin_lock_init(&ds_queue->split_queue_lock);
- -      INIT_LIST_HEAD(&ds_queue->split_queue);
- -      ds_queue->split_queue_len = 0;
- -}
- -#else
- -static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
- -#endif
- -
- -#ifdef CONFIG_COMPACTION
- -static void pgdat_init_kcompactd(struct pglist_data *pgdat)
- -{
- -      init_waitqueue_head(&pgdat->kcompactd_wait);
- -}
- -#else
- -static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
- -#endif
- -
- -static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
- -{
- -      int i;
- -
- -      pgdat_resize_init(pgdat);
- -      pgdat_kswapd_lock_init(pgdat);
- -
- -      pgdat_init_split_queue(pgdat);
- -      pgdat_init_kcompactd(pgdat);
- -
- -      init_waitqueue_head(&pgdat->kswapd_wait);
- -      init_waitqueue_head(&pgdat->pfmemalloc_wait);
- -
- -      for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
- -              init_waitqueue_head(&pgdat->reclaim_wait[i]);
- -
- -      pgdat_page_ext_init(pgdat);
- -      lruvec_init(&pgdat->__lruvec);
- -}
- -
- -static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
- -                                                      unsigned long remaining_pages)
- -{
- -      atomic_long_set(&zone->managed_pages, remaining_pages);
- -      zone_set_nid(zone, nid);
- -      zone->name = zone_names[idx];
- -      zone->zone_pgdat = NODE_DATA(nid);
- -      spin_lock_init(&zone->lock);
- -      zone_seqlock_init(zone);
- -      zone_pcp_init(zone);
- -}
- -
- -/*
- - * Set up the zone data structures
- - * - init pgdat internals
- - * - init all zones belonging to this node
- - *
- - * NOTE: this function is only called during memory hotplug
- - */
- -#ifdef CONFIG_MEMORY_HOTPLUG
- -void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
- -{
- -      int nid = pgdat->node_id;
- -      enum zone_type z;
- -      int cpu;
- -
- -      pgdat_init_internals(pgdat);
- -
- -      if (pgdat->per_cpu_nodestats == &boot_nodestats)
- -              pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
- -
- -      /*
- -       * Reset the nr_zones, order and highest_zoneidx before reuse.
- -       * Note that kswapd will init kswapd_highest_zoneidx properly
- -       * when it starts in the near future.
- -       */
- -      pgdat->nr_zones = 0;
- -      pgdat->kswapd_order = 0;
- -      pgdat->kswapd_highest_zoneidx = 0;
- -      pgdat->node_start_pfn = 0;
- -      for_each_online_cpu(cpu) {
- -              struct per_cpu_nodestat *p;
- -
- -              p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
- -              memset(p, 0, sizeof(*p));
- -      }
- -
- -      for (z = 0; z < MAX_NR_ZONES; z++)
- -              zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
- -}
- -#endif
- -
- -/*
- - * Set up the zone data structures:
- - *   - mark all pages reserved
- - *   - mark all memory queues empty
- - *   - clear the memory bitmaps
- - *
- - * NOTE: pgdat should get zeroed by caller.
- - * NOTE: this function is only called during early init.
- - */
- -static void __init free_area_init_core(struct pglist_data *pgdat)
- -{
- -      enum zone_type j;
- -      int nid = pgdat->node_id;
- -
- -      pgdat_init_internals(pgdat);
- -      pgdat->per_cpu_nodestats = &boot_nodestats;
- -
- -      for (j = 0; j < MAX_NR_ZONES; j++) {
- -              struct zone *zone = pgdat->node_zones + j;
- -              unsigned long size, freesize, memmap_pages;
- -
- -              size = zone->spanned_pages;
- -              freesize = zone->present_pages;
- -
- -              /*
- -               * Adjust freesize so that it accounts for how much memory
- -               * is used by this zone for memmap. This affects the watermark
- -               * and per-cpu initialisations
- -               */
- -              memmap_pages = calc_memmap_size(size, freesize);
- -              if (!is_highmem_idx(j)) {
- -                      if (freesize >= memmap_pages) {
- -                              freesize -= memmap_pages;
- -                              if (memmap_pages)
- -                                      pr_debug("  %s zone: %lu pages used for memmap\n",
- -                                               zone_names[j], memmap_pages);
- -                      } else
- -                              pr_warn("  %s zone: %lu memmap pages exceeds freesize %lu\n",
- -                                      zone_names[j], memmap_pages, freesize);
- -              }
- -
- -              /* Account for reserved pages */
- -              if (j == 0 && freesize > dma_reserve) {
- -                      freesize -= dma_reserve;
- -                      pr_debug("  %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
- -              }
- -
- -              if (!is_highmem_idx(j))
- -                      nr_kernel_pages += freesize;
- -              /* Charge for highmem memmap if there are enough kernel pages */
- -              else if (nr_kernel_pages > memmap_pages * 2)
- -                      nr_kernel_pages -= memmap_pages;
- -              nr_all_pages += freesize;
- -
- -              /*
- -               * Set an approximate value for lowmem here, it will be adjusted
- -               * when the bootmem allocator frees pages into the buddy system.
- -               * And all highmem pages will be managed by the buddy system.
- -               */
- -              zone_init_internals(zone, j, nid, freesize);
- -
- -              if (!size)
- -                      continue;
- -
- -              set_pageblock_order();
- -              setup_usemap(zone);
- -              init_currently_empty_zone(zone, zone->zone_start_pfn, size);
- -      }
- -}
- -
- -#ifdef CONFIG_FLATMEM
- -static void __init alloc_node_mem_map(struct pglist_data *pgdat)
- -{
- -      unsigned long __maybe_unused start = 0;
- -      unsigned long __maybe_unused offset = 0;
- -
- -      /* Skip empty nodes */
- -      if (!pgdat->node_spanned_pages)
- -              return;
- -
- -      start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
- -      offset = pgdat->node_start_pfn - start;
- -      /* ia64 gets its own node_mem_map, before this, without bootmem */
- -      if (!pgdat->node_mem_map) {
- -              unsigned long size, end;
- -              struct page *map;
- -
- -              /*
- -               * The zone's endpoints aren't required to be MAX_ORDER
- -               * aligned but the node_mem_map endpoints must be in order
- -               * for the buddy allocator to function correctly.
- -               */
- -              end = pgdat_end_pfn(pgdat);
- -              end = ALIGN(end, MAX_ORDER_NR_PAGES);
- -              size =  (end - start) * sizeof(struct page);
- -              map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
- -                                 pgdat->node_id, false);
- -              if (!map)
- -                      panic("Failed to allocate %ld bytes for node %d memory map\n",
- -                            size, pgdat->node_id);
- -              pgdat->node_mem_map = map + offset;
- -      }
- -      pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
- -                              __func__, pgdat->node_id, (unsigned long)pgdat,
- -                              (unsigned long)pgdat->node_mem_map);
- -#ifndef CONFIG_NUMA
- -      /*
- -       * With no DISCONTIG, the global mem_map is just set as node 0's
- -       */
- -      if (pgdat == NODE_DATA(0)) {
- -              mem_map = NODE_DATA(0)->node_mem_map;
- -              if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
- -                      mem_map -= offset;
- -      }
- -#endif
- -}
- -#else
- -static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
- -#endif /* CONFIG_FLATMEM */
- -
- -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- -static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
- -{
- -      pgdat->first_deferred_pfn = ULONG_MAX;
- -}
- -#else
- -static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
- -#endif
- -
- -static void __init free_area_init_node(int nid)
- -{
- -      pg_data_t *pgdat = NODE_DATA(nid);
- -      unsigned long start_pfn = 0;
- -      unsigned long end_pfn = 0;
- -
- -      /* pg_data_t should be reset to zero when it's allocated */
- -      WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
- -
- -      get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
- -
- -      pgdat->node_id = nid;
- -      pgdat->node_start_pfn = start_pfn;
- -      pgdat->per_cpu_nodestats = NULL;
- -
- -      if (start_pfn != end_pfn) {
- -              pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
- -                      (u64)start_pfn << PAGE_SHIFT,
- -                      end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
- -      } else {
- -              pr_info("Initmem setup node %d as memoryless\n", nid);
- -      }
- -
- -      calculate_node_totalpages(pgdat, start_pfn, end_pfn);
- -
- -      alloc_node_mem_map(pgdat);
- -      pgdat_set_deferred_range(pgdat);
- -
- -      free_area_init_core(pgdat);
- -      lru_gen_init_pgdat(pgdat);
- -}
- -
- -static void __init free_area_init_memoryless_node(int nid)
- -{
- -      free_area_init_node(nid);
- -}
- -
- -#if MAX_NUMNODES > 1
- -/*
- - * Figure out the number of possible node ids.
- - */
- -void __init setup_nr_node_ids(void)
- -{
- -      unsigned int highest;
- -
- -      highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
- -      nr_node_ids = highest + 1;
- -}
- -#endif
- -
- -/**
- - * node_map_pfn_alignment - determine the maximum internode alignment
- - *
- - * This function should be called after node map is populated and sorted.
- - * It calculates the maximum power of two alignment which can distinguish
- - * all the nodes.
- - *
- - * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
- - * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
- - * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
- - * shifted, 1GiB is enough and this function will indicate so.
- - *
- - * This is used to test whether pfn -> nid mapping of the chosen memory
- - * model has fine enough granularity to avoid incorrect mapping for the
- - * populated node map.
- - *
- - * Return: the determined alignment in pfn's.  0 if there is no alignment
- - * requirement (single node).
- - */
- -unsigned long __init node_map_pfn_alignment(void)
- -{
- -      unsigned long accl_mask = 0, last_end = 0;
- -      unsigned long start, end, mask;
- -      int last_nid = NUMA_NO_NODE;
- -      int i, nid;
- -
- -      for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
- -              if (!start || last_nid < 0 || last_nid == nid) {
- -                      last_nid = nid;
- -                      last_end = end;
- -                      continue;
- -              }
- -
- -              /*
- -               * Start with a mask granular enough to pin-point to the
- -               * start pfn and tick off bits one-by-one until it becomes
- -               * too coarse to separate the current node from the last.
- -               */
- -              mask = ~((1 << __ffs(start)) - 1);
- -              while (mask && last_end <= (start & (mask << 1)))
- -                      mask <<= 1;
- -
- -              /* accumulate all internode masks */
- -              accl_mask |= mask;
- -      }
- -
- -      /* convert mask to number of pages */
- -      return ~accl_mask + 1;
- -}
- -
- -/*
- - * early_calculate_totalpages()
- - * Sum pages in active regions for movable zone.
- - * Populate N_MEMORY for calculating usable_nodes.
- - */
- -static unsigned long __init early_calculate_totalpages(void)
- -{
- -      unsigned long totalpages = 0;
- -      unsigned long start_pfn, end_pfn;
- -      int i, nid;
- -
- -      for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
- -              unsigned long pages = end_pfn - start_pfn;
- -
- -              totalpages += pages;
- -              if (pages)
- -                      node_set_state(nid, N_MEMORY);
- -      }
- -      return totalpages;
- -}
- -
- -/*
- - * Find the PFN the Movable zone begins in each node. Kernel memory
- - * is spread evenly between nodes as long as the nodes have enough
- - * memory. When they don't, some nodes will have more kernelcore than
- - * others
- - */
- -static void __init find_zone_movable_pfns_for_nodes(void)
- -{
- -      int i, nid;
- -      unsigned long usable_startpfn;
- -      unsigned long kernelcore_node, kernelcore_remaining;
- -      /* save the state before borrow the nodemask */
- -      nodemask_t saved_node_state = node_states[N_MEMORY];
- -      unsigned long totalpages = early_calculate_totalpages();
- -      int usable_nodes = nodes_weight(node_states[N_MEMORY]);
- -      struct memblock_region *r;
- -
- -      /* Need to find movable_zone earlier when movable_node is specified. */
- -      find_usable_zone_for_movable();
- -
- -      /*
- -       * If movable_node is specified, ignore kernelcore and movablecore
- -       * options.
- -       */
- -      if (movable_node_is_enabled()) {
- -              for_each_mem_region(r) {
- -                      if (!memblock_is_hotpluggable(r))
- -                              continue;
- -
- -                      nid = memblock_get_region_node(r);
- -
- -                      usable_startpfn = PFN_DOWN(r->base);
- -                      zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
- -                              min(usable_startpfn, zone_movable_pfn[nid]) :
- -                              usable_startpfn;
- -              }
- -
- -              goto out2;
- -      }
- -
- -      /*
- -       * If kernelcore=mirror is specified, ignore movablecore option
- -       */
- -      if (mirrored_kernelcore) {
- -              bool mem_below_4gb_not_mirrored = false;
- -
- -              for_each_mem_region(r) {
- -                      if (memblock_is_mirror(r))
- -                              continue;
- -
- -                      nid = memblock_get_region_node(r);
- -
- -                      usable_startpfn = memblock_region_memory_base_pfn(r);
- -
- -                      if (usable_startpfn < PHYS_PFN(SZ_4G)) {
- -                              mem_below_4gb_not_mirrored = true;
- -                              continue;
- -                      }
- -
- -                      zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
- -                              min(usable_startpfn, zone_movable_pfn[nid]) :
- -                              usable_startpfn;
- -              }
- -
- -              if (mem_below_4gb_not_mirrored)
- -                      pr_warn("This configuration results in unmirrored kernel memory.\n");
- -
- -              goto out2;
- -      }
- -
- -      /*
- -       * If kernelcore=nn% or movablecore=nn% was specified, calculate the
- -       * amount of necessary memory.
- -       */
- -      if (required_kernelcore_percent)
- -              required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
- -                                     10000UL;
- -      if (required_movablecore_percent)
- -              required_movablecore = (totalpages * 100 * required_movablecore_percent) /
- -                                      10000UL;
- -
- -      /*
- -       * If movablecore= was specified, calculate what size of
- -       * kernelcore that corresponds so that memory usable for
- -       * any allocation type is evenly spread. If both kernelcore
- -       * and movablecore are specified, then the value of kernelcore
- -       * will be used for required_kernelcore if it's greater than
- -       * what movablecore would have allowed.
- -       */
- -      if (required_movablecore) {
- -              unsigned long corepages;
- -
- -              /*
- -               * Round-up so that ZONE_MOVABLE is at least as large as what
- -               * was requested by the user
- -               */
- -              required_movablecore =
- -                      roundup(required_movablecore, MAX_ORDER_NR_PAGES);
- -              required_movablecore = min(totalpages, required_movablecore);
- -              corepages = totalpages - required_movablecore;
- -
- -              required_kernelcore = max(required_kernelcore, corepages);
- -      }
- -
- -      /*
- -       * If kernelcore was not specified or kernelcore size is larger
- -       * than totalpages, there is no ZONE_MOVABLE.
- -       */
- -      if (!required_kernelcore || required_kernelcore >= totalpages)
- -              goto out;
- -
- -      /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
- -      usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
- -
- -restart:
- -      /* Spread kernelcore memory as evenly as possible throughout nodes */
- -      kernelcore_node = required_kernelcore / usable_nodes;
- -      for_each_node_state(nid, N_MEMORY) {
- -              unsigned long start_pfn, end_pfn;
- -
- -              /*
- -               * Recalculate kernelcore_node if the division per node
- -               * now exceeds what is necessary to satisfy the requested
- -               * amount of memory for the kernel
- -               */
- -              if (required_kernelcore < kernelcore_node)
- -                      kernelcore_node = required_kernelcore / usable_nodes;
- -
- -              /*
- -               * As the map is walked, we track how much memory is usable
- -               * by the kernel using kernelcore_remaining. When it is
- -               * 0, the rest of the node is usable by ZONE_MOVABLE
- -               */
- -              kernelcore_remaining = kernelcore_node;
- -
- -              /* Go through each range of PFNs within this node */
- -              for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- -                      unsigned long size_pages;
- -
- -                      start_pfn = max(start_pfn, zone_movable_pfn[nid]);
- -                      if (start_pfn >= end_pfn)
- -                              continue;
- -
- -                      /* Account for what is only usable for kernelcore */
- -                      if (start_pfn < usable_startpfn) {
- -                              unsigned long kernel_pages;
- -                              kernel_pages = min(end_pfn, usable_startpfn)
- -                                                              - start_pfn;
- -
- -                              kernelcore_remaining -= min(kernel_pages,
- -                                                      kernelcore_remaining);
- -                              required_kernelcore -= min(kernel_pages,
- -                                                      required_kernelcore);
- -
- -                              /* Continue if range is now fully accounted */
- -                              if (end_pfn <= usable_startpfn) {
- -
- -                                      /*
- -                                       * Push zone_movable_pfn to the end so
- -                                       * that if we have to rebalance
- -                                       * kernelcore across nodes, we will
- -                                       * not double account here
- -                                       */
- -                                      zone_movable_pfn[nid] = end_pfn;
- -                                      continue;
- -                              }
- -                              start_pfn = usable_startpfn;
- -                      }
- -
- -                      /*
- -                       * The usable PFN range for ZONE_MOVABLE is from
- -                       * start_pfn->end_pfn. Calculate size_pages as the
- -                       * number of pages used as kernelcore
- -                       */
- -                      size_pages = end_pfn - start_pfn;
- -                      if (size_pages > kernelcore_remaining)
- -                              size_pages = kernelcore_remaining;
- -                      zone_movable_pfn[nid] = start_pfn + size_pages;
- -
- -                      /*
- -                       * Some kernelcore has been met, update counts and
- -                       * break if the kernelcore for this node has been
- -                       * satisfied
- -                       */
- -                      required_kernelcore -= min(required_kernelcore,
- -                                                              size_pages);
- -                      kernelcore_remaining -= size_pages;
- -                      if (!kernelcore_remaining)
- -                              break;
- -              }
+ +               * reclaim will not be started prematurely.
+ +               */
+ +              total_pages = low_wmark_pages(zone);
+ +      } else {
+ +              /*
+ +               * If percpu_pagelist_high_fraction is configured, the high
+ +               * value is based on a fraction of the managed pages in the
+ +               * zone.
+ +               */
+ +              total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
         }
   
         /*
- -       * If there is still required_kernelcore, we do another pass with one
- -       * less node in the count. This will push zone_movable_pfn[nid] further
- -       * along on the nodes that still have memory until kernelcore is
- -       * satisfied
+ +       * Split the high value across all online CPUs local to the zone. Note
+ +       * that early in boot that CPUs may not be online yet and that during
+ +       * CPU hotplug that the cpumask is not yet updated when a CPU is being
+ +       * onlined. For memory nodes that have no CPUs, split pcp->high across
+ +       * all online CPUs to mitigate the risk that reclaim is triggered
+ +       * prematurely due to pages stored on pcp lists.
          */
- -      usable_nodes--;
- -      if (usable_nodes && required_kernelcore > usable_nodes)
- -              goto restart;
- -
- -out2:
- -      /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
- -      for (nid = 0; nid < MAX_NUMNODES; nid++) {
- -              unsigned long start_pfn, end_pfn;
- -
- -              zone_movable_pfn[nid] =
- -                      roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
- -
- -              get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
- -              if (zone_movable_pfn[nid] >= end_pfn)
- -                      zone_movable_pfn[nid] = 0;
- -      }
- -
- -out:
- -      /* restore the node_state */
- -      node_states[N_MEMORY] = saved_node_state;
- -}
+ +      nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
+ +      if (!nr_split_cpus)
+ +              nr_split_cpus = num_online_cpus();
+ +      high = total_pages / nr_split_cpus;
   
- -/* Any regular or high memory on that node ? */
- -static void check_for_memory(pg_data_t *pgdat, int nid)
- -{
- -      enum zone_type zone_type;
+ +      /*
+ +       * Ensure high is at least batch*4. The multiple is based on the
+ +       * historical relationship between high and batch.
+ +       */
+ +      high = max(high, batch << 2);
   
- -      for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
- -              struct zone *zone = &pgdat->node_zones[zone_type];
- -              if (populated_zone(zone)) {
- -                      if (IS_ENABLED(CONFIG_HIGHMEM))
- -                              node_set_state(nid, N_HIGH_MEMORY);
- -                      if (zone_type <= ZONE_NORMAL)
- -                              node_set_state(nid, N_NORMAL_MEMORY);
- -                      break;
- -              }
- -      }
+ +      return high;
+ +#else
+ +      return 0;
+ +#endif
   }
   
   /*
- - * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
- - * such cases we allow max_zone_pfn sorted in the descending order
+ + * pcp->high and pcp->batch values are related and generally batch is lower
+ + * than high. They are also related to pcp->count such that count is lower
+ + * than high, and as soon as it reaches high, the pcplist is flushed.
+ + *
+ + * However, guaranteeing these relations at all times would require e.g. write
+ + * barriers here but also careful usage of read barriers at the read side, and
+ + * thus be prone to error and bad for performance. Thus the update only prevents
+ + * store tearing. Any new users of pcp->batch and pcp->high should ensure they
+ + * can cope with those fields changing asynchronously, and fully trust only the
+ + * pcp->count field on the local CPU with interrupts disabled.
+ + *
+ + * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
+ + * outside of boot time (or some other assurance that no concurrent updaters
+ + * exist).
    */
- -bool __weak arch_has_descending_max_zone_pfns(void)
+ +static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
+ +              unsigned long batch)
   {
- -      return false;
+ +      WRITE_ONCE(pcp->batch, batch);
+ +      WRITE_ONCE(pcp->high, high);
   }
   
- -/**
- - * free_area_init - Initialise all pg_data_t and zone data
- - * @max_zone_pfn: an array of max PFNs for each zone
- - *
- - * This will call free_area_init_node() for each active node in the system.
- - * Using the page ranges provided by memblock_set_node(), the size of each
- - * zone in each node and their holes is calculated. If the maximum PFN
- - * between two adjacent zones match, it is assumed that the zone is empty.
- - * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
- - * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
- - * starts where the previous one ended. For example, ZONE_DMA32 starts
- - * at arch_max_dma_pfn.
- - */
- -void __init free_area_init(unsigned long *max_zone_pfn)
+ +static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
   {
- -      unsigned long start_pfn, end_pfn;
- -      int i, nid, zone;
- -      bool descending;
- -
- -      /* Record where the zone boundaries are */
- -      memset(arch_zone_lowest_possible_pfn, 0,
- -                              sizeof(arch_zone_lowest_possible_pfn));
- -      memset(arch_zone_highest_possible_pfn, 0,
- -                              sizeof(arch_zone_highest_possible_pfn));
- -
- -      start_pfn = PHYS_PFN(memblock_start_of_DRAM());
- -      descending = arch_has_descending_max_zone_pfns();
- -
- -      for (i = 0; i < MAX_NR_ZONES; i++) {
- -              if (descending)
- -                      zone = MAX_NR_ZONES - i - 1;
- -              else
- -                      zone = i;
- -
- -              if (zone == ZONE_MOVABLE)
- -                      continue;
+ +      int pindex;
   
- -              end_pfn = max(max_zone_pfn[zone], start_pfn);
- -              arch_zone_lowest_possible_pfn[zone] = start_pfn;
- -              arch_zone_highest_possible_pfn[zone] = end_pfn;
+ +      memset(pcp, 0, sizeof(*pcp));
+ +      memset(pzstats, 0, sizeof(*pzstats));
   
- -              start_pfn = end_pfn;
- -      }
+ +      spin_lock_init(&pcp->lock);
+ +      for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
+ +              INIT_LIST_HEAD(&pcp->lists[pindex]);
   
- -      /* Find the PFNs that ZONE_MOVABLE begins at in each node */
- -      memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
- -      find_zone_movable_pfns_for_nodes();
+ +      /*
+ +       * Set batch and high values safe for a boot pageset. A true percpu
+ +       * pageset's initialization will update them subsequently. Here we don't
+ +       * need to be as careful as pageset_update() as nobody can access the
+ +       * pageset yet.
+ +       */
+ +      pcp->high = BOOT_PAGESET_HIGH;
+ +      pcp->batch = BOOT_PAGESET_BATCH;
+ +      pcp->free_factor = 0;
+ +}
   
- -      /* Print out the zone ranges */
- -      pr_info("Zone ranges:\n");
- -      for (i = 0; i < MAX_NR_ZONES; i++) {
- -              if (i == ZONE_MOVABLE)
- -                      continue;
- -              pr_info("  %-8s ", zone_names[i]);
- -              if (arch_zone_lowest_possible_pfn[i] ==
- -                              arch_zone_highest_possible_pfn[i])
- -                      pr_cont("empty\n");
- -              else
- -                      pr_cont("[mem %#018Lx-%#018Lx]\n",
- -                              (u64)arch_zone_lowest_possible_pfn[i]
- -                                      << PAGE_SHIFT,
- -                              ((u64)arch_zone_highest_possible_pfn[i]
- -                                      << PAGE_SHIFT) - 1);
- -      }
+ +static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
+ +              unsigned long batch)
+ +{
+ +      struct per_cpu_pages *pcp;
+ +      int cpu;
   
- -      /* Print out the PFNs ZONE_MOVABLE begins at in each node */
- -      pr_info("Movable zone start for each node\n");
- -      for (i = 0; i < MAX_NUMNODES; i++) {
- -              if (zone_movable_pfn[i])
- -                      pr_info("  Node %d: %#018Lx\n", i,
- -                             (u64)zone_movable_pfn[i] << PAGE_SHIFT);
+ +      for_each_possible_cpu(cpu) {
+ +              pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ +              pageset_update(pcp, high, batch);
         }
+ +}
   
- -      /*
- -       * Print out the early node map, and initialize the
- -       * subsection-map relative to active online memory ranges to
- -       * enable future "sub-section" extensions of the memory map.
- -       */
- -      pr_info("Early memory node ranges\n");
- -      for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
- -              pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
- -                      (u64)start_pfn << PAGE_SHIFT,
- -                      ((u64)end_pfn << PAGE_SHIFT) - 1);
- -              subsection_map_init(start_pfn, end_pfn - start_pfn);
- -      }
- -
- -      /* Initialise every node */
- -      mminit_verify_pageflags_layout();
- -      setup_nr_node_ids();
- -      for_each_node(nid) {
- -              pg_data_t *pgdat;
- -
- -              if (!node_online(nid)) {
- -                      pr_info("Initializing node %d as memoryless\n", nid);
- -
- -                      /* Allocator not initialized yet */
- -                      pgdat = arch_alloc_nodedata(nid);
- -                      if (!pgdat)
- -                              panic("Cannot allocate %zuB for node %d.\n",
- -                                     sizeof(*pgdat), nid);
- -                      arch_refresh_nodedata(nid, pgdat);
- -                      free_area_init_memoryless_node(nid);
+ +/*
+ + * Calculate and set new high and batch values for all per-cpu pagesets of a
+ + * zone based on the zone's size.
+ + */
+ +static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
+ +{
+ +      int new_high, new_batch;
   
- -                      /*
- -                       * We do not want to confuse userspace by sysfs
- -                       * files/directories for node without any memory
- -                       * attached to it, so this node is not marked as
- -                       * N_MEMORY and not marked online so that no sysfs
- -                       * hierarchy will be created via register_one_node for
- -                       * it. The pgdat will get fully initialized by
- -                       * hotadd_init_pgdat() when memory is hotplugged into
- -                       * this node.
- -                       */
- -                      continue;
- -              }
+ +      new_batch = max(1, zone_batchsize(zone));
+ +      new_high = zone_highsize(zone, new_batch, cpu_online);
   
- -              pgdat = NODE_DATA(nid);
- -              free_area_init_node(nid);
+ +      if (zone->pageset_high == new_high &&
+ +          zone->pageset_batch == new_batch)
+ +              return;
   
- -              /* Any memory on that node */
- -              if (pgdat->node_present_pages)
- -                      node_set_state(nid, N_MEMORY);
- -              check_for_memory(pgdat, nid);
- -      }
+ +      zone->pageset_high = new_high;
+ +      zone->pageset_batch = new_batch;
   
- -      memmap_init();
+ +      __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
   }
   
- -static int __init cmdline_parse_core(char *p, unsigned long *core,
- -                                   unsigned long *percent)
+ +void __meminit setup_zone_pageset(struct zone *zone)
   {
- -      unsigned long long coremem;
- -      char *endptr;
- -
- -      if (!p)
- -              return -EINVAL;
+ +      int cpu;
   
- -      /* Value may be a percentage of total memory, otherwise bytes */
- -      coremem = simple_strtoull(p, &endptr, 0);
- -      if (*endptr == '%') {
- -              /* Paranoid check for percent values greater than 100 */
- -              WARN_ON(coremem > 100);
+ +      /* Size may be 0 on !SMP && !NUMA */
+ +      if (sizeof(struct per_cpu_zonestat) > 0)
+ +              zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
   
- -              *percent = coremem;
- -      } else {
- -              coremem = memparse(p, &p);
- -              /* Paranoid check that UL is enough for the coremem value */
- -              WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+ +      zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
+ +      for_each_possible_cpu(cpu) {
+ +              struct per_cpu_pages *pcp;
+ +              struct per_cpu_zonestat *pzstats;
   
- -              *core = coremem >> PAGE_SHIFT;
- -              *percent = 0UL;
+ +              pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ +              pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+ +              per_cpu_pages_init(pcp, pzstats);
         }
- -      return 0;
+ +
+ +      zone_set_pageset_high_and_batch(zone, 0);
   }
   
   /*
- - * kernelcore=size sets the amount of memory for use for allocations that
- - * cannot be reclaimed or migrated.
+ + * The zone indicated has a new number of managed_pages; batch sizes and percpu
+ + * page high values need to be recalculated.
    */
- -static int __init cmdline_parse_kernelcore(char *p)
+ +static void zone_pcp_update(struct zone *zone, int cpu_online)
   {
- -      /* parse kernelcore=mirror */
- -      if (parse_option_str(p, "mirror")) {
- -              mirrored_kernelcore = true;
- -              return 0;
- -      }
- -
- -      return cmdline_parse_core(p, &required_kernelcore,
- -                                &required_kernelcore_percent);
+ +      mutex_lock(&pcp_batch_high_lock);
+ +      zone_set_pageset_high_and_batch(zone, cpu_online);
+ +      mutex_unlock(&pcp_batch_high_lock);
   }
   
   /*
- - * movablecore=size sets the amount of memory for use for allocations that
- - * can be reclaimed or migrated.
+ + * Allocate per cpu pagesets and initialize them.
+ + * Before this call only boot pagesets were available.
    */
- -static int __init cmdline_parse_movablecore(char *p)
+ +void __init setup_per_cpu_pageset(void)
   {
- -      return cmdline_parse_core(p, &required_movablecore,
- -                                &required_movablecore_percent);
+ +      struct pglist_data *pgdat;
+ +      struct zone *zone;
+ +      int __maybe_unused cpu;
+ +
+ +      for_each_populated_zone(zone)
+ +              setup_zone_pageset(zone);
+ +
+ +#ifdef CONFIG_NUMA
+ +      /*
+ +       * Unpopulated zones continue using the boot pagesets.
+ +       * The numa stats for these pagesets need to be reset.
+ +       * Otherwise, they will end up skewing the stats of
+ +       * the nodes these zones are associated with.
+ +       */
+ +      for_each_possible_cpu(cpu) {
+ +              struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
+ +              memset(pzstats->vm_numa_event, 0,
+ +                     sizeof(pzstats->vm_numa_event));
+ +      }
+ +#endif
+ +
+ +      for_each_online_pgdat(pgdat)
+ +              pgdat->per_cpu_nodestats =
+ +                      alloc_percpu(struct per_cpu_nodestat);
   }
   
- -early_param("kernelcore", cmdline_parse_kernelcore);
- -early_param("movablecore", cmdline_parse_movablecore);
+ +__meminit void zone_pcp_init(struct zone *zone)
+ +{
+ +      /*
+ +       * per cpu subsystem is not up at this point. The following code
+ +       * relies on the ability of the linker to provide the
+ +       * offset of a (static) per cpu variable into the per cpu area.
+ +       */
+ +      zone->per_cpu_pageset = &boot_pageset;
+ +      zone->per_cpu_zonestats = &boot_zonestats;
+ +      zone->pageset_high = BOOT_PAGESET_HIGH;
+ +      zone->pageset_batch = BOOT_PAGESET_BATCH;
+ +
+ +      if (populated_zone(zone))
+ +              pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
+ +                       zone->present_pages, zone_batchsize(zone));
+ +}
   
   void adjust_managed_page_count(struct page *page, long count)
   {
@@@ -6205,6 -8559,73 +6221,6 @@@ unsigned long free_reserved_area(void *
         return pages;
   }
   
- -void __init mem_init_print_info(void)
- -{
- -      unsigned long physpages, codesize, datasize, rosize, bss_size;
- -      unsigned long init_code_size, init_data_size;
- -
- -      physpages = get_num_physpages();
- -      codesize = _etext - _stext;
- -      datasize = _edata - _sdata;
- -      rosize = __end_rodata - __start_rodata;
- -      bss_size = __bss_stop - __bss_start;
- -      init_data_size = __init_end - __init_begin;
- -      init_code_size = _einittext - _sinittext;
- -
- -      /*
- -       * Detect special cases and adjust section sizes accordingly:
- -       * 1) .init.* may be embedded into .data sections
- -       * 2) .init.text.* may be out of [__init_begin, __init_end],
- -       *    please refer to arch/tile/kernel/vmlinux.lds.S.
- -       * 3) .rodata.* may be embedded into .text or .data sections.
- -       */
- -#define adj_init_size(start, end, size, pos, adj) \
- -      do { \
- -              if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
- -                      size -= adj; \
- -      } while (0)
- -
- -      adj_init_size(__init_begin, __init_end, init_data_size,
- -                   _sinittext, init_code_size);
- -      adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
- -      adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
- -      adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
- -      adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
- -
- -#undef        adj_init_size
- -
- -      pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
- -#ifdef        CONFIG_HIGHMEM
- -              ", %luK highmem"
- -#endif
- -              ")\n",
- -              K(nr_free_pages()), K(physpages),
- -              codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
- -              (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
- -              K(physpages - totalram_pages() - totalcma_pages),
- -              K(totalcma_pages)
- -#ifdef        CONFIG_HIGHMEM
- -              , K(totalhigh_pages())
- -#endif
- -              );
- -}
- -
- -/**
- - * set_dma_reserve - set the specified number of pages reserved in the first zone
- - * @new_dma_reserve: The number of pages to mark reserved
- - *
- - * The per-cpu batchsize and zone watermarks are determined by managed_pages.
- - * In the DMA zone, a significant percentage may be consumed by kernel image
- - * and other unfreeable allocations which can skew the watermarks badly. This
- - * function may optionally be used to account for unfreeable pages in the
- - * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
- - * smaller per-cpu batchsize.
- - */
- -void __init set_dma_reserve(unsigned long new_dma_reserve)
- -{
- -      dma_reserve = new_dma_reserve;
- -}
- -
   static int page_alloc_cpu_dead(unsigned int cpu)
   {
         struct zone *zone;
@@@ -6245,10 -8666,28 +6261,10 @@@ static int page_alloc_cpu_online(unsign
         return 0;
   }
   
- -#ifdef CONFIG_NUMA
- -int hashdist = HASHDIST_DEFAULT;
- -
- -static int __init set_hashdist(char *str)
- -{
- -      if (!str)
- -              return 0;
- -      hashdist = simple_strtoul(str, &str, 0);
- -      return 1;
- -}
- -__setup("hashdist=", set_hashdist);
- -#endif
- -
- -void __init page_alloc_init(void)
+ +void __init page_alloc_init_cpuhp(void)
   {
         int ret;
   
- -#ifdef CONFIG_NUMA
- -      if (num_node_state(N_MEMORY) == 1)
- -              hashdist = 0;
- -#endif
- -
         ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
                                         "mm/page_alloc:pcp",
                                         page_alloc_cpu_online,
@@@ -6631,6 -9070,149 +6647,6 @@@ out
         return ret;
   }
   
- -#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
- -/*
- - * Returns the number of pages that arch has reserved but
- - * is not known to alloc_large_system_hash().
- - */
- -static unsigned long __init arch_reserved_kernel_pages(void)
- -{
- -      return 0;
- -}
- -#endif
- -
- -/*
- - * Adaptive scale is meant to reduce sizes of hash tables on large memory
- - * machines. As memory size is increased the scale is also increased but at
- - * slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory
- - * quadruples the scale is increased by one, which means the size of hash table
- - * only doubles, instead of quadrupling as well.
- - * Because 32-bit systems cannot have large physical memory, where this scaling
- - * makes sense, it is disabled on such platforms.
- - */
- -#if __BITS_PER_LONG > 32
- -#define ADAPT_SCALE_BASE      (64ul << 30)
- -#define ADAPT_SCALE_SHIFT     2
- -#define ADAPT_SCALE_NPAGES    (ADAPT_SCALE_BASE >> PAGE_SHIFT)
- -#endif
- -
- -/*
- - * allocate a large system hash table from bootmem
- - * - it is assumed that the hash table must contain an exact power-of-2
- - *   quantity of entries
- - * - limit is the number of hash buckets, not the total allocation size
- - */
- -void *__init alloc_large_system_hash(const char *tablename,
- -                                   unsigned long bucketsize,
- -                                   unsigned long numentries,
- -                                   int scale,
- -                                   int flags,
- -                                   unsigned int *_hash_shift,
- -                                   unsigned int *_hash_mask,
- -                                   unsigned long low_limit,
- -                                   unsigned long high_limit)
- -{
- -      unsigned long long max = high_limit;
- -      unsigned long log2qty, size;
- -      void *table;
- -      gfp_t gfp_flags;
- -      bool virt;
- -      bool huge;
- -
- -      /* allow the kernel cmdline to have a say */
- -      if (!numentries) {
- -              /* round applicable memory size up to nearest megabyte */
- -              numentries = nr_kernel_pages;
- -              numentries -= arch_reserved_kernel_pages();
- -
- -              /* It isn't necessary when PAGE_SIZE >= 1MB */
- -              if (PAGE_SIZE < SZ_1M)
- -                      numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
- -
- -#if __BITS_PER_LONG > 32
- -              if (!high_limit) {
- -                      unsigned long adapt;
- -
- -                      for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
- -                           adapt <<= ADAPT_SCALE_SHIFT)
- -                              scale++;
- -              }
- -#endif
- -
- -              /* limit to 1 bucket per 2^scale bytes of low memory */
- -              if (scale > PAGE_SHIFT)
- -                      numentries >>= (scale - PAGE_SHIFT);
- -              else
- -                      numentries <<= (PAGE_SHIFT - scale);
- -
- -              /* Make sure we've got at least a 0-order allocation.. */
- -              if (unlikely(flags & HASH_SMALL)) {
- -                      /* Makes no sense without HASH_EARLY */
- -                      WARN_ON(!(flags & HASH_EARLY));
- -                      if (!(numentries >> *_hash_shift)) {
- -                              numentries = 1UL << *_hash_shift;
- -                              BUG_ON(!numentries);
- -                      }
- -              } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
- -                      numentries = PAGE_SIZE / bucketsize;
- -      }
- -      numentries = roundup_pow_of_two(numentries);
- -
- -      /* limit allocation size to 1/16 total memory by default */
- -      if (max == 0) {
- -              max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
- -              do_div(max, bucketsize);
- -      }
- -      max = min(max, 0x80000000ULL);
- -
- -      if (numentries < low_limit)
- -              numentries = low_limit;
- -      if (numentries > max)
- -              numentries = max;
- -
- -      log2qty = ilog2(numentries);
- -
- -      gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
- -      do {
- -              virt = false;
- -              size = bucketsize << log2qty;
- -              if (flags & HASH_EARLY) {
- -                      if (flags & HASH_ZERO)
- -                              table = memblock_alloc(size, SMP_CACHE_BYTES);
- -                      else
- -                              table = memblock_alloc_raw(size,
- -                                                         SMP_CACHE_BYTES);
- -              } else if (get_order(size) >= MAX_ORDER || hashdist) {
- -                      table = vmalloc_huge(size, gfp_flags);
- -                      virt = true;
- -                      if (table)
- -                              huge = is_vm_area_hugepages(table);
- -              } else {
- -                      /*
- -                       * If bucketsize is not a power-of-two, we may free
- -                       * some pages at the end of hash table which
- -                       * alloc_pages_exact() automatically does
- -                       */
- -                      table = alloc_pages_exact(size, gfp_flags);
- -                      kmemleak_alloc(table, size, 1, gfp_flags);
- -              }
- -      } while (!table && size > PAGE_SIZE && --log2qty);
- -
- -      if (!table)
- -              panic("Failed to allocate %s hash table\n", tablename);
- -
- -      pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
- -              tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
- -              virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
- -
- -      if (_hash_shift)
- -              *_hash_shift = log2qty;
- -      if (_hash_mask)
- -              *_hash_mask = (1 << log2qty) - 1;
- -
- -      return table;
- -}
- -
   #ifdef CONFIG_CONTIG_ALLOC
   #if defined(CONFIG_DYNAMIC_DEBUG) || \
         (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
@@@ -6814,7 -9396,7 +6830,7 @@@ int alloc_contig_range(unsigned long st
         order = 0;
         outer_start = start;
         while (!PageBuddy(pfn_to_page(outer_start))) {
- -              if (++order >= MAX_ORDER) {
+ +              if (++order > MAX_ORDER) {
                         outer_start = start;
                         break;
                 }
@@@ -6883,6 -9465,9 +6899,9 @@@ static bool pfn_range_valid_contig(stru
                         return false;
   
                 if (PageReserved(page))
+                       return false;
+ 
+               if (PageHuge(page))
                         return false;
         }
         return true;
@@@ -7064,7 -9649,7 +7083,7 @@@ bool is_free_buddy_page(struct page *pa
         unsigned long pfn = page_to_pfn(page);
         unsigned int order;
   
- -      for (order = 0; order < MAX_ORDER; order++) {
+ +      for (order = 0; order <= MAX_ORDER; order++) {
                 struct page *page_head = page - (pfn & ((1 << order) - 1));
   
                 if (PageBuddy(page_head) &&
@@@ -7072,7 -9657,7 +7091,7 @@@
                         break;
         }
   
- -      return order < MAX_ORDER;
+ +      return order <= MAX_ORDER;
   }
   EXPORT_SYMBOL(is_free_buddy_page);
   
@@@ -7123,7 -9708,7 +7142,7 @@@ bool take_page_off_buddy(struct page *p
         bool ret = false;
   
         spin_lock_irqsave(&zone->lock, flags);
- -      for (order = 0; order < MAX_ORDER; order++) {
+ +      for (order = 0; order <= MAX_ORDER; order++) {
                 struct page *page_head = page - (pfn & ((1 << order) - 1));
                 int page_order = buddy_order(page_head);
   
diff --combined mm/vmalloc.c

index 3fa476f,31ff782..63a2255
--- 1/mm/vmalloc.c
--- 2/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@@ -33,11 -33,11 +33,11 @@@
   #include <linux/compiler.h>
   #include <linux/memcontrol.h>
   #include <linux/llist.h>
+ +#include <linux/uio.h>
   #include <linux/bitops.h>
   #include <linux/rbtree_augmented.h>
   #include <linux/overflow.h>
   #include <linux/pgtable.h>
- -#include <linux/uaccess.h>
   #include <linux/hugetlb.h>
   #include <linux/sched/mm.h>
   #include <asm/tlbflush.h>
@@@ -313,8 -313,8 +313,8 @@@ int ioremap_page_range(unsigned long ad
                                  ioremap_max_page_shift);
         flush_cache_vmap(addr, end);
         if (!err)
-               kmsan_ioremap_page_range(addr, end, phys_addr, prot,
-                                        ioremap_max_page_shift);
+               err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
+                                              ioremap_max_page_shift);
         return err;
   }
   
@@@ -605,7 -605,11 +605,11 @@@ int __vmap_pages_range_noflush(unsigne
   int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                 pgprot_t prot, struct page **pages, unsigned int page_shift)
   {
-       kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+       int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
+                                                page_shift);
+ 
+       if (ret)
+               return ret;
         return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
   }
   
@@@ -2739,7 -2743,7 +2743,7 @@@ void vfree(const void *addr
                  * High-order allocs for huge vmallocs are split, so
                  * can be freed as an array of order-0 allocations
                  */
- -              __free_pages(page, 0);
+ +              __free_page(page);
                 cond_resched();
         }
         atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
@@@ -3190,7 -3194,7 +3194,7 @@@ again
                          * pages backing VM_ALLOC mapping. Memory is instead
                          * poisoned and zeroed by kasan_unpoison_vmalloc().
                          */
- -                      gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO;
+ +                      gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
                 }
   
                 /* Take note that the mapping is PAGE_KERNEL. */
@@@ -3444,96 -3448,62 +3448,96 @@@ void *vmalloc_32_user(unsigned long siz
   EXPORT_SYMBOL(vmalloc_32_user);
   
   /*
- - * small helper routine , copy contents to buf from addr.
- - * If the page is not present, fill zero.
+ + * Atomically zero bytes in the iterator.
+ + *
+ + * Returns the number of zeroed bytes.
    */
+ +static size_t zero_iter(struct iov_iter *iter, size_t count)
+ +{
+ +      size_t remains = count;
+ +
+ +      while (remains > 0) {
+ +              size_t num, copied;
+ +
+ +              num = remains < PAGE_SIZE ? remains : PAGE_SIZE;
+ +              copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
+ +              remains -= copied;
+ +
+ +              if (copied < num)
+ +                      break;
+ +      }
   
- -static int aligned_vread(char *buf, char *addr, unsigned long count)
+ +      return count - remains;
+ +}
+ +
+ +/*
+ + * small helper routine, copy contents to iter from addr.
+ + * If the page is not present, fill zero.
+ + *
+ + * Returns the number of copied bytes.
+ + */
+ +static size_t aligned_vread_iter(struct iov_iter *iter,
+ +                               const char *addr, size_t count)
   {
- -      struct page *p;
- -      int copied = 0;
+ +      size_t remains = count;
+ +      struct page *page;
   
- -      while (count) {
+ +      while (remains > 0) {
                 unsigned long offset, length;
+ +              size_t copied = 0;
   
                 offset = offset_in_page(addr);
                 length = PAGE_SIZE - offset;
- -              if (length > count)
- -                      length = count;
- -              p = vmalloc_to_page(addr);
+ +              if (length > remains)
+ +                      length = remains;
+ +              page = vmalloc_to_page(addr);
                 /*
- -               * To do safe access to this _mapped_ area, we need
- -               * lock. But adding lock here means that we need to add
- -               * overhead of vmalloc()/vfree() calls for this _debug_
- -               * interface, rarely used. Instead of that, we'll use
- -               * kmap() and get small overhead in this access function.
+ +               * To do safe access to this _mapped_ area, we need lock. But
+ +               * adding lock here means that we need to add overhead of
+ +               * vmalloc()/vfree() calls for this _debug_ interface, rarely
+ +               * used. Instead of that, we'll use a local mapping via
+ +               * copy_page_to_iter_nofault() and accept a small overhead in
+ +               * this access function.
                  */
- -              if (p) {
- -                      /* We can expect USER0 is not used -- see vread() */
- -                      void *map = kmap_atomic(p);
- -                      memcpy(buf, map + offset, length);
- -                      kunmap_atomic(map);
- -              } else
- -                      memset(buf, 0, length);
+ +              if (page)
+ +                      copied = copy_page_to_iter_nofault(page, offset,
+ +                                                         length, iter);
+ +              else
+ +                      copied = zero_iter(iter, length);
   
- -              addr += length;
- -              buf += length;
- -              copied += length;
- -              count -= length;
+ +              addr += copied;
+ +              remains -= copied;
+ +
+ +              if (copied != length)
+ +                      break;
         }
- -      return copied;
+ +
+ +      return count - remains;
   }
   
- -static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
+ +/*
+ + * Read from a vm_map_ram region of memory.
+ + *
+ + * Returns the number of copied bytes.
+ + */
+ +static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
+ +                                size_t count, unsigned long flags)
   {
         char *start;
         struct vmap_block *vb;
         unsigned long offset;
- -      unsigned int rs, re, n;
+ +      unsigned int rs, re;
+ +      size_t remains, n;
   
         /*
          * If it's area created by vm_map_ram() interface directly, but
          * not further subdividing and delegating management to vmap_block,
          * handle it here.
          */
- -      if (!(flags & VMAP_BLOCK)) {
- -              aligned_vread(buf, addr, count);
- -              return;
- -      }
+ +      if (!(flags & VMAP_BLOCK))
+ +              return aligned_vread_iter(iter, addr, count);
+ +
+ +      remains = count;
   
         /*
          * Area is split into regions and tracked with vmap_block, read out
@@@ -3541,64 -3511,50 +3545,64 @@@
          */
         vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
         if (!vb)
- -              goto finished;
+ +              goto finished_zero;
   
         spin_lock(&vb->lock);
         if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
                 spin_unlock(&vb->lock);
- -              goto finished;
+ +              goto finished_zero;
         }
+ +
         for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
- -              if (!count)
- -                      break;
+ +              size_t copied;
+ +
+ +              if (remains == 0)
+ +                      goto finished;
+ +
                 start = vmap_block_vaddr(vb->va->va_start, rs);
- -              while (addr < start) {
- -                      if (count == 0)
- -                              goto unlock;
- -                      *buf = '\0';
- -                      buf++;
- -                      addr++;
- -                      count--;
+ +
+ +              if (addr < start) {
+ +                      size_t to_zero = min_t(size_t, start - addr, remains);
+ +                      size_t zeroed = zero_iter(iter, to_zero);
+ +
+ +                      addr += zeroed;
+ +                      remains -= zeroed;
+ +
+ +                      if (remains == 0 || zeroed != to_zero)
+ +                              goto finished;
                 }
+ +
                /* It could start reading from the middle of a used region. */
                 offset = offset_in_page(addr);
                 n = ((re - rs + 1) << PAGE_SHIFT) - offset;
- -              if (n > count)
- -                      n = count;
- -              aligned_vread(buf, start+offset, n);
+ +              if (n > remains)
+ +                      n = remains;
+ +
+ +              copied = aligned_vread_iter(iter, start + offset, n);
   
- -              buf += n;
- -              addr += n;
- -              count -= n;
+ +              addr += copied;
+ +              remains -= copied;
+ +
+ +              if (copied != n)
+ +                      goto finished;
         }
- -unlock:
+ +
         spin_unlock(&vb->lock);
   
- -finished:
+ +finished_zero:
         /* zero-fill the left dirty or free regions */
- -      if (count)
- -              memset(buf, 0, count);
+ +      return count - remains + zero_iter(iter, remains);
+ +finished:
+ +      /* We couldn't copy/zero everything */
+ +      spin_unlock(&vb->lock);
+ +      return count - remains;
   }
   
   /**
- - * vread() - read vmalloc area in a safe way.
- - * @buf:     buffer for reading data
- - * @addr:    vm address.
- - * @count:   number of bytes to be read.
+ + * vread_iter() - read vmalloc area in a safe way to an iterator.
+ + * @iter:         the iterator to which data should be written.
+ + * @addr:         vm address.
+ + * @count:        number of bytes to be read.
    *
    * This function checks that addr is a valid vmalloc'ed area, and
    * copies data from that area to the given iterator. If the given memory range
@@@ -3618,12 -3574,13 +3622,12 @@@
    * (same number as @count) or %0 if [addr...addr+count) doesn't
    * include any intersection with valid vmalloc area
    */
- -long vread(char *buf, char *addr, unsigned long count)
+ +long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
   {
         struct vmap_area *va;
         struct vm_struct *vm;
- -      char *vaddr, *buf_start = buf;
- -      unsigned long buflen = count;
- -      unsigned long n, size, flags;
+ +      char *vaddr;
+ +      size_t n, size, flags, remains;
   
         addr = kasan_reset_tag(addr);
   
@@@ -3631,22 -3588,18 +3635,22 @@@
         if ((unsigned long) addr + count < count)
                 count = -(unsigned long) addr;
   
+ +      remains = count;
+ +
         spin_lock(&vmap_area_lock);
         va = find_vmap_area_exceed_addr((unsigned long)addr);
         if (!va)
- -              goto finished;
+ +              goto finished_zero;
   
         /* no intersection with a live vmap_area */
- -      if ((unsigned long)addr + count <= va->va_start)
- -              goto finished;
+ +      if ((unsigned long)addr + remains <= va->va_start)
+ +              goto finished_zero;
   
         list_for_each_entry_from(va, &vmap_area_list, list) {
- -              if (!count)
- -                      break;
+ +              size_t copied;
+ +
+ +              if (remains == 0)
+ +                      goto finished;
   
                 vm = va->vm;
                 flags = va->flags & VMAP_FLAGS_MASK;
@@@ -3661,7 -3614,6 +3665,7 @@@
   
                 if (vm && (vm->flags & VM_UNINITIALIZED))
                         continue;
+ +
                 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
                 smp_rmb();
   
@@@ -3670,45 -3622,38 +3674,45 @@@
   
                 if (addr >= vaddr + size)
                         continue;
- -              while (addr < vaddr) {
- -                      if (count == 0)
+ +
+ +              if (addr < vaddr) {
+ +                      size_t to_zero = min_t(size_t, vaddr - addr, remains);
+ +                      size_t zeroed = zero_iter(iter, to_zero);
+ +
+ +                      addr += zeroed;
+ +                      remains -= zeroed;
+ +
+ +                      if (remains == 0 || zeroed != to_zero)
                                 goto finished;
- -                      *buf = '\0';
- -                      buf++;
- -                      addr++;
- -                      count--;
                 }
+ +
                 n = vaddr + size - addr;
- -              if (n > count)
- -                      n = count;
+ +              if (n > remains)
+ +                      n = remains;
   
                 if (flags & VMAP_RAM)
- -                      vmap_ram_vread(buf, addr, n, flags);
+ +                      copied = vmap_ram_vread_iter(iter, addr, n, flags);
                 else if (!(vm->flags & VM_IOREMAP))
- -                      aligned_vread(buf, addr, n);
+ +                      copied = aligned_vread_iter(iter, addr, n);
                 else /* IOREMAP area is treated as memory hole */
- -                      memset(buf, 0, n);
- -              buf += n;
- -              addr += n;
- -              count -= n;
+ +                      copied = zero_iter(iter, n);
+ +
+ +              addr += copied;
+ +              remains -= copied;
+ +
+ +              if (copied != n)
+ +                      goto finished;
         }
- -finished:
- -      spin_unlock(&vmap_area_lock);
   
- -      if (buf == buf_start)
- -              return 0;
+ +finished_zero:
+ +      spin_unlock(&vmap_area_lock);
         /* zero-fill memory holes */
- -      if (buf != buf_start + buflen)
- -              memset(buf, 0, buflen - (buf - buf_start));
+ +      return count - remains + zero_iter(iter, remains);
+ +finished:
+ +      /* Nothing remains, or we couldn't copy/zero everything. */
+ +      spin_unlock(&vmap_area_lock);
   
- -      return buflen;
+ +      return count - remains;
   }
   
   /**
author	Andrew Morton <akpm@linux-foundation.org>
	Tue, 18 Apr 2023 21:53:49 +0000 (14:53 -0700)
committer	Andrew Morton <akpm@linux-foundation.org>
	Tue, 18 Apr 2023 21:53:49 +0000 (14:53 -0700)
		1	2
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
lib/maple_tree.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/mmap.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/page_alloc.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/vmalloc.c	patch \|	diff1 \|	diff2 \|	blob \| history