mm/page_alloc: convert per-cpu page list protection to spinlocks, allow remote pcp draining, and related cleanups
[platform/kernel/linux-starfive.git] / mm / page_alloc.c
index b0bcab5..d04211f 100644 (file)
@@ -126,13 +126,97 @@ typedef int __bitwise fpi_t;
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
 
-struct pagesets {
-       local_lock_t lock;
-};
-static DEFINE_PER_CPU(struct pagesets, pagesets) = {
-       .lock = INIT_LOCAL_LOCK(lock),
-};
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags)     do { } while (0)
+#define pcp_trylock_finish(flag)       do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags)     local_irq_save(flags)
+#define pcp_trylock_finish(flags)      local_irq_restore(flags)
+#endif
+
+/*
+ * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
+ * a migration causing the wrong PCP to be locked and remote memory being
+ * potentially allocated, pin the task to the CPU for the lookup+lock.
+ * preempt_disable is used on !RT because it is faster than migrate_disable.
+ * migrate_disable is used on RT because otherwise RT spinlock usage is
+ * interfered with and a high priority task cannot preempt the allocator.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define pcpu_task_pin()                preempt_disable()
+#define pcpu_task_unpin()      preempt_enable()
+#else
+#define pcpu_task_pin()                migrate_disable()
+#define pcpu_task_unpin()      migrate_enable()
+#endif
 
+/*
+ * Generic helper to look up and lock a per-cpu variable with an embedded
+ * spinlock. The return value must be passed to the matching unlock helper.
+ */
+#define pcpu_spin_lock(type, member, ptr)                              \
+({                                                                     \
+       type *_ret;                                                     \
+       pcpu_task_pin();                                                \
+       _ret = this_cpu_ptr(ptr);                                       \
+       spin_lock(&_ret->member);                                       \
+       _ret;                                                           \
+})
+
+#define pcpu_spin_lock_irqsave(type, member, ptr, flags)               \
+({                                                                     \
+       type *_ret;                                                     \
+       pcpu_task_pin();                                                \
+       _ret = this_cpu_ptr(ptr);                                       \
+       spin_lock_irqsave(&_ret->member, flags);                        \
+       _ret;                                                           \
+})
+
+#define pcpu_spin_trylock_irqsave(type, member, ptr, flags)            \
+({                                                                     \
+       type *_ret;                                                     \
+       pcpu_task_pin();                                                \
+       _ret = this_cpu_ptr(ptr);                                       \
+       if (!spin_trylock_irqsave(&_ret->member, flags)) {              \
+               pcpu_task_unpin();                                      \
+               _ret = NULL;                                            \
+       }                                                               \
+       _ret;                                                           \
+})
+
+#define pcpu_spin_unlock(member, ptr)                                  \
+({                                                                     \
+       spin_unlock(&ptr->member);                                      \
+       pcpu_task_unpin();                                              \
+})
+
+#define pcpu_spin_unlock_irqrestore(member, ptr, flags)                        \
+({                                                                     \
+       spin_unlock_irqrestore(&ptr->member, flags);                    \
+       pcpu_task_unpin();                                              \
+})
+
+/* struct per_cpu_pages specific helpers. */
+#define pcp_spin_lock(ptr)                                             \
+       pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
+
+#define pcp_spin_lock_irqsave(ptr, flags)                              \
+       pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_trylock_irqsave(ptr, flags)                           \
+       pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_unlock(ptr)                                           \
+       pcpu_spin_unlock(lock, ptr)
+
+#define pcp_spin_unlock_irqrestore(ptr, flags)                         \
+       pcpu_spin_unlock_irqrestore(lock, ptr, flags)
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
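Editor's note (not part of the patch): the macros above are dense, so the following minimal sketch shows how a lock/unlock pair is expected to be used. It assumes struct per_cpu_pages gains a spinlock member named lock, which is initialised later in this diff; the function name is hypothetical.

	/*
	 * Illustrative only: pin the task, look up this CPU's pcp, take its
	 * lock, and release in the reverse order.
	 */
	static void example_pcp_lock_pair(struct zone *zone)
	{
		struct per_cpu_pages *pcp;

		/* expands to: pcpu_task_pin(); this_cpu_ptr(); spin_lock(&pcp->lock); */
		pcp = pcp_spin_lock(zone->per_cpu_pageset);

		/* the looked-up pcp cannot change under us: the task is pinned */

		/* expands to: spin_unlock(&pcp->lock); pcpu_task_unpin(); */
		pcp_spin_unlock(pcp);
	}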
@@ -151,13 +235,7 @@ DEFINE_PER_CPU(int, _numa_mem_);           /* Kernel "local memory" node */
 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
 #endif
 
-/* work_structs for global per-cpu drains */
-struct pcpu_drain {
-       struct zone *zone;
-       struct work_struct work;
-};
 static DEFINE_MUTEX(pcpu_drain_mutex);
-static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
 
 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
 volatile unsigned long latent_entropy __latent_entropy;
@@ -356,7 +434,7 @@ static unsigned long required_kernelcore_percent __initdata;
 static unsigned long required_movablecore __initdata;
 static unsigned long required_movablecore_percent __initdata;
 static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
-static bool mirrored_kernelcore __meminitdata;
+bool mirrored_kernelcore __initdata_memblock;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -524,7 +602,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
 {
        unsigned long *bitmap;
        unsigned long bitidx, word_bitidx;
-       unsigned long old_word, word;
+       unsigned long word;
 
        BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
        BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
@@ -540,12 +618,8 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
        flags <<= bitidx;
 
        word = READ_ONCE(bitmap[word_bitidx]);
-       for (;;) {
-               old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
-               if (word == old_word)
-                       break;
-               word = old_word;
-       }
+       do {
+       } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
 }
 
 void set_pageblock_migratetype(struct page *page, int migratetype)
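Editor's note (not part of the patch): the rewrite above is correct only because try_cmpxchg() updates its "old" argument on failure, which is what lets the loop body stay empty. A sketch of that contract, written as the roughly equivalent open-coded form (helper name is illustrative):

	/* Roughly equivalent open-coded form of the try_cmpxchg() loop above. */
	static inline void example_set_bits(unsigned long *word_ptr,
					    unsigned long mask, unsigned long flags)
	{
		unsigned long old = READ_ONCE(*word_ptr);
		unsigned long cur;

		for (;;) {
			cur = cmpxchg(word_ptr, old, (old & ~mask) | flags);
			if (cur == old)		/* try_cmpxchg() returns true here */
				break;
			old = cur;		/* try_cmpxchg() does this reload for us */
		}
	}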
@@ -653,7 +727,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (order > PAGE_ALLOC_COSTLY_ORDER) {
                VM_BUG_ON(order != pageblock_order);
-               base = PAGE_ALLOC_COSTLY_ORDER + 1;
+               return NR_LOWORDER_PCP_LISTS;
        }
 #else
        VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -667,7 +741,7 @@ static inline int pindex_to_order(unsigned int pindex)
        int order = pindex / MIGRATE_PCPTYPES;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       if (order > PAGE_ALLOC_COSTLY_ORDER)
+       if (pindex == NR_LOWORDER_PCP_LISTS)
                order = pageblock_order;
 #else
        VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -744,6 +818,14 @@ void prep_compound_page(struct page *page, unsigned int order)
        prep_compound_head(page, order);
 }
 
+void destroy_large_folio(struct folio *folio)
+{
+       enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor;
+
+       VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
+       compound_page_dtors[dtor](&folio->page);
+}
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 unsigned int _debug_guardpage_minorder;
 
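Editor's note (not part of the patch): a hedged sketch of how a release path might use the new destroy_large_folio() helper. Only the dispatch through compound_page_dtors shown above comes from the patch; the caller name and the folio_test_large() check are illustrative.

	/* Hypothetical caller: run the folio's registered compound destructor. */
	static void example_free_large_folio(struct folio *folio)
	{
		if (folio_test_large(folio))
			destroy_large_folio(folio);	/* dispatches compound_page_dtors[dtor] */
	}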
@@ -785,7 +867,7 @@ static inline bool set_page_guard(struct zone *zone, struct page *page,
                return false;
 
        __SetPageGuard(page);
-       INIT_LIST_HEAD(&page->lru);
+       INIT_LIST_HEAD(&page->buddy_list);
        set_page_private(page, order);
        /* Guard pages are not available for any usage */
        __mod_zone_freepage_state(zone, -(1 << order), migratetype);
@@ -928,7 +1010,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone,
 {
        struct free_area *area = &zone->free_area[order];
 
-       list_add(&page->lru, &area->free_list[migratetype]);
+       list_add(&page->buddy_list, &area->free_list[migratetype]);
        area->nr_free++;
 }
 
@@ -938,7 +1020,7 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
 {
        struct free_area *area = &zone->free_area[order];
 
-       list_add_tail(&page->lru, &area->free_list[migratetype]);
+       list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
        area->nr_free++;
 }
 
@@ -952,7 +1034,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
 {
        struct free_area *area = &zone->free_area[order];
 
-       list_move_tail(&page->lru, &area->free_list[migratetype]);
+       list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
 }
 
 static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -962,7 +1044,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
        if (page_reported(page))
                __ClearPageReported(page);
 
-       list_del(&page->lru);
+       list_del(&page->buddy_list);
        __ClearPageBuddy(page);
        set_page_private(page, 0);
        zone->free_area[order].nr_free--;
@@ -1296,18 +1378,14 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
               PageSkipKASanPoison(page);
 }
 
-static void kernel_init_free_pages(struct page *page, int numpages)
+static void kernel_init_pages(struct page *page, int numpages)
 {
        int i;
 
        /* s390's use of memset() could override KASAN redzones. */
        kasan_disable_current();
-       for (i = 0; i < numpages; i++) {
-               u8 tag = page_kasan_tag(page + i);
-               page_kasan_tag_reset(page + i);
-               clear_highpage(page + i);
-               page_kasan_tag_set(page + i, tag);
-       }
+       for (i = 0; i < numpages; i++)
+               clear_highpage_kasan_tagged(page + i);
        kasan_enable_current();
 }
 
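Editor's note (not part of the patch): the per-page body removed above is what the new clear_highpage_kasan_tagged() helper encapsulates. A sketch reconstructed from the deleted lines; the real definition lives in include/linux/highmem.h and may differ in detail.

	/* Reconstructed from the removed loop body; illustrative only. */
	static inline void example_clear_highpage_kasan_tagged(struct page *page)
	{
		u8 tag = page_kasan_tag(page);

		page_kasan_tag_reset(page);
		clear_highpage(page);
		page_kasan_tag_set(page, tag);
	}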
@@ -1396,7 +1474,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
                        init = false;
        }
        if (init)
-               kernel_init_free_pages(page, 1 << order);
+               kernel_init_pages(page, 1 << order);
 
        /*
         * arch_free_page() can make the page's contents inaccessible.  s390
@@ -1473,10 +1551,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
        /* Ensure requested pindex is drained first. */
        pindex = pindex - 1;
 
-       /*
-        * local_lock_irq held so equivalent to spin_lock_irqsave for
-        * both PREEMPT_RT and non-PREEMPT_RT configurations.
-        */
+       /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
        spin_lock(&zone->lock);
        isolated_pageblocks = has_isolate_pageblock(zone);
 
@@ -1504,11 +1579,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                do {
                        int mt;
 
-                       page = list_last_entry(list, struct page, lru);
+                       page = list_last_entry(list, struct page, pcp_list);
                        mt = get_pcppage_migratetype(page);
 
                        /* must delete to avoid corrupting pcp list */
-                       list_del(&page->lru);
+                       list_del(&page->pcp_list);
                        count -= nr_pages;
                        pcp->count -= nr_pages;
 
@@ -2442,7 +2517,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
        }
        /* If memory is still not initialized, do it now. */
        if (init)
-               kernel_init_free_pages(page, 1 << order);
+               kernel_init_pages(page, 1 << order);
        /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
        if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
                SetPageSkipKASanPoison(page);
@@ -3045,10 +3120,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 {
        int i, allocated = 0;
 
-       /*
-        * local_lock_irq held so equivalent to spin_lock_irqsave for
-        * both PREEMPT_RT and non-PREEMPT_RT configurations.
-        */
+       /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
        spin_lock(&zone->lock);
        for (i = 0; i < count; ++i) {
                struct page *page = __rmqueue(zone, order, migratetype,
@@ -3069,7 +3141,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                 * for IO devices that can merge IO requests if the physical
                 * pages are ordered properly.
                 */
-               list_add_tail(&page->lru, list);
+               list_add_tail(&page->pcp_list, list);
                allocated++;
                if (is_migrate_cma(get_pcppage_migratetype(page)))
                        __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -3092,51 +3164,48 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  * Called from the vmstat counter updater to drain pagesets of this
  * currently executing processor on remote nodes after they have
  * expired.
- *
- * Note that this function must be called with the thread pinned to
- * a single processor.
  */
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
-       unsigned long flags;
        int to_drain, batch;
 
-       local_lock_irqsave(&pagesets.lock, flags);
        batch = READ_ONCE(pcp->batch);
        to_drain = min(pcp->count, batch);
-       if (to_drain > 0)
+       if (to_drain > 0) {
+               unsigned long flags;
+
+               /*
+                * free_pcppages_bulk expects IRQs disabled for zone->lock,
+                * so even though pcp->lock is not intended to be IRQ-safe,
+                * disabling IRQs is needed in this context.
+                */
+               spin_lock_irqsave(&pcp->lock, flags);
                free_pcppages_bulk(zone, to_drain, pcp, 0);
-       local_unlock_irqrestore(&pagesets.lock, flags);
+               spin_unlock_irqrestore(&pcp->lock, flags);
+       }
 }
 #endif
 
 /*
  * Drain pcplists of the indicated processor and zone.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
  */
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
-       unsigned long flags;
        struct per_cpu_pages *pcp;
 
-       local_lock_irqsave(&pagesets.lock, flags);
-
        pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-       if (pcp->count)
-               free_pcppages_bulk(zone, pcp->count, pcp, 0);
+       if (pcp->count) {
+               unsigned long flags;
 
-       local_unlock_irqrestore(&pagesets.lock, flags);
+               /* See drain_zone_pages on why this is disabling IRQs */
+               spin_lock_irqsave(&pcp->lock, flags);
+               free_pcppages_bulk(zone, pcp->count, pcp, 0);
+               spin_unlock_irqrestore(&pcp->lock, flags);
+       }
 }
 
 /*
  * Drain pcplists of all zones on the indicated processor.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
  */
 static void drain_pages(unsigned int cpu)
 {
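Editor's note (not part of the patch): the comments above explain the lock ordering the drain paths now rely on: pcp->lock is taken irqsave even though it is not meant to be an IRQ-safe lock, because free_pcppages_bulk() nests zone->lock, which must be taken with IRQs disabled. A condensed sketch of that ordering (function name is illustrative):

	/* Lock ordering used by the drain paths: pcp->lock (irqsave) -> zone->lock. */
	static void example_drain_one_pcp(struct zone *zone, struct per_cpu_pages *pcp)
	{
		unsigned long flags;

		spin_lock_irqsave(&pcp->lock, flags);	/* IRQs off for the nested zone->lock */
		if (pcp->count)
			free_pcppages_bulk(zone, pcp->count, pcp, 0);
		spin_unlock_irqrestore(&pcp->lock, flags);
	}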
@@ -3149,9 +3218,6 @@ static void drain_pages(unsigned int cpu)
 
 /*
  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
- *
- * The CPU has to be pinned. When zone parameter is non-NULL, spill just
- * the single zone's pages.
  */
 void drain_local_pages(struct zone *zone)
 {
@@ -3163,24 +3229,6 @@ void drain_local_pages(struct zone *zone)
                drain_pages(cpu);
 }
 
-static void drain_local_pages_wq(struct work_struct *work)
-{
-       struct pcpu_drain *drain;
-
-       drain = container_of(work, struct pcpu_drain, work);
-
-       /*
-        * drain_all_pages doesn't use proper cpu hotplug protection so
-        * we can race with cpu offline when the WQ can move this from
-        * a cpu pinned worker to an unbound one. We can operate on a different
-        * cpu which is alright but we also have to make sure to not move to
-        * a different one.
-        */
-       migrate_disable();
-       drain_local_pages(drain->zone);
-       migrate_enable();
-}
-
 /*
  * The implementation of drain_all_pages(), exposing an extra parameter to
  * drain on all cpus.
@@ -3202,13 +3250,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
        static cpumask_t cpus_with_pcps;
 
        /*
-        * Make sure nobody triggers this path before mm_percpu_wq is fully
-        * initialized.
-        */
-       if (WARN_ON_ONCE(!mm_percpu_wq))
-               return;
-
-       /*
         * Do not drain if one is already in progress unless it's specific to
         * a zone. Such callers are primarily CMA and memory hotplug and need
         * the drain to be complete when the call returns.
@@ -3257,14 +3298,11 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
        }
 
        for_each_cpu(cpu, &cpus_with_pcps) {
-               struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
-
-               drain->zone = zone;
-               INIT_WORK(&drain->work, drain_local_pages_wq);
-               queue_work_on(cpu, mm_percpu_wq, &drain->work);
+               if (zone)
+                       drain_pages_zone(cpu, zone);
+               else
+                       drain_pages(cpu);
        }
-       for_each_cpu(cpu, &cpus_with_pcps)
-               flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
 
        mutex_unlock(&pcpu_drain_mutex);
 }
@@ -3273,8 +3311,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
  *
  * When zone parameter is non-NULL, spill just the single zone's pages.
- *
- * Note that this can be extremely slow as the draining happens in a workqueue.
  */
 void drain_all_pages(struct zone *zone)
 {
@@ -3319,7 +3355,7 @@ void mark_free_pages(struct zone *zone)
 
        for_each_migratetype_order(order, t) {
                list_for_each_entry(page,
-                               &zone->free_area[order].free_list[t], lru) {
+                               &zone->free_area[order].free_list[t], buddy_list) {
                        unsigned long i;
 
                        pfn = page_to_pfn(page);
@@ -3396,19 +3432,17 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
        return min(READ_ONCE(pcp->batch) << 2, high);
 }
 
-static void free_unref_page_commit(struct page *page, int migratetype,
+static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
+                                  struct page *page, int migratetype,
                                   unsigned int order)
 {
-       struct zone *zone = page_zone(page);
-       struct per_cpu_pages *pcp;
        int high;
        int pindex;
        bool free_high;
 
        __count_vm_event(PGFREE);
-       pcp = this_cpu_ptr(zone->per_cpu_pageset);
        pindex = order_to_pindex(migratetype, order);
-       list_add(&page->lru, &pcp->lists[pindex]);
+       list_add(&page->pcp_list, &pcp->lists[pindex]);
        pcp->count += 1 << order;
 
        /*
@@ -3433,6 +3467,9 @@ static void free_unref_page_commit(struct page *page, int migratetype,
 void free_unref_page(struct page *page, unsigned int order)
 {
        unsigned long flags;
+       unsigned long __maybe_unused UP_flags;
+       struct per_cpu_pages *pcp;
+       struct zone *zone;
        unsigned long pfn = page_to_pfn(page);
        int migratetype;
 
@@ -3455,9 +3492,16 @@ void free_unref_page(struct page *page, unsigned int order)
                migratetype = MIGRATE_MOVABLE;
        }
 
-       local_lock_irqsave(&pagesets.lock, flags);
-       free_unref_page_commit(page, migratetype, order);
-       local_unlock_irqrestore(&pagesets.lock, flags);
+       zone = page_zone(page);
+       pcp_trylock_prepare(UP_flags);
+       pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+       if (pcp) {
+               free_unref_page_commit(zone, pcp, page, migratetype, order);
+               pcp_spin_unlock_irqrestore(pcp, flags);
+       } else {
+               free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
+       }
+       pcp_trylock_finish(UP_flags);
 }
 
 /*
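Editor's note (not part of the patch): the hunk above establishes the pattern this diff uses for opportunistic pcp access: bracket the trylock with pcp_trylock_prepare()/pcp_trylock_finish() (the IRQ disabling only matters on !SMP kernels, where spin_trylock always succeeds) and fall back to the buddy lists when the trylock loses. A standalone sketch of the same shape, mirroring free_unref_page(); the function name is hypothetical.

	/* Opportunistic pcp free with buddy fallback. */
	static void example_free_page_to_pcp_or_buddy(struct zone *zone, struct page *page,
						      unsigned long pfn, int migratetype)
	{
		unsigned long flags;
		unsigned long __maybe_unused UP_flags;
		struct per_cpu_pages *pcp;

		pcp_trylock_prepare(UP_flags);
		pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
		if (pcp) {
			free_unref_page_commit(zone, pcp, page, migratetype, 0);
			pcp_spin_unlock_irqrestore(pcp, flags);
		} else {
			/* pcp lock contended (e.g. a remote drain): free directly */
			free_one_page(zone, page, pfn, 0, migratetype, FPI_NONE);
		}
		pcp_trylock_finish(UP_flags);
	}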
@@ -3466,6 +3510,8 @@ void free_unref_page(struct page *page, unsigned int order)
 void free_unref_page_list(struct list_head *list)
 {
        struct page *page, *next;
+       struct per_cpu_pages *pcp = NULL;
+       struct zone *locked_zone = NULL;
        unsigned long flags;
        int batch_count = 0;
        int migratetype;
@@ -3490,8 +3536,18 @@ void free_unref_page_list(struct list_head *list)
                }
        }
 
-       local_lock_irqsave(&pagesets.lock, flags);
        list_for_each_entry_safe(page, next, list, lru) {
+               struct zone *zone = page_zone(page);
+
+               /* Different zone, different pcp lock. */
+               if (zone != locked_zone) {
+                       if (pcp)
+                               pcp_spin_unlock_irqrestore(pcp, flags);
+
+                       locked_zone = zone;
+                       pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
+               }
+
                /*
                 * Non-isolated types over MIGRATE_PCPTYPES get added
                 * to the MIGRATE_MOVABLE pcp list.
@@ -3501,19 +3557,21 @@ void free_unref_page_list(struct list_head *list)
                        migratetype = MIGRATE_MOVABLE;
 
                trace_mm_page_free_batched(page);
-               free_unref_page_commit(page, migratetype, 0);
+               free_unref_page_commit(zone, pcp, page, migratetype, 0);
 
                /*
                 * Guard against excessive IRQ disabled times when we get
                 * a large list of pages to free.
                 */
                if (++batch_count == SWAP_CLUSTER_MAX) {
-                       local_unlock_irqrestore(&pagesets.lock, flags);
+                       pcp_spin_unlock_irqrestore(pcp, flags);
                        batch_count = 0;
-                       local_lock_irqsave(&pagesets.lock, flags);
+                       pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
                }
        }
-       local_unlock_irqrestore(&pagesets.lock, flags);
+
+       if (pcp)
+               pcp_spin_unlock_irqrestore(pcp, flags);
 }
 
 /*
@@ -3638,6 +3696,43 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
 #endif
 }
 
+static __always_inline
+struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
+                          unsigned int order, unsigned int alloc_flags,
+                          int migratetype)
+{
+       struct page *page;
+       unsigned long flags;
+
+       do {
+               page = NULL;
+               spin_lock_irqsave(&zone->lock, flags);
+               /*
+                * order-0 request can reach here when the pcplist is skipped
+                * due to non-CMA allocation context. HIGHATOMIC area is
+                * reserved for high-order atomic allocation, so order-0
+                * request should skip it.
+                */
+               if (order > 0 && alloc_flags & ALLOC_HARDER)
+                       page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+               if (!page) {
+                       page = __rmqueue(zone, order, migratetype, alloc_flags);
+                       if (!page) {
+                               spin_unlock_irqrestore(&zone->lock, flags);
+                               return NULL;
+                       }
+               }
+               __mod_zone_freepage_state(zone, -(1 << order),
+                                         get_pcppage_migratetype(page));
+               spin_unlock_irqrestore(&zone->lock, flags);
+       } while (check_new_pages(page, order));
+
+       __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+       zone_statistics(preferred_zone, zone, 1);
+
+       return page;
+}
+
 /* Remove page from the per-cpu list, caller must protect the list */
 static inline
 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
@@ -3671,8 +3766,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
                                return NULL;
                }
 
-               page = list_first_entry(list, struct page, lru);
-               list_del(&page->lru);
+               page = list_first_entry(list, struct page, pcp_list);
+               list_del(&page->pcp_list);
                pcp->count -= 1 << order;
        } while (check_new_pcp(page, order));
 
@@ -3689,19 +3784,29 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
        struct list_head *list;
        struct page *page;
        unsigned long flags;
+       unsigned long __maybe_unused UP_flags;
 
-       local_lock_irqsave(&pagesets.lock, flags);
+       /*
+        * spin_trylock may fail due to a parallel drain. In the future, the
+        * trylock will also protect against IRQ reentrancy.
+        */
+       pcp_trylock_prepare(UP_flags);
+       pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+       if (!pcp) {
+               pcp_trylock_finish(UP_flags);
+               return NULL;
+       }
 
        /*
         * On allocation, reduce the number of pages that are batch freed.
         * See nr_pcp_free() where free_factor is increased for subsequent
         * frees.
         */
-       pcp = this_cpu_ptr(zone->per_cpu_pageset);
        pcp->free_factor >>= 1;
        list = &pcp->lists[order_to_pindex(migratetype, order)];
        page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
-       local_unlock_irqrestore(&pagesets.lock, flags);
+       pcp_spin_unlock_irqrestore(pcp, flags);
+       pcp_trylock_finish(UP_flags);
        if (page) {
                __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
                zone_statistics(preferred_zone, zone, 1);
@@ -3718,9 +3823,14 @@ struct page *rmqueue(struct zone *preferred_zone,
                        gfp_t gfp_flags, unsigned int alloc_flags,
                        int migratetype)
 {
-       unsigned long flags;
        struct page *page;
 
+       /*
+        * We most definitely don't want callers attempting to
+        * allocate greater than order-1 page units with __GFP_NOFAIL.
+        */
+       WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+
        if (likely(pcp_allowed_order(order))) {
                /*
                 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
@@ -3730,53 +3840,23 @@ struct page *rmqueue(struct zone *preferred_zone,
                                migratetype != MIGRATE_MOVABLE) {
                        page = rmqueue_pcplist(preferred_zone, zone, order,
                                        gfp_flags, migratetype, alloc_flags);
-                       goto out;
+                       if (likely(page))
+                               goto out;
                }
        }
 
-       /*
-        * We most definitely don't want callers attempting to
-        * allocate greater than order-1 page units with __GFP_NOFAIL.
-        */
-       WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-
-       do {
-               page = NULL;
-               spin_lock_irqsave(&zone->lock, flags);
-               /*
-                * order-0 request can reach here when the pcplist is skipped
-                * due to non-CMA allocation context. HIGHATOMIC area is
-                * reserved for high-order atomic allocation, so order-0
-                * request should skip it.
-                */
-               if (order > 0 && alloc_flags & ALLOC_HARDER)
-                       page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
-               if (!page) {
-                       page = __rmqueue(zone, order, migratetype, alloc_flags);
-                       if (!page)
-                               goto failed;
-               }
-               __mod_zone_freepage_state(zone, -(1 << order),
-                                         get_pcppage_migratetype(page));
-               spin_unlock_irqrestore(&zone->lock, flags);
-       } while (check_new_pages(page, order));
-
-       __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-       zone_statistics(preferred_zone, zone, 1);
+       page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
+                                                       migratetype);
 
 out:
        /* Separate test+clear to avoid unnecessary atomics */
-       if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+       if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
                clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
                wakeup_kswapd(zone, 0, 0, zone_idx(zone));
        }
 
        VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
        return page;
-
-failed:
-       spin_unlock_irqrestore(&zone->lock, flags);
-       return NULL;
 }
 
 #ifdef CONFIG_FAIL_PAGE_ALLOC
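Editor's note (not part of the patch): after this change a NULL return from the pcp fast path (an empty list or a lost trylock) is no longer terminal; rmqueue() falls through to rmqueue_buddy(). A compressed sketch of the resulting order-0 control flow, using the functions introduced above; the wrapper name is illustrative.

	/* A pcp miss falls back to the buddy allocator instead of failing. */
	static struct page *example_alloc_order0(struct zone *preferred, struct zone *zone,
						 gfp_t gfp, unsigned int alloc_flags,
						 int migratetype)
	{
		struct page *page;

		page = rmqueue_pcplist(preferred, zone, 0, gfp, migratetype, alloc_flags);
		if (!page)
			page = rmqueue_buddy(preferred, zone, 0, alloc_flags, migratetype);

		return page;
	}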
@@ -4095,7 +4175,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 retry:
        /*
         * Scan zonelist, looking for a zone with enough free.
-        * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
+        * See also __cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
         */
        no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
        z = ac->preferred_zoneref;
@@ -4628,6 +4708,30 @@ void fs_reclaim_release(gfp_t gfp_mask)
 EXPORT_SYMBOL_GPL(fs_reclaim_release);
 #endif
 
+/*
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
+ * have been rebuilt so the allocation can be retried. The reader side does not
+ * take the lock and simply retries the allocation if the zonelists changed.
+ * The writer side is protected by the seqlock's embedded spin_lock.
+ */
+static DEFINE_SEQLOCK(zonelist_update_seq);
+
+static unsigned int zonelist_iter_begin(void)
+{
+       if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+               return read_seqbegin(&zonelist_update_seq);
+
+       return 0;
+}
+
+static unsigned int check_retry_zonelist(unsigned int seq)
+{
+       if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+               return read_seqretry(&zonelist_update_seq, seq);
+
+       return seq;
+}
+
 /* Perform direct synchronous page reclaim */
 static unsigned long
 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -4921,6 +5025,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        int compaction_retries;
        int no_progress_loops;
        unsigned int cpuset_mems_cookie;
+       unsigned int zonelist_iter_cookie;
        int reserve_flags;
 
        /*
@@ -4931,11 +5036,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
                gfp_mask &= ~__GFP_ATOMIC;
 
-retry_cpuset:
+restart:
        compaction_retries = 0;
        no_progress_loops = 0;
        compact_priority = DEF_COMPACT_PRIORITY;
        cpuset_mems_cookie = read_mems_allowed_begin();
+       zonelist_iter_cookie = zonelist_iter_begin();
 
        /*
         * The fast path uses conservative alloc_flags to succeed only until
@@ -5107,9 +5213,13 @@ retry:
                goto retry;
 
 
-       /* Deal with possible cpuset update races before we start OOM killing */
-       if (check_retry_cpuset(cpuset_mems_cookie, ac))
-               goto retry_cpuset;
+       /*
+        * Deal with possible cpuset update races or zonelist updates to avoid
+        * an unnecessary OOM kill.
+        */
+       if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+           check_retry_zonelist(zonelist_iter_cookie))
+               goto restart;
 
        /* Reclaim has failed us, start killing things */
        page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -5129,9 +5239,13 @@ retry:
        }
 
 nopage:
-       /* Deal with possible cpuset update races before we fail */
-       if (check_retry_cpuset(cpuset_mems_cookie, ac))
-               goto retry_cpuset;
+       /*
+        * Deal with possible cpuset update races or zonelist updates to avoid
+        * an unnecessary OOM kill.
+        */
+       if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+           check_retry_zonelist(zonelist_iter_cookie))
+               goto restart;
 
        /*
         * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
@@ -5202,10 +5316,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
                        *alloc_flags |= ALLOC_CPUSET;
        }
 
-       fs_reclaim_acquire(gfp_mask);
-       fs_reclaim_release(gfp_mask);
-
-       might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+       might_alloc(gfp_mask);
 
        if (should_fail_alloc_page(gfp_mask, order))
                return false;
@@ -5253,6 +5364,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 {
        struct page *page;
        unsigned long flags;
+       unsigned long __maybe_unused UP_flags;
        struct zone *zone;
        struct zoneref *z;
        struct per_cpu_pages *pcp;
@@ -5333,11 +5445,14 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
        if (unlikely(!zone))
                goto failed;
 
+       /* Is a parallel drain in progress? */
+       pcp_trylock_prepare(UP_flags);
+       pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+       if (!pcp)
+               goto failed_irq;
+
        /* Attempt the batch allocation */
-       local_lock_irqsave(&pagesets.lock, flags);
-       pcp = this_cpu_ptr(zone->per_cpu_pageset);
        pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
-
        while (nr_populated < nr_pages) {
 
                /* Skip existing pages */
@@ -5350,8 +5465,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
                                                                pcp, pcp_list);
                if (unlikely(!page)) {
                        /* Try and allocate at least one page */
-                       if (!nr_account)
+                       if (!nr_account) {
+                               pcp_spin_unlock_irqrestore(pcp, flags);
                                goto failed_irq;
+                       }
                        break;
                }
                nr_account++;
@@ -5364,7 +5481,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
                nr_populated++;
        }
 
-       local_unlock_irqrestore(&pagesets.lock, flags);
+       pcp_spin_unlock_irqrestore(pcp, flags);
+       pcp_trylock_finish(UP_flags);
 
        __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
        zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
@@ -5373,7 +5491,7 @@ out:
        return nr_populated;
 
 failed_irq:
-       local_unlock_irqrestore(&pagesets.lock, flags);
+       pcp_trylock_finish(UP_flags);
 
 failed:
        page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
@@ -5622,6 +5740,18 @@ refill:
                /* reset page count bias and offset to start of new frag */
                nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
                offset = size - fragsz;
+               if (unlikely(offset < 0)) {
+                       /*
+                        * The caller is trying to allocate a fragment
+                        * with fragsz > PAGE_SIZE, but the cache isn't big
+                        * enough to satisfy the request; this can
+                        * happen in low memory conditions.
+                        * We don't release the cache page because
+                        * that could make memory pressure worse,
+                        * so we simply return NULL here.
+                        */
+                       return NULL;
+               }
        }
 
        nc->pagecnt_bias--;
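Editor's note (not part of the patch): with the bail-out above, page_frag_alloc() can now return NULL for a fragment larger than PAGE_SIZE when the refilled cache is only a single page, so callers making oversized requests must tolerate failure. A hedged caller-side sketch; the cache variable and request size are assumptions.

	/* Hypothetical caller: an oversized fragment request may now fail cleanly. */
	static void *example_alloc_big_frag(struct page_frag_cache *nc)
	{
		void *buf;

		buf = page_frag_alloc(nc, 2 * PAGE_SIZE, GFP_ATOMIC);
		if (!buf)
			return NULL;	/* low memory: cache refilled with a single page */

		return buf;
	}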
@@ -5804,14 +5934,14 @@ long si_mem_available(void)
 
        /*
         * Estimate the amount of memory available for userspace allocations,
-        * without causing swapping.
+        * without causing swapping or OOM.
         */
        available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
 
        /*
         * Not all the page cache can be freed, otherwise the system will
-        * start swapping. Assume at least half of the page cache, or the
-        * low watermark worth of cache, needs to stay.
+        * start swapping or thrashing. Assume at least half of the page
+        * cache, or the low watermark worth of cache, needs to stay.
         */
        pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
        pagecache -= min(pagecache / 2, wmark_low);
@@ -5939,7 +6069,7 @@ static void show_migration_types(unsigned char type)
 void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 {
        unsigned long free_pcp = 0;
-       int cpu;
+       int cpu, nid;
        struct zone *zone;
        pg_data_t *pgdat;
 
@@ -6127,7 +6257,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                printk(KERN_CONT "= %lukB\n", K(total));
        }
 
-       hugetlb_show_meminfo();
+       for_each_online_node(nid) {
+               if (show_mem_node_skip(filter, nid, nodemask))
+                       continue;
+               hugetlb_show_meminfo_node(nid);
+       }
 
        printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
 
@@ -6426,9 +6560,8 @@ static void __build_all_zonelists(void *data)
        int nid;
        int __maybe_unused cpu;
        pg_data_t *self = data;
-       static DEFINE_SPINLOCK(lock);
 
-       spin_lock(&lock);
+       write_seqlock(&zonelist_update_seq);
 
 #ifdef CONFIG_NUMA
        memset(node_load, 0, sizeof(node_load));
@@ -6465,7 +6598,7 @@ static void __build_all_zonelists(void *data)
 #endif
        }
 
-       spin_unlock(&lock);
+       write_sequnlock(&zonelist_update_seq);
 }
 
 static noinline void __init
@@ -7013,6 +7146,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
        memset(pcp, 0, sizeof(*pcp));
        memset(pzstats, 0, sizeof(*pzstats));
 
+       spin_lock_init(&pcp->lock);
        for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
                INIT_LIST_HEAD(&pcp->lists[pindex]);