Merge remote-tracking branch 'stable/linux-5.15.y' into rpi-5.15.y

[platform/kernel/linux-rpi.git] / mm / memcontrol.c
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 6da5020..971546b 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -103,11 +103,6 @@ static bool do_memsw_account(void)
         return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
  }
  
-/* memcg and lruvec stats flushing */
-static void flush_memcg_stats_dwork(struct work_struct *w);
-static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
-static DEFINE_SPINLOCK(stats_flush_lock);
-
  #define THRESHOLDS_EVENTS_TARGET 128
  #define SOFTLIMIT_EVENTS_TARGET 1024
  
@@ -239,7 +234,7 @@ enum res_type {
              iter != NULL;                              \
              iter = mem_cgroup_iter(NULL, iter, NULL))
  
-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
  {
         return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
                 (current->flags & PF_EXITING);
@@ -259,7 +254,7 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
  }
  
  #ifdef CONFIG_MEMCG_KMEM
-extern spinlock_t css_set_lock;
+static DEFINE_SPINLOCK(objcg_lock);
  
  bool mem_cgroup_kmem_disabled(void)
  {
@@ -303,9 +298,9 @@ static void obj_cgroup_release(struct percpu_ref *ref)
         if (nr_pages)
                 obj_cgroup_uncharge_pages(objcg, nr_pages);
  
-       spin_lock_irqsave(&css_set_lock, flags);
+       spin_lock_irqsave(&objcg_lock, flags);
         list_del(&objcg->list);
-       spin_unlock_irqrestore(&css_set_lock, flags);
+       spin_unlock_irqrestore(&objcg_lock, flags);
  
         percpu_ref_exit(ref);
         kfree_rcu(objcg, rcu);
@@ -337,7 +332,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
  
         objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
  
-       spin_lock_irq(&css_set_lock);
+       spin_lock_irq(&objcg_lock);
  
         /* 1) Ready to reparent active objcg. */
         list_add(&objcg->list, &memcg->objcg_list);
@@ -347,7 +342,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
         /* 3) Move already reparented objcgs to the parent's list */
         list_splice(&memcg->objcg_list, &parent->objcg_list);
  
-       spin_unlock_irq(&css_set_lock);
+       spin_unlock_irq(&objcg_lock);
  
         percpu_ref_kill(&objcg->refcnt);
  }
@@ -635,6 +630,74 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
         return mz;
  }
  
+/*
+ * memcg and lruvec stats flushing
+ *
+ * Many codepaths leading to stats update or read are performance sensitive and
+ * adding stats flushing in such codepaths is not desirable. So, to optimize the
+ * flushing the kernel does:
+ *
+ * 1) Periodically and asynchronously flush the stats every 2 seconds to not let
+ *    rstat update tree grow unbounded.
+ *
+ * 2) Flush the stats synchronously on reader side only when there are more than
+ *    (MEMCG_CHARGE_BATCH * nr_cpus) update events. This optimization lets the
+ *    stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus), but only
+ *    for 2 seconds due to (1).
+ */
+static void flush_memcg_stats_dwork(struct work_struct *w);
+static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+static DEFINE_SPINLOCK(stats_flush_lock); /* try-locked in __mem_cgroup_flush_stats() */
+static DEFINE_PER_CPU(unsigned int, stats_updates); /* per-CPU sum of |val| not yet folded */
+static atomic_t stats_flush_threshold = ATOMIC_INIT(0); /* in MEMCG_CHARGE_BATCH units */
+static u64 flush_next_time; /* jiffies_64 value read by mem_cgroup_flush_stats_delayed() */
+
+#define FLUSH_TIME (2UL*HZ) /* 2 seconds; requeue delay of the flush work */
+
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+{ /* Notify rstat of an update on @memcg and count |val| toward the sync-flush threshold. */
+       unsigned int x;
+
+       cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+
+       x = __this_cpu_add_return(stats_updates, abs(val)); /* new per-CPU total */
+       if (x > MEMCG_CHARGE_BATCH) {
+               atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); /* whole batches only */
+               __this_cpu_write(stats_updates, 0); /* restart; sub-batch remainder is dropped */
+       }
+}
+
+static void __mem_cgroup_flush_stats(void)
+{ /* Flush rstat starting at the root memcg, unless stats_flush_lock is already held. */
+       unsigned long flag;
+
+       if (!spin_trylock_irqsave(&stats_flush_lock, flag))
+               return; /* lock is held elsewhere: skip instead of waiting */
+
+       flush_next_time = jiffies_64 + 2*FLUSH_TIME; /* deadline is two flush periods from now */
+       cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); /* flush starts at the root memcg */
+       atomic_set(&stats_flush_threshold, 0); /* pending-update count starts over */
+       spin_unlock_irqrestore(&stats_flush_lock, flag);
+}
+
+void mem_cgroup_flush_stats(void)
+{ /* Flush only when the pending-update counter exceeds the number of online CPUs. */
+       if (atomic_read(&stats_flush_threshold) > num_online_cpus())
+               __mem_cgroup_flush_stats();
+}
+
+void mem_cgroup_flush_stats_delayed(void)
+{ /* Consider a flush only once jiffies_64 has passed flush_next_time. */
+       if (time_after64(jiffies_64, flush_next_time))
+               mem_cgroup_flush_stats(); /* still subject to the threshold check */
+}
+
+static void flush_memcg_stats_dwork(struct work_struct *w)
+{ /* Work handler: flush without the threshold check, then requeue itself after FLUSH_TIME. */
+       __mem_cgroup_flush_stats();
+       queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
+}
+
  /**
   * __mod_memcg_state - update cgroup memory statistics
   * @memcg: the memory cgroup
@@ -647,7 +710,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
                 return;
  
         __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
-       cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+       memcg_rstat_updated(memcg, val);
  }
  
  /* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -675,10 +738,12 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
         memcg = pn->memcg;
  
         /* Update memcg */
-       __mod_memcg_state(memcg, idx, val);
+       __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
  
         /* Update lruvec */
         __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+
+       memcg_rstat_updated(memcg, val);
  }
  
  /**
@@ -780,7 +845,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
                 return;
  
         __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
-       cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+       memcg_rstat_updated(memcg, count);
  }
  
  static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -1414,7 +1479,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
          *
          * Current memory state:
          */
-       cgroup_rstat_flush(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
  
         for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
                 u64 size;
@@ -1575,7 +1640,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
          * A few threads which were not waiting at mutex_lock_killable() can
          * fail to bail out. Therefore, check again after holding oom_lock.
          */
-       ret = should_force_charge() || out_of_memory(&oc);
+       ret = task_is_dying() || out_of_memory(&oc);
  
  unlock:
         mutex_unlock(&oom_lock);
@@ -2530,6 +2595,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
         struct page_counter *counter;
         enum oom_status oom_status;
         unsigned long nr_reclaimed;
+       bool passed_oom = false;
         bool may_swap = true;
         bool drained = false;
         unsigned long pflags;
@@ -2565,15 +2631,6 @@ retry:
                 goto force;
  
         /*
-        * Unlike in global OOM situations, memcg is not in a physical
-        * memory shortage.  Allow dying and OOM-killed tasks to
-        * bypass the last charges so that they can exit quickly and
-        * free their memory.
-        */
-       if (unlikely(should_force_charge()))
-               goto force;
-
-       /*
          * Prevent unbounded recursion when reclaim operations need to
          * allocate memory. This might exceed the limits temporarily,
          * but we prefer facilitating memory reclaim and getting back
@@ -2630,8 +2687,9 @@ retry:
         if (gfp_mask & __GFP_RETRY_MAYFAIL)
                 goto nomem;
  
-       if (fatal_signal_pending(current))
-               goto force;
+       /* Avoid endless loop for tasks bypassed by the oom killer */
+       if (passed_oom && task_is_dying())
+               goto nomem;
  
         /*
          * keep retrying as long as the memcg oom killer is able to make
@@ -2640,14 +2698,10 @@ retry:
          */
         oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
                        get_order(nr_pages * PAGE_SIZE));
-       switch (oom_status) {
-       case OOM_SUCCESS:
+       if (oom_status == OOM_SUCCESS) {
+               passed_oom = true;
                 nr_retries = MAX_RECLAIM_RETRIES;
                 goto retry;
-       case OOM_FAILED:
-               goto force;
-       default:
-               goto nomem;
         }
  nomem:
         if (!(gfp_mask & __GFP_NOFAIL))
@@ -3518,8 +3572,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
         unsigned long val;
  
         if (mem_cgroup_is_root(memcg)) {
-               /* mem_cgroup_threshold() calls here from irqsafe context */
-               cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
+               mem_cgroup_flush_stats();
                 val = memcg_page_state(memcg, NR_FILE_PAGES) +
                         memcg_page_state(memcg, NR_ANON_MAPPED);
                 if (swap)
@@ -3900,7 +3953,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
         int nid;
         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
  
-       cgroup_rstat_flush(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
  
         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
                 seq_printf(m, "%s=%lu", stat->name,
@@ -3972,7 +4025,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
  
         BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
  
-       cgroup_rstat_flush(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
  
         for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
                 unsigned long nr;
@@ -4475,7 +4528,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
         struct mem_cgroup *parent;
  
-       cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
  
         *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
         *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -5341,21 +5394,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
         memcg_wb_domain_size_changed(memcg);
  }
  
-void mem_cgroup_flush_stats(void)
-{
-       if (!spin_trylock(&stats_flush_lock))
-               return;
-
-       cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
-       spin_unlock(&stats_flush_lock);
-}
-
-static void flush_memcg_stats_dwork(struct work_struct *w)
-{
-       mem_cgroup_flush_stats();
-       queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
-}
-
  static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -6373,7 +6411,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
         int i;
         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
  
-       cgroup_rstat_flush(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
  
         for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
                 int nid;
@@ -7077,7 +7115,7 @@ static int __init cgroup_memory(char *s)
                 if (!strcmp(token, "nokmem"))
                         cgroup_memory_nokmem = true;
         }
-       return 0;
+       return 1;
  }
  __setup("cgroup.memory=", cgroup_memory);