diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6324020..2d8549a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -88,13 +88,6 @@ static bool cgroup_memory_nosocket __ro_after_init;
 /* Kernel memory accounting disabled? */
 static bool cgroup_memory_nokmem __ro_after_init;
 
-/* Whether the swap controller is active */
-#ifdef CONFIG_MEMCG_SWAP
-static bool cgroup_memory_noswap __ro_after_init;
-#else
-#define cgroup_memory_noswap           1
-#endif
-
 #ifdef CONFIG_CGROUP_WRITEBACK
 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
 #endif
@@ -102,7 +95,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
 /* Whether legacy memory+swap accounting is active */
 static bool do_memsw_account(void)
 {
-       return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
+       return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
 }
 
 #define THRESHOLDS_EVENTS_TARGET 128
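
With the cgroup_memory_noswap toggle removed above (and, further down, the standalone CONFIG_MEMCG_SWAP option collapsed into plain CONFIG_SWAP), do_memsw_account() reduces to a single question: are we on the legacy v1 hierarchy? On v1, swap usage is folded into the combined memory+swap (memsw) counter; on the v2 default hierarchy it is charged to the dedicated swap counter instead. A minimal userspace sketch of that decision, with invented names standing in for the cgroup internals:

/* Sketch only, not kernel code: models the simplified gate above. */
#include <stdbool.h>
#include <stdio.h>

static bool on_default_hierarchy;   /* stands in for cgroup_subsys_on_dfl(memory_cgrp_subsys) */

static bool do_memsw_account(void)
{
        return !on_default_hierarchy;   /* v1 only; the noswap knob is gone */
}

int main(void)
{
        for (int v2 = 0; v2 <= 1; v2++) {
                on_default_hierarchy = v2;
                printf("cgroup v%d: swap charged to the %s counter\n",
                       v2 ? 2 : 1,
                       do_memsw_account() ? "memsw (memory+swap)" : "swap");
        }
        return 0;
}
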
@@ -597,25 +590,18 @@ static u64 flush_next_time;
  */
 static void memcg_stats_lock(void)
 {
-#ifdef CONFIG_PREEMPT_RT
-      preempt_disable();
-#else
-      VM_BUG_ON(!irqs_disabled());
-#endif
+       preempt_disable_nested();
+       VM_WARN_ON_IRQS_ENABLED();
 }
 
 static void __memcg_stats_lock(void)
 {
-#ifdef CONFIG_PREEMPT_RT
-      preempt_disable();
-#endif
+       preempt_disable_nested();
 }
 
 static void memcg_stats_unlock(void)
 {
-#ifdef CONFIG_PREEMPT_RT
-      preempt_enable();
-#endif
+       preempt_enable_nested();
 }
 
 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
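
The open-coded CONFIG_PREEMPT_RT blocks are replaced by the preempt_disable_nested()/preempt_enable_nested() helpers, which disable preemption only on PREEMPT_RT (elsewhere these sections already run with interrupts disabled), and by VM_WARN_ON_IRQS_ENABLED(), the DEBUG_VM assertion that is compiled out on RT builds; note the check is also softened from a VM_BUG_ON to a once-only warning. Paraphrased expansions, for orientation only (the authoritative definitions live in include/linux/preempt.h and include/linux/mmdebug.h):

/* Paraphrased, not verbatim kernel code. */
static __always_inline void preempt_disable_nested(void)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
}

static __always_inline void preempt_enable_nested(void)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
}

#ifdef CONFIG_DEBUG_VM_IRQSOFF          /* DEBUG_VM && !PREEMPT_RT */
#define VM_WARN_ON_IRQS_ENABLED()       WARN_ON_ONCE(!irqs_disabled())
#else
#define VM_WARN_ON_IRQS_ENABLED()       do { } while (0)
#endif
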
@@ -669,6 +655,81 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
        queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
 }
 
+/* Subset of vm_event_item to report for memcg event stats */
+static const unsigned int memcg_vm_event_stat[] = {
+       PGPGIN,
+       PGPGOUT,
+       PGSCAN_KSWAPD,
+       PGSCAN_DIRECT,
+       PGSTEAL_KSWAPD,
+       PGSTEAL_DIRECT,
+       PGFAULT,
+       PGMAJFAULT,
+       PGREFILL,
+       PGACTIVATE,
+       PGDEACTIVATE,
+       PGLAZYFREE,
+       PGLAZYFREED,
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+       ZSWPIN,
+       ZSWPOUT,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       THP_FAULT_ALLOC,
+       THP_COLLAPSE_ALLOC,
+#endif
+};
+
+#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
+static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
+
+static void init_memcg_events(void)
+{
+       int i;
+
+       for (i = 0; i < NR_MEMCG_EVENTS; ++i)
+               mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
+}
+
+static inline int memcg_events_index(enum vm_event_item idx)
+{
+       return mem_cgroup_events_index[idx] - 1;
+}
+
+struct memcg_vmstats_percpu {
+       /* Local (CPU and cgroup) page state & events */
+       long                    state[MEMCG_NR_STAT];
+       unsigned long           events[NR_MEMCG_EVENTS];
+
+       /* Delta calculation for lockless upward propagation */
+       long                    state_prev[MEMCG_NR_STAT];
+       unsigned long           events_prev[NR_MEMCG_EVENTS];
+
+       /* Cgroup1: threshold notifications & softlimit tree updates */
+       unsigned long           nr_page_events;
+       unsigned long           targets[MEM_CGROUP_NTARGETS];
+};
+
+struct memcg_vmstats {
+       /* Aggregated (CPU and subtree) page state & events */
+       long                    state[MEMCG_NR_STAT];
+       unsigned long           events[NR_MEMCG_EVENTS];
+
+       /* Pending child counts during tree propagation */
+       long                    state_pending[MEMCG_NR_STAT];
+       unsigned long           events_pending[NR_MEMCG_EVENTS];
+};
+
+unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+       long x = READ_ONCE(memcg->vmstats->state[idx]);
+#ifdef CONFIG_SMP
+       if (x < 0)
+               x = 0;
+#endif
+       return x;
+}
+
 /**
  * __mod_memcg_state - update cgroup memory statistics
  * @memcg: the memory cgroup
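
Two structural changes land in this hunk. First, the per-memcg event arrays stop being sized by NR_VM_EVENT_ITEMS: only the events listed in memcg_vm_event_stat[] get a slot, and mem_cgroup_events_index[] maps a vm_event_item to that slot, storing index + 1 so that anything not in the table looks up as -1 and is dropped cheaply in the hot paths below. Second, the aggregated counters move out of struct mem_cgroup into a separately allocated struct memcg_vmstats, reached through the memcg->vmstats pointer. A standalone sketch of the index-compaction trick (userspace C, names invented for illustration):

/* Only events listed in tracked[] get a slot; everything else maps to -1. */
#include <stdio.h>

enum vm_event { PGPGIN, PGPGOUT, PGFAULT, PGMAJFAULT, NR_VM_EVENT_ITEMS };

static const unsigned int tracked[] = { PGFAULT, PGMAJFAULT };
#define NR_TRACKED (sizeof(tracked) / sizeof(tracked[0]))

static int event_index[NR_VM_EVENT_ITEMS];      /* zero-initialised */

static void init_events(void)
{
        for (unsigned int i = 0; i < NR_TRACKED; i++)
                event_index[tracked[i]] = i + 1; /* store index + 1 */
}

static int events_index(enum vm_event idx)
{
        return event_index[idx] - 1;             /* untracked -> -1 */
}

int main(void)
{
        init_events();
        printf("PGFAULT -> %d\n", events_index(PGFAULT));  /* 0 */
        printf("PGPGIN  -> %d\n", events_index(PGPGIN));   /* -1, not tracked */
        return 0;
}
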
@@ -715,7 +776,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
         * interrupt context while other caller need to have disabled interrupt.
         */
        __memcg_stats_lock();
-       if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
+       if (IS_ENABLED(CONFIG_DEBUG_VM)) {
                switch (idx) {
                case NR_ANON_MAPPED:
                case NR_FILE_MAPPED:
@@ -725,7 +786,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                        WARN_ON_ONCE(!in_task());
                        break;
                default:
-                       WARN_ON_ONCE(!irqs_disabled());
+                       VM_WARN_ON_IRQS_ENABLED();
                }
        }
 
@@ -816,27 +877,37 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
                          unsigned long count)
 {
-       if (mem_cgroup_disabled())
+       int index = memcg_events_index(idx);
+
+       if (mem_cgroup_disabled() || index < 0)
                return;
 
        memcg_stats_lock();
-       __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
+       __this_cpu_add(memcg->vmstats_percpu->events[index], count);
        memcg_rstat_updated(memcg, count);
        memcg_stats_unlock();
 }
 
 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
 {
-       return READ_ONCE(memcg->vmstats.events[event]);
+       int index = memcg_events_index(event);
+
+       if (index < 0)
+               return 0;
+       return READ_ONCE(memcg->vmstats->events[index]);
 }
 
 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 {
        long x = 0;
        int cpu;
+       int index = memcg_events_index(event);
+
+       if (index < 0)
+               return 0;
 
        for_each_possible_cpu(cpu)
-               x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
+               x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
        return x;
 }
 
@@ -1401,6 +1472,7 @@ static const struct memory_stat memory_stats[] = {
        { "kernel",                     MEMCG_KMEM                      },
        { "kernel_stack",               NR_KERNEL_STACK_KB              },
        { "pagetables",                 NR_PAGETABLE                    },
+       { "sec_pagetables",             NR_SECONDARY_PAGETABLE          },
        { "percpu",                     MEMCG_PERCPU_B                  },
        { "sock",                       MEMCG_SOCK                      },
        { "vmalloc",                    MEMCG_VMALLOC                   },
@@ -1467,29 +1539,6 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
        return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
 }
 
-/* Subset of vm_event_item to report for memcg event stats */
-static const unsigned int memcg_vm_event_stat[] = {
-       PGSCAN_KSWAPD,
-       PGSCAN_DIRECT,
-       PGSTEAL_KSWAPD,
-       PGSTEAL_DIRECT,
-       PGFAULT,
-       PGMAJFAULT,
-       PGREFILL,
-       PGACTIVATE,
-       PGDEACTIVATE,
-       PGLAZYFREE,
-       PGLAZYFREED,
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
-       ZSWPIN,
-       ZSWPOUT,
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       THP_FAULT_ALLOC,
-       THP_COLLAPSE_ALLOC,
-#endif
-};
-
 static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
 {
        struct seq_buf s;
@@ -1530,10 +1579,15 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
                       memcg_events(memcg, PGSTEAL_KSWAPD) +
                       memcg_events(memcg, PGSTEAL_DIRECT));
 
-       for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++)
+       for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
+               if (memcg_vm_event_stat[i] == PGPGIN ||
+                   memcg_vm_event_stat[i] == PGPGOUT)
+                       continue;
+
                seq_buf_printf(&s, "%s %lu\n",
                               vm_event_name(memcg_vm_event_stat[i]),
                               memcg_events(memcg, memcg_vm_event_stat[i]));
+       }
 
        /* The above should easily fit into one page */
        WARN_ON_ONCE(seq_buf_has_overflowed(&s));
@@ -1607,17 +1661,17 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
 {
        unsigned long max = READ_ONCE(memcg->memory.max);
 
-       if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
-               if (mem_cgroup_swappiness(memcg))
-                       max += min(READ_ONCE(memcg->swap.max),
-                                  (unsigned long)total_swap_pages);
-       } else { /* v1 */
+       if (do_memsw_account()) {
                if (mem_cgroup_swappiness(memcg)) {
                        /* Calculate swap excess capacity from memsw limit */
                        unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
 
                        max += min(swap, (unsigned long)total_swap_pages);
                }
+       } else {
+               if (mem_cgroup_swappiness(memcg))
+                       max += min(READ_ONCE(memcg->swap.max),
+                                  (unsigned long)total_swap_pages);
        }
        return max;
 }
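
mem_cgroup_get_max() now keys the two formulas off do_memsw_account() rather than cgroup_subsys_on_dfl(), putting the v1 case first: on v1 the swap headroom is whatever the memsw (memory+swap) limit allows beyond the memory limit, on v2 it is the independent swap limit, and both are clamped to total_swap_pages. The arithmetic with made-up numbers, as a quick sanity check (plain userspace C, not kernel code):

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long memory_max = 1000, total_swap_pages = 300;

        /* cgroup v1: memsw.max caps memory + swap together */
        unsigned long memsw_max = 1200;
        unsigned long v1 = memory_max +
                min_ul(memsw_max - memory_max, total_swap_pages);

        /* cgroup v2: swap.max is a separate counter */
        unsigned long swap_max = 500;
        unsigned long v2 = memory_max + min_ul(swap_max, total_swap_pages);

        printf("v1 max: %lu, v2 max: %lu\n", v1, v2);   /* 1200 and 1300 */
        return 0;
}
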
@@ -3363,7 +3417,7 @@ void split_page_memcg(struct page *head, unsigned int nr)
                css_get_many(&memcg->css, nr - 1);
 }
 
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_SWAP
 /**
  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
  * @entry: swap entry to be moved
@@ -5116,8 +5170,8 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
        struct mem_cgroup *memcg;
 
        cgrp = cgroup_get_from_id(ino);
-       if (!cgrp)
-               return ERR_PTR(-ENOENT);
+       if (IS_ERR(cgrp))
+               return ERR_CAST(cgrp);
 
        css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
        if (css)
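
cgroup_get_from_id() now returns ERR_PTR()-encoded errors instead of NULL, so the lookup above tests IS_ERR() and forwards the original error with ERR_CAST() rather than inventing -ENOENT. For readers less familiar with the convention, a minimal userspace model (simplified; the real helpers live in include/linux/err.h):

/* Errors travel as pointer values near the top of the address space, so one
 * return slot can carry either a valid object or a -errno. */
#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)     { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline void *ERR_CAST(const void *p) { return (void *)p; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int the_answer = 42;

static void *lookup(int id)                 /* stand-in for cgroup_get_from_id() */
{
        return id == 1 ? (void *)&the_answer : ERR_PTR(-ENOENT);
}

int main(void)
{
        void *p = lookup(7);
        if (IS_ERR(p))
                printf("lookup failed: %ld\n", PTR_ERR(p)); /* prints -2 */
        return 0;
}
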
@@ -5170,6 +5224,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
        for_each_node(node)
                free_mem_cgroup_per_node_info(memcg, node);
+       kfree(memcg->vmstats);
        free_percpu(memcg->vmstats_percpu);
        kfree(memcg);
 }
@@ -5199,6 +5254,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
                goto fail;
        }
 
+       memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL);
+       if (!memcg->vmstats)
+               goto fail;
+
        memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
                                                 GFP_KERNEL_ACCOUNT);
        if (!memcg->vmstats_percpu)
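
mem_cgroup_alloc() gains a second allocation for the struct memcg_vmstats introduced earlier; on failure it takes the same fail path as the per-CPU allocation, and the matching kfree() added to __mem_cgroup_free() above is safe even when the pointer was never set, because kfree(NULL) is a no-op. A userspace analogue of this goto-fail cleanup pattern (illustrative only, invented names):

#include <stdlib.h>

struct thing {
        int *stats;
        int *percpu;
};

static void thing_free(struct thing *t)
{
        if (!t)
                return;
        free(t->stats);     /* may be NULL if allocation never happened */
        free(t->percpu);
        free(t);
}

static struct thing *thing_alloc(void)
{
        struct thing *t = calloc(1, sizeof(*t));

        if (!t)
                goto fail;
        t->stats = calloc(64, sizeof(int));
        if (!t->stats)
                goto fail;
        t->percpu = calloc(64, sizeof(int));
        if (!t->percpu)
                goto fail;
        return t;
fail:
        thing_free(t);      /* handles the partially constructed object */
        return NULL;
}
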
@@ -5270,6 +5329,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
                page_counter_init(&memcg->kmem, &parent->kmem);
                page_counter_init(&memcg->tcpmem, &parent->tcpmem);
        } else {
+               init_memcg_events();
                page_counter_init(&memcg->memory, NULL);
                page_counter_init(&memcg->swap, NULL);
                page_counter_init(&memcg->kmem, NULL);
@@ -5418,9 +5478,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
                 * below us. We're in a per-cpu loop here and this is
                 * a global counter, so the first cycle will get them.
                 */
-               delta = memcg->vmstats.state_pending[i];
+               delta = memcg->vmstats->state_pending[i];
                if (delta)
-                       memcg->vmstats.state_pending[i] = 0;
+                       memcg->vmstats->state_pending[i] = 0;
 
                /* Add CPU changes on this level since the last flush */
                v = READ_ONCE(statc->state[i]);
@@ -5433,15 +5493,15 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
                        continue;
 
                /* Aggregate counts on this level and propagate upwards */
-               memcg->vmstats.state[i] += delta;
+               memcg->vmstats->state[i] += delta;
                if (parent)
-                       parent->vmstats.state_pending[i] += delta;
+                       parent->vmstats->state_pending[i] += delta;
        }
 
-       for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
-               delta = memcg->vmstats.events_pending[i];
+       for (i = 0; i < NR_MEMCG_EVENTS; i++) {
+               delta = memcg->vmstats->events_pending[i];
                if (delta)
-                       memcg->vmstats.events_pending[i] = 0;
+                       memcg->vmstats->events_pending[i] = 0;
 
                v = READ_ONCE(statc->events[i]);
                if (v != statc->events_prev[i]) {
@@ -5452,9 +5512,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
                if (!delta)
                        continue;
 
-               memcg->vmstats.events[i] += delta;
+               memcg->vmstats->events[i] += delta;
                if (parent)
-                       parent->vmstats.events_pending[i] += delta;
+                       parent->vmstats->events_pending[i] += delta;
        }
 
        for_each_node_state(nid, N_MEMORY) {
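
The flush loops now go through the memcg->vmstats pointer, and the event loop iterates only NR_MEMCG_EVENTS entries, but the propagation scheme itself is unchanged: each level snapshots the per-CPU counter it saw at the last flush (*_prev), turns the difference into a delta, folds the delta into its own aggregate, and queues the same delta at the parent (*_pending) to be picked up one level higher. A toy single-CPU model of that bookkeeping (userspace C, field names loosely mirror the kernel ones):

#include <stdio.h>

struct node {
        unsigned long events;          /* aggregated subtree count */
        unsigned long events_pending;  /* deltas queued by children */
        unsigned long percpu_events;   /* this node's own counter */
        unsigned long percpu_prev;     /* snapshot from the last flush */
        struct node *parent;
};

static void flush_one(struct node *n)
{
        unsigned long delta = n->events_pending;

        n->events_pending = 0;

        /* add local changes since the previous flush */
        unsigned long v = n->percpu_events;
        delta += v - n->percpu_prev;
        n->percpu_prev = v;

        if (!delta)
                return;
        n->events += delta;
        if (n->parent)
                n->parent->events_pending += delta;
}

int main(void)
{
        struct node root = { 0 }, child = { .parent = &root };

        child.percpu_events = 5;       /* child saw 5 events */
        flush_one(&child);             /* queues a delta of 5 at the root */
        flush_one(&root);              /* root folds the pending delta in */
        printf("child=%lu root=%lu\n", child.events, root.events); /* 5 5 */
        return 0;
}
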
@@ -7230,7 +7290,7 @@ static int __init mem_cgroup_init(void)
 }
 subsys_initcall(mem_cgroup_init);
 
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_SWAP
 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
 {
        while (!refcount_inc_not_zero(&memcg->id.ref)) {
@@ -7268,7 +7328,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
        if (mem_cgroup_disabled())
                return;
 
-       if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+       if (!do_memsw_account())
                return;
 
        memcg = folio_memcg(folio);
@@ -7297,7 +7357,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
        if (!mem_cgroup_is_root(memcg))
                page_counter_uncharge(&memcg->memory, nr_entries);
 
-       if (!cgroup_memory_noswap && memcg != swap_memcg) {
+       if (memcg != swap_memcg) {
                if (!mem_cgroup_is_root(swap_memcg))
                        page_counter_charge(&swap_memcg->memsw, nr_entries);
                page_counter_uncharge(&memcg->memsw, nr_entries);
@@ -7333,7 +7393,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
        struct mem_cgroup *memcg;
        unsigned short oldid;
 
-       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+       if (do_memsw_account())
                return 0;
 
        memcg = folio_memcg(folio);
@@ -7349,7 +7409,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
 
        memcg = mem_cgroup_id_get_online(memcg);
 
-       if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
+       if (!mem_cgroup_is_root(memcg) &&
            !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
                memcg_memory_event(memcg, MEMCG_SWAP_MAX);
                memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
@@ -7377,15 +7437,18 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
        struct mem_cgroup *memcg;
        unsigned short id;
 
+       if (mem_cgroup_disabled())
+               return;
+
        id = swap_cgroup_record(entry, 0, nr_pages);
        rcu_read_lock();
        memcg = mem_cgroup_from_id(id);
        if (memcg) {
-               if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
-                       if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
-                               page_counter_uncharge(&memcg->swap, nr_pages);
-                       else
+               if (!mem_cgroup_is_root(memcg)) {
+                       if (do_memsw_account())
                                page_counter_uncharge(&memcg->memsw, nr_pages);
+                       else
+                               page_counter_uncharge(&memcg->swap, nr_pages);
                }
                mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
                mem_cgroup_id_put_many(memcg, nr_pages);
@@ -7397,7 +7460,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
 {
        long nr_swap_pages = get_nr_swap_pages();
 
-       if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+       if (mem_cgroup_disabled() || do_memsw_account())
                return nr_swap_pages;
        for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
                nr_swap_pages = min_t(long, nr_swap_pages,
@@ -7414,7 +7477,7 @@ bool mem_cgroup_swap_full(struct folio *folio)
 
        if (vm_swap_full())
                return true;
-       if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+       if (do_memsw_account())
                return false;
 
        memcg = folio_memcg(folio);
@@ -7434,10 +7497,9 @@ bool mem_cgroup_swap_full(struct folio *folio)
 
 static int __init setup_swap_account(char *s)
 {
-       if (!strcmp(s, "1"))
-               cgroup_memory_noswap = false;
-       else if (!strcmp(s, "0"))
-               cgroup_memory_noswap = true;
+       pr_warn_once("The swapaccount= commandline option is deprecated. "
+                    "Please report your usecase to linux-mm@kvack.org if you "
+                    "depend on this functionality.\n");
        return 1;
 }
 __setup("swapaccount=", setup_swap_account);
@@ -7706,20 +7768,9 @@ static struct cftype zswap_files[] = {
 };
 #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */
 
-/*
- * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
- * instead of a core_initcall(), this could mean cgroup_memory_noswap still
- * remains set to false even when memcg is disabled via "cgroup_disable=memory"
- * boot parameter. This may result in premature OOPS inside
- * mem_cgroup_get_nr_swap_pages() function in corner cases.
- */
 static int __init mem_cgroup_swap_init(void)
 {
-       /* No memory control -> no swap control */
        if (mem_cgroup_disabled())
-               cgroup_memory_noswap = true;
-
-       if (cgroup_memory_noswap)
                return 0;
 
        WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
@@ -7729,6 +7780,6 @@ static int __init mem_cgroup_swap_init(void)
 #endif
        return 0;
 }
-core_initcall(mem_cgroup_swap_init);
+subsys_initcall(mem_cgroup_swap_init);
 
-#endif /* CONFIG_MEMCG_SWAP */
+#endif /* CONFIG_SWAP */
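
Finally, with the noswap bookkeeping gone there is no longer a reason for mem_cgroup_swap_init() to run early enough to observe cgroup_disable=memory (the situation the deleted comment warned about), so it moves to the ordinary subsys_initcall() level. For reference, initcall levels run in ascending order; abridged from include/linux/init.h (sync variants omitted):

/* Abridged; see include/linux/init.h for the full list. */
#define core_initcall(fn)       __define_initcall(fn, 1)
#define postcore_initcall(fn)   __define_initcall(fn, 2)
#define arch_initcall(fn)       __define_initcall(fn, 3)
#define subsys_initcall(fn)     __define_initcall(fn, 4)
#define fs_initcall(fn)         __define_initcall(fn, 5)
#define device_initcall(fn)     __define_initcall(fn, 6)
#define late_initcall(fn)       __define_initcall(fn, 7)
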