patch-5.15.79-rt54.patch
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8cdeb33..6479209 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -650,6 +650,38 @@ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
 static DEFINE_SPINLOCK(stats_flush_lock);
 static DEFINE_PER_CPU(unsigned int, stats_updates);
 static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+static u64 flush_next_time;
+
+#define FLUSH_TIME (2UL*HZ)
+
+/*
+ * Accessors to ensure that preemption is disabled on PREEMPT_RT, because the
+ * per-CPU counters cannot rely on an acquired spinlock_t to disable it there.
+ * These functions are never used in hardirq context on PREEMPT_RT, so
+ * disabling preemption is sufficient.
+ */
+static void memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+      preempt_disable();
+#else
+      VM_BUG_ON(!irqs_disabled());
+#endif
+}
+
+static void __memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+      preempt_disable();
+#endif
+}
+
+static void memcg_stats_unlock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+      preempt_enable();
+#endif
+}
 
 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 {
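These helpers encode the split described in the comment above: on !PREEMPT_RT the update paths already run with interrupts disabled, so the lock side only asserts that; on PREEMPT_RT no updater runs in hardirq context, so pinning the task with preempt_disable() is enough to make the __this_cpu operations safe. A minimal sketch of the intended usage (the counter and function names are placeholders, not part of this patch):

static DEFINE_PER_CPU(unsigned int, example_counter);

static void example_update(unsigned int val)
{
	/* !RT: caller has IRQs off (asserted); RT: preemption is disabled here */
	memcg_stats_lock();
	__this_cpu_add(example_counter, val);
	memcg_stats_unlock();
}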
@@ -671,6 +703,7 @@ static void __mem_cgroup_flush_stats(void)
        if (!spin_trylock_irqsave(&stats_flush_lock, flag))
                return;
 
+       flush_next_time = jiffies_64 + 2*FLUSH_TIME;
        cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
        atomic_set(&stats_flush_threshold, 0);
        spin_unlock_irqrestore(&stats_flush_lock, flag);
@@ -682,10 +715,16 @@ void mem_cgroup_flush_stats(void)
                __mem_cgroup_flush_stats();
 }
 
+void mem_cgroup_flush_stats_delayed(void)
+{
+       if (time_after64(jiffies_64, flush_next_time))
+               mem_cgroup_flush_stats();
+}
+
 static void flush_memcg_stats_dwork(struct work_struct *w)
 {
        __mem_cgroup_flush_stats();
-       queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+       queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
 }
 
 /**
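Together, flush_next_time and FLUSH_TIME turn stats flushing into a time-based ratelimit: the worker reschedules itself every FLUSH_TIME, and each flush pushes flush_next_time two periods out, so mem_cgroup_flush_stats_delayed() only forces a synchronous flush when the periodic flush has fallen at least one period behind. A hedged sketch of a hot-path caller that can tolerate slightly stale statistics (the function name is a placeholder):

static void example_hot_path(void)
{
	/* Flush synchronously only if the periodic worker is running late. */
	mem_cgroup_flush_stats_delayed();

	/* ... read memcg statistics that may be up to ~2*FLUSH_TIME stale ... */
}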
@@ -727,6 +766,27 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        memcg = pn->memcg;
 
+       /*
+        * Callers from rmap rely on disabled preemption because they never
+        * update their counters from interrupt context. For these counters we
+        * check that the update is never performed from an interrupt context,
+        * while all other callers need to have interrupts disabled.
+        */
+       __memcg_stats_lock();
+       if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
+               switch (idx) {
+               case NR_ANON_MAPPED:
+               case NR_FILE_MAPPED:
+               case NR_ANON_THPS:
+               case NR_SHMEM_PMDMAPPED:
+               case NR_FILE_PMDMAPPED:
+                       WARN_ON_ONCE(!in_task());
+                       break;
+               default:
+                       WARN_ON_ONCE(!irqs_disabled());
+               }
+       }
+
        /* Update memcg */
        __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 
@@ -734,6 +794,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
        __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
 
        memcg_rstat_updated(memcg, val);
+       memcg_stats_unlock();
 }
 
 /**
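A condensed sketch of the two calling conventions that the new debug check distinguishes; real callers reach this function through wrappers such as __mod_lruvec_page_state(), and both function names below are placeholders:

static void example_rmap_update(struct lruvec *lruvec)
{
	/*
	 * The mapped/THP counters are only updated from task context;
	 * the caller typically holds a spinlock such as the page table
	 * lock, which keeps preemption off on !RT, while on RT
	 * __memcg_stats_lock() supplies the preempt_disable().
	 */
	__mod_memcg_lruvec_state(lruvec, NR_ANON_MAPPED, 1);
}

static void example_generic_update(struct lruvec *lruvec)
{
	unsigned long flags;

	/* On !PREEMPT_RT every other counter must be updated with IRQs off. */
	local_irq_save(flags);
	__mod_memcg_lruvec_state(lruvec, NR_FILE_DIRTY, 1);
	local_irq_restore(flags);
}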
@@ -834,8 +895,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
        if (mem_cgroup_disabled())
                return;
 
+       memcg_stats_lock();
        __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
        memcg_rstat_updated(memcg, count);
+       memcg_stats_unlock();
 }
 
 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -899,6 +962,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
  */
 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 {
+       if (IS_ENABLED(CONFIG_PREEMPT_RT))
+               return;
+
        /* threshold event is triggered in finer grain than soft limit */
        if (unlikely(mem_cgroup_event_ratelimit(memcg,
                                                MEM_CGROUP_TARGET_THRESH))) {
@@ -2092,39 +2158,37 @@ void unlock_page_memcg(struct page *page)
 }
 EXPORT_SYMBOL(unlock_page_memcg);
 
-struct obj_stock {
+struct memcg_stock_pcp {
+       local_lock_t stock_lock;
+       struct mem_cgroup *cached; /* this never be root cgroup */
+       unsigned int nr_pages;
+
 #ifdef CONFIG_MEMCG_KMEM
        struct obj_cgroup *cached_objcg;
        struct pglist_data *cached_pgdat;
        unsigned int nr_bytes;
        int nr_slab_reclaimable_b;
        int nr_slab_unreclaimable_b;
-#else
-       int dummy[0];
 #endif
-};
-
-struct memcg_stock_pcp {
-       struct mem_cgroup *cached; /* this never be root cgroup */
-       unsigned int nr_pages;
-       struct obj_stock task_obj;
-       struct obj_stock irq_obj;
 
        struct work_struct work;
        unsigned long flags;
 #define FLUSHING_CACHED_CHARGE 0
 };
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
+       .stock_lock = INIT_LOCAL_LOCK(stock_lock),
+};
 static DEFINE_MUTEX(percpu_charge_mutex);
 
 #ifdef CONFIG_MEMCG_KMEM
-static void drain_obj_stock(struct obj_stock *stock);
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
                                     struct mem_cgroup *root_memcg);
 
 #else
-static inline void drain_obj_stock(struct obj_stock *stock)
+static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
 {
+       return NULL;
 }
 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
                                     struct mem_cgroup *root_memcg)
@@ -2134,41 +2198,6 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
 #endif
 
 /*
- * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
- * sequence used in this case to access content from object stock is slow.
- * To optimize for user context access, there are now two object stocks for
- * task context and interrupt context access respectively.
- *
- * The task context object stock can be accessed by disabling preemption only
- * which is cheap in non-preempt kernel. The interrupt context object stock
- * can only be accessed after disabling interrupt. User context code can
- * access interrupt object stock, but not vice versa.
- */
-static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
-{
-       struct memcg_stock_pcp *stock;
-
-       if (likely(in_task())) {
-               *pflags = 0UL;
-               preempt_disable();
-               stock = this_cpu_ptr(&memcg_stock);
-               return &stock->task_obj;
-       }
-
-       local_irq_save(*pflags);
-       stock = this_cpu_ptr(&memcg_stock);
-       return &stock->irq_obj;
-}
-
-static inline void put_obj_stock(unsigned long flags)
-{
-       if (likely(in_task()))
-               preempt_enable();
-       else
-               local_irq_restore(flags);
-}
-
-/**
  * consume_stock: Try to consume stocked charge on this cpu.
  * @memcg: memcg to consume from.
  * @nr_pages: how many pages to charge.
@@ -2188,7 +2217,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
        if (nr_pages > MEMCG_CHARGE_BATCH)
                return ret;
 
-       local_irq_save(flags);
+       local_lock_irqsave(&memcg_stock.stock_lock, flags);
 
        stock = this_cpu_ptr(&memcg_stock);
        if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
@@ -2196,7 +2225,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
                ret = true;
        }
 
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 
        return ret;
 }
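This conversion from local_irq_save() to a local_lock_t is the core of the change: on !PREEMPT_RT, local_lock_irqsave() still disables interrupts (plus lockdep annotation), so behaviour is unchanged, while on PREEMPT_RT it acquires a per-CPU spinlock and the section stays preemptible, which is what allows the stock to be touched without disabling interrupts. Because a single lock now serializes all per-CPU stock accesses, the separate task_obj/irq_obj object stocks removed above are folded back into memcg_stock_pcp. A minimal, self-contained sketch of the pattern (all names are placeholders):

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct example_pcp {
	local_lock_t lock;
	unsigned int count;
};

static DEFINE_PER_CPU(struct example_pcp, example_stock) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void example_add(unsigned int val)
{
	struct example_pcp *p;
	unsigned long flags;

	/* !RT: IRQs off (+ lockdep); RT: per-CPU spinlock, stays preemptible */
	local_lock_irqsave(&example_stock.lock, flags);
	p = this_cpu_ptr(&example_stock);
	p->count += val;
	local_unlock_irqrestore(&example_stock.lock, flags);
}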
@@ -2225,6 +2254,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
 static void drain_local_stock(struct work_struct *dummy)
 {
        struct memcg_stock_pcp *stock;
+       struct obj_cgroup *old = NULL;
        unsigned long flags;
 
        /*
@@ -2232,28 +2262,25 @@ static void drain_local_stock(struct work_struct *dummy)
         * drain_stock races is that we always operate on local CPU stock
         * here with IRQ disabled
         */
-       local_irq_save(flags);
+       local_lock_irqsave(&memcg_stock.stock_lock, flags);
 
        stock = this_cpu_ptr(&memcg_stock);
-       drain_obj_stock(&stock->irq_obj);
-       if (in_task())
-               drain_obj_stock(&stock->task_obj);
+       old = drain_obj_stock(stock);
        drain_stock(stock);
        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+       if (old)
+               obj_cgroup_put(old);
 }
 
 /*
  * Cache charges(val) to local per_cpu area.
  * This will be consumed by consume_stock() function, later.
  */
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
        struct memcg_stock_pcp *stock;
-       unsigned long flags;
-
-       local_irq_save(flags);
 
        stock = this_cpu_ptr(&memcg_stock);
        if (stock->cached != memcg) { /* reset if necessary */
@@ -2265,8 +2292,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 
        if (stock->nr_pages > MEMCG_CHARGE_BATCH)
                drain_stock(stock);
+}
 
-       local_irq_restore(flags);
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+       unsigned long flags;
+
+       local_lock_irqsave(&memcg_stock.stock_lock, flags);
+       __refill_stock(memcg, nr_pages);
+       local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 }
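refill_stock() is split into a lock-taking wrapper and __refill_stock(), which expects stock_lock to be held by the caller; the latter is needed because drain_obj_stock() (further down) now refills the page stock while the lock is already held. A small sketch of this wrapper/locked-helper convention, with placeholder names:

/* Caller must hold memcg_stock.stock_lock. */
static void __example_refill(unsigned int nr)
{
	this_cpu_add(memcg_stock.nr_pages, nr);
}

static void example_refill(unsigned int nr)
{
	unsigned long flags;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);
	__example_refill(nr);
	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}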
 
 /*
@@ -2286,7 +2320,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
         * as well as workers from this path always operate on the local
         * per-cpu data. CPU up doesn't touch memcg_stock at all.
         */
-       curcpu = get_cpu();
+       migrate_disable();
+       curcpu = smp_processor_id();
        for_each_online_cpu(cpu) {
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
                struct mem_cgroup *memcg;
@@ -2309,7 +2344,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
                                schedule_work_on(cpu, &stock->work);
                }
        }
-       put_cpu();
+       migrate_enable();
        mutex_unlock(&percpu_charge_mutex);
 }
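get_cpu() would disable preemption across the loop, but on PREEMPT_RT the cpu == curcpu case (in the part of drain_all_stock() not shown in this hunk) calls drain_local_stock() directly, which now acquires stock_lock, a sleeping lock on RT. migrate_disable() only pins the task to its current CPU, which is all the smp_processor_id() comparison needs, while leaving preemption enabled. An illustrative sketch of the difference (the function is a placeholder):

static void example_for_each_cpu(void)
{
	int curcpu, cpu;

	/*
	 * migrate_disable() pins the task to its CPU but leaves preemption
	 * enabled, so the loop body may still take sleeping locks on
	 * PREEMPT_RT (a get_cpu()/put_cpu() section may not).
	 */
	migrate_disable();
	curcpu = smp_processor_id();
	for_each_online_cpu(cpu) {
		if (cpu == curcpu) {
			/* operate on the local per-CPU data directly */
		} else {
			/* defer to a worker bound to the remote CPU */
		}
	}
	migrate_enable();
}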
 
@@ -3074,17 +3109,21 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
 void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
                     enum node_stat_item idx, int nr)
 {
+       struct memcg_stock_pcp *stock;
+       struct obj_cgroup *old = NULL;
        unsigned long flags;
-       struct obj_stock *stock = get_obj_stock(&flags);
        int *bytes;
 
+       local_lock_irqsave(&memcg_stock.stock_lock, flags);
+       stock = this_cpu_ptr(&memcg_stock);
+
        /*
         * Save vmstat data in stock and skip vmstat array update unless
         * accumulating over a page of vmstat data or when pgdat or idx
         * changes.
         */
        if (stock->cached_objcg != objcg) {
-               drain_obj_stock(stock);
+               old = drain_obj_stock(stock);
                obj_cgroup_get(objcg);
                stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
                                ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
@@ -3128,38 +3167,53 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
        if (nr)
                mod_objcg_mlstate(objcg, pgdat, idx, nr);
 
-       put_obj_stock(flags);
+       local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+       if (old)
+               obj_cgroup_put(old);
 }
 
 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
 {
+       struct memcg_stock_pcp *stock;
        unsigned long flags;
-       struct obj_stock *stock = get_obj_stock(&flags);
        bool ret = false;
 
+       local_lock_irqsave(&memcg_stock.stock_lock, flags);
+
+       stock = this_cpu_ptr(&memcg_stock);
        if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
                stock->nr_bytes -= nr_bytes;
                ret = true;
        }
 
-       put_obj_stock(flags);
+       local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 
        return ret;
 }
 
-static void drain_obj_stock(struct obj_stock *stock)
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
 {
        struct obj_cgroup *old = stock->cached_objcg;
 
        if (!old)
-               return;
+               return NULL;
 
        if (stock->nr_bytes) {
                unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
                unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
 
-               if (nr_pages)
-                       obj_cgroup_uncharge_pages(old, nr_pages);
+               if (nr_pages) {
+                       struct mem_cgroup *memcg;
+
+                       memcg = get_mem_cgroup_from_objcg(old);
+
+                       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+                               page_counter_uncharge(&memcg->kmem, nr_pages);
+
+                       __refill_stock(memcg, nr_pages);
+
+                       css_put(&memcg->css);
+               }
 
                /*
                 * The leftover is flushed to the centralized per-memcg value.
@@ -3194,8 +3248,12 @@ static void drain_obj_stock(struct obj_stock *stock)
                stock->cached_pgdat = NULL;
        }
 
-       obj_cgroup_put(old);
        stock->cached_objcg = NULL;
+       /*
+        * The `old' objcg needs to be released by the caller via
+        * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
+        */
+       return old;
 }
 
 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
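Every caller of drain_obj_stock() now follows the same shape: collect the displaced objcg reference while holding stock_lock, then drop it with obj_cgroup_put() only after local_unlock_irqrestore(), as the comment above requires. A condensed sketch of that pattern (the function name is a placeholder):

static void example_switch_cached_objcg(struct obj_cgroup *objcg)
{
	struct memcg_stock_pcp *stock;
	struct obj_cgroup *old = NULL;
	unsigned long flags;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);
	stock = this_cpu_ptr(&memcg_stock);
	if (stock->cached_objcg != objcg)
		old = drain_obj_stock(stock);	/* reference handed back */
	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);

	if (old)
		obj_cgroup_put(old);		/* dropped outside stock_lock */
}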
@@ -3203,13 +3261,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
 {
        struct mem_cgroup *memcg;
 
-       if (in_task() && stock->task_obj.cached_objcg) {
-               memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
-               if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
-                       return true;
-       }
-       if (stock->irq_obj.cached_objcg) {
-               memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
+       if (stock->cached_objcg) {
+               memcg = obj_cgroup_memcg(stock->cached_objcg);
                if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
                        return true;
        }
@@ -3220,12 +3273,16 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
                             bool allow_uncharge)
 {
+       struct memcg_stock_pcp *stock;
+       struct obj_cgroup *old = NULL;
        unsigned long flags;
-       struct obj_stock *stock = get_obj_stock(&flags);
        unsigned int nr_pages = 0;
 
+       local_lock_irqsave(&memcg_stock.stock_lock, flags);
+
+       stock = this_cpu_ptr(&memcg_stock);
        if (stock->cached_objcg != objcg) { /* reset if necessary */
-               drain_obj_stock(stock);
+               old = drain_obj_stock(stock);
                obj_cgroup_get(objcg);
                stock->cached_objcg = objcg;
                stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
@@ -3239,7 +3296,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
                stock->nr_bytes &= (PAGE_SIZE - 1);
        }
 
-       put_obj_stock(flags);
+       local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+       if (old)
+               obj_cgroup_put(old);
 
        if (nr_pages)
                obj_cgroup_uncharge_pages(objcg, nr_pages);
@@ -3538,6 +3597,36 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
        return mem_cgroup_force_empty(memcg) ?: nbytes;
 }
 
+#ifdef CONFIG_MEMCG_SWAP
+static int mem_cgroup_force_reclaim(struct cgroup_subsys_state *css,
+                              struct cftype *cft, u64 val)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       unsigned long nr_to_reclaim = val;
+       unsigned long total = 0;
+       int loop;
+
+       for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
+               total += try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim,
+                                               GFP_KERNEL, true);
+
+               /*
+                * If nothing was reclaimed after two attempts, there
+                * may be no reclaimable pages in this hierarchy.
+                * If more than nr_to_reclaim pages were already reclaimed,
+                * finish force reclaim.
+                */
+               if (loop && (!total || total > nr_to_reclaim))
+                       break;
+       }
+
+       pr_info("%s: [Mem_reclaim] Loop: %d - Total_reclaimed: %lu - nr_to_reclaim: %lu\n",
+               __func__, loop, total, nr_to_reclaim);
+
+       return total;
+}
+#endif
+
 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
                                     struct cftype *cft)
 {
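Usage note (an assumption, not spelled out in this patch): with CONFIG_MEMCG_SWAP the handler above is wired up through memsw_files near the end of this patch and surfaces as a write-only control file in the cgroup v1 memory hierarchy, presumably memory.force_reclaim. Writing a page count to it triggers up to MEM_CGROUP_MAX_RECLAIM_LOOPS passes of try_to_free_mem_cgroup_pages() against that cgroup, stopping early once a pass reclaims nothing or more than the requested number of pages has been freed.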
@@ -3806,8 +3895,12 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
                }
                break;
        case RES_SOFT_LIMIT:
-               memcg->soft_limit = nr_pages;
-               ret = 0;
+               if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+                       ret = -EOPNOTSUPP;
+               } else {
+                       memcg->soft_limit = nr_pages;
+                       ret = 0;
+               }
                break;
        }
        return ret ?: nbytes;
@@ -4783,6 +4876,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
        char *endp;
        int ret;
 
+       if (IS_ENABLED(CONFIG_PREEMPT_RT))
+               return -EOPNOTSUPP;
+
        buf = strstrip(buf);
 
        efd = simple_strtoul(buf, &endp, 10);
@@ -6864,7 +6960,6 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
        unsigned long nr_pages;
        struct mem_cgroup *memcg;
        struct obj_cgroup *objcg;
-       bool use_objcg = PageMemcgKmem(page);
 
        VM_BUG_ON_PAGE(PageLRU(page), page);
 
@@ -6873,7 +6968,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
         * page memcg or objcg at this point, we have fully
         * exclusive access to the page.
         */
-       if (use_objcg) {
+       if (PageMemcgKmem(page)) {
                objcg = __page_objcg(page);
                /*
                 * This get matches the put at the end of the function and
@@ -6901,7 +6996,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 
        nr_pages = compound_nr(page);
 
-       if (use_objcg) {
+       if (PageMemcgKmem(page)) {
                ug->nr_memory += nr_pages;
                ug->nr_kmem += nr_pages;
 
@@ -7231,8 +7326,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         * important here to have the interrupts disabled because it is the
         * only synchronisation we have for updating the per-CPU variables.
         */
-       VM_BUG_ON(!irqs_disabled());
+       memcg_stats_lock();
        mem_cgroup_charge_statistics(memcg, page, -nr_entries);
+       memcg_stats_unlock();
        memcg_check_events(memcg, page);
 
        css_put(&memcg->css);
@@ -7482,6 +7578,10 @@ static struct cftype memsw_files[] = {
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
+       {
+               .name = "force_reclaim",
+               .write_u64 = mem_cgroup_force_reclaim,
+       },
        { },    /* terminate */
 };