[platform/kernel/linux-rpi.git] / mm / memcontrol.c
index 6da5020..e2ea2fb 100644
@@ -103,11 +103,6 @@ static bool do_memsw_account(void)
        return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
 }
 
-/* memcg and lruvec stats flushing */
-static void flush_memcg_stats_dwork(struct work_struct *w);
-static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
-static DEFINE_SPINLOCK(stats_flush_lock);
-
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 
@@ -239,7 +234,7 @@ enum res_type {
             iter != NULL;                              \
             iter = mem_cgroup_iter(NULL, iter, NULL))
 
-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
 {
        return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
                (current->flags & PF_EXITING);
@@ -259,7 +254,7 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
 }
 
 #ifdef CONFIG_MEMCG_KMEM
-extern spinlock_t css_set_lock;
+static DEFINE_SPINLOCK(objcg_lock);
 
 bool mem_cgroup_kmem_disabled(void)
 {
@@ -303,9 +298,9 @@ static void obj_cgroup_release(struct percpu_ref *ref)
        if (nr_pages)
                obj_cgroup_uncharge_pages(objcg, nr_pages);
 
-       spin_lock_irqsave(&css_set_lock, flags);
+       spin_lock_irqsave(&objcg_lock, flags);
        list_del(&objcg->list);
-       spin_unlock_irqrestore(&css_set_lock, flags);
+       spin_unlock_irqrestore(&objcg_lock, flags);
 
        percpu_ref_exit(ref);
        kfree_rcu(objcg, rcu);
@@ -337,7 +332,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
 
        objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
 
-       spin_lock_irq(&css_set_lock);
+       spin_lock_irq(&objcg_lock);
 
        /* 1) Ready to reparent active objcg. */
        list_add(&objcg->list, &memcg->objcg_list);
@@ -347,7 +342,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
        /* 3) Move already reparented objcgs to the parent's list */
        list_splice(&memcg->objcg_list, &parent->objcg_list);
 
-       spin_unlock_irq(&css_set_lock);
+       spin_unlock_irq(&objcg_lock);
 
        percpu_ref_kill(&objcg->refcnt);
 }
@@ -635,6 +630,103 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
        return mz;
 }
 
+/*
+ * memcg and lruvec stats flushing
+ *
+ * Many codepaths leading to stats updates or reads are performance sensitive,
+ * and adding stats flushing to such codepaths is not desirable. So, to optimize
+ * flushing, the kernel does the following:
+ *
+ * 1) Periodically and asynchronously flush the stats every 2 seconds so that
+ *    the rstat update tree does not grow unbounded.
+ *
+ * 2) Flush the stats synchronously on the reader side only when there are more
+ *    than (MEMCG_CHARGE_BATCH * nr_cpus) update events. This optimization can
+ *    let the stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus)
+ *    events, but only for 2 seconds due to (1).
+ */
+static void flush_memcg_stats_dwork(struct work_struct *w);
+static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+static DEFINE_SPINLOCK(stats_flush_lock);
+static DEFINE_PER_CPU(unsigned int, stats_updates);
+static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+static u64 flush_next_time;
+
+#define FLUSH_TIME (2UL*HZ)
+
+/*
+ * Accessors to ensure that preemption is disabled on PREEMPT_RT because the
+ * code cannot rely on preemption being disabled as part of an acquired
+ * spinlock_t lock. These functions are never used in hardirq context on
+ * PREEMPT_RT and therefore disabling preemption is sufficient.
+ */
+static void memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+       preempt_disable();
+#else
+       VM_BUG_ON(!irqs_disabled());
+#endif
+}
+
+static void __memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+       preempt_disable();
+#endif
+}
+
+static void memcg_stats_unlock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+       preempt_enable();
+#endif
+}
+
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+{
+       unsigned int x;
+
+       cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+
+       x = __this_cpu_add_return(stats_updates, abs(val));
+       if (x > MEMCG_CHARGE_BATCH) {
+               atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
+               __this_cpu_write(stats_updates, 0);
+       }
+}
+
+static void __mem_cgroup_flush_stats(void)
+{
+       unsigned long flag;
+
+       if (!spin_trylock_irqsave(&stats_flush_lock, flag))
+               return;
+
+       flush_next_time = jiffies_64 + 2*FLUSH_TIME;
+       cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+       atomic_set(&stats_flush_threshold, 0);
+       spin_unlock_irqrestore(&stats_flush_lock, flag);
+}
+
+void mem_cgroup_flush_stats(void)
+{
+       if (atomic_read(&stats_flush_threshold) > num_online_cpus())
+               __mem_cgroup_flush_stats();
+}
+
+void mem_cgroup_flush_stats_delayed(void)
+{
+       if (time_after64(jiffies_64, flush_next_time))
+               mem_cgroup_flush_stats();
+}
+
+static void flush_memcg_stats_dwork(struct work_struct *w)
+{
+       __mem_cgroup_flush_stats();
+       queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
+}
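
To make the batching scheme above concrete, here is a minimal userspace sketch of the same idea, assuming a thread-local counter as the analogue of the per-CPU stats_updates, a C11 atomic standing in for stats_flush_threshold, and BATCH playing the role of MEMCG_CHARGE_BATCH. The names are illustrative only and none of this is kernel API:

/*
 * Minimal sketch, assuming a userspace analogue: each "CPU" (thread)
 * accumulates the magnitude of its updates locally and only touches the
 * shared atomic once a batch worth of updates has been seen; readers
 * flush only once roughly (BATCH * nr_cpus) updates have accumulated.
 */
#include <stdatomic.h>
#include <stdlib.h>

#define BATCH 64                                 /* stands in for MEMCG_CHARGE_BATCH */

static _Thread_local unsigned int local_updates; /* per-CPU stats_updates analogue */
static atomic_uint flush_threshold;              /* stats_flush_threshold analogue */

static void stats_updated(int val)
{
        local_updates += abs(val);
        if (local_updates > BATCH) {
                atomic_fetch_add(&flush_threshold, local_updates / BATCH);
                local_updates = 0;
        }
}

static void flush_if_needed(unsigned int nr_cpus)
{
        /* Readers flush only once enough updates have accumulated. */
        if (atomic_load(&flush_threshold) > nr_cpus) {
                /* ... expensive flush of the aggregated stats ... */
                atomic_store(&flush_threshold, 0);
        }
}
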
+
 /**
  * __mod_memcg_state - update cgroup memory statistics
  * @memcg: the memory cgroup
@@ -647,7 +739,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
                return;
 
        __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
-       cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+       memcg_rstat_updated(memcg, val);
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -674,11 +766,35 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        memcg = pn->memcg;
 
+       /*
+        * Callers from rmap rely on disabled preemption because they never
+        * update their counters from in-interrupt context. For these counters
+        * we check that the update is never performed from an interrupt
+        * context, while the other callers need to have interrupts disabled.
+        */
+       __memcg_stats_lock();
+       if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
+               switch (idx) {
+               case NR_ANON_MAPPED:
+               case NR_FILE_MAPPED:
+               case NR_ANON_THPS:
+               case NR_SHMEM_PMDMAPPED:
+               case NR_FILE_PMDMAPPED:
+                       WARN_ON_ONCE(!in_task());
+                       break;
+               default:
+                       WARN_ON_ONCE(!irqs_disabled());
+               }
+       }
+
        /* Update memcg */
-       __mod_memcg_state(memcg, idx, val);
+       __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 
        /* Update lruvec */
        __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+
+       memcg_rstat_updated(memcg, val);
+       memcg_stats_unlock();
 }
 
 /**
@@ -779,8 +895,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
        if (mem_cgroup_disabled())
                return;
 
+       memcg_stats_lock();
        __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
-       cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+       memcg_rstat_updated(memcg, count);
+       memcg_stats_unlock();
 }
 
 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -844,6 +962,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
  */
 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 {
+       if (IS_ENABLED(CONFIG_PREEMPT_RT))
+               return;
+
        /* threshold event is triggered in finer grain than soft limit */
        if (unlikely(mem_cgroup_event_ratelimit(memcg,
                                                MEM_CGROUP_TARGET_THRESH))) {
@@ -1414,7 +1535,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
         *
         * Current memory state:
         */
-       cgroup_rstat_flush(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
 
        for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
                u64 size;
@@ -1575,7 +1696,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
         * A few threads which were not waiting at mutex_lock_killable() can
         * fail to bail out. Therefore, check again after holding oom_lock.
         */
-       ret = should_force_charge() || out_of_memory(&oc);
+       ret = task_is_dying() || out_of_memory(&oc);
 
 unlock:
        mutex_unlock(&oom_lock);
@@ -2037,39 +2158,37 @@ void unlock_page_memcg(struct page *page)
 }
 EXPORT_SYMBOL(unlock_page_memcg);
 
-struct obj_stock {
+struct memcg_stock_pcp {
+       local_lock_t stock_lock;
+       struct mem_cgroup *cached; /* this is never the root cgroup */
+       unsigned int nr_pages;
+
 #ifdef CONFIG_MEMCG_KMEM
        struct obj_cgroup *cached_objcg;
        struct pglist_data *cached_pgdat;
        unsigned int nr_bytes;
        int nr_slab_reclaimable_b;
        int nr_slab_unreclaimable_b;
-#else
-       int dummy[0];
 #endif
-};
-
-struct memcg_stock_pcp {
-       struct mem_cgroup *cached; /* this never be root cgroup */
-       unsigned int nr_pages;
-       struct obj_stock task_obj;
-       struct obj_stock irq_obj;
 
        struct work_struct work;
        unsigned long flags;
 #define FLUSHING_CACHED_CHARGE 0
 };
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
+       .stock_lock = INIT_LOCAL_LOCK(stock_lock),
+};
 static DEFINE_MUTEX(percpu_charge_mutex);
 
 #ifdef CONFIG_MEMCG_KMEM
-static void drain_obj_stock(struct obj_stock *stock);
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
                                     struct mem_cgroup *root_memcg);
 
 #else
-static inline void drain_obj_stock(struct obj_stock *stock)
+static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
 {
+       return NULL;
 }
 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
                                     struct mem_cgroup *root_memcg)
@@ -2079,41 +2198,6 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
 #endif
 
 /*
- * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
- * sequence used in this case to access content from object stock is slow.
- * To optimize for user context access, there are now two object stocks for
- * task context and interrupt context access respectively.
- *
- * The task context object stock can be accessed by disabling preemption only
- * which is cheap in non-preempt kernel. The interrupt context object stock
- * can only be accessed after disabling interrupt. User context code can
- * access interrupt object stock, but not vice versa.
- */
-static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
-{
-       struct memcg_stock_pcp *stock;
-
-       if (likely(in_task())) {
-               *pflags = 0UL;
-               preempt_disable();
-               stock = this_cpu_ptr(&memcg_stock);
-               return &stock->task_obj;
-       }
-
-       local_irq_save(*pflags);
-       stock = this_cpu_ptr(&memcg_stock);
-       return &stock->irq_obj;
-}
-
-static inline void put_obj_stock(unsigned long flags)
-{
-       if (likely(in_task()))
-               preempt_enable();
-       else
-               local_irq_restore(flags);
-}
-
-/**
  * consume_stock: Try to consume stocked charge on this cpu.
  * @memcg: memcg to consume from.
  * @nr_pages: how many pages to charge.
@@ -2133,7 +2217,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
        if (nr_pages > MEMCG_CHARGE_BATCH)
                return ret;
 
-       local_irq_save(flags);
+       local_lock_irqsave(&memcg_stock.stock_lock, flags);
 
        stock = this_cpu_ptr(&memcg_stock);
        if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
@@ -2141,7 +2225,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
                ret = true;
        }
 
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 
        return ret;
 }
@@ -2170,6 +2254,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
 static void drain_local_stock(struct work_struct *dummy)
 {
        struct memcg_stock_pcp *stock;
+       struct obj_cgroup *old = NULL;
        unsigned long flags;
 
        /*
@@ -2177,28 +2262,25 @@ static void drain_local_stock(struct work_struct *dummy)
         * drain_stock races is that we always operate on local CPU stock
         * here with IRQ disabled
         */
-       local_irq_save(flags);
+       local_lock_irqsave(&memcg_stock.stock_lock, flags);
 
        stock = this_cpu_ptr(&memcg_stock);
-       drain_obj_stock(&stock->irq_obj);
-       if (in_task())
-               drain_obj_stock(&stock->task_obj);
+       old = drain_obj_stock(stock);
        drain_stock(stock);
        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+       if (old)
+               obj_cgroup_put(old);
 }
 
 /*
  * Cache charges(val) to local per_cpu area.
  * This will be consumed by consume_stock() function, later.
  */
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
        struct memcg_stock_pcp *stock;
-       unsigned long flags;
-
-       local_irq_save(flags);
 
        stock = this_cpu_ptr(&memcg_stock);
        if (stock->cached != memcg) { /* reset if necessary */
@@ -2210,8 +2292,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 
        if (stock->nr_pages > MEMCG_CHARGE_BATCH)
                drain_stock(stock);
+}
 
-       local_irq_restore(flags);
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+       unsigned long flags;
+
+       local_lock_irqsave(&memcg_stock.stock_lock, flags);
+       __refill_stock(memcg, nr_pages);
+       local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 }
 
 /*
@@ -2231,7 +2320,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
         * as well as workers from this path always operate on the local
         * per-cpu data. CPU up doesn't touch memcg_stock at all.
         */
-       curcpu = get_cpu();
+       migrate_disable();
+       curcpu = smp_processor_id();
        for_each_online_cpu(cpu) {
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
                struct mem_cgroup *memcg;
@@ -2254,7 +2344,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
                                schedule_work_on(cpu, &stock->work);
                }
        }
-       put_cpu();
+       migrate_enable();
        mutex_unlock(&percpu_charge_mutex);
 }
 
@@ -2530,6 +2620,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
        struct page_counter *counter;
        enum oom_status oom_status;
        unsigned long nr_reclaimed;
+       bool passed_oom = false;
        bool may_swap = true;
        bool drained = false;
        unsigned long pflags;
@@ -2565,15 +2656,6 @@ retry:
                goto force;
 
        /*
-        * Unlike in global OOM situations, memcg is not in a physical
-        * memory shortage.  Allow dying and OOM-killed tasks to
-        * bypass the last charges so that they can exit quickly and
-        * free their memory.
-        */
-       if (unlikely(should_force_charge()))
-               goto force;
-
-       /*
         * Prevent unbounded recursion when reclaim operations need to
         * allocate memory. This might exceed the limits temporarily,
         * but we prefer facilitating memory reclaim and getting back
@@ -2630,8 +2712,9 @@ retry:
        if (gfp_mask & __GFP_RETRY_MAYFAIL)
                goto nomem;
 
-       if (fatal_signal_pending(current))
-               goto force;
+       /* Avoid endless loop for tasks bypassed by the oom killer */
+       if (passed_oom && task_is_dying())
+               goto nomem;
 
        /*
         * keep retrying as long as the memcg oom killer is able to make
@@ -2640,14 +2723,10 @@ retry:
         */
        oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
                       get_order(nr_pages * PAGE_SIZE));
-       switch (oom_status) {
-       case OOM_SUCCESS:
+       if (oom_status == OOM_SUCCESS) {
+               passed_oom = true;
                nr_retries = MAX_RECLAIM_RETRIES;
                goto retry;
-       case OOM_FAILED:
-               goto force;
-       default:
-               goto nomem;
        }
 nomem:
        if (!(gfp_mask & __GFP_NOFAIL))
@@ -3030,17 +3109,21 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
 void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
                     enum node_stat_item idx, int nr)
 {
+       struct memcg_stock_pcp *stock;
+       struct obj_cgroup *old = NULL;
        unsigned long flags;
-       struct obj_stock *stock = get_obj_stock(&flags);
        int *bytes;
 
+       local_lock_irqsave(&memcg_stock.stock_lock, flags);
+       stock = this_cpu_ptr(&memcg_stock);
+
        /*
         * Save vmstat data in stock and skip vmstat array update unless
         * accumulating over a page of vmstat data or when pgdat or idx
         * changes.
         */
        if (stock->cached_objcg != objcg) {
-               drain_obj_stock(stock);
+               old = drain_obj_stock(stock);
                obj_cgroup_get(objcg);
                stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
                                ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
@@ -3084,38 +3167,53 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
        if (nr)
                mod_objcg_mlstate(objcg, pgdat, idx, nr);
 
-       put_obj_stock(flags);
+       local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+       if (old)
+               obj_cgroup_put(old);
 }
 
 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
 {
+       struct memcg_stock_pcp *stock;
        unsigned long flags;
-       struct obj_stock *stock = get_obj_stock(&flags);
        bool ret = false;
 
+       local_lock_irqsave(&memcg_stock.stock_lock, flags);
+
+       stock = this_cpu_ptr(&memcg_stock);
        if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
                stock->nr_bytes -= nr_bytes;
                ret = true;
        }
 
-       put_obj_stock(flags);
+       local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 
        return ret;
 }
 
-static void drain_obj_stock(struct obj_stock *stock)
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
 {
        struct obj_cgroup *old = stock->cached_objcg;
 
        if (!old)
-               return;
+               return NULL;
 
        if (stock->nr_bytes) {
                unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
                unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
 
-               if (nr_pages)
-                       obj_cgroup_uncharge_pages(old, nr_pages);
+               if (nr_pages) {
+                       struct mem_cgroup *memcg;
+
+                       memcg = get_mem_cgroup_from_objcg(old);
+
+                       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+                               page_counter_uncharge(&memcg->kmem, nr_pages);
+
+                       __refill_stock(memcg, nr_pages);
+
+                       css_put(&memcg->css);
+               }
 
                /*
                 * The leftover is flushed to the centralized per-memcg value.
@@ -3150,8 +3248,12 @@ static void drain_obj_stock(struct obj_stock *stock)
                stock->cached_pgdat = NULL;
        }
 
-       obj_cgroup_put(old);
        stock->cached_objcg = NULL;
+       /*
+        * The `old' objcg needs to be released by the caller via
+        * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
+        */
+       return old;
 }
 
 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
@@ -3159,13 +3261,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
 {
        struct mem_cgroup *memcg;
 
-       if (in_task() && stock->task_obj.cached_objcg) {
-               memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
-               if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
-                       return true;
-       }
-       if (stock->irq_obj.cached_objcg) {
-               memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
+       if (stock->cached_objcg) {
+               memcg = obj_cgroup_memcg(stock->cached_objcg);
                if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
                        return true;
        }
@@ -3176,12 +3273,16 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
                             bool allow_uncharge)
 {
+       struct memcg_stock_pcp *stock;
+       struct obj_cgroup *old = NULL;
        unsigned long flags;
-       struct obj_stock *stock = get_obj_stock(&flags);
        unsigned int nr_pages = 0;
 
+       local_lock_irqsave(&memcg_stock.stock_lock, flags);
+
+       stock = this_cpu_ptr(&memcg_stock);
        if (stock->cached_objcg != objcg) { /* reset if necessary */
-               drain_obj_stock(stock);
+               old = drain_obj_stock(stock);
                obj_cgroup_get(objcg);
                stock->cached_objcg = objcg;
                stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
@@ -3195,7 +3296,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
                stock->nr_bytes &= (PAGE_SIZE - 1);
        }
 
-       put_obj_stock(flags);
+       local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+       if (old)
+               obj_cgroup_put(old);
 
        if (nr_pages)
                obj_cgroup_uncharge_pages(objcg, nr_pages);
@@ -3494,6 +3597,36 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
        return mem_cgroup_force_empty(memcg) ?: nbytes;
 }
 
+#ifdef CONFIG_MEMCG_SWAP
+static int mem_cgroup_force_reclaim(struct cgroup_subsys_state *css,
+                              struct cftype *cft, u64 val)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       unsigned long nr_to_reclaim = val;
+       unsigned long total = 0;
+       int loop;
+
+       for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
+               total += try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim,
+                                               GFP_KERNEL, true);
+
+               /*
+                * If nothing was reclaimed after two attempts, there
+                * may be no reclaimable pages in this hierarchy.
+                * If more than nr_to_reclaim pages were already reclaimed,
+                * finish force reclaim.
+                */
+               if (loop && (!total || total > nr_to_reclaim))
+                       break;
+       }
+
+       pr_info("%s: [Mem_reclaim] Loop: %d - Total_reclaimed: %lu - nr_to_reclaim: %lu\n",
+               __func__, loop, total, nr_to_reclaim);
+
+       return total;
+}
+#endif
+
 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
                                     struct cftype *cft)
 {
@@ -3518,8 +3651,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
        unsigned long val;
 
        if (mem_cgroup_is_root(memcg)) {
-               /* mem_cgroup_threshold() calls here from irqsafe context */
-               cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
+               mem_cgroup_flush_stats();
                val = memcg_page_state(memcg, NR_FILE_PAGES) +
                        memcg_page_state(memcg, NR_ANON_MAPPED);
                if (swap)
@@ -3763,8 +3895,12 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
                }
                break;
        case RES_SOFT_LIMIT:
-               memcg->soft_limit = nr_pages;
-               ret = 0;
+               if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+                       ret = -EOPNOTSUPP;
+               } else {
+                       memcg->soft_limit = nr_pages;
+                       ret = 0;
+               }
                break;
        }
        return ret ?: nbytes;
@@ -3900,7 +4036,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
        int nid;
        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
-       cgroup_rstat_flush(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
 
        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
                seq_printf(m, "%s=%lu", stat->name,
@@ -3972,7 +4108,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 
        BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
 
-       cgroup_rstat_flush(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
 
        for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
                unsigned long nr;
@@ -4475,7 +4611,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
        struct mem_cgroup *parent;
 
-       cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
 
        *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
        *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -4736,10 +4872,14 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
        unsigned int efd, cfd;
        struct fd efile;
        struct fd cfile;
+       struct dentry *cdentry;
        const char *name;
        char *endp;
        int ret;
 
+       if (IS_ENABLED(CONFIG_PREEMPT_RT))
+               return -EOPNOTSUPP;
+
        buf = strstrip(buf);
 
        efd = simple_strtoul(buf, &endp, 10);
@@ -4787,6 +4927,16 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
                goto out_put_cfile;
 
        /*
+        * The control file must be a regular cgroup1 file. As a regular cgroup
+        * file can't be renamed, it's safe to access its name afterwards.
+        */
+       cdentry = cfile.file->f_path.dentry;
+       if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
+               ret = -EINVAL;
+               goto out_put_cfile;
+       }
+
+       /*
         * Determine the event callbacks and set them in @event.  This used
         * to be done via struct cftype but cgroup core no longer knows
         * about these events.  The following is crude but the whole thing
@@ -4794,7 +4944,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
         *
         * DO NOT ADD NEW FILES.
         */
-       name = cfile.file->f_path.dentry->d_name.name;
+       name = cdentry->d_name.name;
 
        if (!strcmp(name, "memory.usage_in_bytes")) {
                event->register_event = mem_cgroup_usage_register_event;
@@ -4818,7 +4968,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
         * automatically removed on cgroup destruction but the removal is
         * asynchronous, so take an extra ref on @css.
         */
-       cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
+       cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
                                               &memory_cgrp_subsys);
        ret = -EINVAL;
        if (IS_ERR(cfile_css))
@@ -5341,21 +5491,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
        memcg_wb_domain_size_changed(memcg);
 }
 
-void mem_cgroup_flush_stats(void)
-{
-       if (!spin_trylock(&stats_flush_lock))
-               return;
-
-       cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
-       spin_unlock(&stats_flush_lock);
-}
-
-static void flush_memcg_stats_dwork(struct work_struct *w)
-{
-       mem_cgroup_flush_stats();
-       queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
-}
-
 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -6373,7 +6508,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
        int i;
        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
-       cgroup_rstat_flush(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
 
        for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
                int nid;
@@ -6836,7 +6971,6 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
        unsigned long nr_pages;
        struct mem_cgroup *memcg;
        struct obj_cgroup *objcg;
-       bool use_objcg = PageMemcgKmem(page);
 
        VM_BUG_ON_PAGE(PageLRU(page), page);
 
@@ -6845,7 +6979,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
         * page memcg or objcg at this point, we have fully
         * exclusive access to the page.
         */
-       if (use_objcg) {
+       if (PageMemcgKmem(page)) {
                objcg = __page_objcg(page);
                /*
                 * This get matches the put at the end of the function and
@@ -6873,7 +7007,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 
        nr_pages = compound_nr(page);
 
-       if (use_objcg) {
+       if (PageMemcgKmem(page)) {
                ug->nr_memory += nr_pages;
                ug->nr_kmem += nr_pages;
 
@@ -7077,7 +7211,7 @@ static int __init cgroup_memory(char *s)
                if (!strcmp(token, "nokmem"))
                        cgroup_memory_nokmem = true;
        }
-       return 0;
+       return 1;
 }
 __setup("cgroup.memory=", cgroup_memory);
 
@@ -7203,8 +7337,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         * important here to have the interrupts disabled because it is the
         * only synchronisation we have for updating the per-CPU variables.
         */
-       VM_BUG_ON(!irqs_disabled());
+       memcg_stats_lock();
        mem_cgroup_charge_statistics(memcg, page, -nr_entries);
+       memcg_stats_unlock();
        memcg_check_events(memcg, page);
 
        css_put(&memcg->css);
@@ -7454,6 +7589,10 @@ static struct cftype memsw_files[] = {
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
+       {
+               .name = "force_reclaim",
+               .write_u64 = mem_cgroup_force_reclaim,
+       },
        { },    /* terminate */
 };
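
For completeness, a hypothetical userspace helper for the force_reclaim knob registered above; the "memory.force_reclaim" file name and the cgroup v1 mount point in the usage comment are assumptions based on where the cftype is added, not something this diff states:

/*
 * Hypothetical illustration only: write a page count to the new
 * force_reclaim knob of a cgroup v1 memory controller. The file is
 * assumed to appear as "memory.force_reclaim" since the cftype entry
 * carries no prefix of its own.
 */
#include <stdio.h>

int force_reclaim(const char *cgroup_dir, unsigned long nr_pages)
{
        char path[512];
        FILE *f;

        if (snprintf(path, sizeof(path), "%s/memory.force_reclaim",
                     cgroup_dir) >= (int)sizeof(path))
                return -1;

        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%lu\n", nr_pages);
        return fclose(f);
}

/* e.g.: force_reclaim("/sys/fs/cgroup/memory/myjob", 1024); */
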