X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=mm%2Fmemcontrol.c;h=e2ea2fb2002d56da2cf29c11e28dd2c8e7715664;hb=47431c295f19913937fdd9b8cfd521e29bf6ca9c;hp=6da5020a8656da81761fbd3ca48efd8317dd1304;hpb=9c0c4d24ac000e52d55348961d3a3ba42065e0cf;p=platform%2Fkernel%2Flinux-rpi.git diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6da5020..e2ea2fb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -103,11 +103,6 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } -/* memcg and lruvec stats flushing */ -static void flush_memcg_stats_dwork(struct work_struct *w); -static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); -static DEFINE_SPINLOCK(stats_flush_lock); - #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 @@ -239,7 +234,7 @@ enum res_type { iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -static inline bool should_force_charge(void) +static inline bool task_is_dying(void) { return tsk_is_oom_victim(current) || fatal_signal_pending(current) || (current->flags & PF_EXITING); @@ -259,7 +254,7 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) } #ifdef CONFIG_MEMCG_KMEM -extern spinlock_t css_set_lock; +static DEFINE_SPINLOCK(objcg_lock); bool mem_cgroup_kmem_disabled(void) { @@ -303,9 +298,9 @@ static void obj_cgroup_release(struct percpu_ref *ref) if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); - spin_lock_irqsave(&css_set_lock, flags); + spin_lock_irqsave(&objcg_lock, flags); list_del(&objcg->list); - spin_unlock_irqrestore(&css_set_lock, flags); + spin_unlock_irqrestore(&objcg_lock, flags); percpu_ref_exit(ref); kfree_rcu(objcg, rcu); @@ -337,7 +332,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, objcg = rcu_replace_pointer(memcg->objcg, NULL, true); - spin_lock_irq(&css_set_lock); + spin_lock_irq(&objcg_lock); /* 1) Ready to reparent active objcg. */ list_add(&objcg->list, &memcg->objcg_list); @@ -347,7 +342,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, /* 3) Move already reparented objcgs to the parent's list */ list_splice(&memcg->objcg_list, &parent->objcg_list); - spin_unlock_irq(&css_set_lock); + spin_unlock_irq(&objcg_lock); percpu_ref_kill(&objcg->refcnt); } @@ -635,6 +630,103 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } +/* + * memcg and lruvec stats flushing + * + * Many codepaths leading to stats update or read are performance sensitive and + * adding stats flushing in such codepaths is not desirable. So, to optimize the + * flushing the kernel does: + * + * 1) Periodically and asynchronously flush the stats every 2 seconds to not let + * rstat update tree grow unbounded. + * + * 2) Flush the stats synchronously on reader side only when there are more than + * (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization + * will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but + * only for 2 seconds due to (1). + */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static DEFINE_SPINLOCK(stats_flush_lock); +static DEFINE_PER_CPU(unsigned int, stats_updates); +static atomic_t stats_flush_threshold = ATOMIC_INIT(0); +static u64 flush_next_time; + +#define FLUSH_TIME (2UL*HZ) + +/* + * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can + * not rely on this as part of an acquired spinlock_t lock. 
These functions are + * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion + * is sufficient. + */ +static void memcg_stats_lock(void) +{ +#ifdef CONFIG_PREEMPT_RT + preempt_disable(); +#else + VM_BUG_ON(!irqs_disabled()); +#endif +} + +static void __memcg_stats_lock(void) +{ +#ifdef CONFIG_PREEMPT_RT + preempt_disable(); +#endif +} + +static void memcg_stats_unlock(void) +{ +#ifdef CONFIG_PREEMPT_RT + preempt_enable(); +#endif +} + +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) +{ + unsigned int x; + + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + + x = __this_cpu_add_return(stats_updates, abs(val)); + if (x > MEMCG_CHARGE_BATCH) { + atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); + __this_cpu_write(stats_updates, 0); + } +} + +static void __mem_cgroup_flush_stats(void) +{ + unsigned long flag; + + if (!spin_trylock_irqsave(&stats_flush_lock, flag)) + return; + + flush_next_time = jiffies_64 + 2*FLUSH_TIME; + cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); + atomic_set(&stats_flush_threshold, 0); + spin_unlock_irqrestore(&stats_flush_lock, flag); +} + +void mem_cgroup_flush_stats(void) +{ + if (atomic_read(&stats_flush_threshold) > num_online_cpus()) + __mem_cgroup_flush_stats(); +} + +void mem_cgroup_flush_stats_delayed(void) +{ + if (time_after64(jiffies_64, flush_next_time)) + mem_cgroup_flush_stats(); +} + +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + __mem_cgroup_flush_stats(); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -647,7 +739,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) return; __this_cpu_add(memcg->vmstats_percpu->state[idx], val); - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + memcg_rstat_updated(memcg, val); } /* idx can be of type enum memcg_stat_item or node_stat_item. */ @@ -674,11 +766,35 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; + /* + * The caller from rmap relay on disabled preemption becase they never + * update their counter from in-interrupt context. For these two + * counters we check that the update is never performed from an + * interrupt context while other caller need to have disabled interrupt. 
+ */ + __memcg_stats_lock(); + if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) { + switch (idx) { + case NR_ANON_MAPPED: + case NR_FILE_MAPPED: + case NR_ANON_THPS: + case NR_SHMEM_PMDMAPPED: + case NR_FILE_PMDMAPPED: + WARN_ON_ONCE(!in_task()); + break; + default: + WARN_ON_ONCE(!irqs_disabled()); + } + } + /* Update memcg */ - __mod_memcg_state(memcg, idx, val); + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + + memcg_rstat_updated(memcg, val); + memcg_stats_unlock(); } /** @@ -779,8 +895,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (mem_cgroup_disabled()) return; + memcg_stats_lock(); __this_cpu_add(memcg->vmstats_percpu->events[idx], count); - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + memcg_rstat_updated(memcg, count); + memcg_stats_unlock(); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@ -844,6 +962,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, */ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return; + /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { @@ -1414,7 +1535,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg) * * Current memory state: */ - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; @@ -1575,7 +1696,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * A few threads which were not waiting at mutex_lock_killable() can * fail to bail out. Therefore, check again after holding oom_lock. 
*/ - ret = should_force_charge() || out_of_memory(&oc); + ret = task_is_dying() || out_of_memory(&oc); unlock: mutex_unlock(&oom_lock); @@ -2037,39 +2158,37 @@ void unlock_page_memcg(struct page *page) } EXPORT_SYMBOL(unlock_page_memcg); -struct obj_stock { +struct memcg_stock_pcp { + local_lock_t stock_lock; + struct mem_cgroup *cached; /* this never be root cgroup */ + unsigned int nr_pages; + #ifdef CONFIG_MEMCG_KMEM struct obj_cgroup *cached_objcg; struct pglist_data *cached_pgdat; unsigned int nr_bytes; int nr_slab_reclaimable_b; int nr_slab_unreclaimable_b; -#else - int dummy[0]; #endif -}; - -struct memcg_stock_pcp { - struct mem_cgroup *cached; /* this never be root cgroup */ - unsigned int nr_pages; - struct obj_stock task_obj; - struct obj_stock irq_obj; struct work_struct work; unsigned long flags; #define FLUSHING_CACHED_CHARGE 0 }; -static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); +static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { + .stock_lock = INIT_LOCAL_LOCK(stock_lock), +}; static DEFINE_MUTEX(percpu_charge_mutex); #ifdef CONFIG_MEMCG_KMEM -static void drain_obj_stock(struct obj_stock *stock); +static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); #else -static inline void drain_obj_stock(struct obj_stock *stock) +static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) { + return NULL; } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg) @@ -2079,41 +2198,6 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, #endif /* - * Most kmem_cache_alloc() calls are from user context. The irq disable/enable - * sequence used in this case to access content from object stock is slow. - * To optimize for user context access, there are now two object stocks for - * task context and interrupt context access respectively. - * - * The task context object stock can be accessed by disabling preemption only - * which is cheap in non-preempt kernel. The interrupt context object stock - * can only be accessed after disabling interrupt. User context code can - * access interrupt object stock, but not vice versa. - */ -static inline struct obj_stock *get_obj_stock(unsigned long *pflags) -{ - struct memcg_stock_pcp *stock; - - if (likely(in_task())) { - *pflags = 0UL; - preempt_disable(); - stock = this_cpu_ptr(&memcg_stock); - return &stock->task_obj; - } - - local_irq_save(*pflags); - stock = this_cpu_ptr(&memcg_stock); - return &stock->irq_obj; -} - -static inline void put_obj_stock(unsigned long flags) -{ - if (likely(in_task())) - preempt_enable(); - else - local_irq_restore(flags); -} - -/** * consume_stock: Try to consume stocked charge on this cpu. * @memcg: memcg to consume from. * @nr_pages: how many pages to charge. 
@@ -2133,7 +2217,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (nr_pages > MEMCG_CHARGE_BATCH) return ret; - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (memcg == stock->cached && stock->nr_pages >= nr_pages) { @@ -2141,7 +2225,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } @@ -2170,6 +2254,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) static void drain_local_stock(struct work_struct *dummy) { struct memcg_stock_pcp *stock; + struct obj_cgroup *old = NULL; unsigned long flags; /* @@ -2177,28 +2262,25 @@ static void drain_local_stock(struct work_struct *dummy) * drain_stock races is that we always operate on local CPU stock * here with IRQ disabled */ - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - drain_obj_stock(&stock->irq_obj); - if (in_task()) - drain_obj_stock(&stock->task_obj); + old = drain_obj_stock(stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + if (old) + obj_cgroup_put(old); } /* * Cache charges(val) to local per_cpu area. * This will be consumed by consume_stock() function, later. */ -static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; - unsigned long flags; - - local_irq_save(flags); stock = this_cpu_ptr(&memcg_stock); if (stock->cached != memcg) { /* reset if necessary */ @@ -2210,8 +2292,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (stock->nr_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); +} - local_irq_restore(flags); +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + unsigned long flags; + + local_lock_irqsave(&memcg_stock.stock_lock, flags); + __refill_stock(memcg, nr_pages); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); } /* @@ -2231,7 +2320,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) * as well as workers from this path always operate on the local * per-cpu data. CPU up doesn't touch memcg_stock at all. */ - curcpu = get_cpu(); + migrate_disable(); + curcpu = smp_processor_id(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *memcg; @@ -2254,7 +2344,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) schedule_work_on(cpu, &stock->work); } } - put_cpu(); + migrate_enable(); mutex_unlock(&percpu_charge_mutex); } @@ -2530,6 +2620,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, struct page_counter *counter; enum oom_status oom_status; unsigned long nr_reclaimed; + bool passed_oom = false; bool may_swap = true; bool drained = false; unsigned long pflags; @@ -2565,15 +2656,6 @@ retry: goto force; /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(should_force_charge())) - goto force; - - /* * Prevent unbounded recursion when reclaim operations need to * allocate memory. 
This might exceed the limits temporarily, * but we prefer facilitating memory reclaim and getting back @@ -2630,8 +2712,9 @@ retry: if (gfp_mask & __GFP_RETRY_MAYFAIL) goto nomem; - if (fatal_signal_pending(current)) - goto force; + /* Avoid endless loop for tasks bypassed by the oom killer */ + if (passed_oom && task_is_dying()) + goto nomem; /* * keep retrying as long as the memcg oom killer is able to make @@ -2640,14 +2723,10 @@ retry: */ oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); - switch (oom_status) { - case OOM_SUCCESS: + if (oom_status == OOM_SUCCESS) { + passed_oom = true; nr_retries = MAX_RECLAIM_RETRIES; goto retry; - case OOM_FAILED: - goto force; - default: - goto nomem; } nomem: if (!(gfp_mask & __GFP_NOFAIL)) @@ -3030,17 +3109,21 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { + struct memcg_stock_pcp *stock; + struct obj_cgroup *old = NULL; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); int *bytes; + local_lock_irqsave(&memcg_stock.stock_lock, flags); + stock = this_cpu_ptr(&memcg_stock); + /* * Save vmstat data in stock and skip vmstat array update unless * accumulating over a page of vmstat data or when pgdat or idx * changes. */ if (stock->cached_objcg != objcg) { - drain_obj_stock(stock); + old = drain_obj_stock(stock); obj_cgroup_get(objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; @@ -3084,38 +3167,53 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, if (nr) mod_objcg_mlstate(objcg, pgdat, idx, nr); - put_obj_stock(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + if (old) + obj_cgroup_put(old); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) { + struct memcg_stock_pcp *stock; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); bool ret = false; + local_lock_irqsave(&memcg_stock.stock_lock, flags); + + stock = this_cpu_ptr(&memcg_stock); if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; } - put_obj_stock(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } -static void drain_obj_stock(struct obj_stock *stock) +static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) { struct obj_cgroup *old = stock->cached_objcg; if (!old) - return; + return NULL; if (stock->nr_bytes) { unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); - if (nr_pages) - obj_cgroup_uncharge_pages(old, nr_pages); + if (nr_pages) { + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(old); + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); + + __refill_stock(memcg, nr_pages); + + css_put(&memcg->css); + } /* * The leftover is flushed to the centralized per-memcg value. @@ -3150,8 +3248,12 @@ static void drain_obj_stock(struct obj_stock *stock) stock->cached_pgdat = NULL; } - obj_cgroup_put(old); stock->cached_objcg = NULL; + /* + * The `old' objects needs to be released by the caller via + * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. 
+ */ + return old; } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, @@ -3159,13 +3261,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, { struct mem_cgroup *memcg; - if (in_task() && stock->task_obj.cached_objcg) { - memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg); - if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) - return true; - } - if (stock->irq_obj.cached_objcg) { - memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg); + if (stock->cached_objcg) { + memcg = obj_cgroup_memcg(stock->cached_objcg); if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) return true; } @@ -3176,12 +3273,16 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, bool allow_uncharge) { + struct memcg_stock_pcp *stock; + struct obj_cgroup *old = NULL; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); unsigned int nr_pages = 0; + local_lock_irqsave(&memcg_stock.stock_lock, flags); + + stock = this_cpu_ptr(&memcg_stock); if (stock->cached_objcg != objcg) { /* reset if necessary */ - drain_obj_stock(stock); + old = drain_obj_stock(stock); obj_cgroup_get(objcg); stock->cached_objcg = objcg; stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) @@ -3195,7 +3296,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock->nr_bytes &= (PAGE_SIZE - 1); } - put_obj_stock(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + if (old) + obj_cgroup_put(old); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); @@ -3494,6 +3597,36 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, return mem_cgroup_force_empty(memcg) ?: nbytes; } +#ifdef CONFIG_MEMCG_SWAP +static int mem_cgroup_force_reclaim(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + unsigned long nr_to_reclaim = val; + unsigned long total = 0; + int loop; + + for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { + total += try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim, + GFP_KERNEL, true); + + /* + * If nothing was reclaimed after two attempts, there + * may be no reclaimable pages in this hierarchy. + * If more than nr_to_reclaim pages were already reclaimed, + * finish force reclaim. 
+ */ + if (loop && (!total || total > nr_to_reclaim)) + break; + } + + pr_info("%s: [Mem_reclaim] Loop: %d - Total_reclaimed: %lu - nr_to_reclaim: %lu\n", + __func__, loop, total, nr_to_reclaim); + + return total; +} +#endif + static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -3518,8 +3651,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { - /* mem_cgroup_threshold() calls here from irqsafe context */ - cgroup_rstat_flush_irqsafe(memcg->css.cgroup); + mem_cgroup_flush_stats(); val = memcg_page_state(memcg, NR_FILE_PAGES) + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) @@ -3763,8 +3895,12 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } break; case RES_SOFT_LIMIT: - memcg->soft_limit = nr_pages; - ret = 0; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + ret = -EOPNOTSUPP; + } else { + memcg->soft_limit = nr_pages; + ret = 0; + } break; } return ret ?: nbytes; @@ -3900,7 +4036,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) int nid; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, @@ -3972,7 +4108,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4475,7 +4611,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - cgroup_rstat_flush_irqsafe(memcg->css.cgroup); + mem_cgroup_flush_stats(); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -4736,10 +4872,14 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, unsigned int efd, cfd; struct fd efile; struct fd cfile; + struct dentry *cdentry; const char *name; char *endp; int ret; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return -EOPNOTSUPP; + buf = strstrip(buf); efd = simple_strtoul(buf, &endp, 10); @@ -4787,6 +4927,16 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, goto out_put_cfile; /* + * The control file must be a regular cgroup1 file. As a regular cgroup + * file can't be renamed, it's safe to access its name afterwards. + */ + cdentry = cfile.file->f_path.dentry; + if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { + ret = -EINVAL; + goto out_put_cfile; + } + + /* * Determine the event callbacks and set them in @event. This used * to be done via struct cftype but cgroup core no longer knows * about these events. The following is crude but the whole thing @@ -4794,7 +4944,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * * DO NOT ADD NEW FILES. */ - name = cfile.file->f_path.dentry->d_name.name; + name = cdentry->d_name.name; if (!strcmp(name, "memory.usage_in_bytes")) { event->register_event = mem_cgroup_usage_register_event; @@ -4818,7 +4968,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. 
*/ - cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, + cfile_css = css_tryget_online_from_dir(cdentry->d_parent, &memory_cgrp_subsys); ret = -EINVAL; if (IS_ERR(cfile_css)) @@ -5341,21 +5491,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } -void mem_cgroup_flush_stats(void) -{ - if (!spin_trylock(&stats_flush_lock)) - return; - - cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); - spin_unlock(&stats_flush_lock); -} - -static void flush_memcg_stats_dwork(struct work_struct *w) -{ - mem_cgroup_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); -} - static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); @@ -6373,7 +6508,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; @@ -6836,7 +6971,6 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) unsigned long nr_pages; struct mem_cgroup *memcg; struct obj_cgroup *objcg; - bool use_objcg = PageMemcgKmem(page); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -6845,7 +6979,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) * page memcg or objcg at this point, we have fully * exclusive access to the page. */ - if (use_objcg) { + if (PageMemcgKmem(page)) { objcg = __page_objcg(page); /* * This get matches the put at the end of the function and @@ -6873,7 +7007,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) nr_pages = compound_nr(page); - if (use_objcg) { + if (PageMemcgKmem(page)) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; @@ -7077,7 +7211,7 @@ static int __init cgroup_memory(char *s) if (!strcmp(token, "nokmem")) cgroup_memory_nokmem = true; } - return 0; + return 1; } __setup("cgroup.memory=", cgroup_memory); @@ -7203,8 +7337,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * important here to have the interrupts disabled because it is the * only synchronisation we have for updating the per-CPU variables. */ - VM_BUG_ON(!irqs_disabled()); + memcg_stats_lock(); mem_cgroup_charge_statistics(memcg, page, -nr_entries); + memcg_stats_unlock(); memcg_check_events(memcg, page); css_put(&memcg->css); @@ -7454,6 +7589,10 @@ static struct cftype memsw_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, + { + .name = "force_reclaim", + .write_u64 = mem_cgroup_force_reclaim, + }, { }, /* terminate */ };
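
The "memcg and lruvec stats flushing" comment introduced by this patch boils down to a simple contract: writers only count how far the stats have drifted, a deferrable worker flushes every FLUSH_TIME (2 seconds), and readers force a synchronous flush only once roughly MEMCG_CHARGE_BATCH * nr_cpus update events have accumulated. The sketch below is a minimal userspace analogue of that read-side batching, not the kernel code: plain C11 atomics stand in for the per-CPU counters and the rstat tree, the periodic worker is omitted, and UPDATE_BATCH, NCPUS, record_update and maybe_flush are invented names used only for illustration.

/*
 * Illustrative userspace analogue of the flush batching described by the
 * new "memcg and lruvec stats flushing" comment: updates are only counted,
 * and a flush is forced on the read side once enough of them accumulate.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define UPDATE_BATCH	64	/* plays the role of MEMCG_CHARGE_BATCH */
#define NCPUS		4	/* plays the role of num_online_cpus() */

static atomic_uint pending_updates;	/* cf. the per-CPU stats_updates */
static atomic_uint flush_threshold;	/* cf. stats_flush_threshold */
static unsigned long flush_count;

/* Writer side: cheap bookkeeping only, never a flush. */
static void record_update(int delta)
{
	unsigned int x;

	x = atomic_fetch_add(&pending_updates, abs(delta)) + abs(delta);
	if (x > UPDATE_BATCH) {
		atomic_fetch_add(&flush_threshold, x / UPDATE_BATCH);
		atomic_store(&pending_updates, 0);
	}
}

/* Reader side: flush only once the accumulated error is worth the cost. */
static void maybe_flush(void)
{
	if (atomic_load(&flush_threshold) <= NCPUS)
		return;		/* tolerate a bounded amount of staleness */

	/* A real implementation would aggregate the per-CPU deltas here. */
	atomic_store(&flush_threshold, 0);
	flush_count++;
}

int main(void)
{
	for (int i = 0; i < 100000; i++) {
		record_update(1);
		if (i % 1000 == 0)
			maybe_flush();	/* e.g. before reporting statistics */
	}
	maybe_flush();
	printf("forced %lu flushes for 100000 updates\n", flush_count);
	return 0;
}

The trade-off is the one the kernel comment states: readers may see counters that are stale by up to about MEMCG_CHARGE_BATCH * nr_cpus events, and in the patch the deferrable stats_flush_dwork bounds how long that staleness can persist to roughly one FLUSH_TIME interval.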
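
A second pattern that recurs throughout the objcg stock rework is visible in drain_obj_stock(): because memcg_stock_pcp::stock_lock is a local_lock_t, the stale objcg reference is no longer dropped while the lock is held; instead it is returned to the caller, which calls obj_cgroup_put() only after local_unlock_irqrestore(). The toy program below sketches that "hand the stale reference back and release it outside the lock" idea under stated assumptions: a pthread mutex and a small refcounted struct replace the local lock and struct obj_cgroup, and obj_new, obj_get, obj_put and stock_replace are invented names, not kernel symbols.

/*
 * Minimal userspace sketch of the "return the stale reference and drop it
 * after unlocking" pattern that drain_obj_stock() switches to in this patch.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct obj {
	atomic_int refcnt;
	char name[8];
};

static struct {
	pthread_mutex_t lock;
	struct obj *cached;		/* holds one reference while cached */
} stock = { .lock = PTHREAD_MUTEX_INITIALIZER };

static struct obj *obj_new(const char *name)
{
	struct obj *o = calloc(1, sizeof(*o));

	atomic_init(&o->refcnt, 1);	/* caller's reference */
	strncpy(o->name, name, sizeof(o->name) - 1);
	return o;
}

static struct obj *obj_get(struct obj *o)
{
	atomic_fetch_add(&o->refcnt, 1);
	return o;
}

static void obj_put(struct obj *o)
{
	/* May free the object, so it must not be called under stock.lock. */
	if (atomic_fetch_sub(&o->refcnt, 1) == 1) {
		printf("freeing %s\n", o->name);
		free(o);
	}
}

/* Swap in a new cached object; hand the stale one back to the caller. */
static struct obj *stock_replace(struct obj *newobj)
{
	struct obj *old;

	pthread_mutex_lock(&stock.lock);
	old = stock.cached;
	stock.cached = obj_get(newobj);
	pthread_mutex_unlock(&stock.lock);

	return old;	/* caller drops this reference outside the lock */
}

int main(void)
{
	struct obj *a = obj_new("a"), *b = obj_new("b"), *old;

	old = stock_replace(a);		/* NULL the first time around */
	if (old)
		obj_put(old);

	old = stock_replace(b);		/* returns the reference held for "a" */
	if (old)
		obj_put(old);		/* dropped only after unlocking */

	obj_put(a);
	obj_put(b);
	obj_put(stock.cached);
	return 0;
}

In the kernel the motivation is lock nesting: obj_cgroup_put() can end up in obj_cgroup_release(), which takes its own spinlock (objcg_lock in this patch), so it is only safe to call it once memcg_stock_pcp::stock_lock has been released, exactly as the new comment at the end of drain_obj_stock() says.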