return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}
-/* memcg and lruvec stats flushing */
-static void flush_memcg_stats_dwork(struct work_struct *w);
-static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
-static DEFINE_SPINLOCK(stats_flush_lock);
-
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
(current->flags & PF_EXITING);
}
#ifdef CONFIG_MEMCG_KMEM
-extern spinlock_t css_set_lock;
+static DEFINE_SPINLOCK(objcg_lock);
bool mem_cgroup_kmem_disabled(void)
{
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
- spin_lock_irqsave(&css_set_lock, flags);
+ spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
- spin_unlock_irqrestore(&css_set_lock, flags);
+ spin_unlock_irqrestore(&objcg_lock, flags);
percpu_ref_exit(ref);
kfree_rcu(objcg, rcu);
objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
- spin_lock_irq(&css_set_lock);
+ spin_lock_irq(&objcg_lock);
/* 1) Ready to reparent active objcg. */
list_add(&objcg->list, &memcg->objcg_list);
/* 3) Move already reparented objcgs to the parent's list */
list_splice(&memcg->objcg_list, &parent->objcg_list);
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irq(&objcg_lock);
percpu_ref_kill(&objcg->refcnt);
}
return mz;
}
+/*
+ * memcg and lruvec stats flushing
+ *
+ * Many codepaths leading to stats update or read are performance sensitive and
+ * adding stats flushing in such codepaths is not desirable. So, to optimize the
+ * flushing the kernel does:
+ *
+ * 1) Periodically and asynchronously flush the stats every 2 seconds to not let
+ * rstat update tree grow unbounded.
+ *
+ * 2) Flush the stats synchronously on reader side only when there are more than
+ * (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization
+ * will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but
+ * only for 2 seconds due to (1).
+ */
+static void flush_memcg_stats_dwork(struct work_struct *w);
+static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+static DEFINE_SPINLOCK(stats_flush_lock);
+static DEFINE_PER_CPU(unsigned int, stats_updates);
+static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+static u64 flush_next_time;
+
+#define FLUSH_TIME (2UL*HZ)
+
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+{
+ unsigned int x;
+
+ cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+
+ x = __this_cpu_add_return(stats_updates, abs(val));
+ if (x > MEMCG_CHARGE_BATCH) {
+ atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
+ __this_cpu_write(stats_updates, 0);
+ }
+}
+
+static void __mem_cgroup_flush_stats(void)
+{
+ unsigned long flag;
+
+ if (!spin_trylock_irqsave(&stats_flush_lock, flag))
+ return;
+
+ flush_next_time = jiffies_64 + 2*FLUSH_TIME;
+ cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+ atomic_set(&stats_flush_threshold, 0);
+ spin_unlock_irqrestore(&stats_flush_lock, flag);
+}
+
+void mem_cgroup_flush_stats(void)
+{
+ if (atomic_read(&stats_flush_threshold) > num_online_cpus())
+ __mem_cgroup_flush_stats();
+}
+
+void mem_cgroup_flush_stats_delayed(void)
+{
+ if (time_after64(jiffies_64, flush_next_time))
+ mem_cgroup_flush_stats();
+}
+
+static void flush_memcg_stats_dwork(struct work_struct *w)
+{
+ __mem_cgroup_flush_stats();
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
+}
+
/**
* __mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
return;
__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
- cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+ memcg_rstat_updated(memcg, val);
}
/* idx can be of type enum memcg_stat_item or node_stat_item. */
memcg = pn->memcg;
/* Update memcg */
- __mod_memcg_state(memcg, idx, val);
+ __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
/* Update lruvec */
__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+
+ memcg_rstat_updated(memcg, val);
}
/**
return;
__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
- cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+ memcg_rstat_updated(memcg, count);
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
*
* Current memory state:
*/
- cgroup_rstat_flush(memcg->css.cgroup);
+ mem_cgroup_flush_stats();
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
u64 size;
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
- ret = should_force_charge() || out_of_memory(&oc);
+ ret = task_is_dying() || out_of_memory(&oc);
unlock:
mutex_unlock(&oom_lock);
struct page_counter *counter;
enum oom_status oom_status;
unsigned long nr_reclaimed;
+ bool passed_oom = false;
bool may_swap = true;
bool drained = false;
unsigned long pflags;
goto force;
/*
- * Unlike in global OOM situations, memcg is not in a physical
- * memory shortage. Allow dying and OOM-killed tasks to
- * bypass the last charges so that they can exit quickly and
- * free their memory.
- */
- if (unlikely(should_force_charge()))
- goto force;
-
- /*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
* but we prefer facilitating memory reclaim and getting back
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
- if (fatal_signal_pending(current))
- goto force;
+ /* Avoid endless loop for tasks bypassed by the oom killer */
+ if (passed_oom && task_is_dying())
+ goto nomem;
/*
* keep retrying as long as the memcg oom killer is able to make
*/
oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
get_order(nr_pages * PAGE_SIZE));
- switch (oom_status) {
- case OOM_SUCCESS:
+ if (oom_status == OOM_SUCCESS) {
+ passed_oom = true;
nr_retries = MAX_RECLAIM_RETRIES;
goto retry;
- case OOM_FAILED:
- goto force;
- default:
- goto nomem;
}
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
unsigned long val;
if (mem_cgroup_is_root(memcg)) {
- /* mem_cgroup_threshold() calls here from irqsafe context */
- cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
+ mem_cgroup_flush_stats();
val = memcg_page_state(memcg, NR_FILE_PAGES) +
memcg_page_state(memcg, NR_ANON_MAPPED);
if (swap)
int nid;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- cgroup_rstat_flush(memcg->css.cgroup);
+ mem_cgroup_flush_stats();
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
seq_printf(m, "%s=%lu", stat->name,
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
- cgroup_rstat_flush(memcg->css.cgroup);
+ mem_cgroup_flush_stats();
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
+ mem_cgroup_flush_stats();
*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
memcg_wb_domain_size_changed(memcg);
}
-void mem_cgroup_flush_stats(void)
-{
- if (!spin_trylock(&stats_flush_lock))
- return;
-
- cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
- spin_unlock(&stats_flush_lock);
-}
-
-static void flush_memcg_stats_dwork(struct work_struct *w)
-{
- mem_cgroup_flush_stats();
- queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
-}
-
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
int i;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- cgroup_rstat_flush(memcg->css.cgroup);
+ mem_cgroup_flush_stats();
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
int nid;
if (!strcmp(token, "nokmem"))
cgroup_memory_nokmem = true;
}
- return 0;
+ return 1;
}
__setup("cgroup.memory=", cgroup_memory);