*/
enum mem_cgroup_events_target {
MEM_CGROUP_TARGET_THRESH,
+ MEM_CGROUP_TARGET_SOFTLIMIT,
MEM_CGROUP_TARGET_NUMAINFO,
MEM_CGROUP_NTARGETS,
};
bool oom_lock;
atomic_t under_oom;
+ atomic_t oom_wakeups;
int swappiness;
/* OOM-Killer disable */
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
*/
- unsigned long move_charge_at_immigrate;
+ unsigned long move_charge_at_immigrate;
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
*/
atomic_t numainfo_events;
atomic_t numainfo_updating;
#endif
+ /*
+ * Protects soft_contributed transitions.
+ * See mem_cgroup_update_soft_limit
+ */
+ spinlock_t soft_lock;
+
+ /*
+ * If true then this group has increased parents' children_in_excess
+ * when it got over the soft limit.
+ * When a group falls bellow the soft limit, parents' children_in_excess
+ * is decreased and soft_contributed changed to false.
+ */
+ bool soft_contributed;
+
+ /* Number of children that are in soft limit excess */
+ atomic_t children_in_excess;
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
case MEM_CGROUP_TARGET_THRESH:
next = val + THRESHOLDS_EVENTS_TARGET;
break;
+ case MEM_CGROUP_TARGET_SOFTLIMIT:
+ next = val + SOFTLIMIT_EVENTS_TARGET;
+ break;
case MEM_CGROUP_TARGET_NUMAINFO:
next = val + NUMAINFO_EVENTS_TARGET;
break;
}
/*
+ * Called from rate-limited memcg_check_events when enough
+ * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
+ * that all the parents up the hierarchy will be notified that this group
+ * is in excess or that it is not in excess anymore. mmecg->soft_contributed
+ * makes the transition a single action whenever the state flips from one to
+ * the other.
+ */
+static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
+{
+ unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
+ struct mem_cgroup *parent = memcg;
+ int delta = 0;
+
+ spin_lock(&memcg->soft_lock);
+ if (excess) {
+ if (!memcg->soft_contributed) {
+ delta = 1;
+ memcg->soft_contributed = true;
+ }
+ } else {
+ if (memcg->soft_contributed) {
+ delta = -1;
+ memcg->soft_contributed = false;
+ }
+ }
+
+ /*
+ * Necessary to update all ancestors when hierarchy is used
+ * because their event counter is not touched.
+ * We track children even outside the hierarchy for the root
+ * cgroup because tree walk starting at root should visit
+ * all cgroups and we want to prevent from pointless tree
+ * walk if no children is below the limit.
+ */
+ while (delta && (parent = parent_mem_cgroup(parent)))
+ atomic_add(delta, &parent->children_in_excess);
+ if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
+ atomic_add(delta, &root_mem_cgroup->children_in_excess);
+ spin_unlock(&memcg->soft_lock);
+}
+
+/*
* Check events in order.
*
*/
/* threshold event is triggered in finer grain than soft limit */
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
+ bool do_softlimit;
bool do_numainfo __maybe_unused;
+ do_softlimit = mem_cgroup_event_ratelimit(memcg,
+ MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
do_numainfo = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_NUMAINFO);
preempt_enable();
mem_cgroup_threshold(memcg);
+ if (unlikely(do_softlimit))
+ mem_cgroup_update_soft_limit(memcg);
#if MAX_NUMNODES > 1
if (unlikely(do_numainfo))
atomic_inc(&memcg->numainfo_events);
/*
* A group is eligible for the soft limit reclaim under the given root
* hierarchy if
- * a) it is over its soft limit
- * b) any parent up the hierarchy is over its soft limit
+ * a) it is over its soft limit
+ * b) any parent up the hierarchy is over its soft limit
+ *
+ * If the given group doesn't have any children over the limit then it
+ * doesn't make any sense to iterate its subtree.
*/
enum mem_cgroup_filter_t
mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
struct mem_cgroup *root)
{
- struct mem_cgroup *parent = memcg;
+ struct mem_cgroup *parent;
+
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ parent = memcg;
if (res_counter_soft_limit_excess(&memcg->res))
return VISIT;
* If any parent up to the root in the hierarchy is over its soft limit
* then we have to obey and reclaim from this group as well.
*/
- while((parent = parent_mem_cgroup(parent))) {
+ while ((parent = parent_mem_cgroup(parent))) {
if (res_counter_soft_limit_excess(&parent->res))
return VISIT;
if (parent == root)
break;
}
+ if (!atomic_read(&memcg->children_in_excess))
+ return SKIP_TREE;
return SKIP;
}
+static DEFINE_SPINLOCK(memcg_oom_lock);
+
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
- * Has to be called with memcg_oom_lock
*/
-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter, *failed = NULL;
+ spin_lock(&memcg_oom_lock);
+
for_each_mem_cgroup_tree(iter, memcg) {
if (iter->oom_lock) {
/*
iter->oom_lock = true;
}
- if (!failed)
- return true;
-
- /*
- * OK, we failed to lock the whole subtree so we have to clean up
- * what we set up to the failing subtree
- */
- for_each_mem_cgroup_tree(iter, memcg) {
- if (iter == failed) {
- mem_cgroup_iter_break(memcg, iter);
- break;
+ if (failed) {
+ /*
+ * OK, we failed to lock the whole subtree so we have
+ * to clean up what we set up to the failing subtree
+ */
+ for_each_mem_cgroup_tree(iter, memcg) {
+ if (iter == failed) {
+ mem_cgroup_iter_break(memcg, iter);
+ break;
+ }
+ iter->oom_lock = false;
}
- iter->oom_lock = false;
}
- return false;
+
+ spin_unlock(&memcg_oom_lock);
+
+ return !failed;
}
-/*
- * Has to be called with memcg_oom_lock
- */
-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
+ spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
- return 0;
+ spin_unlock(&memcg_oom_lock);
}
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
atomic_add_unless(&iter->under_oom, -1, 0);
}
-static DEFINE_SPINLOCK(memcg_oom_lock);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
+ atomic_inc(&memcg->oom_wakeups);
/* for filtering, pass "memcg" as argument. */
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
}
/*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ * try to call OOM killer
*/
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
- int order)
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- struct oom_wait_info owait;
- bool locked, need_to_kill;
+ bool locked;
+ int wakeups;
- owait.memcg = memcg;
- owait.wait.flags = 0;
- owait.wait.func = memcg_oom_wake_function;
- owait.wait.private = current;
- INIT_LIST_HEAD(&owait.wait.task_list);
- need_to_kill = true;
- mem_cgroup_mark_under_oom(memcg);
+ if (!current->memcg_oom.may_oom)
+ return;
+
+ current->memcg_oom.in_memcg_oom = 1;
- /* At first, try to OOM lock hierarchy under memcg.*/
- spin_lock(&memcg_oom_lock);
- locked = mem_cgroup_oom_lock(memcg);
/*
- * Even if signal_pending(), we can't quit charge() loop without
- * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
- * under OOM is always welcomed, use TASK_KILLABLE here.
+ * As with any blocking lock, a contender needs to start
+ * listening for wakeups before attempting the trylock,
+ * otherwise it can miss the wakeup from the unlock and sleep
+ * indefinitely. This is just open-coded because our locking
+ * is so particular to memcg hierarchies.
*/
- prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
- if (!locked || memcg->oom_kill_disable)
- need_to_kill = false;
+ wakeups = atomic_read(&memcg->oom_wakeups);
+ mem_cgroup_mark_under_oom(memcg);
+
+ locked = mem_cgroup_oom_trylock(memcg);
+
if (locked)
mem_cgroup_oom_notify(memcg);
- spin_unlock(&memcg_oom_lock);
- if (need_to_kill) {
- finish_wait(&memcg_oom_waitq, &owait.wait);
+ if (locked && !memcg->oom_kill_disable) {
+ mem_cgroup_unmark_under_oom(memcg);
mem_cgroup_out_of_memory(memcg, mask, order);
+ mem_cgroup_oom_unlock(memcg);
+ /*
+ * There is no guarantee that an OOM-lock contender
+ * sees the wakeups triggered by the OOM kill
+ * uncharges. Wake any sleepers explicitely.
+ */
+ memcg_oom_recover(memcg);
} else {
- schedule();
- finish_wait(&memcg_oom_waitq, &owait.wait);
+ /*
+ * A system call can just return -ENOMEM, but if this
+ * is a page fault and somebody else is handling the
+ * OOM already, we need to sleep on the OOM waitqueue
+ * for this memcg until the situation is resolved.
+ * Which can take some time because it might be
+ * handled by a userspace task.
+ *
+ * However, this is the charge context, which means
+ * that we may sit on a large call stack and hold
+ * various filesystem locks, the mmap_sem etc. and we
+ * don't want the OOM handler to deadlock on them
+ * while we sit here and wait. Store the current OOM
+ * context in the task_struct, then return -ENOMEM.
+ * At the end of the page fault handler, with the
+ * stack unwound, pagefault_out_of_memory() will check
+ * back with us by calling
+ * mem_cgroup_oom_synchronize(), possibly putting the
+ * task to sleep.
+ */
+ current->memcg_oom.oom_locked = locked;
+ current->memcg_oom.wakeups = wakeups;
+ css_get(&memcg->css);
+ current->memcg_oom.wait_on_memcg = memcg;
}
- spin_lock(&memcg_oom_lock);
- if (locked)
- mem_cgroup_oom_unlock(memcg);
- memcg_wakeup_oom(memcg);
- spin_unlock(&memcg_oom_lock);
+}
- mem_cgroup_unmark_under_oom(memcg);
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ *
+ * This has to be called at the end of a page fault if the the memcg
+ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ *
+ * Memcg supports userspace OOM handling, so failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation. Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to put the task to sleep and clean up the
+ * OOM state.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * finalized, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(void)
+{
+ struct oom_wait_info owait;
+ struct mem_cgroup *memcg;
- if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+ /* OOM is global, do not handle */
+ if (!current->memcg_oom.in_memcg_oom)
return false;
- /* Give chance to dying process */
- schedule_timeout_uninterruptible(1);
+
+ /*
+ * We invoked the OOM killer but there is a chance that a kill
+ * did not free up any charges. Everybody else might already
+ * be sleeping, so restart the fault and keep the rampage
+ * going until some charges are released.
+ */
+ memcg = current->memcg_oom.wait_on_memcg;
+ if (!memcg)
+ goto out;
+
+ if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+ goto out_memcg;
+
+ owait.memcg = memcg;
+ owait.wait.flags = 0;
+ owait.wait.func = memcg_oom_wake_function;
+ owait.wait.private = current;
+ INIT_LIST_HEAD(&owait.wait.task_list);
+
+ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+ /* Only sleep if we didn't miss any wakeups since OOM */
+ if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+ schedule();
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+out_memcg:
+ mem_cgroup_unmark_under_oom(memcg);
+ if (current->memcg_oom.oom_locked) {
+ mem_cgroup_oom_unlock(memcg);
+ /*
+ * There is no guarantee that an OOM-lock contender
+ * sees the wakeups triggered by the OOM kill
+ * uncharges. Wake any sleepers explicitely.
+ */
+ memcg_oom_recover(memcg);
+ }
+ css_put(&memcg->css);
+ current->memcg_oom.wait_on_memcg = NULL;
+out:
+ current->memcg_oom.in_memcg_oom = 0;
return true;
}
flush_work(&stock->work);
}
out:
- put_online_cpus();
+ put_online_cpus();
}
/*
CHARGE_RETRY, /* need to retry but retry is not bad */
CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
- CHARGE_OOM_DIE, /* the current is killed because of OOM */
};
static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages, unsigned int min_pages,
- bool oom_check)
+ bool invoke_oom)
{
unsigned long csize = nr_pages * PAGE_SIZE;
struct mem_cgroup *mem_over_limit;
if (mem_cgroup_wait_acct_move(mem_over_limit))
return CHARGE_RETRY;
- /* If we don't need to call oom-killer at el, return immediately */
- if (!oom_check)
- return CHARGE_NOMEM;
- /* check OOM */
- if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
- return CHARGE_OOM_DIE;
+ if (invoke_oom)
+ mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
- return CHARGE_RETRY;
+ return CHARGE_NOMEM;
}
/*
}
do {
- bool oom_check;
+ bool invoke_oom = oom && !nr_oom_retries;
/* If killed, bypass charge */
if (fatal_signal_pending(current)) {
goto bypass;
}
- oom_check = false;
- if (oom && !nr_oom_retries) {
- oom_check = true;
- nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
- }
-
- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
- oom_check);
+ ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+ nr_pages, invoke_oom);
switch (ret) {
case CHARGE_OK:
break;
css_put(&memcg->css);
goto nomem;
case CHARGE_NOMEM: /* OOM routine works */
- if (!oom) {
+ if (!oom || invoke_oom) {
css_put(&memcg->css);
goto nomem;
}
- /* If oom, we never return -ENOMEM */
nr_oom_retries--;
break;
- case CHARGE_OOM_DIE: /* Killed by OOM Killer */
- css_put(&memcg->css);
- goto bypass;
}
} while (ret != CHARGE_OK);
* is accessed after testing USED bit. To make pc->mem_cgroup visible
* before USED bit, we need memory barrier here.
* See mem_cgroup_add_lru_list(), etc.
- */
+ */
smp_wmb();
SetPageCgroupUsed(pc);
* the page allocator. Therefore, the following sequence when backed by
* the SLUB allocator:
*
- * memcg_stop_kmem_account();
- * kmalloc(<large_number>)
- * memcg_resume_kmem_account();
+ * memcg_stop_kmem_account();
+ * kmalloc(<large_number>)
+ * memcg_resume_kmem_account();
*
* would effectively ignore the fact that we should skip accounting,
* since it will drive us directly to this function without passing
MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
- if (curusage >= oldusage)
+ if (curusage >= oldusage)
retry_count--;
else
oldusage = curusage;
int enlarge = 0;
/* see mem_cgroup_resize_res_limit */
- retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
+ retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
while (retry_count) {
if (signal_pending(current)) {
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
+ spin_lock_init(&memcg->soft_lock);
return &memcg->css;
mem_cgroup_invalidate_reclaim_iterators(memcg);
mem_cgroup_reparent_charges(memcg);
+ if (memcg->soft_contributed) {
+ while ((memcg = parent_mem_cgroup(memcg)))
+ atomic_dec(&memcg->children_in_excess);
+
+ if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
+ atomic_dec(&root_mem_cgroup->children_in_excess);
+ }
mem_cgroup_destroy_all_caches(memcg);
vmpressure_cleanup(&memcg->vmpressure);
}