* matches memcg->dead_count of the hierarchy root group.
*/
struct mem_cgroup *last_visited;
- unsigned long last_dead_count;
+ int last_dead_count;
/* scan generation, increased every round-trip */
unsigned int generation;
/* internal only representation about the status of kmem accounting. */
enum {
- KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
- KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
+ KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};
-/* We account when limit is on, but only after call sites are patched */
-#define KMEM_ACCOUNTED_MASK \
- ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
-
#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
-static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
-{
- set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
-static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
-{
- clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
/*
* protected by css_get and the tree walk is rcu safe.
*/
if (next_css) {
- struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
-
- if (css_tryget(&mem->css))
- return mem;
+ if ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))
+ return mem_cgroup_from_css(next_css);
else {
prev_css = next_css;
goto skip_node;
if (iter->last_dead_count == *sequence) {
smp_rmb();
position = iter->last_visited;
- if (position && !css_tryget(&position->css))
+
+ /*
+ * We cannot take a reference to root because we might race
+ * with root removal and returning NULL would end up in
+ * an endless loop on the iterator user level when root
+ * would be returned all the time.
+ */
+ if (position && position != root &&
+ !css_tryget(&position->css))
position = NULL;
}
return position;
static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
struct mem_cgroup *last_visited,
struct mem_cgroup *new_position,
+ struct mem_cgroup *root,
int sequence)
{
- if (last_visited)
+ /* root reference counting symmetric to mem_cgroup_iter_load */
+ if (last_visited && last_visited != root)
css_put(&last_visited->css);
/*
* We store the sequence count from the time @last_visited was
memcg = __mem_cgroup_iter_next(root, last_visited);
if (reclaim) {
- mem_cgroup_iter_update(iter, last_visited, memcg, seq);
+ mem_cgroup_iter_update(iter, last_visited, memcg, root,
+ seq);
if (!memcg)
iter->generation++;
break;
};
points = oom_badness(task, memcg, NULL, totalpages);
- if (points > chosen_points) {
- if (chosen)
- put_task_struct(chosen);
- chosen = task;
- chosen_points = points;
- get_task_struct(chosen);
- }
+ if (!points || points < chosen_points)
+ continue;
+ /* Prefer thread group leaders for display purposes */
+ if (points == chosen_points &&
+ thread_group_leader(chosen))
+ continue;
+
+ if (chosen)
+ put_task_struct(chosen);
+ chosen = task;
+ chosen_points = points;
+ get_task_struct(chosen);
}
css_task_iter_end(&it);
}
static DEFINE_MUTEX(set_limit_mutex);
#ifdef CONFIG_MEMCG_KMEM
+static DEFINE_MUTEX(activate_kmem_mutex);
+
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
- (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) ==
- KMEM_ACCOUNTED_MASK;
+ memcg_kmem_is_active(memcg);
}
/*
return memcg ? memcg->kmemcg_id : -1;
}
-/*
- * This ends up being protected by the set_limit mutex, during normal
- * operation, because that is its main call site.
- *
- * But when we create a new cache, we can call this as well if its parent
- * is kmem-limited. That will have to hold set_limit_mutex as well.
- */
-static int memcg_update_cache_sizes(struct mem_cgroup *memcg)
-{
- int num, ret;
-
- num = ida_simple_get(&kmem_limited_groups,
- 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
- if (num < 0)
- return num;
- /*
- * After this point, kmem_accounted (that we test atomically in
- * the beginning of this conditional), is no longer 0. This
- * guarantees only one process will set the following boolean
- * to true. We don't need test_and_set because we're protected
- * by the set_limit_mutex anyway.
- */
- memcg_kmem_set_activated(memcg);
-
- ret = memcg_update_all_caches(num+1);
- if (ret) {
- ida_simple_remove(&kmem_limited_groups, num);
- memcg_kmem_clear_activated(memcg);
- return ret;
- }
-
- memcg->kmemcg_id = num;
- INIT_LIST_HEAD(&memcg->memcg_slab_caches);
- mutex_init(&memcg->slab_caches_mutex);
- return 0;
-}
-
static size_t memcg_caches_array_size(int num_groups)
{
ssize_t size;
if (num_groups > memcg_limited_groups_array_size) {
int i;
+ struct memcg_cache_params *new_params;
ssize_t size = memcg_caches_array_size(num_groups);
size *= sizeof(void *);
size += offsetof(struct memcg_cache_params, memcg_caches);
- s->memcg_params = kzalloc(size, GFP_KERNEL);
- if (!s->memcg_params) {
- s->memcg_params = cur_params;
+ new_params = kzalloc(size, GFP_KERNEL);
+ if (!new_params)
return -ENOMEM;
- }
- s->memcg_params->is_root_cache = true;
+ new_params->is_root_cache = true;
/*
* There is the chance it will be bigger than
for (i = 0; i < memcg_limited_groups_array_size; i++) {
if (!cur_params->memcg_caches[i])
continue;
- s->memcg_params->memcg_caches[i] =
+ new_params->memcg_caches[i] =
cur_params->memcg_caches[i];
}
* bigger than the others. And all updates will reset this
* anyway.
*/
- kfree(cur_params);
+ rcu_assign_pointer(s->memcg_params, new_params);
+ if (cur_params)
+ kfree_rcu(cur_params, rcu_head);
}
return 0;
}
schedule_work(&cachep->memcg_params->destroy);
}
-/*
- * This lock protects updaters, not readers. We want readers to be as fast as
- * they can, and they will either see NULL or a valid cache value. Our model
- * allow them to see NULL, in which case the root memcg will be selected.
- *
- * We need this lock because multiple allocations to the same cache from a non
- * will span more than one worker. Only one of them can create the cache.
- */
-static DEFINE_MUTEX(memcg_cache_mutex);
-
-/*
- * Called with memcg_cache_mutex held
- */
-static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
- struct kmem_cache *s)
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *s)
{
struct kmem_cache *new;
static char *tmp_name = NULL;
+ static DEFINE_MUTEX(mutex); /* protects tmp_name */
- lockdep_assert_held(&memcg_cache_mutex);
+ BUG_ON(!memcg_can_account_kmem(memcg));
+ mutex_lock(&mutex);
/*
* kmem_cache_create_memcg duplicates the given name and
* cgroup_name for this name requires RCU context.
if (new)
new->allocflags |= __GFP_KMEMCG;
+ else
+ new = s;
+ mutex_unlock(&mutex);
return new;
}
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- struct kmem_cache *new_cachep;
-
- BUG_ON(!memcg_can_account_kmem(memcg));
-
- mutex_lock(&memcg_cache_mutex);
- new_cachep = kmem_cache_dup(memcg, cachep);
- if (new_cachep == NULL)
- new_cachep = cachep;
- mutex_unlock(&memcg_cache_mutex);
- return new_cachep;
-}
-
void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
{
struct kmem_cache *c;
*
* Still, we don't want anyone else freeing memcg_caches under our
* noses, which can happen if a new memcg comes to life. As usual,
- * we'll take the set_limit_mutex to protect ourselves against this.
+ * we'll take the activate_kmem_mutex to protect ourselves against
+ * this.
*/
- mutex_lock(&set_limit_mutex);
+ mutex_lock(&activate_kmem_mutex);
for_each_memcg_cache_index(i) {
c = cache_from_memcg_idx(s, i);
if (!c)
cancel_work_sync(&c->memcg_params->destroy);
kmem_cache_destroy(c);
}
- mutex_unlock(&set_limit_mutex);
+ mutex_unlock(&activate_kmem_mutex);
}
struct create_work {
return val;
}
-static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
-{
- int ret = -EINVAL;
#ifdef CONFIG_MEMCG_KMEM
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+/* should be called with activate_kmem_mutex held */
+static int __memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long long limit)
+{
+ int err = 0;
+ int memcg_id;
+
+ if (memcg_kmem_is_active(memcg))
+ return 0;
+
+ /*
+ * We are going to allocate memory for data shared by all memory
+ * cgroups so let's stop accounting here.
+ */
+ memcg_stop_kmem_account();
+
/*
* For simplicity, we won't allow this to be disabled. It also can't
* be changed if the cgroup has children already, or if tasks had
* of course permitted.
*/
mutex_lock(&memcg_create_mutex);
- mutex_lock(&set_limit_mutex);
- if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
- if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
- ret = -EBUSY;
- goto out;
- }
- ret = res_counter_set_limit(&memcg->kmem, val);
- VM_BUG_ON(ret);
+ if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
+ err = -EBUSY;
+ mutex_unlock(&memcg_create_mutex);
+ if (err)
+ goto out;
- ret = memcg_update_cache_sizes(memcg);
- if (ret) {
- res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
- goto out;
- }
- static_key_slow_inc(&memcg_kmem_enabled_key);
- /*
- * setting the active bit after the inc will guarantee no one
- * starts accounting before all call sites are patched
- */
- memcg_kmem_set_active(memcg);
- } else
- ret = res_counter_set_limit(&memcg->kmem, val);
+ memcg_id = ida_simple_get(&kmem_limited_groups,
+ 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+ if (memcg_id < 0) {
+ err = memcg_id;
+ goto out;
+ }
+
+ /*
+ * Make sure we have enough space for this cgroup in each root cache's
+ * memcg_params.
+ */
+ err = memcg_update_all_caches(memcg_id + 1);
+ if (err)
+ goto out_rmid;
+
+ memcg->kmemcg_id = memcg_id;
+ INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+ mutex_init(&memcg->slab_caches_mutex);
+
+ /*
+ * We couldn't have accounted to this cgroup, because it hasn't got the
+ * active bit set yet, so this should succeed.
+ */
+ err = res_counter_set_limit(&memcg->kmem, limit);
+ VM_BUG_ON(err);
+
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+ /*
+ * Setting the active bit after enabling static branching will
+ * guarantee no one starts accounting before all call sites are
+ * patched.
+ */
+ memcg_kmem_set_active(memcg);
out:
- mutex_unlock(&set_limit_mutex);
- mutex_unlock(&memcg_create_mutex);
-#endif
+ memcg_resume_kmem_account();
+ return err;
+
+out_rmid:
+ ida_simple_remove(&kmem_limited_groups, memcg_id);
+ goto out;
+}
+
+static int memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long long limit)
+{
+ int ret;
+
+ mutex_lock(&activate_kmem_mutex);
+ ret = __memcg_activate_kmem(memcg, limit);
+ mutex_unlock(&activate_kmem_mutex);
+ return ret;
+}
+
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ int ret;
+
+ if (!memcg_kmem_is_active(memcg))
+ ret = memcg_activate_kmem(memcg, val);
+ else
+ ret = res_counter_set_limit(&memcg->kmem, val);
return ret;
}
-#ifdef CONFIG_MEMCG_KMEM
static int memcg_propagate_kmem(struct mem_cgroup *memcg)
{
int ret = 0;
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- if (!parent)
- goto out;
- memcg->kmem_account_flags = parent->kmem_account_flags;
- /*
- * When that happen, we need to disable the static branch only on those
- * memcgs that enabled it. To achieve this, we would be forced to
- * complicate the code by keeping track of which memcgs were the ones
- * that actually enabled limits, and which ones got it from its
- * parents.
- *
- * It is a lot simpler just to do static_key_slow_inc() on every child
- * that is accounted.
- */
- if (!memcg_kmem_is_active(memcg))
- goto out;
+ if (!parent)
+ return 0;
+ mutex_lock(&activate_kmem_mutex);
/*
- * __mem_cgroup_free() will issue static_key_slow_dec() because this
- * memcg is active already. If the later initialization fails then the
- * cgroup core triggers the cleanup so we do not have to do it here.
+ * If the parent cgroup is not kmem-active now, it cannot be activated
+ * after this point, because it has at least one child already.
*/
- static_key_slow_inc(&memcg_kmem_enabled_key);
-
- mutex_lock(&set_limit_mutex);
- memcg_stop_kmem_account();
- ret = memcg_update_cache_sizes(memcg);
- memcg_resume_kmem_account();
- mutex_unlock(&set_limit_mutex);
-out:
+ if (memcg_kmem_is_active(parent))
+ ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
+ mutex_unlock(&activate_kmem_mutex);
return ret;
}
+#else
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_MEMCG_KMEM */
/*
else if (type == _MEMSWAP)
ret = mem_cgroup_resize_memsw_limit(memcg, val);
else if (type == _KMEM)
- ret = memcg_update_kmem_limit(css, val);
+ ret = memcg_update_kmem_limit(memcg, val);
else
return -EINVAL;
break;
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
- int error = 0;
if (css->cgroup->id > MEM_CGROUP_ID_MAX)
return -ENOSPC;
if (parent != root_mem_cgroup)
mem_cgroup_subsys.broken_hierarchy = true;
}
-
- error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
mutex_unlock(&memcg_create_mutex);
- return error;
+
+ return memcg_init_kmem(memcg, &mem_cgroup_subsys);
}
/*