* increase it.
*/
static DEFINE_IDA(kmem_limited_groups);
-static int memcg_limited_groups_array_size;
+int memcg_limited_groups_array_size;
+
/*
* MIN_SIZE is different than 1, because we would like to avoid going through
* the alloc/free process all the time. In a small machine, 4 kmem-limited
memcg_check_events(memcg, page);
}
+static DEFINE_MUTEX(set_limit_mutex);
+
#ifdef CONFIG_MEMCG_KMEM
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
(memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
}
+/*
+ * This is a bit cumbersome, but it is rarely used and avoids a backpointer
+ * in the memcg_cache_params struct.
+ */
+static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
+{
+ struct kmem_cache *cachep;
+
+ VM_BUG_ON(p->is_root_cache);
+ cachep = p->root_cache;
+ return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
+}
+
+#ifdef CONFIG_SLABINFO
+static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ struct memcg_cache_params *params;
+
+ if (!memcg_can_account_kmem(memcg))
+ return -EIO;
+
+ print_slabinfo_header(m);
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_for_each_entry(params, &memcg->memcg_slab_caches, list)
+ cache_show(memcg_params_to_cache(params), m);
+ mutex_unlock(&memcg->slab_caches_mutex);
+
+ return 0;
+}
+#endif
+
static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
{
struct res_counter *fail_res;
return 0;
}
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
+int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
+ struct kmem_cache *root_cache)
{
size_t size = sizeof(struct memcg_cache_params);
if (!s->memcg_params)
return -ENOMEM;
- if (memcg)
+ if (memcg) {
s->memcg_params->memcg = memcg;
+ s->memcg_params->root_cache = root_cache;
+ }
return 0;
}
kfree(s->memcg_params);
}
+/*
+ * During the creation a new cache, we need to disable our accounting mechanism
+ * altogether. This is true even if we are not creating, but rather just
+ * enqueing new caches to be created.
+ *
+ * This is because that process will trigger allocations; some visible, like
+ * explicit kmallocs to auxiliary data structures, name strings and internal
+ * cache structures; some well concealed, like INIT_WORK() that can allocate
+ * objects during debug.
+ *
+ * If any allocation happens during memcg_kmem_get_cache, we will recurse back
+ * to it. This may not be a bounded recursion: since the first cache creation
+ * failed to complete (waiting on the allocation), we'll just try to create the
+ * cache again, failing at the same point.
+ *
+ * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
+ * memcg_kmem_skip_account. So we enclose anything that might allocate memory
+ * inside the following two functions.
+ */
+static inline void memcg_stop_kmem_account(void)
+{
+ VM_BUG_ON(!current->mm);
+ current->memcg_kmem_skip_account++;
+}
+
+static inline void memcg_resume_kmem_account(void)
+{
+ VM_BUG_ON(!current->mm);
+ current->memcg_kmem_skip_account--;
+}
+
+static void kmem_cache_destroy_work_func(struct work_struct *w)
+{
+ struct kmem_cache *cachep;
+ struct memcg_cache_params *p;
+
+ p = container_of(w, struct memcg_cache_params, destroy);
+
+ cachep = memcg_params_to_cache(p);
+
+ /*
+ * If we get down to 0 after shrink, we could delete right away.
+ * However, memcg_release_pages() already puts us back in the workqueue
+ * in that case. If we proceed deleting, we'll get a dangling
+ * reference, and removing the object from the workqueue in that case
+ * is unnecessary complication. We are not a fast path.
+ *
+ * Note that this case is fundamentally different from racing with
+ * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
+ * kmem_cache_shrink, not only we would be reinserting a dead cache
+ * into the queue, but doing so from inside the worker racing to
+ * destroy it.
+ *
+ * So if we aren't down to zero, we'll just schedule a worker and try
+ * again
+ */
+ if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
+ kmem_cache_shrink(cachep);
+ if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
+ return;
+ } else
+ kmem_cache_destroy(cachep);
+}
+
+void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
+{
+ if (!cachep->memcg_params->dead)
+ return;
+
+ /*
+ * There are many ways in which we can get here.
+ *
+ * We can get to a memory-pressure situation while the delayed work is
+ * still pending to run. The vmscan shrinkers can then release all
+ * cache memory and get us to destruction. If this is the case, we'll
+ * be executed twice, which is a bug (the second time will execute over
+ * bogus data). In this case, cancelling the work should be fine.
+ *
+ * But we can also get here from the worker itself, if
+ * kmem_cache_shrink is enough to shake all the remaining objects and
+ * get the page count to 0. In this case, we'll deadlock if we try to
+ * cancel the work (the worker runs with an internal lock held, which
+ * is the same lock we would hold for cancel_work_sync().)
+ *
+ * Since we can't possibly know who got us here, just refrain from
+ * running if there is already work pending
+ */
+ if (work_pending(&cachep->memcg_params->destroy))
+ return;
+ /*
+ * We have to defer the actual destroying to a workqueue, because
+ * we might currently be in a context that cannot sleep.
+ */
+ schedule_work(&cachep->memcg_params->destroy);
+}
+
static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
{
char *name;
return NULL;
new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
- (s->flags & ~SLAB_PANIC), s->ctor);
+ (s->flags & ~SLAB_PANIC), s->ctor, s);
+
+ if (new)
+ new->allocflags |= __GFP_KMEMCG;
kfree(name);
return new;
goto out;
new_cachep = kmem_cache_dup(memcg, cachep);
-
if (new_cachep == NULL) {
new_cachep = cachep;
goto out;
}
mem_cgroup_get(memcg);
- new_cachep->memcg_params->root_cache = cachep;
+ atomic_set(&new_cachep->memcg_params->nr_pages , 0);
cachep->memcg_params->memcg_caches[idx] = new_cachep;
/*
return new_cachep;
}
+void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
+{
+ struct kmem_cache *c;
+ int i;
+
+ if (!s->memcg_params)
+ return;
+ if (!s->memcg_params->is_root_cache)
+ return;
+
+ /*
+ * If the cache is being destroyed, we trust that there is no one else
+ * requesting objects from it. Even if there are, the sanity checks in
+ * kmem_cache_destroy should caught this ill-case.
+ *
+ * Still, we don't want anyone else freeing memcg_caches under our
+ * noses, which can happen if a new memcg comes to life. As usual,
+ * we'll take the set_limit_mutex to protect ourselves against this.
+ */
+ mutex_lock(&set_limit_mutex);
+ for (i = 0; i < memcg_limited_groups_array_size; i++) {
+ c = s->memcg_params->memcg_caches[i];
+ if (!c)
+ continue;
+
+ /*
+ * We will now manually delete the caches, so to avoid races
+ * we need to cancel all pending destruction workers and
+ * proceed with destruction ourselves.
+ *
+ * kmem_cache_destroy() will call kmem_cache_shrink internally,
+ * and that could spawn the workers again: it is likely that
+ * the cache still have active pages until this very moment.
+ * This would lead us back to mem_cgroup_destroy_cache.
+ *
+ * But that will not execute at all if the "dead" flag is not
+ * set, so flip it down to guarantee we are in control.
+ */
+ c->memcg_params->dead = false;
+ cancel_work_sync(&c->memcg_params->destroy);
+ kmem_cache_destroy(c);
+ }
+ mutex_unlock(&set_limit_mutex);
+}
+
struct create_work {
struct mem_cgroup *memcg;
struct kmem_cache *cachep;
struct work_struct work;
};
+static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+ struct kmem_cache *cachep;
+ struct memcg_cache_params *params;
+
+ if (!memcg_kmem_is_active(memcg))
+ return;
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
+ cachep = memcg_params_to_cache(params);
+ cachep->memcg_params->dead = true;
+ INIT_WORK(&cachep->memcg_params->destroy,
+ kmem_cache_destroy_work_func);
+ schedule_work(&cachep->memcg_params->destroy);
+ }
+ mutex_unlock(&memcg->slab_caches_mutex);
+}
+
static void memcg_create_cache_work_func(struct work_struct *w)
{
struct create_work *cw;
* Enqueue the creation of a per-memcg kmem_cache.
* Called with rcu_read_lock.
*/
-static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
+static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
{
struct create_work *cw;
schedule_work(&cw->work);
}
+static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
+{
+ /*
+ * We need to stop accounting when we kmalloc, because if the
+ * corresponding kmalloc cache is not yet created, the first allocation
+ * in __memcg_create_cache_enqueue will recurse.
+ *
+ * However, it is better to enclose the whole function. Depending on
+ * the debugging options enabled, INIT_WORK(), for instance, can
+ * trigger an allocation. This too, will make us recurse. Because at
+ * this point we can't allow ourselves back into memcg_kmem_get_cache,
+ * the safest choice is to do it like this, wrapping the whole function.
+ */
+ memcg_stop_kmem_account();
+ __memcg_create_cache_enqueue(memcg, cachep);
+ memcg_resume_kmem_account();
+}
/*
* Return the kmem_cache we're supposed to use for a slab allocation.
* We try to use the current memcg's version of the cache.
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+ if (!current->mm || current->memcg_kmem_skip_account)
+ return cachep;
+
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
rcu_read_unlock();
VM_BUG_ON(mem_cgroup_is_root(memcg));
memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
}
+#else
+static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+}
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
}
#endif
-static DEFINE_MUTEX(set_limit_mutex);
-
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long long val)
{
.trigger = mem_cgroup_reset,
.read = mem_cgroup_read,
},
+#ifdef CONFIG_SLABINFO
+ {
+ .name = "kmem.slabinfo",
+ .read_seq_string = mem_cgroup_slabinfo_read,
+ },
+#endif
#endif
{ }, /* terminate */
};
&per_cpu(memcg_stock, cpu);
INIT_WORK(&stock->work, drain_local_stock);
}
- hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
} else {
parent = mem_cgroup_from_cont(cont->parent);
memcg->use_hierarchy = parent->use_hierarchy;
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
mem_cgroup_reparent_charges(memcg);
+ mem_cgroup_destroy_all_caches(memcg);
}
static void mem_cgroup_css_free(struct cgroup *cont)
.use_id = 1,
};
+/*
+ * The rest of init is performed during ->css_alloc() for root css which
+ * happens before initcalls. hotcpu_notifier() can't be done together as
+ * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
+ * dependency. Do it from a subsys_initcall().
+ */
+static int __init mem_cgroup_init(void)
+{
+ hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
+ return 0;
+}
+subsys_initcall(mem_cgroup_init);
+
#ifdef CONFIG_MEMCG_SWAP
static int __init enable_swap_account(char *s)
{