memcg: fix endless loop caused by mem_cgroup_iter

[platform/adaptation/renesas_rcar/renesas_kernel.git] / mm / memcontrol.c
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d2da65c..da07784 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -149,7 +149,7 @@ struct mem_cgroup_reclaim_iter {
          * matches memcg->dead_count of the hierarchy root group.
          */
         struct mem_cgroup *last_visited;
-       unsigned long last_dead_count;
+       int last_dead_count;
  
         /* scan generation, increased every round-trip */
         unsigned int generation;
@@ -382,15 +382,10 @@ struct mem_cgroup {
  
  /* internal only representation about the status of kmem accounting. */
  enum {
-       KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
-       KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
+       KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
         KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
  };
  
-/* We account when limit is on, but only after call sites are patched */
-#define KMEM_ACCOUNTED_MASK \
-               ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
-
  #ifdef CONFIG_MEMCG_KMEM
  static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
  {
@@ -402,16 +397,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
         return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
  }
  
-static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
-{
-       set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
-static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
-{
-       clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
  static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
  {
         /*
@@ -1134,10 +1119,8 @@ skip_node:
          * protected by css_get and the tree walk is rcu safe.
          */
         if (next_css) {
-               struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
-
-               if (css_tryget(&mem->css))
-                       return mem;
+               if ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))
+                       return mem_cgroup_from_css(next_css);
                 else {
                         prev_css = next_css;
                         goto skip_node;
@@ -1175,7 +1158,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
         if (iter->last_dead_count == *sequence) {
                 smp_rmb();
                 position = iter->last_visited;
-               if (position && !css_tryget(&position->css))
+
+               /*
+                * Do not take a reference on root here: css_tryget() can fail
+                * when racing with root removal, and resetting position to
+                * NULL would then make the iterator hand back root on every
+                * call, so the iterator user would loop forever.
+                */
+               if (position && position != root &&
+                               !css_tryget(&position->css))
                         position = NULL;
         }
         return position;
@@ -1184,9 +1175,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
  static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
                                    struct mem_cgroup *last_visited,
                                    struct mem_cgroup *new_position,
+                                  struct mem_cgroup *root,
                                    int sequence)
  {
-       if (last_visited)
+       /* mem_cgroup_iter_load() takes no reference on root, so drop none */
+       if (last_visited && last_visited != root)
                 css_put(&last_visited->css);
         /*
          * We store the sequence count from the time @last_visited was
@@ -1261,7 +1254,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                 memcg = __mem_cgroup_iter_next(root, last_visited);
  
                 if (reclaim) {
-                       mem_cgroup_iter_update(iter, last_visited, memcg, seq);
+                       mem_cgroup_iter_update(iter, last_visited, memcg, root,
+                                       seq);
  
                         if (!memcg)
                                 iter->generation++;
@@ -1858,13 +1852,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                 break;
                         };
                         points = oom_badness(task, memcg, NULL, totalpages);
-                       if (points > chosen_points) {
-                               if (chosen)
-                                       put_task_struct(chosen);
-                               chosen = task;
-                               chosen_points = points;
-                               get_task_struct(chosen);
-                       }
+                       if (!points || points < chosen_points)
+                               continue;
+                       /* Prefer thread group leaders for display purposes */
+                       if (points == chosen_points &&
+                           thread_group_leader(chosen))
+                               continue;
+
+                       if (chosen)
+                               put_task_struct(chosen);
+                       chosen = task;
+                       chosen_points = points;
+                       get_task_struct(chosen);
                 }
                 css_task_iter_end(&it);
         }
@@ -2992,11 +2991,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
  static DEFINE_MUTEX(set_limit_mutex);
  
  #ifdef CONFIG_MEMCG_KMEM
+static DEFINE_MUTEX(activate_kmem_mutex);
+
  static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
  {
         return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
-               (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) ==
-                                                       KMEM_ACCOUNTED_MASK;
+               memcg_kmem_is_active(memcg);
  }
  
  /*
@@ -3105,43 +3105,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)
         return memcg ? memcg->kmemcg_id : -1;
  }
  
-/*
- * This ends up being protected by the set_limit mutex, during normal
- * operation, because that is its main call site.
- *
- * But when we create a new cache, we can call this as well if its parent
- * is kmem-limited. That will have to hold set_limit_mutex as well.
- */
-static int memcg_update_cache_sizes(struct mem_cgroup *memcg)
-{
-       int num, ret;
-
-       num = ida_simple_get(&kmem_limited_groups,
-                               0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
-       if (num < 0)
-               return num;
-       /*
-        * After this point, kmem_accounted (that we test atomically in
-        * the beginning of this conditional), is no longer 0. This
-        * guarantees only one process will set the following boolean
-        * to true. We don't need test_and_set because we're protected
-        * by the set_limit_mutex anyway.
-        */
-       memcg_kmem_set_activated(memcg);
-
-       ret = memcg_update_all_caches(num+1);
-       if (ret) {
-               ida_simple_remove(&kmem_limited_groups, num);
-               memcg_kmem_clear_activated(memcg);
-               return ret;
-       }
-
-       memcg->kmemcg_id = num;
-       INIT_LIST_HEAD(&memcg->memcg_slab_caches);
-       mutex_init(&memcg->slab_caches_mutex);
-       return 0;
-}
-
  static size_t memcg_caches_array_size(int num_groups)
  {
         ssize_t size;
@@ -3178,18 +3141,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
  
         if (num_groups > memcg_limited_groups_array_size) {
                 int i;
+               struct memcg_cache_params *new_params;
                 ssize_t size = memcg_caches_array_size(num_groups);
  
                 size *= sizeof(void *);
                 size += offsetof(struct memcg_cache_params, memcg_caches);
  
-               s->memcg_params = kzalloc(size, GFP_KERNEL);
-               if (!s->memcg_params) {
-                       s->memcg_params = cur_params;
+               new_params = kzalloc(size, GFP_KERNEL);
+               if (!new_params)
                         return -ENOMEM;
-               }
  
-               s->memcg_params->is_root_cache = true;
+               new_params->is_root_cache = true;
  
                 /*
                  * There is the chance it will be bigger than
@@ -3203,7 +3165,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
                 for (i = 0; i < memcg_limited_groups_array_size; i++) {
                         if (!cur_params->memcg_caches[i])
                                 continue;
-                       s->memcg_params->memcg_caches[i] =
+                       new_params->memcg_caches[i] =
                                                 cur_params->memcg_caches[i];
                 }
  
@@ -3216,7 +3178,9 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
                  * bigger than the others. And all updates will reset this
                  * anyway.
                  */
-               kfree(cur_params);
+               rcu_assign_pointer(s->memcg_params, new_params);
+               if (cur_params)
+                       kfree_rcu(cur_params, rcu_head);
         }
         return 0;
  }
@@ -3427,27 +3391,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
         schedule_work(&cachep->memcg_params->destroy);
  }
  
-/*
- * This lock protects updaters, not readers. We want readers to be as fast as
- * they can, and they will either see NULL or a valid cache value. Our model
- * allow them to see NULL, in which case the root memcg will be selected.
- *
- * We need this lock because multiple allocations to the same cache from a non
- * will span more than one worker. Only one of them can create the cache.
- */
-static DEFINE_MUTEX(memcg_cache_mutex);
-
-/*
- * Called with memcg_cache_mutex held
- */
-static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
-                                        struct kmem_cache *s)
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+                                                 struct kmem_cache *s)
  {
         struct kmem_cache *new;
         static char *tmp_name = NULL;
+       static DEFINE_MUTEX(mutex);     /* protects tmp_name */
  
-       lockdep_assert_held(&memcg_cache_mutex);
+       BUG_ON(!memcg_can_account_kmem(memcg));
  
+       mutex_lock(&mutex);
         /*
          * kmem_cache_create_memcg duplicates the given name and
          * cgroup_name for this name requires RCU context.
@@ -3470,25 +3423,13 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
  
         if (new)
                 new->allocflags |= __GFP_KMEMCG;
+       else
+               new = s;
  
+       mutex_unlock(&mutex);
         return new;
  }
  
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
-                                                 struct kmem_cache *cachep)
-{
-       struct kmem_cache *new_cachep;
-
-       BUG_ON(!memcg_can_account_kmem(memcg));
-
-       mutex_lock(&memcg_cache_mutex);
-       new_cachep = kmem_cache_dup(memcg, cachep);
-       if (new_cachep == NULL)
-               new_cachep = cachep;
-       mutex_unlock(&memcg_cache_mutex);
-       return new_cachep;
-}
-
  void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
  {
         struct kmem_cache *c;
@@ -3506,9 +3447,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
          *
          * Still, we don't want anyone else freeing memcg_caches under our
          * noses, which can happen if a new memcg comes to life. As usual,
-        * we'll take the set_limit_mutex to protect ourselves against this.
+        * we'll take the activate_kmem_mutex to protect ourselves against
+        * this.
          */
-       mutex_lock(&set_limit_mutex);
+       mutex_lock(&activate_kmem_mutex);
         for_each_memcg_cache_index(i) {
                 c = cache_from_memcg_idx(s, i);
                 if (!c)
@@ -3531,7 +3473,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
                 cancel_work_sync(&c->memcg_params->destroy);
                 kmem_cache_destroy(c);
         }
-       mutex_unlock(&set_limit_mutex);
+       mutex_unlock(&activate_kmem_mutex);
  }
  
  struct create_work {
@@ -5195,11 +5137,23 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
         return val;
  }
  
-static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
-{
-       int ret = -EINVAL;
  #ifdef CONFIG_MEMCG_KMEM
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+/* should be called with activate_kmem_mutex held */
+static int __memcg_activate_kmem(struct mem_cgroup *memcg,
+                                unsigned long long limit)
+{
+       int err = 0;
+       int memcg_id;
+
+       if (memcg_kmem_is_active(memcg))
+               return 0;
+
+       /*
+        * We are going to allocate memory for data shared by all memory
+        * cgroups so let's stop accounting here.
+        */
+       memcg_stop_kmem_account();
+
         /*
          * For simplicity, we won't allow this to be disabled.  It also can't
          * be changed if the cgroup has children already, or if tasks had
@@ -5213,72 +5167,101 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
          * of course permitted.
          */
         mutex_lock(&memcg_create_mutex);
-       mutex_lock(&set_limit_mutex);
-       if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
-               if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
-                       ret = -EBUSY;
-                       goto out;
-               }
-               ret = res_counter_set_limit(&memcg->kmem, val);
-               VM_BUG_ON(ret);
+       if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
+               err = -EBUSY;
+       mutex_unlock(&memcg_create_mutex);
+       if (err)
+               goto out;
  
-               ret = memcg_update_cache_sizes(memcg);
-               if (ret) {
-                       res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
-                       goto out;
-               }
-               static_key_slow_inc(&memcg_kmem_enabled_key);
-               /*
-                * setting the active bit after the inc will guarantee no one
-                * starts accounting before all call sites are patched
-                */
-               memcg_kmem_set_active(memcg);
-       } else
-               ret = res_counter_set_limit(&memcg->kmem, val);
+       memcg_id = ida_simple_get(&kmem_limited_groups,
+                                 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+       if (memcg_id < 0) {
+               err = memcg_id;
+               goto out;
+       }
+
+       /*
+        * Make sure we have enough space for this cgroup in each root cache's
+        * memcg_params.
+        */
+       err = memcg_update_all_caches(memcg_id + 1);
+       if (err)
+               goto out_rmid;
+
+       memcg->kmemcg_id = memcg_id;
+       INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+       mutex_init(&memcg->slab_caches_mutex);
+
+       /*
+        * Nothing can have been charged to this cgroup yet, because its
+        * active bit is not set, so setting the limit below should succeed.
+        */
+       err = res_counter_set_limit(&memcg->kmem, limit);
+       VM_BUG_ON(err);
+
+       static_key_slow_inc(&memcg_kmem_enabled_key);
+       /*
+        * Setting the active bit after enabling static branching will
+        * guarantee no one starts accounting before all call sites are
+        * patched.
+        */
+       memcg_kmem_set_active(memcg);
  out:
-       mutex_unlock(&set_limit_mutex);
-       mutex_unlock(&memcg_create_mutex);
-#endif
+       memcg_resume_kmem_account();
+       return err;
+
+out_rmid:
+       ida_simple_remove(&kmem_limited_groups, memcg_id);
+       goto out;
+}
+
+static int memcg_activate_kmem(struct mem_cgroup *memcg,
+                              unsigned long long limit)
+{
+       int ret;
+
+       mutex_lock(&activate_kmem_mutex);
+       ret = __memcg_activate_kmem(memcg, limit);
+       mutex_unlock(&activate_kmem_mutex);
+       return ret;
+}
+
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+                                  unsigned long long val)
+{
+       int ret;
+
+       if (!memcg_kmem_is_active(memcg))
+               ret = memcg_activate_kmem(memcg, val);
+       else
+               ret = res_counter_set_limit(&memcg->kmem, val);
         return ret;
  }
  
-#ifdef CONFIG_MEMCG_KMEM
  static int memcg_propagate_kmem(struct mem_cgroup *memcg)
  {
         int ret = 0;
         struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-       if (!parent)
-               goto out;
  
-       memcg->kmem_account_flags = parent->kmem_account_flags;
-       /*
-        * When that happen, we need to disable the static branch only on those
-        * memcgs that enabled it. To achieve this, we would be forced to
-        * complicate the code by keeping track of which memcgs were the ones
-        * that actually enabled limits, and which ones got it from its
-        * parents.
-        *
-        * It is a lot simpler just to do static_key_slow_inc() on every child
-        * that is accounted.
-        */
-       if (!memcg_kmem_is_active(memcg))
-               goto out;
+       if (!parent)
+               return 0;
  
+       mutex_lock(&activate_kmem_mutex);
         /*
-        * __mem_cgroup_free() will issue static_key_slow_dec() because this
-        * memcg is active already. If the later initialization fails then the
-        * cgroup core triggers the cleanup so we do not have to do it here.
+        * If the parent cgroup is not kmem-active now, it cannot be activated
+        * after this point, because it has at least one child already.
          */
-       static_key_slow_inc(&memcg_kmem_enabled_key);
-
-       mutex_lock(&set_limit_mutex);
-       memcg_stop_kmem_account();
-       ret = memcg_update_cache_sizes(memcg);
-       memcg_resume_kmem_account();
-       mutex_unlock(&set_limit_mutex);
-out:
+       if (memcg_kmem_is_active(parent))
+               ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
+       mutex_unlock(&activate_kmem_mutex);
         return ret;
  }
+#else
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+                                  unsigned long long val)
+{
+       return -EINVAL;
+}
  #endif /* CONFIG_MEMCG_KMEM */
  
  /*
@@ -5312,7 +5295,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
                 else if (type == _MEMSWAP)
                         ret = mem_cgroup_resize_memsw_limit(memcg, val);
                 else if (type == _KMEM)
-                       ret = memcg_update_kmem_limit(css, val);
+                       ret = memcg_update_kmem_limit(memcg, val);
                 else
                         return -EINVAL;
                 break;
@@ -6546,7 +6529,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
         struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
-       int error = 0;
  
         if (css->cgroup->id > MEM_CGROUP_ID_MAX)
                 return -ENOSPC;
@@ -6581,10 +6563,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
                 if (parent != root_mem_cgroup)
                         mem_cgroup_subsys.broken_hierarchy = true;
         }
-
-       error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
         mutex_unlock(&memcg_create_mutex);
-       return error;
+
+       return memcg_init_kmem(memcg, &mem_cgroup_subsys);
  }
  
  /*