memcg: move mem_cgroup_soft_limit_tree_init to mem_cgroup_init
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fbb60b1..bb894b0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -120,6 +120,14 @@ static const char * const mem_cgroup_events_names[] = {
        "pgmajfault",
 };
 
+static const char * const mem_cgroup_lru_names[] = {
+       "inactive_anon",
+       "active_anon",
+       "inactive_file",
+       "active_file",
+       "unevictable",
+};
+
 /*
  * Per memcg event counter is incremented at every pagein/pageout. With THP,
  * it will be incremented by the number of pages. This counter is used for
@@ -172,7 +180,7 @@ struct mem_cgroup_per_node {
 };
 
 struct mem_cgroup_lru_info {
-       struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
+       struct mem_cgroup_per_node *nodeinfo[0];
 };
 
 /*
@@ -276,17 +284,6 @@ struct mem_cgroup {
         */
        struct res_counter kmem;
        /*
-        * Per cgroup active and inactive list, similar to the
-        * per zone LRU lists.
-        */
-       struct mem_cgroup_lru_info info;
-       int last_scanned_node;
-#if MAX_NUMNODES > 1
-       nodemask_t      scan_nodes;
-       atomic_t        numainfo_events;
-       atomic_t        numainfo_updating;
-#endif
-       /*
         * Should the accounting and control be hierarchical, per subtree?
         */
        bool use_hierarchy;
@@ -349,8 +346,29 @@ struct mem_cgroup {
         /* Index in the kmem_cache->memcg_params->memcg_caches array */
        int kmemcg_id;
 #endif
+
+       int last_scanned_node;
+#if MAX_NUMNODES > 1
+       nodemask_t      scan_nodes;
+       atomic_t        numainfo_events;
+       atomic_t        numainfo_updating;
+#endif
+       /*
+        * Per cgroup active and inactive list, similar to the
+        * per zone LRU lists.
+        *
+        * WARNING: This has to be the last element of the struct. Don't
+        * add new fields after this point.
+        */
+       struct mem_cgroup_lru_info info;
 };
 
+static size_t memcg_size(void)
+{
+       return sizeof(struct mem_cgroup) +
+               nr_node_ids * sizeof(struct mem_cgroup_per_node);
+}
+
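
The layout change in this hunk replaces the fixed MAX_NUMNODES-sized nodeinfo array with a zero-length array that must remain the last member of struct mem_cgroup, so the allocation size can be computed at runtime from nr_node_ids (see memcg_size() above). A minimal userspace sketch of that allocation pattern, with made-up names (memcg_like, per_node, memcg_like_size) and the trailing space sized simply by pointer count rather than the kernel's exact formula:

    #include <stdio.h>
    #include <stdlib.h>

    struct per_node { long stat[4]; };       /* stand-in for mem_cgroup_per_node */

    struct memcg_like {
    	long res_usage;                      /* fixed-size fields come first */
    	/* must stay the last member: the array extends past the struct */
    	struct per_node *nodeinfo[];
    };

    static size_t memcg_like_size(int nr_node_ids)
    {
    	return sizeof(struct memcg_like) +
    	       nr_node_ids * sizeof(struct per_node *);
    }

    int main(void)
    {
    	int nr_node_ids = 4;                 /* runtime node count, not MAX_NUMNODES */
    	struct memcg_like *m = calloc(1, memcg_like_size(nr_node_ids));

    	if (!m)
    		return 1;
    	for (int n = 0; n < nr_node_ids; n++)
    		m->nodeinfo[n] = calloc(1, sizeof(struct per_node));

    	printf("%zu bytes for the struct plus %d node pointers\n",
    	       memcg_like_size(nr_node_ids), nr_node_ids);

    	for (int n = 0; n < nr_node_ids; n++)
    		free(m->nodeinfo[n]);
    	free(m);
    	return 0;
    }

This is also why the WARNING comment above insists that no field be added after info.
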
 /* internal only representation about the status of kmem accounting. */
 enum {
        KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
@@ -398,8 +416,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
 
 /* Stuffs for move charges at task migration. */
 /*
- * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
- * left-shifted bitmap of these types.
+ * Types of charges to be moved. "move_charge_at_immigrate" and
+ * "immigrate_flags" are treated as a left-shifted bitmap of these types.
  */
 enum move_type {
        MOVE_CHARGE_TYPE_ANON,  /* private anonymous page and swap of it */
@@ -412,6 +430,7 @@ static struct move_charge_struct {
        spinlock_t        lock; /* for from, to */
        struct mem_cgroup *from;
        struct mem_cgroup *to;
+       unsigned long immigrate_flags;
        unsigned long precharge;
        unsigned long moved_charge;
        unsigned long moved_swap;
@@ -424,14 +443,12 @@ static struct move_charge_struct {
 
 static bool move_anon(void)
 {
-       return test_bit(MOVE_CHARGE_TYPE_ANON,
-                                       &mc.to->move_charge_at_immigrate);
+       return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
 }
 
 static bool move_file(void)
 {
-       return test_bit(MOVE_CHARGE_TYPE_FILE,
-                                       &mc.to->move_charge_at_immigrate);
+       return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
 }
 
 /*
@@ -471,6 +488,13 @@ enum res_type {
 #define MEM_CGROUP_RECLAIM_SHRINK_BIT  0x1
 #define MEM_CGROUP_RECLAIM_SHRINK      (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
 
+/*
+ * The memcg_create_mutex will be held whenever a new cgroup is created.
+ * As a consequence, any change that needs to protect against new child cgroups
+ * appearing has to hold it as well.
+ */
+static DEFINE_MUTEX(memcg_create_mutex);
+
 static void mem_cgroup_get(struct mem_cgroup *memcg);
 static void mem_cgroup_put(struct mem_cgroup *memcg);
 
@@ -627,6 +651,7 @@ static void drain_all_stock_async(struct mem_cgroup *memcg);
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 {
+       VM_BUG_ON((unsigned)nid >= nr_node_ids);
        return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
 }
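
The VM_BUG_ON() added above relies on a common bounds-check idiom: casting a possibly negative int to unsigned lets a single comparison reject both negative node ids and ids at or beyond nr_node_ids. A small standalone illustration, with NR_NODE_IDS and valid_nid() as made-up stand-ins:

    #include <assert.h>

    #define NR_NODE_IDS 4U

    static int valid_nid(int nid)
    {
    	/* -1 wraps to UINT_MAX, so one compare catches both bad cases */
    	return (unsigned int)nid < NR_NODE_IDS;
    }

    int main(void)
    {
    	assert(valid_nid(0));
    	assert(valid_nid(3));
    	assert(!valid_nid(-1));
    	assert(!valid_nid(4));
    	return 0;
    }
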
 
@@ -1371,17 +1396,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
        return inactive * inactive_ratio < active;
 }
 
-int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
-{
-       unsigned long active;
-       unsigned long inactive;
-
-       inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
-       active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
-
-       return (active > inactive);
-}
-
 #define mem_cgroup_from_res_counter(counter, member)   \
        container_of(counter, struct mem_cgroup, member)
 
@@ -1524,8 +1538,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
        spin_unlock_irqrestore(&memcg->move_lock, *flags);
 }
 
+#define K(x) ((x) << (PAGE_SHIFT-10))
 /**
- * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
+ * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
  * @memcg: The memory cgroup that went over limit
  * @p: Task that is going to be killed
  *
@@ -1543,8 +1558,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
         */
        static char memcg_name[PATH_MAX];
        int ret;
+       struct mem_cgroup *iter;
+       unsigned int i;
 
-       if (!memcg || !p)
+       if (!p)
                return;
 
        rcu_read_lock();
@@ -1563,7 +1580,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
        }
        rcu_read_unlock();
 
-       printk(KERN_INFO "Task in %s killed", memcg_name);
+       pr_info("Task in %s killed", memcg_name);
 
        rcu_read_lock();
        ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
@@ -1576,22 +1593,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
        /*
         * Continues from above, so we don't need a KERN_ level
         */
-       printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
+       pr_cont(" as a result of limit of %s\n", memcg_name);
 done:
 
-       printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
+       pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
                res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
                res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
                res_counter_read_u64(&memcg->res, RES_FAILCNT));
-       printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
-               "failcnt %llu\n",
+       pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
                res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
                res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
                res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
-       printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
+       pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
                res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
                res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
                res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
+
+       for_each_mem_cgroup_tree(iter, memcg) {
+               pr_info("Memory cgroup stats");
+
+               rcu_read_lock();
+               ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
+               if (!ret)
+                       pr_cont(" for %s", memcg_name);
+               rcu_read_unlock();
+               pr_cont(":");
+
+               for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+                       if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+                               continue;
+                       pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
+                               K(mem_cgroup_read_stat(iter, i)));
+               }
+
+               for (i = 0; i < NR_LRU_LISTS; i++)
+                       pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
+                               K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+
+               pr_cont("\n");
+       }
 }
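
The per-memcg statistics printed above are page counts converted to kilobytes by the K() macro defined earlier in this patch. As a quick standalone sanity check of the arithmetic (PAGE_SHIFT is assumed to be 12 here, i.e. 4 KiB pages; it varies by architecture):

    #include <assert.h>

    #define PAGE_SHIFT 12
    #define K(x) ((x) << (PAGE_SHIFT - 10))

    int main(void)
    {
    	assert(K(1)  == 4);     /* one 4 KiB page prints as 4KB   */
    	assert(K(25) == 100);   /* 25 pages print as 100KB        */
    	return 0;
    }
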
 
 /*
@@ -4391,8 +4431,8 @@ void mem_cgroup_print_bad_page(struct page *page)
 
        pc = lookup_page_cgroup_used(page);
        if (pc) {
-               printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
-                      pc, pc->flags, pc->mem_cgroup);
+               pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
+                        pc, pc->flags, pc->mem_cgroup);
        }
 }
 #endif
@@ -4719,6 +4759,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
 }
 
 /*
+ * This mainly exists for tests during the setting of use_hierarchy. Since
+ * this is the very setting we are changing, the current hierarchy value is
+ * meaningless.
+ */
+static inline bool __memcg_has_children(struct mem_cgroup *memcg)
+{
+       struct cgroup *pos;
+
+       /* bounce at first found */
+       cgroup_for_each_child(pos, memcg->css.cgroup)
+               return true;
+       return false;
+}
+
+/*
+ * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
+ * to be already dead (as in mem_cgroup_force_empty, for instance).  This is
+ * different from mem_cgroup_count_children(), in the sense that we don't
+ * really care how many children we have; we only need to know if we have any.
+ * It also counts any memcg without hierarchy as infertile.
+ */
+static inline bool memcg_has_children(struct mem_cgroup *memcg)
+{
+       return memcg->use_hierarchy && __memcg_has_children(memcg);
+}
+
+/*
  * Reclaims as many pages from the given memcg as possible and moves
  * the rest to the parent.
  *
@@ -4788,7 +4855,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
        if (parent)
                parent_memcg = mem_cgroup_from_cont(parent);
 
-       cgroup_lock();
+       mutex_lock(&memcg_create_mutex);
 
        if (memcg->use_hierarchy == val)
                goto out;
@@ -4803,7 +4870,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
         */
        if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
                                (val == 1 || val == 0)) {
-               if (list_empty(&cont->children))
+               if (!__memcg_has_children(memcg))
                        memcg->use_hierarchy = val;
                else
                        retval = -EBUSY;
@@ -4811,7 +4878,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
                retval = -EINVAL;
 
 out:
-       cgroup_unlock();
+       mutex_unlock(&memcg_create_mutex);
 
        return retval;
 }
@@ -4896,8 +4963,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 {
        int ret = -EINVAL;
 #ifdef CONFIG_MEMCG_KMEM
-       bool must_inc_static_branch = false;
-
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
        /*
         * For simplicity, we won't allow this to be disabled.  It also can't
@@ -4910,18 +4975,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
         *
         * After it first became limited, changes in the value of the limit are
         * of course permitted.
-        *
-        * Taking the cgroup_lock is really offensive, but it is so far the only
-        * way to guarantee that no children will appear. There are plenty of
-        * other offenders, and they should all go away. Fine grained locking
-        * is probably the way to go here. When we are fully hierarchical, we
-        * can also get rid of the use_hierarchy check.
         */
-       cgroup_lock();
+       mutex_lock(&memcg_create_mutex);
        mutex_lock(&set_limit_mutex);
        if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
-               if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
-                                               !list_empty(&cont->children))) {
+               if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
                        ret = -EBUSY;
                        goto out;
                }
@@ -4933,7 +4991,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
                        res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
                        goto out;
                }
-               must_inc_static_branch = true;
+               static_key_slow_inc(&memcg_kmem_enabled_key);
+               /*
+                * setting the active bit after the inc will guarantee no one
+                * starts accounting before all call sites are patched
+                */
+               memcg_kmem_set_active(memcg);
+
                /*
                 * kmem charges can outlive the cgroup. In the case of slab
                 * pages, for instance, a page contain objects from various
@@ -4945,28 +5009,7 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
                ret = res_counter_set_limit(&memcg->kmem, val);
 out:
        mutex_unlock(&set_limit_mutex);
-       cgroup_unlock();
-
-       /*
-        * We are by now familiar with the fact that we can't inc the static
-        * branch inside cgroup_lock. See disarm functions for details. A
-        * worker here is overkill, but also wrong: After the limit is set, we
-        * must start accounting right away. Since this operation can't fail,
-        * we can safely defer it to here - no rollback will be needed.
-        *
-        * The boolean used to control this is also safe, because
-        * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
-        * able to set it to true;
-        */
-       if (must_inc_static_branch) {
-               static_key_slow_inc(&memcg_kmem_enabled_key);
-               /*
-                * setting the active bit after the inc will guarantee no one
-                * starts accounting before all call sites are patched
-                */
-               memcg_kmem_set_active(memcg);
-       }
-
+       mutex_unlock(&memcg_create_mutex);
 #endif
        return ret;
 }
@@ -5148,15 +5191,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 
        if (val >= (1 << NR_MOVE_TYPE))
                return -EINVAL;
+
        /*
-        * We check this value several times in both in can_attach() and
-        * attach(), so we need cgroup lock to prevent this value from being
-        * inconsistent.
+        * No kind of locking is needed in here, because ->can_attach() will
+        * check this value once at the beginning of the process, and then carry
+        * on with stale data. This means that changes to this value will only
+        * affect task migrations starting after the change.
         */
-       cgroup_lock();
        memcg->move_charge_at_immigrate = val;
-       cgroup_unlock();
-
        return 0;
 }
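
The comment above can drop the cgroup_lock because ->can_attach() samples move_charge_at_immigrate once and then works only from that snapshot. A hedged userspace analogue of the snapshot-and-bitmap pattern, with illustrative names (write_move_flags, migrate) and bit positions merely mirroring enum move_type:

    #include <stdatomic.h>
    #include <stdio.h>

    enum { MOVE_ANON_BIT = 0, MOVE_FILE_BIT = 1 };   /* mirrors enum move_type */

    static _Atomic unsigned long move_flags;          /* stand-in for the tunable */

    /* writer: no lock; only migrations that start later see the new value */
    static void write_move_flags(unsigned long val)
    {
    	atomic_store(&move_flags, val);
    }

    /* reader: snapshot once, then use only the local copy for the whole
     * operation, so a concurrent write cannot change behaviour midway */
    static void migrate(void)
    {
    	unsigned long flags = atomic_load(&move_flags);

    	if (flags & (1UL << MOVE_ANON_BIT))
    		puts("would move anon charges");
    	if (flags & (1UL << MOVE_FILE_BIT))
    		puts("would move file charges");
    }

    int main(void)
    {
    	write_move_flags(1UL << MOVE_ANON_BIT);
    	migrate();
    	return 0;
    }
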
 #else
@@ -5214,14 +5256,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
 }
 #endif /* CONFIG_NUMA */
 
-static const char * const mem_cgroup_lru_names[] = {
-       "inactive_anon",
-       "active_anon",
-       "inactive_file",
-       "active_file",
-       "unevictable",
-};
-
 static inline void mem_cgroup_lru_names_not_uptodate(void)
 {
        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -5335,18 +5369,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
 
        parent = mem_cgroup_from_cont(cgrp->parent);
 
-       cgroup_lock();
+       mutex_lock(&memcg_create_mutex);
 
        /* If under hierarchy, only empty-root can set this value */
-       if ((parent->use_hierarchy) ||
-           (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
-               cgroup_unlock();
+       if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
+               mutex_unlock(&memcg_create_mutex);
                return -EINVAL;
        }
 
        memcg->swappiness = val;
 
-       cgroup_unlock();
+       mutex_unlock(&memcg_create_mutex);
 
        return 0;
 }
@@ -5672,17 +5705,16 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 
        parent = mem_cgroup_from_cont(cgrp->parent);
 
-       cgroup_lock();
+       mutex_lock(&memcg_create_mutex);
        /* oom-kill-disable is a flag for subhierarchy. */
-       if ((parent->use_hierarchy) ||
-           (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
-               cgroup_unlock();
+       if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
+               mutex_unlock(&memcg_create_mutex);
                return -EINVAL;
        }
        memcg->oom_kill_disable = val;
        if (!val)
                memcg_oom_recover(memcg);
-       cgroup_unlock();
+       mutex_unlock(&memcg_create_mutex);
        return 0;
 }
 
@@ -5797,33 +5829,6 @@ static struct cftype mem_cgroup_files[] = {
                .read_seq_string = memcg_numa_stat_show,
        },
 #endif
-#ifdef CONFIG_MEMCG_SWAP
-       {
-               .name = "memsw.usage_in_bytes",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
-               .read = mem_cgroup_read,
-               .register_event = mem_cgroup_usage_register_event,
-               .unregister_event = mem_cgroup_usage_unregister_event,
-       },
-       {
-               .name = "memsw.max_usage_in_bytes",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
-               .trigger = mem_cgroup_reset,
-               .read = mem_cgroup_read,
-       },
-       {
-               .name = "memsw.limit_in_bytes",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
-               .write_string = mem_cgroup_write,
-               .read = mem_cgroup_read,
-       },
-       {
-               .name = "memsw.failcnt",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
-               .trigger = mem_cgroup_reset,
-               .read = mem_cgroup_read,
-       },
-#endif
 #ifdef CONFIG_MEMCG_KMEM
        {
                .name = "kmem.limit_in_bytes",
@@ -5858,6 +5863,36 @@ static struct cftype mem_cgroup_files[] = {
        { },    /* terminate */
 };
 
+#ifdef CONFIG_MEMCG_SWAP
+static struct cftype memsw_cgroup_files[] = {
+       {
+               .name = "memsw.usage_in_bytes",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+               .read = mem_cgroup_read,
+               .register_event = mem_cgroup_usage_register_event,
+               .unregister_event = mem_cgroup_usage_unregister_event,
+       },
+       {
+               .name = "memsw.max_usage_in_bytes",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+               .trigger = mem_cgroup_reset,
+               .read = mem_cgroup_read,
+       },
+       {
+               .name = "memsw.limit_in_bytes",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+               .write_string = mem_cgroup_write,
+               .read = mem_cgroup_read,
+       },
+       {
+               .name = "memsw.failcnt",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+               .trigger = mem_cgroup_reset,
+               .read = mem_cgroup_read,
+       },
+       { },    /* terminate */
+};
+#endif
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
        struct mem_cgroup_per_node *pn;
@@ -5896,9 +5931,9 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
        struct mem_cgroup *memcg;
-       int size = sizeof(struct mem_cgroup);
+       size_t size = memcg_size();
 
-       /* Can be very big if MAX_NUMNODES is very big */
+       /* Can be very big if nr_node_ids is very big */
        if (size < PAGE_SIZE)
                memcg = kzalloc(size, GFP_KERNEL);
        else
@@ -5935,7 +5970,7 @@ out_free:
 static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
        int node;
-       int size = sizeof(struct mem_cgroup);
+       size_t size = memcg_size();
 
        mem_cgroup_remove_from_trees(memcg);
        free_css_id(&mem_cgroup_subsys, &memcg->css);
@@ -6017,19 +6052,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
-#ifdef CONFIG_MEMCG_SWAP
-static void __init enable_swap_cgroup(void)
-{
-       if (!mem_cgroup_disabled() && really_do_swap_account)
-               do_swap_account = 1;
-}
-#else
-static void __init enable_swap_cgroup(void)
-{
-}
-#endif
-
-static int mem_cgroup_soft_limit_tree_init(void)
+static void __init mem_cgroup_soft_limit_tree_init(void)
 {
        struct mem_cgroup_tree_per_node *rtpn;
        struct mem_cgroup_tree_per_zone *rtpz;
@@ -6040,8 +6063,7 @@ static int mem_cgroup_soft_limit_tree_init(void)
                if (!node_state(node, N_NORMAL_MEMORY))
                        tmp = -1;
                rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
-               if (!rtpn)
-                       goto err_cleanup;
+               BUG_ON(!rtpn);
 
                soft_limit_tree.rb_tree_per_node[node] = rtpn;
 
@@ -6051,23 +6073,12 @@ static int mem_cgroup_soft_limit_tree_init(void)
                        spin_lock_init(&rtpz->lock);
                }
        }
-       return 0;
-
-err_cleanup:
-       for_each_node(node) {
-               if (!soft_limit_tree.rb_tree_per_node[node])
-                       break;
-               kfree(soft_limit_tree.rb_tree_per_node[node]);
-               soft_limit_tree.rb_tree_per_node[node] = NULL;
-       }
-       return 1;
-
 }
 
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup *cont)
 {
-       struct mem_cgroup *memcg, *parent;
+       struct mem_cgroup *memcg;
        long error = -ENOMEM;
        int node;
 
@@ -6082,23 +6093,51 @@ mem_cgroup_css_alloc(struct cgroup *cont)
        /* root ? */
        if (cont->parent == NULL) {
                int cpu;
-               enable_swap_cgroup();
-               parent = NULL;
-               if (mem_cgroup_soft_limit_tree_init())
-                       goto free_out;
+
                root_mem_cgroup = memcg;
                for_each_possible_cpu(cpu) {
                        struct memcg_stock_pcp *stock =
                                                &per_cpu(memcg_stock, cpu);
                        INIT_WORK(&stock->work, drain_local_stock);
                }
-       } else {
-               parent = mem_cgroup_from_cont(cont->parent);
-               memcg->use_hierarchy = parent->use_hierarchy;
-               memcg->oom_kill_disable = parent->oom_kill_disable;
+
+               res_counter_init(&memcg->res, NULL);
+               res_counter_init(&memcg->memsw, NULL);
+               res_counter_init(&memcg->kmem, NULL);
        }
 
-       if (parent && parent->use_hierarchy) {
+       memcg->last_scanned_node = MAX_NUMNODES;
+       INIT_LIST_HEAD(&memcg->oom_notify);
+       atomic_set(&memcg->refcnt, 1);
+       memcg->move_charge_at_immigrate = 0;
+       mutex_init(&memcg->thresholds_lock);
+       spin_lock_init(&memcg->move_lock);
+
+       return &memcg->css;
+
+free_out:
+       __mem_cgroup_free(memcg);
+       return ERR_PTR(error);
+}
+
+static int
+mem_cgroup_css_online(struct cgroup *cont)
+{
+       struct mem_cgroup *memcg, *parent;
+       int error = 0;
+
+       if (!cont->parent)
+               return 0;
+
+       mutex_lock(&memcg_create_mutex);
+       memcg = mem_cgroup_from_cont(cont);
+       parent = mem_cgroup_from_cont(cont->parent);
+
+       memcg->use_hierarchy = parent->use_hierarchy;
+       memcg->oom_kill_disable = parent->oom_kill_disable;
+       memcg->swappiness = mem_cgroup_swappiness(parent);
+
+       if (parent->use_hierarchy) {
                res_counter_init(&memcg->res, &parent->res);
                res_counter_init(&memcg->memsw, &parent->memsw);
                res_counter_init(&memcg->kmem, &parent->kmem);
@@ -6119,20 +6158,12 @@ mem_cgroup_css_alloc(struct cgroup *cont)
                 * much sense so let cgroup subsystem know about this
                 * unfortunate state in our controller.
                 */
-               if (parent && parent != root_mem_cgroup)
+               if (parent != root_mem_cgroup)
                        mem_cgroup_subsys.broken_hierarchy = true;
        }
-       memcg->last_scanned_node = MAX_NUMNODES;
-       INIT_LIST_HEAD(&memcg->oom_notify);
-
-       if (parent)
-               memcg->swappiness = mem_cgroup_swappiness(parent);
-       atomic_set(&memcg->refcnt, 1);
-       memcg->move_charge_at_immigrate = 0;
-       mutex_init(&memcg->thresholds_lock);
-       spin_lock_init(&memcg->move_lock);
 
        error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
+       mutex_unlock(&memcg_create_mutex);
        if (error) {
                /*
                 * We call put now because our (and parent's) refcnts
@@ -6140,12 +6171,10 @@ mem_cgroup_css_alloc(struct cgroup *cont)
                 * call __mem_cgroup_free, so return directly
                 */
                mem_cgroup_put(memcg);
-               return ERR_PTR(error);
+               if (parent->use_hierarchy)
+                       mem_cgroup_put(parent);
        }
-       return &memcg->css;
-free_out:
-       __mem_cgroup_free(memcg);
-       return ERR_PTR(error);
+       return error;
 }
 
 static void mem_cgroup_css_offline(struct cgroup *cont)
@@ -6281,7 +6310,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
         * Because lookup_swap_cache() updates some statistics counter,
         * we call find_get_page() with swapper_space directly.
         */
-       page = find_get_page(&swapper_space, ent.val);
+       page = find_get_page(swap_address_space(ent), ent.val);
        if (do_swap_account)
                entry->val = ent.val;
 
@@ -6322,7 +6351,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
                swp_entry_t swap = radix_to_swp_entry(page);
                if (do_swap_account)
                        *entry = swap;
-               page = find_get_page(&swapper_space, swap.val);
+               page = find_get_page(swap_address_space(swap), swap.val);
        }
 #endif
        return page;
@@ -6532,8 +6561,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
        struct task_struct *p = cgroup_taskset_first(tset);
        int ret = 0;
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
+       unsigned long move_charge_at_immigrate;
 
-       if (memcg->move_charge_at_immigrate) {
+       /*
+        * We are now committed to this value whatever it is. Changes in this
+        * tunable will only affect upcoming migrations, not the current one.
+        * So we need to save it, and keep it going.
+        */
+       move_charge_at_immigrate = memcg->move_charge_at_immigrate;
+       if (move_charge_at_immigrate) {
                struct mm_struct *mm;
                struct mem_cgroup *from = mem_cgroup_from_task(p);
 
@@ -6553,6 +6589,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
                        spin_lock(&mc.lock);
                        mc.from = from;
                        mc.to = memcg;
+                       mc.immigrate_flags = move_charge_at_immigrate;
                        spin_unlock(&mc.lock);
                        /* We set mc.moving_task later */
 
@@ -6747,6 +6784,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
        .name = "memory",
        .subsys_id = mem_cgroup_subsys_id,
        .css_alloc = mem_cgroup_css_alloc,
+       .css_online = mem_cgroup_css_online,
        .css_offline = mem_cgroup_css_offline,
        .css_free = mem_cgroup_css_free,
        .can_attach = mem_cgroup_can_attach,
@@ -6757,19 +6795,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
        .use_id = 1,
 };
 
-/*
- * The rest of init is performed during ->css_alloc() for root css which
- * happens before initcalls.  hotcpu_notifier() can't be done together as
- * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
- * dependency.  Do it from a subsys_initcall().
- */
-static int __init mem_cgroup_init(void)
-{
-       hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
-       return 0;
-}
-subsys_initcall(mem_cgroup_init);
-
 #ifdef CONFIG_MEMCG_SWAP
 static int __init enable_swap_account(char *s)
 {
@@ -6782,4 +6807,36 @@ static int __init enable_swap_account(char *s)
 }
 __setup("swapaccount=", enable_swap_account);
 
+static void __init memsw_file_init(void)
+{
+       WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
+}
+
+static void __init enable_swap_cgroup(void)
+{
+       if (!mem_cgroup_disabled() && really_do_swap_account) {
+               do_swap_account = 1;
+               memsw_file_init();
+       }
+}
+
+#else
+static void __init enable_swap_cgroup(void)
+{
+}
 #endif
+
+/*
+ * The rest of init is performed during ->css_alloc() for root css which
+ * happens before initcalls.  hotcpu_notifier() can't be done together as
+ * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
+ * dependency.  Do it from a subsys_initcall().
+ */
+static int __init mem_cgroup_init(void)
+{
+       hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
+       enable_swap_cgroup();
+       mem_cgroup_soft_limit_tree_init();
+       return 0;
+}
+subsys_initcall(mem_cgroup_init);