memcg: clean up try_charge main loop

[profile/ivi/kernel-x86-ivi.git] / mm / memcontrol.c
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 20a8193..991860e 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,6 +51,8 @@
  
  #include <asm/uaccess.h>
  
+#include <trace/events/vmscan.h>
+
  struct cgroup_subsys mem_cgroup_subsys __read_mostly;
  #define MEM_CGROUP_RECLAIM_RETRIES     5
  struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -211,8 +213,6 @@ struct mem_cgroup {
         */
         spinlock_t reclaim_param_lock;
  
-       int     prev_priority;  /* for recording reclaim priority */
-
         /*
          * While reclaiming in a hierarchy, we cache the last child we
          * reclaimed from.
@@ -858,35 +858,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
         return ret;
  }
  
-/*
- * prev_priority control...this will be used in memory reclaim path.
- */
-int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
-{
-       int prev_priority;
-
-       spin_lock(&mem->reclaim_param_lock);
-       prev_priority = mem->prev_priority;
-       spin_unlock(&mem->reclaim_param_lock);
-
-       return prev_priority;
-}
-
-void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
-       spin_lock(&mem->reclaim_param_lock);
-       if (priority < mem->prev_priority)
-               mem->prev_priority = priority;
-       spin_unlock(&mem->reclaim_param_lock);
-}
-
-void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
-       spin_lock(&mem->reclaim_param_lock);
-       mem->prev_priority = priority;
-       spin_unlock(&mem->reclaim_param_lock);
-}
-
  static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
  {
         unsigned long active;
@@ -1038,6 +1009,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
         }
  
         *scanned = scan;
+
+       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
+                                     0, 0, 0, mode);
+
         return nr_taken;
  }
  
@@ -1072,6 +1047,49 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
         return swappiness;
  }
  
+/* A routine for testing whether mem is under move_account (true if it is) */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+       struct mem_cgroup *from = mc.from;
+       struct mem_cgroup *to = mc.to;
+       bool ret = false;
+
+       if (from == mem || to == mem)
+               return true;
+
+       if (!from || !to || !mem->use_hierarchy)
+               return false;
+       /* "from" or "to" can be freed by rmdir, so we use css_tryget(). */
+       rcu_read_lock();
+       if (css_tryget(&from->css)) {
+               ret = css_is_ancestor(&from->css, &mem->css);
+               css_put(&from->css);
+       }
+       if (!ret && css_tryget(&to->css)) {
+               ret = css_is_ancestor(&to->css, &mem->css);
+               css_put(&to->css);
+       }
+       rcu_read_unlock();
+       return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+       DEFINE_WAIT(wait);
+
+       /* Nothing to wait for unless another task is moving charges on mem */
+       if (!mc.moving_task || current == mc.moving_task ||
+           !mem_cgroup_under_move(mem))
+               return false;
+       prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+       /* moving charge context might have finished. */
+       if (mc.moving_task)
+               schedule();
+       finish_wait(&mc.waitq, &wait);
+       return true;
+}
+
  static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
  {
         int *val = data;
@@ -1158,6 +1176,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem)
  }
  
  /*
+ * Return the memory (and swap, if configured) limit for a memcg.
+ */
+u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+       u64 limit;
+       u64 memsw;
+
+       limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
+                       ((u64)total_swap_pages << PAGE_SHIFT); /* in bytes */
+       memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+       /*
+        * If memsw is finite and limits the amount of swap space available
+        * to this memcg, return that limit.
+        */
+       return min(limit, memsw);
+}
+
+/*
   * Visit the first child (need not be the first child as per the ordering
   * of the cgroup list, since we track last_scanned_child) of @mem and use
   * that to reclaim free pages from.
@@ -1582,16 +1618,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
         return NOTIFY_OK;
  }
  
+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+       CHARGE_OK,              /* success */
+       CHARGE_RETRY,           /* need to retry but retry is not bad */
+       CHARGE_NOMEM,           /* we can't do more. return -ENOMEM */
+       CHARGE_WOULDBLOCK,      /* __GFP_WAIT wasn't set and not enough res. */
+       CHARGE_OOM_DIE,         /* the current is killed because of OOM */
+};
+
+/* Try to charge csize to mem once; returns one of the CHARGE_* codes. */
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+                               int csize, bool oom_check)
+{
+       struct mem_cgroup *mem_over_limit;
+       struct res_counter *fail_res;
+       unsigned long flags = 0;
+       int ret;
+
+       ret = res_counter_charge(&mem->res, csize, &fail_res);
+       if (likely(!ret)) {
+               if (!do_swap_account)
+                       return CHARGE_OK;
+               ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+               if (likely(!ret))
+                       return CHARGE_OK;
+               res_counter_uncharge(&mem->res, csize); /* undo mem charge */
+               mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+               flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+       } else
+               mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+       if (csize > PAGE_SIZE) /* change csize and retry */
+               return CHARGE_RETRY;
+
+       if (!(gfp_mask & __GFP_WAIT))
+               return CHARGE_WOULDBLOCK;
+
+       ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+                                       gfp_mask, flags);
+       /*
+        * try_to_free_mem_cgroup_pages() might not give us a full
+        * picture of reclaim. Some pages are reclaimed and might be
+        * moved to swap cache or just unmapped from the cgroup.
+        * Check the limit again to see if the reclaim reduced the
+        * current usage of the cgroup before giving up
+        */
+       if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+               return CHARGE_RETRY;
+
+       /*
+        * At task move, charge accounts can be doubly counted. So, it's
+        * better to wait until the end of task_move if something is going on.
+        */
+       if (mem_cgroup_wait_acct_move(mem_over_limit))
+               return CHARGE_RETRY;
+
+       /* If we don't need to call oom-killer at all, return immediately */
+       if (!oom_check)
+               return CHARGE_NOMEM;
+       /* check OOM */
+       if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+               return CHARGE_OOM_DIE;
+
+       return CHARGE_RETRY;
+}
+
  /*
   * Unlike exported interface, "oom" parameter is added. if oom==true,
   * oom-killer can be invoked.
   */
  static int __mem_cgroup_try_charge(struct mm_struct *mm,
-                       gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+               gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
  {
-       struct mem_cgroup *mem, *mem_over_limit;
-       int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-       struct res_counter *fail_res;
+       int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       struct mem_cgroup *mem = NULL;
+       int ret;
         int csize = CHARGE_SIZE;
  
         /*
@@ -1609,120 +1712,56 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
          * thread group leader migrates. It's possible that mm is not
          * set, if so charge the init_mm (happens for pagecache usage).
          */
-       mem = *memcg;
-       if (likely(!mem)) {
+       if (*memcg) {
+               mem = *memcg;
+               css_get(&mem->css);
+       } else {
                 mem = try_get_mem_cgroup_from_mm(mm);
+               if (unlikely(!mem))
+                       return 0;
                 *memcg = mem;
-       } else {
-               css_get(&mem->css);
         }
-       if (unlikely(!mem))
-               return 0;
  
         VM_BUG_ON(css_is_removed(&mem->css));
         if (mem_cgroup_is_root(mem))
                 goto done;
  
-       while (1) {
-               int ret = 0;
-               unsigned long flags = 0;
+       do {
+               bool oom_check;
  
                 if (consume_stock(mem))
-                       goto done;
-
-               ret = res_counter_charge(&mem->res, csize, &fail_res);
-               if (likely(!ret)) {
-                       if (!do_swap_account)
-                               break;
-                       ret = res_counter_charge(&mem->memsw, csize, &fail_res);
-                       if (likely(!ret))
-                               break;
-                       /* mem+swap counter fails */
-                       res_counter_uncharge(&mem->res, csize);
-                       flags |= MEM_CGROUP_RECLAIM_NOSWAP;
-                       mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-                                                                       memsw);
-               } else
-                       /* mem counter fails */
-                       mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-                                                                       res);
+                       goto done; /* don't need to fill stock */
+               /* If killed, bypass charge */
+               if (fatal_signal_pending(current))
+                       goto bypass;
  
-               /* reduce request size and retry */
-               if (csize > PAGE_SIZE) {
-                       csize = PAGE_SIZE;
-                       continue;
+               oom_check = false;
+               if (oom && !nr_oom_retries) {
+                       oom_check = true;
+                       nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
                 }
-               if (!(gfp_mask & __GFP_WAIT))
-                       goto nomem;
-
-               ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
-                                               gfp_mask, flags);
-               if (ret)
-                       continue;
-
-               /*
-                * try_to_free_mem_cgroup_pages() might not give us a full
-                * picture of reclaim. Some pages are reclaimed and might be
-                * moved to swap cache or just unmapped from the cgroup.
-                * Check the limit again to see if the reclaim reduced the
-                * current usage of the cgroup before giving up
-                *
-                */
-               if (mem_cgroup_check_under_limit(mem_over_limit))
-                       continue;
  
-               /* try to avoid oom while someone is moving charge */
-               if (mc.moving_task && current != mc.moving_task) {
-                       struct mem_cgroup *from, *to;
-                       bool do_continue = false;
-                       /*
-                        * There is a small race that "from" or "to" can be
-                        * freed by rmdir, so we use css_tryget().
-                        */
-                       from = mc.from;
-                       to = mc.to;
-                       if (from && css_tryget(&from->css)) {
-                               if (mem_over_limit->use_hierarchy)
-                                       do_continue = css_is_ancestor(
-                                                       &from->css,
-                                                       &mem_over_limit->css);
-                               else
-                                       do_continue = (from == mem_over_limit);
-                               css_put(&from->css);
-                       }
-                       if (!do_continue && to && css_tryget(&to->css)) {
-                               if (mem_over_limit->use_hierarchy)
-                                       do_continue = css_is_ancestor(
-                                                       &to->css,
-                                                       &mem_over_limit->css);
-                               else
-                                       do_continue = (to == mem_over_limit);
-                               css_put(&to->css);
-                       }
-                       if (do_continue) {
-                               DEFINE_WAIT(wait);
-                               prepare_to_wait(&mc.waitq, &wait,
-                                                       TASK_INTERRUPTIBLE);
-                               /* moving charge context might have finished. */
-                               if (mc.moving_task)
-                                       schedule();
-                               finish_wait(&mc.waitq, &wait);
-                               continue;
-                       }
-               }
+               ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
  
-               if (!nr_retries--) {
+               switch (ret) {
+               case CHARGE_OK:
+                       break;
+               case CHARGE_RETRY: /* not in OOM situation but retry */
+                       csize = PAGE_SIZE;
+                       break;
+               case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+                       goto nomem;
+               case CHARGE_NOMEM: /* OOM routine works */
                         if (!oom)
                                 goto nomem;
-                       if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
-                               nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-                               continue;
-                       }
-                       /* When we reach here, current task is dying .*/
-                       css_put(&mem->css);
+                       /* If oom, we never return -ENOMEM */
+                       nr_oom_retries--;
+                       break;
+               case CHARGE_OOM_DIE: /* Killed by OOM Killer */
                         goto bypass;
                 }
-       }
+       } while (ret != CHARGE_OK);
+
         if (csize > PAGE_SIZE)
                 refill_stock(mem, csize - PAGE_SIZE);
  done:
@@ -1731,6 +1770,8 @@ nomem:
         css_put(&mem->css);
         return -ENOMEM;
  bypass:
+       if (mem)
+               css_put(&mem->css);
         *memcg = NULL;
         return 0;
  }