X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=mm%2Fmemcontrol.c;h=93cb16d2b96ad35320d62c7c4a3540656a36b6aa;hb=0e574a932d2cab8eb3b02d21feb59f2c09154738;hp=b63f5f7dfa071225487ebf5baa365f5b3baebe3b;hpb=a07613a54d700a974f3a4a657da78ef5d097315d;p=platform%2Fadaptation%2Frenesas_rcar%2Frenesas_kernel.git diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b63f5f7..93cb16d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -50,6 +50,8 @@ #include #include #include "internal.h" +#include +#include #include @@ -121,16 +123,22 @@ struct mem_cgroup_stat_cpu { unsigned long targets[MEM_CGROUP_NTARGETS]; }; +struct mem_cgroup_reclaim_iter { + /* css_id of the last scanned hierarchy member */ + int position; + /* scan generation, increased every round-trip */ + unsigned int generation; +}; + /* * per-zone information in memory controller. */ struct mem_cgroup_per_zone { - /* - * spin_lock to protect the per cgroup LRU - */ - struct list_head lists[NR_LRU_LISTS]; + struct lruvec lruvec; unsigned long count[NR_LRU_LISTS]; + struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; + struct zone_reclaim_stat reclaim_stat; struct rb_node tree_node; /* RB tree node */ unsigned long long usage_in_excess;/* Set to the value by which */ @@ -231,11 +239,6 @@ struct mem_cgroup { * per zone LRU lists. */ struct mem_cgroup_lru_info info; - /* - * While reclaiming in a hierarchy, we cache the last child we - * reclaimed from. - */ - int last_scanned_child; int last_scanned_node; #if MAX_NUMNODES > 1 nodemask_t scan_nodes; @@ -286,6 +289,10 @@ struct mem_cgroup { */ struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; + +#ifdef CONFIG_INET + struct tcp_memcontrol tcp_mem; +#endif }; /* Stuffs for move charges at task migration. */ @@ -360,12 +367,70 @@ enum charge_type { #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) -#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 -#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); + +/* Writing them here to avoid exposing memcg's inner layout */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_INET +#include +#include + +static bool mem_cgroup_is_root(struct mem_cgroup *memcg); +void sock_update_memcg(struct sock *sk) +{ + if (static_branch(&memcg_socket_limit_enabled)) { + struct mem_cgroup *memcg; + + BUG_ON(!sk->sk_prot->proto_cgroup); + + /* Socket cloning can throw us here with sk_cgrp already + * filled. It won't however, necessarily happen from + * process context. So the test for root memcg given + * the current task's memcg won't help us in this case. + * + * Respecting the original socket's memcg is a better + * decision in this case. 
+ */ + if (sk->sk_cgrp) { + BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); + mem_cgroup_get(sk->sk_cgrp->memcg); + return; + } + + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + if (!mem_cgroup_is_root(memcg)) { + mem_cgroup_get(memcg); + sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); + } + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(sock_update_memcg); + +void sock_release_memcg(struct sock *sk) +{ + if (static_branch(&memcg_socket_limit_enabled) && sk->sk_cgrp) { + struct mem_cgroup *memcg; + WARN_ON(!sk->sk_cgrp->memcg); + memcg = sk->sk_cgrp->memcg; + mem_cgroup_put(memcg); + } +} + +struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) +{ + if (!memcg || mem_cgroup_is_root(memcg)) + return NULL; + + return &memcg->tcp_mem.cg_proto; +} +EXPORT_SYMBOL(tcp_proto_cgroup); +#endif /* CONFIG_INET */ +#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ + static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * @@ -590,16 +655,6 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } -void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val) -{ - this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); -} - -void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val) -{ - this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); -} - static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { @@ -683,37 +738,32 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, return total; } -static bool __memcg_event_check(struct mem_cgroup *memcg, int target) +static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, + enum mem_cgroup_events_target target) { unsigned long val, next; val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); next = __this_cpu_read(memcg->stat->targets[target]); /* from time_after() in jiffies.h */ - return ((long)next - (long)val < 0); -} - -static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) -{ - unsigned long val, next; - - val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); - - switch (target) { - case MEM_CGROUP_TARGET_THRESH: - next = val + THRESHOLDS_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_SOFTLIMIT: - next = val + SOFTLIMIT_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_NUMAINFO: - next = val + NUMAINFO_EVENTS_TARGET; - break; - default: - return; + if ((long)next - (long)val < 0) { + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_NUMAINFO: + next = val + NUMAINFO_EVENTS_TARGET; + break; + default: + break; + } + __this_cpu_write(memcg->stat->targets[target], next); + return true; } - - __this_cpu_write(memcg->stat->targets[target], next); + return false; } /* @@ -724,28 +774,30 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { + if (unlikely(mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_THRESH))) { + bool do_softlimit, do_numainfo; + + do_softlimit = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_SOFTLIMIT); +#if MAX_NUMNODES > 1 + do_numainfo = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_NUMAINFO); +#endif 
+ preempt_enable(); + mem_cgroup_threshold(memcg); - __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); - if (unlikely(__memcg_event_check(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT))) { + if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); - __mem_cgroup_target_update(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT); - } #if MAX_NUMNODES > 1 - if (unlikely(__memcg_event_check(memcg, - MEM_CGROUP_TARGET_NUMAINFO))) { + if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); - __mem_cgroup_target_update(memcg, - MEM_CGROUP_TARGET_NUMAINFO); - } #endif - } - preempt_enable(); + } else + preempt_enable(); } -static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) +struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { return container_of(cgroup_subsys_state(cont, mem_cgroup_subsys_id), struct mem_cgroup, @@ -787,83 +839,116 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -/* The caller has to guarantee "mem" exists before calling this */ -static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) +/** + * mem_cgroup_iter - iterate over memory cgroup hierarchy + * @root: hierarchy root + * @prev: previously returned memcg, NULL on first invocation + * @reclaim: cookie for shared reclaim walks, NULL for full walks + * + * Returns references to children of the hierarchy below @root, or + * @root itself, or %NULL after a full round-trip. + * + * Caller must pass the return value in @prev on subsequent + * invocations for reference counting, or use mem_cgroup_iter_break() + * to cancel a hierarchy walk before the round-trip is complete. + * + * Reclaimers can specify a zone and a priority level in @reclaim to + * divide up the memcgs in the hierarchy among all concurrent + * reclaimers operating on the same zone and priority. + */ +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) { - struct cgroup_subsys_state *css; - int found; + struct mem_cgroup *memcg = NULL; + int id = 0; - if (!memcg) /* ROOT cgroup has the smallest ID */ - return root_mem_cgroup; /*css_put/get against root is ignored*/ - if (!memcg->use_hierarchy) { - if (css_tryget(&memcg->css)) - return memcg; + if (mem_cgroup_disabled()) return NULL; - } - rcu_read_lock(); - /* - * searching a memory cgroup which has the smallest ID under given - * ROOT cgroup. 
(ID >= 1) - */ - css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found); - if (css && css_tryget(css)) - memcg = container_of(css, struct mem_cgroup, css); - else - memcg = NULL; - rcu_read_unlock(); - return memcg; -} -static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, - struct mem_cgroup *root, - bool cond) -{ - int nextid = css_id(&iter->css) + 1; - int found; - int hierarchy_used; - struct cgroup_subsys_state *css; + if (!root) + root = root_mem_cgroup; - hierarchy_used = iter->use_hierarchy; + if (prev && !reclaim) + id = css_id(&prev->css); - css_put(&iter->css); - /* If no ROOT, walk all, ignore hierarchy */ - if (!cond || (root && !hierarchy_used)) - return NULL; + if (prev && prev != root) + css_put(&prev->css); - if (!root) - root = root_mem_cgroup; + if (!root->use_hierarchy && root != root_mem_cgroup) { + if (prev) + return NULL; + return root; + } - do { - iter = NULL; - rcu_read_lock(); + while (!memcg) { + struct mem_cgroup_reclaim_iter *uninitialized_var(iter); + struct cgroup_subsys_state *css; + + if (reclaim) { + int nid = zone_to_nid(reclaim->zone); + int zid = zone_idx(reclaim->zone); + struct mem_cgroup_per_zone *mz; - css = css_get_next(&mem_cgroup_subsys, nextid, - &root->css, &found); - if (css && css_tryget(css)) - iter = container_of(css, struct mem_cgroup, css); + mz = mem_cgroup_zoneinfo(root, nid, zid); + iter = &mz->reclaim_iter[reclaim->priority]; + if (prev && reclaim->generation != iter->generation) + return NULL; + id = iter->position; + } + + rcu_read_lock(); + css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); + if (css) { + if (css == &root->css || css_tryget(css)) + memcg = container_of(css, + struct mem_cgroup, css); + } else + id = 0; rcu_read_unlock(); - /* If css is NULL, no more cgroups will be found */ - nextid = found + 1; - } while (css && !iter); - return iter; + if (reclaim) { + iter->position = id; + if (!css) + iter->generation++; + else if (!prev && memcg) + reclaim->generation = iter->generation; + } + + if (prev && !css) + return NULL; + } + return memcg; } -/* - * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please - * be careful that "break" loop is not allowed. We have reference count. - * Instead of that modify "cond" to be false and "continue" to exit the loop. - */ -#define for_each_mem_cgroup_tree_cond(iter, root, cond) \ - for (iter = mem_cgroup_start_loop(root);\ - iter != NULL;\ - iter = mem_cgroup_get_next(iter, root, cond)) -#define for_each_mem_cgroup_tree(iter, root) \ - for_each_mem_cgroup_tree_cond(iter, root, true) +/** + * mem_cgroup_iter_break - abort a hierarchy walk prematurely + * @root: hierarchy root + * @prev: last visited hierarchy member as returned by mem_cgroup_iter() + */ +void mem_cgroup_iter_break(struct mem_cgroup *root, + struct mem_cgroup *prev) +{ + if (!root) + root = root_mem_cgroup; + if (prev && prev != root) + css_put(&prev->css); +} -#define for_each_mem_cgroup_all(iter) \ - for_each_mem_cgroup_tree_cond(iter, NULL, true) +/* + * Iteration constructs for visiting all cgroups (under a tree). If + * loops are exited prematurely (break), mem_cgroup_iter_break() must + * be used for reference counting. 
+ */ +#define for_each_mem_cgroup_tree(iter, root) \ + for (iter = mem_cgroup_iter(root, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(root, iter, NULL)) +#define for_each_mem_cgroup(iter) \ + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(NULL, iter, NULL)) static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { @@ -883,11 +968,11 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) goto out; switch (idx) { - case PGMAJFAULT: - mem_cgroup_pgmajfault(memcg, 1); - break; case PGFAULT: - mem_cgroup_pgfault(memcg, 1); + this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); + break; + case PGMAJFAULT: + this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); break; default: BUG(); @@ -897,6 +982,27 @@ out: } EXPORT_SYMBOL(mem_cgroup_count_vm_event); +/** + * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg + * @zone: zone of the wanted lruvec + * @mem: memcg of the wanted lruvec + * + * Returns the lru list vector holding pages for the given @zone and + * @mem. This can be the global zone lruvec, if the memory controller + * is disabled. + */ +struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, + struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_zone *mz; + + if (mem_cgroup_disabled()) + return &zone->lruvec; + + mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); + return &mz->lruvec; +} + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -911,112 +1017,123 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event); * When moving account, the page is not on LRU. It's isolated. */ -void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) +/** + * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec + * @zone: zone of the page + * @page: the page + * @lru: current lru + * + * This function accounts for @page being added to @lru, and returns + * the lruvec for the given @zone and the memcg @page is charged to. + * + * The callsite is then responsible for physically linking the page to + * the returned lruvec->lists[@lru]. + */ +struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, + enum lru_list lru) { - struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; + struct page_cgroup *pc; if (mem_cgroup_disabled()) - return; + return &zone->lruvec; + pc = lookup_page_cgroup(page); - /* can happen while we handle swapcache. */ - if (!TestClearPageCgroupAcctLRU(pc)) - return; - VM_BUG_ON(!pc->mem_cgroup); + VM_BUG_ON(PageCgroupAcctLRU(pc)); /* - * We don't check PCG_USED bit. It's cleared when the "page" is finally - * removed from global LRU. + * putback: charge: + * SetPageLRU SetPageCgroupUsed + * smp_mb smp_mb + * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU + * + * Ensure that one of the two sides adds the page to the memcg + * LRU during a race. */ - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - /* huge page split is done under lru_lock. so, we have no races. 
*/ - MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - VM_BUG_ON(list_empty(&pc->lru)); - list_del_init(&pc->lru); -} - -void mem_cgroup_del_lru(struct page *page) -{ - mem_cgroup_del_lru_list(page, page_lru(page)); + smp_mb(); + /* + * If the page is uncharged, it may be freed soon, but it + * could also be swap cache (readahead, swapoff) that needs to + * be reclaimable in the future. root_mem_cgroup will babysit + * it for the time being. + */ + if (PageCgroupUsed(pc)) { + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); + memcg = pc->mem_cgroup; + SetPageCgroupAcctLRU(pc); + } else + memcg = root_mem_cgroup; + mz = page_cgroup_zoneinfo(memcg, page); + /* compound_order() is stabilized through lru_lock */ + MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); + return &mz->lruvec; } -/* - * Writeback is about to end against a page which has been marked for immediate - * reclaim. If it still appears to be reclaimable, move it to the tail of the - * inactive list. +/** + * mem_cgroup_lru_del_list - account for removing an lru page + * @page: the page + * @lru: target lru + * + * This function accounts for @page being removed from @lru. + * + * The callsite is then responsible for physically unlinking + * @page->lru. */ -void mem_cgroup_rotate_reclaimable_page(struct page *page) +void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) { struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; struct page_cgroup *pc; - enum lru_list lru = page_lru(page); if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); - /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move_tail(&pc->lru, &mz->lists[lru]); + /* + * root_mem_cgroup babysits uncharged LRU pages, but + * PageCgroupUsed is cleared when the page is about to get + * freed. PageCgroupAcctLRU remembers whether the + * LRU-accounting happened against pc->mem_cgroup or + * root_mem_cgroup. + */ + if (TestClearPageCgroupAcctLRU(pc)) { + VM_BUG_ON(!pc->mem_cgroup); + memcg = pc->mem_cgroup; + } else + memcg = root_mem_cgroup; + mz = page_cgroup_zoneinfo(memcg, page); + /* huge page split is done under lru_lock. so, we have no races. */ + MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); } -void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) +void mem_cgroup_lru_del(struct page *page) { - struct mem_cgroup_per_zone *mz; - struct page_cgroup *pc; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(page); - /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move(&pc->lru, &mz->lists[lru]); + mem_cgroup_lru_del_list(page, page_lru(page)); } -void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) +/** + * mem_cgroup_lru_move_lists - account for moving a page between lrus + * @zone: zone of the page + * @page: the page + * @from: current lru + * @to: target lru + * + * This function accounts for @page being moved between the lrus @from + * and @to, and returns the lruvec for the given @zone and the memcg + * @page is charged to. 
+ * + * The callsite is then responsible for physically relinking + * @page->lru to the returned lruvec->lists[@to]. + */ +struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, + struct page *page, + enum lru_list from, + enum lru_list to) { - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - - if (mem_cgroup_disabled()) - return; - pc = lookup_page_cgroup(page); - VM_BUG_ON(PageCgroupAcctLRU(pc)); - /* - * putback: charge: - * SetPageLRU SetPageCgroupUsed - * smp_mb smp_mb - * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU - * - * Ensure that one of the two sides adds the page to the memcg - * LRU during a race. - */ - smp_mb(); - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - /* huge page split is done under lru_lock. so, we have no races. */ - MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); - SetPageCgroupAcctLRU(pc); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - list_add(&pc->lru, &mz->lists[lru]); + /* XXX: Optimize this, especially for @from == @to */ + mem_cgroup_lru_del_list(page, from); + return mem_cgroup_lru_add_list(zone, page, to); } /* @@ -1027,6 +1144,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) */ static void mem_cgroup_lru_del_before_commit(struct page *page) { + enum lru_list lru; unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); @@ -1043,17 +1161,28 @@ static void mem_cgroup_lru_del_before_commit(struct page *page) return; spin_lock_irqsave(&zone->lru_lock, flags); + lru = page_lru(page); /* - * Forget old LRU when this page_cgroup is *not* used. This Used bit - * is guarded by lock_page() because the page is SwapCache. + * The uncharged page could still be registered to the LRU of + * the stale pc->mem_cgroup. + * + * As pc->mem_cgroup is about to get overwritten, the old LRU + * accounting needs to be taken care of. Let root_mem_cgroup + * babysit the page until the new memcg is responsible for it. + * + * The PCG_USED bit is guarded by lock_page() as the page is + * swapcache/pagecache. */ - if (!PageCgroupUsed(pc)) - mem_cgroup_del_lru_list(page, page_lru(page)); + if (PageLRU(page) && PageCgroupAcctLRU(pc) && !PageCgroupUsed(pc)) { + del_page_from_lru_list(zone, page, lru); + add_page_to_lru_list(zone, page, lru); + } spin_unlock_irqrestore(&zone->lru_lock, flags); } static void mem_cgroup_lru_add_after_commit(struct page *page) { + enum lru_list lru; unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); @@ -1071,22 +1200,22 @@ static void mem_cgroup_lru_add_after_commit(struct page *page) if (likely(!PageLRU(page))) return; spin_lock_irqsave(&zone->lru_lock, flags); - /* link when the page is linked to LRU but page_cgroup isn't */ - if (PageLRU(page) && !PageCgroupAcctLRU(pc)) - mem_cgroup_add_lru_list(page, page_lru(page)); + lru = page_lru(page); + /* + * If the page is not on the LRU, someone will soon put it + * there. If it is, and also already accounted for on the + * memcg-side, it must be on the right lruvec as setting + * pc->mem_cgroup and PageCgroupUsed is properly ordered. + * Otherwise, root_mem_cgroup has been babysitting the page + * during the charge. Move it to the new memcg now. 
+ */ + if (PageLRU(page) && !PageCgroupAcctLRU(pc)) { + del_page_from_lru_list(zone, page, lru); + add_page_to_lru_list(zone, page, lru); + } spin_unlock_irqrestore(&zone->lru_lock, flags); } - -void mem_cgroup_move_lists(struct page *page, - enum lru_list from, enum lru_list to) -{ - if (mem_cgroup_disabled()) - return; - mem_cgroup_del_lru_list(page, from); - mem_cgroup_add_lru_list(page, to); -} - /* * Checks whether given mem is same or in the root_mem_cgroup's * hierarchy subtree @@ -1192,68 +1321,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return &mz->reclaim_stat; } -unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, - struct zone *z, - struct mem_cgroup *mem_cont, - int active, int file) -{ - unsigned long nr_taken = 0; - struct page *page; - unsigned long scan; - LIST_HEAD(pc_list); - struct list_head *src; - struct page_cgroup *pc, *tmp; - int nid = zone_to_nid(z); - int zid = zone_idx(z); - struct mem_cgroup_per_zone *mz; - int lru = LRU_FILE * file + active; - int ret; - - BUG_ON(!mem_cont); - mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); - src = &mz->lists[lru]; - - scan = 0; - list_for_each_entry_safe_reverse(pc, tmp, src, lru) { - if (scan >= nr_to_scan) - break; - - if (unlikely(!PageCgroupUsed(pc))) - continue; - - page = lookup_cgroup_page(pc); - - if (unlikely(!PageLRU(page))) - continue; - - scan++; - ret = __isolate_lru_page(page, mode, file); - switch (ret) { - case 0: - list_move(&page->lru, dst); - mem_cgroup_del_lru(page); - nr_taken += hpage_nr_pages(page); - break; - case -EBUSY: - /* we don't affect global LRU but rotate in our LRU */ - mem_cgroup_rotate_lru_list(page, page_lru(page)); - break; - default: - break; - } - } - - *scanned = scan; - - trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, - 0, 0, 0, mode); - - return nr_taken; -} - #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -1470,41 +1537,40 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return min(limit, memsw); } -/* - * Visit the first child (need not be the first child as per the ordering - * of the cgroup list, since we track last_scanned_child) of @mem and use - * that to reclaim free pages from. 
- */ -static struct mem_cgroup * -mem_cgroup_select_victim(struct mem_cgroup *root_memcg) +static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, + gfp_t gfp_mask, + unsigned long flags) { - struct mem_cgroup *ret = NULL; - struct cgroup_subsys_state *css; - int nextid, found; + unsigned long total = 0; + bool noswap = false; + int loop; - if (!root_memcg->use_hierarchy) { - css_get(&root_memcg->css); - ret = root_memcg; - } - - while (!ret) { - rcu_read_lock(); - nextid = root_memcg->last_scanned_child + 1; - css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, - &found); - if (css && css_tryget(css)) - ret = container_of(css, struct mem_cgroup, css); + if (flags & MEM_CGROUP_RECLAIM_NOSWAP) + noswap = true; + if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) + noswap = true; - rcu_read_unlock(); - /* Updates scanning parameter */ - if (!css) { - /* this means start scan from ID:1 */ - root_memcg->last_scanned_child = 0; - } else - root_memcg->last_scanned_child = found; + for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { + if (loop) + drain_all_stock_async(memcg); + total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); + /* + * Allow limit shrinkers, which are triggered directly + * by userspace, to catch signals and stop reclaim + * after minimal progress, regardless of the margin. + */ + if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) + break; + if (mem_cgroup_margin(memcg)) + break; + /* + * If nothing was reclaimed after two attempts, there + * may be no reclaimable pages in this hierarchy. + */ + if (loop && !total) + break; } - - return ret; + return total; } /** @@ -1644,61 +1710,35 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) } #endif -/* - * Scan the hierarchy if needed to reclaim memory. We remember the last child - * we reclaimed from, so that we don't end up penalizing one child extensively - * based on its position in the children list. - * - * root_memcg is the original ancestor that we've been reclaim from. - * - * We give up and return to the caller when we visit root_memcg twice. - * (other groups can be removed while we're walking....) - * - * If shrink==true, for avoiding to free too much, this returns immedieately. - */ -static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, - struct zone *zone, - gfp_t gfp_mask, - unsigned long reclaim_options, - unsigned long *total_scanned) -{ - struct mem_cgroup *victim; - int ret, total = 0; +static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, + struct zone *zone, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + struct mem_cgroup *victim = NULL; + int total = 0; int loop = 0; - bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; - bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; - bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; unsigned long excess; unsigned long nr_scanned; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = 0, + }; excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; - /* If memsw_is_minimum==1, swap-out is of-no-use. */ - if (!check_soft && !shrink && root_memcg->memsw_is_minimum) - noswap = true; - while (1) { - victim = mem_cgroup_select_victim(root_memcg); - if (victim == root_memcg) { + victim = mem_cgroup_iter(root_memcg, victim, &reclaim); + if (!victim) { loop++; - /* - * We are not draining per cpu cached charges during - * soft limit reclaim because global reclaim doesn't - * care about charges. 
It tries to free some memory and - * charges will not give any. - */ - if (!check_soft && loop >= 1) - drain_all_stock_async(root_memcg); if (loop >= 2) { /* * If we have not been able to reclaim * anything, it might because there are * no reclaimable pages under this hierarchy */ - if (!check_soft || !total) { - css_put(&victim->css); + if (!total) break; - } /* * We want to do more targeted reclaim. * excess >> 2 is not to excessive so as to @@ -1706,40 +1746,20 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, * coming back to reclaim from this cgroup */ if (total >= (excess >> 2) || - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { - css_put(&victim->css); + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) break; - } } - } - if (!mem_cgroup_reclaimable(victim, noswap)) { - /* this cgroup's local usage == 0 */ - css_put(&victim->css); continue; } - /* we use swappiness of local cgroup */ - if (check_soft) { - ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, zone, &nr_scanned); - *total_scanned += nr_scanned; - } else - ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap); - css_put(&victim->css); - /* - * At shrinking usage, we can't check we should stop here or - * reclaim more. It's depends on callers. last_scanned_child - * will work enough for keeping fairness under tree. - */ - if (shrink) - return ret; - total += ret; - if (check_soft) { - if (!res_counter_soft_limit_excess(&root_memcg->res)) - return total; - } else if (mem_cgroup_margin(root_memcg)) - return total; + if (!mem_cgroup_reclaimable(victim, false)) + continue; + total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, + zone, &nr_scanned); + *total_scanned += nr_scanned; + if (!res_counter_soft_limit_excess(&root_memcg->res)) + break; } + mem_cgroup_iter_break(root_memcg, victim); return total; } @@ -1751,16 +1771,16 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) { struct mem_cgroup *iter, *failed = NULL; - bool cond = true; - for_each_mem_cgroup_tree_cond(iter, memcg, cond) { + for_each_mem_cgroup_tree(iter, memcg) { if (iter->oom_lock) { /* * this subtree of our hierarchy is already locked * so we cannot give a lock. 
*/ failed = iter; - cond = false; + mem_cgroup_iter_break(memcg, iter); + break; } else iter->oom_lock = true; } @@ -1772,11 +1792,10 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) * OK, we failed to lock the whole subtree so we have to clean up * what we set up to the failing subtree */ - cond = true; - for_each_mem_cgroup_tree_cond(iter, memcg, cond) { + for_each_mem_cgroup_tree(iter, memcg) { if (iter == failed) { - cond = false; - continue; + mem_cgroup_iter_break(memcg, iter); + break; } iter->oom_lock = false; } @@ -2172,7 +2191,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, struct mem_cgroup *iter; if ((action == CPU_ONLINE)) { - for_each_mem_cgroup_all(iter) + for_each_mem_cgroup(iter) synchronize_mem_cgroup_on_move(iter, cpu); return NOTIFY_OK; } @@ -2180,7 +2199,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) return NOTIFY_OK; - for_each_mem_cgroup_all(iter) + for_each_mem_cgroup(iter) mem_cgroup_drain_pcp_counter(iter, cpu); stock = &per_cpu(memcg_stock, cpu); @@ -2234,8 +2253,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags, NULL); + ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; /* @@ -2522,39 +2540,39 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) /* * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock, 'splitting on pmd' and compund_lock. + * zone->lru_lock, 'splitting on pmd' and compound_lock. + * charge/uncharge will be never happen and move_account() is done under + * compound_lock(), so we don't have to take care of races. */ -void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) +void mem_cgroup_split_huge_fixup(struct page *head) { struct page_cgroup *head_pc = lookup_page_cgroup(head); - struct page_cgroup *tail_pc = lookup_page_cgroup(tail); - unsigned long flags; + struct page_cgroup *pc; + int i; if (mem_cgroup_disabled()) return; - /* - * We have no races with charge/uncharge but will have races with - * page state accounting. - */ - move_lock_page_cgroup(head_pc, &flags); + for (i = 1; i < HPAGE_PMD_NR; i++) { + pc = head_pc + i; + pc->mem_cgroup = head_pc->mem_cgroup; + smp_wmb();/* see __commit_charge() */ + /* + * LRU flags cannot be copied because we need to add tail + * page to LRU by generic call and our hooks will be called. + */ + pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + } - tail_pc->mem_cgroup = head_pc->mem_cgroup; - smp_wmb(); /* see __commit_charge() */ if (PageCgroupAcctLRU(head_pc)) { enum lru_list lru; struct mem_cgroup_per_zone *mz; - /* - * LRU flags cannot be copied because we need to add tail - *.page to LRU by generic call and our hook will be called. * We hold lru_lock, then, reduce counter directly. 
*/ lru = page_lru(head); mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); - MEM_CGROUP_ZSTAT(mz, lru) -= 1; + MEM_CGROUP_ZSTAT(mz, lru) -= HPAGE_PMD_NR - 1; } - tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; - move_unlock_page_cgroup(head_pc, &flags); } #endif @@ -2816,12 +2834,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, */ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, - gfp_t mask, struct mem_cgroup **ptr) + gfp_t mask, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg; int ret; - *ptr = NULL; + *memcgp = NULL; if (mem_cgroup_disabled()) return 0; @@ -2839,27 +2857,27 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, memcg = try_get_mem_cgroup_from_page(page); if (!memcg) goto charge_cur_mm; - *ptr = memcg; - ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); + *memcgp = memcg; + ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); css_put(&memcg->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); + return __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); } static void -__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, +__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { if (mem_cgroup_disabled()) return; - if (!ptr) + if (!memcg) return; - cgroup_exclude_rmdir(&ptr->css); + cgroup_exclude_rmdir(&memcg->css); - __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); + __mem_cgroup_commit_charge_lrucare(page, memcg, ctype); /* * Now swap is on-memory. This means this page may be * counted both as mem and swap....double count. @@ -2869,21 +2887,22 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, */ if (do_swap_account && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + struct mem_cgroup *swap_memcg; unsigned short id; - struct mem_cgroup *memcg; id = swap_cgroup_record(ent, 0); rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { + swap_memcg = mem_cgroup_lookup(id); + if (swap_memcg) { /* * This recorded memcg can be obsolete one. So, avoid * calling css_tryget */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); - mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_put(memcg); + if (!mem_cgroup_is_root(swap_memcg)) + res_counter_uncharge(&swap_memcg->memsw, + PAGE_SIZE); + mem_cgroup_swap_statistics(swap_memcg, false); + mem_cgroup_put(swap_memcg); } rcu_read_unlock(); } @@ -2892,13 +2911,14 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, * So, rmdir()->pre_destroy() can be called while we do this charge. * In that case, we need to call pre_destroy() again. check it here. */ - cgroup_release_and_wakeup_rmdir(&ptr->css); + cgroup_release_and_wakeup_rmdir(&memcg->css); } -void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) +void mem_cgroup_commit_charge_swapin(struct page *page, + struct mem_cgroup *memcg) { - __mem_cgroup_commit_charge_swapin(page, ptr, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + __mem_cgroup_commit_charge_swapin(page, memcg, + MEM_CGROUP_CHARGE_TYPE_MAPPED); } void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) @@ -3227,14 +3247,14 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, * page belongs to. 
*/ int mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) + struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) { struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; enum charge_type ctype; int ret = 0; - *ptr = NULL; + *memcgp = NULL; VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) @@ -3285,10 +3305,10 @@ int mem_cgroup_prepare_migration(struct page *page, if (!memcg) return 0; - *ptr = memcg; - ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); + *memcgp = memcg; + ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); css_put(&memcg->css);/* drop extra refcnt */ - if (ret || *ptr == NULL) { + if (ret || *memcgp == NULL) { if (PageAnon(page)) { lock_page_cgroup(pc); ClearPageCgroupMigration(pc); @@ -3366,6 +3386,50 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, cgroup_release_and_wakeup_rmdir(&memcg->css); } +/* + * At replace page cache, newpage is not under any memcg but it's on + * LRU. So, this function doesn't touch res_counter but handles LRU + * in correct way. Both pages are locked so we cannot race with uncharge. + */ +void mem_cgroup_replace_page_cache(struct page *oldpage, + struct page *newpage) +{ + struct mem_cgroup *memcg; + struct page_cgroup *pc; + struct zone *zone; + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; + unsigned long flags; + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(oldpage); + /* fix accounting on old pages */ + lock_page_cgroup(pc); + memcg = pc->mem_cgroup; + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); + ClearPageCgroupUsed(pc); + unlock_page_cgroup(pc); + + if (PageSwapBacked(oldpage)) + type = MEM_CGROUP_CHARGE_TYPE_SHMEM; + + zone = page_zone(newpage); + pc = lookup_page_cgroup(newpage); + /* + * Even if newpage->mapping was NULL before starting replacement, + * the newpage may be on LRU(or pagevec for LRU) already. We lock + * LRU while we overwrite pc->mem_cgroup. + */ + spin_lock_irqsave(&zone->lru_lock, flags); + if (PageLRU(newpage)) + del_page_from_lru_list(zone, newpage, page_lru(newpage)); + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type); + if (PageLRU(newpage)) + add_page_to_lru_list(zone, newpage, page_lru(newpage)); + spin_unlock_irqrestore(&zone->lru_lock, flags); +} + #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { @@ -3468,9 +3532,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_SHRINK, - NULL); + mem_cgroup_reclaim(memcg, GFP_KERNEL, + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3528,10 +3591,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_NOSWAP | - MEM_CGROUP_RECLAIM_SHRINK, - NULL); + mem_cgroup_reclaim(memcg, GFP_KERNEL, + MEM_CGROUP_RECLAIM_NOSWAP | + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? 
*/ if (curusage >= oldusage) @@ -3574,10 +3636,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; nr_scanned = 0; - reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, - gfp_mask, - MEM_CGROUP_RECLAIM_SOFT, - &nr_scanned); + reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, + gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; spin_lock(&mctz->lock); @@ -3645,22 +3705,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { - struct zone *zone; struct mem_cgroup_per_zone *mz; - struct page_cgroup *pc, *busy; unsigned long flags, loop; struct list_head *list; + struct page *busy; + struct zone *zone; int ret = 0; zone = &NODE_DATA(node)->node_zones[zid]; mz = mem_cgroup_zoneinfo(memcg, node, zid); - list = &mz->lists[lru]; + list = &mz->lruvec.lists[lru]; loop = MEM_CGROUP_ZSTAT(mz, lru); /* give some margin against EBUSY etc...*/ loop += 256; busy = NULL; while (loop--) { + struct page_cgroup *pc; struct page *page; ret = 0; @@ -3669,16 +3730,16 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, spin_unlock_irqrestore(&zone->lru_lock, flags); break; } - pc = list_entry(list->prev, struct page_cgroup, lru); - if (busy == pc) { - list_move(&pc->lru, list); + page = list_entry(list->prev, struct page, lru); + if (busy == page) { + list_move(&page->lru, list); busy = NULL; spin_unlock_irqrestore(&zone->lru_lock, flags); continue; } spin_unlock_irqrestore(&zone->lru_lock, flags); - page = lookup_cgroup_page(pc); + pc = lookup_page_cgroup(page); ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); if (ret == -ENOMEM) @@ -3686,7 +3747,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, if (ret == -EBUSY || ret == -EINVAL) { /* found lock contention or "pc" is obsolete. */ - busy = pc; + busy = page; cond_resched(); } else busy = NULL; @@ -4612,6 +4673,36 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) } #endif /* CONFIG_NUMA */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) +{ + /* + * Part of this would be better living in a separate allocation + * function, leaving us with just the cgroup tree population work. + * We, however, depend on state such as network's proto_list that + * is only initialized after cgroup creation. I found the less + * cumbersome way to deal with it to defer it all to populate time + */ + return mem_cgroup_sockets_init(cont, ss); +}; + +static void kmem_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + mem_cgroup_sockets_destroy(cont, ss); +} +#else +static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) +{ + return 0; +} + +static void kmem_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ +} +#endif + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4750,7 +4841,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) - INIT_LIST_HEAD(&mz->lists[l]); + INIT_LIST_HEAD(&mz->lruvec.lists[l]); mz->usage_in_excess = 0; mz->on_tree = false; mz->mem = memcg; @@ -4843,12 +4934,13 @@ static void mem_cgroup_put(struct mem_cgroup *memcg) /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
*/ -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) +struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { if (!memcg->res.parent) return NULL; return mem_cgroup_from_res_counter(memcg->res.parent, res); } +EXPORT_SYMBOL(parent_mem_cgroup); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP static void __init enable_swap_cgroup(void) @@ -4936,7 +5028,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); } - memcg->last_scanned_child = 0; memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); @@ -4964,6 +5055,8 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + kmem_cgroup_destroy(ss, cont); + mem_cgroup_put(memcg); } @@ -4977,6 +5070,10 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, if (!ret) ret = register_memsw_files(cont, ss); + + if (!ret) + ret = register_kmem_files(cont, ss); + return ret; } @@ -5297,8 +5394,9 @@ static void mem_cgroup_clear_mc(void) static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p) + struct cgroup_taskset *tset) { + struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); @@ -5336,7 +5434,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p) + struct cgroup_taskset *tset) { mem_cgroup_clear_mc(); } @@ -5453,9 +5551,9 @@ retry: static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, - struct cgroup *old_cont, - struct task_struct *p) + struct cgroup_taskset *tset) { + struct task_struct *p = cgroup_taskset_first(tset); struct mm_struct *mm = get_task_mm(p); if (mm) { @@ -5470,19 +5568,18 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p) + struct cgroup_taskset *tset) { return 0; } static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p) + struct cgroup_taskset *tset) { } static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, - struct cgroup *old_cont, - struct task_struct *p) + struct cgroup_taskset *tset) { } #endif
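
The bulk of this patch replaces the old last_scanned_child bookkeeping (mem_cgroup_select_victim() / mem_cgroup_hierarchical_reclaim()) with the mem_cgroup_iter() / mem_cgroup_iter_break() walker and a per-zone, per-priority struct mem_cgroup_reclaim_iter, so that concurrent reclaimers share their position in the hierarchy instead of each starting over at the first child. What follows is a minimal, hypothetical caller-side sketch of that pattern, modelled on mem_cgroup_soft_reclaim() in the hunk above; reclaim_hierarchy_once(), shrink_one_memcg_zone() and nr_to_reclaim are illustrative stand-ins and are not part of the patch.

#include <linux/memcontrol.h>
#include <linux/mmzone.h>
#include <linux/gfp.h>

/* Placeholder per-memcg shrinker supplied by the caller (not in this patch). */
static unsigned long shrink_one_memcg_zone(struct mem_cgroup *memcg,
					   struct zone *zone, gfp_t gfp_mask);

/*
 * Hypothetical example: walk every memcg in @root's hierarchy at most
 * once, shrinking each member until @nr_to_reclaim pages were freed.
 */
static unsigned long reclaim_hierarchy_once(struct mem_cgroup *root,
					    struct zone *zone,
					    gfp_t gfp_mask,
					    int priority,
					    unsigned long nr_to_reclaim)
{
	struct mem_cgroup_reclaim_cookie reclaim = {
		.zone = zone,
		.priority = priority,
	};
	struct mem_cgroup *memcg = NULL;
	unsigned long reclaimed = 0;

	/*
	 * mem_cgroup_iter() returns each hierarchy member (including
	 * @root) with a css reference held, and takes the previous
	 * return value back in @prev to drop that reference.  With a
	 * non-NULL cookie, walkers at the same zone/priority share
	 * iter->position, so concurrent reclaimers fan out over
	 * different memcgs, and iter->generation keeps a walker that
	 * joined late from doing a second round-trip.
	 */
	while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))) {
		reclaimed += shrink_one_memcg_zone(memcg, zone, gfp_mask);
		if (reclaimed >= nr_to_reclaim) {
			/* Bailing out early: drop the held reference. */
			mem_cgroup_iter_break(root, memcg);
			break;
		}
	}
	return reclaimed;
}

For a full, unshared walk, the patch's for_each_mem_cgroup_tree() / for_each_mem_cgroup() macros wrap the same iterator with a NULL cookie; the one rule carried over from the old for_each_mem_cgroup_tree_cond() scheme is that leaving such a loop early must go through mem_cgroup_iter_break() so the pinned css reference is released, as the reworked mem_cgroup_oom_lock() hunk above demonstrates.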