X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=mm%2Fmemcontrol.c;h=b2ee6df0e9bb31eebd3b2f1528cbbcccb2b9c43e;hb=623e3db9f9b7d6e7b2a99180f9cf0825c936ab7a;hp=69af5d5801fcbfdaabf8bafabb31bd928cf376a8;hpb=0e79dedde951e981612ed4e6d74873d61d2a113b;p=platform%2Fadaptation%2Frenesas_rcar%2Frenesas_kernel.git diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 69af5d5..b2ee6df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -89,7 +89,6 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ - MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ MEM_CGROUP_STAT_NSTATS, }; @@ -298,6 +297,12 @@ struct mem_cgroup { */ unsigned long move_charge_at_immigrate; /* + * set > 0 if pages under this cgroup are moving to other cgroup. + */ + atomic_t moving_account; + /* taken only while moving_account > 0 */ + spinlock_t move_lock; + /* * percpu counter. */ struct mem_cgroup_stat_cpu *stat; @@ -690,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, - bool file, int nr_pages) + bool anon, int nr_pages) { preempt_disable(); - if (file) - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], + /* + * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is + * counted as CACHE even if it's on ANON LRU. + */ + if (anon) + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); else - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); /* pagein of a big page is an event. So, ignore page size */ @@ -1283,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) return memcg->swappiness; } -static void mem_cgroup_start_move(struct mem_cgroup *memcg) -{ - int cpu; +/* + * memcg->moving_account is used for checking possibility that some thread is + * calling move_account(). When a thread on CPU-A starts moving pages under + * a memcg, other threads should check memcg->moving_account under + * rcu_read_lock(), like this: + * + * CPU-A CPU-B + * rcu_read_lock() + * memcg->moving_account+1 if (memcg->mocing_account) + * take heavy locks. + * synchronize_rcu() update something. + * rcu_read_unlock() + * start move here. + */ - get_online_cpus(); - spin_lock(&memcg->pcp_counter_lock); - for_each_online_cpu(cpu) - per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; - memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; - spin_unlock(&memcg->pcp_counter_lock); - put_online_cpus(); +/* for quick checking without looking up memcg */ +atomic_t memcg_moving __read_mostly; +static void mem_cgroup_start_move(struct mem_cgroup *memcg) +{ + atomic_inc(&memcg_moving); + atomic_inc(&memcg->moving_account); synchronize_rcu(); } static void mem_cgroup_end_move(struct mem_cgroup *memcg) { - int cpu; - - if (!memcg) - return; - get_online_cpus(); - spin_lock(&memcg->pcp_counter_lock); - for_each_online_cpu(cpu) - per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; - memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; - spin_unlock(&memcg->pcp_counter_lock); - put_online_cpus(); + /* + * Now, mem_cgroup_clear_mc() may call this function with NULL. + * We check NULL in callee rather than caller. + */ + if (memcg) { + atomic_dec(&memcg_moving); + atomic_dec(&memcg->moving_account); + } } + /* * 2 routines for checking "mem" is under move_account() or not. * - * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used - * for avoiding race in accounting. If true, + * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This + * is used for avoiding races in accounting. If true, * pc->mem_cgroup may be overwritten. * * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or @@ -1324,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) * waiting at hith-memory prressure caused by "move". */ -static bool mem_cgroup_stealed(struct mem_cgroup *memcg) +static bool mem_cgroup_stolen(struct mem_cgroup *memcg) { VM_BUG_ON(!rcu_read_lock_held()); - return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; + return atomic_read(&memcg->moving_account) > 0; } static bool mem_cgroup_under_move(struct mem_cgroup *memcg) @@ -1368,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } +/* + * Take this lock when + * - a code tries to modify page's memcg while it's USED. + * - a code tries to modify page state accounting in a memcg. + * see mem_cgroup_stolen(), too. + */ +static void move_lock_mem_cgroup(struct mem_cgroup *memcg, + unsigned long *flags) +{ + spin_lock_irqsave(&memcg->move_lock, *flags); +} + +static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, + unsigned long *flags) +{ + spin_unlock_irqrestore(&memcg->move_lock, *flags); +} + /** * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. * @memcg: The memory cgroup that went over limit @@ -1878,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) * by flags. * * Considering "move", this is an only case we see a race. To make the race - * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are - * possibility of race condition. If there is, we take a lock. + * small, we check mm->moving_account and detect there are possibility of race + * If there is, we take a lock. */ +void __mem_cgroup_begin_update_page_stat(struct page *page, + bool *locked, unsigned long *flags) +{ + struct mem_cgroup *memcg; + struct page_cgroup *pc; + + pc = lookup_page_cgroup(page); +again: + memcg = pc->mem_cgroup; + if (unlikely(!memcg || !PageCgroupUsed(pc))) + return; + /* + * If this memory cgroup is not under account moving, we don't + * need to take move_lock_page_cgroup(). Because we already hold + * rcu_read_lock(), any calls to move_account will be delayed until + * rcu_read_unlock() if mem_cgroup_stolen() == true. + */ + if (!mem_cgroup_stolen(memcg)) + return; + + move_lock_mem_cgroup(memcg, flags); + if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { + move_unlock_mem_cgroup(memcg, flags); + goto again; + } + *locked = true; +} + +void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) +{ + struct page_cgroup *pc = lookup_page_cgroup(page); + + /* + * It's guaranteed that pc->mem_cgroup never changes while + * lock is held because a routine modifies pc->mem_cgroup + * should take move_lock_page_cgroup(). + */ + move_unlock_mem_cgroup(pc->mem_cgroup, flags); +} + void mem_cgroup_update_page_stat(struct page *page, enum mem_cgroup_page_stat_item idx, int val) { struct mem_cgroup *memcg; struct page_cgroup *pc = lookup_page_cgroup(page); - bool need_unlock = false; unsigned long uninitialized_var(flags); if (mem_cgroup_disabled()) return; - rcu_read_lock(); memcg = pc->mem_cgroup; if (unlikely(!memcg || !PageCgroupUsed(pc))) - goto out; - /* pc->mem_cgroup is unstable ? */ - if (unlikely(mem_cgroup_stealed(memcg))) { - /* take a lock against to access pc->mem_cgroup */ - move_lock_page_cgroup(pc, &flags); - need_unlock = true; - memcg = pc->mem_cgroup; - if (!memcg || !PageCgroupUsed(pc)) - goto out; - } + return; switch (idx) { case MEMCG_NR_FILE_MAPPED: - if (val > 0) - SetPageCgroupFileMapped(pc); - else if (!page_mapped(page)) - ClearPageCgroupFileMapped(pc); idx = MEM_CGROUP_STAT_FILE_MAPPED; break; default: @@ -1920,13 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page, } this_cpu_add(memcg->stat->count[idx], val); - -out: - if (unlikely(need_unlock)) - move_unlock_page_cgroup(pc, &flags); - rcu_read_unlock(); } -EXPORT_SYMBOL(mem_cgroup_update_page_stat); /* * size of first charge trial. "32" comes from vmscan.c's magic value. @@ -2097,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) per_cpu(memcg->stat->events[i], cpu) = 0; memcg->nocpu_base.events[i] += x; } - /* need to clear ON_MOVE value, works as a kind of lock. */ - per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; - spin_unlock(&memcg->pcp_counter_lock); -} - -static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu) -{ - int idx = MEM_CGROUP_ON_MOVE; - - spin_lock(&memcg->pcp_counter_lock); - per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx]; spin_unlock(&memcg->pcp_counter_lock); } @@ -2119,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, struct memcg_stock_pcp *stock; struct mem_cgroup *iter; - if ((action == CPU_ONLINE)) { - for_each_mem_cgroup(iter) - synchronize_mem_cgroup_on_move(iter, cpu); + if (action == CPU_ONLINE) return NOTIFY_OK; - } if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) return NOTIFY_OK; @@ -2442,6 +2482,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, { struct zone *uninitialized_var(zone); bool was_on_lru = false; + bool anon; lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { @@ -2477,19 +2518,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, * See mem_cgroup_add_lru_list(), etc. */ smp_wmb(); - switch (ctype) { - case MEM_CGROUP_CHARGE_TYPE_CACHE: - case MEM_CGROUP_CHARGE_TYPE_SHMEM: - SetPageCgroupCache(pc); - SetPageCgroupUsed(pc); - break; - case MEM_CGROUP_CHARGE_TYPE_MAPPED: - ClearPageCgroupCache(pc); - SetPageCgroupUsed(pc); - break; - default: - break; - } + SetPageCgroupUsed(pc); if (lrucare) { if (was_on_lru) { @@ -2500,7 +2529,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, spin_unlock_irq(&zone->lru_lock); } - mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); + if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) + anon = true; + else + anon = false; + + mem_cgroup_charge_statistics(memcg, anon, nr_pages); unlock_page_cgroup(pc); /* @@ -2513,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ - (1 << PCG_MIGRATION)) +#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) /* * Because tail pages are not marked as "used", set it. We're under * zone->lru_lock, 'splitting on pmd' and compound_lock. @@ -2565,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page, { unsigned long flags; int ret; + bool anon = PageAnon(page); VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(page)); @@ -2584,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page, if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) goto unlock; - move_lock_page_cgroup(pc, &flags); + move_lock_mem_cgroup(from, &flags); - if (PageCgroupFileMapped(pc)) { + if (!anon && page_mapped(page)) { /* Update mapped_file data for mem_cgroup */ preempt_disable(); __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } - mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); + mem_cgroup_charge_statistics(from, anon, -nr_pages); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ __mem_cgroup_cancel_charge(from, nr_pages); /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); + mem_cgroup_charge_statistics(to, anon, nr_pages); /* * We charges against "to" which may not have any tasks. Then, "to" * can be under rmdir(). But in current implementation, caller of @@ -2608,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page, * guaranteed that "to" is never removed. So, we don't check rmdir * status here. */ - move_unlock_page_cgroup(pc, &flags); + move_unlock_mem_cgroup(from, &flags); ret = 0; unlock: unlock_page_cgroup(pc); @@ -2921,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; struct page_cgroup *pc; + bool anon; if (mem_cgroup_disabled()) return NULL; @@ -2946,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (!PageCgroupUsed(pc)) goto unlock_out; + anon = PageAnon(page); + switch (ctype) { case MEM_CGROUP_CHARGE_TYPE_MAPPED: + /* + * Generally PageAnon tells if it's the anon statistics to be + * updated; but sometimes e.g. mem_cgroup_uncharge_page() is + * used before page reached the stage of being marked PageAnon. + */ + anon = true; + /* fallthrough */ case MEM_CGROUP_CHARGE_TYPE_DROP: /* See mem_cgroup_prepare_migration() */ if (page_mapped(page) || PageCgroupMigration(pc)) @@ -2964,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); + mem_cgroup_charge_statistics(memcg, anon, -nr_pages); ClearPageCgroupUsed(pc); /* @@ -3271,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, { struct page *used, *unused; struct page_cgroup *pc; + bool anon; if (!memcg) return; @@ -3292,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, lock_page_cgroup(pc); ClearPageCgroupMigration(pc); unlock_page_cgroup(pc); - - __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); + anon = PageAnon(used); + __mem_cgroup_uncharge_common(unused, + anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED + : MEM_CGROUP_CHARGE_TYPE_CACHE); /* * If a page is a file cache, radix-tree replacement is very atomic @@ -3303,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, * and USED bit check in mem_cgroup_uncharge_page() will do enough * check. (see prepare_charge() also) */ - if (PageAnon(used)) + if (anon) mem_cgroup_uncharge_page(used); /* * At migration, we may charge account against cgroup which has no @@ -3333,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, /* fix accounting on old pages */ lock_page_cgroup(pc); memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); + mem_cgroup_charge_statistics(memcg, false, -1); ClearPageCgroupUsed(pc); unlock_page_cgroup(pc); @@ -3855,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) break; default: BUG(); - break; } return val; } @@ -4420,12 +4466,6 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, else BUG(); - /* - * Something went wrong if we trying to unregister a threshold - * if we don't have thresholds - */ - BUG_ON(!thresholds); - if (!thresholds->primary) goto unlock; @@ -4975,6 +5015,7 @@ mem_cgroup_create(struct cgroup *cont) atomic_set(&memcg->refcnt, 1); memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); + spin_lock_init(&memcg->move_lock); return &memcg->css; free_out: __mem_cgroup_free(memcg); @@ -5069,7 +5110,7 @@ one_by_one: } /** - * is_target_pte_for_mc - check a pte whether it is valid for move charge + * get_mctgt_type - get target type of moving charge * @vma: the vma the pte to be checked belongs * @addr: the address corresponding to the pte to be checked * @ptent: the pte to be checked @@ -5092,7 +5133,7 @@ union mc_target { }; enum mc_target_type { - MC_TARGET_NONE, /* not used */ + MC_TARGET_NONE = 0, MC_TARGET_PAGE, MC_TARGET_SWAP, }; @@ -5173,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, return page; } -static int is_target_pte_for_mc(struct vm_area_struct *vma, +static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { struct page *page = NULL; struct page_cgroup *pc; - int ret = 0; + enum mc_target_type ret = MC_TARGET_NONE; swp_entry_t ent = { .val = 0 }; if (pte_present(ptent)) @@ -5189,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, page = mc_handle_file_pte(vma, addr, ptent, &ent); if (!page && !ent.val) - return 0; + return ret; if (page) { pc = lookup_page_cgroup(page); /* @@ -5215,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, return ret; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * We don't consider swapping or file mapped pages because THP does not + * support them for now. + * Caller should make sure that pmd_trans_huge(pmd) is true. + */ +static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, union mc_target *target) +{ + struct page *page = NULL; + struct page_cgroup *pc; + enum mc_target_type ret = MC_TARGET_NONE; + + page = pmd_page(pmd); + VM_BUG_ON(!page || !PageHead(page)); + if (!move_anon()) + return ret; + pc = lookup_page_cgroup(page); + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { + ret = MC_TARGET_PAGE; + if (target) { + get_page(page); + target->page = page; + } + } + return ret; +} +#else +static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, union mc_target *target) +{ + return MC_TARGET_NONE; +} +#endif + static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -5223,13 +5299,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - split_huge_page_pmd(walk->mm, pmd); - if (pmd_trans_unstable(pmd)) + if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) + mc.precharge += HPAGE_PMD_NR; + spin_unlock(&vma->vm_mm->page_table_lock); return 0; + } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) - if (is_target_pte_for_mc(vma, addr, *pte, NULL)) + if (get_mctgt_type(vma, addr, *pte, NULL)) mc.precharge++; /* increment precharge temporarily */ pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -5384,25 +5463,55 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, struct vm_area_struct *vma = walk->private; pte_t *pte; spinlock_t *ptl; + enum mc_target_type target_type; + union mc_target target; + struct page *page; + struct page_cgroup *pc; - split_huge_page_pmd(walk->mm, pmd); - if (pmd_trans_unstable(pmd)) + /* + * We don't take compound_lock() here but no race with splitting thp + * happens because: + * - if pmd_trans_huge_lock() returns 1, the relevant thp is not + * under splitting, which means there's no concurrent thp split, + * - if another thread runs into split_huge_page() just after we + * entered this if-block, the thread must wait for page table lock + * to be unlocked in __split_huge_page_splitting(), where the main + * part of thp split is not executed yet. + */ + if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (!mc.precharge) { + spin_unlock(&vma->vm_mm->page_table_lock); + return 0; + } + target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); + if (target_type == MC_TARGET_PAGE) { + page = target.page; + if (!isolate_lru_page(page)) { + pc = lookup_page_cgroup(page); + if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, + pc, mc.from, mc.to, + false)) { + mc.precharge -= HPAGE_PMD_NR; + mc.moved_charge += HPAGE_PMD_NR; + } + putback_lru_page(page); + } + put_page(page); + } + spin_unlock(&vma->vm_mm->page_table_lock); return 0; + } + retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); - union mc_target target; - int type; - struct page *page; - struct page_cgroup *pc; swp_entry_t ent; if (!mc.precharge) break; - type = is_target_pte_for_mc(vma, addr, ptent, &target); - switch (type) { + switch (get_mctgt_type(vma, addr, ptent, &target)) { case MC_TARGET_PAGE: page = target.page; if (isolate_lru_page(page)) @@ -5415,7 +5524,7 @@ retry: mc.moved_charge++; } putback_lru_page(page); -put: /* is_target_pte_for_mc() gets the page */ +put: /* get_mctgt_type() gets the page */ put_page(page); break; case MC_TARGET_SWAP: