unsigned long moved_charge;
unsigned long moved_swap;
struct task_struct *moving_task; /* a task moving charges */
- struct mm_struct *mm;
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
- struct page_cgroup *pc,
- bool charge)
+ bool file, int nr_pages)
{
- int val = (charge) ? 1 : -1;
-
preempt_disable();
- if (PageCgroupCache(pc))
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
+ if (file)
+ __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
else
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
+ __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
- if (charge)
+ /* pagein of a big page is an event. So, ignore page size */
+ if (nr_pages > 0)
__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
else
__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
+
+ __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
preempt_enable();
}
* removed from global LRU.
*/
mz = page_cgroup_zoneinfo(pc);
- MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+ /* huge page split is done under lru_lock. so, we have no races. */
+ MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
VM_BUG_ON(list_empty(&pc->lru));
list_del_init(&pc->lru);
- return;
}
void mem_cgroup_del_lru(struct page *page)
return;
pc = lookup_page_cgroup(page);
- /*
- * Used bit is set without atomic ops but after smp_wmb().
- * For making pc->mem_cgroup visible, insert smp_rmb() here.
- */
- smp_rmb();
/* unused or root page is not rotated. */
- if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+ if (!PageCgroupUsed(pc))
+ return;
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+ smp_rmb();
+ if (mem_cgroup_is_root(pc->mem_cgroup))
return;
mz = page_cgroup_zoneinfo(pc);
list_move(&pc->lru, &mz->lists[lru]);
return;
pc = lookup_page_cgroup(page);
VM_BUG_ON(PageCgroupAcctLRU(pc));
- /*
- * Used bit is set without atomic ops but after smp_wmb().
- * For making pc->mem_cgroup visible, insert smp_rmb() here.
- */
- smp_rmb();
if (!PageCgroupUsed(pc))
return;
-
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+ smp_rmb();
mz = page_cgroup_zoneinfo(pc);
- MEM_CGROUP_ZSTAT(mz, lru) += 1;
+ /* huge page split is done under lru_lock. so, we have no races. */
+ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
SetPageCgroupAcctLRU(pc);
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
return NULL;
pc = lookup_page_cgroup(page);
- /*
- * Used bit is set without atomic ops but after smp_wmb().
- * For making pc->mem_cgroup visible, insert smp_rmb() here.
- */
- smp_rmb();
if (!PageCgroupUsed(pc))
return NULL;
-
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+ smp_rmb();
mz = page_cgroup_zoneinfo(pc);
if (!mz)
return NULL;
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
/* we don't affect global LRU but rotate in our LRU */
u64 limit;
u64 memsw;
- limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
- total_swap_pages;
+ limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ limit += total_swap_pages << PAGE_SHIFT;
+
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space available
* possibility of race condition. If there is, we take a lock.
*/
-static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
+void mem_cgroup_update_page_stat(struct page *page,
+ enum mem_cgroup_page_stat_item idx, int val)
{
struct mem_cgroup *mem;
struct page_cgroup *pc = lookup_page_cgroup(page);
bool need_unlock = false;
+ unsigned long uninitialized_var(flags);
if (unlikely(!pc))
return;
if (unlikely(!mem || !PageCgroupUsed(pc)))
goto out;
/* pc->mem_cgroup is unstable ? */
- if (unlikely(mem_cgroup_stealed(mem))) {
+ if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
/* take a lock against to access pc->mem_cgroup */
- lock_page_cgroup(pc);
+ move_lock_page_cgroup(pc, &flags);
need_unlock = true;
mem = pc->mem_cgroup;
if (!mem || !PageCgroupUsed(pc))
goto out;
}
- this_cpu_add(mem->stat->count[idx], val);
-
switch (idx) {
- case MEM_CGROUP_STAT_FILE_MAPPED:
+ case MEMCG_NR_FILE_MAPPED:
if (val > 0)
SetPageCgroupFileMapped(pc);
else if (!page_mapped(page))
ClearPageCgroupFileMapped(pc);
+ idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
default:
BUG();
}
+ this_cpu_add(mem->stat->count[idx], val);
+
out:
if (unlikely(need_unlock))
- unlock_page_cgroup(pc);
+ move_unlock_page_cgroup(pc, &flags);
rcu_read_unlock();
return;
}
-
-void mem_cgroup_update_file_mapped(struct page *page, int val)
-{
- mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
-}
+EXPORT_SYMBOL(mem_cgroup_update_page_stat);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* oom-killer can be invoked.
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+ gfp_t gfp_mask,
+ struct mem_cgroup **memcg, bool oom,
+ int page_size)
{
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem = NULL;
int ret;
- int csize = CHARGE_SIZE;
+ int csize = max(CHARGE_SIZE, (unsigned long) page_size);
/*
* Unlike gloval-vm's OOM-kill, we're not in memory shortage
VM_BUG_ON(css_is_removed(&mem->css));
if (mem_cgroup_is_root(mem))
goto done;
- if (consume_stock(mem))
+ if (page_size == PAGE_SIZE && consume_stock(mem))
goto done;
css_get(&mem->css);
} else {
rcu_read_unlock();
goto done;
}
- if (consume_stock(mem)) {
+ if (page_size == PAGE_SIZE && consume_stock(mem)) {
/*
* It seems dagerous to access memcg without css_get().
* But considering how consume_stok works, it's not
case CHARGE_OK:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
- csize = PAGE_SIZE;
+ csize = page_size;
css_put(&mem->css);
mem = NULL;
goto again;
}
} while (ret != CHARGE_OK);
- if (csize > PAGE_SIZE)
- refill_stock(mem, csize - PAGE_SIZE);
+ if (csize > page_size)
+ refill_stock(mem, csize - page_size);
css_put(&mem->css);
done:
*memcg = mem;
}
}
-static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
+ int page_size)
{
- __mem_cgroup_cancel_charge(mem, 1);
+ __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
}
/*
return mem;
}
-/*
- * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
- * USED state. If already USED, uncharge and return.
- */
-
static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
- struct page_cgroup *pc,
- enum charge_type ctype)
+ struct page_cgroup *pc,
+ enum charge_type ctype,
+ int page_size)
{
+ int nr_pages = page_size >> PAGE_SHIFT;
+
/* try_charge() can return NULL to *memcg, taking care of it. */
if (!mem)
return;
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
- mem_cgroup_cancel_charge(mem);
+ mem_cgroup_cancel_charge(mem, page_size);
return;
}
-
+ /*
+ * we don't need page_cgroup_lock about tail pages, becase they are not
+ * accessed by any other context at this point.
+ */
pc->mem_cgroup = mem;
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
break;
}
- mem_cgroup_charge_statistics(mem, pc, true);
-
+ mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
unlock_page_cgroup(pc);
/*
* "charge_statistics" updated event counter. Then, check it.
memcg_check_events(mem, pc->page);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
+ (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
+/*
+ * Because tail pages are not marked as "used", set it. We're under
+ * zone->lru_lock, 'splitting on pmd' and compund_lock.
+ */
+void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
+{
+ struct page_cgroup *head_pc = lookup_page_cgroup(head);
+ struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
+ unsigned long flags;
+
+ /*
+ * We have no races with charge/uncharge but will have races with
+ * page state accounting.
+ */
+ move_lock_page_cgroup(head_pc, &flags);
+
+ tail_pc->mem_cgroup = head_pc->mem_cgroup;
+ smp_wmb(); /* see __commit_charge() */
+ if (PageCgroupAcctLRU(head_pc)) {
+ enum lru_list lru;
+ struct mem_cgroup_per_zone *mz;
+
+ /*
+ * LRU flags cannot be copied because we need to add tail
+ *.page to LRU by generic call and our hook will be called.
+ * We hold lru_lock, then, reduce counter directly.
+ */
+ lru = page_lru(head);
+ mz = page_cgroup_zoneinfo(head_pc);
+ MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+ }
+ tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+ move_unlock_page_cgroup(head_pc, &flags);
+}
+#endif
+
/**
* __mem_cgroup_move_account - move account of the page
* @pc: page_cgroup of the page.
*/
static void __mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+ struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
+ int charge_size)
{
+ int nr_pages = charge_size >> PAGE_SHIFT;
+
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
VM_BUG_ON(!page_is_cgroup_locked(pc));
__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
preempt_enable();
}
- mem_cgroup_charge_statistics(from, pc, false);
+ mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
- mem_cgroup_cancel_charge(from);
+ mem_cgroup_cancel_charge(from, charge_size);
/* caller should have done css_get */
pc->mem_cgroup = to;
- mem_cgroup_charge_statistics(to, pc, true);
+ mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
/*
* We charges against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of
* __mem_cgroup_move_account()
*/
static int mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+ struct mem_cgroup *from, struct mem_cgroup *to,
+ bool uncharge, int charge_size)
{
int ret = -EINVAL;
+ unsigned long flags;
+
+ if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
+ return -EBUSY;
+
lock_page_cgroup(pc);
if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
- __mem_cgroup_move_account(pc, from, to, uncharge);
+ move_lock_page_cgroup(pc, &flags);
+ __mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
+ move_unlock_page_cgroup(pc, &flags);
ret = 0;
}
unlock_page_cgroup(pc);
struct cgroup *cg = child->css.cgroup;
struct cgroup *pcg = cg->parent;
struct mem_cgroup *parent;
+ int charge = PAGE_SIZE;
+ unsigned long flags;
int ret;
/* Is ROOT ? */
goto out;
if (isolate_lru_page(page))
goto put;
+ /* The page is isolated from LRU and we have no race with splitting */
+ charge = PAGE_SIZE << compound_order(page);
parent = mem_cgroup_from_cont(pcg);
- ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+ ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge);
if (ret || !parent)
goto put_back;
- ret = mem_cgroup_move_account(pc, child, parent, true);
+ if (charge > PAGE_SIZE)
+ flags = compound_lock_irqsave(page);
+
+ ret = mem_cgroup_move_account(pc, child, parent, true, charge);
if (ret)
- mem_cgroup_cancel_charge(parent);
+ mem_cgroup_cancel_charge(parent, charge);
put_back:
+ if (charge > PAGE_SIZE)
+ compound_unlock_irqrestore(page, flags);
putback_lru_page(page);
put:
put_page(page);
struct mem_cgroup *mem = NULL;
struct page_cgroup *pc;
int ret;
+ int page_size = PAGE_SIZE;
+
+ if (PageTransHuge(page)) {
+ page_size <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ }
pc = lookup_page_cgroup(page);
/* can happen at boot */
return 0;
prefetchw(pc);
- ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
if (ret || !mem)
return ret;
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
return 0;
}
{
if (mem_cgroup_disabled())
return 0;
- if (PageCompound(page))
- return 0;
/*
* If already mapped, we don't have to account.
* If page cache, page->mapping has address_space.
if (!mem)
goto charge_cur_mm;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+ ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
css_put(&mem->css);
return ret;
charge_cur_mm:
if (unlikely(!mm))
mm = &init_mm;
- return __mem_cgroup_try_charge(mm, mask, ptr, true);
+ return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
}
static void
cgroup_exclude_rmdir(&ptr->css);
pc = lookup_page_cgroup(page);
mem_cgroup_lru_del_before_commit_swapcache(page);
- __mem_cgroup_commit_charge(ptr, pc, ctype);
+ __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
mem_cgroup_lru_add_after_commit_swapcache(page);
/*
* Now swap is on-memory. This means this page may be
return;
if (!mem)
return;
- mem_cgroup_cancel_charge(mem);
+ mem_cgroup_cancel_charge(mem, PAGE_SIZE);
}
static void
-__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
+ int page_size)
{
struct memcg_batch_info *batch = NULL;
bool uncharge_memsw = true;
if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
goto direct_uncharge;
+ if (page_size != PAGE_SIZE)
+ goto direct_uncharge;
+
/*
* In typical case, batch->memcg == mem. This means we can
* merge a series of uncharges to an uncharge of res_counter.
batch->memsw_bytes += PAGE_SIZE;
return;
direct_uncharge:
- res_counter_uncharge(&mem->res, PAGE_SIZE);
+ res_counter_uncharge(&mem->res, page_size);
if (uncharge_memsw)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ res_counter_uncharge(&mem->memsw, page_size);
if (unlikely(batch->memcg != mem))
memcg_oom_recover(mem);
return;
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
+ int count;
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
+ int page_size = PAGE_SIZE;
if (mem_cgroup_disabled())
return NULL;
if (PageSwapCache(page))
return NULL;
+ if (PageTransHuge(page)) {
+ page_size <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ }
+
+ count = page_size >> PAGE_SHIFT;
/*
* Check if our page_cgroup is valid
*/
break;
}
- mem_cgroup_charge_statistics(mem, pc, false);
+ mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count);
ClearPageCgroupUsed(pc);
/*
mem_cgroup_get(mem);
}
if (!mem_cgroup_is_root(mem))
- __do_uncharge(mem, ctype);
+ __do_uncharge(mem, ctype, page_size);
return mem;
enum charge_type ctype;
int ret = 0;
+ VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
return 0;
return 0;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
css_put(&mem->css);/* drop extra refcnt */
if (ret || *ptr == NULL) {
if (PageAnon(page)) {
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
return ret;
}
/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct mem_cgroup *mem,
- struct page *oldpage, struct page *newpage)
+ struct page *oldpage, struct page *newpage, bool migration_ok)
{
struct page *used, *unused;
struct page_cgroup *pc;
return;
/* blocks rmdir() */
cgroup_exclude_rmdir(&mem->css);
- /* at migration success, oldpage->mapping is NULL. */
- if (oldpage->mapping) {
+ if (!migration_ok) {
used = oldpage;
unused = newpage;
} else {
*/
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
- pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
+ pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
if (!pn)
return 1;
mem->info.nodeinfo[node] = pn;
- memset(pn, 0, sizeof(*pn));
-
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
for_each_lru(l)
/* Can be very big if MAX_NUMNODES is very big */
if (size < PAGE_SIZE)
- mem = kmalloc(size, GFP_KERNEL);
+ mem = kzalloc(size, GFP_KERNEL);
else
- mem = vmalloc(size);
+ mem = vzalloc(size);
if (!mem)
return NULL;
- memset(mem, 0, size);
mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
if (!mem->stat)
goto out_free;
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+ PAGE_SIZE);
if (ret || !mem)
/* mem_cgroup_clear_mc() will do uncharge later */
return -ENOMEM;
pte_t *pte;
spinlock_t *ptl;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
if (is_target_pte_for_mc(vma, addr, *pte, NULL))
unsigned long precharge;
struct vm_area_struct *vma;
- /* We've already held the mmap_sem */
+ down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
struct mm_walk mem_cgroup_count_precharge_walk = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_count_precharge_walk);
}
+ up_read(&mm->mmap_sem);
precharge = mc.precharge;
mc.precharge = 0;
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
- return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+ unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+ VM_BUG_ON(mc.moving_task);
+ mc.moving_task = current;
+ return mem_cgroup_do_precharge(precharge);
}
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
PAGE_SIZE * mc.moved_swap);
}
/* we've already done mem_cgroup_get(mc.to) */
-
mc.moved_swap = 0;
}
- if (mc.mm) {
- up_read(&mc.mm->mmap_sem);
- mmput(mc.mm);
- }
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
+ wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+ struct mem_cgroup *from = mc.from;
+
+ /*
+ * we must clear moving_task before waking up waiters at the end of
+ * task migration.
+ */
+ mc.moving_task = NULL;
+ __mem_cgroup_clear_mc();
spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
spin_unlock(&mc.lock);
- mc.moving_task = NULL;
- mc.mm = NULL;
mem_cgroup_end_move(from);
- memcg_oom_recover(from);
- memcg_oom_recover(to);
- wake_up_all(&mc.waitq);
}
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
return 0;
/* We move charges only when we move a owner of the mm */
if (mm->owner == p) {
- /*
- * We do all the move charge works under one mmap_sem to
- * avoid deadlock with down_write(&mmap_sem)
- * -> try_charge() -> if (mc.moving_task) -> sleep.
- */
- down_read(&mm->mmap_sem);
-
VM_BUG_ON(mc.from);
VM_BUG_ON(mc.to);
VM_BUG_ON(mc.precharge);
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
- VM_BUG_ON(mc.moving_task);
- VM_BUG_ON(mc.mm);
-
mem_cgroup_start_move(from);
spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
- mc.precharge = 0;
- mc.moved_charge = 0;
- mc.moved_swap = 0;
spin_unlock(&mc.lock);
- mc.moving_task = current;
- mc.mm = mm;
+ /* We set mc.moving_task later */
ret = mem_cgroup_precharge_mc(mm);
if (ret)
mem_cgroup_clear_mc();
- /* We call up_read() and mmput() in clear_mc(). */
- } else
- mmput(mm);
+ }
+ mmput(mm);
}
return ret;
}
spinlock_t *ptl;
retry:
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
goto put;
pc = lookup_page_cgroup(page);
if (!mem_cgroup_move_account(pc,
- mc.from, mc.to, false)) {
+ mc.from, mc.to, false, PAGE_SIZE)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
struct vm_area_struct *vma;
lru_add_drain_all();
- /* We've already held the mmap_sem */
+retry:
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ /*
+ * Someone who are holding the mmap_sem might be waiting in
+ * waitq. So we cancel all extra charges, wake up all waiters,
+ * and retry. Because we cancel precharges, we might not be able
+ * to move enough charges, but moving charge is a best-effort
+ * feature anyway, so it wouldn't be a big problem.
+ */
+ __mem_cgroup_clear_mc();
+ cond_resched();
+ goto retry;
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int ret;
struct mm_walk mem_cgroup_move_charge_walk = {
*/
break;
}
+ up_read(&mm->mmap_sem);
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct task_struct *p,
bool threadgroup)
{
- if (!mc.mm)
+ struct mm_struct *mm;
+
+ if (!mc.to)
/* no need to move charge */
return;
- mem_cgroup_move_charge(mc.mm);
+ mm = get_task_mm(p);
+ if (mm) {
+ mem_cgroup_move_charge(mm);
+ mmput(mm);
+ }
mem_cgroup_clear_mc();
}
#else /* !CONFIG_MMU */