From b1dd693e5b9348bd68a80e679e03cf9c0973b01b Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 24 Nov 2010 12:57:06 -0800 Subject: [PATCH] memcg: avoid deadlock between move charge and try_charge() __mem_cgroup_try_charge() can be called under down_write(&mmap_sem)(e.g. mlock does it). This means it can cause deadlock if it races with move charge: Ex.1) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | down_write(&mmap_sem) mc.moving_task = current | .. mem_cgroup_precharge_mc() | __mem_cgroup_try_charge() mem_cgroup_count_precharge() | prepare_to_wait() down_read(&mmap_sem) | if (mc.moving_task) -> cannot aquire the lock | -> true | schedule() Ex.2) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | mc.moving_task = current | mem_cgroup_precharge_mc() | mem_cgroup_count_precharge() | down_read(&mmap_sem) | .. | up_read(&mmap_sem) | | down_write(&mmap_sem) mem_cgroup_move_task() | .. mem_cgroup_move_charge() | __mem_cgroup_try_charge() down_read(&mmap_sem) | prepare_to_wait() -> cannot aquire the lock | if (mc.moving_task) | -> true | schedule() To avoid this deadlock, we do all the move charge works (both can_attach() and attach()) under one mmap_sem section. And after this patch, we set/clear mc.moving_task outside mc.lock, because we use the lock only to check mc.from/to. Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 62d1880..26218df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -278,13 +278,14 @@ enum move_type { /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { - spinlock_t lock; /* for from, to, moving_task */ + spinlock_t lock; /* for from, to */ struct mem_cgroup *from; struct mem_cgroup *to; unsigned long precharge; unsigned long moved_charge; unsigned long moved_swap; struct task_struct *moving_task; /* a task moving charges */ + struct mm_struct *mm; wait_queue_head_t waitq; /* a waitq for other context */ } mc = { .lock = __SPIN_LOCK_UNLOCKED(mc.lock), @@ -4631,7 +4632,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + /* We've already held the mmap_sem */ for (vma = mm->mmap; vma; vma = vma->vm_next) { struct mm_walk mem_cgroup_count_precharge_walk = { .pmd_entry = mem_cgroup_count_precharge_pte_range, @@ -4643,7 +4644,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } - up_read(&mm->mmap_sem); precharge = mc.precharge; mc.precharge = 0; @@ -4694,11 +4694,16 @@ static void mem_cgroup_clear_mc(void) mc.moved_swap = 0; } + if (mc.mm) { + up_read(&mc.mm->mmap_sem); + mmput(mc.mm); + } spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; - mc.moving_task = NULL; spin_unlock(&mc.lock); + mc.moving_task = NULL; + mc.mm = NULL; mem_cgroup_end_move(from); memcg_oom_recover(from); memcg_oom_recover(to); @@ -4724,12 +4729,21 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return 0; /* We move charges only when we move a owner of the mm */ if (mm->owner == p) { + /* + * We do all the move charge works under one mmap_sem to + * avoid deadlock with down_write(&mmap_sem) + * -> try_charge() -> if (mc.moving_task) -> sleep. + */ + down_read(&mm->mmap_sem); + VM_BUG_ON(mc.from); VM_BUG_ON(mc.to); VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); VM_BUG_ON(mc.moving_task); + VM_BUG_ON(mc.mm); + mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; @@ -4737,14 +4751,16 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, mc.precharge = 0; mc.moved_charge = 0; mc.moved_swap = 0; - mc.moving_task = current; spin_unlock(&mc.lock); + mc.moving_task = current; + mc.mm = mm; ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); - } - mmput(mm); + /* We call up_read() and mmput() in clear_mc(). */ + } else + mmput(mm); } return ret; } @@ -4832,7 +4848,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); - down_read(&mm->mmap_sem); + /* We've already held the mmap_sem */ for (vma = mm->mmap; vma; vma = vma->vm_next) { int ret; struct mm_walk mem_cgroup_move_charge_walk = { @@ -4851,7 +4867,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) */ break; } - up_read(&mm->mmap_sem); } static void mem_cgroup_move_task(struct cgroup_subsys *ss, @@ -4860,17 +4875,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct task_struct *p, bool threadgroup) { - struct mm_struct *mm; - - if (!mc.to) + if (!mc.mm) /* no need to move charge */ return; - mm = get_task_mm(p); - if (mm) { - mem_cgroup_move_charge(mm); - mmput(mm); - } + mem_cgroup_move_charge(mc.mm); mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -- 2.7.4