memcg: avoid deadlock between move charge and try_charge()
__mem_cgroup_try_charge() can be called under down_write(&mmap_sem) (e.g.
mlock does it).  This means it can cause a deadlock if it races with move
charge:

Ex.1)
                move charge             |        try charge
  --------------------------------------+------------------------------
    mem_cgroup_can_attach()             |  down_write(&mmap_sem)
      mc.moving_task = current          |    ..
      mem_cgroup_precharge_mc()         |  __mem_cgroup_try_charge()
        mem_cgroup_count_precharge()    |    prepare_to_wait()
          down_read(&mmap_sem)          |    if (mc.moving_task)
          -> cannot acquire the lock    |    -> true
                                        |      schedule()

Ex.2)
                move charge             |        try charge
  --------------------------------------+------------------------------
    mem_cgroup_can_attach()             |
      mc.moving_task = current          |
      mem_cgroup_precharge_mc()         |
        mem_cgroup_count_precharge()    |
          down_read(&mmap_sem)          |
          ..                            |
          up_read(&mmap_sem)            |
                                        |  down_write(&mmap_sem)
    mem_cgroup_move_task()              |    ..
      mem_cgroup_move_charge()          |  __mem_cgroup_try_charge()
        down_read(&mmap_sem)            |    prepare_to_wait()
        -> cannot acquire the lock      |    if (mc.moving_task)
                                        |    -> true
                                        |      schedule()

To avoid this deadlock, we do all of the move charge work (both can_attach()
and attach()) under one mmap_sem section.

And after this patch, we set/clear mc.moving_task outside mc.lock, because we
use the lock only to check mc.from/to.

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Parent: 11e7946f19
Commit: b1dd693e5b
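As a purely illustrative aside (not part of the commit): the cycle in Ex.1 can be reproduced in plain userspace C, with a pthread rwlock standing in for mmap_sem, a mutex for mc.lock, and a condition variable for mc.waitq. Everything below, including the names charger, mover, and deadlock_demo.c, is a hypothetical sketch, not kernel code.

/* deadlock_demo.c — build with: cc -pthread deadlock_demo.c */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER; /* plays mm->mmap_sem */
static pthread_mutex_t  mc_lock  = PTHREAD_MUTEX_INITIALIZER;  /* plays mc.lock */
static pthread_cond_t   mc_waitq = PTHREAD_COND_INITIALIZER;   /* plays mc.waitq */
static int moving_task;                                        /* plays mc.moving_task */

/* "try charge" side: e.g. mlock() charging pages under down_write(&mmap_sem) */
static void *charger(void *unused)
{
	(void)unused;
	pthread_rwlock_wrlock(&mmap_sem);           /* down_write(&mmap_sem) */
	pthread_mutex_lock(&mc_lock);
	while (moving_task)                         /* if (mc.moving_task) -> true */
		pthread_cond_wait(&mc_waitq, &mc_lock); /* schedule(): wait for the mover */
	pthread_mutex_unlock(&mc_lock);
	pthread_rwlock_unlock(&mmap_sem);
	return NULL;
}

/* "move charge" side before the fix: publishes moving_task, then takes the lock */
static void *mover(void *unused)
{
	(void)unused;
	pthread_mutex_lock(&mc_lock);
	moving_task = 1;                            /* mc.moving_task = current */
	pthread_mutex_unlock(&mc_lock);

	pthread_rwlock_rdlock(&mmap_sem);           /* down_read(): blocks behind charger */
	/* ... walk the address space, count precharge ... */
	pthread_rwlock_unlock(&mmap_sem);

	pthread_mutex_lock(&mc_lock);
	moving_task = 0;                            /* never reached once deadlocked */
	pthread_cond_broadcast(&mc_waitq);
	pthread_mutex_unlock(&mc_lock);
	return NULL;
}

int main(void)
{
	pthread_t m, c;
	pthread_create(&m, NULL, mover, NULL);
	pthread_create(&c, NULL, charger, NULL);
	pthread_join(m, NULL);                      /* hangs if the race hits */
	pthread_join(c, NULL);
	puts("no deadlock this run");
	return 0;
}

If mover() publishes moving_task and charger() then wins the write lock, each side waits on the other forever: charger() sleeps until moving_task clears, and mover() cannot clear it because it is blocked in rdlock(). That is exactly the cycle in Ex.1.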
@@ -278,13 +278,14 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
-	spinlock_t	  lock; /* for from, to, moving_task */
+	spinlock_t	  lock; /* for from, to */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
 	unsigned long moved_charge;
 	unsigned long moved_swap;
 	struct task_struct *moving_task;	/* a task moving charges */
+	struct mm_struct *mm;
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
 	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -4631,7 +4632,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 	unsigned long precharge;
 	struct vm_area_struct *vma;
 
-	down_read(&mm->mmap_sem);
+	/* We've already held the mmap_sem */
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		struct mm_walk mem_cgroup_count_precharge_walk = {
 			.pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4643,7 +4644,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 		walk_page_range(vma->vm_start, vma->vm_end,
 					&mem_cgroup_count_precharge_walk);
 	}
-	up_read(&mm->mmap_sem);
 
 	precharge = mc.precharge;
 	mc.precharge = 0;
@@ -4694,11 +4694,16 @@ static void mem_cgroup_clear_mc(void)
 
 		mc.moved_swap = 0;
 	}
+	if (mc.mm) {
+		up_read(&mc.mm->mmap_sem);
+		mmput(mc.mm);
+	}
 	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
-	mc.moving_task = NULL;
 	spin_unlock(&mc.lock);
+	mc.moving_task = NULL;
+	mc.mm = NULL;
 	mem_cgroup_end_move(from);
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
@@ -4724,12 +4729,21 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			return 0;
 		/* We move charges only when we move a owner of the mm */
 		if (mm->owner == p) {
+			/*
+			 * We do all the move charge works under one mmap_sem to
+			 * avoid deadlock with down_write(&mmap_sem)
+			 * -> try_charge() -> if (mc.moving_task) -> sleep.
+			 */
+			down_read(&mm->mmap_sem);
+
 			VM_BUG_ON(mc.from);
 			VM_BUG_ON(mc.to);
 			VM_BUG_ON(mc.precharge);
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
 			VM_BUG_ON(mc.moving_task);
+			VM_BUG_ON(mc.mm);
+
 			mem_cgroup_start_move(from);
 			spin_lock(&mc.lock);
 			mc.from = from;
@@ -4737,14 +4751,16 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			mc.precharge = 0;
 			mc.moved_charge = 0;
 			mc.moved_swap = 0;
-			mc.moving_task = current;
 			spin_unlock(&mc.lock);
+			mc.moving_task = current;
+			mc.mm = mm;
 
 			ret = mem_cgroup_precharge_mc(mm);
 			if (ret)
 				mem_cgroup_clear_mc();
-		}
-		mmput(mm);
+			/* We call up_read() and mmput() in clear_mc(). */
+		} else
+			mmput(mm);
 	}
 	return ret;
 }
@@ -4832,7 +4848,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 	struct vm_area_struct *vma;
 
 	lru_add_drain_all();
-	down_read(&mm->mmap_sem);
+	/* We've already held the mmap_sem */
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		int ret;
 		struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4851,7 +4867,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 			 */
 			break;
 	}
-	up_read(&mm->mmap_sem);
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4860,17 +4875,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct task_struct *p,
 				bool threadgroup)
 {
-	struct mm_struct *mm;
-
-	if (!mc.to)
+	if (!mc.mm)
 		/* no need to move charge */
 		return;
 
-	mm = get_task_mm(p);
-	if (mm) {
-		mem_cgroup_move_charge(mm);
-		mmput(mm);
-	}
+	mem_cgroup_move_charge(mc.mm);
 	mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */
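Continuing the illustrative userspace sketch from above (it reuses the same hypothetical globals and still is not kernel code), the fix corresponds to the mover taking the read lock once, up front, and holding it across the entire move, so it can never block on mmap_sem after a charger has started waiting on it:

/* The fix, in the same analogue: one rdlock section for the whole move
 * (can_attach() through attach()), with moving_task only published while
 * the read lock is held. The wait-for cycle is broken. */
static void *mover_fixed(void *unused)
{
	(void)unused;
	pthread_rwlock_rdlock(&mmap_sem);       /* down_read() in can_attach() */

	pthread_mutex_lock(&mc_lock);
	moving_task = 1;                        /* published after down_read() */
	pthread_mutex_unlock(&mc_lock);

	/* ... precharge + move charge, all under one mmap_sem section ... */

	pthread_mutex_lock(&mc_lock);
	moving_task = 0;
	pthread_cond_broadcast(&mc_waitq);      /* let waiting chargers retry */
	pthread_mutex_unlock(&mc_lock);

	pthread_rwlock_unlock(&mmap_sem);       /* up_read() in mem_cgroup_clear_mc() */
	return NULL;
}

That is the shape of the patch: down_read() moves into mem_cgroup_can_attach(), the matching up_read() and mmput() move into mem_cgroup_clear_mc(), and the new mc.mm field keeps the mm pinned (and the lock section identifiable) across the whole move.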