cpuset: rewrite update_tasks_nodemask()
This patch uses cgroup_scan_tasks() to rebind tasks' vmas to new cpuset's mems_allowed. Not only simplify the code largely, but also avoid allocating an array to hold mm pointers of all the tasks in the cpuset. This array can be big (size > PAGESIZE) if we have lots of tasks in that cpuset, thus has a chance to fail the allocation when under memory stress. Signed-off-by: Li Zefan <lizf@cn.fujitsu.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Родитель
bd1a8ab73e
Коммит
3b6766fe66
109
kernel/cpuset.c
109
kernel/cpuset.c
|
@ -1026,6 +1026,31 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
|
|||
mutex_unlock(&callback_mutex);
|
||||
}
|
||||
|
||||
/*
|
||||
* Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
|
||||
* nodes if memory_migrate flag is set. Called with cgroup_mutex held.
|
||||
*/
|
||||
static void cpuset_change_nodemask(struct task_struct *p,
|
||||
struct cgroup_scanner *scan)
|
||||
{
|
||||
struct mm_struct *mm;
|
||||
struct cpuset *cs;
|
||||
int migrate;
|
||||
const nodemask_t *oldmem = scan->data;
|
||||
|
||||
mm = get_task_mm(p);
|
||||
if (!mm)
|
||||
return;
|
||||
|
||||
cs = cgroup_cs(scan->cg);
|
||||
migrate = is_memory_migrate(cs);
|
||||
|
||||
mpol_rebind_mm(mm, &cs->mems_allowed);
|
||||
if (migrate)
|
||||
cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
|
||||
mmput(mm);
|
||||
}
|
||||
|
||||
static void *cpuset_being_rebound;
|
||||
|
||||
/**
|
||||
|
@ -1038,88 +1063,32 @@ static void *cpuset_being_rebound;
|
|||
*/
|
||||
static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
|
||||
{
|
||||
struct task_struct *p;
|
||||
struct mm_struct **mmarray;
|
||||
int i, n, ntasks;
|
||||
int migrate;
|
||||
int fudge;
|
||||
struct cgroup_iter it;
|
||||
int retval;
|
||||
struct cgroup_scanner scan;
|
||||
|
||||
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
|
||||
|
||||
fudge = 10; /* spare mmarray[] slots */
|
||||
fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
|
||||
retval = -ENOMEM;
|
||||
scan.cg = cs->css.cgroup;
|
||||
scan.test_task = NULL;
|
||||
scan.process_task = cpuset_change_nodemask;
|
||||
scan.heap = NULL;
|
||||
scan.data = (nodemask_t *)oldmem;
|
||||
|
||||
/*
|
||||
* Allocate mmarray[] to hold mm reference for each task
|
||||
* in cpuset cs. Can't kmalloc GFP_KERNEL while holding
|
||||
* tasklist_lock. We could use GFP_ATOMIC, but with a
|
||||
* few more lines of code, we can retry until we get a big
|
||||
* enough mmarray[] w/o using GFP_ATOMIC.
|
||||
*/
|
||||
while (1) {
|
||||
ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
|
||||
ntasks += fudge;
|
||||
mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
|
||||
if (!mmarray)
|
||||
goto done;
|
||||
read_lock(&tasklist_lock); /* block fork */
|
||||
if (cgroup_task_count(cs->css.cgroup) <= ntasks)
|
||||
break; /* got enough */
|
||||
read_unlock(&tasklist_lock); /* try again */
|
||||
kfree(mmarray);
|
||||
}
|
||||
|
||||
n = 0;
|
||||
|
||||
/* Load up mmarray[] with mm reference for each task in cpuset. */
|
||||
cgroup_iter_start(cs->css.cgroup, &it);
|
||||
while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
|
||||
struct mm_struct *mm;
|
||||
|
||||
if (n >= ntasks) {
|
||||
printk(KERN_WARNING
|
||||
"Cpuset mempolicy rebind incomplete.\n");
|
||||
break;
|
||||
}
|
||||
mm = get_task_mm(p);
|
||||
if (!mm)
|
||||
continue;
|
||||
mmarray[n++] = mm;
|
||||
}
|
||||
cgroup_iter_end(cs->css.cgroup, &it);
|
||||
read_unlock(&tasklist_lock);
|
||||
|
||||
/*
|
||||
* Now that we've dropped the tasklist spinlock, we can
|
||||
* rebind the vma mempolicies of each mm in mmarray[] to their
|
||||
* new cpuset, and release that mm. The mpol_rebind_mm()
|
||||
* call takes mmap_sem, which we couldn't take while holding
|
||||
* tasklist_lock. Forks can happen again now - the mpol_dup()
|
||||
* cpuset_being_rebound check will catch such forks, and rebind
|
||||
* their vma mempolicies too. Because we still hold the global
|
||||
* cgroup_mutex, we know that no other rebind effort will
|
||||
* be contending for the global variable cpuset_being_rebound.
|
||||
* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
|
||||
* take while holding tasklist_lock. Forks can happen - the
|
||||
* mpol_dup() cpuset_being_rebound check will catch such forks,
|
||||
* and rebind their vma mempolicies too. Because we still hold
|
||||
* the global cgroup_mutex, we know that no other rebind effort
|
||||
* will be contending for the global variable cpuset_being_rebound.
|
||||
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
|
||||
* is idempotent. Also migrate pages in each mm to new nodes.
|
||||
*/
|
||||
migrate = is_memory_migrate(cs);
|
||||
for (i = 0; i < n; i++) {
|
||||
struct mm_struct *mm = mmarray[i];
|
||||
|
||||
mpol_rebind_mm(mm, &cs->mems_allowed);
|
||||
if (migrate)
|
||||
cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
|
||||
mmput(mm);
|
||||
}
|
||||
retval = cgroup_scan_tasks(&scan);
|
||||
|
||||
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
|
||||
kfree(mmarray);
|
||||
cpuset_being_rebound = NULL;
|
||||
retval = 0;
|
||||
done:
|
||||
|
||||
return retval;
|
||||
}
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче