diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3d448e646a4a..658eb1a32084 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -259,6 +259,13 @@ static char cpuset_name[CPUSET_NAME_LEN]; static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static DEFINE_SPINLOCK(cpuset_buffer_lock); +/* + * CPU / memory hotplug is handled asynchronously. + */ +static void cpuset_hotplug_workfn(struct work_struct *work); + +static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); + /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we @@ -1565,6 +1572,19 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *trialcs; + /* + * CPU or memory hotunplug may leave @cs w/o any execution + * resources, in which case the hotplug code asynchronously updates + * configuration and transfers all tasks to the nearest ancestor + * which can execute. + * + * As writes to "cpus" or "mems" may restore @cs's execution + * resources, wait for the previously scheduled operations before + * proceeding, so that we don't keep removing tasks added + * after execution capability is restored. + */ + flush_work(&cpuset_hotplug_work); + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; @@ -2095,7 +2115,7 @@ static void cpuset_propagate_hotplug(struct cpuset *cs) } /** - * cpuset_handle_hotplug - handle CPU/memory hot[un]plug + * cpuset_hotplug_workfn - handle CPU/memory hot[un]plug * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -2110,7 +2130,7 @@ static void cpuset_propagate_hotplug(struct cpuset *cs) * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. 
*/ -static void cpuset_handle_hotplug(void) +static void cpuset_hotplug_workfn(struct work_struct *work) { static cpumask_t new_cpus, tmp_cpus; static nodemask_t new_mems, tmp_mems; @@ -2177,7 +2197,18 @@ static void cpuset_handle_hotplug(void) void cpuset_update_active_cpus(bool cpu_online) { - cpuset_handle_hotplug(); + /* + * We're inside the CPU hotplug critical region, which usually nests + * inside cgroup synchronization. Bounce actual hotplug processing + * to a work item to avoid reverse locking order. + * + * We still need to do partition_sched_domains() synchronously; + * otherwise, the scheduler will get confused and put tasks on the + * dead CPU. Fall back to the default single domain. + * cpuset_hotplug_workfn() will rebuild it as necessary. + */ + partition_sched_domains(1, NULL, NULL); + schedule_work(&cpuset_hotplug_work); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2189,7 +2220,7 @@ void cpuset_update_active_cpus(bool cpu_online) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - cpuset_handle_hotplug(); + schedule_work(&cpuset_hotplug_work); return NOTIFY_OK; } #endif