sched/fair: Rework and comment the group_imb code
Rik reported some weirdness due to the group_imb code. As a start to looking
at it, clean it up a little and add a few explanatory comments.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-caeeqttnla4wrrmhp5uf89gp@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Parent: 6906a40839
Commit: 30ce5dabc9
@@ -4463,6 +4463,81 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 	return 0;
 }
 
+/*
+ * Group imbalance indicates (and tries to solve) the problem where balancing
+ * groups is inadequate due to tsk_cpus_allowed() constraints.
+ *
+ * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
+ * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Something like:
+ *
+ *	{ 0 1 2 3 } { 4 5 6 7 }
+ *	        *     * * *
+ *
+ * If we were to balance group-wise we'd place two tasks in the first group and
+ * two tasks in the second group. Clearly this is undesired as it will overload
+ * cpu 3 and leave one of the cpus in the second group unused.
+ *
+ * The current solution to this issue is detecting the skew in the first group
+ * by noticing it has a cpu that is overloaded while the remaining cpus are
+ * idle -- or rather, there's a distinct imbalance in the cpus; see
+ * sg_imbalanced().
+ *
+ * When this is so detected, this group becomes a candidate for busiest; see
+ * update_sd_pick_busiest(). And calculate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * to create an effective group imbalance.
+ *
+ * This is a somewhat tricky proposition since the next run might not find the
+ * group imbalance and decide the groups need to be balanced again. A most
+ * subtle and fragile situation.
+ */
+
+struct sg_imb_stats {
+	unsigned long max_nr_running, min_nr_running;
+	unsigned long max_cpu_load, min_cpu_load;
+};
+
+static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
+{
+	sgi->max_cpu_load = sgi->max_nr_running = 0UL;
+	sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
+}
+
+static inline void
+update_sg_imb_stats(struct sg_imb_stats *sgi,
+		    unsigned long load, unsigned long nr_running)
+{
+	if (load > sgi->max_cpu_load)
+		sgi->max_cpu_load = load;
+	if (sgi->min_cpu_load > load)
+		sgi->min_cpu_load = load;
+
+	if (nr_running > sgi->max_nr_running)
+		sgi->max_nr_running = nr_running;
+	if (sgi->min_nr_running > nr_running)
+		sgi->min_nr_running = nr_running;
+}
+
+static inline int
+sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
+{
+	/*
+	 * Consider the group unbalanced when the imbalance is larger
+	 * than the average weight of a task.
+	 *
+	 * APZ: with cgroup the avg task weight can vary wildly and
+	 *      might not be a suitable number - should we keep a
+	 *      normalized nr_running number somewhere that negates
+	 *      the hierarchy?
+	 */
+	if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
+	    (sgi->max_nr_running - sgi->min_nr_running) > 1)
+		return 1;
+
+	return 0;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
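To make the new comment concrete, here is a stand-alone sketch (user-space C, not part of the patch; the load and nr_running figures are invented, and sg_imbalanced() is re-declared to take load_per_task directly instead of a struct sg_lb_stats) that runs the { 0 1 2 3 } { 4 5 6 7 } scenario through the same min/max bookkeeping:

    /* User-space illustration only -- invented numbers, simplified types. */
    #include <stdio.h>

    struct sg_imb_stats {
        unsigned long max_nr_running, min_nr_running;
        unsigned long max_cpu_load, min_cpu_load;
    };

    static void init_sg_imb_stats(struct sg_imb_stats *sgi)
    {
        sgi->max_cpu_load = sgi->max_nr_running = 0UL;
        sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
    }

    static void update_sg_imb_stats(struct sg_imb_stats *sgi,
                                    unsigned long load, unsigned long nr_running)
    {
        if (load > sgi->max_cpu_load)
            sgi->max_cpu_load = load;
        if (sgi->min_cpu_load > load)
            sgi->min_cpu_load = load;
        if (nr_running > sgi->max_nr_running)
            sgi->max_nr_running = nr_running;
        if (sgi->min_nr_running > nr_running)
            sgi->min_nr_running = nr_running;
    }

    static int sg_imbalanced(unsigned long load_per_task, struct sg_imb_stats *sgi)
    {
        return (sgi->max_cpu_load - sgi->min_cpu_load) >= load_per_task &&
               (sgi->max_nr_running - sgi->min_nr_running) > 1;
    }

    int main(void)
    {
        /* First group: two tasks stuck on cpu 3, cpus 0-2 idle. */
        unsigned long load1[4] = { 0, 0, 0, 2048 };
        unsigned long nr1[4]   = { 0, 0, 0, 2 };
        /* Second group: one task each on cpus 4 and 5, cpus 6-7 idle. */
        unsigned long load2[4] = { 1024, 1024, 0, 0 };
        unsigned long nr2[4]   = { 1, 1, 0, 0 };
        unsigned long load_per_task = 1024; /* stand-in average task weight */
        struct sg_imb_stats sgi;
        int i;

        init_sg_imb_stats(&sgi);
        for (i = 0; i < 4; i++)
            update_sg_imb_stats(&sgi, load1[i], nr1[i]);
        printf("group { 0 1 2 3 }: imbalanced=%d\n",
               sg_imbalanced(load_per_task, &sgi));

        init_sg_imb_stats(&sgi);
        for (i = 0; i < 4; i++)
            update_sg_imb_stats(&sgi, load2[i], nr2[i]);
        printf("group { 4 5 6 7 }: imbalanced=%d\n",
               sg_imbalanced(load_per_task, &sgi));

        return 0;
    }

Built with any stock C compiler this prints imbalanced=1 for the first group and imbalanced=0 for the second: it is the skew (one cpu holding two tasks while its siblings idle), not the total load, that marks a group.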
@@ -4475,15 +4550,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
 			int local_group, struct sg_lb_stats *sgs)
 {
-	unsigned long nr_running, max_nr_running, min_nr_running;
-	unsigned long load, max_cpu_load, min_cpu_load;
+	struct sg_imb_stats sgi;
+	unsigned long nr_running;
+	unsigned long load;
 	int i;
 
-	/* Tally up the load of all CPUs in the group */
-	max_cpu_load = 0;
-	min_cpu_load = ~0UL;
-	max_nr_running = 0;
-	min_nr_running = ~0UL;
+	init_sg_imb_stats(&sgi);
 
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
@@ -4495,16 +4567,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			load = target_load(i, load_idx);
 		} else {
 			load = source_load(i, load_idx);
-			if (load > max_cpu_load)
-				max_cpu_load = load;
-			if (min_cpu_load > load)
-				min_cpu_load = load;
-
-			if (nr_running > max_nr_running)
-				max_nr_running = nr_running;
-			if (min_nr_running > nr_running)
-				min_nr_running = nr_running;
+			update_sg_imb_stats(&sgi, load, nr_running);
 		}
 
 		sgs->group_load += load;
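A side note on the helpers this loop now calls: init_sg_imb_stats() seeds the minima with ~0UL (all bits set, i.e. ULONG_MAX) and the maxima with 0, so the very first sample folded in by update_sg_imb_stats() wins both comparisons. A few lines of throwaway user-space C to illustrate (not part of the patch; the sample value is arbitrary):

    #include <assert.h>
    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned long min_cpu_load = ~0UL;  /* same sentinel as init_sg_imb_stats() */
        unsigned long max_cpu_load = 0UL;
        unsigned long first_load = 1536;    /* arbitrary first per-cpu load sample */

        assert(~0UL == ULONG_MAX);          /* all-ones is the largest unsigned long */

        /* Same comparisons update_sg_imb_stats() performs for the load fields. */
        if (first_load > max_cpu_load)
            max_cpu_load = first_load;
        if (min_cpu_load > first_load)
            min_cpu_load = first_load;

        /* After one sample, both bounds collapse onto that sample. */
        printf("min=%lu max=%lu\n", min_cpu_load, max_cpu_load);
        return 0;
    }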
@@ -4522,21 +4585,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	sgs->group_power = group->sgp->power;
 	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
 
-	/*
-	 * Consider the group unbalanced when the imbalance is larger
-	 * than the average weight of a task.
-	 *
-	 * APZ: with cgroup the avg task weight can vary wildly and
-	 *      might not be a suitable number - should we keep a
-	 *      normalized nr_running number somewhere that negates
-	 *      the hierarchy?
-	 */
 	if (sgs->sum_nr_running)
 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-	if ((max_cpu_load - min_cpu_load) >= sgs->load_per_task &&
-	    (max_nr_running - min_nr_running) > 1)
-		sgs->group_imb = 1;
+	sgs->group_imb = sg_imbalanced(sgs, &sgi);
 
 	sgs->group_capacity =
 		DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
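For orientation, the context around this change computes the per-group statistics that sg_imbalanced() and the later imbalance code consume. A rough user-space rendering of that arithmetic (inputs invented; SCHED_POWER_SCALE is assumed to be 1024 here and DIV_ROUND_CLOSEST is re-implemented as a plain unsigned rounding divide):

    #include <stdio.h>

    #define SCHED_POWER_SCALE   1024UL  /* assumed: one full cpu's worth of power */
    #define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))  /* unsigned-only stand-in */

    int main(void)
    {
        unsigned long group_load = 3072;        /* sum of per-cpu loads in the group */
        unsigned long group_power = 2048;       /* e.g. two full-power cpus */
        unsigned long sum_weighted_load = 3072;
        unsigned long sum_nr_running = 3;

        /* avg_load normalizes the group's load by its cpu power ... */
        unsigned long avg_load = group_load * SCHED_POWER_SCALE / group_power;
        /* ... load_per_task is the average weight of a runnable task ... */
        unsigned long load_per_task = sum_weighted_load / sum_nr_running;
        /* ... and capacity is roughly how many full cpus the group amounts to. */
        unsigned long group_capacity = DIV_ROUND_CLOSEST(group_power, SCHED_POWER_SCALE);

        printf("avg_load=%lu load_per_task=%lu capacity=%lu\n",
               avg_load, load_per_task, group_capacity);
        return 0;
    }

With these figures it prints avg_load=1536, load_per_task=1024, capacity=2, which is the scale of numbers the sketches above and below assume.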
@@ -4781,6 +4833,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	busiest = &sds->busiest_stat;
 
 	if (busiest->group_imb) {
+		/*
+		 * In the group_imb case we cannot rely on group-wide averages
+		 * to ensure cpu-load equilibrium, look at wider averages. XXX
+		 */
 		busiest->load_per_task =
 			min(busiest->load_per_task, sds->avg_load);
 	}
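One reading of the new comment: in the group_imb case load_per_task comes from a group where a single cpu carries nearly everything, so it can be far larger than the domain-wide picture suggests. The clamp itself is just a min(); a sketch with invented numbers (not taken from the patch):

    #include <stdio.h>

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
        return a < b ? a : b;
    }

    int main(void)
    {
        unsigned long load_per_task = 2048; /* inflated by one overloaded, pinned cpu */
        unsigned long sds_avg_load = 1100;  /* the wider, sched-domain-level average */

        /* Bound the per-task estimate by the domain-wide average so the
         * imbalance later derived from it presumably cannot overshoot. */
        load_per_task = min_ul(load_per_task, sds_avg_load);

        printf("clamped load_per_task=%lu\n", load_per_task);
        return 0;
    }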
@@ -4798,6 +4854,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	if (!busiest->group_imb) {
 		/*
 		 * Don't want to pull so many tasks that a group would go idle.
+		 * Except of course for the group_imb case, since then we might
+		 * have to drop below capacity to reach cpu-load equilibrium.
 		 */
 		load_above_capacity =
 			(busiest->sum_nr_running - busiest->group_capacity);
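To put a number on the quantity in this branch (figures invented; the later conversion of this task count into load units is outside the hunk shown here):

    #include <stdio.h>

    int main(void)
    {
        unsigned long sum_nr_running = 5;   /* runnable tasks in the busiest group */
        unsigned long group_capacity = 4;   /* ~number of full cpus in the group */
        unsigned long load_above_capacity = 0;

        /* This sketch guards the unsigned subtraction explicitly; presumably
         * the kernel only reaches this computation for an over-capacity group. */
        if (sum_nr_running > group_capacity)
            load_above_capacity = sum_nr_running - group_capacity;

        printf("load_above_capacity=%lu task(s)\n", load_above_capacity);
        return 0;
    }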
@@ -4813,11 +4871,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * we also don't want to reduce the group load below the group capacity
 	 * (so that we can implement power-savings policies etc). Thus we look
 	 * for the minimum possible imbalance.
-	 * Be careful of negative numbers as they'll appear as very large values
-	 * with unsigned longs.
-	 */
-	max_pull = min(busiest->avg_load - sds->avg_load,
-		       load_above_capacity);
+	 */
+	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
 
 	/* How much load to actually move to equalise the imbalance */
 	env->imbalance = min(
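The "negative numbers" caution disappears along with the line split, but the concern it described still applies: busiest->avg_load - sds->avg_load is an unsigned subtraction, so it only makes sense once the busiest group is known to be above the domain average, which the surrounding code is presumed to have established. A user-space sketch with invented figures; note that the kernel's env->imbalance min() is cut off in this view, so the corresponding term below is a stand-in, not a quote:

    #include <stdio.h>

    #define SCHED_POWER_SCALE 1024UL  /* assumed value */

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
        return a < b ? a : b;
    }

    int main(void)
    {
        unsigned long busiest_avg_load = 1800;      /* power-normalized */
        unsigned long sds_avg_load = 1200;          /* domain-wide average */
        unsigned long local_avg_load = 600;         /* the pulling cpu's group */
        unsigned long load_above_capacity = 400;
        unsigned long busiest_power = 2048, local_power = 2048;
        unsigned long max_pull, imbalance;

        /* Pull no more than the busiest group's excess over the domain
         * average, and no more than what exceeds its capacity. If
         * busiest_avg_load were below sds_avg_load the subtraction would
         * wrap to a huge value -- the hazard the deleted comment warned about. */
        max_pull = min_ul(busiest_avg_load - sds_avg_load, load_above_capacity);

        /* Stand-in for the env->imbalance computation: move the smaller of
         * what the busiest side can give and what the local side can take. */
        imbalance = min_ul(max_pull * busiest_power,
                           (sds_avg_load - local_avg_load) * local_power)
                    / SCHED_POWER_SCALE;

        printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);
        return 0;
    }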
@@ -4881,7 +4936,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 
 	/*
 	 * If the busiest group is imbalanced the below checks don't
-	 * work because they assumes all things are equal, which typically
+	 * work because they assume all things are equal, which typically
 	 * isn't true due to cpus_allowed constraints and the like.
 	 */
 	if (busiest->group_imb)