From 45c01e824991b2dd0a332e19efc4901acb31209f Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 12 May 2008 21:20:41 +0200 Subject: [PATCH 01/76] sched: prioritize non-migratable tasks over migratable ones Dmitry Adamushko pointed out a known flaw in the rt-balancing algorithm that could allow suboptimal balancing if a non-migratable task gets queued behind a running migratable one. It is discussed in this thread: http://lkml.org/lkml/2008/4/22/296 This issue has been further exacerbated by a recent checkin to sched-devel (git-id 5eee63a5ebc19a870ac40055c0be49457f3a89a3). From a pure priority standpoint, the run-queue is doing the "right" thing. Using Dmitry's nomenclature, if T0 is on cpu1 first, and T1 wakes up at equal or lower priority (affined only to cpu1) later, it *should* wait for T0 to finish. However, in reality that is likely suboptimal from a system perspective if there are other cores that could allow T0 and T1 to run concurrently. Since T1 cannot migrate, the only choice for higher concurrency is to try to move T0. This is not something we addressed in the recent rt-balancing re-work. This patch tries to enhance the balancing algorithm by accommodating this scenario. It accomplishes this by incorporating the migratability of a task into its priority calculation. Within a numerical tsk->prio, a non-migratable task is logically higher than a migratable one. We maintain this by introducing a new per-priority queue (xqueue, or exclusive-queue) for holding non-migratable tasks. The scheduler will draw from the xqueue over the standard shared-queue (squeue) when available. There are several details for utilizing this properly. 1) During task-wake-up, we not only need to check if the priority preempts the current task, but we also need to check for this non-migratable condition. Therefore, if a non-migratable task wakes up and sees an equal priority migratable task already running, it will attempt to preempt it *if* there is a likelihood that the current task will find an immediate home. 2) Tasks only get this non-migratable "priority boost" on wake-up. Any requeuing will result in the non-migratable task being queued to the end of the shared queue. This is an attempt to prevent the system from being completely unfair to migratable tasks during things like SCHED_RR timeslicing. I am sure this patch introduces potentially "odd" behavior if you concoct a scenario where a bunch of non-migratable threads could starve migratable ones given the right pattern. I am not yet convinced that this is a problem since we are talking about tasks of equal RT priority anyway, and there never is much in the way of guarantees against starvation under that scenario. (e.g. you could come up with a similar scenario with a specific timing environment versus an affinity environment). I can be convinced otherwise, but for now I think this is "ok".
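For reference, a condensed sketch of the dual-queue structure and pick order described above; this is simplified from the diff that follows (bitmap maintenance, group scheduling and error paths omitted):

    struct rt_prio_array {
            DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
            struct list_head xqueue[MAX_RT_PRIO];  /* exclusive: nr_cpus_allowed == 1 */
            struct list_head squeue[MAX_RT_PRIO];  /* shared: migratable tasks */
    };

    /* enqueue: non-migratable tasks land on the exclusive queue */
    if (rt_se->nr_cpus_allowed == 1)
            list_add_tail(&rt_se->run_list, array->xqueue + rt_se_prio(rt_se));
    else
            list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se));

    /* pick: within the highest set priority level (idx from the bitmap),
     * drain the exclusive queue before the shared one */
    queue = array->xqueue + idx;
    if (list_empty(queue))
            queue = array->squeue + idx;
    next = list_entry(queue->next, struct sched_rt_entity, run_list);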
Signed-off-by: Gregory Haskins CC: Dmitry Adamushko CC: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 6 ++-- kernel/sched_rt.c | 75 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 72 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index bfb8ad8ed171..7178b8c2351c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -151,7 +151,8 @@ static inline int task_has_rt_policy(struct task_struct *p) */ struct rt_prio_array { DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_RT_PRIO]; + struct list_head xqueue[MAX_RT_PRIO]; /* exclusive queue */ + struct list_head squeue[MAX_RT_PRIO]; /* shared queue */ }; struct rt_bandwidth { @@ -7542,7 +7543,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) array = &rt_rq->active; for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); + INIT_LIST_HEAD(array->xqueue + i); + INIT_LIST_HEAD(array->squeue + i); __clear_bit(i, array->bitmap); } /* delimiter for bitsearch: */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3432d573205d..fefed39fafd8 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -458,7 +458,13 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se) if (group_rq && rt_rq_throttled(group_rq)) return; - list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); + if (rt_se->nr_cpus_allowed == 1) + list_add_tail(&rt_se->run_list, + array->xqueue + rt_se_prio(rt_se)); + else + list_add_tail(&rt_se->run_list, + array->squeue + rt_se_prio(rt_se)); + __set_bit(rt_se_prio(rt_se), array->bitmap); inc_rt_tasks(rt_se, rt_rq); @@ -470,7 +476,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) struct rt_prio_array *array = &rt_rq->active; list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) + if (list_empty(array->squeue + rt_se_prio(rt_se)) + && list_empty(array->xqueue + rt_se_prio(rt_se))) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); @@ -537,13 +544,19 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) /* * Put task to the end of the run list without the overhead of dequeue * followed by enqueue. + * + * Note: We always enqueue the task to the shared-queue, regardless of its + * previous position w.r.t. exclusive vs shared. This is so that exclusive RR + * tasks fairly round-robin with all tasks on the runqueue, not just other + * exclusive tasks. 
*/ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) { struct rt_prio_array *array = &rt_rq->active; - list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); + list_del_init(&rt_se->run_list); + list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se)); } static void requeue_task_rt(struct rq *rq, struct task_struct *p) @@ -601,13 +614,46 @@ static int select_task_rq_rt(struct task_struct *p, int sync) } #endif /* CONFIG_SMP */ +static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, + struct rt_rq *rt_rq); + /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) { - if (p->prio < rq->curr->prio) + if (p->prio < rq->curr->prio) { resched_task(rq->curr); + return; + } + +#ifdef CONFIG_SMP + /* + * If: + * + * - the newly woken task is of equal priority to the current task + * - the newly woken task is non-migratable while current is migratable + * - current will be preempted on the next reschedule + * + * we should check to see if current can readily move to a different + * cpu. If so, we will reschedule to allow the push logic to try + * to move current somewhere else, making room for our non-migratable + * task. + */ + if((p->prio == rq->curr->prio) + && p->rt.nr_cpus_allowed == 1 + && rq->curr->rt.nr_cpus_allowed != 1 + && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) { + cpumask_t mask; + + if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) + /* + * There appears to be other cpus that can accept + * current, so lets reschedule to try and push it away + */ + resched_task(rq->curr); + } +#endif } static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, @@ -621,8 +667,15 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, idx = sched_find_first_bit(array->bitmap); BUG_ON(idx >= MAX_RT_PRIO); - queue = array->queue + idx; - next = list_entry(queue->next, struct sched_rt_entity, run_list); + queue = array->xqueue + idx; + if (!list_empty(queue)) + next = list_entry(queue->next, struct sched_rt_entity, + run_list); + else { + queue = array->squeue + idx; + next = list_entry(queue->next, struct sched_rt_entity, + run_list); + } return next; } @@ -692,7 +745,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) continue; if (next && next->prio < idx) continue; - list_for_each_entry(rt_se, array->queue + idx, run_list) { + list_for_each_entry(rt_se, array->squeue + idx, run_list) { struct task_struct *p = rt_task_of(rt_se); if (pick_rt_task(rq, p, cpu)) { next = p; @@ -1146,6 +1199,14 @@ static void set_cpus_allowed_rt(struct task_struct *p, } update_rt_migration(rq); + + if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1)) + /* + * If either the new or old weight is a "1", we need + * to requeue to properly move between shared and + * exclusive queues. + */ + requeue_task_rt(rq, p); } p->cpus_allowed = *new_mask; From f333fdc9098b71e2687e4e9b6349fcb352960d66 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 12 May 2008 21:20:55 +0200 Subject: [PATCH 02/76] sched: make !hrtick faster it is safe to ignore timers and flags when the feature is disabled. 
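In essence, the patch samples the feature flag once per schedule() pass and skips the hrtick bookkeeping entirely when the feature is off; a condensed view of the resulting shape (simplified from the diff that follows):

    int hrtick = sched_feat(HRTICK);        /* read the feature flag once */
    ...
    if (hrtick)
            hrtick_clear(rq);
    /* ... rq-clock update, pick next task, context switch ... */
    if (hrtick)
            hrtick_set(rq);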
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7178b8c2351c..aa960b84b881 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4134,7 +4134,7 @@ asmlinkage void __sched schedule(void) struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; - int cpu; + int cpu, hrtick = sched_feat(HRTICK); need_resched: preempt_disable(); @@ -4149,7 +4149,8 @@ need_resched_nonpreemptible: schedule_debug(prev); - hrtick_clear(rq); + if (hrtick) + hrtick_clear(rq); /* * Do the rq-clock update outside the rq lock: @@ -4197,7 +4198,8 @@ need_resched_nonpreemptible: } else spin_unlock_irq(&rq->lock); - hrtick_set(rq); + if (hrtick) + hrtick_set(rq); if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; From 6e0534f278199f1e3dd1049b9bc19a7a5b87ada1 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 12 May 2008 21:21:01 +0200 Subject: [PATCH 03/76] sched: use a 2-d bitmap for searching lowest-pri CPU The current code uses a linear algorithm which causes scaling issues on larger SMP machines. This patch replaces that algorithm with a 2-dimensional bitmap to reduce latencies in the wake-up path. Signed-off-by: Gregory Haskins Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/Makefile | 1 + kernel/sched.c | 7 ++ kernel/sched_cpupri.c | 174 ++++++++++++++++++++++++++++++++++++++++++ kernel/sched_cpupri.h | 36 +++++++++ kernel/sched_rt.c | 98 +++++------------------- 5 files changed, 239 insertions(+), 77 deletions(-) create mode 100644 kernel/sched_cpupri.c create mode 100644 kernel/sched_cpupri.h diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9d..ecdd2d335639 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_SMP) += sched_cpupri.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/sched.c b/kernel/sched.c index aa960b84b881..8a1257b65560 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -74,6 +74,8 @@ #include #include +#include "sched_cpupri.h" + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -450,6 +452,9 @@ struct root_domain { */ cpumask_t rto_mask; atomic_t rto_count; +#ifdef CONFIG_SMP + struct cpupri cpupri; +#endif }; /* @@ -6392,6 +6397,8 @@ static void init_rootdomain(struct root_domain *rd) cpus_clear(rd->span); cpus_clear(rd->online); + + cpupri_init(&rd->cpupri); } static void init_defrootdomain(void) diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c new file mode 100644 index 000000000000..52154fefab7e --- /dev/null +++ b/kernel/sched_cpupri.c @@ -0,0 +1,174 @@ +/* + * kernel/sched_cpupri.c + * + * CPU priority management + * + * Copyright (C) 2007-2008 Novell + * + * Author: Gregory Haskins + * + * This code tracks the priority of each CPU so that global migration + * decisions are easy to calculate. Each CPU can be in a state as follows: + * + * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * + * going from the lowest priority to the highest. CPUs in the INVALID state + * are not eligible for routing.
The system maintains this state with + * a 2-dimensional bitmap (the first for priority class, the second for cpus + * in that class). Therefore a typical application without affinity + * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit + * searches). For tasks with affinity restrictions, the algorithm has a + * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * yields the worst case search is fairly contrived. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include "sched_cpupri.h" + +/* Convert between a 140-based task->prio and our 102-based cpupri */ +static int convert_prio(int prio) +{ + int cpupri; + + if (prio == CPUPRI_INVALID) + cpupri = CPUPRI_INVALID; + else if (prio == MAX_PRIO) + cpupri = CPUPRI_IDLE; + else if (prio >= MAX_RT_PRIO) + cpupri = CPUPRI_NORMAL; + else + cpupri = MAX_RT_PRIO - prio + 1; + + return cpupri; +} + +#define for_each_cpupri_active(array, idx) \ + for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ + idx < CPUPRI_NR_PRIORITIES; \ + idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) + +/** + * cpupri_find - find the best (lowest-pri) CPU in the system + * @cp: The cpupri context + * @p: The task + * @lowest_mask: A mask to fill in with selected CPUs + * + * Note: This function returns the recommended CPUs as calculated during the + * current invocation. By the time the call returns, the CPUs may have in + * fact changed priorities any number of times. While not ideal, it is not + * an issue of correctness since the normal rebalancer logic will correct + * any discrepancies created by racing against the uncertainty of the current + * priority configuration.
+ * + * Returns: (int)bool - CPUs were found + */ +int cpupri_find(struct cpupri *cp, struct task_struct *p, + cpumask_t *lowest_mask) +{ + int idx = 0; + int task_pri = convert_prio(p->prio); + + for_each_cpupri_active(cp->pri_active, idx) { + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + cpumask_t mask; + + if (idx >= task_pri) + break; + + cpus_and(mask, p->cpus_allowed, vec->mask); + + if (cpus_empty(mask)) + continue; + + *lowest_mask = mask; + return 1; + } + + return 0; +} + +/** + * cpupri_set - update the cpu priority setting + * @cp: The cpupri context + * @cpu: The target cpu + * @pri: The priority (INVALID-RT99) to assign to this CPU + * + * Note: Assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpupri_set(struct cpupri *cp, int cpu, int newpri) +{ + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + unsigned long flags; + + newpri = convert_prio(newpri); + + BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); + + if (newpri == oldpri) + return; + + /* + * If the cpu was currently mapped to a different value, we + * first need to unmap the old value + */ + if (likely(oldpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; + + spin_lock_irqsave(&vec->lock, flags); + + vec->count--; + if (!vec->count) + clear_bit(oldpri, cp->pri_active); + cpu_clear(cpu, vec->mask); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + if (likely(newpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; + + spin_lock_irqsave(&vec->lock, flags); + + cpu_set(cpu, vec->mask); + vec->count++; + if (vec->count == 1) + set_bit(newpri, cp->pri_active); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + *currpri = newpri; +} + +/** + * cpupri_init - initialize the cpupri structure + * @cp: The cpupri context + * + * Returns: (void) + */ +void cpupri_init(struct cpupri *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { + struct cpupri_vec *vec = &cp->pri_to_cpu[i]; + + spin_lock_init(&vec->lock); + vec->count = 0; + cpus_clear(vec->mask); + } + + for_each_possible_cpu(i) + cp->cpu_to_pri[i] = CPUPRI_INVALID; +} + + diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h new file mode 100644 index 000000000000..0b6a3d110fac --- /dev/null +++ b/kernel/sched_cpupri.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include + +#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO +#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + spinlock_t lock; + int count; + cpumask_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + long pri_active[CPUPRI_NR_PRI_WORDS]; + int cpu_to_pri[NR_CPUS]; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, + struct task_struct *p, cpumask_t *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +void cpupri_init(struct cpupri *cp); +#else +#define cpupri_set(cp, cpu, pri) do { } while (0) +#define cpupri_init() do { } while (0) +#endif + +#endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fefed39fafd8..44b06d75416e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -391,8 +391,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(rt_se_prio(rt_se))); rt_rq->rt_nr_running++; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - if 
(rt_se_prio(rt_se) < rt_rq->highest_prio) + if (rt_se_prio(rt_se) < rt_rq->highest_prio) { + struct rq *rq = rq_of_rt_rq(rt_rq); rt_rq->highest_prio = rt_se_prio(rt_se); + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se)); + } #endif #ifdef CONFIG_SMP if (rt_se->nr_cpus_allowed > 1) { @@ -416,6 +419,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static inline void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { +#ifdef CONFIG_SMP + int highest_prio = rt_rq->highest_prio; +#endif + WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running--; @@ -439,6 +446,11 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rq->rt.rt_nr_migratory--; } + if (rt_rq->highest_prio != highest_prio) { + struct rq *rq = rq_of_rt_rq(rt_rq); + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio); + } + update_rt_migration(rq_of_rt_rq(rt_rq)); #endif /* CONFIG_SMP */ #ifdef CONFIG_RT_GROUP_SCHED @@ -763,73 +775,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); -static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) -{ - int lowest_prio = -1; - int lowest_cpu = -1; - int count = 0; - int cpu; - - cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed); - - /* - * Scan each rq for the lowest prio. - */ - for_each_cpu_mask(cpu, *lowest_mask) { - struct rq *rq = cpu_rq(cpu); - - /* We look for lowest RT prio or non-rt CPU */ - if (rq->rt.highest_prio >= MAX_RT_PRIO) { - /* - * if we already found a low RT queue - * and now we found this non-rt queue - * clear the mask and set our bit. - * Otherwise just return the queue as is - * and the count==1 will cause the algorithm - * to use the first bit found. - */ - if (lowest_cpu != -1) { - cpus_clear(*lowest_mask); - cpu_set(rq->cpu, *lowest_mask); - } - return 1; - } - - /* no locking for now */ - if ((rq->rt.highest_prio > task->prio) - && (rq->rt.highest_prio >= lowest_prio)) { - if (rq->rt.highest_prio > lowest_prio) { - /* new low - clear old data */ - lowest_prio = rq->rt.highest_prio; - lowest_cpu = cpu; - count = 0; - } - count++; - } else - cpu_clear(cpu, *lowest_mask); - } - - /* - * Clear out all the set bits that represent - * runqueues that were of higher prio than - * the lowest_prio. - */ - if (lowest_cpu > 0) { - /* - * Perhaps we could add another cpumask op to - * zero out bits. Like cpu_zero_bits(cpumask, nrbits); - * Then that could be optimized to use memset and such. - */ - for_each_cpu_mask(cpu, *lowest_mask) { - if (cpu >= lowest_cpu) - break; - cpu_clear(cpu, *lowest_mask); - } - } - - return count; -} - static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) { int first; @@ -851,18 +796,13 @@ static int find_lowest_rq(struct task_struct *task) cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); - int count = find_lowest_cpus(task, lowest_mask); - if (!count) + if (task->rt.nr_cpus_allowed == 1) + return -1; /* No other targets possible */ + + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) return -1; /* No targets found */ - /* - * There is no sense in performing an optimal search if only one - * target is found. - */ - if (count == 1) - return first_cpu(*lowest_mask); - /* * At this point we have built a mask of cpus representing the * lowest priority tasks in the system. 
Now we want to elect @@ -1218,6 +1158,8 @@ static void join_domain_rt(struct rq *rq) { if (rq->rt.overloaded) rt_set_overload(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); } /* Assumes rq->lock is held */ @@ -1225,6 +1167,8 @@ static void leave_domain_rt(struct rq *rq) { if (rq->rt.overloaded) rt_clear_overload(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); } /* From 6d299f1b53b84e2665f402d9bcc494800aba6386 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 12 May 2008 21:21:14 +0200 Subject: [PATCH 04/76] sched: fix SCHED_OTHER balance iterator to include all tasks The current logic inadvertently skips the last task on the run-queue, resulting in missed balance opportunities. Signed-off-by: Gregory Haskins Signed-off-by: David Bahi CC: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched_fair.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 08ae848b71d4..1fe4c65a8170 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1275,23 +1275,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) struct task_struct *p = NULL; struct sched_entity *se; - if (next == &cfs_rq->tasks) - return NULL; - - /* Skip over entities that are not tasks */ - do { + while (next != &cfs_rq->tasks) { se = list_entry(next, struct sched_entity, group_node); next = next->next; - } while (next != &cfs_rq->tasks && !entity_is_task(se)); - if (next == &cfs_rq->tasks) - return NULL; + /* Skip over entities that are not tasks */ + if (entity_is_task(se)) { + p = task_of(se); + break; + } + } cfs_rq->balance_iterator = next; - - if (entity_is_task(se)) - p = task_of(se); - return p; } From d07355f5def74d060333563b36ab51b89fd44cdd Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 12 May 2008 21:21:15 +0200 Subject: [PATCH 05/76] sched: check for SD_SERIALIZE atomically in rebalance_domains() Nothing really serious here, mainly just a matter of nit-picking :-/ From: Dmitry Adamushko For CONFIG_SCHED_DEBUG && CONFIG_SYSCTL configs, sd->flags can be altered while being manipulated in rebalance_domains(). Let's do an atomic check. We rely here on the atomicity of read/write accesses for aligned words.
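Condensed, the change reads sd->flags once into a local so the trylock and the unlock cannot disagree if the flags word changes in between (simplified from the diff that follows):

    int need_serialize = sd->flags & SD_SERIALIZE;  /* single read of sd->flags */

    if (need_serialize) {
            if (!spin_trylock(&balancing))
                    goto out;
    }
    /* ... load_balance() ... */
    if (need_serialize)
            spin_unlock(&balancing);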
Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 8a1257b65560..90329f1f8941 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3668,6 +3668,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; + int need_serialize; cpumask_t tmp; for_each_domain(cpu, sd) { @@ -3685,8 +3686,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (interval > HZ*NR_CPUS/10) interval = HZ*NR_CPUS/10; + need_serialize = sd->flags & SD_SERIALIZE; - if (sd->flags & SD_SERIALIZE) { + if (need_serialize) { if (!spin_trylock(&balancing)) goto out; } @@ -3702,7 +3704,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } sd->last_balance = jiffies; } - if (sd->flags & SD_SERIALIZE) + if (need_serialize) spin_unlock(&balancing); out: if (time_after(next_balance, sd->last_balance + interval)) { From f7dcd80bbc8e7032443e6539ea1b830364f82200 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 May 2008 23:20:38 +0200 Subject: [PATCH 06/76] namespacecheck: fixes in kernel/sched.c Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 90329f1f8941..02a5eeedcb94 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1633,7 +1633,7 @@ inline int task_curr(const struct task_struct *p) } /* Used instead of source_load when we know the type == 0 */ -unsigned long weighted_cpuload(const int cpu) +static unsigned long weighted_cpuload(const int cpu) { return cpu_rq(cpu)->load.weight; } From 554ec22f075d46e4363520a407d2b7eeb5dfdd43 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:03 +0200 Subject: [PATCH 07/76] namespacecheck: more sched.c fixes [ Stephen Rothwell : build fix ] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ae0be3c62375..dc36c3aea018 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -134,7 +134,6 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); -extern unsigned long weighted_cpuload(const int cpu); struct seq_file; struct cfs_rq; @@ -823,23 +822,6 @@ extern int arch_reinit_sched_domains(void); #endif /* CONFIG_SMP */ -/* - * A runqueue laden with a single nice 0 task scores a weighted_cpuload of - * SCHED_LOAD_SCALE. 
This function returns 1 if any cpu is laden with a - * task of nice 0 or enough lower priority tasks to bring up the - * weighted_cpuload - */ -static inline int above_background_load(void) -{ - unsigned long cpu; - - for_each_online_cpu(cpu) { - if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) - return 1; - } - return 0; -} - struct io_context; /* See blkdev.h */ #define NGROUPS_SMALL 32 #define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) From 81d41d7ece23a1c3b4bcd1604026d3a06cc4dc79 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Sun, 11 May 2008 05:55:33 +0530 Subject: [PATCH 08/76] sched: fix defined-but-unused warning Fix this warning, which appears with !CONFIG_SMP: kernel/sched.c:1216: warning: `init_hrtick' defined but not used Signed-off-by: Rabin Vincent Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 02a5eeedcb94..f3faec52c5ab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1130,6 +1130,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } +#ifdef CONFIG_SMP static void hotplug_hrtick_disable(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -1185,6 +1186,7 @@ static void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } +#endif /* CONFIG_SMP */ static void init_rq_hrtick(struct rq *rq) { From c7aceaba042702538b23cf4e0de1b2891ad8e671 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Thu, 15 May 2008 12:09:15 +0100 Subject: [PATCH 09/76] sched: reorder task_struct to reduce padding on 64bit builds This patch removes 24 bytes of padding and allows 1 extra object per slab on my fedora based config. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index dc36c3aea018..ea2857b99596 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1021,6 +1021,7 @@ struct task_struct { #endif int prio, static_prio, normal_prio; + unsigned int rt_priority; const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; @@ -1104,7 +1105,6 @@ struct task_struct { int __user *set_child_tid; /* CLONE_CHILD_SETTID */ int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ - unsigned int rt_priority; cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; cputime_t prev_utime, prev_stime; @@ -1123,12 +1123,12 @@ struct task_struct { gid_t gid,egid,sgid,fsgid; struct group_info *group_info; kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; - unsigned securebits; struct user_struct *user; + unsigned securebits; #ifdef CONFIG_KEYS + unsigned char jit_keyring; /* default keyring to attach requested keys to */ struct key *request_key_auth; /* assumed request_key authority */ struct key *thread_keyring; /* keyring private to this thread */ - unsigned char jit_keyring; /* default keyring to attach requested keys to */ #endif char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock @@ -1215,8 +1215,8 @@ struct task_struct { # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; - struct held_lock held_locks[MAX_LOCK_DEPTH]; unsigned int lockdep_recursion; + struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif /* journalling filesystem info */ @@ -1244,10 +1244,6 @@ struct task_struct { u64 acct_vm_mem1; /* accumulated virtual memory usage */ 
cputime_t acct_stimexpd;/* stime since last update */ #endif -#ifdef CONFIG_NUMA - struct mempolicy *mempolicy; - short il_next; -#endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; int cpuset_mems_generation; @@ -1266,6 +1262,10 @@ struct task_struct { #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; +#endif +#ifdef CONFIG_NUMA + struct mempolicy *mempolicy; + short il_next; #endif atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; From e21f5b153b9b4a6775d7d41964e372e13a9178ab Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 23 May 2008 09:05:58 -0700 Subject: [PATCH 10/76] sched: print module list in the "scheduling while atomic" warning For the normal WARN_ON() etc we added a print-the-modules-list already, which is very useful to figure out candidates for certain types of bugs. This patch adds the same print to the "scheduling while atomic" BUG warning, for the same reason: when we get here it's very useful to see which modules are loaded, to narrow down the candidate code list. Signed-off-by: Arjan van de Ven Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched.c b/kernel/sched.c index f3faec52c5ab..84a360670b9d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4070,6 +4070,7 @@ static noinline void __schedule_bug(struct task_struct *prev) prev->comm, prev->pid, preempt_count()); debug_show_held_locks(prev); + print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); From 6d6bc0ad867c46896d0994bb039e7550ecb9b51d Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Fri, 30 May 2008 14:23:45 +0200 Subject: [PATCH 11/76] sched: add comments for ifdefs in sched.c make sched.c easier to read. Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 76 +++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 84a360670b9d..ef4e25604bbe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -292,15 +292,15 @@ struct task_group root_task_group; static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; -#endif -#else +#endif /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ #define root_task_group init_task_group -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -310,9 +310,9 @@ static DEFINE_SPINLOCK(task_group_lock); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else +#else /* !CONFIG_USER_SCHED */ # define INIT_TASK_GROUP_LOAD NICE_0_LOAD -#endif +#endif /* CONFIG_USER_SCHED */ /* * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems. 
@@ -1316,15 +1316,15 @@ void wake_up_idle_cpu(int cpu) if (!tsk_is_polling(rq->idle)) smp_send_reschedule(cpu); } -#endif +#endif /* CONFIG_NO_HZ */ -#else +#else /* !CONFIG_SMP */ static void __resched_task(struct task_struct *p, int tif_bit) { assert_spin_locked(&task_rq(p)->lock); set_tsk_thread_flag(p, tif_bit); } -#endif +#endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 # define WMULT_CONST (~0UL) @@ -2129,7 +2129,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) } } } -#endif +#endif /* CONFIG_SCHEDSTATS */ out_activate: #endif /* CONFIG_SMP */ @@ -2329,7 +2329,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, notifier->ops->sched_out(notifier, next); } -#else +#else /* !CONFIG_PREEMPT_NOTIFIERS */ static void fire_sched_in_preempt_notifiers(struct task_struct *curr) { @@ -2341,7 +2341,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, { } -#endif +#endif /* CONFIG_PREEMPT_NOTIFIERS */ /** * prepare_task_switch - prepare to switch tasks @@ -6300,9 +6300,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } kfree(groupmask); } -#else +#else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) -#endif +#endif /* CONFIG_SCHED_DEBUG */ static int sd_degenerate(struct sched_domain *sd) { @@ -6598,7 +6598,7 @@ static void sched_domain_node_span(int node, cpumask_t *span) cpus_or(*span, *span, *nodemask); } } -#endif +#endif /* CONFIG_NUMA */ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; @@ -6617,7 +6617,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, *sg = &per_cpu(sched_group_cpus, cpu); return cpu; } -#endif +#endif /* CONFIG_SCHED_SMT */ /* * multi-core sched-domains: @@ -6625,7 +6625,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_core); -#endif +#endif /* CONFIG_SCHED_MC */ #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int @@ -6727,7 +6727,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) sg = sg->next; } while (sg != group_head); } -#endif +#endif /* CONFIG_NUMA */ #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ @@ -6764,11 +6764,11 @@ next_sg: sched_group_nodes_bycpu[cpu] = NULL; } } -#else +#else /* !CONFIG_NUMA */ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { } -#endif +#endif /* CONFIG_NUMA */ /* * Initialize sched groups cpu_power. @@ -7459,7 +7459,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) #endif return err; } -#endif +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ /* * Force a reinitialization of the sched domains hierarchy. 
The domains @@ -7677,8 +7677,8 @@ void __init sched_init(void) root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); -#endif -#endif +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED init_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -7692,8 +7692,8 @@ void __init sched_init(void) root_task_group.rt_rq = (struct rt_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); -#endif -#endif +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_RT_GROUP_SCHED */ } #ifdef CONFIG_SMP @@ -7709,8 +7709,8 @@ void __init sched_init(void) #ifdef CONFIG_USER_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(), RUNTIME_INF); -#endif -#endif +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); @@ -7720,8 +7720,8 @@ void __init sched_init(void) INIT_LIST_HEAD(&root_task_group.children); init_task_group.parent = &root_task_group; list_add(&init_task_group.siblings, &root_task_group.children); -#endif -#endif +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_GROUP_SCHED */ for_each_possible_cpu(i) { struct rq *rq; @@ -8040,7 +8040,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) { } @@ -8058,7 +8058,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu) { } static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { } -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static void free_rt_sched_group(struct task_group *tg) @@ -8129,7 +8129,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) { list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); } -#else +#else /* !CONFIG_RT_GROUP_SCHED */ static inline void free_rt_sched_group(struct task_group *tg) { } @@ -8147,7 +8147,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu) { } static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) { } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_GROUP_SCHED static void free_sched_group(struct task_group *tg) @@ -8258,7 +8258,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } -#endif +#endif /* CONFIG_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED static void set_se_shares(struct sched_entity *se, unsigned long shares) @@ -8508,7 +8508,7 @@ static int sched_rt_global_constraints(void) return ret; } -#else +#else /* !CONFIG_RT_GROUP_SCHED */ static int sched_rt_global_constraints(void) { unsigned long flags; @@ -8526,7 +8526,7 @@ static int sched_rt_global_constraints(void) return 0; } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ int sched_rt_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, @@ -8634,7 +8634,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) return (u64) tg->shares; } -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, @@ -8658,7 +8658,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) { return sched_group_rt_period(cgroup_tg(cgrp)); } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ static struct cftype cpu_files[] = { #ifdef
CONFIG_FAIR_GROUP_SCHED From e29c98d12b3f02d6ac711c60b4a5f8a46d1cf19b Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Thu, 29 May 2008 12:36:18 +0530 Subject: [PATCH 12/76] sched: update the sched-domains debug documentation SCHED_DOMAIN_DEBUG mentioned in the Documentation for sched-domains for enabling sched-domains debugging doesn't exist anymore. Update the documentation to reflect the correct way of enabling sched-domain debugging. Signed-off-by: Gautham R Shenoy Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-domains.txt | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt index a9e990ab980f..373ceacc367e 100644 --- a/Documentation/scheduler/sched-domains.txt +++ b/Documentation/scheduler/sched-domains.txt @@ -61,10 +61,7 @@ builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your arch_init_sched_domains function. This function will attach domains to all CPUs using cpu_attach_domain. -Implementors should change the line -#undef SCHED_DOMAIN_DEBUG -to -#define SCHED_DOMAIN_DEBUG -in kernel/sched.c as this enables an error checking parse of the sched domains +The sched-domains debugging infrastructure can be enabled by enabling +CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains which should catch most possible errors (described above). It also prints out the domain structure in a visual format. From 099f98c8a1f13501a98afbfff4756395a610581c Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Thu, 29 May 2008 20:56:32 +0530 Subject: [PATCH 13/76] sched: print the sd->level in sched_domain_debug code While printing out the visual representation of the sched-domains, print the level (MC, SMT, CPU, NODE, ... ) of each of the sched_domains. Credit: Peter Zijlstra Signed-off-by: Gautham R Shenoy Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index ef4e25604bbe..dc0be113f41d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6197,6 +6197,28 @@ void __init migration_init(void) #ifdef CONFIG_SCHED_DEBUG +static inline const char *sd_level_to_string(enum sched_domain_level lvl) +{ + switch (lvl) { + case SD_LV_NONE: + return "NONE"; + case SD_LV_SIBLING: + return "SIBLING"; + case SD_LV_MC: + return "MC"; + case SD_LV_CPU: + return "CPU"; + case SD_LV_NODE: + return "NODE"; + case SD_LV_ALLNODES: + return "ALLNODES"; + case SD_LV_MAX: + return "MAX"; + + } + return "MAX"; +} + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_t *groupmask) { @@ -6216,7 +6238,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %s\n", str); + printk(KERN_CONT "span %s level %s\n", + str, sd_level_to_string(sd->level)); if (!cpu_isset(cpu, sd->span)) { printk(KERN_ERR "ERROR: domain->span does not contain " From 1f11eb6a8bc92536d9e93ead48fa3ffbd1478571 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 4 Jun 2008 15:04:05 -0400 Subject: [PATCH 14/76] sched: fix cpupri hotplug support The RT folks over at RedHat found an issue w.r.t. hotplug support which was traced to problems with the cpupri infrastructure in the scheduler: https://bugzilla.redhat.com/show_bug.cgi?id=449676 This bug affects 23-rt12+, 24-rtX, 25-rtX, and sched-devel. 
This patch applies to 25.4-rt4, though it should trivially apply to most cpupri enabled kernels mentioned above. It turned out that the issue was that offline cpus could get inadvertently registered with cpupri so that they were erroneously selected during migration decisions. The end result would be an OOPS as the offline cpu had tasks routed to it. This patch generalizes the old join/leave domain interface into an online/offline interface, and adjusts the root-domain/hotplug code to utilize it. I was able to easily reproduce the issue prior to this patch, and am no longer able to reproduce it after this patch. I can offline cpus indefinitely and everything seems to be in working order. Thanks to Arnaldo (acme), Thomas, and Peter for doing the legwork to point me in the right direction. Also thank you to Peter for reviewing the early iterations of this patch. Signed-off-by: Gregory Haskins Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 4 ++-- kernel/sched.c | 54 ++++++++++++++++++++++++++++++++----------- kernel/sched_rt.c | 24 ++++++++++++++----- 3 files changed, 60 insertions(+), 22 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ea2857b99596..d25acf600a32 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -903,8 +903,8 @@ struct sched_class { void (*set_cpus_allowed)(struct task_struct *p, const cpumask_t *newmask); - void (*join_domain)(struct rq *rq); - void (*leave_domain)(struct rq *rq); + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); void (*switched_from) (struct rq *this_rq, struct task_struct *task, int running); diff --git a/kernel/sched.c b/kernel/sched.c index dc0be113f41d..f0ed81b71282 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -529,6 +529,7 @@ struct rq { int push_cpu; /* cpu of this runqueue: */ int cpu; + int online; struct task_struct *migration_thread; struct list_head migration_queue; @@ -1498,6 +1499,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #endif #define sched_class_highest (&rt_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) static inline void inc_load(struct rq *rq, const struct task_struct *p) { @@ -6065,6 +6068,36 @@ static void unregister_sched_domain_sysctl(void) } #endif +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + const struct sched_class *class; + + cpu_set(rq->cpu, rq->rd->online); + rq->online = 1; + + for_each_class(class) { + if (class->rq_online) + class->rq_online(rq); + } + } +} + +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + const struct sched_class *class; + + for_each_class(class) { + if (class->rq_offline) + class->rq_offline(rq); + } + + cpu_clear(rq->cpu, rq->rd->online); + rq->online = 0; + } +} + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU.
@@ -6102,7 +6135,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_set(cpu, rq->rd->online); + + set_rq_online(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6163,7 +6197,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_clear(cpu, rq->rd->online); + set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6385,20 +6419,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; - const struct sched_class *class; spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { struct root_domain *old_rd = rq->rd; - for (class = sched_class_highest; class; class = class->next) { - if (class->leave_domain) - class->leave_domain(rq); - } + if (cpu_isset(rq->cpu, old_rd->online)) + set_rq_offline(rq); cpu_clear(rq->cpu, old_rd->span); - cpu_clear(rq->cpu, old_rd->online); if (atomic_dec_and_test(&old_rd->refcount)) kfree(old_rd); @@ -6409,12 +6439,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) cpu_set(rq->cpu, rd->span); if (cpu_isset(rq->cpu, cpu_online_map)) - cpu_set(rq->cpu, rd->online); - - for (class = sched_class_highest; class; class = class->next) { - if (class->join_domain) - class->join_domain(rq); - } + set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } @@ -7824,6 +7849,7 @@ void __init sched_init(void) rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; + rq->online = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 44b06d75416e..e4821593d4de 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq) static inline void rt_set_overload(struct rq *rq) { + if (!rq->online) + return; + cpu_set(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set @@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq) static inline void rt_clear_overload(struct rq *rq) { + if (!rq->online) + return; + /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); cpu_clear(rq->cpu, rq->rd->rto_mask); @@ -394,7 +400,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_se_prio(rt_se) < rt_rq->highest_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); rt_rq->highest_prio = rt_se_prio(rt_se); - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se)); + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + rt_se_prio(rt_se)); } #endif #ifdef CONFIG_SMP @@ -448,7 +457,10 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_rq->highest_prio != highest_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio); + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + rt_rq->highest_prio); } update_rt_migration(rq_of_rt_rq(rt_rq)); @@ -1154,7 +1166,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, } /* Assumes rq->lock is held */ -static void join_domain_rt(struct rq *rq) +static void rq_online_rt(struct rq *rq) { if (rq->rt.overloaded) rt_set_overload(rq); @@ -1163,7 +1175,7 @@ static void join_domain_rt(struct rq *rq) } /* Assumes rq->lock is held */ -static 
void leave_domain_rt(struct rq *rq) +static void rq_offline_rt(struct rq *rq) { if (rq->rt.overloaded) rt_clear_overload(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); } /* @@ -1331,8 +1343,8 @@ static const struct sched_class rt_sched_class = { .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, .set_cpus_allowed = set_cpus_allowed_rt, - .join_domain = join_domain_rt, - .leave_domain = leave_domain_rt, + .rq_online = rq_online_rt, + .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, From 709d4b0c60f990bccf3e10ba7c6da407ad65c97f Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 4 Jun 2008 15:04:10 -0400 Subject: [PATCH 15/76] sched: fix cpupri priocount A rounding error was pointed out by Peter Zijlstra which would result in the structure holding the priorities being off by one. Signed-off-by: Gregory Haskins Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Signed-off-by: Thomas Gleixner --- kernel/sched_cpupri.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 0b6a3d110fac..6b38355e2676 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -4,7 +4,7 @@ #include #define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO -#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG +#define CPUPRI_NR_PRI_WORDS (CPUPRI_NR_PRIORITIES + BITS_PER_LONG/2)/BITS_PER_LONG #define CPUPRI_INVALID -1 #define CPUPRI_IDLE 0 From e539d8fcd11af811db70707d47ea436d5621d0da Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 5 Jun 2008 10:28:00 +0200 Subject: [PATCH 16/76] sched: fix the cpuprio count really Peter pointed out that the last version of the "fix" was still off by one under certain circumstances. Use BITS_TO_LONGS instead to get an accurate result.
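For concreteness, a worked example of the sizing arithmetic (assuming MAX_RT_PRIO == 100 as in this tree, so CPUPRI_NR_PRIORITIES == 102):

    /* On a 32-bit build (BITS_PER_LONG == 32), 102 bits need 4 longs:
     *
     *   round-to-nearest: (102 + 32/2) / 32 = 118/32 = 3 longs ->  96 bits, too small
     *   BITS_TO_LONGS:    (102 + 32-1) / 32 = 133/32 = 4 longs -> 128 bits, correct
     *
     * BITS_TO_LONGS(nr) is DIV_ROUND_UP(nr, BITS_PER_LONG): it always rounds
     * up, whereas adding BITS_PER_LONG/2 merely rounds to nearest and
     * under-allocates whenever nr % BITS_PER_LONG is nonzero but less than
     * BITS_PER_LONG/2.
     */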
Signed-off-by: Thomas Gleixner --- kernel/sched_cpupri.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 6b38355e2676..f25811b0f931 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -3,8 +3,8 @@ #include -#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO -#define CPUPRI_NR_PRI_WORDS (CPUPRI_NR_PRIORITIES + BITS_PER_LONG/2)/BITS_PER_LONG +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) +#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) #define CPUPRI_INVALID -1 #define CPUPRI_IDLE 0 From 1100ac91b6af02d8639d518fad5b434b1bf44ed6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Jun 2008 12:25:37 +0200 Subject: [PATCH 17/76] sched: fix cpuprio build bug this patch did not build on !SMP: kernel/sched_rt.c: In function 'inc_rt_tasks': kernel/sched_rt.c:404: error: 'struct rq' has no member named 'online' Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e4821593d4de..eaa606071d51 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -399,16 +399,19 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_se_prio(rt_se) < rt_rq->highest_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); - rt_rq->highest_prio = rt_se_prio(rt_se); + rt_rq->highest_prio = rt_se_prio(rt_se); +#ifdef CONFIG_SMP if (rq->online) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se)); +#endif } #endif #ifdef CONFIG_SMP if (rt_se->nr_cpus_allowed > 1) { struct rq *rq = rq_of_rt_rq(rt_rq); + rq->rt.rt_nr_migratory++; } From 5c8e1ed1d204a6770ca2854cd3b3597070fe7e5a Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 29 May 2008 11:17:01 -0700 Subject: [PATCH 18/76] sched: CPU hotplug events must not destroy scheduler domains created by the cpusets The first issue is not related to the cpusets. We're simply leaking doms_cur. It's allocated in arch_init_sched_domains() which is called for every hotplug event. So we just keep reallocating doms_cur without freeing it. I introduced a free_sched_domains() function that cleans things up. The second issue is that sched domains created by the cpusets are completely destroyed by the CPU hotplug events. For all CPU hotplug events the scheduler attaches all CPUs to the NULL domain and then puts them all into a single domain, thereby destroying the domains created by the cpusets (partition_sched_domains). The solution is simple: when cpusets are enabled, the scheduler should not create the default domain and should instead let the cpusets do that. Which is exactly what the patch does. Signed-off-by: Max Krasnyansky Cc: pj@sgi.com Cc: menage@google.com Cc: rostedt@goodmis.org Cc: mingo@elte.hu Acked-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/cpuset.c | 6 ++++++ kernel/sched.c | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 86ea9e34e326..6090d18b58a9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1886,6 +1886,12 @@ static void common_cpu_mem_hotplug_unplug(void) top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; scan_for_empty_cpusets(&top_cpuset); + /* + * Scheduler destroys domains on hotplug events. + * Rebuild them based on the current settings.
+ */ + rebuild_sched_domains(); + cgroup_unlock(); } diff --git a/kernel/sched.c b/kernel/sched.c index f0ed81b71282..1ddb0a8c7976 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7292,6 +7292,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void) { } +/* + * Free current domain masks. + * Called after all cpus are attached to NULL domain. + */ +static void free_sched_domains(void) +{ + ndoms_cur = 0; + if (doms_cur != &fallback_doms) + kfree(doms_cur); + doms_cur = &fallback_doms; +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -7439,6 +7451,7 @@ int arch_reinit_sched_domains(void) get_online_cpus(); mutex_lock(&sched_domains_mutex); detach_destroy_domains(&cpu_online_map); + free_sched_domains(); err = arch_init_sched_domains(&cpu_online_map); mutex_unlock(&sched_domains_mutex); put_online_cpus(); @@ -7524,6 +7537,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); + free_sched_domains(); return NOTIFY_OK; case CPU_UP_CANCELED: @@ -7542,8 +7556,16 @@ static int update_sched_domains(struct notifier_block *nfb, return NOTIFY_DONE; } +#ifndef CONFIG_CPUSETS + /* + * Create default domain partitioning if cpusets are disabled. + * Otherwise we let cpusets rebuild the domains based on the + * current setup. + */ + /* The hotplug lock is already held by cpu_up/cpu_down */ arch_init_sched_domains(&cpu_online_map); +#endif return NOTIFY_OK; } From 68f4f1ec08e3d95730a2693b99df8260aa0d06ae Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 29 May 2008 11:17:02 -0700 Subject: [PATCH 19/76] sched: Move cpu masks from kernel/sched.c into kernel/cpu.c kernel/cpu.c seems a more logical place for those maps since they do not really have much to do with the scheduler these days. kernel/cpu.c is now built for the UP kernel too, but it does not affect the size of the kernel sections.
$ size vmlinux

before:
   text    data     bss     dec     hex filename
3313797  307060  310352 3931209  3bfc49 vmlinux

after:
   text    data     bss     dec     hex filename
3313797  307060  310352 3931209  3bfc49 vmlinux

Signed-off-by: Max Krasnyansky Cc: pj@sgi.com Cc: menage@google.com Cc: rostedt@goodmis.org Cc: mingo@elte.hu Acked-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/Makefile | 4 ++-- kernel/cpu.c | 24 ++++++++++++++++++++++ kernel/sched.c | 18 ------------------ 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index ecdd2d335639..6c55301112e0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -3,7 +3,7 @@ # obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ - exit.o itimer.o time.o softirq.o resource.o \ + cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ @@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -obj-$(CONFIG_SMP) += cpu.o spinlock.o +obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o diff --git a/kernel/cpu.c b/kernel/cpu.c index c77bc3a1c722..b11f06dc149a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -15,6 +15,28 @@ #include #include +/* + * Represents all cpu's present in the system + * In systems capable of hotplug, this map could dynamically grow + * as new cpu's are detected in the system via any platform specific + * method, such as ACPI for e.g. + */ +cpumask_t cpu_present_map __read_mostly; +EXPORT_SYMBOL(cpu_present_map); + +#ifndef CONFIG_SMP + +/* + * Represents all cpu's that are currently online. + */ +cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_online_map); + +cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_possible_map); + +#else /* CONFIG_SMP */ + /* Serializes the updates to cpu_online_map, cpu_present_map */ static DEFINE_MUTEX(cpu_add_remove_lock); @@ -403,3 +425,5 @@ out: cpu_maps_update_done(); } #endif /* CONFIG_PM_SLEEP_SMP */ + +#endif /* CONFIG_SMP */ diff --git a/kernel/sched.c b/kernel/sched.c index 1ddb0a8c7976..f36f549e5744 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5080,24 +5080,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, return sched_setaffinity(pid, &new_mask); } -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. - */ - -cpumask_t cpu_present_map __read_mostly; -EXPORT_SYMBOL(cpu_present_map); - -#ifndef CONFIG_SMP -cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_online_map); - -cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_possible_map); -#endif - long sched_getaffinity(pid_t pid, cpumask_t *mask) { struct task_struct *p; From e958b3600484533ff801920290468adc8135f89d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Jun 2008 23:22:32 +0200 Subject: [PATCH 20/76] sched: move weighted_cpuload into #ifdef CONFIG_SMP section weighted_cpuload is only used on SMP. Move it into the CONFIG_SMP section.
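Condensed from the diff that follows, the end state: on !SMP builds the helper has no remaining callers, so a static definition outside the #ifdef would draw a defined-but-unused warning (the same class of problem patch 08 fixed for init_hrtick), and the definition therefore moves inside the existing CONFIG_SMP block:

    #ifdef CONFIG_SMP
    /* Used instead of source_load when we know the type == 0 */
    static unsigned long weighted_cpuload(const int cpu)
    {
            return cpu_rq(cpu)->load.weight;
    }
    ...
    #endif /* CONFIG_SMP */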
Signed-off-by: Thomas Gleixner --- kernel/sched.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index f36f549e5744..727bdef76161 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1637,12 +1637,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { set_task_rq(p, cpu); @@ -1671,6 +1665,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, #ifdef CONFIG_SMP +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + /* * Is this task likely cache-hot: */ From 7def2be1dc679984f4c4fb3ef19a8a081b2454ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Jun 2008 14:49:58 +0200 Subject: [PATCH 21/76] sched: fix hotplug cpus on ia64 Cliff Wickman wrote: > I built an ia64 kernel from Andrew's tree (2.6.26-rc2-mm1) > and get a very predictable hotplug cpu problem. > billberry1:/tmp/cpw # ./dis > disabled cpu 17 > enabled cpu 17 > billberry1:/tmp/cpw # ./dis > disabled cpu 17 > enabled cpu 17 > billberry1:/tmp/cpw # ./dis > > The script that disables the cpu always hangs (unkillable) > on the 3rd attempt. > > And a bit further: > The kstopmachine thread always sits on the run queue (real time) for about > 30 minutes before running. this fix solves some (but not all) issues between CPU hotplug and RT bandwidth throttling. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 +++++-- kernel/sched_rt.c | 109 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 115 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 727bdef76161..e9c24a128655 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7513,21 +7513,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) { + int cpu = (int)(long)hcpu; + switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: + disable_runtime(cpu_rq(cpu)); + /* fall-through */ + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); free_sched_domains(); return NOTIFY_OK; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: + enable_runtime(cpu_rq(cpu)); + /* fall-through */ + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index eaa606071d51..8ae3416e0bb4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -286,6 +286,9 @@ static int balance_runtime(struct rt_rq *rt_rq) continue; spin_lock(&iter->rt_runtime_lock); + if (iter->rt_runtime == RUNTIME_INF) + goto next; + diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { do_div(diff, weight); @@ -299,12 +302,105 @@ static int balance_runtime(struct rt_rq *rt_rq) break; } } +next: spin_unlock(&iter->rt_runtime_lock); } spin_unlock(&rt_b->rt_runtime_lock); return more; } + +static void __disable_runtime(struct rq *rq) +{ + struct root_domain *rd = rq->rd; + struct rt_rq *rt_rq; + + if 
(unlikely(!scheduler_running)) + return; + + for_each_leaf_rt_rq(rt_rq, rq) { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + s64 want; + int i; + + spin_lock(&rt_b->rt_runtime_lock); + spin_lock(&rt_rq->rt_runtime_lock); + if (rt_rq->rt_runtime == RUNTIME_INF || + rt_rq->rt_runtime == rt_b->rt_runtime) + goto balanced; + spin_unlock(&rt_rq->rt_runtime_lock); + + want = rt_b->rt_runtime - rt_rq->rt_runtime; + + for_each_cpu_mask(i, rd->span) { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); + s64 diff; + + if (iter == rt_rq) + continue; + + spin_lock(&iter->rt_runtime_lock); + if (want > 0) { + diff = min_t(s64, iter->rt_runtime, want); + iter->rt_runtime -= diff; + want -= diff; + } else { + iter->rt_runtime -= want; + want -= want; + } + spin_unlock(&iter->rt_runtime_lock); + + if (!want) + break; + } + + spin_lock(&rt_rq->rt_runtime_lock); + BUG_ON(want); +balanced: + rt_rq->rt_runtime = RUNTIME_INF; + spin_unlock(&rt_rq->rt_runtime_lock); + spin_unlock(&rt_b->rt_runtime_lock); + } +} + +static void disable_runtime(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __disable_runtime(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} + +static void __enable_runtime(struct rq *rq) +{ + struct root_domain *rd = rq->rd; + struct rt_rq *rt_rq; + + if (unlikely(!scheduler_running)) + return; + + for_each_leaf_rt_rq(rt_rq, rq) { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + spin_lock(&rt_b->rt_runtime_lock); + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_b->rt_runtime; + rt_rq->rt_time = 0; + spin_unlock(&rt_rq->rt_runtime_lock); + spin_unlock(&rt_b->rt_runtime_lock); + } +} + +static void enable_runtime(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __enable_runtime(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} + #endif static inline int rt_se_prio(struct sched_rt_entity *rt_se) @@ -334,14 +430,13 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) #ifdef CONFIG_SMP if (rt_rq->rt_time > runtime) { - int more; - spin_unlock(&rt_rq->rt_runtime_lock); - more = balance_runtime(rt_rq); + balance_runtime(rt_rq); spin_lock(&rt_rq->rt_runtime_lock); - if (more) - runtime = sched_rt_runtime(rt_rq); + runtime = sched_rt_runtime(rt_rq); + if (runtime == RUNTIME_INF) + return 0; } #endif @@ -1174,6 +1269,8 @@ static void rq_online_rt(struct rq *rq) if (rq->rt.overloaded) rt_set_overload(rq); + __enable_runtime(rq); + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); } @@ -1183,6 +1280,8 @@ static void rq_offline_rt(struct rq *rq) if (rq->rt.overloaded) rt_clear_overload(rq); + __disable_runtime(rq); + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); } From 9985b0bab332289f14837eff3c6e0bcc658b58f7 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Jun 2008 12:57:11 -0700 Subject: [PATCH 22/76] sched: prevent bound kthreads from changing cpus_allowed Kthreads that have called kthread_bind() are bound to specific cpus, so other tasks should not be able to change their cpus_allowed from under them. Otherwise, it is possible to move kthreads, such as the migration or software watchdog threads, so they are not allowed access to the cpu they work on. 
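A minimal usage sketch of the semantics this enforces (not part of the patch; worker_fn and cpu are hypothetical, error handling is elided, and more than one cpu is assumed online):

	struct task_struct *k;

	k = kthread_create(worker_fn, NULL, "worker/%d", cpu);
	kthread_bind(k, cpu);		/* sets PF_THREAD_BOUND */
	wake_up_process(k);

	/*
	 * A later attempt by another task to widen the bound thread's
	 * affinity is now rejected:
	 */
	if (set_cpus_allowed_ptr(k, &cpu_online_map) == -EINVAL)
		/* k keeps running only on its bound cpu */;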
Cc: Peter Zijlstra Cc: Paul Menage Cc: Paul Jackson Signed-off-by: David Rientjes Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/cpuset.c | 14 +++++++++++++- kernel/kthread.c | 1 + kernel/sched.c | 6 ++++++ 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d25acf600a32..2db1485f865d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1486,6 +1486,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ +#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6090d18b58a9..b84354f4de36 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1190,6 +1190,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; + if (tsk->flags & PF_THREAD_BOUND) { + cpumask_t mask; + + mutex_lock(&callback_mutex); + mask = cs->cpus_allowed; + mutex_unlock(&callback_mutex); + if (!cpus_equal(tsk->cpus_allowed, mask)) + return -EINVAL; + } return security_task_setscheduler(tsk, 0, NULL); } @@ -1203,11 +1212,14 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); + int err; mutex_lock(&callback_mutex); guarantee_online_cpus(cs, &cpus); - set_cpus_allowed_ptr(tsk, &cpus); + err = set_cpus_allowed_ptr(tsk, &cpus); mutex_unlock(&callback_mutex); + if (err) + return; from = oldcs->mems_allowed; to = cs->mems_allowed; diff --git a/kernel/kthread.c b/kernel/kthread.c index bd1b9ea024e1..97747cdd37c9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; + k->flags |= PF_THREAD_BOUND; } EXPORT_SYMBOL(kthread_bind); diff --git a/kernel/sched.c b/kernel/sched.c index e9c24a128655..164fe7fe0d89 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5563,6 +5563,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) goto out; } + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && + !cpus_equal(p->cpus_allowed, *new_mask))) { + ret = -EINVAL; + goto out; + } + if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { From 6492c7f83e88a3a9521793b6934d882b97afe287 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 8 Jun 2008 09:27:13 +0200 Subject: [PATCH 23/76] sched: trivial sched_features cleanup Remove unused debug/tuning features. 
Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 1c7283cb9581..62b39ca92ebd 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -6,5 +6,3 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1) SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) -SCHED_FEAT(NORMALIZED_SLEEPER, 1) -SCHED_FEAT(DEADLINE, 1) From e9886ca3a93d7d041d3de8e5acebe213da777d59 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 9 Jun 2008 17:12:24 +0900 Subject: [PATCH 24/76] sched: kill off dead cfs_rq_set_shares() Building with CONFIG_FAIR_GROUP_SCHED=y on UP results in an unused cfs_rq_set_shares() reference. As nothing is using this dummy function in the first place, just kill it off. Signed-off-by: Paul Mundt Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 164fe7fe0d89..07d5472dee99 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1480,16 +1480,8 @@ static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); -#else /* CONFIG_SMP */ - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ -} #endif -#endif /* CONFIG_SMP */ - #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" From f7d62364b2cef85cbcd4feffdd3632ef7c3b61c2 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 10 Jun 2008 20:29:19 -0700 Subject: [PATCH 25/76] sched: fix typo in Documentation/scheduler/sched-rt-group.txt Fix minor typos. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-rt-group.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 14f901f639ee..3ef339f491e0 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt @@ -51,9 +51,9 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s = 0.00015s. So this group can be scheduled with a period of 0.005s and a run time of 0.00015s. -The remaining CPU time will be used for user input and other tass. Because +The remaining CPU time will be used for user input and other tasks. Because realtime tasks have explicitly allocated the CPU time they need to perform -their tasks, buffer underruns in the graphocs or audio can be eliminated. +their tasks, buffer underruns in the graphics or audio can be eliminated. NOTE: the above example is not fully implemented as of yet (2.6.25). We still lack an EDF scheduler to make non-uniform periods usable. From 20b6331bfed1f07ba1e5006889a5d64adc53615e Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Wed, 11 Jun 2008 00:58:30 +0200 Subject: [PATCH 26/76] sched: rework of "prioritize non-migratable tasks over migratable ones" regarding this commit: 45c01e824991b2dd0a332e19efc4901acb31209f I think we can do it simpler. Please take a look at the patch below. Instead of having 2 separate arrays (which is + ~800 bytes on x86_32 and twice so on x86_64), let's add "exclusive" (the ones that are bound to this CPU) tasks to the head of the queue and "shared" ones -- to the end. 
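In code, the single-queue enqueue path then reduces to roughly the following sketch; the enqueue_rt_entity() hunk below is the real change:

	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	if (rt_se->nr_cpus_allowed == 1)
		/* "exclusive": bound to this cpu, goes to the head */
		list_add(&rt_se->run_list, queue);
	else
		/* "shared": migratable, goes to the tail */
		list_add_tail(&rt_se->run_list, queue);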
In case of a few newly woken up "exclusive" tasks, they are 'stacked' (not queued as now), meaning that a task {i+1} is being placed in front of the previously woken up task {i}. But I don't think this behavior will cause any realistic problems. There are a couple of changes on top of this one. (1) in check_preempt_curr_rt() I don't think there is a need for the "pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt" check. enqueue_task_rt(p) and check_preempt_curr_rt() are always called one after another with rq->lock being held, so the following check "p->rt.nr_cpus_allowed == 1 && rq->curr->rt.nr_cpus_allowed != 1" should be enough (well, just its left part) to guarantee that 'p' has been queued in front of the 'curr'. (2) in set_cpus_allowed_rt() I don't think there is a need for requeue_task_rt() here. Perhaps the only case when 'requeue' (+ reschedule) might be useful is as follows: i) weight == 1 && cpu_isset(task_cpu(p), *new_mask) (i.e. a task is being bound to this CPU); ii) 'p' != rq->curr, but 'p' has already been on this CPU for a while and was not migrated, i.e. it's possible that 'rq->curr' would not have a high chance of being migrated right at this particular moment (although it would in a bit longer term), should we allow it to be preempted. Anyway, I think we should not make it more complex by trying to address some rare corner cases; that is also why a single-queue approach would be preferable. Unless I'm missing something obvious, this approach gives us similar functionality at lower cost. Verified only compilation-wise. (Almost)-Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++---- kernel/sched_rt.c | 44 +++++++++----------------------------------- 2 files changed, 11 insertions(+), 39 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 554de4009803..cc1d558406f8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -153,8 +153,7 @@ static inline int task_has_rt_policy(struct task_struct *p) */ struct rt_prio_array { DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head xqueue[MAX_RT_PRIO]; /* exclusive queue */ - struct list_head squeue[MAX_RT_PRIO]; /* shared queue */ + struct list_head queue[MAX_RT_PRIO]; }; struct rt_bandwidth { @@ -7620,8 +7619,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) array = &rt_rq->active; for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->xqueue + i); - INIT_LIST_HEAD(array->squeue + i); + INIT_LIST_HEAD(array->queue + i); __clear_bit(i, array->bitmap); } /* delimiter for bitsearch: */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8ae3416e0bb4..f721b52acd8d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -576,16 +576,15 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se) struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); + struct list_head *queue = array->queue + rt_se_prio(rt_se); if (group_rq && rt_rq_throttled(group_rq)) return; if (rt_se->nr_cpus_allowed == 1) - list_add_tail(&rt_se->run_list, - array->xqueue + rt_se_prio(rt_se)); + list_add(&rt_se->run_list, queue); else - list_add_tail(&rt_se->run_list, - array->squeue + rt_se_prio(rt_se)); + list_add_tail(&rt_se->run_list, queue); __set_bit(rt_se_prio(rt_se), array->bitmap); @@ -598,8 +597,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) struct rt_prio_array *array = &rt_rq->active; list_del_init(&rt_se->run_list); - if
(list_empty(array->squeue + rt_se_prio(rt_se)) - && list_empty(array->xqueue + rt_se_prio(rt_se))) + if (list_empty(array->queue + rt_se_prio(rt_se))) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); @@ -666,11 +664,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) /* * Put task to the end of the run list without the overhead of dequeue * followed by enqueue. - * - * Note: We always enqueue the task to the shared-queue, regardless of its - * previous position w.r.t. exclusive vs shared. This is so that exclusive RR - * tasks fairly round-robin with all tasks on the runqueue, not just other - * exclusive tasks. */ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) @@ -678,7 +671,7 @@ void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) struct rt_prio_array *array = &rt_rq->active; list_del_init(&rt_se->run_list); - list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se)); + list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); } static void requeue_task_rt(struct rq *rq, struct task_struct *p) @@ -736,9 +729,6 @@ static int select_task_rq_rt(struct task_struct *p, int sync) } #endif /* CONFIG_SMP */ -static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, - struct rt_rq *rt_rq); - /* * Preempt the current task with a newly woken task if needed: */ @@ -764,8 +754,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) */ if((p->prio == rq->curr->prio) && p->rt.nr_cpus_allowed == 1 - && rq->curr->rt.nr_cpus_allowed != 1 - && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) { + && rq->curr->rt.nr_cpus_allowed != 1) { cpumask_t mask; if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) @@ -789,15 +778,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, idx = sched_find_first_bit(array->bitmap); BUG_ON(idx >= MAX_RT_PRIO); - queue = array->xqueue + idx; - if (!list_empty(queue)) - next = list_entry(queue->next, struct sched_rt_entity, - run_list); - else { - queue = array->squeue + idx; - next = list_entry(queue->next, struct sched_rt_entity, - run_list); - } + queue = array->queue + idx; + next = list_entry(queue->next, struct sched_rt_entity, run_list); return next; } @@ -867,7 +849,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) continue; if (next && next->prio < idx) continue; - list_for_each_entry(rt_se, array->squeue + idx, run_list) { + list_for_each_entry(rt_se, array->queue + idx, run_list) { struct task_struct *p = rt_task_of(rt_se); if (pick_rt_task(rq, p, cpu)) { next = p; @@ -1249,14 +1231,6 @@ static void set_cpus_allowed_rt(struct task_struct *p, } update_rt_migration(rq); - - if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1)) - /* - * If either the new or old weight is a "1", we need - * to requeue to properly move between shared and - * exclusive queues. - */ - requeue_task_rt(rq, p); } p->cpus_allowed = *new_mask; From ada18de2eb76961a4d4847f63291744c9e7beec4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:24 +0200 Subject: [PATCH 27/76] sched: debug: add some rt debug output Signed-off-by: Peter Zijlstra Cc: "Daniel K." 
Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 40 +++++++++++++++++++++++++++++++++++++--- kernel/sched_rt.c | 14 ++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 8bb713040ac9..8e077b9c91cb 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; -#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) - SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); -#else +#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) char path[128] = ""; struct cgroup *cgroup = NULL; struct task_group *tg = cfs_rq->tg; @@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cgroup_path(cgroup, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); +#else + SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); #endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", @@ -169,6 +169,39 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->nr_spread_over); } +void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) +{ +#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) + char path[128] = ""; + struct cgroup *cgroup = NULL; + struct task_group *tg = rt_rq->tg; + + if (tg) + cgroup = tg->css.cgroup; + + if (cgroup) + cgroup_path(cgroup, path, sizeof(path)); + + SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); +#else + SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); +#endif + + +#define P(x) \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) + + P(rt_nr_running); + P(rt_throttled); + PN(rt_time); + PN(rt_runtime); + +#undef PN +#undef P +} + static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = &per_cpu(runqueues, cpu); @@ -208,6 +241,7 @@ static void print_cpu(struct seq_file *m, int cpu) #undef PN print_cfs_stats(m, cpu); + print_rt_stats(m, cpu); print_rq(m, rq, cpu); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fee5fa7c72db..2e0ccdcf046a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1444,3 +1444,17 @@ static const struct sched_class rt_sched_class = { .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, }; + +#ifdef CONFIG_SCHED_DEBUG +extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); + +static void print_rt_stats(struct seq_file *m, int cpu) +{ + struct rt_rq *rt_rq; + + rcu_read_lock(); + for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) + print_rt_rq(m, cpu, rt_rq); + rcu_read_unlock(); +} +#endif From b79f3833d81d54fc71d98c8064dc45f33a755a8a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:25 +0200 Subject: [PATCH 28/76] sched: rt: fix SMP bandwidth balancing for throttled groups Now we exceed the runtime and get throttled - the period rollover tick will subtract the cpu quota from the runtime and check if we're below quota. However with this cpu having a very small portion of the runtime it will not refresh as fast as it should. Therefore, also rebalance the runtime when we're throttled. Signed-off-by: Peter Zijlstra Cc: "Daniel K." 
Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2e0ccdcf046a..87b2e3bf9472 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -228,6 +228,28 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif +#ifdef CONFIG_SMP +static int do_balance_runtime(struct rt_rq *rt_rq); + +static int balance_runtime(struct rt_rq *rt_rq) +{ + int more = 0; + + if (rt_rq->rt_time > rt_rq->rt_runtime) { + spin_unlock(&rt_rq->rt_runtime_lock); + more = do_balance_runtime(rt_rq); + spin_lock(&rt_rq->rt_runtime_lock); + } + + return more; +} +#else +static inline int balance_runtime(struct rt_rq *rt_rq) +{ + return 0; +} +#endif + static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { int i, idle = 1; @@ -247,6 +269,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) u64 runtime; spin_lock(&rt_rq->rt_runtime_lock); + if (rt_rq->rt_throttled) + balance_runtime(rt_rq); runtime = rt_rq->rt_runtime; rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { @@ -267,7 +291,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) } #ifdef CONFIG_SMP -static int balance_runtime(struct rt_rq *rt_rq) +static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); struct root_domain *rd = cpu_rq(smp_processor_id())->rd; @@ -428,17 +452,10 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) return 0; -#ifdef CONFIG_SMP - if (rt_rq->rt_time > runtime) { - spin_unlock(&rt_rq->rt_runtime_lock); - balance_runtime(rt_rq); - spin_lock(&rt_rq->rt_runtime_lock); - - runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - } -#endif + balance_runtime(rt_rq); + runtime = sched_rt_runtime(rt_rq); + if (runtime == RUNTIME_INF) + return 0; if (rt_rq->rt_time > runtime) { rt_rq->rt_throttled = 1; From eff6549b957d15d1ad168d90b8c1eb643b9c163f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:26 +0200 Subject: [PATCH 29/76] sched: rt: move some code around Signed-off-by: Peter Zijlstra Cc: "Daniel K." 
Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 119 ++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 62 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 87b2e3bf9472..61d52112289c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -228,68 +228,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif -#ifdef CONFIG_SMP -static int do_balance_runtime(struct rt_rq *rt_rq); - -static int balance_runtime(struct rt_rq *rt_rq) -{ - int more = 0; - - if (rt_rq->rt_time > rt_rq->rt_runtime) { - spin_unlock(&rt_rq->rt_runtime_lock); - more = do_balance_runtime(rt_rq); - spin_lock(&rt_rq->rt_runtime_lock); - } - - return more; -} -#else -static inline int balance_runtime(struct rt_rq *rt_rq) -{ - return 0; -} -#endif - -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) -{ - int i, idle = 1; - cpumask_t span; - - if (rt_b->rt_runtime == RUNTIME_INF) - return 1; - - span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { - int enqueue = 0; - struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); - struct rq *rq = rq_of_rt_rq(rt_rq); - - spin_lock(&rq->lock); - if (rt_rq->rt_time) { - u64 runtime; - - spin_lock(&rt_rq->rt_runtime_lock); - if (rt_rq->rt_throttled) - balance_runtime(rt_rq); - runtime = rt_rq->rt_runtime; - rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); - if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { - rt_rq->rt_throttled = 0; - enqueue = 1; - } - if (rt_rq->rt_time || rt_rq->rt_nr_running) - idle = 0; - spin_unlock(&rt_rq->rt_runtime_lock); - } - - if (enqueue) - sched_rt_rq_enqueue(rt_rq); - spin_unlock(&rq->lock); - } - - return idle; -} - #ifdef CONFIG_SMP static int do_balance_runtime(struct rt_rq *rt_rq) { @@ -425,8 +363,65 @@ static void enable_runtime(struct rq *rq) spin_unlock_irqrestore(&rq->lock, flags); } +static int balance_runtime(struct rt_rq *rt_rq) +{ + int more = 0; + + if (rt_rq->rt_time > rt_rq->rt_runtime) { + spin_unlock(&rt_rq->rt_runtime_lock); + more = do_balance_runtime(rt_rq); + spin_lock(&rt_rq->rt_runtime_lock); + } + + return more; +} +#else +static inline int balance_runtime(struct rt_rq *rt_rq) +{ + return 0; +} #endif +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) +{ + int i, idle = 1; + cpumask_t span; + + if (rt_b->rt_runtime == RUNTIME_INF) + return 1; + + span = sched_rt_period_mask(); + for_each_cpu_mask(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); + struct rq *rq = rq_of_rt_rq(rt_rq); + + spin_lock(&rq->lock); + if (rt_rq->rt_time) { + u64 runtime; + + spin_lock(&rt_rq->rt_runtime_lock); + if (rt_rq->rt_throttled) + balance_runtime(rt_rq); + runtime = rt_rq->rt_runtime; + rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { + rt_rq->rt_throttled = 0; + enqueue = 1; + } + if (rt_rq->rt_time || rt_rq->rt_nr_running) + idle = 0; + spin_unlock(&rt_rq->rt_runtime_lock); + } + + if (enqueue) + sched_rt_rq_enqueue(rt_rq); + spin_unlock(&rq->lock); + } + + return idle; +} + static inline int rt_se_prio(struct sched_rt_entity *rt_se) { #ifdef CONFIG_RT_GROUP_SCHED From 10b612f440a22a294e87ec7e8f03f9eea3338628 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:27 +0200 Subject: [PATCH 30/76] sched: rt: fix the bandwidth constraint computations Signed-off-by: Peter Zijlstra Cc: "Daniel K."
Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5b307da827ef..1f711a58a2b4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8419,7 +8419,7 @@ static unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_CGROUP_SCHED static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_group *tgi, *parent = tg ? tg->parent : NULL; + struct task_group *tgi, *parent = tg->parent; unsigned long total = 0; if (!parent) { @@ -8443,7 +8443,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) } rcu_read_unlock(); - return total + to_ratio(period, runtime) < + return total + to_ratio(period, runtime) <= to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), parent->rt_bandwidth.rt_runtime); } @@ -8560,10 +8560,15 @@ long sched_group_rt_period(struct task_group *tg) static int sched_rt_global_constraints(void) { + struct task_group *tg = &root_task_group; + u64 rt_runtime, rt_period; int ret = 0; + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = tg->rt_bandwidth.rt_runtime; + mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(NULL, 1, 0)) + if (!__rt_schedulable(tg, rt_period, rt_runtime)) ret = -EINVAL; mutex_unlock(&rt_constraints_mutex); From 6c3df25511c2c51f2dd36cc52a8d22363d731793 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:28 +0200 Subject: [PATCH 31/76] sched: rt: dont stop the period timer when there are tasks wanting to run So if the group ever gets throttled, it will never wake up again. Signed-off-by: Peter Zijlstra Cc: "Daniel K." Cc: Peter Zijlstra Reported-by: "Daniel K." --- kernel/sched_rt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 61d52112289c..bd90c8bb0739 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -412,7 +412,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; spin_unlock(&rt_rq->rt_runtime_lock); - } + } else if (rt_rq->rt_nr_running) + idle = 0; if (enqueue) sched_rt_rq_enqueue(rt_rq); From bf647b62fdb948e757a7b4d18d4f16e3c763b1d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:10 +0200 Subject: [PATCH 32/76] sched: clean up some unused variables In file included from /mnt/build/linux-2.6/kernel/sched.c:1496: /mnt/build/linux-2.6/kernel/sched_rt.c: In function '__enable_runtime': /mnt/build/linux-2.6/kernel/sched_rt.c:339: warning: unused variable 'rd' /mnt/build/linux-2.6/kernel/sched_rt.c: In function 'requeue_rt_entity': /mnt/build/linux-2.6/kernel/sched_rt.c:692: warning: unused variable 'queue' Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bd90c8bb0739..6b4a6b5a4167 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -336,7 +336,6 @@ static void disable_runtime(struct rq *rq) static void __enable_runtime(struct rq *rq) { - struct root_domain *rd = rq->rd; struct rt_rq *rt_rq; if (unlikely(!scheduler_running)) @@ -689,7 +688,6 @@ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) { struct rt_prio_array *array = &rt_rq->active; - struct list_head *queue = array->queue + rt_se_prio(rt_se); if (on_rt_rq(rt_se)) { list_del_init(&rt_se->run_list); From 
a7be37ac8e1565e00880531f4e2aff421a21c803 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:11 +0200 Subject: [PATCH 33/76] sched: revert the revert of: weight calculations Try again.. initial commit: 8f1bc385cfbab474db6c27b5af1e439614f3025c revert: f9305d4a0968201b2818dbed0dc8cb0d4ee7aeb3 Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++-- kernel/sched_fair.c | 105 +++++++++++++++++++++++++++------------- kernel/sched_features.h | 1 + 3 files changed, 76 insertions(+), 39 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index c51d9fae8cd8..f653af684fb3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1342,6 +1342,9 @@ static void __resched_task(struct task_struct *p, int tif_bit) */ #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +/* + * delta *= weight / lw + */ static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) @@ -1369,12 +1372,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } -static inline unsigned long -calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) -{ - return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); -} - static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1fe4c65a8170..496500988ce5 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -333,6 +333,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, } #endif +/* + * delta *= w / rw + */ +static inline unsigned long +calc_delta_weight(unsigned long delta, struct sched_entity *se) +{ + for_each_sched_entity(se) { + delta = calc_delta_mine(delta, + se->load.weight, &cfs_rq_of(se)->load); + } + + return delta; +} + +/* + * delta *= rw / w + */ +static inline unsigned long +calc_delta_fair(unsigned long delta, struct sched_entity *se) +{ + for_each_sched_entity(se) { + delta = calc_delta_mine(delta, + cfs_rq_of(se)->load.weight, &se->load); + } + + return delta; +} + /* * The idea is to set a period in which each task runs once. * @@ -362,47 +390,54 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running); - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - - slice *= se->load.weight; - do_div(slice, cfs_rq->load.weight); - } - - - return slice; + return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); } /* * We calculate the vruntime slice of a to be inserted task * - * vs = s/w = p/rw + * vs = s*rw/w = p */ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long nr_running = cfs_rq->nr_running; - unsigned long weight; - u64 vslice; if (!se->on_rq) nr_running++; - vslice = __sched_period(nr_running); + return __sched_period(nr_running); +} + +/* + * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in + * that it favours >=0 over <0. + * + * -20 | + * | + * 0 --------+------- + * .' + * 19 .' 
+ * + */ +static unsigned long +calc_delta_asym(unsigned long delta, struct sched_entity *se) +{ + struct load_weight lw = { + .weight = NICE_0_LOAD, + .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) + }; for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct load_weight *se_lw = &se->load; - weight = cfs_rq->load.weight; - if (!se->on_rq) - weight += se->load.weight; + if (se->load.weight < NICE_0_LOAD) + se_lw = &lw; - vslice *= NICE_0_LOAD; - do_div(vslice, weight); + delta = calc_delta_mine(delta, + cfs_rq_of(se)->load.weight, se_lw); } - return vslice; + return delta; } /* @@ -419,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = delta_exec; - if (unlikely(curr->load.weight != NICE_0_LOAD)) { - delta_exec_weighted = calc_delta_fair(delta_exec_weighted, - &curr->load); - } + delta_exec_weighted = calc_delta_fair(delta_exec, curr); curr->vruntime += delta_exec_weighted; } @@ -609,8 +640,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) - vruntime -= sysctl_sched_latency; + if (sched_feat(NEW_FAIR_SLEEPERS)) { + unsigned long thresh = sysctl_sched_latency; + + /* + * convert the sleeper threshold into virtual time + */ + if (sched_feat(NORMALIZED_SLEEPER)) + thresh = calc_delta_fair(thresh, se); + + vruntime -= thresh; + } /* ensure we never gain time by being placed backwards. */ vruntime = max_vruntime(se->vruntime, vruntime); @@ -1111,11 +1151,10 @@ static unsigned long wakeup_gran(struct sched_entity *se) unsigned long gran = sysctl_sched_wakeup_granularity; /* - * More easily preempt - nice tasks, while not making - * it harder for + nice tasks. + * More easily preempt - nice tasks, while not making it harder for + * + nice tasks. */ - if (unlikely(se->load.weight > NICE_0_LOAD)) - gran = calc_delta_fair(gran, &se->load); + gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); return gran; } diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 62b39ca92ebd..afa549166d8d 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,4 +1,5 @@ SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) +SCHED_FEAT(NORMALIZED_SLEEPER, 1) SCHED_FEAT(WAKEUP_PREEMPT, 1) SCHED_FEAT(START_DEBIT, 1) SCHED_FEAT(AFFINE_WAKEUPS, 1) From c9c294a630e28eec5f2865f028ecfc58d45c0a5a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:12 +0200 Subject: [PATCH 34/76] sched: fix calc_delta_asym() calc_delta_asym() is supposed to do the same as calc_delta_fair() except linearly shrink the result for negative nice processes - this causes them to have a smaller preemption threshold so that they are more easily preempted. The problem is that for task groups se->load.weight is the per cpu share of the actual task group weight; take that into account. Also provide a debug switch to disable the asymmetry (which I still don't like - but it does greatly benefit some workloads) This would explain the interactivity issues reported against group scheduling. 
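Conceptually, the scaling in the first hunk below does the following (sketch only; lw starts out as the NICE_0_LOAD reference weight):

	/*
	 * se->load.weight is only this cpu's slice of the group weight,
	 * so rescale the reference as if the whole group had weight
	 * NICE_0_LOAD:
	 *
	 *	weight = NICE_0_LOAD * se->load.weight / tg->shares
	 */
	lw.weight *= se->load.weight;
	lw.weight /= tg->shares;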
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 +++++++++++++++++++++++++++- kernel/sched_features.h | 1 + 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 496500988ce5..2268e634812b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -430,6 +430,29 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se) for_each_sched_entity(se) { struct load_weight *se_lw = &se->load; +#ifdef CONFIG_FAIR_GROUP_SCHED + struct cfs_rq *cfs_rq = se->my_q; + struct task_group *tg = NULL; + + if (cfs_rq) + tg = cfs_rq->tg; + + if (tg && tg->shares < NICE_0_LOAD) { + /* + * scale shares to what it would have been had + * tg->weight been NICE_0_LOAD: + * + * weight = 1024 * shares / tg->weight + */ + lw.weight *= se->load.weight; + lw.weight /= tg->shares; + + lw.inv_weight = 0; + + se_lw = &lw; + } else +#endif + if (se->load.weight < NICE_0_LOAD) se_lw = &lw; @@ -1154,7 +1177,10 @@ static unsigned long wakeup_gran(struct sched_entity *se) * More easily preempt - nice tasks, while not making it harder for * + nice tasks. */ - gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); + if (sched_feat(ASYM_GRAN)) + gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); + else + gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); return gran; } diff --git a/kernel/sched_features.h b/kernel/sched_features.h index afa549166d8d..04123af2e678 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -7,3 +7,4 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1) SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) +SCHED_FEAT(ASYM_GRAN, 1) From ced8aa16e1db55c33c507174c1b1f9e107445865 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:13 +0200 Subject: [PATCH 35/76] sched: fix calc_delta_asym, #2 Ok, so why are we in this mess, it was: 1/w but now we mixed that rw in the mix like: rw/w rw being \Sum w suggests: fiddling w, we should also fiddle rw, humm? Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2268e634812b..2e197b8e43f1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -429,6 +429,7 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se) for_each_sched_entity(se) { struct load_weight *se_lw = &se->load; + unsigned long rw = cfs_rq_of(se)->load.weight; #ifdef CONFIG_FAIR_GROUP_SCHED struct cfs_rq *cfs_rq = se->my_q; @@ -450,14 +451,16 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se) lw.inv_weight = 0; se_lw = &lw; + rw += lw.weight - se->load.weight; } else #endif - if (se->load.weight < NICE_0_LOAD) + if (se->load.weight < NICE_0_LOAD) { se_lw = &lw; + rw += NICE_0_LOAD - se->load.weight; + } - delta = calc_delta_mine(delta, - cfs_rq_of(se)->load.weight, se_lw); + delta = calc_delta_mine(delta, rw, se_lw); } return delta; } From c09595f63bb1909c5dc4dca288f4fe818561b5f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:14 +0200 Subject: [PATCH 36/76] sched: revert revert of: fair-group: SMP-nice for group scheduling Try again..
Initial commit: 18d95a2832c1392a2d63227a7a6d433cb9f2037e Revert: 6363ca57c76b7b83639ca8c83fc285fa26a7880e Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 430 +++++++++++++++++++++++++++++++++++++++--- kernel/sched_debug.c | 5 + kernel/sched_fair.c | 126 ++++++++----- kernel/sched_rt.c | 4 + 5 files changed, 490 insertions(+), 76 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index eaf821072dbd..97a58b622ee1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -765,6 +765,7 @@ struct sched_domain { struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ cpumask_t span; /* span of all CPUs in this domain */ + int first_cpu; /* cache of the first cpu in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ diff --git a/kernel/sched.c b/kernel/sched.c index f653af684fb3..874b6da15430 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -403,6 +403,43 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + unsigned long task_weight; + unsigned long shares; + /* + * We need space to build a sched_domain wide view of the full task + * group tree, in order to avoid depending on dynamic memory allocation + * during the load balancing we place this in the per cpu task group + * hierarchy. This limits the load balancing to one instance per cpu, + * but more should not be needed anyway. + */ + struct aggregate_struct { + /* + * load = weight(cpus) * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long load; + + /* + * part of the group weight distributed to this span. + */ + unsigned long shares; + + /* + * The sum of all runqueue weights within this span. + */ + unsigned long rq_weight; + + /* + * Weight contributed by tasks; this is the part we can + * influence by moving tasks around. + */ + unsigned long task_weight; + } aggregate; +#endif #endif }; @@ -1484,6 +1521,326 @@ static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* + * Group load balancing. + * + * We calculate a few balance domain wide aggregate numbers; load and weight. + * Given the pictures below, and assuming each item has equal weight: + * + * root 1 - thread + * / | \ A - group + * A 1 B + * /|\ / \ + * C 2 D 3 4 + * | | + * 5 6 + * + * load: + * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, + * which equals 1/9-th of the total load. + * + * shares: + * The weight of this group on the selected cpus. + * + * rq_weight: + * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while + * B would get 2. + * + * task_weight: + * Part of the rq_weight contributed by tasks; all groups except B would + * get 1, B gets 2. 
+ */ + +static inline struct aggregate_struct * +aggregate(struct task_group *tg, struct sched_domain *sd) +{ + return &tg->cfs_rq[sd->first_cpu]->aggregate; +} + +typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + */ +static +void aggregate_walk_tree(aggregate_func down, aggregate_func up, + struct sched_domain *sd) +{ + struct task_group *parent, *child; + + rcu_read_lock(); + parent = &root_task_group; +down: + (*down)(parent, sd); + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + (*up)(parent, sd); + + child = parent; + parent = parent->parent; + if (parent) + goto up; + rcu_read_unlock(); +} + +/* + * Calculate the aggregate runqueue weight. + */ +static +void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long rq_weight = 0; + unsigned long task_weight = 0; + int i; + + for_each_cpu_mask(i, sd->span) { + rq_weight += tg->cfs_rq[i]->load.weight; + task_weight += tg->cfs_rq[i]->task_weight; + } + + aggregate(tg, sd)->rq_weight = rq_weight; + aggregate(tg, sd)->task_weight = task_weight; +} + +/* + * Compute the weight of this group on the given cpus. + */ +static +void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = 0; + int i; + + for_each_cpu_mask(i, sd->span) + shares += tg->cfs_rq[i]->shares; + + if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) + shares = tg->shares; + + aggregate(tg, sd)->shares = shares; +} + +/* + * Compute the load fraction assigned to this group, relies on the aggregate + * weight and this group's parent's load, i.e. top-down. + */ +static +void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long load; + + if (!tg->parent) { + int i; + + load = 0; + for_each_cpu_mask(i, sd->span) + load += cpu_rq(i)->load.weight; + + } else { + load = aggregate(tg->parent, sd)->load; + + /* + * shares is our weight in the parent's rq so + * shares/parent->rq_weight gives our fraction of the load + */ + load *= aggregate(tg, sd)->shares; + load /= aggregate(tg->parent, sd)->rq_weight + 1; + } + + aggregate(tg, sd)->load = load; +} + +static void __set_se_shares(struct sched_entity *se, unsigned long shares); + +/* + * Calculate and set the cpu's group shares. + */ +static void +__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, + int tcpu) +{ + int boost = 0; + unsigned long shares; + unsigned long rq_weight; + + if (!tg->se[tcpu]) + return; + + rq_weight = tg->cfs_rq[tcpu]->load.weight; + + /* + * If there are currently no tasks on the cpu pretend there is one of + * average load so that when a new task gets to run here it will not + * get delayed by group starvation. + */ + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } + + /* + * \Sum shares * rq_weight + * shares = ----------------------- + * \Sum rq_weight + * + */ + shares = aggregate(tg, sd)->shares * rq_weight; + shares /= aggregate(tg, sd)->rq_weight + 1; + + /* + * record the actual number of shares, not the boosted amount. + */ + tg->cfs_rq[tcpu]->shares = boost ? 
0 : shares; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; + + __set_se_shares(tg->se[tcpu], shares); +} + +/* + * Re-adjust the weights on the cpu the task came from and on the cpu the + * task went to. + */ +static void +__move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + unsigned long shares; + + shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + + __update_group_shares_cpu(tg, sd, scpu); + __update_group_shares_cpu(tg, sd, dcpu); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + if (shares) + tg->cfs_rq[dcpu]->shares += shares; +} + +/* + * Because changing a group's shares changes the weight of the super-group + * we need to walk up the tree and change all shares until we hit the root. + */ +static void +move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + while (tg) { + __move_group_shares(tg, sd, scpu, dcpu); + tg = tg->parent; + } +} + +static +void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = aggregate(tg, sd)->shares; + int i; + + for_each_cpu_mask(i, sd->span) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __update_group_shares_cpu(tg, sd, i); + spin_unlock_irqrestore(&rq->lock, flags); + } + + aggregate_group_shares(tg, sd); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= aggregate(tg, sd)->shares; + if (shares) { + tg->cfs_rq[sd->first_cpu]->shares += shares; + aggregate(tg, sd)->shares += shares; + } +} + +/* + * Calculate the accumulative weight and recursive load of each task group + * while walking down the tree. + */ +static +void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_weight(tg, sd); + aggregate_group_shares(tg, sd); + aggregate_group_load(tg, sd); +} + +/* + * Rebalance the cpu shares while walking back up the tree. 
+ */ +static +void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_set_shares(tg, sd); +} + +static DEFINE_PER_CPU(spinlock_t, aggregate_lock); + +static void __init init_aggregate(void) +{ + int i; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu(aggregate_lock, i)); +} + +static int get_aggregate(struct sched_domain *sd) +{ + if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) + return 0; + + aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); + return 1; +} + +static void put_aggregate(struct sched_domain *sd) +{ + spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); +} + +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ + cfs_rq->shares = shares; +} + +#else + +static inline void init_aggregate(void) +{ +} + +static inline int get_aggregate(struct sched_domain *sd) +{ + return 0; +} + +static inline void put_aggregate(struct sched_domain *sd) +{ +} +#endif + #endif #include "sched_stats.h" @@ -1498,26 +1855,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) -static inline void inc_load(struct rq *rq, const struct task_struct *p) -{ - update_load_add(&rq->load, p->se.load.weight); -} - -static inline void dec_load(struct rq *rq, const struct task_struct *p) -{ - update_load_sub(&rq->load, p->se.load.weight); -} - -static void inc_nr_running(struct task_struct *p, struct rq *rq) +static void inc_nr_running(struct rq *rq) { rq->nr_running++; - inc_load(rq, p); } -static void dec_nr_running(struct task_struct *p, struct rq *rq) +static void dec_nr_running(struct rq *rq) { rq->nr_running--; - dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -1609,7 +1954,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(p, rq); + inc_nr_running(rq); } /* @@ -1621,7 +1966,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(p, rq); + dec_nr_running(rq); } /** @@ -2274,7 +2619,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(p, rq); + inc_nr_running(rq); } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -3265,9 +3610,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long imbalance; struct rq *busiest; unsigned long flags; + int unlock_aggregate; cpus_setall(*cpus); + unlock_aggregate = get_aggregate(sd); + /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. 
In this case, @@ -3383,8 +3731,9 @@ redo: if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return ld_moved; + ld_moved = -1; + + goto out; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -3399,8 +3748,13 @@ out_one_pinned: if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return 0; + ld_moved = -1; + else + ld_moved = 0; +out: + if (unlock_aggregate) + put_aggregate(sd); + return ld_moved; } /* @@ -4588,10 +4942,8 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - dec_load(rq, p); - } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4601,7 +4953,6 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); - inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -7016,6 +7367,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); sd->span = *cpu_map; + sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7026,6 +7378,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7037,6 +7390,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7048,6 +7402,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7060,6 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7757,6 +8113,7 @@ void __init sched_init(void) } #ifdef CONFIG_SMP + init_aggregate(); init_defrootdomain(); #endif @@ -8322,14 +8679,11 @@ void sched_move_task(struct task_struct *tsk) #endif /* CONFIG_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED -static void set_se_shares(struct sched_entity *se, unsigned long shares) +static void __set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; int on_rq; - spin_lock_irq(&rq->lock); - on_rq = se->on_rq; if (on_rq) dequeue_entity(cfs_rq, se, 0); @@ -8339,8 +8693,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) if (on_rq) enqueue_entity(cfs_rq, se, 0); +} - spin_unlock_irq(&rq->lock); +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __set_se_shares(se, shares); + spin_unlock_irqrestore(&rq->lock, flags); } static DEFINE_MUTEX(shares_mutex); @@ -8379,8 +8742,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * w/o tripping rebalance_share or 
load_balance_fair. */ tg->shares = shares; - for_each_possible_cpu(i) + for_each_possible_cpu(i) { + /* + * force a rebalance + */ + cfs_rq_set_shares(tg->cfs_rq[i], 0); set_se_shares(tg->se[i], shares); + } /* * Enable load balance activity on this group, by inserting it back on diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 8e077b9c91cb..04394ccac88d 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -167,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); +#endif +#endif } void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2e197b8e43f1..183388c4dead 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -567,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ + cfs_rq->task_weight += weight; +} +#else +static inline void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +} +#endif + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + inc_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; list_add(&se->group_node, &cfs_rq->tasks); @@ -580,6 +597,10 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + dec_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, -se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; list_del_init(&se->group_node); @@ -1372,75 +1393,90 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) +static unsigned long +__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, + struct cfs_rq *cfs_rq) { - struct sched_entity *curr; - struct task_struct *p; + struct rq_iterator cfs_rq_iterator; - if (!cfs_rq->nr_running || !first_fair(cfs_rq)) - return MAX_PRIO; + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; + cfs_rq_iterator.arg = cfs_rq; - curr = cfs_rq->curr; - if (!curr) - curr = __pick_next_entity(cfs_rq); - - p = task_of(curr); - - return p->prio; + return balance_tasks(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &cfs_rq_iterator); } -#endif +#ifdef CONFIG_FAIR_GROUP_SCHED static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { - struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; - struct rq_iterator cfs_rq_iterator; + int busiest_cpu = cpu_of(busiest); + struct task_group *tg; - 
cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; - - for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { -#ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq; + rcu_read_lock(); + list_for_each_entry(tg, &task_groups, list) { long imbalance; - unsigned long maxload; + unsigned long this_weight, busiest_weight; + long rem_load, max_load, moved_load; - this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); - - imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; - /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ - if (imbalance <= 0) + /* + * empty group + */ + if (!aggregate(tg, sd)->task_weight) continue; - /* Don't pull more than imbalance/2 */ - imbalance /= 2; - maxload = min(rem_load_move, imbalance); + rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; + rem_load /= aggregate(tg, sd)->load + 1; - *this_best_prio = cfs_rq_best_prio(this_cfs_rq); -#else -# define maxload rem_load_move -#endif - /* - * pass busy_cfs_rq argument into - * load_balance_[start|next]_fair iterators - */ - cfs_rq_iterator.arg = busy_cfs_rq; - rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, - maxload, sd, idle, all_pinned, - this_best_prio, - &cfs_rq_iterator); + this_weight = tg->cfs_rq[this_cpu]->task_weight; + busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; - if (rem_load_move <= 0) + imbalance = (busiest_weight - this_weight) / 2; + + if (imbalance < 0) + imbalance = busiest_weight; + + max_load = max(rem_load, imbalance); + moved_load = __load_balance_fair(this_rq, this_cpu, busiest, + max_load, sd, idle, all_pinned, this_best_prio, + tg->cfs_rq[busiest_cpu]); + + if (!moved_load) + continue; + + move_group_shares(tg, sd, busiest_cpu, this_cpu); + + moved_load *= aggregate(tg, sd)->load; + moved_load /= aggregate(tg, sd)->rq_weight + 1; + + rem_load_move -= moved_load; + if (rem_load_move < 0) break; } + rcu_read_unlock(); return max_load_move - rem_load_move; } +#else +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + return __load_balance_fair(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &busiest->cfs); +} +#endif static int move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 6b4a6b5a4167..765932d0399d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -670,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) rt_se->timeout = 0; enqueue_rt_entity(rt_se); + + inc_cpu_load(rq, p->se.load.weight); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -678,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) update_curr_rt(rq); dequeue_rt_entity(rt_se); + + dec_cpu_load(rq, p->se.load.weight); } /* From 76a2a6ee8a0660a29127f05989ac59ae1ce865fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:15 +0200 Subject: [PATCH 37/76] sched: sched_clock_cpu() based cpu_clock() with sched_clock_cpu() being reasonably in sync between cpus (max 1 jiffy difference) use this to provide cpu_clock(). 
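For illustration only (this usage is not part of the patch): a tracer built on the new interface should assume per-cpu timestamps are cheap and approximately monotonic, and allow for up to about one jiffy of skew when comparing timestamps taken on different cpus:

	unsigned long long t0, t1;

	t0 = cpu_clock(raw_smp_processor_id());
	/* ... traced section ... */
	t1 = cpu_clock(raw_smp_processor_id());
	/*
	 * t1 - t0 is meaningful on a single cpu; cross-cpu deltas
	 * may be off by up to ~1 jiffy.
	 */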
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 76 -------------------------------------------- kernel/sched_clock.c | 12 +++++++ 2 files changed, 12 insertions(+), 76 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 874b6da15430..eb3454c410fa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -818,82 +818,6 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -unsigned long long time_sync_thresh = 100000; - -static DEFINE_PER_CPU(unsigned long long, time_offset); -static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); - -/* - * Global lock which we take every now and then to synchronize - * the CPUs time. This method is not warp-safe, but it's good - * enough to synchronize slowly diverging time sources and thus - * it's good enough for tracing: - */ -static DEFINE_SPINLOCK(time_sync_lock); -static unsigned long long prev_global_time; - -static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) -{ - /* - * We want this inlined, to not get tracer function calls - * in this critical section: - */ - spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_); - __raw_spin_lock(&time_sync_lock.raw_lock); - - if (time < prev_global_time) { - per_cpu(time_offset, cpu) += prev_global_time - time; - time = prev_global_time; - } else { - prev_global_time = time; - } - - __raw_spin_unlock(&time_sync_lock.raw_lock); - spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_); - - return time; -} - -static unsigned long long __cpu_clock(int cpu) -{ - unsigned long long now; - - /* - * Only call sched_clock() if the scheduler has already been - * initialized (some code might call cpu_clock() very early): - */ - if (unlikely(!scheduler_running)) - return 0; - - now = sched_clock_cpu(cpu); - - return now; -} - -/* - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu - * clock constructed from sched_clock(): - */ -unsigned long long cpu_clock(int cpu) -{ - unsigned long long prev_cpu_time, time, delta_time; - unsigned long flags; - - local_irq_save(flags); - prev_cpu_time = per_cpu(prev_cpu_time, cpu); - time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); - delta_time = time-prev_cpu_time; - - if (unlikely(delta_time > time_sync_thresh)) { - time = __sync_cpu_clock(time, cpu); - per_cpu(prev_cpu_time, cpu) = time; - } - local_irq_restore(flags); - - return time; -} -EXPORT_SYMBOL_GPL(cpu_clock); - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ce05271219ab..3c696db59452 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -244,3 +244,15 @@ unsigned long long __attribute__((weak)) sched_clock(void) { return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); } + +unsigned long long cpu_clock(int cpu) +{ + unsigned long long clock; + unsigned long flags; + + raw_local_irq_save(flags); + clock = sched_clock_cpu(cpu); + raw_local_irq_restore(flags); + + return clock; +} From 103638d95ba5b0c53c8d9c0cb581156ccc8513ee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:16 +0200 Subject: [PATCH 38/76] sched: fix wakeup granularity and buddy granularity Uncouple buddy selection from wakeup granularity. The initial idea was that buddies could run ahead as far as a normal task can - do this by measuring a pair 'slice' just as we do for a normal task. This means we can drop the wakeup_granularity back to 5ms. 
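To make the uncoupling concrete (numbers are illustrative defaults, not part of the patch): with a 20 msec latency period on a single cpu and two runnable tasks of equal nice-0 weight,

	sched_slice() = 20 msec * 1024/2048 = 10 msec

so pick_next() honours cfs_rq->next for at most ~10 msec measured from pair_start before falling back to the leftmost entity, while wakeup preemption itself now uses the smaller 5 msec granularity.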
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + kernel/sched_fair.c | 15 +++++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index eb3454c410fa..7d282c52bd42 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -375,6 +375,7 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; + u64 pair_start; struct rb_root tasks_timeline; struct rb_node *rb_leftmost; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 183388c4dead..509092af0330 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_OTHER wake-up granularity. - * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity = 10000000UL; +unsigned int sysctl_sched_wakeup_granularity = 5000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -813,17 +813,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); - static struct sched_entity * pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!cfs_rq->next) - return se; + struct rq *rq = rq_of(cfs_rq); + u64 pair_slice = rq->clock - cfs_rq->pair_start; - if (wakeup_preempt_entity(cfs_rq->next, se) != 0) + if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { + cfs_rq->pair_start = rq->clock; return se; + } return cfs_rq->next; } From 32df2ee86a580f70f2dbb90cf81f413aa655f838 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:17 +0200 Subject: [PATCH 39/76] sched: add full schedstats to /proc/sched_debug show all the schedstats in /debug/sched_debug as well. 
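The per-rq section of /debug/sched_debug then grows entries of the following shape - the field names come from the P() macro above, the values here are purely illustrative:

	.yld_count                     : 128
	.sched_switch                  : 0
	.sched_count                   : 40219
	.sched_goidle                  : 12437
	.ttwu_count                    : 18673
	.ttwu_local                    : 16210
	.bkl_count                     : 3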
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 04394ccac88d..bbe6b31c3c56 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -162,8 +162,23 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, " .%-30s: %d\n", "bkl_count", - rq->bkl_count); +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); + + P(yld_exp_empty); + P(yld_act_empty); + P(yld_both_empty); + P(yld_count); + + P(sched_switch); + P(sched_count); + P(sched_goidle); + + P(ttwu_count); + P(ttwu_local); + + P(bkl_count); + +#undef P #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); From b6a86c746f5b708012809958462234d19e9c8177 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:18 +0200 Subject: [PATCH 40/76] sched: fix sched_domain aggregation Keeping the aggregate on the first cpu of the sched domain has two problems: - it could collide between different sched domains on different cpus - it could slow things down because of the remote accesses Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched.c | 113 ++++++++++++++++++++---------------------- kernel/sched_fair.c | 12 ++--- 3 files changed, 60 insertions(+), 66 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 97a58b622ee1..eaf821072dbd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -765,7 +765,6 @@ struct sched_domain { struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ cpumask_t span; /* span of all CPUs in this domain */ - int first_cpu; /* cache of the first cpu in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ diff --git a/kernel/sched.c b/kernel/sched.c index 7d282c52bd42..160d3c209b8f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1480,12 +1480,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); */ static inline struct aggregate_struct * -aggregate(struct task_group *tg, struct sched_domain *sd) +aggregate(struct task_group *tg, int cpu) { - return &tg->cfs_rq[sd->first_cpu]->aggregate; + return &tg->cfs_rq[cpu]->aggregate; } -typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); +typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *); /* * Iterate the full tree, calling @down when first entering a node and @up when @@ -1493,14 +1493,14 @@ typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); */ static void aggregate_walk_tree(aggregate_func down, aggregate_func up, - struct sched_domain *sd) + int cpu, struct sched_domain *sd) { struct task_group *parent, *child; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, sd); + (*down)(parent, cpu, sd); list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1508,7 +1508,7 @@ down: up: continue; } - (*up)(parent, sd); + 
(*up)(parent, cpu, sd); child = parent; parent = parent->parent; @@ -1520,8 +1520,8 @@ up: /* * Calculate the aggregate runqueue weight. */ -static -void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long rq_weight = 0; unsigned long task_weight = 0; @@ -1532,15 +1532,15 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) task_weight += tg->cfs_rq[i]->task_weight; } - aggregate(tg, sd)->rq_weight = rq_weight; - aggregate(tg, sd)->task_weight = task_weight; + aggregate(tg, cpu)->rq_weight = rq_weight; + aggregate(tg, cpu)->task_weight = task_weight; } /* * Compute the weight of this group on the given cpus. */ -static -void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long shares = 0; int i; @@ -1548,18 +1548,18 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) for_each_cpu_mask(i, sd->span) shares += tg->cfs_rq[i]->shares; - if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) + if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares) shares = tg->shares; - aggregate(tg, sd)->shares = shares; + aggregate(tg, cpu)->shares = shares; } /* * Compute the load fraction assigned to this group, relies on the aggregate * weight and this group's parent's load, i.e. top-down. */ -static -void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long load; @@ -1571,17 +1571,17 @@ void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) load += cpu_rq(i)->load.weight; } else { - load = aggregate(tg->parent, sd)->load; + load = aggregate(tg->parent, cpu)->load; /* * shares is our weight in the parent's rq so * shares/parent->rq_weight gives our fraction of the load */ - load *= aggregate(tg, sd)->shares; - load /= aggregate(tg->parent, sd)->rq_weight + 1; + load *= aggregate(tg, cpu)->shares; + load /= aggregate(tg->parent, cpu)->rq_weight + 1; } - aggregate(tg, sd)->load = load; + aggregate(tg, cpu)->load = load; } static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1590,8 +1590,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); * Calculate and set the cpu's group shares. */ static void -__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, - int tcpu) +__update_group_shares_cpu(struct task_group *tg, int cpu, + struct sched_domain *sd, int tcpu) { int boost = 0; unsigned long shares; @@ -1618,8 +1618,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, * \Sum rq_weight * */ - shares = aggregate(tg, sd)->shares * rq_weight; - shares /= aggregate(tg, sd)->rq_weight + 1; + shares = aggregate(tg, cpu)->shares * rq_weight; + shares /= aggregate(tg, cpu)->rq_weight + 1; /* * record the actual number of shares, not the boosted amount. @@ -1639,15 +1639,15 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, * task went to. 
*/ static void -__move_group_shares(struct task_group *tg, struct sched_domain *sd, +__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, int scpu, int dcpu) { unsigned long shares; shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - __update_group_shares_cpu(tg, sd, scpu); - __update_group_shares_cpu(tg, sd, dcpu); + __update_group_shares_cpu(tg, cpu, sd, scpu); + __update_group_shares_cpu(tg, cpu, sd, dcpu); /* * ensure we never loose shares due to rounding errors in the @@ -1663,19 +1663,19 @@ __move_group_shares(struct task_group *tg, struct sched_domain *sd, * we need to walk up the tree and change all shares until we hit the root. */ static void -move_group_shares(struct task_group *tg, struct sched_domain *sd, +move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, int scpu, int dcpu) { while (tg) { - __move_group_shares(tg, sd, scpu, dcpu); + __move_group_shares(tg, cpu, sd, scpu, dcpu); tg = tg->parent; } } -static -void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd) { - unsigned long shares = aggregate(tg, sd)->shares; + unsigned long shares = aggregate(tg, cpu)->shares; int i; for_each_cpu_mask(i, sd->span) { @@ -1683,20 +1683,20 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, sd, i); + __update_group_shares_cpu(tg, cpu, sd, i); spin_unlock_irqrestore(&rq->lock, flags); } - aggregate_group_shares(tg, sd); + aggregate_group_shares(tg, cpu, sd); /* * ensure we never loose shares due to rounding errors in the * above redistribution. */ - shares -= aggregate(tg, sd)->shares; + shares -= aggregate(tg, cpu)->shares; if (shares) { - tg->cfs_rq[sd->first_cpu]->shares += shares; - aggregate(tg, sd)->shares += shares; + tg->cfs_rq[cpu]->shares += shares; + aggregate(tg, cpu)->shares += shares; } } @@ -1704,21 +1704,21 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) * Calculate the accumulative weight and recursive load of each task group * while walking down the tree. */ -static -void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd) { - aggregate_group_weight(tg, sd); - aggregate_group_shares(tg, sd); - aggregate_group_load(tg, sd); + aggregate_group_weight(tg, cpu, sd); + aggregate_group_shares(tg, cpu, sd); + aggregate_group_load(tg, cpu, sd); } /* * Rebalance the cpu shares while walking back up the tree. 
*/ -static -void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd) { - aggregate_group_set_shares(tg, sd); + aggregate_group_set_shares(tg, cpu, sd); } static DEFINE_PER_CPU(spinlock_t, aggregate_lock); @@ -1731,18 +1731,18 @@ static void __init init_aggregate(void) spin_lock_init(&per_cpu(aggregate_lock, i)); } -static int get_aggregate(struct sched_domain *sd) +static int get_aggregate(int cpu, struct sched_domain *sd) { - if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) + if (!spin_trylock(&per_cpu(aggregate_lock, cpu))) return 0; - aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); + aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd); return 1; } -static void put_aggregate(struct sched_domain *sd) +static void put_aggregate(int cpu, struct sched_domain *sd) { - spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); + spin_unlock(&per_cpu(aggregate_lock, cpu)); } static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) @@ -1756,12 +1756,12 @@ static inline void init_aggregate(void) { } -static inline int get_aggregate(struct sched_domain *sd) +static inline int get_aggregate(int cpu, struct sched_domain *sd) { return 0; } -static inline void put_aggregate(struct sched_domain *sd) +static inline void put_aggregate(int cpu, struct sched_domain *sd) { } #endif @@ -3539,7 +3539,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpus_setall(*cpus); - unlock_aggregate = get_aggregate(sd); + unlock_aggregate = get_aggregate(this_cpu, sd); /* * When power savings policy is enabled for the parent domain, idle @@ -3678,7 +3678,7 @@ out_one_pinned: ld_moved = 0; out: if (unlock_aggregate) - put_aggregate(sd); + put_aggregate(this_cpu, sd); return ld_moved; } @@ -7292,7 +7292,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); sd->span = *cpu_map; - sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7303,7 +7302,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7315,7 +7313,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7327,7 +7324,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); - sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7340,7 +7336,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); - sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 509092af0330..40cf24ab4de8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1429,11 +1429,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, /* * empty group */ - if (!aggregate(tg, sd)->task_weight) + if (!aggregate(tg, this_cpu)->task_weight) continue; - rem_load = 
rem_load_move * aggregate(tg, sd)->rq_weight; - rem_load /= aggregate(tg, sd)->load + 1; + rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; + rem_load /= aggregate(tg, this_cpu)->load + 1; this_weight = tg->cfs_rq[this_cpu]->task_weight; busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; @@ -1451,10 +1451,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!moved_load) continue; - move_group_shares(tg, sd, busiest_cpu, this_cpu); + move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu); - moved_load *= aggregate(tg, sd)->load; - moved_load /= aggregate(tg, sd)->rq_weight + 1; + moved_load *= aggregate(tg, this_cpu)->load; + moved_load /= aggregate(tg, this_cpu)->rq_weight + 1; rem_load_move -= moved_load; if (rem_load_move < 0) From 4d8d595dfa69e1c807bf928f364668a7f30da5dc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:19 +0200 Subject: [PATCH 41/76] sched: update aggregate when holding the RQs It was observed that in __update_group_shares_cpu() rq_weight > aggregate()->rq_weight This is caused by forks/wakeups in between the initial aggregate pass and locking of the RQs for load balance. To avoid this situation partially re-do the aggregation once we have the RQs locked (which avoids new tasks from appearing). Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 160d3c209b8f..dae20199dc9c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1721,6 +1721,11 @@ aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd) aggregate_group_set_shares(tg, cpu, sd); } +static void +aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd) +{ +} + static DEFINE_PER_CPU(spinlock_t, aggregate_lock); static void __init init_aggregate(void) @@ -1740,6 +1745,11 @@ static int get_aggregate(int cpu, struct sched_domain *sd) return 1; } +static void update_aggregate(int cpu, struct sched_domain *sd) +{ + aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd); +} + static void put_aggregate(int cpu, struct sched_domain *sd) { spin_unlock(&per_cpu(aggregate_lock, cpu)); @@ -1761,6 +1771,10 @@ static inline int get_aggregate(int cpu, struct sched_domain *sd) return 0; } +static inline void update_aggregate(int cpu, struct sched_domain *sd) +{ +} + static inline void put_aggregate(int cpu, struct sched_domain *sd) { } @@ -2192,6 +2206,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + /* + * now that we have both rqs locked the rq weight won't change + * anymore - so update the stats. + */ + update_aggregate(this_cpu, sd); + do { unsigned long load, avg_load; int local_group; From 53fecd8ae1900fb571086f54f664051004665b55 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 27 Jun 2008 13:41:20 +0200 Subject: [PATCH 42/76] sched: kill task_group balancing The idea was to balance groups until we've reached the global goal, however Vatsa rightly pointed out that we might never reach that goal this way - hence take out this logic. 
[ the initial rationale for this 'feature' was to promote max concurrency within a group - it does not however affect fairness ] Reported-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 40cf24ab4de8..b10c0d61a2a9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1422,9 +1422,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rcu_read_lock(); list_for_each_entry(tg, &task_groups, list) { - long imbalance; - unsigned long this_weight, busiest_weight; - long rem_load, max_load, moved_load; + long rem_load, moved_load; /* * empty group @@ -1435,17 +1433,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; rem_load /= aggregate(tg, this_cpu)->load + 1; - this_weight = tg->cfs_rq[this_cpu]->task_weight; - busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; - - imbalance = (busiest_weight - this_weight) / 2; - - if (imbalance < 0) - imbalance = busiest_weight; - - max_load = max(rem_load, imbalance); moved_load = __load_balance_fair(this_rq, this_cpu, busiest, - max_load, sd, idle, all_pinned, this_best_prio, + rem_load, sd, idle, all_pinned, this_best_prio, tg->cfs_rq[busiest_cpu]); if (!moved_load) From d3f40dbab954d83383b6a516582d5c09cc216dcc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:21 +0200 Subject: [PATCH 43/76] sched: dont micro manage share losses We used to try and contain the loss of 'shares' by playing arithmetic games. Replace that by noticing that at the top sched_domain we'll always have the full weight in shares to distribute. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index dae20199dc9c..28229c5d4983 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1551,6 +1551,9 @@ aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd) if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares) shares = tg->shares; + if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) + shares = tg->shares; + aggregate(tg, cpu)->shares = shares; } @@ -1642,20 +1645,8 @@ static void __move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, int scpu, int dcpu) { - unsigned long shares; - - shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - __update_group_shares_cpu(tg, cpu, sd, scpu); __update_group_shares_cpu(tg, cpu, sd, dcpu); - - /* - * ensure we never loose shares due to rounding errors in the - * above redistribution. 
- */ - shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - if (shares) - tg->cfs_rq[dcpu]->shares += shares; } /* @@ -1675,7 +1666,6 @@ move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, static void aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd) { - unsigned long shares = aggregate(tg, cpu)->shares; int i; for_each_cpu_mask(i, sd->span) { @@ -1688,16 +1678,6 @@ aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain * } aggregate_group_shares(tg, cpu, sd); - - /* - * ensure we never loose shares due to rounding errors in the - * above redistribution. - */ - shares -= aggregate(tg, cpu)->shares; - if (shares) { - tg->cfs_rq[cpu]->shares += shares; - aggregate(tg, cpu)->shares += shares; - } } /* From a25b5aca8740ea99d5e18dfc71235a52b685dcf7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:22 +0200 Subject: [PATCH 44/76] sched: no need to aggregate task_weight We only need to know the task_weight of the busiest rq - nothing to do if there are no tasks there. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 +--------------- kernel/sched_fair.c | 2 +- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 28229c5d4983..716cfc8e099e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -433,12 +433,6 @@ struct cfs_rq { * The sum of all runqueue weights within this span. */ unsigned long rq_weight; - - /* - * Weight contributed by tasks; this is the part we can - * influence by moving tasks around. - */ - unsigned long task_weight; } aggregate; #endif #endif @@ -1473,10 +1467,6 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); * rq_weight: * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while * B would get 2. - * - * task_weight: - * Part of the rq_weight contributed by tasks; all groups except B would - * get 1, B gets 2. */ static inline struct aggregate_struct * @@ -1524,16 +1514,12 @@ static void aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long rq_weight = 0; - unsigned long task_weight = 0; int i; - for_each_cpu_mask(i, sd->span) { + for_each_cpu_mask(i, sd->span) rq_weight += tg->cfs_rq[i]->load.weight; - task_weight += tg->cfs_rq[i]->task_weight; - } aggregate(tg, cpu)->rq_weight = rq_weight; - aggregate(tg, cpu)->task_weight = task_weight; } /* diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b10c0d61a2a9..03b9fbd9d648 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1427,7 +1427,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, /* * empty group */ - if (!aggregate(tg, this_cpu)->task_weight) + if (!tg->cfs_rq[busiest_cpu]->task_weight) continue; rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; From c8cba857b4997d5b00451d01474638f6a153f713 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:23 +0200 Subject: [PATCH 45/76] sched: simplify the group load balancer While thinking about the previous patch - I realized that using per domain aggregate load values in load_balance_fair() is wrong. We should use the load value for that CPU. By not needing per domain hierarchical load values we don't need to store per domain aggregate shares, which greatly simplifies all the math. 
It basically falls apart in two separate computations: - per domain update of the shares - per CPU update of the hierarchical load Also get rid of the move_group_shares() stuff - just re-compute the shares again after a successful load balance. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 290 ++++++++++---------------------------------- kernel/sched_fair.c | 15 +-- 2 files changed, 74 insertions(+), 231 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 716cfc8e099e..f864b751fd19 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -406,34 +406,23 @@ struct cfs_rq { struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_SMP - unsigned long task_weight; - unsigned long shares; /* - * We need space to build a sched_domain wide view of the full task - * group tree, in order to avoid depending on dynamic memory allocation - * during the load balancing we place this in the per cpu task group - * hierarchy. This limits the load balancing to one instance per cpu, - * but more should not be needed anyway. + * the part of load.weight contributed by tasks */ - struct aggregate_struct { - /* - * load = weight(cpus) * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. - */ - unsigned long load; + unsigned long task_weight; - /* - * part of the group weight distributed to this span. - */ - unsigned long shares; + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; - /* - * The sum of all runqueue weights within this span. - */ - unsigned long rq_weight; - } aggregate; + /* + * this cpu's part of tg->shares + */ + unsigned long shares; #endif #endif }; @@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * Group load balancing. - * - * We calculate a few balance domain wide aggregate numbers; load and weight. - * Given the pictures below, and assuming each item has equal weight: - * - * root 1 - thread - * / | \ A - group - * A 1 B - * /|\ / \ - * C 2 D 3 4 - * | | - * 5 6 - * - * load: - * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, - * which equals 1/9-th of the total load. - * - * shares: - * The weight of this group on the selected cpus. - * - * rq_weight: - * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while - * B would get 2. - */ - -static inline struct aggregate_struct * -aggregate(struct task_group *tg, int cpu) -{ - return &tg->cfs_rq[cpu]->aggregate; -} - -typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *); +typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. */ -static -void aggregate_walk_tree(aggregate_func down, aggregate_func up, - int cpu, struct sched_domain *sd) +static void +walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) { struct task_group *parent, *child; @@ -1507,72 +1463,6 @@ up: rcu_read_unlock(); } -/* - * Calculate the aggregate runqueue weight. 
- */ -static void -aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - unsigned long rq_weight = 0; - int i; - - for_each_cpu_mask(i, sd->span) - rq_weight += tg->cfs_rq[i]->load.weight; - - aggregate(tg, cpu)->rq_weight = rq_weight; -} - -/* - * Compute the weight of this group on the given cpus. - */ -static void -aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - unsigned long shares = 0; - int i; - - for_each_cpu_mask(i, sd->span) - shares += tg->cfs_rq[i]->shares; - - if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares) - shares = tg->shares; - - if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) - shares = tg->shares; - - aggregate(tg, cpu)->shares = shares; -} - -/* - * Compute the load fraction assigned to this group, relies on the aggregate - * weight and this group's parent's load, i.e. top-down. - */ -static void -aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - unsigned long load; - - if (!tg->parent) { - int i; - - load = 0; - for_each_cpu_mask(i, sd->span) - load += cpu_rq(i)->load.weight; - - } else { - load = aggregate(tg->parent, cpu)->load; - - /* - * shares is our weight in the parent's rq so - * shares/parent->rq_weight gives our fraction of the load - */ - load *= aggregate(tg, cpu)->shares; - load /= aggregate(tg->parent, cpu)->rq_weight + 1; - } - - aggregate(tg, cpu)->load = load; -} - static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* @@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); */ static void __update_group_shares_cpu(struct task_group *tg, int cpu, - struct sched_domain *sd, int tcpu) + unsigned long sd_shares, unsigned long sd_rq_weight) { int boost = 0; unsigned long shares; unsigned long rq_weight; - if (!tg->se[tcpu]) + if (!tg->se[cpu]) return; - rq_weight = tg->cfs_rq[tcpu]->load.weight; + rq_weight = tg->cfs_rq[cpu]->load.weight; /* * If there are currently no tasks on the cpu pretend there is one of @@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, rq_weight = NICE_0_LOAD; } + if (unlikely(rq_weight > sd_rq_weight)) + rq_weight = sd_rq_weight; + /* * \Sum shares * rq_weight * shares = ----------------------- * \Sum rq_weight * */ - shares = aggregate(tg, cpu)->shares * rq_weight; - shares /= aggregate(tg, cpu)->rq_weight + 1; + shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); /* * record the actual number of shares, not the boosted amount. */ - tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; if (shares < MIN_SHARES) shares = MIN_SHARES; else if (shares > MAX_SHARES) shares = MAX_SHARES; - __set_se_shares(tg->se[tcpu], shares); + __set_se_shares(tg->se[cpu], shares); } /* - * Re-adjust the weights on the cpu the task came from and on the cpu the - * task went to. + * Re-compute the task group their per cpu shares over the given domain. + * This needs to be done in a bottom-up fashion because the rq weight of a + * parent group depends on the shares of its child groups. */ static void -__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, - int scpu, int dcpu) -{ - __update_group_shares_cpu(tg, cpu, sd, scpu); - __update_group_shares_cpu(tg, cpu, sd, dcpu); -} - -/* - * Because changing a group's shares changes the weight of the super-group - * we need to walk up the tree and change all shares until we hit the root. 
- */ -static void -move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, - int scpu, int dcpu) -{ - while (tg) { - __move_group_shares(tg, cpu, sd, scpu, dcpu); - tg = tg->parent; - } -} - -static void -aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd) +tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) { + unsigned long rq_weight = 0; + unsigned long shares = 0; int i; + for_each_cpu_mask(i, sd->span) { + rq_weight += tg->cfs_rq[i]->load.weight; + shares += tg->cfs_rq[i]->shares; + } + + if ((!shares && rq_weight) || shares > tg->shares) + shares = tg->shares; + + if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) + shares = tg->shares; + for_each_cpu_mask(i, sd->span) { struct rq *rq = cpu_rq(i); unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, cpu, sd, i); + __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } - - aggregate_group_shares(tg, cpu, sd); } /* - * Calculate the accumulative weight and recursive load of each task group - * while walking down the tree. + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parents load. */ static void -aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd) +tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) { - aggregate_group_weight(tg, cpu, sd); - aggregate_group_shares(tg, cpu, sd); - aggregate_group_load(tg, cpu, sd); -} + unsigned long load; -/* - * Rebalance the cpu shares while walking back up the tree. - */ -static void -aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - aggregate_group_set_shares(tg, cpu, sd); + if (!tg->parent) { + load = cpu_rq(cpu)->load.weight; + } else { + load = tg->parent->cfs_rq[cpu]->h_load; + load *= tg->cfs_rq[cpu]->shares; + load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + } + + tg->cfs_rq[cpu]->h_load = load; } static void -aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd) +tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) { } -static DEFINE_PER_CPU(spinlock_t, aggregate_lock); - -static void __init init_aggregate(void) +static void update_shares(struct sched_domain *sd) { - int i; - - for_each_possible_cpu(i) - spin_lock_init(&per_cpu(aggregate_lock, i)); + walk_tg_tree(tg_nop, tg_shares_up, 0, sd); } -static int get_aggregate(int cpu, struct sched_domain *sd) +static void update_h_load(int cpu) { - if (!spin_trylock(&per_cpu(aggregate_lock, cpu))) - return 0; - - aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd); - return 1; -} - -static void update_aggregate(int cpu, struct sched_domain *sd) -{ - aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd); -} - -static void put_aggregate(int cpu, struct sched_domain *sd) -{ - spin_unlock(&per_cpu(aggregate_lock, cpu)); + walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); } static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) @@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #else -static inline void init_aggregate(void) +static inline void update_shares(struct sched_domain *sd) { } -static inline int get_aggregate(int cpu, struct sched_domain *sd) -{ - return 0; -} - -static inline void update_aggregate(int cpu, struct sched_domain *sd) -{ -} - -static inline void 
put_aggregate(int cpu, struct sched_domain *sd) -{ -} #endif #endif @@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; - /* - * now that we have both rqs locked the rq weight won't change - * anymore - so update the stats. - */ - update_aggregate(this_cpu, sd); - do { unsigned long load, avg_load; int local_group; @@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long imbalance; struct rq *busiest; unsigned long flags; - int unlock_aggregate; cpus_setall(*cpus); - unlock_aggregate = get_aggregate(this_cpu, sd); - /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, @@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: + update_shares(sd); group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance); @@ -3663,8 +3506,8 @@ out_one_pinned: else ld_moved = 0; out: - if (unlock_aggregate) - put_aggregate(this_cpu, sd); + if (ld_moved) + update_shares(sd); return ld_moved; } @@ -8019,7 +7862,6 @@ void __init sched_init(void) } #ifdef CONFIG_SMP - init_aggregate(); init_defrootdomain(); #endif diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 03b9fbd9d648..7b8d664d6f22 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1421,17 +1421,20 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, struct task_group *tg; rcu_read_lock(); + update_h_load(busiest_cpu); + list_for_each_entry(tg, &task_groups, list) { + struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; long rem_load, moved_load; /* * empty group */ - if (!tg->cfs_rq[busiest_cpu]->task_weight) + if (!busiest_cfs_rq->task_weight) continue; - rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; - rem_load /= aggregate(tg, this_cpu)->load + 1; + rem_load = rem_load_move * busiest_cfs_rq->load.weight; + rem_load /= busiest_cfs_rq->h_load + 1; moved_load = __load_balance_fair(this_rq, this_cpu, busiest, rem_load, sd, idle, all_pinned, this_best_prio, @@ -1440,10 +1443,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!moved_load) continue; - move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu); - - moved_load *= aggregate(tg, this_cpu)->load; - moved_load /= aggregate(tg, this_cpu)->rq_weight + 1; + moved_load *= busiest_cfs_rq->h_load; + moved_load /= busiest_cfs_rq->load.weight + 1; rem_load_move -= moved_load; if (rem_load_move < 0) From 3e5459b4bea3ca2618cc02d56d12639f2cba531d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:24 +0200 Subject: [PATCH 46/76] sched: fix newidle smp group balancing Re-compute the shares on newidle - so we can make a decision based on recent data. 
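The unlock/relock in update_shares_locked() is deliberate: update_shares() ends up in tg_shares_up(), which takes every rq->lock in the domain span - including the lock the newidle path is already holding. A minimal sketch of the ordering constraint (illustration, not patch code):

	spin_lock(&this_rq->lock);	/* newidle balance context */
	...
	spin_unlock(&this_rq->lock);
	update_shares(sd);		/* takes each rq->lock in sd->span */
	spin_lock(&this_rq->lock);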
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index f864b751fd19..cdd09462fc98 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1579,6 +1579,13 @@ static void update_shares(struct sched_domain *sd) walk_tg_tree(tg_nop, tg_shares_up, 0, sd); } +static void update_shares_locked(struct rq *rq, struct sched_domain *sd) +{ + spin_unlock(&rq->lock); + update_shares(sd); + spin_lock(&rq->lock); +} + static void update_h_load(int cpu) { walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); @@ -1595,6 +1602,10 @@ static inline void update_shares(struct sched_domain *sd) { } +static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) +{ +} + #endif #endif @@ -3543,6 +3554,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: + update_shares_locked(this_rq, sd); group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, &sd_idle, cpus, NULL); if (!group) { @@ -3586,6 +3598,7 @@ redo: } else sd->nr_balance_failed = 0; + update_shares_locked(this_rq, sd); return ld_moved; out_balanced: From 039a1c41b3a489e34593ea1e1687f6fdad6b13ab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:25 +0200 Subject: [PATCH 47/76] sched: fix sched_balance_self() smp group balancing Finding the least idle cpu is more accurate when done with updated shares. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index cdd09462fc98..39d5495540d2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2128,6 +2128,9 @@ static int sched_balance_self(int cpu, int flag) sd = tmp; } + if (sd) + update_shares(sd); + while (sd) { cpumask_t span, tmpmask; struct sched_group *group; From a8a51d5e59561aa5b4d66e19eca819b537783e8f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:26 +0200 Subject: [PATCH 48/76] sched: persistent average load per task Remove the fall-back to SCHED_LOAD_SCALE by remembering the previous value of cpu_avg_load_per_task() - this is useful because of the hierarchical group model in which task weight can be much smaller. 
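To see why the fixed fall-back hurts (numbers illustrative): a group with tg->shares = 1024 spread over 8 cpus leaves each of its tasks an effective weight around 128, so returning SCHED_LOAD_SCALE (1024) for a momentarily empty rq overstates the per-task load roughly 8-fold. Caching the last computed value,

	rq->avg_load_per_task = rq->load.weight / rq->nr_running;

keeps the estimate anchored to what actually ran on that cpu.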
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 39d5495540d2..6a6b0139eb32 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -554,6 +554,8 @@ struct rq { int cpu; int online; + unsigned long avg_load_per_task; + struct task_struct *migration_thread; struct list_head migration_queue; #endif @@ -1427,9 +1429,18 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) #ifdef CONFIG_SMP static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + #ifdef CONFIG_FAIR_GROUP_SCHED typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); @@ -2010,18 +2021,6 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -/* - * Return the average load per task on the cpu's run queue - */ -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - unsigned long n = rq->nr_running; - - return n ? total / n : SCHED_LOAD_SCALE; -} - /* * find_idlest_group finds and returns the least busy CPU group within the * domain. From bb3469ac9b50f14ad6eba129ca0ad4fd033097a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:27 +0200 Subject: [PATCH 49/76] sched: hierarchical load vs affine wakeups With hierarchical grouping we can't just compare task weight to rq weight - we need to scale the weight appropriately. 
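Ignoring the fixed-point details of calc_delta_mine(), task_h_load() computes

	h_load = p->se.load.weight * cfs_rq->h_load / cfs_rq->load.weight

For illustration: a nice-0 task (weight 1024) on a group cfs_rq with load.weight 2048 but a hierarchical load of only 512 contributes 1024 * 512 / 2048 = 256 to the rq-wide balance picture - a quarter of what the raw task weight would suggest.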
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7b8d664d6f22..865cb53a7ccf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1073,6 +1073,25 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; +#ifdef CONFIG_FAIR_GROUP_SCHED +static unsigned long task_h_load(struct task_struct *p) +{ + unsigned long h_load = p->se.load.weight; + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + + update_h_load(task_cpu(p)); + + h_load = calc_delta_mine(h_load, cfs_rq->h_load, &cfs_rq->load); + + return h_load; +} +#else +static unsigned long task_h_load(struct task_struct *p) +{ + return p->se.load.weight; +} +#endif + static int wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, @@ -1093,9 +1112,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * of the current CPU: */ if (sync) - tl -= current->se.load.weight; + tl -= task_h_load(current); - balanced = 100*(tl + p->se.load.weight) <= imbalance*load; + balanced = 100*(tl + task_h_load(p)) <= imbalance*load; /* * If the currently running task will sleep within From 408ed066b11cf9ee4536573b4269ee3613bd735e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:28 +0200 Subject: [PATCH 50/76] sched: hierarchical load vs find_busiest_group find_busiest_group() has some assumptions about task weight being in the NICE_0_LOAD range. Hierarchical task groups break this assumption - fix this by replacing it with the average task weight, which will adapt to the situation.
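A small worked example of the new group-imbalance test (values illustrative): with group tasks weighing around 128, per-cpu loads of 500 and 100 give a spread of 400. The old fixed test, 400 > SCHED_LOAD_SCALE (1024), never fires; the new one, 400 > 2*128 = 256, correctly flags the group as imbalanced.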
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 6a6b0139eb32..5e2aa394a812 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3050,6 +3050,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_load = this_load = total_load = total_pwr = 0; busiest_load_per_task = busiest_nr_running = 0; this_load_per_task = this_nr_running = 0; + if (idle == CPU_NOT_IDLE) load_idx = sd->busy_idx; else if (idle == CPU_NEWLY_IDLE) @@ -3064,6 +3065,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, int __group_imb = 0; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; + unsigned long sum_avg_load_per_task; + unsigned long avg_load_per_task; local_group = cpu_isset(this_cpu, group->cpumask); @@ -3072,6 +3075,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; + sum_avg_load_per_task = avg_load_per_task = 0; + max_cpu_load = 0; min_cpu_load = ~0UL; @@ -3105,6 +3110,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load += load; sum_nr_running += rq->nr_running; sum_weighted_load += weighted_cpuload(i); + + sum_avg_load_per_task += cpu_avg_load_per_task(i); } /* @@ -3126,7 +3133,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load = sg_div_cpu_power(group, avg_load * SCHED_LOAD_SCALE); - if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) + + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of two tasks. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + avg_load_per_task = sg_div_cpu_power(group, + sum_avg_load_per_task * SCHED_LOAD_SCALE); + + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) __group_imb = 1; group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; @@ -3267,9 +3287,9 @@ small_imbalance: if (busiest_load_per_task > this_load_per_task) imbn = 1; } else - this_load_per_task = SCHED_LOAD_SCALE; + this_load_per_task = cpu_avg_load_per_task(this_cpu); - if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= + if (max_load - this_load + 2*busiest_load_per_task >= busiest_load_per_task * imbn) { *imbalance = busiest_load_per_task; return busiest; From 42a3ac7d5cee89849448b41b86faeb86f98e92f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:29 +0200 Subject: [PATCH 51/76] sched: fix load scaling in group balancing doing the load balance will change cfs_rq->load.weight (that's the whole point) but since that's part of the scale factor, we'll scale back with a different amount. Weight getting smaller would result in an inflated moved_load which causes it to stop balancing too soon. 
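Worked example (numbers illustrative): busiest_cfs_rq starts at load.weight = 2048, h_load = 512, so rem_load is scaled up by 2048/512 = 4 before moving. If we then move 1024 of queue weight, load.weight drops to 1024, and scaling moved_load back by the post-move h_load/load.weight = 512/1024 = 1/2, instead of the 1/4 used on the way in, makes moved_load twice too large - rem_load_move drains early and balancing stops prematurely. Hence busiest_weight and busiest_h_load are snapshotted before the move.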
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 865cb53a7ccf..734e4c556fcb 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1444,6 +1444,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, list_for_each_entry(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; + unsigned long busiest_h_load = busiest_cfs_rq->h_load; + unsigned long busiest_weight = busiest_cfs_rq->load.weight; long rem_load, moved_load; /* @@ -1452,8 +1454,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!busiest_cfs_rq->task_weight) continue; - rem_load = rem_load_move * busiest_cfs_rq->load.weight; - rem_load /= busiest_cfs_rq->h_load + 1; + rem_load = rem_load_move * busiest_weight; + rem_load /= busiest_h_load + 1; moved_load = __load_balance_fair(this_rq, this_cpu, busiest, rem_load, sd, idle, all_pinned, this_best_prio, @@ -1462,8 +1464,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!moved_load) continue; - moved_load *= busiest_cfs_rq->h_load; - moved_load /= busiest_cfs_rq->load.weight + 1; + moved_load *= busiest_h_load; + moved_load /= busiest_weight + 1; rem_load_move -= moved_load; if (rem_load_move < 0) From 4be9daaa1b33701f011f4117f22dc1e45a3e6e34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:30 +0200 Subject: [PATCH 52/76] sched: fix task_h_load() Currently task_h_load() computes the load of a task and uses that to either subtract it from the total, or add to it. However, removing or adding a task need not have any effect on the total load at all. Imagine adding a task to a group that is local to one cpu - in that case the total load of that cpu is unaffected. So properly compute addition/removal: s_i = S * rw_i / \Sum_j rw_j s'_i = S * (rw_i + wl) / (\Sum_j rw_j + wg) then s'_i - s_i gives the change in load. Where s_i is the shares for cpu i, S the group weight, rw_i the runqueue weight for that cpu, wl the weight we add (subtract) and wg the weight contribution to the runqueue. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 49 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 734e4c556fcb..a1694441f8b7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1074,22 +1074,53 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; #ifdef CONFIG_FAIR_GROUP_SCHED -static unsigned long task_h_load(struct task_struct *p) +static unsigned long effective_load(struct task_group *tg, long wl, int cpu) { - unsigned long h_load = p->se.load.weight; - struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + struct sched_entity *se = tg->se[cpu]; + long wg = wl; - update_h_load(task_cpu(p)); + for_each_sched_entity(se) { +#define D(n) (likely(n) ? 
(n) : 1) - h_load = calc_delta_mine(h_load, cfs_rq->h_load, &cfs_rq->load); + long S, Srw, rw, s, sn; - return h_load; + S = se->my_q->tg->shares; + s = se->my_q->shares; + rw = se->my_q->load.weight; + + Srw = S * rw / D(s); + sn = S * (rw + wl) / D(Srw + wg); + + wl = sn - s; + wg = 0; +#undef D + } + + return wl; } + +static unsigned long task_load_sub(struct task_struct *p) +{ + return effective_load(task_group(p), -(long)p->se.load.weight, task_cpu(p)); +} + +static unsigned long task_load_add(struct task_struct *p, int cpu) +{ + return effective_load(task_group(p), p->se.load.weight, cpu); +} + #else -static unsigned long task_h_load(struct task_struct *p) + +static unsigned long task_load_sub(struct task_struct *p) +{ + return -p->se.load.weight; +} + +static unsigned long task_load_add(struct task_struct *p, int cpu) { return p->se.load.weight; } + #endif static int @@ -1112,9 +1143,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * of the current CPU: */ if (sync) - tl -= task_h_load(current); + tl += task_load_sub(current); - balanced = 100*(tl + task_h_load(p)) <= imbalance*load; + balanced = 100*(tl + task_load_add(p, this_cpu)) <= imbalance*load; /* * If the currently running task will sleep within From 051c67640e771fd6ad1b676fc0c16c379b3c6f80 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:31 +0200 Subject: [PATCH 53/76] sched: remove prio preference from balance decisions Priority loses much of its meaning in a hierarchical context. So don't use it in balance decisions. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5e2aa394a812..10d43f5bf0fc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2896,7 +2896,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, struct rq_iterator *iterator) { - int loops = 0, pulled = 0, pinned = 0, skip_for_load; + int loops = 0, pulled = 0, pinned = 0; struct task_struct *p; long rem_load_move = max_load_move; @@ -2912,14 +2912,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, next: if (!p || loops++ > sysctl_sched_nr_migrate) goto out; - /* - * To help distribute high priority tasks across CPUs we don't - * skip a task if it will be the highest priority task (i.e.
smallest - * prio value) on its new queue regardless of its load weight - */ - skip_for_load = (p->se.load.weight >> 1) > rem_load_move + - SCHED_LOAD_SCALE_FUZZ; - if ((skip_for_load && p->prio >= *this_best_prio) || + + if ((p->se.load.weight >> 1) > rem_load_move || !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { p = iterator->next(iterator->arg); goto next; From cb5ef42a03a13f95a9ea94e6cda4f7a47497871f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:32 +0200 Subject: [PATCH 54/76] sched: optimize effective_load() s_i = S * rw_i / \Sum_j rw_j -> \Sum_j rw_j = S * rw_i / s_i -> s'_i = S * (rw_i + w) / (\Sum_j rw_j + w) delta s = s' - s = S * (rw + w) / ((S * rw / s) + w) - s = s * (S * (rw + w) / (S * rw + s * w) - 1) a = S*(rw+w), b = S*rw + s*w delta s = s * (a-b) / b IOW, trade one divide for two multiplies Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a1694441f8b7..0d197be3e3e9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1082,16 +1082,16 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu) for_each_sched_entity(se) { #define D(n) (likely(n) ? (n) : 1) - long S, Srw, rw, s, sn; + long S, rw, s, a, b; S = se->my_q->tg->shares; s = se->my_q->shares; rw = se->my_q->load.weight; - Srw = S * rw / D(s); - sn = S * (rw + wl) / D(Srw + wg); + a = S*(rw + wl); + b = S*rw + s*wg; - wl = sn - s; + wl = s*(a-b)/D(b); wg = 0; #undef D } From 93b75217df39e6d75889cc6f8050343286aff4a5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:33 +0200 Subject: [PATCH 55/76] sched: disable source/target_load bias The bias given by source/target_load functions can be very large, disable it by default to get faster convergence. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- kernel/sched_features.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 10d43f5bf0fc..6c5eb3bc37e0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2000,7 +2000,7 @@ static unsigned long source_load(int cpu, int type) struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); - if (type == 0) + if (type == 0 || !sched_feat(LB_BIAS)) return total; return min(rq->cpu_load[type-1], total); @@ -2015,7 +2015,7 @@ static unsigned long target_load(int cpu, int type) struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); - if (type == 0) + if (type == 0 || !sched_feat(LB_BIAS)) return total; return max(rq->cpu_load[type-1], total); diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 04123af2e678..d56e3053e746 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -8,3 +8,4 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) +SCHED_FEAT(LB_BIAS, 0) \ No newline at end of file From cd80917e4ff465ea77106f8e4fb631eedc4cf426 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:34 +0200 Subject: [PATCH 56/76] sched: fix shares boost logic In case the domain is empty, pretend there is a single task on each cpu, so that together with the boost logic we end up giving 1/n shares to each cpu.
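A sketch of the resulting arithmetic (userspace, invented values; NICE_0_LOAD is the kernel's nice-0 weight of 1024):

#include <stdio.h>

#define NICE_0_LOAD 1024UL

int main(void)
{
    unsigned long shares = 2048;  /* tg->shares (invented) */
    unsigned long n = 4;          /* cpus in the domain span */
    unsigned long rq_weight = 0;  /* domain is completely idle */

    /* pretend one nice-0 task per cpu when nothing is runnable */
    if (!rq_weight)
        rq_weight = n * NICE_0_LOAD;

    /* each cpu's slice: shares * its pretended weight / total weight */
    unsigned long per_cpu = shares * NICE_0_LOAD / rq_weight;

    printf("per-cpu shares = %lu (shares/n = %lu)\n",
           per_cpu, shares / n);  /* 512 = 512 */
    return 0;
}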
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 6c5eb3bc37e0..1cff969f6646 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1549,6 +1549,9 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; + if (!rq_weight) + rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; + for_each_cpu_mask(i, sd->span) { struct rq *rq = cpu_rq(i); unsigned long flags; From 2398f2c6d34b43025f274fc42eaca34d23ec2320 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:35 +0200 Subject: [PATCH 57/76] sched: update shares on wakeup We found that the affine wakeup code needs rather accurate load figures to be effective. The trouble is that updating the load figures is fairly expensive with group scheduling. Therefore ratelimit the updating. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched.c | 30 +++++++++++++++++++++++++++++- kernel/sched_features.h | 3 ++- kernel/sysctl.c | 8 ++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index eaf821072dbd..835b6c6fcc56 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -783,6 +783,8 @@ struct sched_domain { unsigned int balance_interval; /* initialise to 1. units in ms. */ unsigned int nr_balance_failed; /* initialise to 0 */ + u64 last_update; + #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; @@ -1605,6 +1607,7 @@ extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_sched_shares_ratelimit; int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, diff --git a/kernel/sched.c b/kernel/sched.c index 1cff969f6646..62db0891025a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -777,6 +777,12 @@ late_initcall(sched_init_debug); */ const_debug unsigned int sysctl_sched_nr_migrate = 32; +/* + * ratelimit for updating the group shares. + * default: 0.5ms + */ +const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; + /* * period over which we measure -rt task cpu usage in us. 
* default: 1s @@ -1590,7 +1596,13 @@ tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) static void update_shares(struct sched_domain *sd) { - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + u64 now = cpu_clock(raw_smp_processor_id()); + s64 elapsed = now - sd->last_update; + + if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { + sd->last_update = now; + walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + } } static void update_shares_locked(struct rq *rq, struct sched_domain *sd) @@ -2199,6 +2211,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; +#ifdef CONFIG_SMP + if (sched_feat(LB_WAKEUP_UPDATE)) { + struct sched_domain *sd; + + this_cpu = raw_smp_processor_id(); + cpu = task_cpu(p); + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + update_shares(sd); + break; + } + } + } +#endif + smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index d56e3053e746..7d616d2a2a3f 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -8,4 +8,5 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) -SCHED_FEAT(LB_BIAS, 0) \ No newline at end of file +SCHED_FEAT(LB_BIAS, 0) +SCHED_FEAT(LB_WAKEUP_UPDATE, 1) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 29116652dca8..fe8cdc80ff02 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -264,6 +264,14 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_shares_ratelimit", + .data = &sysctl_sched_shares_ratelimit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", From 243e0e7b7d3b54749ece2e879ecd7e2a11874443 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 27 Jun 2008 13:41:36 +0200 Subject: [PATCH 58/76] sched: fix mult overflow It was observed these mults can overflow. 
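The overflow is easy to reproduce in userspace; a sketch with invented values, simulating a 32-bit unsigned long via a uint32_t cast (the patch widens to u64 and uses div_u64() because 64-bit division is not native on 32-bit):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    unsigned long rem_load_move = 130000;  /* invented, but plausible */
    unsigned long busiest_weight = 40000;

    /* on a real 32-bit kernel the product itself would truncate;
     * here the uint32_t cast simulates that truncation */
    uint32_t narrow = (uint32_t)(rem_load_move * busiest_weight);
    /* widening one operand first keeps the full 64-bit product */
    uint64_t wide = (uint64_t)rem_load_move * busiest_weight;

    printf("narrow=%lu wide=%llu\n",
           (unsigned long)narrow, (unsigned long long)wide);
    return 0;
}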
Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0d197be3e3e9..26ebe180cdea 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1477,7 +1477,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; - long rem_load, moved_load; + u64 rem_load, moved_load; /* * empty group */ if (!busiest_cfs_rq->task_weight) continue; - rem_load = rem_load_move * busiest_weight; - rem_load /= busiest_h_load + 1; + rem_load = (u64)rem_load_move * busiest_weight; + rem_load = div_u64(rem_load, busiest_h_load + 1); moved_load = __load_balance_fair(this_rq, this_cpu, busiest, rem_load, sd, idle, all_pinned, this_best_prio, @@ -1496,7 +1496,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, continue; moved_load *= busiest_h_load; - moved_load /= busiest_weight + 1; + moved_load = div_u64(moved_load, busiest_weight + 1); rem_load_move -= moved_load; if (rem_load_move < 0) From 83378269a5fad98f562ebc0f09c349575e6cbfe1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:37 +0200 Subject: [PATCH 59/76] sched: correct wakeup weight calculations rw_i = {2, 4, 1, 0} s_i = {2/7, 4/7, 1/7, 0} wakeup on cpu0, weight=1 rw'_i = {3, 4, 1, 0} s'_i = {3/8, 4/8, 1/8, 0} s_0 = S * rw_0 / \Sum rw_j -> \Sum rw_j = S*rw_0/s_0 = 1*2*7/2 = 7 (correct) s'_0 = S * (rw_0 + 1) / (\Sum rw_j + 1) = 1 * (2+1) / (7+1) = 3/8 (correct) so we find that adding 1 to cpu0 gains 5/56 in weight; if, say, the other cpu were cpu1, we'd also have to calculate its 4/56 loss Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ kernel/sched_fair.c | 48 ++++++++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 62db0891025a..01d3e51b7116 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -365,6 +365,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #else static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} #endif /* CONFIG_GROUP_SCHED */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 26ebe180cdea..bed2f71e63d9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1074,10 +1074,10 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; #ifdef CONFIG_FAIR_GROUP_SCHED -static unsigned long effective_load(struct task_group *tg, long wl, int cpu) +static unsigned long effective_load(struct task_group *tg, int cpu, + unsigned long wl, unsigned long wg) { struct sched_entity *se = tg->se[cpu]; - long wg = wl; for_each_sched_entity(se) { #define D(n) (likely(n) ? (n) : 1) @@ -1092,6 +1092,13 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu) b = S*rw + s*wg; wl = s*(a-b)/D(b); + /* + * Assume the group is already running and will + * thus already be accounted for in the weight.
+ * + * That is, moving shares between CPUs, does not + * alter the group weight. + */ wg = 0; #undef D } @@ -1099,26 +1106,12 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu) return wl; } -static unsigned long task_load_sub(struct task_struct *p) -{ - return effective_load(task_group(p), -(long)p->se.load.weight, task_cpu(p)); -} - -static unsigned long task_load_add(struct task_struct *p, int cpu) -{ - return effective_load(task_group(p), p->se.load.weight, cpu); -} - #else -static unsigned long task_load_sub(struct task_struct *p) +static inline unsigned long effective_load(struct task_group *tg, int cpu, + unsigned long wl, unsigned long wg) { - return -p->se.load.weight; -} - -static unsigned long task_load_add(struct task_struct *p, int cpu) -{ - return p->se.load.weight; + return wl; } #endif @@ -1130,8 +1123,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, unsigned int imbalance) { struct task_struct *curr = this_rq->curr; + struct task_group *tg; unsigned long tl = this_load; unsigned long tl_per_task; + unsigned long weight; int balanced; if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) @@ -1142,10 +1137,19 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * effect of the currently running task from the load * of the current CPU: */ - if (sync) - tl += task_load_sub(current); + if (sync) { + tg = task_group(current); + weight = current->se.load.weight; - balanced = 100*(tl + task_load_add(p, this_cpu)) <= imbalance*load; + tl += effective_load(tg, this_cpu, -weight, -weight); + load += effective_load(tg, prev_cpu, 0, -weight); + } + + tg = task_group(p); + weight = p->se.load.weight; + + balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= + imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); /* * If the currently running task will sleep within From f1d239f73200a5803a89e5929fb3abc1596b7589 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:38 +0200 Subject: [PATCH 60/76] sched: incremental effective_load() Increase the accuracy of the effective_load values. Not only consider the current increment (as per the attempted wakeup), but also consider the delta between when we last adjusted the shares and the current situation. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ kernel/sched_fair.c | 18 +++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 01d3e51b7116..7613f69f0978 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -427,6 +427,11 @@ struct cfs_rq { * this cpu's part of tg->shares */ unsigned long shares; + + /* + * load.weight at the time we set shares + */ + unsigned long rq_weight; #endif #endif }; @@ -1527,6 +1532,7 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * record the actual number of shares, not the boosted amount. */ tg->cfs_rq[cpu]->shares = boost ? 
0 : shares; + tg->cfs_rq[cpu]->rq_weight = rq_weight; if (shares < MIN_SHARES) shares = MIN_SHARES; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bed2f71e63d9..e87f1a52f625 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1074,10 +1074,22 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; #ifdef CONFIG_FAIR_GROUP_SCHED -static unsigned long effective_load(struct task_group *tg, int cpu, - unsigned long wl, unsigned long wg) +static long effective_load(struct task_group *tg, int cpu, + long wl, long wg) { struct sched_entity *se = tg->se[cpu]; + long more_w; + + if (!tg->parent) + return wl; + + /* + * Instead of using this increment, also add the difference + * between when the shares were last updated and now. + */ + more_w = se->my_q->load.weight - se->my_q->rq_weight; + wl += more_w; + wg += more_w; for_each_sched_entity(se) { #define D(n) (likely(n) ? (n) : 1) @@ -1086,7 +1098,7 @@ static unsigned long effective_load(struct task_group *tg, int cpu, S = se->my_q->tg->shares; s = se->my_q->shares; - rw = se->my_q->load.weight; + rw = se->my_q->rq_weight; a = S*(rw + wl); b = S*rw + s*wg; From f5bfb7d9ff73d72ee4f2f4830a6f0c9088d00f92 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:39 +0200 Subject: [PATCH 61/76] sched: bias effective_load() error towards failing wake_affine(). Measurement shows that the difference between cgroup:/ and cgroup:/foo wake_affine() results is that the latter succeeds significantly more. Therefore bias the calculations towards failing the test. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 ++++++++++++++++++++++++++++ kernel/sched_features.h | 1 + 2 files changed, 29 insertions(+) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e87f1a52f625..9bcc0030a58b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1074,6 +1074,27 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * effective_load() calculates the load change as seen from the root_task_group + * + * Adding load to a group doesn't make a group heavier, but can cause movement + * of group shares between cpus. Assuming the shares were perfectly aligned one + * can calculate the shift in shares. + * + * The problem is that perfectly aligning the shares is rather expensive, hence + * we try to avoid doing that too often - see update_shares(), which ratelimits + * this change. + * + * We compensate this by not only taking the current delta into account, but + * also considering the delta between when the shares were last adjusted and + * now. + * + * We still saw a performance dip, some tracing learned us that between + * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased + * significantly. Therefore try to bias the error in direction of failing + * the affine wakeup. + * + */ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { @@ -1083,6 +1104,13 @@ static long effective_load(struct task_group *tg, int cpu, if (!tg->parent) return wl; + /* + * By not taking the decrease of shares on the other cpu into + * account our error leans towards reducing the affine wakeups. + */ + if (!wl && sched_feat(ASYM_EFF_LOAD)) + return wl; + /* * Instead of using this increment, also add the difference * between when the shares were last updated and now. 
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 7d616d2a2a3f..862b06bd560a 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -10,3 +10,4 @@ SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 0) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) +SCHED_FEAT(ASYM_EFF_LOAD, 1) From 55e12e5e7b1d7e7c05a4be10cb5fd092c039aa78 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Tue, 24 Jun 2008 23:39:43 +0530 Subject: [PATCH 62/76] sched: make sched_{rt,fair}.c ifdefs more readable Signed-off-by: Dhaval Giani Cc: Srivatsa Vaddagiri Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- kernel/sched_rt.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9bcc0030a58b..2e43d4a748c3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -921,7 +921,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) hrtick_start(rq, delta, requeue); } } -#else +#else /* !CONFIG_SCHED_HRTICK */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { @@ -1062,7 +1062,7 @@ static int wake_idle(int cpu, struct task_struct *p) } return cpu; } -#else +#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ static inline int wake_idle(int cpu, struct task_struct *p) { return cpu; @@ -1586,7 +1586,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -#endif +#endif /* CONFIG_SMP */ /* * scheduler tick hitting a task of our scheduling class: diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 765932d0399d..47ceac9e8552 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -161,7 +161,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) return &rt_rq->tg->rt_bandwidth; } -#else +#else /* !CONFIG_RT_GROUP_SCHED */ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { @@ -226,7 +226,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) return &def_rt_bandwidth; } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP static int do_balance_runtime(struct rt_rq *rt_rq) @@ -374,12 +374,12 @@ static int balance_runtime(struct rt_rq *rt_rq) return more; } -#else +#else /* !CONFIG_SMP */ static inline int balance_runtime(struct rt_rq *rt_rq) { return 0; } -#endif +#endif /* CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { @@ -1472,4 +1472,4 @@ static void print_rt_stats(struct seq_file *m, int cpu) print_rt_rq(m, cpu, rt_rq); rcu_read_unlock(); } -#endif +#endif /* CONFIG_SCHED_DEBUG */ From 4c9fe8ad813b257a2b9ddf0f752105a75a7dae63 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Jun 2008 14:49:35 +0200 Subject: [PATCH 63/76] sched: export cpu_clock the rcutorture module relies on cpu_clock. 
Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 3c696db59452..ed5a8c415046 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -256,3 +256,4 @@ unsigned long long cpu_clock(int cpu) return clock; } +EXPORT_SYMBOL_GPL(cpu_clock); From 2d452c9b10caeec455eb5e56a0ef4ed485178213 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 29 Jun 2008 15:01:59 +0200 Subject: [PATCH 64/76] sched: sched_clock_cpu() based cpu_clock(), lockdep fix Vegard Nossum reported: > WARNING: at kernel/lockdep.c:2738 check_flags+0x142/0x160() which happens due to: unsigned long long cpu_clock(int cpu) { unsigned long long clock; unsigned long flags; raw_local_irq_save(flags); as lower level functions can take locks, we must not do that, use proper lockdep-annotated irq save/restore. Reported-by: Vegard Nossum Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ed5a8c415046..60094e257a9a 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -250,9 +250,9 @@ unsigned long long cpu_clock(int cpu) unsigned long long clock; unsigned long flags; - raw_local_irq_save(flags); + local_irq_save(flags); clock = sched_clock_cpu(cpu); - raw_local_irq_restore(flags); + local_irq_restore(flags); return clock; } From 34e83e850f5e5ee2a18cd77a5d70d31972a632e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Jun 2008 15:42:36 +0200 Subject: [PATCH 65/76] sched: build fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: kernel/sched.c: In function ‘sched_group_set_shares': kernel/sched.c:8635: error: implicit declaration of function ‘cfs_rq_set_shares' Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7613f69f0978..058250a63b64 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1627,11 +1627,6 @@ static void update_h_load(int cpu) walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); } -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ - cfs_rq->shares = shares; -} - #else static inline void update_shares(struct sched_domain *sd) @@ -1646,6 +1641,13 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + cfs_rq->shares = shares; +#endif +} + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" From 30432094a7f506ad24997a3ba6aed913ab61c01d Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 27 Jun 2008 21:35:50 +0200 Subject: [PATCH 66/76] sched: fix warning This patch fixes the following warning: kernel/sched.c:1667: warning: 'cfs_rq_set_shares' defined but not used This seems the correct way to fix this; cfs_rq_set_shares() is only used in a single place, which is also inside #ifdef CONFIG_FAIR_GROUP_SCHED. 
Signed-off-by: Vegard Nossum Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 058250a63b64..677c80b9a6b5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1641,12 +1641,14 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +#ifdef CONFIG_FAIR_GROUP_SCHED static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) { -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#ifdef CONFIG_SMP cfs_rq->shares = shares; #endif } +#endif #include "sched_stats.h" #include "sched_idletask.c" From c4acb2c0669c5c5c9b28e9d02a34b5c67edf7092 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 27 Jun 2008 14:29:55 -0600 Subject: [PATCH 67/76] sched: terminate newidle balancing once at least one task has moved over Inspired by Peter Zijlstra. Signed-off-by: Gregory Haskins Cc: npiggin@suse.de Cc: rostedt@goodmis.org Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 677c80b9a6b5..d99aeabeb72f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3013,6 +3013,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, max_load_move - total_load_moved, sd, idle, all_pinned, &this_best_prio); class = class->next; + + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) + break; + } while (class && max_load_move > total_load_moved); return total_load_moved > 0; From 2087a1ad822cd3a68b73338457047fcc54da726b Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 27 Jun 2008 14:30:00 -0600 Subject: [PATCH 68/76] sched: add avg-overlap support to RT tasks We have the notion of tracking process-coupling (a.k.a. buddy-wake) via the p->se.last_wake / p->se.avg_overlap facilities, but it is only used for cfs to cfs interactions. There is no reason why an rt to cfs interaction cannot share in establishing a relationship in a similar manner. Because PREEMPT_RT runs many kernel threads as FIFO priority, we often times have heavy interaction between RT threads waking CFS applications. This patch offers a substantial boost (50-60%+) in performance under those circumstances.
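The avg_overlap bookkeeping being generalized here is an eighth-gain exponentially weighted moving average; a standalone userspace sketch of the filter with invented samples (it mirrors the update_avg() helper in the hunk below, and relies on arithmetic right shift for negative deltas, as the kernel does):

#include <stdio.h>
#include <stdint.h>

/* same filter as update_avg() below: avg += (sample - avg) / 8 */
static void update_avg(uint64_t *avg, uint64_t sample)
{
    int64_t diff = sample - *avg;
    *avg += diff >> 3;
}

int main(void)
{
    uint64_t avg = 0;
    /* invented wakeup-to-sleep overlaps, in ns */
    uint64_t samples[] = { 800000, 900000, 100000, 850000, 820000 };
    unsigned int i;

    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        update_avg(&avg, samples[i]);
        printf("sample=%7llu avg=%7llu\n",
               (unsigned long long)samples[i],
               (unsigned long long)avg);
    }
    return 0;
}

wake_affine() then pulls the woken task only when both tasks' averages sit below sysctl_sched_migration_cost, per the hunk below.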
Signed-off-by: Gregory Haskins Cc: npiggin@suse.de Cc: rostedt@goodmis.org Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++++++++ kernel/sched_fair.c | 21 ++------------------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index d99aeabeb72f..bbc40c3a0657 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1693,6 +1693,12 @@ static void set_load_weight(struct task_struct *p) p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; } +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { sched_info_queued(p); @@ -1702,6 +1708,12 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { + if (sleep && p->se.last_wakeup) { + update_avg(&p->se.avg_overlap, + p->se.sum_exec_runtime - p->se.last_wakeup); + p->se.last_wakeup = 0; + } + p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; } @@ -2313,6 +2325,8 @@ out_running: p->sched_class->task_wake_up(rq, p); #endif out: + current->se.last_wakeup = current->se.sum_exec_runtime; + task_rq_unlock(rq, &flags); return success; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2e43d4a748c3..f2aa987027d6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -726,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) __enqueue_entity(cfs_rq, se); } -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} - -static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (!se->last_wakeup) - return; - - update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); - se->last_wakeup = 0; -} - static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -751,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { - update_avg_stats(cfs_rq, se); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -1196,9 +1180,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * a reasonable amount of time then attract this newly * woken task: */ - if (sync && balanced && curr->sched_class == &fair_sched_class) { + if (sync && balanced) { if (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) + p->se.avg_overlap < sysctl_sched_migration_cost) return 1; } @@ -1359,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) return; } - se->last_wakeup = se->sum_exec_runtime; if (unlikely(se == pse)) return; From 46ac22bab42cc868b9c1d0e915ddbc8e8065a44d Mon Sep 17 00:00:00 2001 From: Ankita Garg Date: Tue, 1 Jul 2008 14:30:06 +0530 Subject: [PATCH 69/76] sched: fix accounting in task delay accounting & migration On Thu, Jun 19, 2008 at 12:27:14PM +0200, Peter Zijlstra wrote: > On Thu, 2008-06-05 at 10:50 +0530, Ankita Garg wrote: > > > Thanks Peter for the explanation... > > > > I agree with the above and that is the reason why I did not see weird > > values with cpu_time. But, run_delay still would suffer skews as the end > > points for delta could be taken on different cpus due to migration (more > > so on RT kernel due to the push-pull operations). 
With the below patch, > > I could not reproduce the issue I had seen earlier. After every dequeue, > > we take the delta and start wait measurements from zero when moved to a > > different rq. > > OK, so task delay delay accounting is broken because it doesn't take > migration into account. > > What you've done is make it symmetric wrt enqueue, and account it like > > cpu0 cpu1 > > enqueue > > dequeue > enqueue > > run > > Where you add both d1 and d2 to the run_delay,.. right? > Thanks for reviewing the patch. The above is exactly what I have done. > This seems like a good fix, however it looks like the patch will break > compilation in !CONFIG_SCHEDSTATS && !CONFIG_TASK_DELAY_ACCT, of it > failing to provide a stub for sched_info_dequeue() in that case. Fixed. Pl. find the new patch below. Signed-off-by: Ankita Garg Acked-by: Peter Zijlstra Cc: Gregory Haskins Cc: rostedt@goodmis.org Cc: suresh.b.siddha@intel.com Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dhaval@linux.vnet.ibm.com Cc: vatsa@linux.vnet.ibm.com Cc: David Bahi Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + kernel/sched_stats.h | 42 +++++++++++++++++++++++++++++++++--------- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index bbc40c3a0657..996bc15196a5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1714,6 +1714,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) p->se.last_wakeup = 0; } + sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; } diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 80179ef7450e..8385d43987e2 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -118,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.cpu_time += delta; } + +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_sched_info.run_delay += delta; +} # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) # define schedstat_set(var, val) do { var = (val); } while (0) @@ -126,6 +133,9 @@ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) {} static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{} +static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} # define schedstat_inc(rq, field) do { } while (0) @@ -134,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) #endif #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +static inline void sched_info_reset_dequeued(struct task_struct *t) +{ + t->sched_info.last_queued = 0; +} + /* * Called when a process is dequeued from the active array and given * the cpu. We should note that with the exception of interactive @@ -143,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) * active queue, thus delaying tasks in the expired queue from running; * see scheduler_tick()). * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). Even though a task may be queued and dequeued multiple - * times as it is shuffled about, we're really interested in knowing how - * long it was from the *first* time it was queued to the time that it - * finally hit a cpu. 
+ * Though we are interested in knowing how long it was from the *first* time a + * task was queued to the time that it finally hit a cpu, we call this routine + * from dequeue_task() to account for possible rq->clock skew across cpus. The + * delta taken on each cpu would annul the skew. */ static inline void sched_info_dequeued(struct task_struct *t) { - t->sched_info.last_queued = 0; + unsigned long long now = task_rq(t)->clock, delta = 0; + + if (unlikely(sched_info_on())) + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + + rq_sched_info_dequeued(task_rq(t), delta); } /* @@ -165,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t) if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; - sched_info_dequeued(t); + sched_info_reset_dequeued(t); t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; @@ -242,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) __sched_info_switch(prev, next); } #else -#define sched_info_queued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) +#define sched_info_queued(t) do { } while (0) +#define sched_info_reset_dequeued(t) do { } while (0) +#define sched_info_dequeued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ From 62c43dd9864dbd52ff158922d1d08c75f20335af Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:50 -0400 Subject: [PATCH 70/76] sched_clock: record from last tick The sched_clock code tries to keep within the gtod time by one tick (jiffy). The current code mistakenly keeps track of the delta jiffies between updates of the clock, where the delta is used to compare with the number of jiffies that have passed since an update of the gtod. The gtod is updated at each schedule tick, not at each sched_clock update. After one jiffy passes the clock is updated fine. But the delta is taken from the last update so if the next update happens before the next tick the delta jiffies used will be incorrect. This patch changes the code to check the delta of jiffies between ticks and not updates to match the comparison of the updates with the gtod.
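A userspace sketch of the difference, with invented values and HZ=1000 assumed: when a clock update has already happened inside the current jiffy, the window must still be measured from the last tick, because that is when tick_gtod was sampled.

#include <stdio.h>

#define TICK_NSEC 1000000UL  /* 1ms tick, HZ=1000, for illustration */

int main(void)
{
    unsigned long tick_jiffies = 1000;  /* jiffies at the last tick */
    unsigned long tick_gtod = 7000000;  /* gtod (ns) at the last tick */
    unsigned long now_jiffies = 1001;   /* one tick has passed */

    /* patched: delta anchored at the last tick */
    long delta_tick = now_jiffies - tick_jiffies;  /* 1 */
    /* old: an update earlier in this jiffy already reset prev_jiffies,
     * so the next update computed a delta of 0 */
    long delta_update = 0;

    printf("min_clock from tick:   %lu\n",
           tick_gtod + delta_tick * TICK_NSEC);
    printf("min_clock from update: %lu (one tick too low)\n",
           tick_gtod + delta_update * TICK_NSEC);
    return 0;
}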
Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ce05271219ab..e383bc7df6dd 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -40,7 +40,7 @@ struct sched_clock_data { */ raw_spinlock_t lock; - unsigned long prev_jiffies; + unsigned long tick_jiffies; u64 prev_raw; u64 tick_raw; u64 tick_gtod; @@ -71,7 +71,7 @@ void sched_clock_init(void) struct sched_clock_data *scd = cpu_sdc(cpu); scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - scd->prev_jiffies = now_jiffies; + scd->tick_jiffies = now_jiffies; scd->prev_raw = 0; scd->tick_raw = 0; scd->tick_gtod = ktime_now; @@ -90,7 +90,7 @@ void sched_clock_init(void) static void __update_sched_clock(struct sched_clock_data *scd, u64 now) { unsigned long now_jiffies = jiffies; - long delta_jiffies = now_jiffies - scd->prev_jiffies; + long delta_jiffies = now_jiffies - scd->tick_jiffies; u64 clock = scd->clock; u64 min_clock, max_clock; s64 delta = now - scd->prev_raw; @@ -119,7 +119,6 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) clock = min_clock; scd->prev_raw = now; - scd->prev_jiffies = now_jiffies; scd->clock = clock; } @@ -179,6 +178,7 @@ u64 sched_clock_cpu(int cpu) void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); + unsigned long now_jiffies = jiffies; u64 now, now_gtod; if (unlikely(!sched_clock_running)) @@ -196,6 +196,7 @@ void sched_clock_tick(void) * already observe 1 new jiffy; adding a new tick_gtod to that would * increase the clock 2 jiffies. */ + scd->tick_jiffies = now_jiffies; scd->tick_raw = now; scd->tick_gtod = now_gtod; __raw_spin_unlock(&scd->lock); From f7cce27f5605b9e137b829a47949cb2d3c7e1cab Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:51 -0400 Subject: [PATCH 71/76] sched_clock: widen the max and min time Keeping the max and min sched time within one jiffy of the gtod clock was too tight. Just before a schedule tick the max could easily be hit, and just after a schedule_tick the min could be hit. This caused the clock to jump around by a jiffy. This patch widens the minimum to last gtod + (delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSECS and the maximum to last gtod + (2 + delta_jiffies) * TICK_NSECS This keeps the minimum at gtod, or one jiffy less than delta jiffies, and the maximum 2 jiffies ahead of gtod. This may cause unstable TSCs to be a bit more sporadic, but it helps keep a clock with a stable TSC working well. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e383bc7df6dd..42b81fa38cbd 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -96,14 +96,21 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) s64 delta = now - scd->prev_raw; WARN_ON_ONCE(!irqs_disabled()); - min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; + + min_clock = scd->tick_gtod + + (delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSEC; if (unlikely(delta < 0)) { clock++; goto out; } - max_clock = min_clock + TICK_NSEC; + /* + * The clock must stay within a jiffie of the gtod.
+ * But since we may be at the start of a jiffy or the end of one + * we add another jiffy buffer. + */ + max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; if (unlikely(clock + delta > max_clock)) { if (clock < max_clock) From af52a90a14cdaa54ecbfb6e6982abb13466a4b56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:52 -0400 Subject: [PATCH 72/76] sched_clock: stop maximum check on NO HZ Working with ftrace I would get large jumps of 11 millisecs or more with the clock tracer. This killed the latency timings of ftrace and also caused the irqoff self tests to fail. What was happening is that with NO_HZ, idle would stop the jiffy counter, and before the jiffy counter was updated again the sched_clock would have a bad delta jiffies to use in the comparison against the gtod maximum. The jiffies would stop and the last sched_tick would record the last gtod. On wakeup, the sched clock update would take the gtod + delta jiffies (which would be zero) and compare it to the TSC. The TSC would have correctly (with a stable TSC) moved forward several jiffies. But because the jiffies has not been updated yet the clock would be prevented from moving forward because it would appear that the TSC jumped too far ahead. The clock would then virtually stop, until the jiffies are updated. Then the next sched clock update would see that the clock was very much behind since the delta jiffies is now correct. This would then jump the clock forward by several jiffies. This caused ftrace to report several milliseconds of interrupts off latency at every resume from NO_HZ idle. This patch adds hooks into the nohz code to disable the checking of the maximum clock update when nohz is in effect. It resumes the max check when nohz has updated the jiffies again.
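A minimal userspace sketch of the gating idea; check_max stands in for the new per-cpu flag, and the values are invented:

#include <stdio.h>

static int check_max = 1;  /* cleared on tick stop, set on restart */

static unsigned long long clamp_clock(unsigned long long clock,
                                      unsigned long long max_clock)
{
    /* while the tick is stopped, max_clock rests on stale jiffies,
     * so the upper clamp must be skipped */
    if (check_max && clock > max_clock)
        clock = max_clock;
    return clock;
}

int main(void)
{
    printf("tick running: %llu\n", clamp_clock(2000, 1500));  /* 1500 */
    check_max = 0;  /* NO_HZ idle entered */
    printf("tick stopped: %llu\n", clamp_clock(2000, 1500));  /* 2000 */
    return 0;
}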
Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/sched.h | 17 ++++++++++++++++- kernel/sched_clock.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/time/tick-sched.c | 2 ++ 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index c5d3f847ca8d..33a8f42041fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1573,13 +1573,28 @@ static inline void sched_clock_idle_sleep_event(void) static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } -#else + +#ifdef CONFIG_NO_HZ +static inline void sched_clock_tick_stop(int cpu) +{ +} + +static inline void sched_clock_tick_start(int cpu) +{ +} +#endif + +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ extern void sched_clock_init(void); extern u64 sched_clock_cpu(int cpu); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); +#ifdef CONFIG_NO_HZ +extern void sched_clock_tick_stop(int cpu); +extern void sched_clock_tick_start(int cpu); #endif +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 42b81fa38cbd..97159e225a77 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -45,6 +45,9 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; +#ifdef CONFIG_NO_HZ + int check_max; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); @@ -76,11 +79,45 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; +#ifdef CONFIG_NO_HZ + scd->check_max = 1; +#endif } sched_clock_running = 1; } +#ifdef CONFIG_NO_HZ +/* + * The dynamic ticks makes the delta jiffies inaccurate. This + * prevents us from checking the maximum time update. + * Disable the maximum check during stopped ticks. 
+ */ +void sched_clock_tick_stop(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 0; +} + +void sched_clock_tick_start(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 1; +} + +static int check_max(struct sched_clock_data *scd) +{ + return scd->check_max; +} +#else +static int check_max(struct sched_clock_data *scd) +{ + return 1; +} +#endif /* CONFIG_NO_HZ */ + /* * update the percpu scd from the raw @now value * @@ -112,7 +149,7 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) */ max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; - if (unlikely(clock + delta > max_clock)) { + if (unlikely(clock + delta > max_clock) && check_max(scd)) { if (clock < max_clock) clock = max_clock; else diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b854a895591e..d63008b09a4c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -276,6 +276,7 @@ void tick_nohz_stop_sched_tick(void) ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); + sched_clock_tick_stop(cpu); } /* @@ -375,6 +376,7 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); + sched_clock_tick_start(cpu); cpu_clear(cpu, nohz_cpu_mask); /* From 2b8a0cf4890d7537a77b51caa8f508e4a05a0e67 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 19:49:41 -0400 Subject: [PATCH 73/76] sched_clock: fix calculation of other CPU The algorithm to calculate the 'now' of another CPU is not correct. At each scheduler tick, each CPU records the last sched_clock and gtod (tick_raw and tick_gtod respectively). If the TSC is somewhat the same in speed between two clocks the algorithm would be: tick_gtod1 + (now1 - tick_raw1) = tick_gtod2 + (now2 - tick_raw2) To calculate now2 we would have: now2 = (tick_gtod1 - tick_gtod2) + (tick_raw2 - tick_raw1) + now1 Currently the algorithm is: now2 = (tick_gtod1 - tick_gtod2) + (tick_raw1 - tick_raw2) + now1 This solves most of the rest of the issues I've had with timestamps in ftrace. Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: john stultz Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 97159e225a77..55fca1e9e12a 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -203,8 +203,8 @@ u64 sched_clock_cpu(int cpu) now -= my_scd->tick_raw; now += scd->tick_raw; - now -= my_scd->tick_gtod; - now += scd->tick_gtod; + now += my_scd->tick_gtod; + now -= scd->tick_gtod; __raw_spin_unlock(&my_scd->lock); } else { From c0c87734f125d2fa8ebc70310f3257fa6209f2b6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jul 2008 00:15:31 -0400 Subject: [PATCH 74/76] sched_clock: only update deltas with local reads. Reading the CPU clock should try to stay accurate within the CPU. Reading the CPU clock from another CPU and updating the deltas can cause unneeded jumps when reading from the local CPU. This patch changes the code to update the last read TSC only when read from the local CPU.
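A sketch of the split this introduces (userspace; scd stands in for the per-cpu sched_clock_data, min/max clamping elided):

#include <stdio.h>

struct scd { unsigned long long prev_raw, clock; };

static void update_clock(struct scd *scd, unsigned long long now,
                         unsigned long long *time)
{
    unsigned long long clock = now;  /* clamping elided for brevity */

    if (time)
        *time = clock;           /* remote read: report only */
    else {
        scd->prev_raw = now;     /* local read: advance own state */
        scd->clock = clock;
    }
}

int main(void)
{
    struct scd scd = { 0, 0 };
    unsigned long long t;

    update_clock(&scd, 1000, &t);    /* cross-cpu read */
    printf("remote saw %llu, scd.clock still %llu\n", t, scd.clock);
    update_clock(&scd, 2000, NULL);  /* local read */
    printf("local advanced scd.clock to %llu\n", scd.clock);
    return 0;
}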
Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: john stultz Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 55fca1e9e12a..ee7cce5029ce 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -124,7 +124,7 @@ static int check_max(struct sched_clock_data *scd) * - filter out backward motion * - use jiffies to generate a min,max window to clip the raw values */ -static void __update_sched_clock(struct sched_clock_data *scd, u64 now) +static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) { unsigned long now_jiffies = jiffies; long delta_jiffies = now_jiffies - scd->tick_jiffies; @@ -162,8 +162,12 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) if (unlikely(clock < min_clock)) clock = min_clock; - scd->prev_raw = now; - scd->clock = clock; + if (time) + *time = clock; + else { + scd->prev_raw = now; + scd->clock = clock; + } } static void lock_double_clock(struct sched_clock_data *data1, @@ -207,15 +211,18 @@ u64 sched_clock_cpu(int cpu) now -= scd->tick_gtod; __raw_spin_unlock(&my_scd->lock); + + __update_sched_clock(scd, now, &clock); + + __raw_spin_unlock(&scd->lock); + } else { __raw_spin_lock(&scd->lock); + __update_sched_clock(scd, now, NULL); + clock = scd->clock; + __raw_spin_unlock(&scd->lock); } - __update_sched_clock(scd, now); - clock = scd->clock; - - __raw_spin_unlock(&scd->lock); - return clock; } @@ -234,7 +241,7 @@ void sched_clock_tick(void) now_gtod = ktime_to_ns(ktime_get()); __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now); + __update_sched_clock(scd, now, NULL); /* * update tick_gtod after __update_sched_clock() because that will * already observe 1 new jiffy; adding a new tick_gtod to that would From a83bc47c33ab182f1e48977fd5a04024d713c75e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jul 2008 00:15:32 -0400 Subject: [PATCH 75/76] sched_clock: record TSC after gtod To read the gtod we need to grab the xtime lock for read. Reading the gtod before the TSC can cause a bigger gap if the xtime lock is contended. This patch simply reverses the order to read the TSC after the gtod. The locking in the reading of the gtod handles any barriers one might think are needed. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: john stultz Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ee7cce5029ce..28ff6bf5e02b 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -237,8 +237,8 @@ void sched_clock_tick(void) WARN_ON_ONCE(!irqs_disabled()); - now = sched_clock(); now_gtod = ktime_to_ns(ktime_get()); + now = sched_clock(); __raw_spin_lock(&scd->lock); __update_sched_clock(scd, now, NULL); From c300ba252829e9325e08f0af60687add94445b25 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jul 2008 00:15:33 -0400 Subject: [PATCH 76/76] sched_clock: add multiplier for TSC to gtod drift The sched_clock code currently tries to keep all CPU clocks of all CPUs somewhat in sync. At every clock tick it records the gtod clock and uses that and jiffies and the TSC to calculate a CPU clock that tries to stay in sync with all the other CPUs. ftrace depends heavily on this timer and it detects when this timer "jumps".
One problem is that the TSC and the gtod also drift. When the TSC is 0.1% faster or slower than the gtod it is very noticeable in ftrace. To help compensate for this, I've added a multiplier that tries to keep the CPU clock updating at the same rate as the gtod. I've tried various ways to get it to be in sync and this ended up being the most reliable. At every scheduler tick we calculate the new multiplier: multi = delta_gtod / delta_TSC This means we perform a 64 bit divide at the tick (once a HZ). A shift is used to handle the accuracy. Other methods that failed due to dynamic HZ are: (not used) multi += (gtod - tsc) / delta_gtod (not used) multi += (gtod - (last_tsc + delta_tsc)) / delta_gtod as well as other variants. This code still allows for a slight drift between TSC and gtod, but it keeps the damage down to a minimum. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: john stultz Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 28ff6bf5e02b..8affbfd0cdb0 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -3,6 +3,9 @@ * * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * + * Updates and enhancements: + * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt + * * Based on code by: * Ingo Molnar * Guillaume Chazarain @@ -32,6 +35,11 @@ #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +#define MULTI_SHIFT 15 +/* Max is double, Min is 1/2 */ +#define MAX_MULTI (2LL << MULTI_SHIFT) +#define MIN_MULTI (1LL << (MULTI_SHIFT-1)) + struct sched_clock_data { /* * Raw spinlock - this is a special case: this might be called @@ -45,6 +53,7 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; + s64 multi; #ifdef CONFIG_NO_HZ int check_max; #endif @@ -79,6 +88,7 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; + scd->multi = 1 << MULTI_SHIFT; #ifdef CONFIG_NO_HZ scd->check_max = 1; #endif @@ -134,8 +144,13 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim WARN_ON_ONCE(!irqs_disabled()); - min_clock = scd->tick_gtod + - (delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSEC; + /* + * At schedule tick the clock can be just under the gtod. We don't + * want to push it too prematurely. + */ + min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC); + if (min_clock > TICK_NSEC) + min_clock -= TICK_NSEC / 2; if (unlikely(delta < 0)) { clock++; @@ -149,6 +164,9 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim */ max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; + delta *= scd->multi; + delta >>= MULTI_SHIFT; + if (unlikely(clock + delta > max_clock) && check_max(scd)) { if (clock < max_clock) clock = max_clock; @@ -230,6 +248,7 @@ void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); unsigned long now_jiffies = jiffies; + s64 mult, delta_gtod, delta_raw; u64 now, now_gtod; if (unlikely(!sched_clock_running)) @@ -247,9 +266,23 @@ void sched_clock_tick(void) * already observe 1 new jiffy; adding a new tick_gtod to that would * increase the clock 2 jiffies. 
*/ - scd->tick_jiffies = now_jiffies; + delta_gtod = now_gtod - scd->tick_gtod; + delta_raw = now - scd->tick_raw; + + if ((long)delta_raw > 0) { + mult = delta_gtod << MULTI_SHIFT; + do_div(mult, delta_raw); + scd->multi = mult; + if (scd->multi > MAX_MULTI) + scd->multi = MAX_MULTI; + else if (scd->multi < MIN_MULTI) + scd->multi = MIN_MULTI; + } else + scd->multi = 1 << MULTI_SHIFT; + scd->tick_raw = now; scd->tick_gtod = now_gtod; + scd->tick_jiffies = now_jiffies; __raw_spin_unlock(&scd->lock); } @@ -279,6 +312,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) __raw_spin_lock(&scd->lock); scd->prev_raw = now; scd->clock += delta_ns; + scd->multi = 1 << MULTI_SHIFT; __raw_spin_unlock(&scd->lock); touch_softlockup_watchdog();
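To make the fixed-point correction concrete, a standalone userspace sketch with an invented 0.1% TSC-vs-gtod drift; MULTI_SHIFT and the clamp bounds mirror the hunks above, everything else is made up:

#include <stdio.h>
#include <stdint.h>

#define MULTI_SHIFT 15
#define MAX_MULTI (2LL << MULTI_SHIFT)        /* x2.0 */
#define MIN_MULTI (1LL << (MULTI_SHIFT - 1))  /* x0.5 */

int main(void)
{
    /* per-tick deltas in ns: the TSC ran 0.1% fast (invented) */
    int64_t delta_gtod = 1000000, delta_raw = 1001000;

    /* multi = delta_gtod / delta_raw as a fixed-point ratio */
    int64_t multi = (delta_gtod << MULTI_SHIFT) / delta_raw;
    if (multi > MAX_MULTI)
        multi = MAX_MULTI;
    else if (multi < MIN_MULTI)
        multi = MIN_MULTI;

    /* a raw TSC delta of 500000ns shrinks back toward gtod speed */
    int64_t delta = (500000 * multi) >> MULTI_SHIFT;

    printf("multi=%lld (1.0 == %d), corrected delta=%lld\n",
           (long long)multi, 1 << MULTI_SHIFT, (long long)delta);
    return 0;
}

The clamps keep a wildly wrong ratio (for example across a tickless gap) from distorting the clock by more than a factor of two in either direction.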