From 39e24d8ffbb6aa90222527dbd5991ec49fbbf323 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sat, 23 Nov 2013 18:38:01 +0900 Subject: [PATCH 01/60] sched: Fix a trivial syntax misuse Use if statement instead of while loop. Signed-off-by: Shigeru Yoshida Cc: Peter Zijlstra Cc: Jiri Kosina Link: http://lkml.kernel.org/r/20131123.183801.769652906919404319.shigeru.yoshida@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1808606ee5f..687985b0284e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3654,7 +3654,7 @@ again: } double_rq_lock(rq, p_rq); - while (task_rq(p) != p_rq) { + if (task_rq(p) != p_rq) { double_rq_unlock(rq, p_rq); goto again; } From bb8cbbfee68518796df4050868e5b0f5ad078f9f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 13 Nov 2013 15:36:12 +0100 Subject: [PATCH 02/60] tasks/fork: Remove unnecessary child->exit_state A zombie task obviously can't fork(), remove the unnecessary initialization of child->exit_state. It is zero anyway after dup_task_struct(). Note: copy_process() is huge and it has a lot of chaotic initializations, probably it makes sense to move them into the new helper called by dup_task_struct(). Signed-off-by: Oleg Nesterov Cc: David Laight Cc: Geert Uytterhoeven Cc: Tejun Heo Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131113143612.GA10540@redhat.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 728d5be9548c..b3080823a24d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1402,13 +1402,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->tgid = p->pid; } - p->pdeath_signal = 0; - p->exit_state = 0; - p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; + p->pdeath_signal = 0; INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; From 86506a99a62400e9f7b7d1344bcc9ea235faf98f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 13 Nov 2013 15:36:14 +0100 Subject: [PATCH 03/60] tasks/exit: Remove unused task_is_dead() method task_is_dead() has no users since commit 43e13cc107cf ("cred: remove task_is_dead() from __task_cred() validation"), and nobody except exit.c should rely on ->exit_state (we still have the users which should be changed). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: David Laight Cc: Geert Uytterhoeven Cc: Tejun Heo Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20131113143614.GA10547@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 7e35d4b9e14a..cda8d634bc47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -168,7 +168,6 @@ extern char ___assert_task_state[1 - 2*!!( #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) -#define task_is_dead(task) ((task)->exit_state != 0) #define task_is_stopped_or_traced(task) \ ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ From 192301e70af3f6803c6354a464ebfa742da738ae Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 13 Nov 2013 16:45:38 +0100 Subject: [PATCH 04/60] sched: Check TASK_DEAD rather than EXIT_DEAD in schedule_debug() schedule_debug() ignores in_atomic() if prev->exit_state != 0. This is not what we want, ->exit_state is set by exit_notify() but we should complain until the task does the last schedule() in TASK_DEAD. See also 7407251a0e2e "PF_DEAD cleanup", I think this ancient commit explains why schedule() had to rely on ->exit_state, until that commit exit_notify() disabled preemption and set PF_DEAD which was used to detect the exiting task. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: David Laight Cc: Geert Uytterhoeven Cc: Tejun Heo Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20131113154538.GB15810@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 687985b0284e..19db8f3b0e3b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2414,10 +2414,10 @@ static inline void schedule_debug(struct task_struct *prev) { /* * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path for now. - * Otherwise, whine if we are scheduling when we should not be. + * schedule() atomically, we ignore that path. Otherwise whine + * if we are scheduling when we should not. */ - if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) + if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) __schedule_bug(prev); rcu_sleep_check(); From c44f2a020072d75d6b0cbf9f139a09719cda9367 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2013 13:52:21 +0200 Subject: [PATCH 05/60] sched/fair: Move load idx selection in find_idlest_group load_idx is used in find_idlest_group but initialized in select_task_rq_fair even when not used. The load_idx initialisation is moved in find_idlest_group and the sd_flag replaces it in the function's args. Signed-off-by: Vincent Guittot Cc: len.brown@intel.com Cc: amit.kucheria@linaro.org Cc: pjt@google.com Cc: l.majewski@samsung.com Cc: Morten.Rasmussen@arm.com Cc: cmetcalf@tilera.com Cc: tony.luck@intel.com Cc: alex.shi@intel.com Cc: preeti@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: rjw@sisk.pl Cc: paulmck@linux.vnet.ibm.com Cc: corbet@lwn.net Cc: arjan@linux.intel.com Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1382097147-30088-8-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e8b652ebe027..6cb36c7ea391 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4110,12 +4110,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int load_idx) + int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + if (sd_flag & SD_BALANCE_WAKE) + load_idx = sd->wake_idx; + do { unsigned long load, avg_load; int local_group; @@ -4283,7 +4287,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } while (sd) { - int load_idx = sd->forkexec_idx; struct sched_group *group; int weight; @@ -4292,10 +4295,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f continue; } - if (sd_flag & SD_BALANCE_WAKE) - load_idx = sd->wake_idx; - - group = find_idlest_group(sd, p, cpu, load_idx); + group = find_idlest_group(sd, p, cpu, sd_flag); if (!group) { sd = sd->child; continue; From 380c9077b38df2962a22f00f21f6cd0db62d3390 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Fri, 15 Nov 2013 15:06:52 +0530 Subject: [PATCH 06/60] sched/fair: Clean up update_sg_lb_stats() a bit Add rq->nr_running to sgs->sum_nr_running directly instead of assigning it through an intermediate variable nr_running. Signed-off-by: Kamalesh Babulal Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1384508212-25032-1-git-send-email-kamalesh@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6cb36c7ea391..a566c0739f77 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5500,7 +5500,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs) { - unsigned long nr_running; unsigned long load; int i; @@ -5509,8 +5508,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); - nr_running = rq->nr_running; - /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -5518,7 +5515,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->sum_nr_running += nr_running; + sgs->sum_nr_running += rq->nr_running; #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; From e6c390f2dfd04c165ce45b0032f73fba85b1f282 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 7 Nov 2013 14:43:35 +0100 Subject: [PATCH 07/60] sched: Add sched_class->task_dead() method Add a new function to the scheduling class interface. It is called at the end of a context switch, if the prev task is in TASK_DEAD state. It will be useful for the scheduling classes that want to be notified when one of their tasks dies, e.g. to perform some cleanup actions, such as SCHED_DEADLINE. Signed-off-by: Dario Faggioli Reviewed-by: Paul Turner Signed-off-by: Juri Lelli Cc: bruce.ashfield@windriver.com Cc: claudio@evidence.eu.com Cc: darren@dvhart.com Cc: dhaval.giani@gmail.com Cc: fchecconi@gmail.com Cc: fweisbec@gmail.com Cc: harald.gustafsson@ericsson.com Cc: hgu1972@gmail.com Cc: insop.song@gmail.com Cc: jkacur@redhat.com Cc: johan.eker@ericsson.com Cc: liming.wang@windriver.com Cc: luca.abeni@unitn.it Cc: michael@amarulasolutions.com Cc: nicola.manica@disi.unitn.it Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: p.faure@akatech.ch Cc: rostedt@goodmis.org Cc: tommaso.cucinotta@sssup.it Cc: vincent.guittot@linaro.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-2-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ kernel/sched/sched.h | 1 + 2 files changed, 4 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 19db8f3b0e3b..25b377986547 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2003,6 +2003,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (unlikely(prev_state == TASK_DEAD)) { task_numa_free(prev); + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); + /* * Remove function-return probe instances associated with this * task and put them back on the free list. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 88c85b21d633..b3b4a4953efc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1023,6 +1023,7 @@ struct sched_class { void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); void (*task_fork) (struct task_struct *p); + void (*task_dead) (struct task_struct *p); void (*switched_from) (struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); From 40ea2b42d7c44386cf81d5636d574193da2c8df2 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 5 Dec 2013 19:10:17 +0800 Subject: [PATCH 08/60] sched/numa: Drop idx field of task_numa_env struct Drop unused idx field of task_numa_env struct. Signed-off-by: Wanpeng Li Reviewed-by: Rik van Riel Cc: Peter Zijlstra Cc: Mel Gorman Cc: Naoya Horiguchi Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1386241817-5051-2-git-send-email-liwanp@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a566c0739f77..49aa01f9d4fd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1037,7 +1037,7 @@ struct task_numa_env { struct numa_stats src_stats, dst_stats; - int imbalance_pct, idx; + int imbalance_pct; struct task_struct *best_task; long best_imp; From 1bd53a7efdc988163ec4c25f656df38dbe500632 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 12 Dec 2013 15:23:23 +0800 Subject: [PATCH 09/60] sched/numa: Drop sysctl_numa_balancing_settle_count sysctl commit 887c290e (sched/numa: Decide whether to favour task or group weights based on swap candidate relationships) drop the check against sysctl_numa_balancing_settle_count, this patch remove the sysctl. Signed-off-by: Wanpeng Li Acked-by: Mel Gorman Reviewed-by: Rik van Riel Acked-by: David Rientjes Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Naoya Horiguchi Link: http://lkml.kernel.org/r/1386833006-6600-1-git-send-email-liwanp@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/sysctl/kernel.txt | 5 ----- include/linux/sched/sysctl.h | 1 - kernel/sched/fair.c | 9 --------- kernel/sysctl.c | 7 ------- 4 files changed, 22 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 26b7ee491df8..6d486404200e 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -428,11 +428,6 @@ rate for each task. numa_balancing_scan_size_mb is how many megabytes worth of pages are scanned for a given scan. -numa_balancing_settle_count is how many scan periods must complete before -the schedule balancer stops pushing the task towards a preferred node. This -gives the scheduler a chance to place the task on an alternative node if the -preferred node is overloaded. - numa_balancing_migrate_deferred is how many page migrations get skipped unconditionally, after a page migration is skipped because a page is shared with other tasks. This reduces page migration overhead, and determines diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 41467f8ff8ec..31e0193cb0c5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -48,7 +48,6 @@ extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; -extern unsigned int sysctl_numa_balancing_settle_count; #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_migration_cost; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a9185f7c9446..fcb6c17f7d79 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p) return max(smin, smax); } -/* - * Once a preferred node is selected the scheduler balancer will prefer moving - * a task to that node for sysctl_numa_balancing_settle_count number of PTE - * scans. This will give the process the chance to accumulate more faults on - * the preferred node but still allow the scheduler to move the task again if - * the nodes CPUs are overloaded. - */ -unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; - static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { rq->nr_numa_running += (p->numa_preferred_nid != -1); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34a604726d0b..c8da99f905cf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -384,13 +384,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "numa_balancing_settle_count", - .data = &sysctl_numa_balancing_settle_count, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "numa_balancing_migrate_deferred", .data = &sysctl_numa_balancing_migrate_deferred, From de1b301a19754778ddd9f908d266ffe1c010b2cf Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 12 Dec 2013 15:23:24 +0800 Subject: [PATCH 10/60] sched/numa: Use wrapper function task_node to get node which task is on Use wrapper function task_node to get node which task is on. Signed-off-by: Wanpeng Li Acked-by: Mel Gorman Reviewed-by: Naoya Horiguchi Reviewed-by: Rik van Riel Acked-by: David Rientjes Signed-off-by: Peter Zijlstra Cc: Andrew Morton Link: http://lkml.kernel.org/r/1386833006-6600-2-git-send-email-liwanp@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 +- kernel/sched/fair.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5c34d1817e8f..374fe04a5e6e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif #ifdef CONFIG_NUMA_BALANCING - SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); + SEQ_printf(m, " %d", task_node(p)); #endif #ifdef CONFIG_CGROUP_SCHED SEQ_printf(m, " %s", task_group_path(task_group(p))); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fcb6c17f7d79..ebdb08bf2ec2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1202,7 +1202,7 @@ static int task_numa_migrate(struct task_struct *p) * elsewhere, so there is no point in (re)trying. */ if (unlikely(!sd)) { - p->numa_preferred_nid = cpu_to_node(task_cpu(p)); + p->numa_preferred_nid = task_node(p); return -EINVAL; } @@ -1269,7 +1269,7 @@ static void numa_migrate_preferred(struct task_struct *p) p->numa_migrate_retry = jiffies + HZ; /* Success if task is already running on preferred CPU */ - if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) + if (task_node(p) == p->numa_preferred_nid) return; /* Otherwise, try migrate to a CPU on the preferred node */ From 82897b4fd3920ffd2456731d4f2ae1406558ef4c Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 12 Dec 2013 15:23:25 +0800 Subject: [PATCH 11/60] sched/numa: Use wrapper function task_faults_idx to calculate index in group_faults Use wrapper function task_faults_idx to calculate index in group_faults. Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Acked-by: Mel Gorman Acked-by: David Rientjes Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Rik van Riel Link: http://lkml.kernel.org/r/1386833006-6600-3-git-send-email-liwanp@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ebdb08bf2ec2..37892d73f1e7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -921,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) if (!p->numa_group) return 0; - return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; + return p->numa_group->faults[task_faults_idx(nid, 0)] + + p->numa_group->faults[task_faults_idx(nid, 1)]; } /* From e777b63bbd589248eb151a3191ee92331a701385 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 12 Dec 2013 15:23:26 +0800 Subject: [PATCH 12/60] sched/numa: Fix period_slot recalculation The original code is as intended and was meant to scale the difference between the NUMA_PERIOD_THRESHOLD and local/remote ratio when adjusting the scan period. The period_slot recalculation can be dropped. Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Acked-by: Mel Gorman Acked-by: David Rientjes Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Rik van Riel Link: http://lkml.kernel.org/r/1386833006-6600-4-git-send-email-liwanp@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 37892d73f1e7..4316af287f60 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1342,7 +1342,6 @@ static void update_task_scan_period(struct task_struct *p, * scanning faster if shared accesses dominate as it may * simply bounce migrations uselessly */ - period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); diff = (diff * ratio) / NUMA_PERIOD_SLOTS; } From 16824255394f55adf31b9a96a9965d8c15bdac4c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Dec 2013 15:08:36 +0100 Subject: [PATCH 13/60] x86, acpi, idle: Restructure the mwait idle routines People seem to delight in writing wrong and broken mwait idle routines; collapse the lot. This leaves mwait_play_dead() the sole remaining user of __mwait() and new __mwait() users are probably doing it wrong. Also remove __sti_mwait() as its unused. Cc: Arjan van de Ven Cc: Jacob Jun Pan Cc: Mike Galbraith Cc: Len Brown Cc: Rui Zhang Acked-by: Rafael Wysocki Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131212141654.616820819@infradead.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mwait.h | 40 ++++++++++++++++++++++++++++++ arch/x86/include/asm/processor.h | 23 ----------------- arch/x86/kernel/acpi/cstate.c | 23 ----------------- drivers/acpi/acpi_pad.c | 5 +--- drivers/acpi/processor_idle.c | 15 ----------- drivers/idle/intel_idle.c | 11 +------- drivers/thermal/intel_powerclamp.c | 4 +-- 7 files changed, 43 insertions(+), 78 deletions(-) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 2f366d0ac6b4..361b02ef128c 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_MWAIT_H #define _ASM_X86_MWAIT_H +#include + #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 @@ -13,4 +15,42 @@ #define MWAIT_ECX_INTERRUPT_BREAK 0x1 +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + * + * New with Core Duo processors, MWAIT can take some hints based on CPU + * capability. + */ +static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +{ + if (!current_set_polling_and_test()) { + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)¤t_thread_info()->flags); + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + if (!need_resched()) + __mwait(eax, ecx); + } + __current_clr_polling(); +} + #endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7b034a4057f9..24821f5768bc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -700,29 +700,6 @@ static inline void sync_core(void) #endif } -static inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) -{ - /* "monitor %eax, %ecx, %edx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc8;" - :: "a" (eax), "c" (ecx), "d"(edx)); -} - -static inline void __mwait(unsigned long eax, unsigned long ecx) -{ - /* "mwait %eax, %ecx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); -} - -static inline void __sti_mwait(unsigned long eax, unsigned long ecx) -{ - trace_hardirqs_on(); - /* "mwait %eax, %ecx;" */ - asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); -} - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void init_amd_e400_c1e_mask(void); diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index d2b7f27781bc..e69182fd01cf 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); -/* - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, - * which can obviate IPI to trigger checking of need_resched. - * We execute MONITOR against need_resched and enter optimized wait state - * through MWAIT. Whenever someone changes need_resched, we would be woken - * up from MWAIT (without an IPI). - * - * New with Core Duo processors, MWAIT can take some hints based on CPU - * capability. - */ -void mwait_idle_with_hints(unsigned long ax, unsigned long cx) -{ - if (!need_resched()) { - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) - clflush((void *)¤t_thread_info()->flags); - - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __mwait(ax, cx); - } -} - void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index fc6008fbce35..509452a62f96 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -193,10 +193,7 @@ static int power_saving_thread(void *data) CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); stop_critical_timings(); - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __mwait(power_saving_mwait_eax, 1); + mwait_idle_with_hints(power_saving_mwait_eax, 1); start_critical_timings(); if (lapic_marked_unstable) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 644516d9bde6..f90c56c8379e 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -727,11 +727,6 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, if (unlikely(!pr)) return -EINVAL; - if (cx->entry_method == ACPI_CSTATE_FFH) { - if (current_set_polling_and_test()) - return -EINVAL; - } - lapic_timer_state_broadcast(pr, cx, 1); acpi_idle_do_entry(cx); @@ -785,11 +780,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, if (unlikely(!pr)) return -EINVAL; - if (cx->entry_method == ACPI_CSTATE_FFH) { - if (current_set_polling_and_test()) - return -EINVAL; - } - /* * Must be done before busmaster disable as we might need to * access HPET ! @@ -841,11 +831,6 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, } } - if (cx->entry_method == ACPI_CSTATE_FFH) { - if (current_set_polling_and_test()) - return -EINVAL; - } - acpi_unlazy_tlb(smp_processor_id()); /* Tell the scheduler that we are going deep-idle: */ diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index f80b700f821c..efec4055fd5e 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -375,16 +375,7 @@ static int intel_idle(struct cpuidle_device *dev, if (!(lapic_timer_reliable_states & (1 << (cstate)))) clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); - if (!current_set_polling_and_test()) { - - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) - clflush((void *)¤t_thread_info()->flags); - - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __mwait(eax, ecx); - } + mwait_idle_with_hints(eax, ecx); if (!(lapic_timer_reliable_states & (1 << (cstate)))) clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index 8f181b3f842b..e8275f2df9af 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -438,9 +438,7 @@ static int clamp_thread(void *arg) */ local_touch_nmi(); stop_critical_timings(); - __monitor((void *)¤t_thread_info()->flags, 0, 0); - cpu_relax(); /* allow HT sibling to run */ - __mwait(eax, ecx); + mwait_idle_with_hints(eax, ecx); start_critical_timings(); atomic_inc(&idle_wakeup_counter); } From 7e98b71920464b8d15fa95c74366416cd3c88861 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 19 Dec 2013 11:58:16 -0800 Subject: [PATCH 14/60] x86, idle: Use static_cpu_has() for CLFLUSH workaround, add barriers Use static_cpu_has() to conditionalize the CLFLUSH workaround, and add memory barriers around it since the documentation is explicit that CLFLUSH is only ordered with respect to MFENCE. Signed-off-by: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Len Brown Link: http://lkml.kernel.org/r/CA%2B55aFzGxcML7j8CEvQPYzh0W81uVoAAVmGctMOUZ7CZ1yYd2A@mail.gmail.com --- arch/x86/include/asm/mwait.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 361b02ef128c..19b71c439256 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -43,8 +43,11 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { if (!current_set_polling_and_test()) { - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) + if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) { + mb(); clflush((void *)¤t_thread_info()->flags); + mb(); + } __monitor((void *)¤t_thread_info()->flags, 0, 0); if (!need_resched()) From 7d590cca7cd2cce4ed7c47d221d6f90566653ba8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 19 Dec 2013 12:30:03 -0800 Subject: [PATCH 15/60] x86, idle: Add memory barriers around clflush in mwait_play_dead() For consistency with mwait_idle_with_hints(). Not sure they help, but they really won't hurt... Signed-off-by: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Len Brown Link: http://lkml.kernel.org/r/CA%2B55aFzGxcML7j8CEvQPYzh0W81uVoAAVmGctMOUZ7CZ1yYd2A@mail.gmail.com --- arch/x86/kernel/smpboot.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 85dc05a3aa02..f5252c4eec8c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1417,7 +1417,9 @@ static inline void mwait_play_dead(void) * The WBINVD is insufficient due to the spurious-wakeup * case where we return around the loop. */ + mb(); clflush(mwait_ptr); + mb(); __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); From d50dde5a10f305253cbc3855307f608f8a3c5f73 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 7 Nov 2013 14:43:36 +0100 Subject: [PATCH 16/60] sched: Add new scheduler syscalls to support an extended scheduling parameters ABI Add the syscalls needed for supporting scheduling algorithms with extended scheduling parameters (e.g., SCHED_DEADLINE). In general, it makes possible to specify a periodic/sporadic task, that executes for a given amount of runtime at each instance, and is scheduled according to the urgency of their own timing constraints, i.e.: - a (maximum/typical) instance execution time, - a minimum interval between consecutive instances, - a time constraint by which each instance must be completed. Thus, both the data structure that holds the scheduling parameters of the tasks and the system calls dealing with it must be extended. Unfortunately, modifying the existing struct sched_param would break the ABI and result in potentially serious compatibility issues with legacy binaries. For these reasons, this patch: - defines the new struct sched_attr, containing all the fields that are necessary for specifying a task in the computational model described above; - defines and implements the new scheduling related syscalls that manipulate it, i.e., sched_setattr() and sched_getattr(). Syscalls are introduced for x86 (32 and 64 bits) and ARM only, as a proof of concept and for developing and testing purposes. Making them available on other architectures is straightforward. Since no "user" for these new parameters is introduced in this patch, the implementation of the new system calls is just identical to their already existing counterpart. Future patches that implement scheduling policies able to exploit the new data structure must also take care of modifying the sched_*attr() calls accordingly with their own purposes. Signed-off-by: Dario Faggioli [ Rewrote to use sched_attr. ] Signed-off-by: Juri Lelli [ Removed sched_setscheduler2() for now. ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- arch/arm/include/asm/unistd.h | 2 +- arch/arm/include/uapi/asm/unistd.h | 2 + arch/arm/kernel/calls.S | 2 + arch/x86/syscalls/syscall_32.tbl | 2 + arch/x86/syscalls/syscall_64.tbl | 2 + include/linux/sched.h | 62 +++++++ include/linux/syscalls.h | 6 + kernel/sched/core.c | 263 ++++++++++++++++++++++++++--- kernel/sched/sched.h | 9 +- 9 files changed, 326 insertions(+), 24 deletions(-) diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index 141baa3f9a72..acabef1a75df 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -15,7 +15,7 @@ #include -#define __NR_syscalls (380) +#define __NR_syscalls (384) #define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0) #define __ARCH_WANT_STAT64 diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h index af33b44990ed..fb5584d0cc05 100644 --- a/arch/arm/include/uapi/asm/unistd.h +++ b/arch/arm/include/uapi/asm/unistd.h @@ -406,6 +406,8 @@ #define __NR_process_vm_writev (__NR_SYSCALL_BASE+377) #define __NR_kcmp (__NR_SYSCALL_BASE+378) #define __NR_finit_module (__NR_SYSCALL_BASE+379) +#define __NR_sched_setattr (__NR_SYSCALL_BASE+380) +#define __NR_sched_getattr (__NR_SYSCALL_BASE+381) /* * This may need to be greater than __NR_last_syscall+1 in order to diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index c6ca7e376773..166e945de832 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -389,6 +389,8 @@ CALL(sys_process_vm_writev) CALL(sys_kcmp) CALL(sys_finit_module) +/* 380 */ CALL(sys_sched_setattr) + CALL(sys_sched_getattr) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls #define syscalls_counted diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index aabfb8380a1c..96bc506ac6de 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -357,3 +357,5 @@ 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev 349 i386 kcmp sys_kcmp 350 i386 finit_module sys_finit_module +351 i386 sched_setattr sys_sched_setattr +352 i386 sched_getattr sys_sched_getattr diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 38ae65dfd14f..a12bddc7ccea 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -320,6 +320,8 @@ 311 64 process_vm_writev sys_process_vm_writev 312 common kcmp sys_kcmp 313 common finit_module sys_finit_module +314 common sched_setattr sys_sched_setattr +315 common sched_getattr sys_sched_getattr # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/sched.h b/include/linux/sched.h index 3a1e9857b393..86025b6c6387 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -56,6 +56,66 @@ struct sched_param { #include +#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ + +/* + * Extended scheduling parameters data structure. + * + * This is needed because the original struct sched_param can not be + * altered without introducing ABI issues with legacy applications + * (e.g., in sched_getparam()). + * + * However, the possibility of specifying more than just a priority for + * the tasks may be useful for a wide variety of application fields, e.g., + * multimedia, streaming, automation and control, and many others. + * + * This variant (sched_attr) is meant at describing a so-called + * sporadic time-constrained task. In such model a task is specified by: + * - the activation period or minimum instance inter-arrival time; + * - the maximum (or average, depending on the actual scheduling + * discipline) computation time of all instances, a.k.a. runtime; + * - the deadline (relative to the actual activation time) of each + * instance. + * Very briefly, a periodic (sporadic) task asks for the execution of + * some specific computation --which is typically called an instance-- + * (at most) every period. Moreover, each instance typically lasts no more + * than the runtime and must be completed by time instant t equal to + * the instance activation time + the deadline. + * + * This is reflected by the actual fields of the sched_attr structure: + * + * @size size of the structure, for fwd/bwd compat. + * + * @sched_policy task's scheduling policy + * @sched_flags for customizing the scheduler behaviour + * @sched_nice task's nice value (SCHED_NORMAL/BATCH) + * @sched_priority task's static priority (SCHED_FIFO/RR) + * @sched_deadline representative of the task's deadline + * @sched_runtime representative of the task's runtime + * @sched_period representative of the task's period + * + * Given this task model, there are a multiplicity of scheduling algorithms + * and policies, that can be used to ensure all the tasks will make their + * timing constraints. + */ +struct sched_attr { + u32 size; + + u32 sched_policy; + u64 sched_flags; + + /* SCHED_NORMAL, SCHED_BATCH */ + s32 sched_nice; + + /* SCHED_FIFO, SCHED_RR */ + u32 sched_priority; + + /* SCHED_DEADLINE */ + u64 sched_runtime; + u64 sched_deadline; + u64 sched_period; +}; + struct exec_domain; struct futex_pi_state; struct robust_list_head; @@ -1958,6 +2018,8 @@ extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); +extern int sched_setattr(struct task_struct *, + const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 94273bbe6050..40ed9e9a77e5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -38,6 +38,7 @@ struct rlimit; struct rlimit64; struct rusage; struct sched_param; +struct sched_attr; struct sel_arg_struct; struct semaphore; struct sembuf; @@ -279,9 +280,14 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param); asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param); +asmlinkage long sys_sched_setattr(pid_t pid, + struct sched_attr __user *attr); asmlinkage long sys_sched_getscheduler(pid_t pid); asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param); +asmlinkage long sys_sched_getattr(pid_t pid, + struct sched_attr __user *attr, + unsigned int size); asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr); asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b21a63ed5d62..8174f889076c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2817,6 +2817,7 @@ out_unlock: __task_rq_unlock(rq); } #endif + void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; @@ -2991,22 +2992,29 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } -/* Actually do priority change: must hold rq lock. */ -static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +/* Actually do priority change: must hold pi & rq lock. */ +static void __setscheduler(struct rq *rq, struct task_struct *p, + const struct sched_attr *attr) { + int policy = attr->sched_policy; + p->policy = policy; - p->rt_priority = prio; + + if (rt_policy(policy)) + p->rt_priority = attr->sched_priority; + else + p->static_prio = NICE_TO_PRIO(attr->sched_nice); + p->normal_prio = normal_prio(p); - /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); + if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; + set_load_weight(p); } - /* * check the target process has a UID that matches the current process's */ @@ -3023,10 +3031,12 @@ static bool check_same_owner(struct task_struct *p) return match; } -static int __sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param, bool user) +static int __sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; + int policy = attr->sched_policy; unsigned long flags; const struct sched_class *prev_class; struct rq *rq; @@ -3054,17 +3064,22 @@ recheck: * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, * SCHED_BATCH and SCHED_IDLE is 0. */ - if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) + if (attr->sched_priority < 0 || + (p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if (rt_policy(policy) != (param->sched_priority != 0)) + if (rt_policy(policy) != (attr->sched_priority != 0)) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (user && !capable(CAP_SYS_NICE)) { + if (fair_policy(policy)) { + if (!can_nice(p, attr->sched_nice)) + return -EPERM; + } + if (rt_policy(policy)) { unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); @@ -3074,8 +3089,8 @@ recheck: return -EPERM; /* can't increase priority */ - if (param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) + if (attr->sched_priority > p->rt_priority && + attr->sched_priority > rlim_rtprio) return -EPERM; } @@ -3123,11 +3138,16 @@ recheck: /* * If not changing anything there's no need to proceed further: */ - if (unlikely(policy == p->policy && (!rt_policy(policy) || - param->sched_priority == p->rt_priority))) { + if (unlikely(policy == p->policy)) { + if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; + task_rq_unlock(rq, p, &flags); return 0; } +change: #ifdef CONFIG_RT_GROUP_SCHED if (user) { @@ -3161,7 +3181,7 @@ recheck: oldprio = p->prio; prev_class = p->sched_class; - __setscheduler(rq, p, policy, param->sched_priority); + __setscheduler(rq, p, attr); if (running) p->sched_class->set_curr_task(rq); @@ -3189,10 +3209,20 @@ recheck: int sched_setscheduler(struct task_struct *p, int policy, const struct sched_param *param) { - return __sched_setscheduler(p, policy, param, true); + struct sched_attr attr = { + .sched_policy = policy, + .sched_priority = param->sched_priority + }; + return __sched_setscheduler(p, &attr, true); } EXPORT_SYMBOL_GPL(sched_setscheduler); +int sched_setattr(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, true); +} +EXPORT_SYMBOL_GPL(sched_setattr); + /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. * @p: the task in question. @@ -3209,7 +3239,11 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setscheduler_nocheck(struct task_struct *p, int policy, const struct sched_param *param) { - return __sched_setscheduler(p, policy, param, false); + struct sched_attr attr = { + .sched_policy = policy, + .sched_priority = param->sched_priority + }; + return __sched_setscheduler(p, &attr, false); } static int @@ -3234,6 +3268,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) return retval; } +/* + * Mimics kernel/events/core.c perf_copy_attr(). + */ +static int sched_copy_attr(struct sched_attr __user *uattr, + struct sched_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = SCHED_ATTR_SIZE_VER0; + + if (size < SCHED_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * XXX: do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? + */ + attr->sched_nice = clamp(attr->sched_nice, -20, 19); + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. @@ -3264,6 +3371,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) return do_sched_setscheduler(pid, -1, param); } +/** + * sys_sched_setattr - same as above, but with extended sched_attr + * @pid: the pid in question. + * @attr: structure containing the extended parameters. + */ +SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) +{ + struct sched_attr attr; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0) + return -EINVAL; + + if (sched_copy_attr(uattr, &attr)) + return -EFAULT; + + rcu_read_lock(); + retval = -ESRCH; + p = find_process_by_pid(pid); + if (p != NULL) + retval = sched_setattr(p, &attr); + rcu_read_unlock(); + + return retval; +} + /** * sys_sched_getscheduler - get the policy (scheduling class) of a thread * @pid: the pid in question. @@ -3334,6 +3468,92 @@ out_unlock: return retval; } +static int sched_read_attr(struct sched_attr __user *uattr, + struct sched_attr *attr, + unsigned int usize) +{ + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, usize)) + return -EFAULT; + + /* + * If we're handed a smaller struct than we know of, + * ensure all the unknown bits are 0 - i.e. old + * user-space does not get uncomplete information. + */ + if (usize < sizeof(*attr)) { + unsigned char *addr; + unsigned char *end; + + addr = (void *)attr + usize; + end = (void *)attr + sizeof(*attr); + + for (; addr < end; addr++) { + if (*addr) + goto err_size; + } + + attr->size = usize; + } + + ret = copy_to_user(uattr, attr, usize); + if (ret) + return -EFAULT; + +out: + return ret; + +err_size: + ret = -E2BIG; + goto out; +} + +/** + * sys_sched_getattr - same as above, but with extended "sched_param" + * @pid: the pid in question. + * @attr: structure containing the extended parameters. + * @size: sizeof(attr) for fwd/bwd comp. + */ +SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size) +{ + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + }; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0 || size > PAGE_SIZE || + size < SCHED_ATTR_SIZE_VER0) + return -EINVAL; + + rcu_read_lock(); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + attr.sched_policy = p->policy; + if (task_has_rt_policy(p)) + attr.sched_priority = p->rt_priority; + else + attr.sched_nice = TASK_NICE(p); + + rcu_read_unlock(); + + retval = sched_read_attr(uattr, &attr, size); + return retval; + +out_unlock: + rcu_read_unlock(); + return retval; +} + long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { cpumask_var_t cpus_allowed, new_mask; @@ -6400,13 +6620,16 @@ EXPORT_SYMBOL(__might_sleep); static void normalize_task(struct rq *rq, struct task_struct *p) { const struct sched_class *prev_class = p->sched_class; + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + }; int old_prio = p->prio; int on_rq; on_rq = p->on_rq; if (on_rq) dequeue_task(rq, p, 0); - __setscheduler(rq, p, SCHED_NORMAL, 0); + __setscheduler(rq, p, &attr); if (on_rq) { enqueue_task(rq, p, 0); resched_task(rq->curr); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3b4a4953efc..df023db7721c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -81,11 +81,14 @@ extern void update_cpu_load_active(struct rq *this_rq); */ #define RUNTIME_INF ((u64)~0ULL) +static inline int fair_policy(int policy) +{ + return policy == SCHED_NORMAL || policy == SCHED_BATCH; +} + static inline int rt_policy(int policy) { - if (policy == SCHED_FIFO || policy == SCHED_RR) - return 1; - return 0; + return policy == SCHED_FIFO || policy == SCHED_RR; } static inline int task_has_rt_policy(struct task_struct *p) From aab03e05e8f7e26f51dee792beddcb5cca9215a5 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 28 Nov 2013 11:14:43 +0100 Subject: [PATCH 17/60] sched/deadline: Add SCHED_DEADLINE structures & implementation Introduces the data structures, constants and symbols needed for SCHED_DEADLINE implementation. Core data structure of SCHED_DEADLINE are defined, along with their initializers. Hooks for checking if a task belong to the new policy are also added where they are needed. Adds a scheduling class, in sched/dl.c and a new policy called SCHED_DEADLINE. It is an implementation of the Earliest Deadline First (EDF) scheduling algorithm, augmented with a mechanism (called Constant Bandwidth Server, CBS) that makes it possible to isolate the behaviour of tasks between each other. The typical -deadline task will be made up of a computation phase (instance) which is activated on a periodic or sporadic fashion. The expected (maximum) duration of such computation is called the task's runtime; the time interval by which each instance need to be completed is called the task's relative deadline. The task's absolute deadline is dynamically calculated as the time instant a task (better, an instance) activates plus the relative deadline. The EDF algorithms selects the task with the smallest absolute deadline as the one to be executed first, while the CBS ensures each task to run for at most its runtime every (relative) deadline length time interval, avoiding any interference between different tasks (bandwidth isolation). Thanks to this feature, also tasks that do not strictly comply with the computational model sketched above can effectively use the new policy. To summarize, this patch: - introduces the data structures, constants and symbols needed; - implements the core logic of the scheduling algorithm in the new scheduling class file; - provides all the glue code between the new scheduling class and the core scheduler and refines the interactions between sched/dl and the other existing scheduling classes. Signed-off-by: Dario Faggioli Signed-off-by: Michael Trimarchi Signed-off-by: Fabio Checconi Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-4-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 46 ++- include/linux/sched/deadline.h | 24 ++ include/uapi/linux/sched.h | 1 + kernel/fork.c | 4 +- kernel/hrtimer.c | 3 +- kernel/sched/Makefile | 3 +- kernel/sched/core.c | 109 +++++- kernel/sched/deadline.c | 684 +++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 26 ++ kernel/sched/stop_task.c | 2 +- 10 files changed, 882 insertions(+), 20 deletions(-) create mode 100644 include/linux/sched/deadline.h create mode 100644 kernel/sched/deadline.c diff --git a/include/linux/sched.h b/include/linux/sched.h index 86025b6c6387..6c196794fc12 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -97,6 +97,10 @@ struct sched_param { * Given this task model, there are a multiplicity of scheduling algorithms * and policies, that can be used to ensure all the tasks will make their * timing constraints. + * + * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the + * only user of this new interface. More information about the algorithm + * available in the scheduling class file or in Documentation/. */ struct sched_attr { u32 size; @@ -1088,6 +1092,45 @@ struct sched_rt_entity { #endif }; +struct sched_dl_entity { + struct rb_node rb_node; + + /* + * Original scheduling parameters. Copied here from sched_attr + * during sched_setscheduler2(), they will remain the same until + * the next sched_setscheduler2(). + */ + u64 dl_runtime; /* maximum runtime for each instance */ + u64 dl_deadline; /* relative deadline of each instance */ + + /* + * Actual scheduling parameters. Initialized with the values above, + * they are continously updated during task execution. Note that + * the remaining runtime could be < 0 in case we are in overrun. + */ + s64 runtime; /* remaining runtime for this instance */ + u64 deadline; /* absolute deadline for this instance */ + unsigned int flags; /* specifying the scheduler behaviour */ + + /* + * Some bool flags: + * + * @dl_throttled tells if we exhausted the runtime. If so, the + * task has to wait for a replenishment to be performed at the + * next firing of dl_timer. + * + * @dl_new tells if a new instance arrived. If so we must + * start executing it with full runtime and reset its absolute + * deadline; + */ + int dl_throttled, dl_new; + + /* + * Bandwidth enforcement timer. Each -deadline task has its + * own bandwidth to be enforced, thus we need one timer per task. + */ + struct hrtimer dl_timer; +}; struct rcu_node; @@ -1124,6 +1167,7 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif + struct sched_dl_entity dl; #ifdef CONFIG_PREEMPT_NOTIFIERS /* list of struct preempt_notifier: */ @@ -2099,7 +2143,7 @@ extern void wake_up_new_task(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void sched_fork(unsigned long clone_flags, struct task_struct *p); +extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h new file mode 100644 index 000000000000..9d303b8847df --- /dev/null +++ b/include/linux/sched/deadline.h @@ -0,0 +1,24 @@ +#ifndef _SCHED_DEADLINE_H +#define _SCHED_DEADLINE_H + +/* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and + * NORMAL/BATCH tasks. + */ + +#define MAX_DL_PRIO 0 + +static inline int dl_prio(int prio) +{ + if (unlikely(prio < MAX_DL_PRIO)) + return 1; + return 0; +} + +static inline int dl_task(struct task_struct *p) +{ + return dl_prio(p->prio); +} + +#endif /* _SCHED_DEADLINE_H */ diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 5a0f945927ac..2d5e49a2a6d2 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -39,6 +39,7 @@ #define SCHED_BATCH 3 /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 +#define SCHED_DEADLINE 6 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 diff --git a/kernel/fork.c b/kernel/fork.c index 6023d150a305..e6c0f1a22914 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1311,7 +1311,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(clone_flags, p); + retval = sched_fork(clone_flags, p); + if (retval) + goto bad_fork_cleanup_policy; retval = perf_event_init_task(p); if (retval) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 383319bae3f7..09094361dce5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -1610,7 +1611,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, unsigned long slack; slack = current->timer_slack_ns; - if (rt_task(current)) + if (dl_task(current) || rt_task(current)) slack = 0; hrtimer_init_on_stack(&t.timer, clockid, mode); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7b621409cf15..b039035a9376 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,8 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o +obj-y += core.o proc.o clock.o cputime.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8174f889076c..203aecdcfccb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -899,7 +899,9 @@ static inline int normal_prio(struct task_struct *p) { int prio; - if (task_has_rt_policy(p)) + if (task_has_dl_policy(p)) + prio = MAX_DL_PRIO-1; + else if (task_has_rt_policy(p)) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); @@ -1717,6 +1719,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif + RB_CLEAR_NODE(&p->dl.rb_node); + hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + p->dl.dl_runtime = p->dl.runtime = 0; + p->dl.dl_deadline = p->dl.deadline = 0; + p->dl.flags = 0; + INIT_LIST_HEAD(&p->rt.run_list); #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1768,7 +1776,7 @@ void set_numabalancing_state(bool enabled) /* * fork()/clone()-time setup: */ -void sched_fork(unsigned long clone_flags, struct task_struct *p) +int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); @@ -1790,7 +1798,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (task_has_rt_policy(p)) { + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->policy = SCHED_NORMAL; p->static_prio = NICE_TO_PRIO(0); p->rt_priority = 0; @@ -1807,8 +1815,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } - if (!rt_prio(p->prio)) + if (dl_prio(p->prio)) { + put_cpu(); + return -EAGAIN; + } else if (rt_prio(p->prio)) { + p->sched_class = &rt_sched_class; + } else { p->sched_class = &fair_sched_class; + } if (p->sched_class->task_fork) p->sched_class->task_fork(p); @@ -1837,6 +1851,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) #endif put_cpu(); + return 0; } /* @@ -2768,7 +2783,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) struct rq *rq; const struct sched_class *prev_class; - BUG_ON(prio < 0 || prio > MAX_PRIO); + BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p); @@ -2800,7 +2815,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->put_prev_task(rq, p); - if (rt_prio(prio)) + if (dl_prio(prio)) + p->sched_class = &dl_sched_class; + else if (rt_prio(prio)) p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; @@ -2835,9 +2852,9 @@ void set_user_nice(struct task_struct *p, long nice) * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is - * SCHED_FIFO/SCHED_RR: + * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: */ - if (task_has_rt_policy(p)) { + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } @@ -2992,6 +3009,27 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } +/* + * This function initializes the sched_dl_entity of a newly becoming + * SCHED_DEADLINE task. + * + * Only the static values are considered here, the actual runtime and the + * absolute deadline will be properly calculated when the task is enqueued + * for the first time with its new policy. + */ +static void +__setparam_dl(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + init_dl_task_timer(dl_se); + dl_se->dl_runtime = attr->sched_runtime; + dl_se->dl_deadline = attr->sched_deadline; + dl_se->flags = attr->sched_flags; + dl_se->dl_throttled = 0; + dl_se->dl_new = 1; +} + /* Actually do priority change: must hold pi & rq lock. */ static void __setscheduler(struct rq *rq, struct task_struct *p, const struct sched_attr *attr) @@ -3000,7 +3038,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, p->policy = policy; - if (rt_policy(policy)) + if (dl_policy(policy)) + __setparam_dl(p, attr); + else if (rt_policy(policy)) p->rt_priority = attr->sched_priority; else p->static_prio = NICE_TO_PRIO(attr->sched_nice); @@ -3008,13 +3048,39 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, p->normal_prio = normal_prio(p); p->prio = rt_mutex_getprio(p); - if (rt_prio(p->prio)) + if (dl_prio(p->prio)) + p->sched_class = &dl_sched_class; + else if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; set_load_weight(p); } + +static void +__getparam_dl(struct task_struct *p, struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + attr->sched_priority = p->rt_priority; + attr->sched_runtime = dl_se->dl_runtime; + attr->sched_deadline = dl_se->dl_deadline; + attr->sched_flags = dl_se->flags; +} + +/* + * This function validates the new parameters of a -deadline task. + * We ask for the deadline not being zero, and greater or equal + * than the runtime. + */ +static bool +__checkparam_dl(const struct sched_attr *attr) +{ + return attr && attr->sched_deadline != 0 && + (s64)(attr->sched_deadline - attr->sched_runtime) >= 0; +} + /* * check the target process has a UID that matches the current process's */ @@ -3053,7 +3119,8 @@ recheck: reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); policy &= ~SCHED_RESET_ON_FORK; - if (policy != SCHED_FIFO && policy != SCHED_RR && + if (policy != SCHED_DEADLINE && + policy != SCHED_FIFO && policy != SCHED_RR && policy != SCHED_NORMAL && policy != SCHED_BATCH && policy != SCHED_IDLE) return -EINVAL; @@ -3068,7 +3135,8 @@ recheck: (p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if (rt_policy(policy) != (attr->sched_priority != 0)) + if ((dl_policy(policy) && !__checkparam_dl(attr)) || + (rt_policy(policy) != (attr->sched_priority != 0))) return -EINVAL; /* @@ -3143,6 +3211,8 @@ recheck: goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; + if (dl_policy(policy)) + goto change; task_rq_unlock(rq, p, &flags); return 0; @@ -3453,6 +3523,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (retval) goto out_unlock; + if (task_has_dl_policy(p)) { + retval = -EINVAL; + goto out_unlock; + } lp.sched_priority = p->rt_priority; rcu_read_unlock(); @@ -3510,7 +3584,7 @@ err_size: } /** - * sys_sched_getattr - same as above, but with extended "sched_param" + * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. * @attr: structure containing the extended parameters. * @size: sizeof(attr) for fwd/bwd comp. @@ -3539,7 +3613,9 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, goto out_unlock; attr.sched_policy = p->policy; - if (task_has_rt_policy(p)) + if (task_has_dl_policy(p)) + __getparam_dl(p, &attr); + else if (task_has_rt_policy(p)) attr.sched_priority = p->rt_priority; else attr.sched_nice = TASK_NICE(p); @@ -3965,6 +4041,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) case SCHED_RR: ret = MAX_USER_RT_PRIO-1; break; + case SCHED_DEADLINE: case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: @@ -3991,6 +4068,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) case SCHED_RR: ret = 1; break; + case SCHED_DEADLINE: case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: @@ -6472,6 +6550,7 @@ void __init sched_init(void) rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); + init_dl_rq(&rq->dl, rq); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); @@ -6659,7 +6738,7 @@ void normalize_rt_tasks(void) p->se.statistics.block_start = 0; #endif - if (!rt_task(p)) { + if (!dl_task(p) && !rt_task(p)) { /* * Renice negative nice level userspace * tasks back to 0: diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c new file mode 100644 index 000000000000..93d82b2a88bd --- /dev/null +++ b/kernel/sched/deadline.c @@ -0,0 +1,684 @@ +/* + * Deadline Scheduling Class (SCHED_DEADLINE) + * + * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS). + * + * Tasks that periodically executes their instances for less than their + * runtime won't miss any of their deadlines. + * Tasks that are not periodic or sporadic or that tries to execute more + * than their reserved bandwidth will be slowed down (and may potentially + * miss some of their deadlines), and won't affect any other task. + * + * Copyright (C) 2012 Dario Faggioli , + * Michael Trimarchi , + * Fabio Checconi + */ +#include "sched.h" + +static inline int dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) +{ + return container_of(dl_se, struct task_struct, dl); +} + +static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) +{ + return container_of(dl_rq, struct rq, dl); +} + +static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +{ + struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = task_rq(p); + + return &rq->dl; +} + +static inline int on_dl_rq(struct sched_dl_entity *dl_se) +{ + return !RB_EMPTY_NODE(&dl_se->rb_node); +} + +static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) +{ + struct sched_dl_entity *dl_se = &p->dl; + + return dl_rq->rb_leftmost == &dl_se->rb_node; +} + +void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) +{ + dl_rq->rb_root = RB_ROOT; +} + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, + int flags); + +/* + * We are being explicitly informed that a new instance is starting, + * and this means that: + * - the absolute deadline of the entity has to be placed at + * current time + relative deadline; + * - the runtime of the entity has to be set to the maximum value. + * + * The capability of specifying such event is useful whenever a -deadline + * entity wants to (try to!) synchronize its behaviour with the scheduler's + * one, and to (try to!) reconcile itself with its own scheduling + * parameters. + */ +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); + + /* + * We use the regular wall clock time to set deadlines in the + * future; in fact, we must consider execution overheads (time + * spent on hardirq context, etc.). + */ + dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; + dl_se->runtime = dl_se->dl_runtime; + dl_se->dl_new = 0; +} + +/* + * Pure Earliest Deadline First (EDF) scheduling does not deal with the + * possibility of a entity lasting more than what it declared, and thus + * exhausting its runtime. + * + * Here we are interested in making runtime overrun possible, but we do + * not want a entity which is misbehaving to affect the scheduling of all + * other entities. + * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS) + * is used, in order to confine each entity within its own bandwidth. + * + * This function deals exactly with that, and ensures that when the runtime + * of a entity is replenished, its deadline is also postponed. That ensures + * the overrunning entity can't interfere with other entity in the system and + * can't make them miss their deadlines. Reasons why this kind of overruns + * could happen are, typically, a entity voluntarily trying to overcome its + * runtime, or it just underestimated it during sched_setscheduler_ex(). + */ +static void replenish_dl_entity(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + /* + * We keep moving the deadline away until we get some + * available runtime for the entity. This ensures correct + * handling of situations where the runtime overrun is + * arbitrary large. + */ + while (dl_se->runtime <= 0) { + dl_se->deadline += dl_se->dl_deadline; + dl_se->runtime += dl_se->dl_runtime; + } + + /* + * At this point, the deadline really should be "in + * the future" with respect to rq->clock. If it's + * not, we are, for some reason, lagging too much! + * Anyway, after having warn userspace abut that, + * we still try to keep the things running by + * resetting the deadline and the budget of the + * entity. + */ + if (dl_time_before(dl_se->deadline, rq_clock(rq))) { + static bool lag_once = false; + + if (!lag_once) { + lag_once = true; + printk_sched("sched: DL replenish lagged to much\n"); + } + dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; + dl_se->runtime = dl_se->dl_runtime; + } +} + +/* + * Here we check if --at time t-- an entity (which is probably being + * [re]activated or, in general, enqueued) can use its remaining runtime + * and its current deadline _without_ exceeding the bandwidth it is + * assigned (function returns true if it can't). We are in fact applying + * one of the CBS rules: when a task wakes up, if the residual runtime + * over residual deadline fits within the allocated bandwidth, then we + * can keep the current (absolute) deadline and residual budget without + * disrupting the schedulability of the system. Otherwise, we should + * refill the runtime and set the deadline a period in the future, + * because keeping the current (absolute) deadline of the task would + * result in breaking guarantees promised to other tasks. + * + * This function returns true if: + * + * runtime / (deadline - t) > dl_runtime / dl_deadline , + * + * IOW we can't recycle current parameters. + */ +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) +{ + u64 left, right; + + /* + * left and right are the two sides of the equation above, + * after a bit of shuffling to use multiplications instead + * of divisions. + * + * Note that none of the time values involved in the two + * multiplications are absolute: dl_deadline and dl_runtime + * are the relative deadline and the maximum runtime of each + * instance, runtime is the runtime left for the last instance + * and (deadline - t), since t is rq->clock, is the time left + * to the (absolute) deadline. Even if overflowing the u64 type + * is very unlikely to occur in both cases, here we scale down + * as we want to avoid that risk at all. Scaling down by 10 + * means that we reduce granularity to 1us. We are fine with it, + * since this is only a true/false check and, anyway, thinking + * of anything below microseconds resolution is actually fiction + * (but still we want to give the user that illusion >;). + */ + left = (dl_se->dl_deadline >> 10) * (dl_se->runtime >> 10); + right = ((dl_se->deadline - t) >> 10) * (dl_se->dl_runtime >> 10); + + return dl_time_before(right, left); +} + +/* + * When a -deadline entity is queued back on the runqueue, its runtime and + * deadline might need updating. + * + * The policy here is that we update the deadline of the entity only if: + * - the current deadline is in the past, + * - using the remaining runtime with the current deadline would make + * the entity exceed its bandwidth. + */ +static void update_dl_entity(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + /* + * The arrival of a new instance needs special treatment, i.e., + * the actual scheduling parameters have to be "renewed". + */ + if (dl_se->dl_new) { + setup_new_dl_entity(dl_se); + return; + } + + if (dl_time_before(dl_se->deadline, rq_clock(rq)) || + dl_entity_overflow(dl_se, rq_clock(rq))) { + dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; + dl_se->runtime = dl_se->dl_runtime; + } +} + +/* + * If the entity depleted all its runtime, and if we want it to sleep + * while waiting for some new execution time to become available, we + * set the bandwidth enforcement timer to the replenishment instant + * and try to activate it. + * + * Notice that it is important for the caller to know if the timer + * actually started or not (i.e., the replenishment instant is in + * the future or in the past). + */ +static int start_dl_timer(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + ktime_t now, act; + ktime_t soft, hard; + unsigned long range; + s64 delta; + + /* + * We want the timer to fire at the deadline, but considering + * that it is actually coming from rq->clock and not from + * hrtimer's time base reading. + */ + act = ns_to_ktime(dl_se->deadline); + now = hrtimer_cb_get_time(&dl_se->dl_timer); + delta = ktime_to_ns(now) - rq_clock(rq); + act = ktime_add_ns(act, delta); + + /* + * If the expiry time already passed, e.g., because the value + * chosen as the deadline is too small, don't even try to + * start the timer in the past! + */ + if (ktime_us_delta(act, now) < 0) + return 0; + + hrtimer_set_expires(&dl_se->dl_timer, act); + + soft = hrtimer_get_softexpires(&dl_se->dl_timer); + hard = hrtimer_get_expires(&dl_se->dl_timer); + range = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(&dl_se->dl_timer, soft, + range, HRTIMER_MODE_ABS, 0); + + return hrtimer_active(&dl_se->dl_timer); +} + +/* + * This is the bandwidth enforcement timer callback. If here, we know + * a task is not on its dl_rq, since the fact that the timer was running + * means the task is throttled and needs a runtime replenishment. + * + * However, what we actually do depends on the fact the task is active, + * (it is on its rq) or has been removed from there by a call to + * dequeue_task_dl(). In the former case we must issue the runtime + * replenishment and add the task back to the dl_rq; in the latter, we just + * do nothing but clearing dl_throttled, so that runtime and deadline + * updating (and the queueing back to dl_rq) will be done by the + * next call to enqueue_task_dl(). + */ +static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) +{ + struct sched_dl_entity *dl_se = container_of(timer, + struct sched_dl_entity, + dl_timer); + struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = task_rq(p); + raw_spin_lock(&rq->lock); + + /* + * We need to take care of a possible races here. In fact, the + * task might have changed its scheduling policy to something + * different from SCHED_DEADLINE or changed its reservation + * parameters (through sched_setscheduler()). + */ + if (!dl_task(p) || dl_se->dl_new) + goto unlock; + + sched_clock_tick(); + update_rq_clock(rq); + dl_se->dl_throttled = 0; + if (p->on_rq) { + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (task_has_dl_policy(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_task(rq->curr); + } +unlock: + raw_spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +void init_dl_task_timer(struct sched_dl_entity *dl_se) +{ + struct hrtimer *timer = &dl_se->dl_timer; + + if (hrtimer_active(timer)) { + hrtimer_try_to_cancel(timer); + return; + } + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = dl_task_timer; +} + +static +int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) +{ + int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); + int rorun = dl_se->runtime <= 0; + + if (!rorun && !dmiss) + return 0; + + /* + * If we are beyond our current deadline and we are still + * executing, then we have already used some of the runtime of + * the next instance. Thus, if we do not account that, we are + * stealing bandwidth from the system at each deadline miss! + */ + if (dmiss) { + dl_se->runtime = rorun ? dl_se->runtime : 0; + dl_se->runtime -= rq_clock(rq) - dl_se->deadline; + } + + return 1; +} + +/* + * Update the current task's runtime statistics (provided it is still + * a -deadline task and has not been removed from the dl_rq). + */ +static void update_curr_dl(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct sched_dl_entity *dl_se = &curr->dl; + u64 delta_exec; + + if (!dl_task(curr) || !on_dl_rq(dl_se)) + return; + + /* + * Consumed budget is computed considering the time as + * observed by schedulable tasks (excluding time spent + * in hardirq context, etc.). Deadlines are instead + * computed using hard walltime. This seems to be the more + * natural solution, but the full ramifications of this + * approach need further study. + */ + delta_exec = rq_clock_task(rq) - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq_clock_task(rq); + cpuacct_charge(curr, delta_exec); + + dl_se->runtime -= delta_exec; + if (dl_runtime_exceeded(rq, dl_se)) { + __dequeue_task_dl(rq, curr, 0); + if (likely(start_dl_timer(dl_se))) + dl_se->dl_throttled = 1; + else + enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); + + if (!is_leftmost(curr, &rq->dl)) + resched_task(curr); + } +} + +static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rb_node **link = &dl_rq->rb_root.rb_node; + struct rb_node *parent = NULL; + struct sched_dl_entity *entry; + int leftmost = 1; + + BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct sched_dl_entity, rb_node); + if (dl_time_before(dl_se->deadline, entry->deadline)) + link = &parent->rb_left; + else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + dl_rq->rb_leftmost = &dl_se->rb_node; + + rb_link_node(&dl_se->rb_node, parent, link); + rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); + + dl_rq->dl_nr_running++; +} + +static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + if (RB_EMPTY_NODE(&dl_se->rb_node)) + return; + + if (dl_rq->rb_leftmost == &dl_se->rb_node) { + struct rb_node *next_node; + + next_node = rb_next(&dl_se->rb_node); + dl_rq->rb_leftmost = next_node; + } + + rb_erase(&dl_se->rb_node, &dl_rq->rb_root); + RB_CLEAR_NODE(&dl_se->rb_node); + + dl_rq->dl_nr_running--; +} + +static void +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) +{ + BUG_ON(on_dl_rq(dl_se)); + + /* + * If this is a wakeup or a new instance, the scheduling + * parameters of the task might need updating. Otherwise, + * we want a replenishment of its runtime. + */ + if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) + replenish_dl_entity(dl_se); + else + update_dl_entity(dl_se); + + __enqueue_dl_entity(dl_se); +} + +static void dequeue_dl_entity(struct sched_dl_entity *dl_se) +{ + __dequeue_dl_entity(dl_se); +} + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + /* + * If p is throttled, we do nothing. In fact, if it exhausted + * its budget it needs a replenishment and, since it now is on + * its rq, the bandwidth timer callback (which clearly has not + * run yet) will take care of this. + */ + if (p->dl.dl_throttled) + return; + + enqueue_dl_entity(&p->dl, flags); + inc_nr_running(rq); +} + +static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + dequeue_dl_entity(&p->dl); +} + +static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + update_curr_dl(rq); + __dequeue_task_dl(rq, p, flags); + + dec_nr_running(rq); +} + +/* + * Yield task semantic for -deadline tasks is: + * + * get off from the CPU until our next instance, with + * a new runtime. This is of little use now, since we + * don't have a bandwidth reclaiming mechanism. Anyway, + * bandwidth reclaiming is planned for the future, and + * yield_task_dl will indicate that some spare budget + * is available for other task instances to use it. + */ +static void yield_task_dl(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + /* + * We make the task go to sleep until its current deadline by + * forcing its runtime to zero. This way, update_curr_dl() stops + * it and the bandwidth timer will wake it up and will give it + * new scheduling parameters (thanks to dl_new=1). + */ + if (p->dl.runtime > 0) { + rq->curr->dl.dl_new = 1; + p->dl.runtime = 0; + } + update_curr_dl(rq); +} + +/* + * Only called when both the current and waking task are -deadline + * tasks. + */ +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, + int flags) +{ + if (dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) + resched_task(rq->curr); +} + +#ifdef CONFIG_SCHED_HRTICK +static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +{ + s64 delta = p->dl.dl_runtime - p->dl.runtime; + + if (delta > 10000) + hrtick_start(rq, p->dl.runtime); +} +#endif + +static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, + struct dl_rq *dl_rq) +{ + struct rb_node *left = dl_rq->rb_leftmost; + + if (!left) + return NULL; + + return rb_entry(left, struct sched_dl_entity, rb_node); +} + +struct task_struct *pick_next_task_dl(struct rq *rq) +{ + struct sched_dl_entity *dl_se; + struct task_struct *p; + struct dl_rq *dl_rq; + + dl_rq = &rq->dl; + + if (unlikely(!dl_rq->dl_nr_running)) + return NULL; + + dl_se = pick_next_dl_entity(rq, dl_rq); + BUG_ON(!dl_se); + + p = dl_task_of(dl_se); + p->se.exec_start = rq_clock_task(rq); +#ifdef CONFIG_SCHED_HRTICK + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, p); +#endif + return p; +} + +static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +{ + update_curr_dl(rq); +} + +static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) +{ + update_curr_dl(rq); + +#ifdef CONFIG_SCHED_HRTICK + if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) + start_hrtick_dl(rq, p); +#endif +} + +static void task_fork_dl(struct task_struct *p) +{ + /* + * SCHED_DEADLINE tasks cannot fork and this is achieved through + * sched_fork() + */ +} + +static void task_dead_dl(struct task_struct *p) +{ + struct hrtimer *timer = &p->dl.dl_timer; + + if (hrtimer_active(timer)) + hrtimer_try_to_cancel(timer); +} + +static void set_curr_task_dl(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq_clock_task(rq); +} + +static void switched_from_dl(struct rq *rq, struct task_struct *p) +{ + if (hrtimer_active(&p->dl.dl_timer)) + hrtimer_try_to_cancel(&p->dl.dl_timer); +} + +static void switched_to_dl(struct rq *rq, struct task_struct *p) +{ + /* + * If p is throttled, don't consider the possibility + * of preempting rq->curr, the check will be done right + * after its runtime will get replenished. + */ + if (unlikely(p->dl.dl_throttled)) + return; + + if (p->on_rq || rq->curr != p) { + if (task_has_dl_policy(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_task(rq->curr); + } +} + +static void prio_changed_dl(struct rq *rq, struct task_struct *p, + int oldprio) +{ + switched_to_dl(rq, p); +} + +#ifdef CONFIG_SMP +static int +select_task_rq_dl(struct task_struct *p, int prev_cpu, int sd_flag, int flags) +{ + return task_cpu(p); +} +#endif + +const struct sched_class dl_sched_class = { + .next = &rt_sched_class, + .enqueue_task = enqueue_task_dl, + .dequeue_task = dequeue_task_dl, + .yield_task = yield_task_dl, + + .check_preempt_curr = check_preempt_curr_dl, + + .pick_next_task = pick_next_task_dl, + .put_prev_task = put_prev_task_dl, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_dl, +#endif + + .set_curr_task = set_curr_task_dl, + .task_tick = task_tick_dl, + .task_fork = task_fork_dl, + .task_dead = task_dead_dl, + + .prio_changed = prio_changed_dl, + .switched_from = switched_from_dl, + .switched_to = switched_to_dl, +}; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index df023db7721c..83eb5390f753 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -91,11 +92,21 @@ static inline int rt_policy(int policy) return policy == SCHED_FIFO || policy == SCHED_RR; } +static inline int dl_policy(int policy) +{ + return policy == SCHED_DEADLINE; +} + static inline int task_has_rt_policy(struct task_struct *p) { return rt_policy(p->policy); } +static inline int task_has_dl_policy(struct task_struct *p) +{ + return dl_policy(p->policy); +} + /* * This is the priority-queue data structure of the RT scheduling class: */ @@ -367,6 +378,15 @@ struct rt_rq { #endif }; +/* Deadline class' related fields in a runqueue */ +struct dl_rq { + /* runqueue is an rbtree, ordered by deadline */ + struct rb_root rb_root; + struct rb_node *rb_leftmost; + + unsigned long dl_nr_running; +}; + #ifdef CONFIG_SMP /* @@ -435,6 +455,7 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; + struct dl_rq dl; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -991,6 +1012,7 @@ static const u32 prio_to_wmult[40] = { #else #define ENQUEUE_WAKING 0 #endif +#define ENQUEUE_REPLENISH 8 #define DEQUEUE_SLEEP 1 @@ -1046,6 +1068,7 @@ struct sched_class { for (class = sched_class_highest; class; class = class->next) extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class; extern const struct sched_class rt_sched_class; extern const struct sched_class fair_sched_class; extern const struct sched_class idle_sched_class; @@ -1081,6 +1104,8 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern void init_dl_task_timer(struct sched_dl_entity *dl_se); + extern void update_idle_cpu_load(struct rq *this_rq); extern void init_task_runnable_average(struct task_struct *p); @@ -1357,6 +1382,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); +extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 47197de8abd9..fdb6bb0b3356 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) * Simple, special scheduling class for the per-CPU stop tasks: */ const struct sched_class stop_sched_class = { - .next = &rt_sched_class, + .next = &dl_sched_class, .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, From 1baca4ce16b8cc7d4f50be1f7914799af30a2861 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 7 Nov 2013 14:43:38 +0100 Subject: [PATCH 18/60] sched/deadline: Add SCHED_DEADLINE SMP-related data structures & logic Introduces data structures relevant for implementing dynamic migration of -deadline tasks and the logic for checking if runqueues are overloaded with -deadline tasks and for choosing where a task should migrate, when it is the case. Adds also dynamic migrations to SCHED_DEADLINE, so that tasks can be moved among CPUs when necessary. It is also possible to bind a task to a (set of) CPU(s), thus restricting its capability of migrating, or forbidding migrations at all. The very same approach used in sched_rt is utilised: - -deadline tasks are kept into CPU-specific runqueues, - -deadline tasks are migrated among runqueues to achieve the following: * on an M-CPU system the M earliest deadline ready tasks are always running; * affinity/cpusets settings of all the -deadline tasks is always respected. Therefore, this very special form of "load balancing" is done with an active method, i.e., the scheduler pushes or pulls tasks between runqueues when they are woken up and/or (de)scheduled. IOW, every time a preemption occurs, the descheduled task might be sent to some other CPU (depending on its deadline) to continue executing (push). On the other hand, every time a CPU becomes idle, it might pull the second earliest deadline ready task from some other CPU. To enforce this, a pull operation is always attempted before taking any scheduling decision (pre_schedule()), as well as a push one after each scheduling decision (post_schedule()). In addition, when a task arrives or wakes up, the best CPU where to resume it is selected taking into account its affinity mask, the system topology, but also its deadline. E.g., from the scheduling point of view, the best CPU where to wake up (and also where to push) a task is the one which is running the task with the latest deadline among the M executing ones. In order to facilitate these decisions, per-runqueue "caching" of the deadlines of the currently running and of the first ready task is used. Queued but not running tasks are also parked in another rb-tree to speed-up pushes. Signed-off-by: Juri Lelli Signed-off-by: Dario Faggioli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-5-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 9 +- kernel/sched/deadline.c | 934 +++++++++++++++++++++++++++++++++++++++- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 34 ++ 5 files changed, 963 insertions(+), 17 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6c196794fc12..cc66f2615a6d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1201,6 +1201,7 @@ struct task_struct { struct list_head tasks; #ifdef CONFIG_SMP struct plist_node pushable_tasks; + struct rb_node pushable_dl_tasks; #endif struct mm_struct *mm, *active_mm; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 203aecdcfccb..548cc04aee45 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1848,6 +1848,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_task_preempt_count(p); #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); + RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif put_cpu(); @@ -5040,6 +5041,7 @@ static void free_rootdomain(struct rcu_head *rcu) struct root_domain *rd = container_of(rcu, struct root_domain, rcu); cpupri_cleanup(&rd->cpupri); + free_cpumask_var(rd->dlo_mask); free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); @@ -5091,8 +5093,10 @@ static int init_rootdomain(struct root_domain *rd) goto out; if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) goto free_online; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_dlo_mask; if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; @@ -5100,6 +5104,8 @@ static int init_rootdomain(struct root_domain *rd) free_rto_mask: free_cpumask_var(rd->rto_mask); +free_dlo_mask: + free_cpumask_var(rd->dlo_mask); free_online: free_cpumask_var(rd->online); free_span: @@ -6451,6 +6457,7 @@ void __init sched_init_smp(void) free_cpumask_var(non_isolated_cpus); init_sched_rt_class(); + init_sched_dl_class(); } #else void __init sched_init_smp(void) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 93d82b2a88bd..fcc02c9ca16b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -10,6 +10,7 @@ * miss some of their deadlines), and won't affect any other task. * * Copyright (C) 2012 Dario Faggioli , + * Juri Lelli , * Michael Trimarchi , * Fabio Checconi */ @@ -20,6 +21,15 @@ static inline int dl_time_before(u64 a, u64 b) return (s64)(a - b) < 0; } +/* + * Tells if entity @a should preempt entity @b. + */ +static inline +int dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +{ + return dl_time_before(a->deadline, b->deadline); +} + static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { return container_of(dl_se, struct task_struct, dl); @@ -53,8 +63,168 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) { dl_rq->rb_root = RB_ROOT; + +#ifdef CONFIG_SMP + /* zero means no -deadline tasks */ + dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; + + dl_rq->dl_nr_migratory = 0; + dl_rq->overloaded = 0; + dl_rq->pushable_dl_tasks_root = RB_ROOT; +#endif } +#ifdef CONFIG_SMP + +static inline int dl_overloaded(struct rq *rq) +{ + return atomic_read(&rq->rd->dlo_count); +} + +static inline void dl_set_overload(struct rq *rq) +{ + if (!rq->online) + return; + + cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask); + /* + * Must be visible before the overload count is + * set (as in sched_rt.c). + * + * Matched by the barrier in pull_dl_task(). + */ + smp_wmb(); + atomic_inc(&rq->rd->dlo_count); +} + +static inline void dl_clear_overload(struct rq *rq) +{ + if (!rq->online) + return; + + atomic_dec(&rq->rd->dlo_count); + cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); +} + +static void update_dl_migration(struct dl_rq *dl_rq) +{ + if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { + if (!dl_rq->overloaded) { + dl_set_overload(rq_of_dl_rq(dl_rq)); + dl_rq->overloaded = 1; + } + } else if (dl_rq->overloaded) { + dl_clear_overload(rq_of_dl_rq(dl_rq)); + dl_rq->overloaded = 0; + } +} + +static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + struct task_struct *p = dl_task_of(dl_se); + dl_rq = &rq_of_dl_rq(dl_rq)->dl; + + dl_rq->dl_nr_total++; + if (p->nr_cpus_allowed > 1) + dl_rq->dl_nr_migratory++; + + update_dl_migration(dl_rq); +} + +static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + struct task_struct *p = dl_task_of(dl_se); + dl_rq = &rq_of_dl_rq(dl_rq)->dl; + + dl_rq->dl_nr_total--; + if (p->nr_cpus_allowed > 1) + dl_rq->dl_nr_migratory--; + + update_dl_migration(dl_rq); +} + +/* + * The list of pushable -deadline task is not a plist, like in + * sched_rt.c, it is an rb-tree with tasks ordered by deadline. + */ +static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ + struct dl_rq *dl_rq = &rq->dl; + struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; + struct rb_node *parent = NULL; + struct task_struct *entry; + int leftmost = 1; + + BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct task_struct, + pushable_dl_tasks); + if (dl_entity_preempt(&p->dl, &entry->dl)) + link = &parent->rb_left; + else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + + rb_link_node(&p->pushable_dl_tasks, parent, link); + rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); +} + +static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ + struct dl_rq *dl_rq = &rq->dl; + + if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) + return; + + if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { + struct rb_node *next_node; + + next_node = rb_next(&p->pushable_dl_tasks); + dl_rq->pushable_dl_tasks_leftmost = next_node; + } + + rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + RB_CLEAR_NODE(&p->pushable_dl_tasks); +} + +static inline int has_pushable_dl_tasks(struct rq *rq) +{ + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); +} + +static int push_dl_task(struct rq *rq); + +#else + +static inline +void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +} + +static inline +void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +} + +#endif /* CONFIG_SMP */ + static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, @@ -309,6 +479,14 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) check_preempt_curr_dl(rq, p, 0); else resched_task(rq->curr); +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, + * check if we need to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) + push_dl_task(rq); +#endif } unlock: raw_spin_unlock(&rq->lock); @@ -399,6 +577,100 @@ static void update_curr_dl(struct rq *rq) } } +#ifdef CONFIG_SMP + +static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); + +static inline u64 next_deadline(struct rq *rq) +{ + struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); + + if (next && dl_prio(next->prio)) + return next->dl.deadline; + else + return 0; +} + +static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) +{ + struct rq *rq = rq_of_dl_rq(dl_rq); + + if (dl_rq->earliest_dl.curr == 0 || + dl_time_before(deadline, dl_rq->earliest_dl.curr)) { + /* + * If the dl_rq had no -deadline tasks, or if the new task + * has shorter deadline than the current one on dl_rq, we + * know that the previous earliest becomes our next earliest, + * as the new task becomes the earliest itself. + */ + dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; + dl_rq->earliest_dl.curr = deadline; + } else if (dl_rq->earliest_dl.next == 0 || + dl_time_before(deadline, dl_rq->earliest_dl.next)) { + /* + * On the other hand, if the new -deadline task has a + * a later deadline than the earliest one on dl_rq, but + * it is earlier than the next (if any), we must + * recompute the next-earliest. + */ + dl_rq->earliest_dl.next = next_deadline(rq); + } +} + +static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) +{ + struct rq *rq = rq_of_dl_rq(dl_rq); + + /* + * Since we may have removed our earliest (and/or next earliest) + * task we must recompute them. + */ + if (!dl_rq->dl_nr_running) { + dl_rq->earliest_dl.curr = 0; + dl_rq->earliest_dl.next = 0; + } else { + struct rb_node *leftmost = dl_rq->rb_leftmost; + struct sched_dl_entity *entry; + + entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); + dl_rq->earliest_dl.curr = entry->deadline; + dl_rq->earliest_dl.next = next_deadline(rq); + } +} + +#else + +static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} +static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} + +#endif /* CONFIG_SMP */ + +static inline +void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + int prio = dl_task_of(dl_se)->prio; + u64 deadline = dl_se->deadline; + + WARN_ON(!dl_prio(prio)); + dl_rq->dl_nr_running++; + + inc_dl_deadline(dl_rq, deadline); + inc_dl_migration(dl_se, dl_rq); +} + +static inline +void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + int prio = dl_task_of(dl_se)->prio; + + WARN_ON(!dl_prio(prio)); + WARN_ON(!dl_rq->dl_nr_running); + dl_rq->dl_nr_running--; + + dec_dl_deadline(dl_rq, dl_se->deadline); + dec_dl_migration(dl_se, dl_rq); +} + static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); @@ -426,7 +698,7 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) rb_link_node(&dl_se->rb_node, parent, link); rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); - dl_rq->dl_nr_running++; + inc_dl_tasks(dl_se, dl_rq); } static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) @@ -446,7 +718,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) rb_erase(&dl_se->rb_node, &dl_rq->rb_root); RB_CLEAR_NODE(&dl_se->rb_node); - dl_rq->dl_nr_running--; + dec_dl_tasks(dl_se, dl_rq); } static void @@ -484,12 +756,17 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; enqueue_dl_entity(&p->dl, flags); + + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + enqueue_pushable_dl_task(rq, p); + inc_nr_running(rq); } static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { dequeue_dl_entity(&p->dl); + dequeue_pushable_dl_task(rq, p); } static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -527,6 +804,74 @@ static void yield_task_dl(struct rq *rq) update_curr_dl(rq); } +#ifdef CONFIG_SMP + +static int find_later_rq(struct task_struct *task); +static int latest_cpu_find(struct cpumask *span, + struct task_struct *task, + struct cpumask *later_mask); + +static int +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +{ + struct task_struct *curr; + struct rq *rq; + + if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + goto out; + + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + + /* + * If we are dealing with a -deadline task, we must + * decide where to wake it up. + * If it has a later deadline and the current task + * on this rq can't move (provided the waking task + * can!) we prefer to send it somewhere else. On the + * other hand, if it has a shorter deadline, we + * try to make it stay here, it might be important. + */ + if (unlikely(dl_task(curr)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &curr->dl)) && + (p->nr_cpus_allowed > 1)) { + int target = find_later_rq(p); + + if (target != -1) + cpu = target; + } + rcu_read_unlock(); + +out: + return cpu; +} + +static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) +{ + /* + * Current can't be migrated, useless to reschedule, + * let's hope p can move out. + */ + if (rq->curr->nr_cpus_allowed == 1 || + latest_cpu_find(rq->rd->span, rq->curr, NULL) == -1) + return; + + /* + * p is migratable, so let's not schedule it and + * see if it is pushed or pulled somewhere else. + */ + if (p->nr_cpus_allowed != 1 && + latest_cpu_find(rq->rd->span, p, NULL) != -1) + return; + + resched_task(rq->curr); +} + +#endif /* CONFIG_SMP */ + /* * Only called when both the current and waking task are -deadline * tasks. @@ -534,8 +879,20 @@ static void yield_task_dl(struct rq *rq) static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags) { - if (dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) + if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { resched_task(rq->curr); + return; + } + +#ifdef CONFIG_SMP + /* + * In the unlikely case current and p have the same deadline + * let us try to decide what's the best thing to do... + */ + if ((s64)(p->dl.deadline - rq->curr->dl.deadline) == 0 && + !need_resched()) + check_preempt_equal_dl(rq, p); +#endif /* CONFIG_SMP */ } #ifdef CONFIG_SCHED_HRTICK @@ -575,16 +932,29 @@ struct task_struct *pick_next_task_dl(struct rq *rq) p = dl_task_of(dl_se); p->se.exec_start = rq_clock_task(rq); + + /* Running task will never be pushed. */ + if (p) + dequeue_pushable_dl_task(rq, p); + #ifdef CONFIG_SCHED_HRTICK if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); #endif + +#ifdef CONFIG_SMP + rq->post_schedule = has_pushable_dl_tasks(rq); +#endif /* CONFIG_SMP */ + return p; } static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); + + if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) + enqueue_pushable_dl_task(rq, p); } static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) @@ -618,16 +988,517 @@ static void set_curr_task_dl(struct rq *rq) struct task_struct *p = rq->curr; p->se.exec_start = rq_clock_task(rq); + + /* You can't push away the running task */ + dequeue_pushable_dl_task(rq, p); } +#ifdef CONFIG_SMP + +/* Only try algorithms three times */ +#define DL_MAX_TRIES 3 + +static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_running(rq, p) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && + (p->nr_cpus_allowed > 1)) + return 1; + + return 0; +} + +/* Returns the second earliest -deadline task, NULL otherwise */ +static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) +{ + struct rb_node *next_node = rq->dl.rb_leftmost; + struct sched_dl_entity *dl_se; + struct task_struct *p = NULL; + +next_node: + next_node = rb_next(next_node); + if (next_node) { + dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); + p = dl_task_of(dl_se); + + if (pick_dl_task(rq, p, cpu)) + return p; + + goto next_node; + } + + return NULL; +} + +static int latest_cpu_find(struct cpumask *span, + struct task_struct *task, + struct cpumask *later_mask) +{ + const struct sched_dl_entity *dl_se = &task->dl; + int cpu, found = -1, best = 0; + u64 max_dl = 0; + + for_each_cpu(cpu, span) { + struct rq *rq = cpu_rq(cpu); + struct dl_rq *dl_rq = &rq->dl; + + if (cpumask_test_cpu(cpu, &task->cpus_allowed) && + (!dl_rq->dl_nr_running || dl_time_before(dl_se->deadline, + dl_rq->earliest_dl.curr))) { + if (later_mask) + cpumask_set_cpu(cpu, later_mask); + if (!best && !dl_rq->dl_nr_running) { + best = 1; + found = cpu; + } else if (!best && + dl_time_before(max_dl, + dl_rq->earliest_dl.curr)) { + max_dl = dl_rq->earliest_dl.curr; + found = cpu; + } + } else if (later_mask) + cpumask_clear_cpu(cpu, later_mask); + } + + return found; +} + +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); + +static int find_later_rq(struct task_struct *task) +{ + struct sched_domain *sd; + struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); + int this_cpu = smp_processor_id(); + int best_cpu, cpu = task_cpu(task); + + /* Make sure the mask is initialized first */ + if (unlikely(!later_mask)) + return -1; + + if (task->nr_cpus_allowed == 1) + return -1; + + best_cpu = latest_cpu_find(task_rq(task)->rd->span, task, later_mask); + if (best_cpu == -1) + return -1; + + /* + * If we are here, some target has been found, + * the most suitable of which is cached in best_cpu. + * This is, among the runqueues where the current tasks + * have later deadlines than the task's one, the rq + * with the latest possible one. + * + * Now we check how well this matches with task's + * affinity and system topology. + * + * The last cpu where the task run is our first + * guess, since it is most likely cache-hot there. + */ + if (cpumask_test_cpu(cpu, later_mask)) + return cpu; + /* + * Check if this_cpu is to be skipped (i.e., it is + * not in the mask) or not. + */ + if (!cpumask_test_cpu(this_cpu, later_mask)) + this_cpu = -1; + + rcu_read_lock(); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + + /* + * If possible, preempting this_cpu is + * cheaper than migrating. + */ + if (this_cpu != -1 && + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); + return this_cpu; + } + + /* + * Last chance: if best_cpu is valid and is + * in the mask, that becomes our choice. + */ + if (best_cpu < nr_cpu_ids && + cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); + return best_cpu; + } + } + } + rcu_read_unlock(); + + /* + * At this point, all our guesses failed, we just return + * 'something', and let the caller sort the things out. + */ + if (this_cpu != -1) + return this_cpu; + + cpu = cpumask_any(later_mask); + if (cpu < nr_cpu_ids) + return cpu; + + return -1; +} + +/* Locks the rq it finds */ +static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) +{ + struct rq *later_rq = NULL; + int tries; + int cpu; + + for (tries = 0; tries < DL_MAX_TRIES; tries++) { + cpu = find_later_rq(task); + + if ((cpu == -1) || (cpu == rq->cpu)) + break; + + later_rq = cpu_rq(cpu); + + /* Retry if something changed. */ + if (double_lock_balance(rq, later_rq)) { + if (unlikely(task_rq(task) != rq || + !cpumask_test_cpu(later_rq->cpu, + &task->cpus_allowed) || + task_running(rq, task) || !task->on_rq)) { + double_unlock_balance(rq, later_rq); + later_rq = NULL; + break; + } + } + + /* + * If the rq we found has no -deadline task, or + * its earliest one has a later deadline than our + * task, the rq is a good one. + */ + if (!later_rq->dl.dl_nr_running || + dl_time_before(task->dl.deadline, + later_rq->dl.earliest_dl.curr)) + break; + + /* Otherwise we try again. */ + double_unlock_balance(rq, later_rq); + later_rq = NULL; + } + + return later_rq; +} + +static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + + p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, + struct task_struct, pushable_dl_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!p->se.on_rq); + BUG_ON(!dl_task(p)); + + return p; +} + +/* + * See if the non running -deadline tasks on this rq + * can be sent to some other CPU where they can preempt + * and start executing. + */ +static int push_dl_task(struct rq *rq) +{ + struct task_struct *next_task; + struct rq *later_rq; + + if (!rq->dl.overloaded) + return 0; + + next_task = pick_next_pushable_dl_task(rq); + if (!next_task) + return 0; + +retry: + if (unlikely(next_task == rq->curr)) { + WARN_ON(1); + return 0; + } + + /* + * If next_task preempts rq->curr, and rq->curr + * can move away, it makes sense to just reschedule + * without going further in pushing next_task. + */ + if (dl_task(rq->curr) && + dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && + rq->curr->nr_cpus_allowed > 1) { + resched_task(rq->curr); + return 0; + } + + /* We might release rq lock */ + get_task_struct(next_task); + + /* Will lock the rq it'll find */ + later_rq = find_lock_later_rq(next_task, rq); + if (!later_rq) { + struct task_struct *task; + + /* + * We must check all this again, since + * find_lock_later_rq releases rq->lock and it is + * then possible that next_task has migrated. + */ + task = pick_next_pushable_dl_task(rq); + if (task_cpu(next_task) == rq->cpu && task == next_task) { + /* + * The task is still there. We don't try + * again, some other cpu will pull it when ready. + */ + dequeue_pushable_dl_task(rq, next_task); + goto out; + } + + if (!task) + /* No more tasks */ + goto out; + + put_task_struct(next_task); + next_task = task; + goto retry; + } + + deactivate_task(rq, next_task, 0); + set_task_cpu(next_task, later_rq->cpu); + activate_task(later_rq, next_task, 0); + + resched_task(later_rq->curr); + + double_unlock_balance(rq, later_rq); + +out: + put_task_struct(next_task); + + return 1; +} + +static void push_dl_tasks(struct rq *rq) +{ + /* Terminates as it moves a -deadline task */ + while (push_dl_task(rq)) + ; +} + +static int pull_dl_task(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu, ret = 0, cpu; + struct task_struct *p; + struct rq *src_rq; + u64 dmin = LONG_MAX; + + if (likely(!dl_overloaded(this_rq))) + return 0; + + /* + * Match the barrier from dl_set_overloaded; this guarantees that if we + * see overloaded we must also see the dlo_mask bit. + */ + smp_rmb(); + + for_each_cpu(cpu, this_rq->rd->dlo_mask) { + if (this_cpu == cpu) + continue; + + src_rq = cpu_rq(cpu); + + /* + * It looks racy, abd it is! However, as in sched_rt.c, + * we are fine with this. + */ + if (this_rq->dl.dl_nr_running && + dl_time_before(this_rq->dl.earliest_dl.curr, + src_rq->dl.earliest_dl.next)) + continue; + + /* Might drop this_rq->lock */ + double_lock_balance(this_rq, src_rq); + + /* + * If there are no more pullable tasks on the + * rq, we're done with it. + */ + if (src_rq->dl.dl_nr_running <= 1) + goto skip; + + p = pick_next_earliest_dl_task(src_rq, this_cpu); + + /* + * We found a task to be pulled if: + * - it preempts our current (if there's one), + * - it will preempt the last one we pulled (if any). + */ + if (p && dl_time_before(p->dl.deadline, dmin) && + (!this_rq->dl.dl_nr_running || + dl_time_before(p->dl.deadline, + this_rq->dl.earliest_dl.curr))) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->se.on_rq); + + /* + * Then we pull iff p has actually an earlier + * deadline than the current task of its runqueue. + */ + if (dl_time_before(p->dl.deadline, + src_rq->curr->dl.deadline)) + goto skip; + + ret = 1; + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + dmin = p->dl.deadline; + + /* Is there any other task even earlier? */ + } +skip: + double_unlock_balance(this_rq, src_rq); + } + + return ret; +} + +static void pre_schedule_dl(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull other tasks here */ + if (dl_task(prev)) + pull_dl_task(rq); +} + +static void post_schedule_dl(struct rq *rq) +{ + push_dl_tasks(rq); +} + +/* + * Since the task is not running and a reschedule is not going to happen + * anytime soon on its runqueue, we try pushing it away now. + */ +static void task_woken_dl(struct rq *rq, struct task_struct *p) +{ + if (!task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + has_pushable_dl_tasks(rq) && + p->nr_cpus_allowed > 1 && + dl_task(rq->curr) && + (rq->curr->nr_cpus_allowed < 2 || + dl_entity_preempt(&rq->curr->dl, &p->dl))) { + push_dl_tasks(rq); + } +} + +static void set_cpus_allowed_dl(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq *rq; + int weight; + + BUG_ON(!dl_task(p)); + + /* + * Update only if the task is actually running (i.e., + * it is on the rq AND it is not throttled). + */ + if (!on_dl_rq(&p->dl)) + return; + + weight = cpumask_weight(new_mask); + + /* + * Only update if the process changes its state from whether it + * can migrate or not. + */ + if ((p->nr_cpus_allowed > 1) == (weight > 1)) + return; + + rq = task_rq(p); + + /* + * The process used to be able to migrate OR it can now migrate + */ + if (weight <= 1) { + if (!task_current(rq, p)) + dequeue_pushable_dl_task(rq, p); + BUG_ON(!rq->dl.dl_nr_migratory); + rq->dl.dl_nr_migratory--; + } else { + if (!task_current(rq, p)) + enqueue_pushable_dl_task(rq, p); + rq->dl.dl_nr_migratory++; + } + + update_dl_migration(&rq->dl); +} + +/* Assumes rq->lock is held */ +static void rq_online_dl(struct rq *rq) +{ + if (rq->dl.overloaded) + dl_set_overload(rq); +} + +/* Assumes rq->lock is held */ +static void rq_offline_dl(struct rq *rq) +{ + if (rq->dl.overloaded) + dl_clear_overload(rq); +} + +void init_sched_dl_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i), + GFP_KERNEL, cpu_to_node(i)); +} + +#endif /* CONFIG_SMP */ + static void switched_from_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_active(&p->dl.dl_timer)) + if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) hrtimer_try_to_cancel(&p->dl.dl_timer); + +#ifdef CONFIG_SMP + /* + * Since this might be the only -deadline task on the rq, + * this is the right place to try to pull some other one + * from an overloaded cpu, if any. + */ + if (!rq->dl.dl_nr_running) + pull_dl_task(rq); +#endif } +/* + * When switching to -deadline, we may overload the rq, then + * we try to push someone off, if possible. + */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + int check_resched = 1; + /* * If p is throttled, don't consider the possibility * of preempting rq->curr, the check will be done right @@ -637,26 +1508,53 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; if (p->on_rq || rq->curr != p) { - if (task_has_dl_policy(rq->curr)) +#ifdef CONFIG_SMP + if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) + /* Only reschedule if pushing failed */ + check_resched = 0; +#endif /* CONFIG_SMP */ + if (check_resched && task_has_dl_policy(rq->curr)) check_preempt_curr_dl(rq, p, 0); - else - resched_task(rq->curr); } } +/* + * If the scheduling parameters of a -deadline task changed, + * a push or pull operation might be needed. + */ static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - switched_to_dl(rq, p); -} - + if (p->on_rq || rq->curr == p) { #ifdef CONFIG_SMP -static int -select_task_rq_dl(struct task_struct *p, int prev_cpu, int sd_flag, int flags) -{ - return task_cpu(p); + /* + * This might be too much, but unfortunately + * we don't have the old deadline value, and + * we can't argue if the task is increasing + * or lowering its prio, so... + */ + if (!rq->dl.overloaded) + pull_dl_task(rq); + + /* + * If we now have a earlier deadline task than p, + * then reschedule, provided p is still on this + * runqueue. + */ + if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && + rq->curr == p) + resched_task(p); +#else + /* + * Again, we don't know if p has a earlier + * or later deadline, so let's blindly set a + * (maybe not needed) rescheduling point. + */ + resched_task(p); +#endif /* CONFIG_SMP */ + } else + switched_to_dl(rq, p); } -#endif const struct sched_class dl_sched_class = { .next = &rt_sched_class, @@ -671,6 +1569,12 @@ const struct sched_class dl_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_dl, + .set_cpus_allowed = set_cpus_allowed_dl, + .rq_online = rq_online_dl, + .rq_offline = rq_offline_dl, + .pre_schedule = pre_schedule_dl, + .post_schedule = post_schedule_dl, + .task_woken = task_woken_dl, #endif .set_curr_task = set_curr_task_dl, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1c4065575fa2..a2740b775b45 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1738,7 +1738,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && p->nr_cpus_allowed > 1 && - rt_task(rq->curr) && + (dl_task(rq->curr) || rt_task(rq->curr)) && (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 83eb5390f753..93ea62754f11 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -385,6 +385,31 @@ struct dl_rq { struct rb_node *rb_leftmost; unsigned long dl_nr_running; + +#ifdef CONFIG_SMP + /* + * Deadline values of the currently executing and the + * earliest ready task on this rq. Caching these facilitates + * the decision wether or not a ready but not running task + * should migrate somewhere else. + */ + struct { + u64 curr; + u64 next; + } earliest_dl; + + unsigned long dl_nr_migratory; + unsigned long dl_nr_total; + int overloaded; + + /* + * Tasks on this rq that can be pushed away. They are kept in + * an rb-tree, ordered by tasks' deadlines, with caching + * of the leftmost (earliest deadline) element. + */ + struct rb_root pushable_dl_tasks_root; + struct rb_node *pushable_dl_tasks_leftmost; +#endif }; #ifdef CONFIG_SMP @@ -404,6 +429,13 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; + /* + * The bit corresponding to a CPU gets set here if such CPU has more + * than one runnable -deadline task (as it is below for RT tasks). + */ + cpumask_var_t dlo_mask; + atomic_t dlo_count; + /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. @@ -1095,6 +1127,8 @@ static inline void idle_balance(int cpu, struct rq *rq) extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); + +extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); From 239be4a982154ea0c979fca5846349bb68973aed Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 7 Nov 2013 14:43:39 +0100 Subject: [PATCH 19/60] sched/deadline: Add SCHED_DEADLINE avg_update accounting Make the core scheduler and load balancer aware of the load produced by -deadline tasks, by updating the moving average like for sched_rt. Signed-off-by: Dario Faggioli Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-6-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fcc02c9ca16b..21f58d261134 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -564,6 +564,8 @@ static void update_curr_dl(struct rq *rq) curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); + sched_rt_avg_update(rq, delta_exec); + dl_se->runtime -= delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { __dequeue_task_dl(rq, curr, 0); From 755378a47192a3d1f7c3a8ca6c15c1cf76de0af2 Mon Sep 17 00:00:00 2001 From: Harald Gustafsson Date: Thu, 7 Nov 2013 14:43:40 +0100 Subject: [PATCH 20/60] sched/deadline: Add period support for SCHED_DEADLINE tasks Make it possible to specify a period (different or equal than deadline) for -deadline tasks. Relative deadlines (D_i) are used on task arrivals to generate new scheduling (absolute) deadlines as "d = t + D_i", and periods (P_i) to postpone the scheduling deadlines as "d = d + P_i" when the budget is zero. This is in general useful to model (and schedule) tasks that have slow activation rates (long periods), but have to be scheduled soon once activated (short deadlines). Signed-off-by: Harald Gustafsson Signed-off-by: Dario Faggioli Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-7-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 10 ++++++++-- kernel/sched/deadline.c | 10 +++++++--- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index cc66f2615a6d..158f4c2dd852 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1102,6 +1102,7 @@ struct sched_dl_entity { */ u64 dl_runtime; /* maximum runtime for each instance */ u64 dl_deadline; /* relative deadline of each instance */ + u64 dl_period; /* separation of two instances (period) */ /* * Actual scheduling parameters. Initialized with the values above, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 548cc04aee45..069230b5c3fb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1723,6 +1723,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); p->dl.dl_runtime = p->dl.runtime = 0; p->dl.dl_deadline = p->dl.deadline = 0; + p->dl.dl_period = 0; p->dl.flags = 0; INIT_LIST_HEAD(&p->rt.run_list); @@ -3026,6 +3027,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) init_dl_task_timer(dl_se); dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; + dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_throttled = 0; dl_se->dl_new = 1; @@ -3067,19 +3069,23 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_priority = p->rt_priority; attr->sched_runtime = dl_se->dl_runtime; attr->sched_deadline = dl_se->dl_deadline; + attr->sched_period = dl_se->dl_period; attr->sched_flags = dl_se->flags; } /* * This function validates the new parameters of a -deadline task. * We ask for the deadline not being zero, and greater or equal - * than the runtime. + * than the runtime, as well as the period of being zero or + * greater than deadline. */ static bool __checkparam_dl(const struct sched_attr *attr) { return attr && attr->sched_deadline != 0 && - (s64)(attr->sched_deadline - attr->sched_runtime) >= 0; + (attr->sched_period == 0 || + (s64)(attr->sched_period - attr->sched_deadline) >= 0) && + (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0; } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 21f58d261134..3958bc576d67 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -289,7 +289,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) * arbitrary large. */ while (dl_se->runtime <= 0) { - dl_se->deadline += dl_se->dl_deadline; + dl_se->deadline += dl_se->dl_period; dl_se->runtime += dl_se->dl_runtime; } @@ -329,9 +329,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) * * This function returns true if: * - * runtime / (deadline - t) > dl_runtime / dl_deadline , + * runtime / (deadline - t) > dl_runtime / dl_period , * * IOW we can't recycle current parameters. + * + * Notice that the bandwidth check is done against the period. For + * task with deadline equal to period this is the same of using + * dl_deadline instead of dl_period in the equation above. */ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) { @@ -355,7 +359,7 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (dl_se->dl_deadline >> 10) * (dl_se->runtime >> 10); + left = (dl_se->dl_period >> 10) * (dl_se->runtime >> 10); right = ((dl_se->deadline - t) >> 10) * (dl_se->dl_runtime >> 10); return dl_time_before(right, left); From af6ace764d03900524e9b1ac621a1c520ee49fc6 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 7 Nov 2013 14:43:42 +0100 Subject: [PATCH 21/60] sched/deadline: Add latency tracing for SCHED_DEADLINE tasks It is very likely that systems that wants/needs to use the new SCHED_DEADLINE policy also want to have the scheduling latency of the -deadline tasks under control. For this reason a new version of the scheduling wakeup latency, called "wakeup_dl", is introduced. As a consequence of applying this patch there will be three wakeup latency tracer: * "wakeup", that deals with all tasks in the system; * "wakeup_rt", that deals with -rt and -deadline tasks only; * "wakeup_dl", that deals with -deadline tasks only. Signed-off-by: Dario Faggioli Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-9-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_wakeup.c | 64 +++++++++++++++++++++++++++++-- kernel/trace/trace_selftest.c | 33 +++++++++------- 2 files changed, 79 insertions(+), 18 deletions(-) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fee77e15d815..090c4d9dcf16 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -27,6 +27,8 @@ static int wakeup_cpu; static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; +static int wakeup_dl; +static int tracing_dl = 0; static arch_spinlock_t wakeup_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -437,6 +439,7 @@ static void __wakeup_reset(struct trace_array *tr) { wakeup_cpu = -1; wakeup_prio = -1; + tracing_dl = 0; if (wakeup_task) put_task_struct(wakeup_task); @@ -472,9 +475,17 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) tracing_record_cmdline(p); tracing_record_cmdline(current); - if ((wakeup_rt && !rt_task(p)) || - p->prio >= wakeup_prio || - p->prio >= current->prio) + /* + * Semantic is like this: + * - wakeup tracer handles all tasks in the system, independently + * from their scheduling class; + * - wakeup_rt tracer handles tasks belonging to sched_dl and + * sched_rt class; + * - wakeup_dl handles tasks belonging to sched_dl class only. + */ + if (tracing_dl || (wakeup_dl && !dl_task(p)) || + (wakeup_rt && !dl_task(p) && !rt_task(p)) || + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return; pc = preempt_count(); @@ -486,7 +497,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) arch_spin_lock(&wakeup_lock); /* check for races. */ - if (!tracer_enabled || p->prio >= wakeup_prio) + if (!tracer_enabled || tracing_dl || + (!dl_task(p) && p->prio >= wakeup_prio)) goto out_locked; /* reset the trace */ @@ -496,6 +508,15 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) wakeup_current_cpu = wakeup_cpu; wakeup_prio = p->prio; + /* + * Once you start tracing a -deadline task, don't bother tracing + * another task until the first one wakes up. + */ + if (dl_task(p)) + tracing_dl = 1; + else + tracing_dl = 0; + wakeup_task = p; get_task_struct(wakeup_task); @@ -597,16 +618,25 @@ static int __wakeup_tracer_init(struct trace_array *tr) static int wakeup_tracer_init(struct trace_array *tr) { + wakeup_dl = 0; wakeup_rt = 0; return __wakeup_tracer_init(tr); } static int wakeup_rt_tracer_init(struct trace_array *tr) { + wakeup_dl = 0; wakeup_rt = 1; return __wakeup_tracer_init(tr); } +static int wakeup_dl_tracer_init(struct trace_array *tr) +{ + wakeup_dl = 1; + wakeup_rt = 0; + return __wakeup_tracer_init(tr); +} + static void wakeup_tracer_reset(struct trace_array *tr) { int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; @@ -674,6 +704,28 @@ static struct tracer wakeup_rt_tracer __read_mostly = .use_max_tr = true, }; +static struct tracer wakeup_dl_tracer __read_mostly = +{ + .name = "wakeup_dl", + .init = wakeup_dl_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .wait_pipe = poll_wait_pipe, + .print_max = true, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, + .flag_changed = wakeup_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .use_max_tr = true, +}; + __init static int init_wakeup_tracer(void) { int ret; @@ -686,6 +738,10 @@ __init static int init_wakeup_tracer(void) if (ret) return ret; + ret = register_tracer(&wakeup_dl_tracer); + if (ret) + return ret; + return 0; } core_initcall(init_wakeup_tracer); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a7329b7902f8..e98fca60974f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1022,11 +1022,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) #ifdef CONFIG_SCHED_TRACER static int trace_wakeup_test_thread(void *data) { - /* Make this a RT thread, doesn't need to be too high */ - static const struct sched_param param = { .sched_priority = 5 }; + /* Make this a -deadline thread */ + static const struct sched_attr attr = { + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL + }; struct completion *x = data; - sched_setscheduler(current, SCHED_FIFO, ¶m); + sched_setattr(current, &attr); /* Make it know we have a new prio */ complete(x); @@ -1040,8 +1045,8 @@ static int trace_wakeup_test_thread(void *data) /* we are awake, now wait to disappear */ while (!kthread_should_stop()) { /* - * This is an RT task, do short sleeps to let - * others run. + * This will likely be the system top priority + * task, do short sleeps to let others run. */ msleep(100); } @@ -1054,21 +1059,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) { unsigned long save_max = tracing_max_latency; struct task_struct *p; - struct completion isrt; + struct completion is_ready; unsigned long count; int ret; - init_completion(&isrt); + init_completion(&is_ready); - /* create a high prio thread */ - p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); + /* create a -deadline thread */ + p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); if (IS_ERR(p)) { printk(KERN_CONT "Failed to create ftrace wakeup test thread "); return -1; } - /* make sure the thread is running at an RT prio */ - wait_for_completion(&isrt); + /* make sure the thread is running at -deadline policy */ + wait_for_completion(&is_ready); /* start the tracing */ ret = tracer_init(trace, tr); @@ -1082,19 +1087,19 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) while (p->on_rq) { /* - * Sleep to make sure the RT thread is asleep too. + * Sleep to make sure the -deadline thread is asleep too. * On virtual machines we can't rely on timings, * but we want to make sure this test still works. */ msleep(100); } - init_completion(&isrt); + init_completion(&is_ready); wake_up_process(p); /* Wait for the task to wake up */ - wait_for_completion(&isrt); + wait_for_completion(&is_ready); /* stop the tracing. */ tracing_stop(); From fb00aca474405f4fa8a8519c3179fed722eabd83 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Nov 2013 14:43:43 +0100 Subject: [PATCH 22/60] rtmutex: Turn the plist into an rb-tree Turn the pi-chains from plist to rb-tree, in the rt_mutex code, and provide a proper comparison function for -deadline and -priority tasks. This is done mainly because: - classical prio field of the plist is just an int, which might not be enough for representing a deadline; - manipulating such a list would become O(nr_deadline_tasks), which might be to much, as the number of -deadline task increases. Therefore, an rb-tree is used, and tasks are queued in it according to the following logic: - among two -priority (i.e., SCHED_BATCH/OTHER/RR/FIFO) tasks, the one with the higher (lower, actually!) prio wins; - among a -priority and a -deadline task, the latter always wins; - among two -deadline tasks, the one with the earliest deadline wins. Queueing and dequeueing functions are changed accordingly, for both the list of a task's pi-waiters and the list of tasks blocked on a pi-lock. Signed-off-by: Peter Zijlstra Signed-off-by: Dario Faggioli Signed-off-by: Juri Lelli Signed-off-again-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-10-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 10 +++ include/linux/rtmutex.h | 18 ++-- include/linux/sched.h | 4 +- kernel/fork.c | 3 +- kernel/futex.c | 2 + kernel/locking/rtmutex-debug.c | 8 +- kernel/locking/rtmutex.c | 151 +++++++++++++++++++++++++------- kernel/locking/rtmutex_common.h | 22 ++--- kernel/sched/core.c | 4 - 9 files changed, 157 insertions(+), 65 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index b0ed422e4e4a..f0e52383a001 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -154,6 +155,14 @@ extern struct task_group root_task_group; #define INIT_TASK_COMM "swapper" +#ifdef CONFIG_RT_MUTEXES +# define INIT_RT_MUTEXES(tsk) \ + .pi_waiters = RB_ROOT, \ + .pi_waiters_leftmost = NULL, +#else +# define INIT_RT_MUTEXES(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -221,6 +230,7 @@ extern struct task_group root_task_group; INIT_TRACE_RECURSION \ INIT_TASK_RCU_PREEMPT(tsk) \ INIT_CPUSET_SEQ(tsk) \ + INIT_RT_MUTEXES(tsk) \ INIT_VTIME(tsk) \ } diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index de17134244f3..3aed8d737e1a 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -13,7 +13,7 @@ #define __LINUX_RT_MUTEX_H #include -#include +#include #include extern int max_lock_depth; /* for sysctl */ @@ -22,12 +22,14 @@ extern int max_lock_depth; /* for sysctl */ * The rt_mutex structure * * @wait_lock: spinlock to protect the structure - * @wait_list: pilist head to enqueue waiters in priority order + * @waiters: rbtree root to enqueue waiters in priority order + * @waiters_leftmost: top waiter * @owner: the mutex owner */ struct rt_mutex { raw_spinlock_t wait_lock; - struct plist_head wait_list; + struct rb_root waiters; + struct rb_node *waiters_leftmost; struct task_struct *owner; #ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; @@ -66,7 +68,7 @@ struct hrtimer_sleeper; #define __RT_MUTEX_INITIALIZER(mutexname) \ { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ - , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \ + , .waiters = RB_ROOT \ , .owner = NULL \ __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} @@ -98,12 +100,4 @@ extern int rt_mutex_trylock(struct rt_mutex *lock); extern void rt_mutex_unlock(struct rt_mutex *lock); -#ifdef CONFIG_RT_MUTEXES -# define INIT_RT_MUTEXES(tsk) \ - .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters), \ - INIT_RT_MUTEX_DEBUG(tsk) -#else -# define INIT_RT_MUTEXES(tsk) -#endif - #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 158f4c2dd852..9ea15019a5b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -16,6 +16,7 @@ struct sched_param { #include #include #include +#include #include #include #include @@ -1354,7 +1355,8 @@ struct task_struct { #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ - struct plist_head pi_waiters; + struct rb_root pi_waiters; + struct rb_node *pi_waiters_leftmost; /* Deadlock detection and priority inheritance handling */ struct rt_mutex_waiter *pi_blocked_on; #endif diff --git a/kernel/fork.c b/kernel/fork.c index e6c0f1a22914..7049ae526a54 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1087,7 +1087,8 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - plist_head_init(&p->pi_waiters); + p->pi_waiters = RB_ROOT; + p->pi_waiters_leftmost = NULL; p->pi_blocked_on = NULL; #endif } diff --git a/kernel/futex.c b/kernel/futex.c index f6ff0191ecf7..679531c61d96 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2316,6 +2316,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * code while we sleep on uaddr. */ debug_rt_mutex_init_waiter(&rt_waiter); + RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); + RB_CLEAR_NODE(&rt_waiter.tree_entry); rt_waiter.task = NULL; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 13b243a323fa..49b2ed3dced8 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include @@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } @@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) { memset(waiter, 0x11, sizeof(*waiter)); - plist_node_init(&waiter->list_entry, MAX_PRIO); - plist_node_init(&waiter->pi_list_entry, MAX_PRIO); waiter->deadlock_task_pid = NULL; } void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { put_pid(waiter->deadlock_task_pid); - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); memset(waiter, 0x22, sizeof(*waiter)); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 0dd6aec1cb6a..3bf0aa68dd3f 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "rtmutex_common.h" @@ -91,10 +92,104 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) } #endif +static inline int +rt_mutex_waiter_less(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) +{ + if (left->task->prio < right->task->prio) + return 1; + + /* + * If both tasks are dl_task(), we check their deadlines. + */ + if (dl_prio(left->task->prio) && dl_prio(right->task->prio)) + return (left->task->dl.deadline < right->task->dl.deadline); + + return 0; +} + +static void +rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +{ + struct rb_node **link = &lock->waiters.rb_node; + struct rb_node *parent = NULL; + struct rt_mutex_waiter *entry; + int leftmost = 1; + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); + if (rt_mutex_waiter_less(waiter, entry)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + lock->waiters_leftmost = &waiter->tree_entry; + + rb_link_node(&waiter->tree_entry, parent, link); + rb_insert_color(&waiter->tree_entry, &lock->waiters); +} + +static void +rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +{ + if (RB_EMPTY_NODE(&waiter->tree_entry)) + return; + + if (lock->waiters_leftmost == &waiter->tree_entry) + lock->waiters_leftmost = rb_next(&waiter->tree_entry); + + rb_erase(&waiter->tree_entry, &lock->waiters); + RB_CLEAR_NODE(&waiter->tree_entry); +} + +static void +rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) +{ + struct rb_node **link = &task->pi_waiters.rb_node; + struct rb_node *parent = NULL; + struct rt_mutex_waiter *entry; + int leftmost = 1; + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); + if (rt_mutex_waiter_less(waiter, entry)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + task->pi_waiters_leftmost = &waiter->pi_tree_entry; + + rb_link_node(&waiter->pi_tree_entry, parent, link); + rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); +} + +static void +rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) +{ + if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) + return; + + if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) + task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); + + rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); + RB_CLEAR_NODE(&waiter->pi_tree_entry); +} + /* - * Calculate task priority from the waiter list priority + * Calculate task priority from the waiter tree priority * - * Return task->normal_prio when the waiter list is empty or when + * Return task->normal_prio when the waiter tree is empty or when * the waiter is not allowed to do priority boosting */ int rt_mutex_getprio(struct task_struct *task) @@ -102,7 +197,7 @@ int rt_mutex_getprio(struct task_struct *task) if (likely(!task_has_pi_waiters(task))) return task->normal_prio; - return min(task_top_pi_waiter(task)->pi_list_entry.prio, + return min(task_top_pi_waiter(task)->task->prio, task->normal_prio); } @@ -233,7 +328,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * When deadlock detection is off then we check, if further * priority adjustment is necessary. */ - if (!detect_deadlock && waiter->list_entry.prio == task->prio) + if (!detect_deadlock && waiter->task->prio == task->prio) goto out_unlock_pi; lock = waiter->lock; @@ -254,9 +349,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* Requeue the waiter */ - plist_del(&waiter->list_entry, &lock->wait_list); - waiter->list_entry.prio = task->prio; - plist_add(&waiter->list_entry, &lock->wait_list); + rt_mutex_dequeue(lock, waiter); + waiter->task->prio = task->prio; + rt_mutex_enqueue(lock, waiter); /* Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); @@ -280,17 +375,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, if (waiter == rt_mutex_top_waiter(lock)) { /* Boost the owner */ - plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); + rt_mutex_dequeue_pi(task, top_waiter); + rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); } else if (top_waiter == waiter) { /* Deboost the owner */ - plist_del(&waiter->pi_list_entry, &task->pi_waiters); + rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); + rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); } @@ -355,7 +448,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * 3) it is top waiter */ if (rt_mutex_has_waiters(lock)) { - if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { + if (task->prio >= rt_mutex_top_waiter(lock)->task->prio) { if (!waiter || waiter != rt_mutex_top_waiter(lock)) return 0; } @@ -369,7 +462,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, /* remove the queued waiter. */ if (waiter) { - plist_del(&waiter->list_entry, &lock->wait_list); + rt_mutex_dequeue(lock, waiter); task->pi_blocked_on = NULL; } @@ -379,8 +472,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ if (rt_mutex_has_waiters(lock)) { top = rt_mutex_top_waiter(lock); - top->pi_list_entry.prio = top->list_entry.prio; - plist_add(&top->pi_list_entry, &task->pi_waiters); + rt_mutex_enqueue_pi(task, top); } raw_spin_unlock_irqrestore(&task->pi_lock, flags); } @@ -416,13 +508,11 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; - plist_node_init(&waiter->list_entry, task->prio); - plist_node_init(&waiter->pi_list_entry, task->prio); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) top_waiter = rt_mutex_top_waiter(lock); - plist_add(&waiter->list_entry, &lock->wait_list); + rt_mutex_enqueue(lock, waiter); task->pi_blocked_on = waiter; @@ -433,8 +523,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (waiter == rt_mutex_top_waiter(lock)) { raw_spin_lock_irqsave(&owner->pi_lock, flags); - plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); - plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + rt_mutex_dequeue_pi(owner, top_waiter); + rt_mutex_enqueue_pi(owner, waiter); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) @@ -486,7 +576,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * boosted mode and go back to normal after releasing * lock->wait_lock. */ - plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); + rt_mutex_dequeue_pi(current, waiter); rt_mutex_set_owner(lock, NULL); @@ -510,7 +600,7 @@ static void remove_waiter(struct rt_mutex *lock, int chain_walk = 0; raw_spin_lock_irqsave(¤t->pi_lock, flags); - plist_del(&waiter->list_entry, &lock->wait_list); + rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); @@ -521,13 +611,13 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_lock_irqsave(&owner->pi_lock, flags); - plist_del(&waiter->pi_list_entry, &owner->pi_waiters); + rt_mutex_dequeue_pi(owner, waiter); if (rt_mutex_has_waiters(lock)) { struct rt_mutex_waiter *next; next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &owner->pi_waiters); + rt_mutex_enqueue_pi(owner, next); } __rt_mutex_adjust_prio(owner); @@ -537,8 +627,6 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } - WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - if (!chain_walk) return; @@ -565,7 +653,7 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || waiter->list_entry.prio == task->prio) { + if (!waiter || waiter->task->prio == task->prio) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } @@ -638,6 +726,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, int ret = 0; debug_rt_mutex_init_waiter(&waiter); + RB_CLEAR_NODE(&waiter.pi_tree_entry); + RB_CLEAR_NODE(&waiter.tree_entry); raw_spin_lock(&lock->wait_lock); @@ -904,7 +994,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - plist_head_init(&lock->wait_list); + lock->waiters = RB_ROOT; + lock->waiters_leftmost = NULL; debug_rt_mutex_init(lock, name); } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 53a66c85261b..b65442fe5ade 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock); * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * - * @list_entry: pi node to enqueue into the mutex waiters list - * @pi_list_entry: pi node to enqueue into the mutex owner waiters list + * @tree_entry: pi node to enqueue into the mutex waiters tree + * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task */ struct rt_mutex_waiter { - struct plist_node list_entry; - struct plist_node pi_list_entry; + struct rb_node tree_entry; + struct rb_node pi_tree_entry; struct task_struct *task; struct rt_mutex *lock; #ifdef CONFIG_DEBUG_RT_MUTEXES @@ -57,11 +57,11 @@ struct rt_mutex_waiter { }; /* - * Various helpers to access the waiters-plist: + * Various helpers to access the waiters-tree: */ static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { - return !plist_head_empty(&lock->wait_list); + return !RB_EMPTY_ROOT(&lock->waiters); } static inline struct rt_mutex_waiter * @@ -69,8 +69,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; - w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, - list_entry); + w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, + tree_entry); BUG_ON(w->lock != lock); return w; @@ -78,14 +78,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) static inline int task_has_pi_waiters(struct task_struct *p) { - return !plist_head_empty(&p->pi_waiters); + return !RB_EMPTY_ROOT(&p->pi_waiters); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { - return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, - pi_list_entry); + return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, + pi_tree_entry); } /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 069230b5c3fb..aebcc70b5c93 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6635,10 +6635,6 @@ void __init sched_init(void) INIT_HLIST_HEAD(&init_task.preempt_notifiers); #endif -#ifdef CONFIG_RT_MUTEXES - plist_head_init(&init_task.pi_waiters); -#endif - /* * The boot idle thread does lazy MMU switching as well: */ From 2d3d891d3344159d5b452a645e355bbe29591e8b Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 7 Nov 2013 14:43:44 +0100 Subject: [PATCH 23/60] sched/deadline: Add SCHED_DEADLINE inheritance logic Some method to deal with rt-mutexes and make sched_dl interact with the current PI-coded is needed, raising all but trivial issues, that needs (according to us) to be solved with some restructuring of the pi-code (i.e., going toward a proxy execution-ish implementation). This is under development, in the meanwhile, as a temporary solution, what this commits does is: - ensure a pi-lock owner with waiters is never throttled down. Instead, when it runs out of runtime, it immediately gets replenished and it's deadline is postponed; - the scheduling parameters (relative deadline and default runtime) used for that replenishments --during the whole period it holds the pi-lock-- are the ones of the waiting task with earliest deadline. Acting this way, we provide some kind of boosting to the lock-owner, still by using the existing (actually, slightly modified by the previous commit) pi-architecture. We would stress the fact that this is only a surely needed, all but clean solution to the problem. In the end it's only a way to re-start discussion within the community. So, as always, comments, ideas, rants, etc.. are welcome! :-) Signed-off-by: Dario Faggioli Signed-off-by: Juri Lelli [ Added !RT_MUTEXES build fix. ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-11-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++- include/linux/sched/rt.h | 5 ++ kernel/fork.c | 1 + kernel/locking/rtmutex.c | 31 ++++++++--- kernel/locking/rtmutex_common.h | 1 + kernel/sched/core.c | 36 ++++++++++-- kernel/sched/deadline.c | 91 ++++++++++++++++++------------- kernel/sched/sched.h | 14 +++++ kernel/trace/trace_sched_wakeup.c | 1 + 9 files changed, 134 insertions(+), 54 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9ea15019a5b6..13c53a99920f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1124,8 +1124,12 @@ struct sched_dl_entity { * @dl_new tells if a new instance arrived. If so we must * start executing it with full runtime and reset its absolute * deadline; + * + * @dl_boosted tells if we are boosted due to DI. If so we are + * outside bandwidth enforcement mechanism (but only until we + * exit the critical section). */ - int dl_throttled, dl_new; + int dl_throttled, dl_new, dl_boosted; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -1359,6 +1363,8 @@ struct task_struct { struct rb_node *pi_waiters_leftmost; /* Deadlock detection and priority inheritance handling */ struct rt_mutex_waiter *pi_blocked_on; + /* Top pi_waiters task */ + struct task_struct *pi_top_task; #endif #ifdef CONFIG_DEBUG_MUTEXES diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index 440434df3627..34e4ebea8fce 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -35,6 +35,7 @@ static inline int rt_task(struct task_struct *p) #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); +extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { @@ -45,6 +46,10 @@ static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } +static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) +{ + return NULL; +} # define rt_mutex_adjust_pi(p) do { } while (0) static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { diff --git a/kernel/fork.c b/kernel/fork.c index 7049ae526a54..01b450a61abd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1090,6 +1090,7 @@ static void rt_mutex_init_task(struct task_struct *p) p->pi_waiters = RB_ROOT; p->pi_waiters_leftmost = NULL; p->pi_blocked_on = NULL; + p->pi_top_task = NULL; #endif } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3bf0aa68dd3f..2e960a2bab81 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -96,13 +96,16 @@ static inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, struct rt_mutex_waiter *right) { - if (left->task->prio < right->task->prio) + if (left->prio < right->prio) return 1; /* - * If both tasks are dl_task(), we check their deadlines. + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. + * If left waiter has a dl_prio(), and we didn't return 1 above, + * then right waiter has a dl_prio() too. */ - if (dl_prio(left->task->prio) && dl_prio(right->task->prio)) + if (dl_prio(left->prio)) return (left->task->dl.deadline < right->task->dl.deadline); return 0; @@ -197,10 +200,18 @@ int rt_mutex_getprio(struct task_struct *task) if (likely(!task_has_pi_waiters(task))) return task->normal_prio; - return min(task_top_pi_waiter(task)->task->prio, + return min(task_top_pi_waiter(task)->prio, task->normal_prio); } +struct task_struct *rt_mutex_get_top_task(struct task_struct *task) +{ + if (likely(!task_has_pi_waiters(task))) + return NULL; + + return task_top_pi_waiter(task)->task; +} + /* * Adjust the priority of a task, after its pi_waiters got modified. * @@ -210,7 +221,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task) { int prio = rt_mutex_getprio(task); - if (task->prio != prio) + if (task->prio != prio || dl_prio(prio)) rt_mutex_setprio(task, prio); } @@ -328,7 +339,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * When deadlock detection is off then we check, if further * priority adjustment is necessary. */ - if (!detect_deadlock && waiter->task->prio == task->prio) + if (!detect_deadlock && waiter->prio == task->prio) goto out_unlock_pi; lock = waiter->lock; @@ -350,7 +361,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Requeue the waiter */ rt_mutex_dequeue(lock, waiter); - waiter->task->prio = task->prio; + waiter->prio = task->prio; rt_mutex_enqueue(lock, waiter); /* Release the task */ @@ -448,7 +459,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * 3) it is top waiter */ if (rt_mutex_has_waiters(lock)) { - if (task->prio >= rt_mutex_top_waiter(lock)->task->prio) { + if (task->prio >= rt_mutex_top_waiter(lock)->prio) { if (!waiter || waiter != rt_mutex_top_waiter(lock)) return 0; } @@ -508,6 +519,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; + waiter->prio = task->prio; /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) @@ -653,7 +665,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || waiter->task->prio == task->prio) { + if (!waiter || (waiter->prio == task->prio && + !dl_prio(task->prio))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index b65442fe5ade..7431a9c86f35 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -54,6 +54,7 @@ struct rt_mutex_waiter { struct pid *deadlock_task_pid; struct rt_mutex *deadlock_lock; #endif + int prio; }; /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aebcc70b5c93..599ee3b11b44 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -947,7 +947,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class->switched_from) prev_class->switched_from(rq, p); p->sched_class->switched_to(rq, p); - } else if (oldprio != p->prio) + } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); } @@ -2781,7 +2781,7 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, on_rq, running; + int oldprio, on_rq, running, enqueue_flag = 0; struct rq *rq; const struct sched_class *prev_class; @@ -2808,6 +2808,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } trace_sched_pi_setprio(p, prio); + p->pi_top_task = rt_mutex_get_top_task(p); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->on_rq; @@ -2817,19 +2818,42 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->put_prev_task(rq, p); - if (dl_prio(prio)) + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || (p->pi_top_task && + dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { + p->dl.dl_boosted = 1; + p->dl.dl_throttled = 0; + enqueue_flag = ENQUEUE_REPLENISH; + } else + p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; - else if (rt_prio(prio)) + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.dl_boosted = 0; + if (oldprio < prio) + enqueue_flag = ENQUEUE_HEAD; p->sched_class = &rt_sched_class; - else + } else { + if (dl_prio(oldprio)) + p->dl.dl_boosted = 0; p->sched_class = &fair_sched_class; + } p->prio = prio; if (running) p->sched_class->set_curr_task(rq); if (on_rq) - enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); + enqueue_task(rq, p, enqueue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3958bc576d67..7f6de4316990 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,20 +16,6 @@ */ #include "sched.h" -static inline int dl_time_before(u64 a, u64 b) -{ - return (s64)(a - b) < 0; -} - -/* - * Tells if entity @a should preempt entity @b. - */ -static inline -int dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) -{ - return dl_time_before(a->deadline, b->deadline); -} - static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { return container_of(dl_se, struct task_struct, dl); @@ -242,7 +228,8 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, * one, and to (try to!) reconcile itself with its own scheduling * parameters. */ -static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); @@ -254,8 +241,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). */ - dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; - dl_se->runtime = dl_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; dl_se->dl_new = 0; } @@ -277,11 +264,23 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * could happen are, typically, a entity voluntarily trying to overcome its * runtime, or it just underestimated it during sched_setscheduler_ex(). */ -static void replenish_dl_entity(struct sched_dl_entity *dl_se) +static void replenish_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + BUG_ON(pi_se->dl_runtime <= 0); + + /* + * This could be the case for a !-dl task that is boosted. + * Just go with full inherited parameters. + */ + if (dl_se->dl_deadline == 0) { + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + } + /* * We keep moving the deadline away until we get some * available runtime for the entity. This ensures correct @@ -289,8 +288,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) * arbitrary large. */ while (dl_se->runtime <= 0) { - dl_se->deadline += dl_se->dl_period; - dl_se->runtime += dl_se->dl_runtime; + dl_se->deadline += pi_se->dl_period; + dl_se->runtime += pi_se->dl_runtime; } /* @@ -309,8 +308,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) lag_once = true; printk_sched("sched: DL replenish lagged to much\n"); } - dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; - dl_se->runtime = dl_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; } } @@ -337,7 +336,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) * task with deadline equal to period this is the same of using * dl_deadline instead of dl_period in the equation above. */ -static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se, u64 t) { u64 left, right; @@ -359,8 +359,8 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (dl_se->dl_period >> 10) * (dl_se->runtime >> 10); - right = ((dl_se->deadline - t) >> 10) * (dl_se->dl_runtime >> 10); + left = (pi_se->dl_period >> 10) * (dl_se->runtime >> 10); + right = ((dl_se->deadline - t) >> 10) * (pi_se->dl_runtime >> 10); return dl_time_before(right, left); } @@ -374,7 +374,8 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) * - using the remaining runtime with the current deadline would make * the entity exceed its bandwidth. */ -static void update_dl_entity(struct sched_dl_entity *dl_se) +static void update_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); @@ -384,14 +385,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) * the actual scheduling parameters have to be "renewed". */ if (dl_se->dl_new) { - setup_new_dl_entity(dl_se); + setup_new_dl_entity(dl_se, pi_se); return; } if (dl_time_before(dl_se->deadline, rq_clock(rq)) || - dl_entity_overflow(dl_se, rq_clock(rq))) { - dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; - dl_se->runtime = dl_se->dl_runtime; + dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; } } @@ -405,7 +406,7 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) * actually started or not (i.e., the replenishment instant is in * the future or in the past). */ -static int start_dl_timer(struct sched_dl_entity *dl_se) +static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); @@ -414,6 +415,8 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) unsigned long range; s64 delta; + if (boosted) + return 0; /* * We want the timer to fire at the deadline, but considering * that it is actually coming from rq->clock and not from @@ -573,7 +576,7 @@ static void update_curr_dl(struct rq *rq) dl_se->runtime -= delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { __dequeue_task_dl(rq, curr, 0); - if (likely(start_dl_timer(dl_se))) + if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) dl_se->dl_throttled = 1; else enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); @@ -728,7 +731,8 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) } static void -enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) +enqueue_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se, int flags) { BUG_ON(on_dl_rq(dl_se)); @@ -738,9 +742,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) * we want a replenishment of its runtime. */ if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) - replenish_dl_entity(dl_se); + replenish_dl_entity(dl_se, pi_se); else - update_dl_entity(dl_se); + update_dl_entity(dl_se, pi_se); __enqueue_dl_entity(dl_se); } @@ -752,6 +756,18 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { + struct task_struct *pi_task = rt_mutex_get_top_task(p); + struct sched_dl_entity *pi_se = &p->dl; + + /* + * Use the scheduling parameters of the top pi-waiter + * task if we have one and its (relative) deadline is + * smaller than our one... OTW we keep our runtime and + * deadline. + */ + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + pi_se = &pi_task->dl; + /* * If p is throttled, we do nothing. In fact, if it exhausted * its budget it needs a replenishment and, since it now is on @@ -761,7 +777,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (p->dl.dl_throttled) return; - enqueue_dl_entity(&p->dl, flags); + enqueue_dl_entity(&p->dl, pi_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); @@ -985,8 +1001,7 @@ static void task_dead_dl(struct task_struct *p) { struct hrtimer *timer = &p->dl.dl_timer; - if (hrtimer_active(timer)) - hrtimer_try_to_cancel(timer); + hrtimer_cancel(timer); } static void set_curr_task_dl(struct rq *rq) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 93ea62754f11..52453a2d0a79 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -107,6 +107,20 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } +static inline int dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +/* + * Tells if entity @a should preempt entity @b. + */ +static inline +int dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +{ + return dl_time_before(a->deadline, b->deadline); +} + /* * This is the priority-queue data structure of the RT scheduling class: */ diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 090c4d9dcf16..6e32635e5e57 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "trace.h" From 332ac17ef5bfcff4766dfdfd3b4cdf10b8f8f155 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 7 Nov 2013 14:43:45 +0100 Subject: [PATCH 24/60] sched/deadline: Add bandwidth management for SCHED_DEADLINE tasks In order of deadline scheduling to be effective and useful, it is important that some method of having the allocation of the available CPU bandwidth to tasks and task groups under control. This is usually called "admission control" and if it is not performed at all, no guarantee can be given on the actual scheduling of the -deadline tasks. Since when RT-throttling has been introduced each task group have a bandwidth associated to itself, calculated as a certain amount of runtime over a period. Moreover, to make it possible to manipulate such bandwidth, readable/writable controls have been added to both procfs (for system wide settings) and cgroupfs (for per-group settings). Therefore, the same interface is being used for controlling the bandwidth distrubution to -deadline tasks and task groups, i.e., new controls but with similar names, equivalent meaning and with the same usage paradigm are added. However, more discussion is needed in order to figure out how we want to manage SCHED_DEADLINE bandwidth at the task group level. Therefore, this patch adds a less sophisticated, but actually very sensible, mechanism to ensure that a certain utilization cap is not overcome per each root_domain (the single rq for !SMP configurations). Another main difference between deadline bandwidth management and RT-throttling is that -deadline tasks have bandwidth on their own (while -rt ones doesn't!), and thus we don't need an higher level throttling mechanism to enforce the desired bandwidth. This patch, therefore: - adds system wide deadline bandwidth management by means of: * /proc/sys/kernel/sched_dl_runtime_us, * /proc/sys/kernel/sched_dl_period_us, that determine (i.e., runtime / period) the total bandwidth available on each CPU of each root_domain for -deadline tasks; - couples the RT and deadline bandwidth management, i.e., enforces that the sum of how much bandwidth is being devoted to -rt -deadline tasks to stay below 100%. This means that, for a root_domain comprising M CPUs, -deadline tasks can be created until the sum of their bandwidths stay below: M * (sched_dl_runtime_us / sched_dl_period_us) It is also possible to disable this bandwidth management logic, and be thus free of oversubscribing the system up to any arbitrary level. Signed-off-by: Dario Faggioli Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-12-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + include/linux/sched/sysctl.h | 13 ++ kernel/sched/core.c | 441 ++++++++++++++++++++++++++++++++--- kernel/sched/deadline.c | 46 +++- kernel/sched/sched.h | 76 +++++- kernel/sysctl.c | 14 ++ 6 files changed, 555 insertions(+), 36 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 13c53a99920f..a196cb7fc6f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1104,6 +1104,7 @@ struct sched_dl_entity { u64 dl_runtime; /* maximum runtime for each instance */ u64 dl_deadline; /* relative deadline of each instance */ u64 dl_period; /* separation of two instances (period) */ + u64 dl_bw; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 31e0193cb0c5..8070a83dbedc 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -81,6 +81,15 @@ static inline unsigned int get_sysctl_timer_migration(void) extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; +/* + * control SCHED_DEADLINE reservations: + * + * /proc/sys/kernel/sched_dl_period_us + * /proc/sys/kernel/sched_dl_runtime_us + */ +extern unsigned int sysctl_sched_dl_period; +extern int sysctl_sched_dl_runtime; + #ifdef CONFIG_CFS_BANDWIDTH extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif @@ -99,4 +108,8 @@ extern int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +int sched_dl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + #endif /* _SCHED_SYSCTL_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 599ee3b11b44..c7c68e6b5c51 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -296,6 +296,15 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; +/* + * Maximum bandwidth available for all -deadline tasks and groups + * (if group scheduling is configured) on each CPU. + * + * default: 5% + */ +unsigned int sysctl_sched_dl_period = 1000000; +int sysctl_sched_dl_runtime = 50000; + /* @@ -1856,6 +1865,111 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } +unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 20; + + /* + * Doing this here saves a lot of checks in all + * the calling paths, and returning zero seems + * safe for them anyway. + */ + if (period == 0) + return 0; + + return div64_u64(runtime << 20, period); +} + +#ifdef CONFIG_SMP +inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->rd->dl_bw; +} + +static inline int __dl_span_weight(struct rq *rq) +{ + return cpumask_weight(rq->rd->span); +} +#else +inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->dl.dl_bw; +} + +static inline int __dl_span_weight(struct rq *rq) +{ + return 1; +} +#endif + +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + +/* + * We must be sure that accepting a new task (or allowing changing the + * parameters of an existing one) is consistent with the bandwidth + * constraints. If yes, this function also accordingly updates the currently + * allocated bandwidth to reflect the new situation. + * + * This function is called while holding p's rq->lock. + */ +static int dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr) +{ + + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + u64 period = attr->sched_period; + u64 runtime = attr->sched_runtime; + u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; + int cpus = __dl_span_weight(task_rq(p)); + int err = -1; + + if (new_bw == p->dl.dl_bw) + return 0; + + /* + * Either if a task, enters, leave, or stays -deadline but changes + * its parameters, we may need to update accordingly the total + * allocated bandwidth of the container. + */ + raw_spin_lock(&dl_b->lock); + if (dl_policy(policy) && !task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, 0, new_bw)) { + __dl_add(dl_b, new_bw); + err = 0; + } else if (dl_policy(policy) && task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + __dl_clear(dl_b, p->dl.dl_bw); + __dl_add(dl_b, new_bw); + err = 0; + } else if (!dl_policy(policy) && task_has_dl_policy(p)) { + __dl_clear(dl_b, p->dl.dl_bw); + err = 0; + } + raw_spin_unlock(&dl_b->lock); + + return err; +} + +extern void init_dl_bw(struct dl_bw *dl_b); + /* * wake_up_new_task - wake up a newly created task for the first time. * @@ -3053,6 +3167,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_throttled = 0; dl_se->dl_new = 1; } @@ -3101,7 +3216,9 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr) * This function validates the new parameters of a -deadline task. * We ask for the deadline not being zero, and greater or equal * than the runtime, as well as the period of being zero or - * greater than deadline. + * greater than deadline. Furthermore, we have to be sure that + * user parameters are above the internal resolution (1us); we + * check sched_runtime only since it is always the smaller one. */ static bool __checkparam_dl(const struct sched_attr *attr) @@ -3109,7 +3226,8 @@ __checkparam_dl(const struct sched_attr *attr) return attr && attr->sched_deadline != 0 && (attr->sched_period == 0 || (s64)(attr->sched_period - attr->sched_deadline) >= 0) && - (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0; + (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && + attr->sched_runtime >= (2 << (DL_SCALE - 1)); } /* @@ -3250,8 +3368,8 @@ recheck: } change: -#ifdef CONFIG_RT_GROUP_SCHED if (user) { +#ifdef CONFIG_RT_GROUP_SCHED /* * Do not allow realtime tasks into groups that have no runtime * assigned. @@ -3262,8 +3380,33 @@ change: task_rq_unlock(rq, p, &flags); return -EPERM; } - } #endif +#ifdef CONFIG_SMP + if (dl_bandwidth_enabled() && dl_policy(policy)) { + cpumask_t *span = rq->rd->span; + cpumask_t act_affinity; + + /* + * cpus_allowed mask is statically initialized with + * CPU_MASK_ALL, span is instead dynamic. Here we + * compute the "dynamic" affinity of a task. + */ + cpumask_and(&act_affinity, &p->cpus_allowed, + cpu_active_mask); + + /* + * Don't allow tasks with an affinity mask smaller than + * the entire root_domain to become SCHED_DEADLINE. We + * will also fail if there's no bandwidth available. + */ + if (!cpumask_equal(&act_affinity, span) || + rq->rd->dl_bw.bw == 0) { + task_rq_unlock(rq, p, &flags); + return -EPERM; + } + } +#endif + } /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { @@ -3271,6 +3414,18 @@ change: task_rq_unlock(rq, p, &flags); goto recheck; } + + /* + * If setscheduling to SCHED_DEADLINE (or changing the parameters + * of a SCHED_DEADLINE task) we need to check if enough bandwidth + * is available. + */ + if ((dl_policy(policy) || dl_task(p)) && + dl_overflow(p, policy, attr)) { + task_rq_unlock(rq, p, &flags); + return -EBUSY; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -3705,6 +3860,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (retval) goto out_unlock; + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ +#ifdef CONFIG_SMP + if (task_has_dl_policy(p)) { + const struct cpumask *span = task_rq(p)->rd->span; + + if (dl_bandwidth_enabled() && + !cpumask_equal(in_mask, span)) { + retval = -EBUSY; + goto out_unlock; + } + } +#endif + cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, in_mask, cpus_allowed); again: @@ -4358,6 +4531,42 @@ out: } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +/* + * When dealing with a -deadline task, we have to check if moving it to + * a new CPU is possible or not. In fact, this is only true iff there + * is enough bandwidth available on such CPU, otherwise we want the + * whole migration progedure to fail over. + */ +static inline +bool set_task_cpu_dl(struct task_struct *p, unsigned int cpu) +{ + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + struct dl_bw *cpu_b = dl_bw_of(cpu); + int ret = 1; + u64 bw; + + if (dl_b == cpu_b) + return 1; + + raw_spin_lock(&dl_b->lock); + raw_spin_lock(&cpu_b->lock); + + bw = cpu_b->bw * cpumask_weight(cpu_rq(cpu)->rd->span); + if (dl_bandwidth_enabled() && + bw < cpu_b->total_bw + p->dl.dl_bw) { + ret = 0; + goto unlock; + } + dl_b->total_bw -= p->dl.dl_bw; + cpu_b->total_bw += p->dl.dl_bw; + +unlock: + raw_spin_unlock(&cpu_b->lock); + raw_spin_unlock(&dl_b->lock); + + return ret; +} + /* * Move (not current) task off this cpu, onto dest cpu. We're doing * this because either it can't run here any more (set_cpus_allowed() @@ -4389,6 +4598,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) goto fail; + /* + * If p is -deadline, proceed only if there is enough + * bandwidth available on dest_cpu + */ + if (unlikely(dl_task(p)) && !set_task_cpu_dl(p, dest_cpu)) + goto fail; + /* * If we're not on a rq, the next wake-up will ensure we're * placed properly. @@ -5128,6 +5344,8 @@ static int init_rootdomain(struct root_domain *rd) if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; + init_dl_bw(&rd->dl_bw); + if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; return 0; @@ -6557,13 +6775,15 @@ void __init sched_init(void) #endif /* CONFIG_CPUMASK_OFFSTACK */ } + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + init_dl_bandwidth(&def_dl_bandwidth, + global_dl_period(), global_dl_runtime()); + #ifdef CONFIG_SMP init_defrootdomain(); #endif - init_rt_bandwidth(&def_rt_bandwidth, - global_rt_period(), global_rt_runtime()); - #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime()); @@ -6966,16 +7186,6 @@ void sched_move_task(struct task_struct *tsk) } #endif /* CONFIG_CGROUP_SCHED */ -#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) -static unsigned long to_ratio(u64 period, u64 runtime) -{ - if (runtime == RUNTIME_INF) - return 1ULL << 20; - - return div64_u64(runtime << 20, period); -} -#endif - #ifdef CONFIG_RT_GROUP_SCHED /* * Ensure that the real time constraints are schedulable. @@ -7149,10 +7359,48 @@ static long sched_group_rt_period(struct task_group *tg) do_div(rt_period_us, NSEC_PER_USEC); return rt_period_us; } +#endif /* CONFIG_RT_GROUP_SCHED */ +/* + * Coupling of -rt and -deadline bandwidth. + * + * Here we check if the new -rt bandwidth value is consistent + * with the system settings for the bandwidth available + * to -deadline tasks. + * + * IOW, we want to enforce that + * + * rt_bandwidth + dl_bandwidth <= 100% + * + * is always true. + */ +static bool __sched_rt_dl_global_constraints(u64 rt_bw) +{ + unsigned long flags; + u64 dl_bw; + bool ret; + + raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock, flags); + if (global_rt_runtime() == RUNTIME_INF || + global_dl_runtime() == RUNTIME_INF) { + ret = true; + goto unlock; + } + + dl_bw = to_ratio(def_dl_bandwidth.dl_period, + def_dl_bandwidth.dl_runtime); + + ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF); +unlock: + raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock, flags); + + return ret; +} + +#ifdef CONFIG_RT_GROUP_SCHED static int sched_rt_global_constraints(void) { - u64 runtime, period; + u64 runtime, period, bw; int ret = 0; if (sysctl_sched_rt_period <= 0) @@ -7167,6 +7415,10 @@ static int sched_rt_global_constraints(void) if (runtime > period && runtime != RUNTIME_INF) return -EINVAL; + bw = to_ratio(period, runtime); + if (!__sched_rt_dl_global_constraints(bw)) + return -EINVAL; + mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); ret = __rt_schedulable(NULL, 0, 0); @@ -7189,19 +7441,19 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) static int sched_rt_global_constraints(void) { unsigned long flags; - int i; + int i, ret = 0; + u64 bw; if (sysctl_sched_rt_period <= 0) return -EINVAL; - /* - * There's always some RT tasks in the root group - * -- migration, kstopmachine etc.. - */ - if (sysctl_sched_rt_runtime == 0) - return -EBUSY; - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + bw = to_ratio(global_rt_period(), global_rt_runtime()); + if (!__sched_rt_dl_global_constraints(bw)) { + ret = -EINVAL; + goto unlock; + } + for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; @@ -7209,11 +7461,92 @@ static int sched_rt_global_constraints(void) rt_rq->rt_runtime = global_rt_runtime(); raw_spin_unlock(&rt_rq->rt_runtime_lock); } +unlock: raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + return ret; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +/* + * Coupling of -dl and -rt bandwidth. + * + * Here we check, while setting the system wide bandwidth available + * for -dl tasks and groups, if the new values are consistent with + * the system settings for the bandwidth available to -rt entities. + * + * IOW, we want to enforce that + * + * rt_bandwidth + dl_bandwidth <= 100% + * + * is always true. + */ +static bool __sched_dl_rt_global_constraints(u64 dl_bw) +{ + u64 rt_bw; + bool ret; + + raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock); + if (global_dl_runtime() == RUNTIME_INF || + global_rt_runtime() == RUNTIME_INF) { + ret = true; + goto unlock; + } + + rt_bw = to_ratio(ktime_to_ns(def_rt_bandwidth.rt_period), + def_rt_bandwidth.rt_runtime); + + ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF); +unlock: + raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock); + + return ret; +} + +static bool __sched_dl_global_constraints(u64 runtime, u64 period) +{ + if (!period || (runtime != RUNTIME_INF && runtime > period)) + return -EINVAL; + + return 0; +} + +static int sched_dl_global_constraints(void) +{ + u64 runtime = global_dl_runtime(); + u64 period = global_dl_period(); + u64 new_bw = to_ratio(period, runtime); + int ret, i; + + ret = __sched_dl_global_constraints(runtime, period); + if (ret) + return ret; + + if (!__sched_dl_rt_global_constraints(new_bw)) + return -EINVAL; + + /* + * Here we want to check the bandwidth not being set to some + * value smaller than the currently allocated bandwidth in + * any of the root_domains. + * + * FIXME: Cycling on all the CPUs is overdoing, but simpler than + * cycling on root_domains... Discussion on different/better + * solutions is welcome! + */ + for_each_possible_cpu(i) { + struct dl_bw *dl_b = dl_bw_of(i); + + raw_spin_lock(&dl_b->lock); + if (new_bw < dl_b->total_bw) { + raw_spin_unlock(&dl_b->lock); + return -EBUSY; + } + raw_spin_unlock(&dl_b->lock); + } + return 0; } -#endif /* CONFIG_RT_GROUP_SCHED */ int sched_rr_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -7264,6 +7597,60 @@ int sched_rt_handler(struct ctl_table *table, int write, return ret; } +int sched_dl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + unsigned long flags; + + mutex_lock(&mutex); + old_period = sysctl_sched_dl_period; + old_runtime = sysctl_sched_dl_runtime; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (!ret && write) { + raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock, + flags); + + ret = sched_dl_global_constraints(); + if (ret) { + sysctl_sched_dl_period = old_period; + sysctl_sched_dl_runtime = old_runtime; + } else { + u64 new_bw; + int i; + + def_dl_bandwidth.dl_period = global_dl_period(); + def_dl_bandwidth.dl_runtime = global_dl_runtime(); + if (global_dl_runtime() == RUNTIME_INF) + new_bw = -1; + else + new_bw = to_ratio(global_dl_period(), + global_dl_runtime()); + /* + * FIXME: As above... + */ + for_each_possible_cpu(i) { + struct dl_bw *dl_b = dl_bw_of(i); + + raw_spin_lock(&dl_b->lock); + dl_b->bw = new_bw; + raw_spin_unlock(&dl_b->lock); + } + } + + raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock, + flags); + } + mutex_unlock(&mutex); + + return ret; +} + #ifdef CONFIG_CGROUP_SCHED static inline struct task_group *css_tg(struct cgroup_subsys_state *css) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 7f6de4316990..802188fb6338 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,8 @@ */ #include "sched.h" +struct dl_bandwidth def_dl_bandwidth; + static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { return container_of(dl_se, struct task_struct, dl); @@ -46,6 +48,27 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) return dl_rq->rb_leftmost == &dl_se->rb_node; } +void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) +{ + raw_spin_lock_init(&dl_b->dl_runtime_lock); + dl_b->dl_period = period; + dl_b->dl_runtime = runtime; +} + +extern unsigned long to_ratio(u64 period, u64 runtime); + +void init_dl_bw(struct dl_bw *dl_b) +{ + raw_spin_lock_init(&dl_b->lock); + raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); + if (global_dl_runtime() == RUNTIME_INF) + dl_b->bw = -1; + else + dl_b->bw = to_ratio(global_dl_period(), global_dl_runtime()); + raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); + dl_b->total_bw = 0; +} + void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) { dl_rq->rb_root = RB_ROOT; @@ -57,6 +80,8 @@ void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) dl_rq->dl_nr_migratory = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT; +#else + init_dl_bw(&dl_rq->dl_bw); #endif } @@ -359,8 +384,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (pi_se->dl_period >> 10) * (dl_se->runtime >> 10); - right = ((dl_se->deadline - t) >> 10) * (pi_se->dl_runtime >> 10); + left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + right = ((dl_se->deadline - t) >> DL_SCALE) * + (pi_se->dl_runtime >> DL_SCALE); return dl_time_before(right, left); } @@ -911,8 +937,8 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... */ - if ((s64)(p->dl.deadline - rq->curr->dl.deadline) == 0 && - !need_resched()) + if ((p->dl.deadline == rq->curr->dl.deadline) && + !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); #endif /* CONFIG_SMP */ } @@ -1000,6 +1026,14 @@ static void task_fork_dl(struct task_struct *p) static void task_dead_dl(struct task_struct *p) { struct hrtimer *timer = &p->dl.dl_timer; + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + /* + * Since we are TASK_DEAD we won't slip out of the domain! + */ + raw_spin_lock_irq(&dl_b->lock); + dl_b->total_bw -= p->dl.dl_bw; + raw_spin_unlock_irq(&dl_b->lock); hrtimer_cancel(timer); } @@ -1226,7 +1260,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); - BUG_ON(!p->se.on_rq); + BUG_ON(!p->on_rq); BUG_ON(!dl_task(p)); return p; @@ -1373,7 +1407,7 @@ static int pull_dl_task(struct rq *this_rq) dl_time_before(p->dl.deadline, this_rq->dl.earliest_dl.curr))) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->se.on_rq); + WARN_ON(!p->on_rq); /* * Then we pull iff p has actually an earlier diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 52453a2d0a79..ad4f4fbd002e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -73,6 +73,13 @@ extern void update_cpu_load_active(struct rq *this_rq); #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT +/* + * Single value that decides SCHED_DEADLINE internal math precision. + * 10 -> just above 1us + * 9 -> just above 0.5us + */ +#define DL_SCALE (10) + /* * These are the 'tuning knobs' of the scheduler: */ @@ -107,7 +114,7 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } -static inline int dl_time_before(u64 a, u64 b) +static inline bool dl_time_before(u64 a, u64 b) { return (s64)(a - b) < 0; } @@ -115,8 +122,8 @@ static inline int dl_time_before(u64 a, u64 b) /* * Tells if entity @a should preempt entity @b. */ -static inline -int dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +static inline bool +dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) { return dl_time_before(a->deadline, b->deadline); } @@ -136,6 +143,50 @@ struct rt_bandwidth { u64 rt_runtime; struct hrtimer rt_period_timer; }; +/* + * To keep the bandwidth of -deadline tasks and groups under control + * we need some place where: + * - store the maximum -deadline bandwidth of the system (the group); + * - cache the fraction of that bandwidth that is currently allocated. + * + * This is all done in the data structure below. It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. + * + * With respect to SMP, the bandwidth is given on a per-CPU basis, + * meaning that: + * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; + * - dl_total_bw array contains, in the i-eth element, the currently + * allocated bandwidth on the i-eth CPU. + * Moreover, groups consume bandwidth on each CPU, while tasks only + * consume bandwidth on the CPU they're running on. + * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw + * that will be shown the next time the proc or cgroup controls will + * be red. It on its turn can be changed by writing on its own + * control. + */ +struct dl_bandwidth { + raw_spinlock_t dl_runtime_lock; + u64 dl_runtime; + u64 dl_period; +}; + +static inline int dl_bandwidth_enabled(void) +{ + return sysctl_sched_dl_runtime >= 0; +} + +extern struct dl_bw *dl_bw_of(int i); + +struct dl_bw { + raw_spinlock_t lock; + u64 bw, total_bw; +}; + +static inline u64 global_dl_period(void); +static inline u64 global_dl_runtime(void); extern struct mutex sched_domains_mutex; @@ -423,6 +474,8 @@ struct dl_rq { */ struct rb_root pushable_dl_tasks_root; struct rb_node *pushable_dl_tasks_leftmost; +#else + struct dl_bw dl_bw; #endif }; @@ -449,6 +502,7 @@ struct root_domain { */ cpumask_var_t dlo_mask; atomic_t dlo_count; + struct dl_bw dl_bw; /* * The "RT overload" flag: it gets set if a CPU has more than @@ -897,7 +951,18 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +static inline u64 global_dl_period(void) +{ + return (u64)sysctl_sched_dl_period * NSEC_PER_USEC; +} +static inline u64 global_dl_runtime(void) +{ + if (sysctl_sched_dl_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_dl_runtime * NSEC_PER_USEC; +} static inline int task_current(struct rq *rq, struct task_struct *p) { @@ -1145,6 +1210,7 @@ extern void update_max_interval(void); extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); +extern void init_sched_dl_class(void); extern void resched_task(struct task_struct *p); extern void resched_cpu(int cpu); @@ -1152,8 +1218,12 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern struct dl_bandwidth def_dl_bandwidth; +extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); +unsigned long to_ratio(u64 period, u64 runtime); + extern void update_idle_cpu_load(struct rq *this_rq); extern void init_task_runnable_average(struct task_struct *p); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8da99f905cf..c7fb0790ac63 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -414,6 +414,20 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, + { + .procname = "sched_dl_period_us", + .data = &sysctl_sched_dl_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_dl_handler, + }, + { + .procname = "sched_dl_runtime_us", + .data = &sysctl_sched_dl_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_dl_handler, + }, #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", From 6bfd6d72f51c51177676f2b1ba113fe0a85fdae4 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 7 Nov 2013 14:43:47 +0100 Subject: [PATCH 25/60] sched/deadline: speed up SCHED_DEADLINE pushes with a push-heap Data from tests confirmed that the original active load balancing logic didn't scale neither in the number of CPU nor in the number of tasks (as sched_rt does). Here we provide a global data structure to keep track of deadlines of the running tasks in the system. The structure is composed by a bitmask showing the free CPUs and a max-heap, needed when the system is heavily loaded. The implementation and concurrent access scheme are kept simple by design. However, our measurements show that we can compete with sched_rt on large multi-CPUs machines [1]. Only the push path is addressed, the extension to use this structure also for pull decisions is straightforward. However, we are currently evaluating different (in order to decrease/avoid contention) data structures to solve possibly both problems. We are also going to re-run tests considering recent changes inside cpupri [2]. [1] http://retis.sssup.it/~jlelli/papers/Ospert11Lelli.pdf [2] http://www.spinics.net/lists/linux-rt-users/msg06778.html Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-14-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 2 +- kernel/sched/core.c | 3 + kernel/sched/cpudeadline.c | 216 +++++++++++++++++++++++++++++++++++++ kernel/sched/cpudeadline.h | 33 ++++++ kernel/sched/deadline.c | 53 +++------ kernel/sched/sched.h | 2 + 6 files changed, 269 insertions(+), 40 deletions(-) create mode 100644 kernel/sched/cpudeadline.c create mode 100644 kernel/sched/cpudeadline.h diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index b039035a9376..9a95c8c2af2a 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -14,7 +14,7 @@ endif obj-y += core.o proc.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o -obj-$(CONFIG_SMP) += cpupri.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c7c68e6b5c51..e30356d6b31f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5287,6 +5287,7 @@ static void free_rootdomain(struct rcu_head *rcu) struct root_domain *rd = container_of(rcu, struct root_domain, rcu); cpupri_cleanup(&rd->cpupri); + cpudl_cleanup(&rd->cpudl); free_cpumask_var(rd->dlo_mask); free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); @@ -5345,6 +5346,8 @@ static int init_rootdomain(struct root_domain *rd) goto free_dlo_mask; init_dl_bw(&rd->dl_bw); + if (cpudl_init(&rd->cpudl) != 0) + goto free_dlo_mask; if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c new file mode 100644 index 000000000000..3bcade554343 --- /dev/null +++ b/kernel/sched/cpudeadline.c @@ -0,0 +1,216 @@ +/* + * kernel/sched/cpudl.c + * + * Global CPU deadline management + * + * Author: Juri Lelli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include +#include +#include "cpudeadline.h" + +static inline int parent(int i) +{ + return (i - 1) >> 1; +} + +static inline int left_child(int i) +{ + return (i << 1) + 1; +} + +static inline int right_child(int i) +{ + return (i << 1) + 2; +} + +static inline int dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +void cpudl_exchange(struct cpudl *cp, int a, int b) +{ + int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; + + swap(cp->elements[a], cp->elements[b]); + swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); +} + +void cpudl_heapify(struct cpudl *cp, int idx) +{ + int l, r, largest; + + /* adapted from lib/prio_heap.c */ + while(1) { + l = left_child(idx); + r = right_child(idx); + largest = idx; + + if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, + cp->elements[l].dl)) + largest = l; + if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, + cp->elements[r].dl)) + largest = r; + if (largest == idx) + break; + + /* Push idx down the heap one level and bump one up */ + cpudl_exchange(cp, largest, idx); + idx = largest; + } +} + +void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) +{ + WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); + + if (dl_time_before(new_dl, cp->elements[idx].dl)) { + cp->elements[idx].dl = new_dl; + cpudl_heapify(cp, idx); + } else { + cp->elements[idx].dl = new_dl; + while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) { + cpudl_exchange(cp, idx, parent(idx)); + idx = parent(idx); + } + } +} + +static inline int cpudl_maximum(struct cpudl *cp) +{ + return cp->elements[0].cpu; +} + +/* + * cpudl_find - find the best (later-dl) CPU in the system + * @cp: the cpudl max-heap context + * @p: the task + * @later_mask: a mask to fill in with the selected CPUs (or NULL) + * + * Returns: int - best CPU (heap maximum if suitable) + */ +int cpudl_find(struct cpudl *cp, struct task_struct *p, + struct cpumask *later_mask) +{ + int best_cpu = -1; + const struct sched_dl_entity *dl_se = &p->dl; + + if (later_mask && cpumask_and(later_mask, cp->free_cpus, + &p->cpus_allowed) && cpumask_and(later_mask, + later_mask, cpu_active_mask)) { + best_cpu = cpumask_any(later_mask); + goto out; + } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && + dl_time_before(dl_se->deadline, cp->elements[0].dl)) { + best_cpu = cpudl_maximum(cp); + if (later_mask) + cpumask_set_cpu(best_cpu, later_mask); + } + +out: + WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); + + return best_cpu; +} + +/* + * cpudl_set - update the cpudl max-heap + * @cp: the cpudl max-heap context + * @cpu: the target cpu + * @dl: the new earliest deadline for this cpu + * + * Notes: assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) +{ + int old_idx, new_cpu; + unsigned long flags; + + WARN_ON(cpu > num_present_cpus()); + + raw_spin_lock_irqsave(&cp->lock, flags); + old_idx = cp->cpu_to_idx[cpu]; + if (!is_valid) { + /* remove item */ + if (old_idx == IDX_INVALID) { + /* + * Nothing to remove if old_idx was invalid. + * This could happen if a rq_offline_dl is + * called for a CPU without -dl tasks running. + */ + goto out; + } + new_cpu = cp->elements[cp->size - 1].cpu; + cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; + cp->elements[old_idx].cpu = new_cpu; + cp->size--; + cp->cpu_to_idx[new_cpu] = old_idx; + cp->cpu_to_idx[cpu] = IDX_INVALID; + while (old_idx > 0 && dl_time_before( + cp->elements[parent(old_idx)].dl, + cp->elements[old_idx].dl)) { + cpudl_exchange(cp, old_idx, parent(old_idx)); + old_idx = parent(old_idx); + } + cpumask_set_cpu(cpu, cp->free_cpus); + cpudl_heapify(cp, old_idx); + + goto out; + } + + if (old_idx == IDX_INVALID) { + cp->size++; + cp->elements[cp->size - 1].dl = 0; + cp->elements[cp->size - 1].cpu = cpu; + cp->cpu_to_idx[cpu] = cp->size - 1; + cpudl_change_key(cp, cp->size - 1, dl); + cpumask_clear_cpu(cpu, cp->free_cpus); + } else { + cpudl_change_key(cp, old_idx, dl); + } + +out: + raw_spin_unlock_irqrestore(&cp->lock, flags); +} + +/* + * cpudl_init - initialize the cpudl structure + * @cp: the cpudl max-heap context + */ +int cpudl_init(struct cpudl *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + raw_spin_lock_init(&cp->lock); + cp->size = 0; + for (i = 0; i < NR_CPUS; i++) + cp->cpu_to_idx[i] = IDX_INVALID; + if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) + return -ENOMEM; + cpumask_setall(cp->free_cpus); + + return 0; +} + +/* + * cpudl_cleanup - clean up the cpudl structure + * @cp: the cpudl max-heap context + */ +void cpudl_cleanup(struct cpudl *cp) +{ + /* + * nothing to do for the moment + */ +} diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h new file mode 100644 index 000000000000..a202789a412c --- /dev/null +++ b/kernel/sched/cpudeadline.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_CPUDL_H +#define _LINUX_CPUDL_H + +#include + +#define IDX_INVALID -1 + +struct array_item { + u64 dl; + int cpu; +}; + +struct cpudl { + raw_spinlock_t lock; + int size; + int cpu_to_idx[NR_CPUS]; + struct array_item elements[NR_CPUS]; + cpumask_var_t free_cpus; +}; + + +#ifdef CONFIG_SMP +int cpudl_find(struct cpudl *cp, struct task_struct *p, + struct cpumask *later_mask); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); +int cpudl_init(struct cpudl *cp); +void cpudl_cleanup(struct cpudl *cp); +#else +#define cpudl_set(cp, cpu, dl) do { } while (0) +#define cpudl_init() do { } while (0) +#endif /* CONFIG_SMP */ + +#endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 802188fb6338..0c6b1d089cd4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,8 @@ */ #include "sched.h" +#include + struct dl_bandwidth def_dl_bandwidth; static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) @@ -640,6 +642,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) */ dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; dl_rq->earliest_dl.curr = deadline; + cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); } else if (dl_rq->earliest_dl.next == 0 || dl_time_before(deadline, dl_rq->earliest_dl.next)) { /* @@ -663,6 +666,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); } else { struct rb_node *leftmost = dl_rq->rb_leftmost; struct sched_dl_entity *entry; @@ -670,6 +674,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; dl_rq->earliest_dl.next = next_deadline(rq); + cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); } } @@ -855,9 +860,6 @@ static void yield_task_dl(struct rq *rq) #ifdef CONFIG_SMP static int find_later_rq(struct task_struct *task); -static int latest_cpu_find(struct cpumask *span, - struct task_struct *task, - struct cpumask *later_mask); static int select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) @@ -904,7 +906,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - latest_cpu_find(rq->rd->span, rq->curr, NULL) == -1) + cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) return; /* @@ -912,7 +914,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * see if it is pushed or pulled somewhere else. */ if (p->nr_cpus_allowed != 1 && - latest_cpu_find(rq->rd->span, p, NULL) != -1) + cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; resched_task(rq->curr); @@ -1085,39 +1087,6 @@ next_node: return NULL; } -static int latest_cpu_find(struct cpumask *span, - struct task_struct *task, - struct cpumask *later_mask) -{ - const struct sched_dl_entity *dl_se = &task->dl; - int cpu, found = -1, best = 0; - u64 max_dl = 0; - - for_each_cpu(cpu, span) { - struct rq *rq = cpu_rq(cpu); - struct dl_rq *dl_rq = &rq->dl; - - if (cpumask_test_cpu(cpu, &task->cpus_allowed) && - (!dl_rq->dl_nr_running || dl_time_before(dl_se->deadline, - dl_rq->earliest_dl.curr))) { - if (later_mask) - cpumask_set_cpu(cpu, later_mask); - if (!best && !dl_rq->dl_nr_running) { - best = 1; - found = cpu; - } else if (!best && - dl_time_before(max_dl, - dl_rq->earliest_dl.curr)) { - max_dl = dl_rq->earliest_dl.curr; - found = cpu; - } - } else if (later_mask) - cpumask_clear_cpu(cpu, later_mask); - } - - return found; -} - static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); static int find_later_rq(struct task_struct *task) @@ -1134,7 +1103,8 @@ static int find_later_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; - best_cpu = latest_cpu_find(task_rq(task)->rd->span, task, later_mask); + best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, + task, later_mask); if (best_cpu == -1) return -1; @@ -1510,6 +1480,9 @@ static void rq_online_dl(struct rq *rq) { if (rq->dl.overloaded) dl_set_overload(rq); + + if (rq->dl.dl_nr_running > 0) + cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); } /* Assumes rq->lock is held */ @@ -1517,6 +1490,8 @@ static void rq_offline_dl(struct rq *rq) { if (rq->dl.overloaded) dl_clear_overload(rq); + + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); } void init_sched_dl_class(void) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ad4f4fbd002e..2b7421db6c41 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -10,6 +10,7 @@ #include #include "cpupri.h" +#include "cpudeadline.h" #include "cpuacct.h" struct rq; @@ -503,6 +504,7 @@ struct root_domain { cpumask_var_t dlo_mask; atomic_t dlo_count; struct dl_bw dl_bw; + struct cpudl cpudl; /* * The "RT overload" flag: it gets set if a CPU has more than From e4099a5e929435cd6349343f002583f29868c900 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Dec 2013 10:03:34 +0100 Subject: [PATCH 26/60] sched/deadline: Fix up the smp-affinity mask tests For now deadline tasks are not allowed to set smp affinity; however the current tests are wrong, cure this. The test in __sched_setscheduler() also uses an on-stack cpumask_t which is a no-no. Change both tests to use cpumask_subset() such that we test the root domain span to be a subset of the cpus_allowed mask. This way we're sure the tasks can always run on all CPUs they can be balanced over, and have no effective affinity constraints. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-fyqtb1lapxca3lhsxv9cumdc@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e30356d6b31f..27c6375d182a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3384,23 +3384,14 @@ change: #ifdef CONFIG_SMP if (dl_bandwidth_enabled() && dl_policy(policy)) { cpumask_t *span = rq->rd->span; - cpumask_t act_affinity; - - /* - * cpus_allowed mask is statically initialized with - * CPU_MASK_ALL, span is instead dynamic. Here we - * compute the "dynamic" affinity of a task. - */ - cpumask_and(&act_affinity, &p->cpus_allowed, - cpu_active_mask); /* * Don't allow tasks with an affinity mask smaller than * the entire root_domain to become SCHED_DEADLINE. We * will also fail if there's no bandwidth available. */ - if (!cpumask_equal(&act_affinity, span) || - rq->rd->dl_bw.bw == 0) { + if (!cpumask_subset(span, &p->cpus_allowed) || + rq->rd->dl_bw.bw == 0) { task_rq_unlock(rq, p, &flags); return -EPERM; } @@ -3420,8 +3411,7 @@ change: * of a SCHED_DEADLINE task) we need to check if enough bandwidth * is available. */ - if ((dl_policy(policy) || dl_task(p)) && - dl_overflow(p, policy, attr)) { + if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { task_rq_unlock(rq, p, &flags); return -EBUSY; } @@ -3860,6 +3850,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (retval) goto out_unlock; + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); + /* * Since bandwidth control happens on root_domain basis, * if admission test is enabled, we only admit -deadline @@ -3870,16 +3864,12 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (task_has_dl_policy(p)) { const struct cpumask *span = task_rq(p)->rd->span; - if (dl_bandwidth_enabled() && - !cpumask_equal(in_mask, span)) { + if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { retval = -EBUSY; goto out_unlock; } } #endif - - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); again: retval = set_cpus_allowed_ptr(p, new_mask); @@ -4535,7 +4525,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); * When dealing with a -deadline task, we have to check if moving it to * a new CPU is possible or not. In fact, this is only true iff there * is enough bandwidth available on such CPU, otherwise we want the - * whole migration progedure to fail over. + * whole migration procedure to fail over. */ static inline bool set_task_cpu_dl(struct task_struct *p, unsigned int cpu) From 1724813d9f2c7ff702b46d3e4a4f6d9b10a8f8c2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Dec 2013 12:44:49 +0100 Subject: [PATCH 27/60] sched/deadline: Remove the sysctl_sched_dl knobs Remove the deadline specific sysctls for now. The problem with them is that the interaction with the exisiting rt knobs is nearly impossible to get right. The current (as per before this patch) situation is that the rt and dl bandwidth is completely separate and we enforce rt+dl < 100%. This is undesirable because this means that the rt default of 95% leaves us hardly any room, even though dl tasks are saver than rt tasks. Another proposed solution was (a discarted patch) to have the dl bandwidth be a fraction of the rt bandwidth. This is highly confusing imo. Furthermore neither proposal is consistent with the situation we actually want; which is rt tasks ran from a dl server. In which case the rt bandwidth is a direct subset of dl. So whichever way we go, the introduction of dl controls at this point is painful. Therefore remove them and instead share the rt budget. This means that for now the rt knobs are used for dl admission control and the dl runtime is accounted against the rt runtime. I realise that this isn't entirely desirable either; but whatever we do we appear to need to change the interface later, so better have a small interface for now. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-zpyqbqds1r0vyxtxza1e7rdc@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/sysctl.h | 13 -- kernel/sched/core.c | 299 +++++++++++------------------------ kernel/sched/deadline.c | 27 +++- kernel/sched/sched.h | 18 +-- kernel/sysctl.c | 14 -- 5 files changed, 117 insertions(+), 254 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 8070a83dbedc..31e0193cb0c5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -81,15 +81,6 @@ static inline unsigned int get_sysctl_timer_migration(void) extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; -/* - * control SCHED_DEADLINE reservations: - * - * /proc/sys/kernel/sched_dl_period_us - * /proc/sys/kernel/sched_dl_runtime_us - */ -extern unsigned int sysctl_sched_dl_period; -extern int sysctl_sched_dl_runtime; - #ifdef CONFIG_CFS_BANDWIDTH extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif @@ -108,8 +99,4 @@ extern int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -int sched_dl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - #endif /* _SCHED_SYSCTL_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 27c6375d182a..1d33eb8143cc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6771,7 +6771,7 @@ void __init sched_init(void) init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); init_dl_bandwidth(&def_dl_bandwidth, - global_dl_period(), global_dl_runtime()); + global_rt_period(), global_rt_runtime()); #ifdef CONFIG_SMP init_defrootdomain(); @@ -7354,64 +7354,11 @@ static long sched_group_rt_period(struct task_group *tg) } #endif /* CONFIG_RT_GROUP_SCHED */ -/* - * Coupling of -rt and -deadline bandwidth. - * - * Here we check if the new -rt bandwidth value is consistent - * with the system settings for the bandwidth available - * to -deadline tasks. - * - * IOW, we want to enforce that - * - * rt_bandwidth + dl_bandwidth <= 100% - * - * is always true. - */ -static bool __sched_rt_dl_global_constraints(u64 rt_bw) -{ - unsigned long flags; - u64 dl_bw; - bool ret; - - raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock, flags); - if (global_rt_runtime() == RUNTIME_INF || - global_dl_runtime() == RUNTIME_INF) { - ret = true; - goto unlock; - } - - dl_bw = to_ratio(def_dl_bandwidth.dl_period, - def_dl_bandwidth.dl_runtime); - - ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF); -unlock: - raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock, flags); - - return ret; -} - #ifdef CONFIG_RT_GROUP_SCHED static int sched_rt_global_constraints(void) { - u64 runtime, period, bw; int ret = 0; - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - runtime = global_rt_runtime(); - period = global_rt_period(); - - /* - * Sanity check on the sysctl variables. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; - - bw = to_ratio(period, runtime); - if (!__sched_rt_dl_global_constraints(bw)) - return -EINVAL; - mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); ret = __rt_schedulable(NULL, 0, 0); @@ -7435,18 +7382,8 @@ static int sched_rt_global_constraints(void) { unsigned long flags; int i, ret = 0; - u64 bw; - - if (sysctl_sched_rt_period <= 0) - return -EINVAL; raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - bw = to_ratio(global_rt_period(), global_rt_runtime()); - if (!__sched_rt_dl_global_constraints(bw)) { - ret = -EINVAL; - goto unlock; - } - for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; @@ -7454,69 +7391,18 @@ static int sched_rt_global_constraints(void) rt_rq->rt_runtime = global_rt_runtime(); raw_spin_unlock(&rt_rq->rt_runtime_lock); } -unlock: raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); return ret; } #endif /* CONFIG_RT_GROUP_SCHED */ -/* - * Coupling of -dl and -rt bandwidth. - * - * Here we check, while setting the system wide bandwidth available - * for -dl tasks and groups, if the new values are consistent with - * the system settings for the bandwidth available to -rt entities. - * - * IOW, we want to enforce that - * - * rt_bandwidth + dl_bandwidth <= 100% - * - * is always true. - */ -static bool __sched_dl_rt_global_constraints(u64 dl_bw) -{ - u64 rt_bw; - bool ret; - - raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock); - if (global_dl_runtime() == RUNTIME_INF || - global_rt_runtime() == RUNTIME_INF) { - ret = true; - goto unlock; - } - - rt_bw = to_ratio(ktime_to_ns(def_rt_bandwidth.rt_period), - def_rt_bandwidth.rt_runtime); - - ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF); -unlock: - raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock); - - return ret; -} - -static bool __sched_dl_global_constraints(u64 runtime, u64 period) -{ - if (!period || (runtime != RUNTIME_INF && runtime > period)) - return -EINVAL; - - return 0; -} - static int sched_dl_global_constraints(void) { - u64 runtime = global_dl_runtime(); - u64 period = global_dl_period(); + u64 runtime = global_rt_runtime(); + u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); - int ret, i; - - ret = __sched_dl_global_constraints(runtime, period); - if (ret) - return ret; - - if (!__sched_dl_rt_global_constraints(new_bw)) - return -EINVAL; + int cpu, ret = 0; /* * Here we want to check the bandwidth not being set to some @@ -7527,20 +7413,101 @@ static int sched_dl_global_constraints(void) * cycling on root_domains... Discussion on different/better * solutions is welcome! */ - for_each_possible_cpu(i) { - struct dl_bw *dl_b = dl_bw_of(i); + for_each_possible_cpu(cpu) { + struct dl_bw *dl_b = dl_bw_of(cpu); raw_spin_lock(&dl_b->lock); - if (new_bw < dl_b->total_bw) { - raw_spin_unlock(&dl_b->lock); - return -EBUSY; - } + if (new_bw < dl_b->total_bw) + ret = -EBUSY; raw_spin_unlock(&dl_b->lock); + + if (ret) + break; } + return ret; +} + +static void sched_dl_do_global(void) +{ + u64 new_bw = -1; + int cpu; + + def_dl_bandwidth.dl_period = global_rt_period(); + def_dl_bandwidth.dl_runtime = global_rt_runtime(); + + if (global_rt_runtime() != RUNTIME_INF) + new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + + /* + * FIXME: As above... + */ + for_each_possible_cpu(cpu) { + struct dl_bw *dl_b = dl_bw_of(cpu); + + raw_spin_lock(&dl_b->lock); + dl_b->bw = new_bw; + raw_spin_unlock(&dl_b->lock); + } +} + +static int sched_rt_global_validate(void) +{ + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) + return -EINVAL; + return 0; } +static void sched_rt_do_global(void) +{ + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +} + +int sched_rt_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + int ret; + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_validate(); + if (ret) + goto undo; + + ret = sched_rt_global_constraints(); + if (ret) + goto undo; + + ret = sched_dl_global_constraints(); + if (ret) + goto undo; + + sched_rt_do_global(); + sched_dl_do_global(); + } + if (0) { +undo: + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } + mutex_unlock(&mutex); + + return ret; +} + int sched_rr_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -7560,90 +7527,6 @@ int sched_rr_handler(struct ctl_table *table, int write, return ret; } -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - int old_period, old_runtime; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); - old_period = sysctl_sched_rt_period; - old_runtime = sysctl_sched_rt_runtime; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - - if (!ret && write) { - ret = sched_rt_global_constraints(); - if (ret) { - sysctl_sched_rt_period = old_period; - sysctl_sched_rt_runtime = old_runtime; - } else { - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = - ns_to_ktime(global_rt_period()); - } - } - mutex_unlock(&mutex); - - return ret; -} - -int sched_dl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - int old_period, old_runtime; - static DEFINE_MUTEX(mutex); - unsigned long flags; - - mutex_lock(&mutex); - old_period = sysctl_sched_dl_period; - old_runtime = sysctl_sched_dl_runtime; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - - if (!ret && write) { - raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock, - flags); - - ret = sched_dl_global_constraints(); - if (ret) { - sysctl_sched_dl_period = old_period; - sysctl_sched_dl_runtime = old_runtime; - } else { - u64 new_bw; - int i; - - def_dl_bandwidth.dl_period = global_dl_period(); - def_dl_bandwidth.dl_runtime = global_dl_runtime(); - if (global_dl_runtime() == RUNTIME_INF) - new_bw = -1; - else - new_bw = to_ratio(global_dl_period(), - global_dl_runtime()); - /* - * FIXME: As above... - */ - for_each_possible_cpu(i) { - struct dl_bw *dl_b = dl_bw_of(i); - - raw_spin_lock(&dl_b->lock); - dl_b->bw = new_bw; - raw_spin_unlock(&dl_b->lock); - } - } - - raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock, - flags); - } - mutex_unlock(&mutex); - - return ret; -} - #ifdef CONFIG_CGROUP_SCHED static inline struct task_group *css_tg(struct cgroup_subsys_state *css) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0c6b1d089cd4..ee25361becdd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -63,10 +63,10 @@ void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock); raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); - if (global_dl_runtime() == RUNTIME_INF) + if (global_rt_runtime() == RUNTIME_INF) dl_b->bw = -1; else - dl_b->bw = to_ratio(global_dl_period(), global_dl_runtime()); + dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); dl_b->total_bw = 0; } @@ -612,6 +612,29 @@ static void update_curr_dl(struct rq *rq) if (!is_leftmost(curr, &rq->dl)) resched_task(curr); } + + /* + * Because -- for now -- we share the rt bandwidth, we need to + * account our runtime there too, otherwise actual rt tasks + * would be able to exceed the shared quota. + * + * Account to the root rt group for now. + * + * The solution we're working towards is having the RT groups scheduled + * using deadline servers -- however there's a few nasties to figure + * out before that can happen. + */ + if (rt_bandwidth_enabled()) { + struct rt_rq *rt_rq = &rq->rt; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_time += delta_exec; + /* + * We'll let actual RT tasks worry about the overflow here, we + * have our own CBS to keep us inline -- see above. + */ + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } } #ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2b7421db6c41..890339099550 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -176,7 +176,7 @@ struct dl_bandwidth { static inline int dl_bandwidth_enabled(void) { - return sysctl_sched_dl_runtime >= 0; + return sysctl_sched_rt_runtime >= 0; } extern struct dl_bw *dl_bw_of(int i); @@ -186,9 +186,6 @@ struct dl_bw { u64 bw, total_bw; }; -static inline u64 global_dl_period(void); -static inline u64 global_dl_runtime(void); - extern struct mutex sched_domains_mutex; #ifdef CONFIG_CGROUP_SCHED @@ -953,19 +950,6 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -static inline u64 global_dl_period(void) -{ - return (u64)sysctl_sched_dl_period * NSEC_PER_USEC; -} - -static inline u64 global_dl_runtime(void) -{ - if (sysctl_sched_dl_runtime < 0) - return RUNTIME_INF; - - return (u64)sysctl_sched_dl_runtime * NSEC_PER_USEC; -} - static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c7fb0790ac63..c8da99f905cf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -414,20 +414,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, - { - .procname = "sched_dl_period_us", - .data = &sysctl_sched_dl_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_dl_handler, - }, - { - .procname = "sched_dl_runtime_us", - .data = &sysctl_sched_dl_runtime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_dl_handler, - }, #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", From de212f18e92c952533d57c5510d2790199c75734 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Dec 2013 11:54:45 +0100 Subject: [PATCH 28/60] sched/deadline: Fix hotplug admission control The current hotplug admission control is broken because: CPU_DYING -> migration_call() -> migrate_tasks() -> __migrate_task() cannot fail and hard assumes it _will_ move all tasks off of the dying cpu, failing this will break hotplug. The much simpler solution is a DOWN_PREPARE handler that fails when removing one CPU gets us below the total allocated bandwidth. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131220171343.GL2480@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 83 +++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 51 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1d33eb8143cc..a549d9a22502 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1887,9 +1887,15 @@ inline struct dl_bw *dl_bw_of(int i) return &cpu_rq(i)->rd->dl_bw; } -static inline int __dl_span_weight(struct rq *rq) +static inline int dl_bw_cpus(int i) { - return cpumask_weight(rq->rd->span); + struct root_domain *rd = cpu_rq(i)->rd; + int cpus = 0; + + for_each_cpu_and(i, rd->span, cpu_active_mask) + cpus++; + + return cpus; } #else inline struct dl_bw *dl_bw_of(int i) @@ -1897,7 +1903,7 @@ inline struct dl_bw *dl_bw_of(int i) return &cpu_rq(i)->dl.dl_bw; } -static inline int __dl_span_weight(struct rq *rq) +static inline int dl_bw_cpus(int i) { return 1; } @@ -1938,8 +1944,7 @@ static int dl_overflow(struct task_struct *p, int policy, u64 period = attr->sched_period; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; - int cpus = __dl_span_weight(task_rq(p)); - int err = -1; + int cpus, err = -1; if (new_bw == p->dl.dl_bw) return 0; @@ -1950,6 +1955,7 @@ static int dl_overflow(struct task_struct *p, int policy, * allocated bandwidth of the container. */ raw_spin_lock(&dl_b->lock); + cpus = dl_bw_cpus(task_cpu(p)); if (dl_policy(policy) && !task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, 0, new_bw)) { __dl_add(dl_b, new_bw); @@ -4521,42 +4527,6 @@ out: } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -/* - * When dealing with a -deadline task, we have to check if moving it to - * a new CPU is possible or not. In fact, this is only true iff there - * is enough bandwidth available on such CPU, otherwise we want the - * whole migration procedure to fail over. - */ -static inline -bool set_task_cpu_dl(struct task_struct *p, unsigned int cpu) -{ - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - struct dl_bw *cpu_b = dl_bw_of(cpu); - int ret = 1; - u64 bw; - - if (dl_b == cpu_b) - return 1; - - raw_spin_lock(&dl_b->lock); - raw_spin_lock(&cpu_b->lock); - - bw = cpu_b->bw * cpumask_weight(cpu_rq(cpu)->rd->span); - if (dl_bandwidth_enabled() && - bw < cpu_b->total_bw + p->dl.dl_bw) { - ret = 0; - goto unlock; - } - dl_b->total_bw -= p->dl.dl_bw; - cpu_b->total_bw += p->dl.dl_bw; - -unlock: - raw_spin_unlock(&cpu_b->lock); - raw_spin_unlock(&dl_b->lock); - - return ret; -} - /* * Move (not current) task off this cpu, onto dest cpu. We're doing * this because either it can't run here any more (set_cpus_allowed() @@ -4588,13 +4558,6 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) goto fail; - /* - * If p is -deadline, proceed only if there is enough - * bandwidth available on dest_cpu - */ - if (unlikely(dl_task(p)) && !set_task_cpu_dl(p, dest_cpu)) - goto fail; - /* * If we're not on a rq, the next wake-up will ensure we're * placed properly. @@ -5052,13 +5015,31 @@ static int sched_cpu_active(struct notifier_block *nfb, static int sched_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { + unsigned long flags; + long cpu = (long)hcpu; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); + set_cpu_active(cpu, false); + + /* explicitly allow suspend */ + if (!(action & CPU_TASKS_FROZEN)) { + struct dl_bw *dl_b = dl_bw_of(cpu); + bool overflow; + int cpus; + + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + if (overflow) + return notifier_from_errno(-EBUSY); + } return NOTIFY_OK; - default: - return NOTIFY_DONE; } + + return NOTIFY_DONE; } static int __init migration_init(void) From 7caff66f361c44d0fbc74ed1cfa60a357fc84cf2 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 6 Jan 2014 12:34:38 +0100 Subject: [PATCH 29/60] sched: Reduce trigger_load_balance() parameters The cpu information is already stored in the struct rq, so no need to pass it as parameter to the trigger_load_balance function. Cc: linaro-kernel@lists.linaro.org Cc: preeti.lkml@gmail.com Cc: mingo@redhat.com Cc: peterz@infradead.org Signed-off-by: Daniel Lezcano Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389008085-9069-2-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 4 +++- kernel/sched/sched.h | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a549d9a22502..392c6f87906e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2436,7 +2436,7 @@ void scheduler_tick(void) #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq, cpu); + trigger_load_balance(rq); #endif rq_last_tick_reset(rq); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b73f4ba62b24..b35d32201617 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6876,8 +6876,10 @@ static inline int on_null_domain(int cpu) /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -void trigger_load_balance(struct rq *rq, int cpu) +void trigger_load_balance(struct rq *rq) { + int cpu = rq->cpu; + /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 890339099550..c2119fd20f8b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1175,7 +1175,7 @@ extern const struct sched_class idle_sched_class; extern void update_group_power(struct sched_domain *sd, int cpu); -extern void trigger_load_balance(struct rq *rq, int cpu); +extern void trigger_load_balance(struct rq *rq); extern void idle_balance(int this_cpu, struct rq *this_rq); extern void idle_enter_fair(struct rq *this_rq); From 4a725627f21df6b280a19f50bc849daaab3b1544 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 6 Jan 2014 12:34:39 +0100 Subject: [PATCH 30/60] sched: Reduce nohz_kick_needed() parameters The cpu information is already stored in the struct rq, so no need to pass it as parameter to the nohz_kick_needed function. The caller of this function just called idle_cpu() before to fill the rq->idle_balance field. Use rq->cpu and rq->idle_balance. Signed-off-by: Daniel Lezcano Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389008085-9069-3-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b35d32201617..c47b1ceeaae9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6788,14 +6788,14 @@ end: * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline int nohz_kick_needed(struct rq *rq, int cpu) +static inline int nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; struct sched_group_power *sgp; - int nr_busy; + int nr_busy, cpu = rq->cpu; - if (unlikely(idle_cpu(cpu))) + if (unlikely(rq->idle_balance)) return 0; /* @@ -6885,7 +6885,7 @@ void trigger_load_balance(struct rq *rq) likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + if (nohz_kick_needed(rq) && likely(!on_null_domain(cpu))) nohz_balancer_kick(cpu); #endif } From 63f609b160151c9e86b26b935c9671a23afe299f Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 6 Jan 2014 12:34:40 +0100 Subject: [PATCH 31/60] sched: Pass 'struct rq' to on_null_domain() The on_null_domain() function is getting the cpu to retrieve the struct rq associated with it. Pass 'struct rq' directly to the function as the caller already has the info. Signed-off-by: Daniel Lezcano Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389008085-9069-4-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c47b1ceeaae9..fc0afc55b4a6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6868,9 +6868,9 @@ static void run_rebalance_domains(struct softirq_action *h) nohz_idle_balance(this_cpu, idle); } -static inline int on_null_domain(int cpu) +static inline int on_null_domain(struct rq *rq) { - return !rcu_dereference_sched(cpu_rq(cpu)->sd); + return !rcu_dereference_sched(rq->sd); } /* @@ -6882,10 +6882,10 @@ void trigger_load_balance(struct rq *rq) /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && - likely(!on_null_domain(cpu))) + likely(!on_null_domain(rq))) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq) && likely(!on_null_domain(cpu))) + if (nohz_kick_needed(rq) && likely(!on_null_domain(rq))) nohz_balancer_kick(cpu); #endif } From 3dd0337d6df7d54c82ecebfa6485040f686bf8b1 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 6 Jan 2014 12:34:41 +0100 Subject: [PATCH 32/60] sched: Remove unused parameter from find_new_ilb() The 'call_cpu' is never used in the function. Remove it. Reviewed-by: Preeti U Murthy Signed-off-by: Daniel Lezcano Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389008085-9069-5-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fc0afc55b4a6..5fda3c4411f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6509,7 +6509,7 @@ static struct { unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; -static inline int find_new_ilb(int call_cpu) +static inline int find_new_ilb(void) { int ilb = cpumask_first(nohz.idle_cpus_mask); @@ -6530,7 +6530,7 @@ static void nohz_balancer_kick(int cpu) nohz.next_balance++; - ilb_cpu = find_new_ilb(cpu); + ilb_cpu = find_new_ilb(); if (ilb_cpu >= nr_cpu_ids) return; From 0aeeeebac8d8304501680f12862784341f4bee7b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 6 Jan 2014 12:34:42 +0100 Subject: [PATCH 33/60] sched: Remove unused parameter from nohz_balancer_kick() The cpu parameter is no longer needed in nohz_balancer_kick, let's remove the parameter. Reviewed-by: Preeti U Murthy Signed-off-by: Daniel Lezcano Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389008085-9069-6-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5fda3c4411f6..b91cd93a32a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6524,7 +6524,7 @@ static inline int find_new_ilb(void) * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). */ -static void nohz_balancer_kick(int cpu) +static void nohz_balancer_kick(void) { int ilb_cpu; @@ -6878,15 +6878,13 @@ static inline int on_null_domain(struct rq *rq) */ void trigger_load_balance(struct rq *rq) { - int cpu = rq->cpu; - /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(rq))) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON if (nohz_kick_needed(rq) && likely(!on_null_domain(rq))) - nohz_balancer_kick(cpu); + nohz_balancer_kick(); #endif } From f7ed0a895ead0f093f59898ff9cf4e20768a5f09 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 6 Jan 2014 12:34:43 +0100 Subject: [PATCH 34/60] sched: Pass 'struct rq' to rebalance_domains() The cpu information is stored in the struct rq and the caller of the rebalance_domains function pass the cpu to retrieve the struct rq but it already has the struct rq info. Replace the cpu parameter with the struct rq. Signed-off-by: Daniel Lezcano Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389008085-9069-7-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b91cd93a32a3..ff4e0dfaecc6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6640,10 +6640,10 @@ void update_max_interval(void) * * Balancing parameters are set up in init_sched_domains. */ -static void rebalance_domains(int cpu, enum cpu_idle_type idle) +static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; - struct rq *rq = cpu_rq(cpu); + int cpu = rq->cpu; unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -6769,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) update_idle_cpu_load(rq); raw_spin_unlock_irq(&rq->lock); - rebalance_domains(balance_cpu, CPU_IDLE); + rebalance_domains(rq, CPU_IDLE); if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; @@ -6858,7 +6858,7 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(this_cpu, idle); + rebalance_domains(this_rq, idle); /* * If this cpu has a pending nohz_balance_kick, then do the From 208cb16ba325552a3935bfc002e34561b0d512d7 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 6 Jan 2014 12:34:44 +0100 Subject: [PATCH 35/60] sched: Pass 'struct rq' to nohz_idle_balance() The cpu information is stored in the struct rq. Pass the struct rq to nohz_idle_balance, so all the functions called in run_rebalance_domains have the same parameters and the 'this_cpu' variable becomes pointless. Signed-off-by: Daniel Lezcano [ Added !SMP build fix. ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389008085-9069-8-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff4e0dfaecc6..d7220d124f16 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6740,9 +6740,9 @@ out: * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { - struct rq *this_rq = cpu_rq(this_cpu); + int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; @@ -6844,7 +6844,7 @@ need_kick: return 1; } #else -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } #endif /* @@ -6853,8 +6853,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } */ static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); - struct rq *this_rq = cpu_rq(this_cpu); + struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; @@ -6865,7 +6864,7 @@ static void run_rebalance_domains(struct softirq_action *h) * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - nohz_idle_balance(this_cpu, idle); + nohz_idle_balance(this_rq, idle); } static inline int on_null_domain(struct rq *rq) From c726099ec224be8078d91072207053ff9a1ad6fc Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 6 Jan 2014 12:34:45 +0100 Subject: [PATCH 36/60] sched: Factor out the on_null_domain() checks in trigger_load_balance() The test on_null_domain is done twice in the trigger_load_balance function. Move the test at the begin of the function, so there is only one check. Signed-off-by: Daniel Lezcano Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389008085-9069-9-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d7220d124f16..b24b6cfde9aa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6878,11 +6878,13 @@ static inline int on_null_domain(struct rq *rq) void trigger_load_balance(struct rq *rq) { /* Don't need to rebalance while attached to NULL domain */ - if (time_after_eq(jiffies, rq->next_balance) && - likely(!on_null_domain(rq))) + if (unlikely(on_null_domain(rq))) + return; + + if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq) && likely(!on_null_domain(rq))) + if (nohz_kick_needed(rq)) nohz_balancer_kick(); #endif } From 9ea4c380066fbe23fe0da7f4abfabc444f2467f4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Nov 2013 16:13:38 +0100 Subject: [PATCH 37/60] locking: Optimize lock_bh functions Currently all _bh_ lock functions do two preempt_count operations: local_bh_disable(); preempt_disable(); and for the unlock: preempt_enable_no_resched(); local_bh_enable(); Since its a waste of perfectly good cycles to modify the same variable twice when you can do it in one go; use the new __local_bh_{dis,en}able_ip() functions that allow us to provide a preempt_count value to add/sub. So define SOFTIRQ_LOCK_OFFSET as the offset a _bh_ lock needs to add/sub to be done in one go. As a bonus it gets rid of the preempt_enable_no_resched() usage. This reduces a 1000 loops of: spin_lock_bh(&bh_lock); spin_unlock_bh(&bh_lock); from 53596 cycles to 51995 cycles. I didn't do enough measurements to say for absolute sure that the result is significant but the the few runs I did for each suggest it is so. Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: jacob.jun.pan@linux.intel.com Cc: Mike Galbraith Cc: hpa@zytor.com Cc: Arjan van de Ven Cc: lenb@kernel.org Cc: rjw@rjwysocki.net Cc: rui.zhang@intel.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20131119151338.GF3694@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/preempt_mask.h | 15 +++++++++++++++ include/linux/rwlock_api_smp.h | 12 ++++-------- include/linux/spinlock_api_smp.h | 12 ++++-------- include/linux/spinlock_api_up.h | 16 +++++++++++----- kernel/softirq.c | 4 ++-- 5 files changed, 36 insertions(+), 23 deletions(-) diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h index d169820203dd..b8d96bca4357 100644 --- a/include/linux/preempt_mask.h +++ b/include/linux/preempt_mask.h @@ -78,6 +78,21 @@ # define PREEMPT_CHECK_OFFSET 0 #endif +/* + * The preempt_count offset needed for things like: + * + * spin_lock_bh() + * + * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and + * softirqs, such that unlock sequences of: + * + * spin_unlock(); + * local_bh_enable(); + * + * Work as expected. + */ +#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET) + /* * Are we running in atomic context? WARNING: this macro cannot * always detect atomic context; in particular, it cannot know about diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index 9c9f0495d37c..5b9b84b20407 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -172,8 +172,7 @@ static inline void __raw_read_lock_irq(rwlock_t *lock) static inline void __raw_read_lock_bh(rwlock_t *lock) { - local_bh_disable(); - preempt_disable(); + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock); } @@ -200,8 +199,7 @@ static inline void __raw_write_lock_irq(rwlock_t *lock) static inline void __raw_write_lock_bh(rwlock_t *lock) { - local_bh_disable(); - preempt_disable(); + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); } @@ -250,8 +248,7 @@ static inline void __raw_read_unlock_bh(rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); do_raw_read_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); } static inline void __raw_write_unlock_irqrestore(rwlock_t *lock, @@ -275,8 +272,7 @@ static inline void __raw_write_unlock_bh(rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); do_raw_write_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); } #endif /* __LINUX_RWLOCK_API_SMP_H */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index bdb9993f0fda..42dfab89e740 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -131,8 +131,7 @@ static inline void __raw_spin_lock_irq(raw_spinlock_t *lock) static inline void __raw_spin_lock_bh(raw_spinlock_t *lock) { - local_bh_disable(); - preempt_disable(); + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); } @@ -174,20 +173,17 @@ static inline void __raw_spin_unlock_bh(raw_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); do_raw_spin_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); } static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) { - local_bh_disable(); - preempt_disable(); + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); if (do_raw_spin_trylock(lock)) { spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); return 1; } - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); return 0; } diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index af1f47229e70..d0d188861ad6 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -24,11 +24,14 @@ * flags straight, to suppress compiler warnings of unused lock * variables, and to add the proper checker annotations: */ +#define ___LOCK(lock) \ + do { __acquire(lock); (void)(lock); } while (0) + #define __LOCK(lock) \ - do { preempt_disable(); __acquire(lock); (void)(lock); } while (0) + do { preempt_disable(); ___LOCK(lock); } while (0) #define __LOCK_BH(lock) \ - do { local_bh_disable(); __LOCK(lock); } while (0) + do { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); ___LOCK(lock); } while (0) #define __LOCK_IRQ(lock) \ do { local_irq_disable(); __LOCK(lock); } while (0) @@ -36,12 +39,15 @@ #define __LOCK_IRQSAVE(lock, flags) \ do { local_irq_save(flags); __LOCK(lock); } while (0) +#define ___UNLOCK(lock) \ + do { __release(lock); (void)(lock); } while (0) + #define __UNLOCK(lock) \ - do { preempt_enable(); __release(lock); (void)(lock); } while (0) + do { preempt_enable(); ___UNLOCK(lock); } while (0) #define __UNLOCK_BH(lock) \ - do { preempt_enable_no_resched(); local_bh_enable(); \ - __release(lock); (void)(lock); } while (0) + do { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); \ + ___UNLOCK(lock); } while (0) #define __UNLOCK_IRQ(lock) \ do { local_irq_enable(); __UNLOCK(lock); } while (0) diff --git a/kernel/softirq.c b/kernel/softirq.c index 11025ccc06dd..7500cce1ebfd 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -107,7 +107,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) /* * Were softirqs turned off above: */ - if (softirq_count() == cnt) + if (softirq_count() == (cnt & SOFTIRQ_MASK)) trace_softirqs_off(ip); raw_local_irq_restore(flags); @@ -133,7 +133,7 @@ static void __local_bh_enable(unsigned int cnt) { WARN_ON_ONCE(!irqs_disabled()); - if (softirq_count() == cnt) + if (softirq_count() == (cnt & SOFTIRQ_MASK)) trace_softirqs_on(_RET_IP_); preempt_count_sub(cnt); } From 62b94a08da1bae9d187d49dfcd6665af393750f8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Nov 2013 16:52:19 +0100 Subject: [PATCH 38/60] sched/preempt: Take away preempt_enable_no_resched() from modules Discourage drivers/modules to be creative with preemption. Sadly all is implemented in macros and inline so if they want to do evil they still can, but at least try and discourage some. Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Eliezer Tamir Cc: rui.zhang@intel.com Cc: jacob.jun.pan@linux.intel.com Cc: Mike Galbraith Cc: hpa@zytor.com Cc: Rusty Russell Cc: Arjan van de Ven Cc: lenb@kernel.org Cc: rjw@rjwysocki.net Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-fn7h6vu8wtgxk0ih402qcijx@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 22 ++++++++++++++++++++-- include/linux/uaccess.h | 5 ++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index a3d9dc8c2c00..dd9ddf8af205 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -64,7 +64,11 @@ do { \ } while (0) #else -#define preempt_enable() preempt_enable_no_resched() +#define preempt_enable() \ +do { \ + barrier(); \ + preempt_count_dec(); \ +} while (0) #define preempt_check_resched() do { } while (0) #endif @@ -93,7 +97,11 @@ do { \ __preempt_schedule_context(); \ } while (0) #else -#define preempt_enable_notrace() preempt_enable_no_resched_notrace() +#define preempt_enable_notrace() \ +do { \ + barrier(); \ + __preempt_count_dec(); \ +} while (0) #endif #else /* !CONFIG_PREEMPT_COUNT */ @@ -116,6 +124,16 @@ do { \ #endif /* CONFIG_PREEMPT_COUNT */ +#ifdef MODULE +/* + * Modules have no business playing preemption tricks. + */ +#undef sched_preempt_enable_no_resched +#undef preempt_enable_no_resched +#undef preempt_enable_no_resched_notrace +#undef preempt_check_resched +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 9d8cf056e661..ecd3319dac33 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -25,13 +25,16 @@ static inline void pagefault_disable(void) static inline void pagefault_enable(void) { +#ifndef CONFIG_PREEMPT /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); preempt_count_dec(); - preempt_check_resched(); +#else + preempt_enable(); +#endif } #ifndef ARCH_HAS_NOCACHE_UACCESS From 5dd12c2152743747ca9f50ef80281e54cc416dc0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Nov 2013 18:04:39 +0100 Subject: [PATCH 39/60] sched/clock, x86: Use mul_u64_u32_shr() for native_sched_clock() Use mul_u64_u32_shr() so that x86_64 can use a single 64x64->128 mul. Before: 0000000000000560 : 560: 44 8b 1d 00 00 00 00 mov 0x0(%rip),%r11d # 567 567: 55 push %rbp 568: 48 89 e5 mov %rsp,%rbp 56b: 45 85 db test %r11d,%r11d 56e: 75 4f jne 5bf 570: 0f 31 rdtsc 572: 89 c0 mov %eax,%eax 574: 48 c1 e2 20 shl $0x20,%rdx 578: 48 c7 c1 00 00 00 00 mov $0x0,%rcx 57f: 48 09 c2 or %rax,%rdx 582: 48 c7 c7 00 00 00 00 mov $0x0,%rdi 589: 65 8b 04 25 00 00 00 mov %gs:0x0,%eax 590: 00 591: 48 98 cltq 593: 48 8b 34 c5 00 00 00 mov 0x0(,%rax,8),%rsi 59a: 00 59b: 48 89 d0 mov %rdx,%rax 59e: 81 e2 ff 03 00 00 and $0x3ff,%edx 5a4: 48 c1 e8 0a shr $0xa,%rax 5a8: 48 0f af 14 0e imul (%rsi,%rcx,1),%rdx 5ad: 48 0f af 04 0e imul (%rsi,%rcx,1),%rax 5b2: 5d pop %rbp 5b3: 48 03 04 3e add (%rsi,%rdi,1),%rax 5b7: 48 c1 ea 0a shr $0xa,%rdx 5bb: 48 01 d0 add %rdx,%rax 5be: c3 retq After: 0000000000000550 : 550: 8b 3d 00 00 00 00 mov 0x0(%rip),%edi # 556 556: 55 push %rbp 557: 48 89 e5 mov %rsp,%rbp 55a: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp 55e: 85 ff test %edi,%edi 560: 75 2c jne 58e 562: 0f 31 rdtsc 564: 89 c0 mov %eax,%eax 566: 48 c1 e2 20 shl $0x20,%rdx 56a: 48 09 c2 or %rax,%rdx 56d: 65 48 8b 04 25 00 00 mov %gs:0x0,%rax 574: 00 00 576: 89 c0 mov %eax,%eax 578: 48 f7 e2 mul %rdx 57b: 65 48 8b 0c 25 00 00 mov %gs:0x0,%rcx 582: 00 00 584: c9 leaveq 585: 48 0f ac d0 0a shrd $0xa,%rdx,%rax 58a: 48 01 c8 add %rcx,%rax 58d: c3 retq MAINLINE POST sched_clock_stable: 1 1 (cold) sched_clock: 329841 331312 (cold) local_clock: 301773 310296 (warm) sched_clock: 38375 38247 (warm) local_clock: 100371 102713 (warm) rdtsc: 27340 27289 sched_clock_stable: 0 0 (cold) sched_clock: 382634 372706 (cold) local_clock: 396890 399275 (warm) sched_clock: 38194 38124 (warm) local_clock: 143452 148698 (warm) rdtsc: 27345 27365 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-piu203ses5y1g36bnyw2n16x@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 34baa0eb5d0c..10a78c037910 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -4,6 +4,7 @@ #include #include #include +#include #define TICK_SIZE (tick_nsec / 1000) @@ -57,10 +58,8 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); static inline unsigned long long __cycles_2_ns(unsigned long long cyc) { - int cpu = smp_processor_id(); - unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), - (1UL << CYC2NS_SCALE_FACTOR)); + unsigned long long ns = this_cpu_read(cyc2ns_offset); + ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); return ns; } From 57c67da274f3fab38e08d2c9edf08b89e1d9c71d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Nov 2013 15:39:25 +0100 Subject: [PATCH 40/60] sched/clock, x86: Move some cyc2ns() code around There are no __cycles_2_ns() users outside of arch/x86/kernel/tsc.c, so move it there. There are no cycles_2_ns() users. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-01lslnavfgo3kmbo4532zlcj@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 59 ------------------ arch/x86/kernel/tsc.c | 112 +++++++++++++++++++---------------- 2 files changed, 61 insertions(+), 110 deletions(-) diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 10a78c037910..b4c667693a21 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -13,66 +13,7 @@ extern int recalibrate_cpu_khz(void); extern int no_timer_check; -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - * - * In: - * - * ns = cycles * cyc2ns_scale / SC - * - * Although we may still have enough bits to store the value of ns, - * in some cases, we may not have enough bits to store cycles * cyc2ns_scale, - * leading to an incorrect result. - * - * To avoid this, we can decompose 'cycles' into quotient and remainder - * of division by SC. Then, - * - * ns = (quot * SC + rem) * cyc2ns_scale / SC - * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC - * - * - sqazi@google.com - */ - DECLARE_PER_CPU(unsigned long, cyc2ns); DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline unsigned long long __cycles_2_ns(unsigned long long cyc) -{ - unsigned long long ns = this_cpu_read(cyc2ns_offset); - ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); - return ns; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - unsigned long long ns; - unsigned long flags; - - local_irq_save(flags); - ns = __cycles_2_ns(cyc); - local_irq_restore(flags); - - return ns; -} - #endif /* _ASM_X86_TIMER_H */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 930e5d48f560..b4a04ac1d7aa 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -38,6 +38,66 @@ static int __read_mostly tsc_unstable; static int __read_mostly tsc_disabled = -1; int tsc_clocksource_reliable; + +/* Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ + +DEFINE_PER_CPU(unsigned long, cyc2ns); +DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + unsigned long long ns = this_cpu_read(cyc2ns_offset); + ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); + return ns; +} + +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +{ + unsigned long long tsc_now, ns_now, *offset; + unsigned long flags, *scale; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + offset = &per_cpu(cyc2ns_offset, cpu); + + rdtscll(tsc_now); + ns_now = cycles_2_ns(tsc_now); + + if (cpu_khz) { + *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + + cpu_khz / 2) / cpu_khz; + *offset = ns_now - mult_frac(tsc_now, *scale, + (1UL << CYC2NS_SCALE_FACTOR)); + } + + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); +} + /* * Scheduler clock - returns current time in nanosec units. */ @@ -62,7 +122,7 @@ u64 native_sched_clock(void) rdtscll(this_offset); /* return the value in ns */ - return __cycles_2_ns(this_offset); + return cycles_2_ns(this_offset); } /* We need to define a real function for sched_clock, to override the @@ -589,56 +649,6 @@ int recalibrate_cpu_khz(void) EXPORT_SYMBOL(recalibrate_cpu_khz); -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -DEFINE_PER_CPU(unsigned long, cyc2ns); -DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); - -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ - unsigned long long tsc_now, ns_now, *offset; - unsigned long flags, *scale; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - scale = &per_cpu(cyc2ns, cpu); - offset = &per_cpu(cyc2ns_offset, cpu); - - rdtscll(tsc_now); - ns_now = __cycles_2_ns(tsc_now); - - if (cpu_khz) { - *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + - cpu_khz / 2) / cpu_khz; - *offset = ns_now - mult_frac(tsc_now, *scale, - (1UL << CYC2NS_SCALE_FACTOR)); - } - - sched_clock_idle_wakeup_event(0); - local_irq_restore(flags); -} - static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) From 20d1c86a57762f0a33a78988e3fc8818316badd4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Nov 2013 15:40:29 +0100 Subject: [PATCH 41/60] sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs Use a ring-buffer like multi-version object structure which allows always having a coherent object; we use this to avoid having to disable IRQs while reading sched_clock() and avoids a problem when getting an NMI while changing the cyc2ns data. MAINLINE PRE POST sched_clock_stable: 1 1 1 (cold) sched_clock: 329841 331312 257223 (cold) local_clock: 301773 310296 309889 (warm) sched_clock: 38375 38247 25280 (warm) local_clock: 100371 102713 85268 (warm) rdtsc: 27340 27289 24247 sched_clock_stable: 0 0 0 (cold) sched_clock: 382634 372706 301224 (cold) local_clock: 396890 399275 399870 (warm) sched_clock: 38194 38124 25630 (warm) local_clock: 143452 148698 129629 (warm) rdtsc: 27345 27365 24307 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-s567in1e5ekq2nlyhn8f987r@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 23 +++- arch/x86/kernel/cpu/perf_event.c | 14 +- arch/x86/kernel/tsc.c | 229 +++++++++++++++++++++++++++---- arch/x86/platform/uv/tlb_uv.c | 66 +++++---- 4 files changed, 276 insertions(+), 56 deletions(-) diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index b4c667693a21..3de54ef0aea5 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -13,7 +13,26 @@ extern int recalibrate_cpu_khz(void); extern int no_timer_check; -DECLARE_PER_CPU(unsigned long, cyc2ns); -DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); +/* + * We use the full linear equation: f(x) = a + b*x, in order to allow + * a continuous function in the face of dynamic freq changes. + * + * Continuity means that when our frequency changes our slope (b); we want to + * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t. + * + * Without an offset (a) the above would not be possible. + * + * See the comment near cycles_2_ns() for details on how we compute (b). + */ +struct cyc2ns_data { + u32 cyc2ns_mul; + u32 cyc2ns_shift; + u64 cyc2ns_offset; + u32 __count; + /* u32 hole */ +}; /* 24 bytes -- do not grow */ + +extern struct cyc2ns_data *cyc2ns_read_begin(void); +extern void cyc2ns_read_end(struct cyc2ns_data *); #endif /* _ASM_X86_TIMER_H */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8e132931614d..9f97bd03f74f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1883,6 +1883,8 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { + struct cyc2ns_data *data; + userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; @@ -1891,13 +1893,17 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) if (!sched_clock_stable) return; + data = cyc2ns_read_begin(); + userpg->cap_user_time = 1; - userpg->time_mult = this_cpu_read(cyc2ns); - userpg->time_shift = CYC2NS_SCALE_FACTOR; - userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; + userpg->time_mult = data->cyc2ns_mul; + userpg->time_shift = data->cyc2ns_shift; + userpg->time_offset = data->cyc2ns_offset - now; userpg->cap_user_time_zero = 1; - userpg->time_zero = this_cpu_read(cyc2ns_offset); + userpg->time_zero = data->cyc2ns_offset; + + cyc2ns_read_end(data); } /* diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index b4a04ac1d7aa..92b090b2b79e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -39,7 +39,119 @@ static int __read_mostly tsc_disabled = -1; int tsc_clocksource_reliable; -/* Accelerators for sched_clock() +/* + * Use a ring-buffer like data structure, where a writer advances the head by + * writing a new data entry and a reader advances the tail when it observes a + * new entry. + * + * Writers are made to wait on readers until there's space to write a new + * entry. + * + * This means that we can always use an {offset, mul} pair to compute a ns + * value that is 'roughly' in the right direction, even if we're writing a new + * {offset, mul} pair during the clock read. + * + * The down-side is that we can no longer guarantee strict monotonicity anymore + * (assuming the TSC was that to begin with), because while we compute the + * intersection point of the two clock slopes and make sure the time is + * continuous at the point of switching; we can no longer guarantee a reader is + * strictly before or after the switch point. + * + * It does mean a reader no longer needs to disable IRQs in order to avoid + * CPU-Freq updates messing with his times, and similarly an NMI reader will + * no longer run the risk of hitting half-written state. + */ + +struct cyc2ns { + struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ + struct cyc2ns_data *head; /* 48 + 8 = 56 */ + struct cyc2ns_data *tail; /* 56 + 8 = 64 */ +}; /* exactly fits one cacheline */ + +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); + +struct cyc2ns_data *cyc2ns_read_begin(void) +{ + struct cyc2ns_data *head; + + preempt_disable(); + + head = this_cpu_read(cyc2ns.head); + /* + * Ensure we observe the entry when we observe the pointer to it. + * matches the wmb from cyc2ns_write_end(). + */ + smp_read_barrier_depends(); + head->__count++; + barrier(); + + return head; +} + +void cyc2ns_read_end(struct cyc2ns_data *head) +{ + barrier(); + /* + * If we're the outer most nested read; update the tail pointer + * when we're done. This notifies possible pending writers + * that we've observed the head pointer and that the other + * entry is now free. + */ + if (!--head->__count) { + /* + * x86-TSO does not reorder writes with older reads; + * therefore once this write becomes visible to another + * cpu, we must be finished reading the cyc2ns_data. + * + * matches with cyc2ns_write_begin(). + */ + this_cpu_write(cyc2ns.tail, head); + } + preempt_enable(); +} + +/* + * Begin writing a new @data entry for @cpu. + * + * Assumes some sort of write side lock; currently 'provided' by the assumption + * that cpufreq will call its notifiers sequentially. + */ +static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + struct cyc2ns_data *data = c2n->data; + + if (data == c2n->head) + data++; + + /* XXX send an IPI to @cpu in order to guarantee a read? */ + + /* + * When we observe the tail write from cyc2ns_read_end(), + * the cpu must be done with that entry and its safe + * to start writing to it. + */ + while (c2n->tail == data) + cpu_relax(); + + return data; +} + +static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + /* + * Ensure the @data writes are visible before we publish the + * entry. Matches the data-depencency in cyc2ns_read_begin(). + */ + smp_wmb(); + + ACCESS_ONCE(c2n->head) = data; +} + +/* + * Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: * ns = cycles / (freq / ns_per_sec) @@ -61,49 +173,106 @@ int tsc_clocksource_reliable; * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -DEFINE_PER_CPU(unsigned long, cyc2ns); -DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); - #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ +static void cyc2ns_data_init(struct cyc2ns_data *data) +{ + data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR; + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = 0; + data->__count = 0; +} + +static void cyc2ns_init(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + cyc2ns_data_init(&c2n->data[0]); + cyc2ns_data_init(&c2n->data[1]); + + c2n->head = c2n->data; + c2n->tail = c2n->data; +} + static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - unsigned long long ns = this_cpu_read(cyc2ns_offset); - ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); + struct cyc2ns_data *data, *tail; + unsigned long long ns; + + /* + * See cyc2ns_read_*() for details; replicated in order to avoid + * an extra few instructions that came with the abstraction. + * Notable, it allows us to only do the __count and tail update + * dance when its actually needed. + */ + + preempt_disable(); + data = this_cpu_read(cyc2ns.head); + tail = this_cpu_read(cyc2ns.tail); + + if (likely(data == tail)) { + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + } else { + data->__count++; + + barrier(); + + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + + barrier(); + + if (!--data->__count) + this_cpu_write(cyc2ns.tail, data); + } + preempt_enable(); + return ns; } +/* XXX surely we already have this someplace in the kernel?! */ +#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) + static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - unsigned long long tsc_now, ns_now, *offset; - unsigned long flags, *scale; + unsigned long long tsc_now, ns_now; + struct cyc2ns_data *data; + unsigned long flags; local_irq_save(flags); sched_clock_idle_sleep_event(); - scale = &per_cpu(cyc2ns, cpu); - offset = &per_cpu(cyc2ns_offset, cpu); + if (!cpu_khz) + goto done; + + data = cyc2ns_write_begin(cpu); rdtscll(tsc_now); ns_now = cycles_2_ns(tsc_now); - if (cpu_khz) { - *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + - cpu_khz / 2) / cpu_khz; - *offset = ns_now - mult_frac(tsc_now, *scale, - (1UL << CYC2NS_SCALE_FACTOR)); - } + /* + * Compute a new multiplier as per the above comment and ensure our + * time function is continuous; see the comment near struct + * cyc2ns_data. + */ + data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = ns_now - + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + cyc2ns_write_end(cpu, data); + +done: sched_clock_idle_wakeup_event(0); local_irq_restore(flags); } - /* * Scheduler clock - returns current time in nanosec units. */ u64 native_sched_clock(void) { - u64 this_offset; + u64 tsc_now; /* * Fall back to jiffies if there's no TSC available: @@ -119,10 +288,10 @@ u64 native_sched_clock(void) } /* read the Time Stamp Counter: */ - rdtscll(this_offset); + rdtscll(tsc_now); /* return the value in ns */ - return cycles_2_ns(this_offset); + return cycles_2_ns(tsc_now); } /* We need to define a real function for sched_clock, to override the @@ -678,11 +847,21 @@ void tsc_restore_sched_clock_state(void) local_irq_save(flags); - __this_cpu_write(cyc2ns_offset, 0); + /* + * We're comming out of suspend, there's no concurrency yet; don't + * bother being nice about the RCU stuff, just write to both + * data fields. + */ + + this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); + this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); + offset = cyc2ns_suspend - sched_clock(); - for_each_possible_cpu(cpu) - per_cpu(cyc2ns_offset, cpu) = offset; + for_each_possible_cpu(cpu) { + per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; + per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; + } local_irq_restore(flags); } @@ -1005,8 +1184,10 @@ void __init tsc_init(void) * speed as the bootup CPU. (cpufreq notifiers will fix this * up if their speed diverges) */ - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { + cyc2ns_init(cpu); set_cyc2ns_scale(cpu_khz, cpu); + } if (tsc_disabled > 0) return; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index efe4d7220397..dfe605ac1bcd 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -433,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) return; } +/* + * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative + * number, not an absolute. It converts a duration in cycles to a duration in + * ns. + */ +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + struct cyc2ns_data *data = cyc2ns_read_begin(); + unsigned long long ns; + + ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + + cyc2ns_read_end(data); + return ns; +} + +/* + * The reverse of the above; converts a duration in ns to a duration in cycles. + */ +static inline unsigned long long ns_2_cycles(unsigned long long ns) +{ + struct cyc2ns_data *data = cyc2ns_read_begin(); + unsigned long long cyc; + + cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul; + + cyc2ns_read_end(data); + return cyc; +} + static inline unsigned long cycles_2_us(unsigned long long cyc) { - unsigned long long ns; - unsigned long us; - int cpu = smp_processor_id(); + return cycles_2_ns(cyc) / NSEC_PER_USEC; +} - ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR; - us = ns / 1000; - return us; +static inline cycles_t sec_2_cycles(unsigned long sec) +{ + return ns_2_cycles(sec * NSEC_PER_SEC); +} + +static inline unsigned long long usec_2_cycles(unsigned long usec) +{ + return ns_2_cycles(usec * NSEC_PER_USEC); } /* @@ -668,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc, bcp, try); } -static inline cycles_t sec_2_cycles(unsigned long sec) -{ - unsigned long ns; - cycles_t cyc; - - ns = sec * 1000000000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - /* * Our retries are blocked by all destination sw ack resources being * in use, and a timeout is pending. In that case hardware immediately @@ -1327,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data) { } -static inline unsigned long long usec_2_cycles(unsigned long microsec) -{ - unsigned long ns; - unsigned long long cyc; - - ns = microsec * 1000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - /* * Display the statistics thru /proc/sgi_uv/ptc_statistics * 'data' points to the cpu number From ef08f0fff87630d4f67ceb09514d8b444df833f8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2013 19:31:23 +0100 Subject: [PATCH 42/60] sched/clock: Remove local_irq_disable() from the clocks Now that x86 no longer requires IRQs disabled for sched_clock() and ia64 never had this requirement (it doesn't seem to do cpufreq at all), we can remove the requirement of disabling IRQs. MAINLINE PRE POST sched_clock_stable: 1 1 1 (cold) sched_clock: 329841 257223 221876 (cold) local_clock: 301773 309889 234692 (warm) sched_clock: 38375 25280 25602 (warm) local_clock: 100371 85268 33265 (warm) rdtsc: 27340 24247 24214 sched_clock_stable: 0 0 0 (cold) sched_clock: 382634 301224 235941 (cold) local_clock: 396890 399870 297017 (warm) sched_clock: 38194 25630 25233 (warm) local_clock: 143452 129629 71234 (warm) rdtsc: 27345 24307 24245 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-36e5kohiasnr106d077mgubp@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c3ae1446461c..59371549ddf0 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -26,9 +26,10 @@ * at 0 on boot (but people really shouldn't rely on that). * * cpu_clock(i) -- can be used from any context, including NMI. - * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) * local_clock() -- is cpu_clock() on the current cpu. * + * sched_clock_cpu(i) + * * How: * * The implementation either uses sched_clock() when @@ -50,15 +51,6 @@ * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * - * Notes: - * - * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things - * like cpufreq interrupts that can change the base clock (TSC) multiplier - * and cause funny jumps in time -- although the filtering provided by - * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it - * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on - * sched_clock(). */ #include #include @@ -242,20 +234,20 @@ u64 sched_clock_cpu(int cpu) struct sched_clock_data *scd; u64 clock; - WARN_ON_ONCE(!irqs_disabled()); - if (sched_clock_stable) return sched_clock(); if (unlikely(!sched_clock_running)) return 0ull; + preempt_disable(); scd = cpu_sdc(cpu); if (cpu != smp_processor_id()) clock = sched_clock_remote(scd); else clock = sched_clock_local(scd); + preempt_enable(); return clock; } @@ -316,14 +308,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); */ u64 cpu_clock(int cpu) { - u64 clock; - unsigned long flags; - - local_irq_save(flags); - clock = sched_clock_cpu(cpu); - local_irq_restore(flags); - - return clock; + return sched_clock_cpu(cpu); } /* @@ -335,14 +320,7 @@ u64 cpu_clock(int cpu) */ u64 local_clock(void) { - u64 clock; - unsigned long flags; - - local_irq_save(flags); - clock = sched_clock_cpu(smp_processor_id()); - local_irq_restore(flags); - - return clock; + return sched_clock_cpu(raw_smp_processor_id()); } #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ From 35af99e646c7f7ea46dc2977601e9e71a51dadd5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2013 19:38:42 +0100 Subject: [PATCH 43/60] sched/clock, x86: Use a static_key for sched_clock_stable In order to avoid the runtime condition and variable load turn sched_clock_stable into a static_key. Also provide a shorter implementation of local_clock() and cpu_clock(int) when sched_clock_stable==1. MAINLINE PRE POST sched_clock_stable: 1 1 1 (cold) sched_clock: 329841 221876 215295 (cold) local_clock: 301773 234692 220773 (warm) sched_clock: 38375 25602 25659 (warm) local_clock: 100371 33265 27242 (warm) rdtsc: 27340 24214 24208 sched_clock_stable: 0 0 0 (cold) sched_clock: 382634 235941 237019 (cold) local_clock: 396890 297017 294819 (warm) sched_clock: 38194 25233 25609 (warm) local_clock: 143452 71234 71232 (warm) rdtsc: 27345 24245 24243 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-eummbdechzz37mwmpags1gjr@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/tsc.c | 6 ++--- include/linux/sched.h | 4 +++- kernel/sched/clock.c | 41 ++++++++++++++++++++++++++------ kernel/sched/debug.c | 2 +- kernel/time/tick-sched.c | 2 +- kernel/trace/ring_buffer.c | 2 +- 9 files changed, 46 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bca023bdd6b2..8bc79cddd9a2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -487,7 +487,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); if (!check_tsc_unstable()) - sched_clock_stable = 1; + set_sched_clock_stable(); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ea04b342c026..1a439c047ff3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -93,7 +93,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); if (!check_tsc_unstable()) - sched_clock_stable = 1; + set_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9f97bd03f74f..b88645191fe5 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1890,7 +1890,7 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; - if (!sched_clock_stable) + if (!sched_clock_stable()) return; data = cyc2ns_read_begin(); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 92b090b2b79e..53c123537245 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -822,7 +822,7 @@ static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { - if (!sched_clock_stable) + if (!sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); @@ -842,7 +842,7 @@ void tsc_restore_sched_clock_state(void) unsigned long flags; int cpu; - if (!sched_clock_stable) + if (!sched_clock_stable()) return; local_irq_save(flags); @@ -984,7 +984,7 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - sched_clock_stable = 0; + clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a196cb7fc6f2..a03875221663 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1994,7 +1994,9 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns) * but then during bootup it turns out that sched_clock() * is reliable after all: */ -extern int sched_clock_stable; +extern int sched_clock_stable(void); +extern void set_sched_clock_stable(void); +extern void clear_sched_clock_stable(void); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 59371549ddf0..c9b34c4e3ecc 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -58,6 +58,7 @@ #include #include #include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -74,7 +75,27 @@ EXPORT_SYMBOL_GPL(sched_clock); __read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -__read_mostly int sched_clock_stable; +static struct static_key __sched_clock_stable = STATIC_KEY_INIT; + +int sched_clock_stable(void) +{ + if (static_key_false(&__sched_clock_stable)) + return false; + return true; +} + +void set_sched_clock_stable(void) +{ + if (!sched_clock_stable()) + static_key_slow_dec(&__sched_clock_stable); +} + +void clear_sched_clock_stable(void) +{ + /* XXX worry about clock continuity */ + if (sched_clock_stable()) + static_key_slow_inc(&__sched_clock_stable); +} struct sched_clock_data { u64 tick_raw; @@ -234,7 +255,7 @@ u64 sched_clock_cpu(int cpu) struct sched_clock_data *scd; u64 clock; - if (sched_clock_stable) + if (sched_clock_stable()) return sched_clock(); if (unlikely(!sched_clock_running)) @@ -257,7 +278,7 @@ void sched_clock_tick(void) struct sched_clock_data *scd; u64 now, now_gtod; - if (sched_clock_stable) + if (sched_clock_stable()) return; if (unlikely(!sched_clock_running)) @@ -308,7 +329,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); */ u64 cpu_clock(int cpu) { - return sched_clock_cpu(cpu); + if (static_key_false(&__sched_clock_stable)) + return sched_clock_cpu(cpu); + + return sched_clock(); } /* @@ -320,7 +344,10 @@ u64 cpu_clock(int cpu) */ u64 local_clock(void) { - return sched_clock_cpu(raw_smp_processor_id()); + if (static_key_false(&__sched_clock_stable)) + return sched_clock_cpu(raw_smp_processor_id()); + + return sched_clock(); } #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ @@ -340,12 +367,12 @@ u64 sched_clock_cpu(int cpu) u64 cpu_clock(int cpu) { - return sched_clock_cpu(cpu); + return sched_clock(); } u64 local_clock(void) { - return sched_clock_cpu(0); + return sched_clock(); } #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 374fe04a5e6e..dd52e7ffb10e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m) PN(cpu_clk); P(jiffies); #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - P(sched_clock_stable); + P(sched_clock_stable()); #endif #undef PN #undef P diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ea20f7d1ac2c..c833249ab0fb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -177,7 +177,7 @@ static bool can_stop_full_tick(void) * TODO: kick full dynticks CPUs when * sched_clock_stable is set. */ - if (!sched_clock_stable) { + if (!sched_clock_stable()) { trace_tick_stop(0, "unstable sched clock\n"); /* * Don't allow the user to think they can get diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cc2f66f68dc5..294b8a271a04 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2558,7 +2558,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (unlikely(test_time_stamp(delta))) { int local_clock_stable = 1; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable; + local_clock_stable = sched_clock_stable(); #endif WARN_ONCE(delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", From 6577e42a3e1633afe762f47da9e00061ee4b9a5e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Dec 2013 18:55:53 +0100 Subject: [PATCH 44/60] sched/clock: Fix up clear_sched_clock_stable() The below tells us the static_key conversion has a problem; since the exact point of clearing that flag isn't too important, delay the flip and use a workqueue to process it. [ ] TSC synchronization [CPU#0 -> CPU#22]: [ ] Measured 8 cycles TSC warp between CPUs, turning off TSC clock. [ ] [ ] ====================================================== [ ] [ INFO: possible circular locking dependency detected ] [ ] 3.13.0-rc3-01745-g848b0d0322cb-dirty #637 Not tainted [ ] ------------------------------------------------------- [ ] swapper/0/1 is trying to acquire lock: [ ] (jump_label_mutex){+.+...}, at: [] jump_label_lock+0x17/0x20 [ ] [ ] but task is already holding lock: [ ] (cpu_hotplug.lock){+.+.+.}, at: [] cpu_hotplug_begin+0x2b/0x60 [ ] [ ] which lock already depends on the new lock. [ ] [ ] [ ] the existing dependency chain (in reverse order) is: [ ] [ ] -> #1 (cpu_hotplug.lock){+.+.+.}: [ ] [] lock_acquire+0x90/0x130 [ ] [] mutex_lock_nested+0x63/0x3e0 [ ] [] get_online_cpus+0x3c/0x60 [ ] [] arch_jump_label_transform+0x37/0x130 [ ] [] __jump_label_update+0x5f/0x80 [ ] [] jump_label_update+0x9d/0xb0 [ ] [] static_key_slow_inc+0x9d/0xb0 [ ] [] sched_feat_set+0xf5/0x100 [ ] [] set_numabalancing_state+0x2c/0x30 [ ] [] numa_policy_init+0x1af/0x1b7 [ ] [] start_kernel+0x35d/0x41f [ ] [] x86_64_start_reservations+0x2a/0x2c [ ] [] x86_64_start_kernel+0xfb/0xfe [ ] [ ] -> #0 (jump_label_mutex){+.+...}: [ ] [] __lock_acquire+0x1701/0x1eb0 [ ] [] lock_acquire+0x90/0x130 [ ] [] mutex_lock_nested+0x63/0x3e0 [ ] [] jump_label_lock+0x17/0x20 [ ] [] static_key_slow_inc+0x6b/0xb0 [ ] [] clear_sched_clock_stable+0x15/0x20 [ ] [] mark_tsc_unstable+0x23/0x70 [ ] [] check_tsc_sync_source+0x14b/0x150 [ ] [] native_cpu_up+0x3a2/0x890 [ ] [] _cpu_up+0xdb/0x160 [ ] [] cpu_up+0x79/0x90 [ ] [] smp_init+0x60/0x8c [ ] [] kernel_init_freeable+0x8c/0x197 [ ] [] kernel_init+0xe/0x130 [ ] [] ret_from_fork+0x7c/0xb0 [ ] [ ] other info that might help us debug this: [ ] [ ] Possible unsafe locking scenario: [ ] [ ] CPU0 CPU1 [ ] ---- ---- [ ] lock(cpu_hotplug.lock); [ ] lock(jump_label_mutex); [ ] lock(cpu_hotplug.lock); [ ] lock(jump_label_mutex); [ ] [ ] *** DEADLOCK *** [ ] [ ] 2 locks held by swapper/0/1: [ ] #0: (cpu_add_remove_lock){+.+.+.}, at: [] cpu_maps_update_begin+0x17/0x20 [ ] #1: (cpu_hotplug.lock){+.+.+.}, at: [] cpu_hotplug_begin+0x2b/0x60 [ ] [ ] stack backtrace: [ ] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.13.0-rc3-01745-g848b0d0322cb-dirty #637 [ ] Hardware name: Supermicro X8DTN/X8DTN, BIOS 4.6.3 01/08/2010 [ ] ffffffff82c9c270 ffff880236843bb8 ffffffff8165c5f5 ffffffff82c9c270 [ ] ffff880236843bf8 ffffffff81658c02 ffff880236843c80 ffff8802368586a0 [ ] ffff880236858678 0000000000000001 0000000000000002 ffff880236858000 [ ] Call Trace: [ ] [] dump_stack+0x4e/0x7a [ ] [] print_circular_bug+0x1f9/0x207 [ ] [] __lock_acquire+0x1701/0x1eb0 [ ] [] ? __atomic_notifier_call_chain+0x8f/0xb0 [ ] [] lock_acquire+0x90/0x130 [ ] [] ? jump_label_lock+0x17/0x20 [ ] [] ? jump_label_lock+0x17/0x20 [ ] [] mutex_lock_nested+0x63/0x3e0 [ ] [] ? jump_label_lock+0x17/0x20 [ ] [] jump_label_lock+0x17/0x20 [ ] [] static_key_slow_inc+0x6b/0xb0 [ ] [] clear_sched_clock_stable+0x15/0x20 [ ] [] mark_tsc_unstable+0x23/0x70 [ ] [] check_tsc_sync_source+0x14b/0x150 [ ] [] native_cpu_up+0x3a2/0x890 [ ] [] _cpu_up+0xdb/0x160 [ ] [] cpu_up+0x79/0x90 [ ] [] smp_init+0x60/0x8c [ ] [] kernel_init_freeable+0x8c/0x197 [ ] [] ? rest_init+0xd0/0xd0 [ ] [] kernel_init+0xe/0x130 [ ] [] ret_from_fork+0x7c/0xb0 [ ] [] ? rest_init+0xd0/0xd0 [ ] ------------[ cut here ]------------ [ ] WARNING: CPU: 0 PID: 1 at /usr/src/linux-2.6/kernel/smp.c:374 smp_call_function_many+0xad/0x300() [ ] Modules linked in: [ ] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.13.0-rc3-01745-g848b0d0322cb-dirty #637 [ ] Hardware name: Supermicro X8DTN/X8DTN, BIOS 4.6.3 01/08/2010 [ ] 0000000000000009 ffff880236843be0 ffffffff8165c5f5 0000000000000000 [ ] ffff880236843c18 ffffffff81093d8c 0000000000000000 0000000000000000 [ ] ffffffff81ccd1a0 ffffffff810ca951 0000000000000000 ffff880236843c28 [ ] Call Trace: [ ] [] dump_stack+0x4e/0x7a [ ] [] warn_slowpath_common+0x8c/0xc0 [ ] [] ? sched_clock_tick+0x1/0xa0 [ ] [] warn_slowpath_null+0x1a/0x20 [ ] [] smp_call_function_many+0xad/0x300 [ ] [] ? arch_unregister_cpu+0x30/0x30 [ ] [] ? arch_unregister_cpu+0x30/0x30 [ ] [] ? sched_clock_tick+0x1/0xa0 [ ] [] smp_call_function+0x46/0x80 [ ] [] ? arch_unregister_cpu+0x30/0x30 [ ] [] on_each_cpu+0x3c/0xa0 [ ] [] ? sched_clock_idle_sleep_event+0x20/0x20 [ ] [] ? sched_clock_tick+0x1/0xa0 [ ] [] text_poke_bp+0x64/0xd0 [ ] [] ? sched_clock_idle_sleep_event+0x20/0x20 [ ] [] arch_jump_label_transform+0xae/0x130 [ ] [] __jump_label_update+0x5f/0x80 [ ] [] jump_label_update+0x9d/0xb0 [ ] [] static_key_slow_inc+0x9d/0xb0 [ ] [] clear_sched_clock_stable+0x15/0x20 [ ] [] mark_tsc_unstable+0x23/0x70 [ ] [] check_tsc_sync_source+0x14b/0x150 [ ] [] native_cpu_up+0x3a2/0x890 [ ] [] _cpu_up+0xdb/0x160 [ ] [] cpu_up+0x79/0x90 [ ] [] smp_init+0x60/0x8c [ ] [] kernel_init_freeable+0x8c/0x197 [ ] [] ? rest_init+0xd0/0xd0 [ ] [] kernel_init+0xe/0x130 [ ] [] ret_from_fork+0x7c/0xb0 [ ] [] ? rest_init+0xd0/0xd0 [ ] ---[ end trace 6ff1df5620c49d26 ]--- [ ] tsc: Marking TSC unstable due to check_tsc_sync_source failed Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-v55fgqj3nnyqnngmvuu8ep6h@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c9b34c4e3ecc..6bd6a6731b21 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -59,6 +59,7 @@ #include #include #include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -90,13 +91,23 @@ void set_sched_clock_stable(void) static_key_slow_dec(&__sched_clock_stable); } -void clear_sched_clock_stable(void) +static void __clear_sched_clock_stable(struct work_struct *work) { /* XXX worry about clock continuity */ if (sched_clock_stable()) static_key_slow_inc(&__sched_clock_stable); } +static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); + +void clear_sched_clock_stable(void) +{ + if (keventd_up()) + schedule_work(&sched_clock_work); + else + __clear_sched_clock_stable(&sched_clock_work); +} + struct sched_clock_data { u64 tick_raw; u64 tick_gtod; From 10b033d434c25a6c9e0f4f4dc2418af1b8236c63 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2013 19:01:40 +0100 Subject: [PATCH 45/60] sched/clock, x86: Avoid a runtime condition in native_sched_clock() Use a static_key to avoid touching tsc_disabled and a runtime condition in native_sched_clock() -- less cachelines touched is always better. MAINLINE PRE POST sched_clock_stable: 1 1 1 (cold) sched_clock: 329841 215295 213039 (cold) local_clock: 301773 220773 216084 (warm) sched_clock: 38375 25659 25231 (warm) local_clock: 100371 27242 27601 (warm) rdtsc: 27340 24208 24203 sched_clock_stable: 0 0 0 (cold) sched_clock: 382634 237019 240055 (cold) local_clock: 396890 294819 299942 (warm) sched_clock: 38194 25609 25276 (warm) local_clock: 143452 71232 73232 (warm) rdtsc: 27345 24243 24244 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-hrz87bo37qke25bty6pnfy4b@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 53c123537245..6377fb28b958 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -37,6 +38,8 @@ static int __read_mostly tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int __read_mostly tsc_disabled = -1; +static struct static_key __use_tsc = STATIC_KEY_INIT; + int tsc_clocksource_reliable; /* @@ -282,7 +285,7 @@ u64 native_sched_clock(void) * very important for it to be as fast as the platform * can achieve it. ) */ - if (unlikely(tsc_disabled)) { + if (!static_key_false(&__use_tsc)) { /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); } @@ -1193,7 +1196,9 @@ void __init tsc_init(void) return; /* now allow native_sched_clock() to use rdtsc */ + tsc_disabled = 0; + static_key_slow_inc(&__use_tsc); if (!no_sched_irq_time) enable_sched_clock_irqtime(); From 0bd3a173d711857fc9f583eb5825386cc08f3948 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Nov 2013 16:13:38 +0100 Subject: [PATCH 46/60] sched/preempt, locking: Rework local_bh_{dis,en}able() Currently local_bh_disable() is out-of-line for no apparent reason. So inline it to save a few cycles on call/return nonsense, the function body is a single add on x86 (a few loads and store extra on load/store archs). Also expose two new local_bh functions: __local_bh_{dis,en}able_ip(unsigned long ip, unsigned int cnt); Which implement the actual local_bh_{dis,en}able() behaviour. The next patch uses the exposed @cnt argument to optimize bh lock functions. With build fixes from Jacob Pan. Cc: rjw@rjwysocki.net Cc: rui.zhang@intel.com Cc: jacob.jun.pan@linux.intel.com Cc: Mike Galbraith Cc: hpa@zytor.com Cc: Arjan van de Ven Cc: lenb@kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131119151338.GF3694@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/bottom_half.h | 32 +++++++++++++++++++++++++++++--- include/linux/hardirq.h | 1 + include/linux/preempt_mask.h | 1 - kernel/softirq.c | 35 ++++++----------------------------- 4 files changed, 36 insertions(+), 33 deletions(-) diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index 27b1bcffe408..86c12c93e3cf 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -1,9 +1,35 @@ #ifndef _LINUX_BH_H #define _LINUX_BH_H -extern void local_bh_disable(void); +#include +#include + +#ifdef CONFIG_TRACE_IRQFLAGS +extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); +#else +static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) +{ + preempt_count_add(cnt); + barrier(); +} +#endif + +static inline void local_bh_disable(void) +{ + __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); +} + extern void _local_bh_enable(void); -extern void local_bh_enable(void); -extern void local_bh_enable_ip(unsigned long ip); +extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt); + +static inline void local_bh_enable_ip(unsigned long ip) +{ + __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET); +} + +static inline void local_bh_enable(void) +{ + __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); +} #endif /* _LINUX_BH_H */ diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index d9cf963ac832..12d5f972f23f 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -5,6 +5,7 @@ #include #include #include +#include extern void synchronize_irq(unsigned int irq); diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h index b8d96bca4357..dbeec4d4a3be 100644 --- a/include/linux/preempt_mask.h +++ b/include/linux/preempt_mask.h @@ -2,7 +2,6 @@ #define LINUX_PREEMPT_MASK_H #include -#include /* * We put the hardirq and softirq counter into the preemption diff --git a/kernel/softirq.c b/kernel/softirq.c index 7500cce1ebfd..9e368ef35f9a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -89,7 +89,7 @@ static void wakeup_softirqd(void) * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS -static void __local_bh_disable(unsigned long ip, unsigned int cnt) +void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -114,21 +114,9 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) if (preempt_count() == cnt) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } -#else /* !CONFIG_TRACE_IRQFLAGS */ -static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) -{ - preempt_count_add(cnt); - barrier(); -} +EXPORT_SYMBOL(__local_bh_disable_ip); #endif /* CONFIG_TRACE_IRQFLAGS */ -void local_bh_disable(void) -{ - __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); -} - -EXPORT_SYMBOL(local_bh_disable); - static void __local_bh_enable(unsigned int cnt) { WARN_ON_ONCE(!irqs_disabled()); @@ -151,7 +139,7 @@ void _local_bh_enable(void) EXPORT_SYMBOL(_local_bh_enable); -static inline void _local_bh_enable_ip(unsigned long ip) +void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) { WARN_ON_ONCE(in_irq() || irqs_disabled()); #ifdef CONFIG_TRACE_IRQFLAGS @@ -166,7 +154,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) * Keep preemption disabled until we are done with * softirq processing: */ - preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); + preempt_count_sub(cnt - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) { /* @@ -182,18 +170,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) #endif preempt_check_resched(); } - -void local_bh_enable(void) -{ - _local_bh_enable_ip(_RET_IP_); -} -EXPORT_SYMBOL(local_bh_enable); - -void local_bh_enable_ip(unsigned long ip) -{ - _local_bh_enable_ip(ip); -} -EXPORT_SYMBOL(local_bh_enable_ip); +EXPORT_SYMBOL(__local_bh_enable_ip); /* * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, @@ -230,7 +207,7 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_irq_enter_time(current); - __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); lockdep_softirq_enter(); cpu = smp_processor_id(); From 8cb75e0c4ec9786b81439761eac1d18d4a931af3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Nov 2013 12:22:37 +0100 Subject: [PATCH 47/60] sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding With various drivers wanting to inject idle time; we get people calling idle routines outside of the idle loop proper. Therefore we need to be extra careful about not missing TIF_NEED_RESCHED -> PREEMPT_NEED_RESCHED propagations. While looking at this, I also realized there's a small window in the existing idle loop where we can miss TIF_NEED_RESCHED; when it hits right after the tif_need_resched() test at the end of the loop but right before the need_resched() test at the start of the loop. So move preempt_fold_need_resched() out of the loop where we're guaranteed to have TIF_NEED_RESCHED set. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-x9jgh45oeayzajz2mjt0y7d6@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mwait.h | 2 +- include/linux/preempt.h | 15 +++++++++++++++ include/linux/sched.h | 15 +++++++++++++++ kernel/cpu/idle.c | 17 ++++++++++------- kernel/sched/core.c | 3 +-- 5 files changed, 42 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 19b71c439256..1da25a5f96f9 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -53,7 +53,7 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) if (!need_resched()) __mwait(eax, ecx); } - __current_clr_polling(); + current_clr_polling(); } #endif /* _ASM_X86_MWAIT_H */ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index dd9ddf8af205..59749fc48328 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -134,6 +134,21 @@ do { \ #undef preempt_check_resched #endif +#ifdef CONFIG_PREEMPT +#define preempt_set_need_resched() \ +do { \ + set_preempt_need_resched(); \ +} while (0) +#define preempt_fold_need_resched() \ +do { \ + if (tif_need_resched()) \ + set_preempt_need_resched(); \ +} while (0) +#else +#define preempt_set_need_resched() do { } while (0) +#define preempt_fold_need_resched() do { } while (0) +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; diff --git a/include/linux/sched.h b/include/linux/sched.h index a03875221663..ffccdad050b5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2745,6 +2745,21 @@ static inline bool __must_check current_clr_polling_and_test(void) } #endif +static inline void current_clr_polling(void) +{ + __current_clr_polling(); + + /* + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. + */ + smp_mb(); /* paired with resched_task() */ + + preempt_fold_need_resched(); +} + static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 988573a9a387..277f494c2a9a 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -105,14 +105,17 @@ static void cpu_idle_loop(void) __current_set_polling(); } arch_cpu_idle_exit(); - /* - * We need to test and propagate the TIF_NEED_RESCHED - * bit here because we might not have send the - * reschedule IPI to idle tasks. - */ - if (tif_need_resched()) - set_preempt_need_resched(); } + + /* + * Since we fell out of the loop above, we know + * TIF_NEED_RESCHED must be set, propagate it into + * PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will + * not have had an IPI to fold the state for us. + */ + preempt_set_need_resched(); tick_nohz_idle_exit(); schedule_preempt_disabled(); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 392c6f87906e..0326c06953eb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1510,8 +1510,7 @@ void scheduler_ipi(void) * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ - if (tif_need_resched()) - set_preempt_need_resched(); + preempt_fold_need_resched(); if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) From 1774e9f3e5c8b38de3b3bc8bd0eacd280f655baf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Nov 2013 16:13:38 +0100 Subject: [PATCH 48/60] sched, net: Clean up preempt_enable_no_resched() abuse The only valid use of preempt_enable_no_resched() is if the very next line is schedule() or if we know preemption cannot actually be enabled by that statement due to known more preempt_count 'refs'. Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: rjw@rjwysocki.net Cc: Eliezer Tamir Cc: David S. Miller Cc: rui.zhang@intel.com Cc: jacob.jun.pan@linux.intel.com Cc: Mike Galbraith Cc: hpa@zytor.com Cc: Arjan van de Ven Cc: lenb@kernel.org Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20131119151338.GF3694@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c4638e6f0238..82de78603686 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1623,11 +1623,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && net_dma_find_channel()) { - preempt_enable_no_resched(); + preempt_enable(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); } else { - preempt_enable_no_resched(); + preempt_enable(); } } #endif From 37089834528be3ef8cbf927e47c753b3e272a856 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Nov 2013 16:13:38 +0100 Subject: [PATCH 49/60] sched, net: Fixup busy_loop_us_clock() The only valid use of preempt_enable_no_resched() is if the very next line is schedule() or if we know preemption cannot actually be enabled by that statement due to known more preempt_count 'refs'. This busy_poll stuff looks to be completely and utterly broken, sched_clock() can return utter garbage with interrupts enabled (rare but still) and it can drift unbounded between CPUs. This means that if you get preempted/migrated and your new CPU is years behind on the previous CPU we get to busy spin for a _very_ long time. There is a _REASON_ sched_clock() warns about preemptability - papering over it with a preempt_disable()/preempt_enable_no_resched() is just terminal brain damage on so many levels. Replace sched_clock() usage with local_clock() which has a bounded drift between CPUs (<2 jiffies). There is a further problem with the entire busy wait poll thing in that the spin time is additive to the syscall timeout, not inclusive. Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: David S. Miller Cc: rui.zhang@intel.com Cc: jacob.jun.pan@linux.intel.com Cc: Mike Galbraith Cc: hpa@zytor.com Cc: Arjan van de Ven Cc: lenb@kernel.org Cc: rjw@rjwysocki.net Cc: Eliezer Tamir Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20131119151338.GF3694@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/net/busy_poll.h | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 829627d7b846..1d67fb6b23a0 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -42,27 +42,10 @@ static inline bool net_busy_loop_on(void) return sysctl_net_busy_poll; } -/* a wrapper to make debug_smp_processor_id() happy - * we can use sched_clock() because we don't care much about precision - * we only care that the average is bounded - */ -#ifdef CONFIG_DEBUG_PREEMPT static inline u64 busy_loop_us_clock(void) { - u64 rc; - - preempt_disable_notrace(); - rc = sched_clock(); - preempt_enable_no_resched_notrace(); - - return rc >> 10; + return local_clock() >> 10; } -#else /* CONFIG_DEBUG_PREEMPT */ -static inline u64 busy_loop_us_clock(void) -{ - return sched_clock() >> 10; -} -#endif /* CONFIG_DEBUG_PREEMPT */ static inline unsigned long sk_busy_loop_end_time(struct sock *sk) { From 130816ce4d5f69167324f7272e70aa3d641677c6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Dec 2013 12:21:17 +0100 Subject: [PATCH 50/60] sched, thermal: Clean up preempt_enable_no_resched() abuse The only valid use of preempt_enable_no_resched() is if the very next line is schedule() or if we know preemption cannot actually be enabled by that statement due to known more preempt_count 'refs'. Signed-off-by: Peter Zijlstra Cc: rjw@rjwysocki.net Cc: Eliezer Tamir Cc: rui.zhang@intel.com Cc: jacob.jun.pan@linux.intel.com Cc: Mike Galbraith Cc: hpa@zytor.com Cc: Arjan van de Ven Cc: lenb@kernel.org Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-zcfvacdlvlr63qmnn5i58vuj@git.kernel.org Signed-off-by: Ingo Molnar --- drivers/thermal/intel_powerclamp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index e8275f2df9af..d833c8f5b465 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -443,7 +443,7 @@ static int clamp_thread(void *arg) atomic_inc(&idle_wakeup_counter); } tick_nohz_idle_exit(); - preempt_enable_no_resched(); + preempt_enable(); } del_timer_sync(&wakeup_timer); clear_bit(cpunr, cpu_clamping_mask); From 54b278b50b8566c227437d6198412cb5a6619e6b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 12 Jan 2014 22:24:56 +0100 Subject: [PATCH 51/60] m68k: Fix build warning in mac_via.h Fengguang Wu's kbuild test robot reported the following new m68k warnings: In file included from drivers/nubus/nubus.c:22:0: >> arch/m68k/include/asm/mac_via.h:262:47: warning: 'struct irq_desc' declared inside parameter list [enabled by default] >> arch/m68k/include/asm/mac_via.h:262:47: warning: its scope is only this definition or declaration, which is probably not what you want [enabled by default] Caused by the reworking of the generic local_bh{dis,en}able() code. To fix it, forward declare 'struct irq_desc'. Reported-by: Fengguang Wu Fixes: c795eb55e740 ("sched/preempt, locking: Rework local_bh_{dis,en}able()") Signed-off-by: Peter Zijlstra Acked-by: geert@linux-m68k.org Link: http://lkml.kernel.org/r/20140112212456.GQ7572@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/m68k/include/asm/mac_via.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/m68k/include/asm/mac_via.h b/arch/m68k/include/asm/mac_via.h index aeeedf8b2d25..fe3fc9ae1b69 100644 --- a/arch/m68k/include/asm/mac_via.h +++ b/arch/m68k/include/asm/mac_via.h @@ -254,6 +254,8 @@ extern volatile __u8 *via1,*via2; extern int rbv_present,via_alt_mapping; +struct irq_desc; + extern void via_register_interrupts(void); extern void via_irq_enable(int); extern void via_irq_disable(int); From 88f1ebbc256e93bc029ec70b366612801f2c98ad Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Tue, 14 Jan 2014 08:06:36 +0800 Subject: [PATCH 52/60] sched/deadline: Fix sparse static warnings new sparse warnings: >> kernel/sched/cpudeadline.c:38:6: sparse: symbol 'cpudl_exchange' was not declared. Should it be static? >> kernel/sched/cpudeadline.c:46:6: sparse: symbol 'cpudl_heapify' was not declared. Should it be static? >> kernel/sched/cpudeadline.c:71:6: sparse: symbol 'cpudl_change_key' was not declared. Should it be static? >> kernel/sched/cpudeadline.c:195:15: sparse: memset with byte count of 163928 Signed-off-by: Fengguang Wu Signed-off-by: Peter Zijlstra Cc: Juri Lelli Fixes: 6bfd6d72f51c ("sched/deadline: speed up SCHED_DEADLINE pushes with a push-heap") Link: http://lkml.kernel.org/r/52d47f8c.EYJsA5+mELPBk4t6\%fengguang.wu@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 3bcade554343..045fc74e3f09 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -35,7 +35,7 @@ static inline int dl_time_before(u64 a, u64 b) return (s64)(a - b) < 0; } -void cpudl_exchange(struct cpudl *cp, int a, int b) +static void cpudl_exchange(struct cpudl *cp, int a, int b) { int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; @@ -43,7 +43,7 @@ void cpudl_exchange(struct cpudl *cp, int a, int b) swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); } -void cpudl_heapify(struct cpudl *cp, int idx) +static void cpudl_heapify(struct cpudl *cp, int idx) { int l, r, largest; @@ -68,7 +68,7 @@ void cpudl_heapify(struct cpudl *cp, int idx) } } -void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) +static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); From d8bf52311ecefd9b11a5af2b28ed7f41c6023516 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Jan 2014 11:23:34 +0800 Subject: [PATCH 53/60] sched/deadline: Remove unused variables fix these new sparse warnings: >> kernel/sched/core.c:305:14: sparse: symbol 'sysctl_sched_dl_period' was not declared. Should it be static? >> kernel/sched/core.c:306:5: sparse: symbol 'sysctl_sched_dl_runtime' was not declared. Should it be static? Better still, they're completely unused so remove them. Reported-by: Fengguang Wu Signed-off-by: Peter Zijlstra Cc: Juri Lelli Link: http://lkml.kernel.org/n/tip-ke0shkG7vMnzmcdqhhiymyem@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0326c06953eb..138711b5bf19 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -296,17 +296,6 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* - * Maximum bandwidth available for all -deadline tasks and groups - * (if group scheduling is configured) on each CPU. - * - * default: 5% - */ -unsigned int sysctl_sched_dl_period = 1000000; -int sysctl_sched_dl_runtime = 50000; - - - /* * __task_rq_lock - lock the rq @p resides on. */ From 71362650b555a5b24c732e455484cc7cac1c8588 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 14 Jan 2014 12:03:51 +0100 Subject: [PATCH 54/60] sched/deadline: No need to check p if dl_se is valid Dan Carpenter reported new 'Smatch' warnings: > tree: git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core > head: 130816ce4d5f69167324f7272e70aa3d641677c6 > commit: 1baca4ce16b8cc7d4f50be1f7914799af30a2861 [17/50] sched/deadline: Add SCHED_DEADLINE SMP-related data structures & logic > > kernel/sched/deadline.c:937 pick_next_task_dl() warn: variable dereferenced before check 'p' (see line 934) BUG_ON() already fires if pick_next_dl_entity() doesn't return a valid dl_se. No need to check if p is valid afterward. Reported-by: Dan Carpenter Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Fixes: 1baca4ce16b8 ("sched/deadline: Add SCHED_DEADLINE SMP-related data structures & logic") Link: http://lkml.kernel.org/r/52D54E25.6060100@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ee25361becdd..0de248202879 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1007,8 +1007,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq) p->se.exec_start = rq_clock_task(rq); /* Running task will never be pushed. */ - if (p) - dequeue_pushable_dl_task(rq, p); + dequeue_pushable_dl_task(rq, p); #ifdef CONFIG_SCHED_HRTICK if (hrtick_enabled(rq)) From 5778fccf361c9ba443b45d822f3d875f64c80084 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 14 Jan 2014 16:10:39 +0100 Subject: [PATCH 55/60] sched/core: Fix htmldocs warnings Fengguang Wu's kbuild test robot reported the following new htmldocs warnings: >>> Warning(kernel/sched/core.c:3380): No description found for parameter 'uattr' >>> Warning(kernel/sched/core.c:3380): Excess function parameter 'attr' description in 'sys_sched_setattr' >>> Warning(kernel/sched/core.c:3520): No description found for parameter 'uattr' >>> Warning(kernel/sched/core.c:3520): Excess function parameter 'attr' description in 'sys_sched_getattr' The second argument to sys_sched_{setattr,getattr}() is named uattr (not attr). Reported-by: Fengguang Wu Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: Dario Faggioli Fixes: d50dde5a10f3 ("sched: Add new scheduler syscalls to support an extended scheduling parameters ABI") Link: http://lkml.kernel.org/r/52D5552D.5000102@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 138711b5bf19..26af3702ebf1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3614,7 +3614,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) /** * sys_sched_setattr - same as above, but with extended sched_attr * @pid: the pid in question. - * @attr: structure containing the extended parameters. + * @uattr: structure containing the extended parameters. */ SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) { @@ -3756,7 +3756,7 @@ err_size: /** * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. - * @attr: structure containing the extended parameters. + * @uattr: structure containing the extended parameters. * @size: sizeof(attr) for fwd/bwd comp. */ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, From e3de300d1212b42aa9d0d6031b12fca06ac00dd9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Jan 2014 12:30:15 +0100 Subject: [PATCH 56/60] sched: Preserve the nice level over sched_setscheduler() and sched_setparam() calls Previously sched_setscheduler() and sched_setparam() would not affect the nice value of a task, restore this behaviour. Signed-off-by: Peter Zijlstra Cc: raistlin@linux.it Cc: juri.lelli@gmail.com Cc: Michael wang Cc: Daniel Lezcano Fixes: d50dde5a10f3 ("sched: Add new scheduler syscalls to support an extended scheduling parameters ABI") Link: http://lkml.kernel.org/r/20140115113015.GB31570@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 26af3702ebf1..c1b3d7e04f0f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3451,7 +3451,8 @@ int sched_setscheduler(struct task_struct *p, int policy, { struct sched_attr attr = { .sched_policy = policy, - .sched_priority = param->sched_priority + .sched_priority = param->sched_priority, + .sched_nice = PRIO_TO_NICE(p->static_prio), }; return __sched_setscheduler(p, &attr, true); } @@ -3481,7 +3482,8 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { struct sched_attr attr = { .sched_policy = policy, - .sched_priority = param->sched_priority + .sched_priority = param->sched_priority, + .sched_nice = PRIO_TO_NICE(p->static_prio), }; return __sched_setscheduler(p, &attr, false); } From 39fd8fd22b3224ec6819d33b3e34ae4da6a35f05 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Jan 2014 16:33:20 +0100 Subject: [PATCH 57/60] sched: Fix up scheduler syscall LTP fails Wu reported LTP failures: > ltp.sched_setparam02.1.TFAIL > ltp.sched_setparam02.2.TFAIL > ltp.sched_setparam02.3.TFAIL > ltp.sched_setparam03.1.TFAIL There were 2 things wrong; firstly __setscheduler() failed on sched_setparam()'s policy = -1, fix that by reading from p->policy in that case. Secondly, getparam() (and getattr()) would still report !0 sched_priority for !FIFO/RR tasks after having been such. So unconditionally set p->rt_priority. Reported-by: Fengguang Wu Signed-off-by: Peter Zijlstra Cc: Juri Lelli Cc: Dario Faggioli Fixes: d50dde5a10f3 ("sched: Add new scheduler syscalls to support an extended scheduling parameters ABI") Link: http://lkml.kernel.org/r/20140115153320.GH31570@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1b3d7e04f0f..e9212eb354b8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3172,15 +3172,23 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, { int policy = attr->sched_policy; + if (policy == -1) /* setparam */ + policy = p->policy; + p->policy = policy; if (dl_policy(policy)) __setparam_dl(p, attr); - else if (rt_policy(policy)) - p->rt_priority = attr->sched_priority; - else + else if (fair_policy(policy)) p->static_prio = NICE_TO_PRIO(attr->sched_nice); + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when + * !rt_policy. Always setting this ensures that things like + * getparam()/getattr() don't report silly values for !rt tasks. + */ + p->rt_priority = attr->sched_priority; + p->normal_prio = normal_prio(p); p->prio = rt_mutex_getprio(p); From 0bb040a44381261c0729636abbe03caeedb7d72e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Jan 2014 17:15:13 +0100 Subject: [PATCH 58/60] sched: Fix up attr::sched_priority warning Fengguang Wu reported the following build warning: > kernel/sched/core.c:3067 __sched_setscheduler() warn: unsigned 'attr->sched_priority' is never less than zero. Since it doesn't make sense for attr::sched_priority to be negative, remove the check, since we already test for an upper limit any actual negative values passed in through the old param::sched_priority field will still be detected. Reported-by: Fengguang Wu Signed-off-by: Peter Zijlstra Cc: Juri Lelli Cc: Dario Faggioli Fixes: d50dde5a10f3 ("sched: Add new scheduler syscalls to support an extended scheduling parameters ABI") Link: http://lkml.kernel.org/n/tip-fid9nalzii2r5voxtf4eh5kz@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e9212eb354b8..5a6ccdf4b39d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3282,8 +3282,7 @@ recheck: * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, * SCHED_BATCH and SCHED_IDLE is 0. */ - if (attr->sched_priority < 0 || - (p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || + if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; if ((dl_policy(policy) && !__checkparam_dl(attr)) || From 7479f3c9cf67edf5e8a76b21ea3726757f35cf53 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Jan 2014 17:05:04 +0100 Subject: [PATCH 59/60] sched: Move SCHED_RESET_ON_FORK into attr::sched_flags I noticed the new sched_{set,get}attr() calls didn't properly deal with the SCHED_RESET_ON_FORK hack. Instead of propagating the flags in high bits nonsense use the brand spanking new attr::sched_flags field. Signed-off-by: Peter Zijlstra Cc: Juri Lelli Cc: Dario Faggioli Link: http://lkml.kernel.org/r/20140115162242.GJ31570@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/uapi/linux/sched.h | 5 +++++ kernel/sched/core.c | 42 +++++++++++++++++++++++++------------- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 2d5e49a2a6d2..34f9d7387d13 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -40,8 +40,13 @@ /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 #define SCHED_DEADLINE 6 + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 +/* + * For the sched_{set,get}attr() calls + */ +#define SCHED_FLAG_RESET_ON_FORK 0x01 #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5a6ccdf4b39d..93a2836b6220 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3267,8 +3267,7 @@ recheck: reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; } else { - reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); - policy &= ~SCHED_RESET_ON_FORK; + reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); if (policy != SCHED_DEADLINE && policy != SCHED_FIFO && policy != SCHED_RR && @@ -3277,6 +3276,9 @@ recheck: return -EINVAL; } + if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) + return -EINVAL; + /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, @@ -3443,6 +3445,26 @@ change: return 0; } +static int _sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param, bool check) +{ + struct sched_attr attr = { + .sched_policy = policy, + .sched_priority = param->sched_priority, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + + /* + * Fixup the legacy SCHED_RESET_ON_FORK hack + */ + if (policy & SCHED_RESET_ON_FORK) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + policy &= ~SCHED_RESET_ON_FORK; + attr.sched_policy = policy; + } + + return __sched_setscheduler(p, &attr, check); +} /** * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. * @p: the task in question. @@ -3456,12 +3478,7 @@ change: int sched_setscheduler(struct task_struct *p, int policy, const struct sched_param *param) { - struct sched_attr attr = { - .sched_policy = policy, - .sched_priority = param->sched_priority, - .sched_nice = PRIO_TO_NICE(p->static_prio), - }; - return __sched_setscheduler(p, &attr, true); + return _sched_setscheduler(p, policy, param, true); } EXPORT_SYMBOL_GPL(sched_setscheduler); @@ -3487,12 +3504,7 @@ EXPORT_SYMBOL_GPL(sched_setattr); int sched_setscheduler_nocheck(struct task_struct *p, int policy, const struct sched_param *param) { - struct sched_attr attr = { - .sched_policy = policy, - .sched_priority = param->sched_priority, - .sched_nice = PRIO_TO_NICE(p->static_prio), - }; - return __sched_setscheduler(p, &attr, false); + return _sched_setscheduler(p, policy, param, false); } static int @@ -3792,6 +3804,8 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, goto out_unlock; attr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; if (task_has_dl_policy(p)) __getparam_dl(p, &attr); else if (task_has_rt_policy(p)) From eaad45132c564ce377e6dce05e78e08e456d5315 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jan 2014 17:54:25 +0100 Subject: [PATCH 60/60] sched: Fix __sched_setscheduler() nice test With the introduction of sched_attr::sched_nice we need to check if we've got permission to actually change the nice value. Daniel found that can_nice() would always fail; and upon inspection it turns out that can_nice() only tests to see if we can lower the nice value, but it doesn't validate if we're lowering or not. Therefore amend the test to only call can_nice() when we lower the nice value. Reported-and-Tested-by: Daniel Lezcano Signed-off-by: Peter Zijlstra Cc: raistlin@linux.it Cc: juri.lelli@gmail.com Cc: Daniel Lezcano Fixes: d50dde5a10f3 ("sched: Add new scheduler syscalls to support an extended scheduling parameters ABI") Link: http://lkml.kernel.org/r/20140116165425.GA9481@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 93a2836b6220..36c951b7eef8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3296,7 +3296,8 @@ recheck: */ if (user && !capable(CAP_SYS_NICE)) { if (fair_policy(policy)) { - if (!can_nice(p, attr->sched_nice)) + if (attr->sched_nice < TASK_NICE(p) && + !can_nice(p, attr->sched_nice)) return -EPERM; }