From c1dcc927dae01dfd4904ee82ce2c00b50eab6dc3 Mon Sep 17 00:00:00 2001
From: Soren Brinkmann
Date: Tue, 26 Nov 2013 17:04:50 -0800
Subject: [PATCH 1/2] clocksource: cadence_ttc: Fix mutex taken inside interrupt context

When the kernel is compiled with:
    CONFIG_HIGH_RES_TIMERS=no
    CONFIG_HZ_PERIODIC=yes
    CONFIG_DEBUG_ATOMIC_SLEEP=yes

the following WARN appears:

WARNING: CPU: 1 PID: 0 at linux/kernel/mutex.c:856 mutex_trylock+0x70/0x1fc()
DEBUG_LOCKS_WARN_ON(in_interrupt())
Modules linked in:
CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.12.0-xilinx-dirty #93
[] (unwind_backtrace+0x0/0x11c) from [] (show_stack+0x10/0x14)
[] (show_stack+0x10/0x14) from [] (dump_stack+0x7c/0xc0)
[] (dump_stack+0x7c/0xc0) from [] (warn_slowpath_common+0x60/0x84)
[] (warn_slowpath_common+0x60/0x84) from [] (warn_slowpath_fmt+0x2c/0x3c)
[] (warn_slowpath_fmt+0x2c/0x3c) from [] (mutex_trylock+0x70/0x1fc)
[] (mutex_trylock+0x70/0x1fc) from [] (clk_prepare_lock+0xc/0xe4)
[] (clk_prepare_lock+0xc/0xe4) from [] (clk_get_rate+0xc/0x44)
[] (clk_get_rate+0xc/0x44) from [] (ttc_set_mode+0x34/0x78)
[] (ttc_set_mode+0x34/0x78) from [] (clockevents_set_mode+0x28/0x5c)
[] (clockevents_set_mode+0x28/0x5c) from [] (tick_broadcast_on_off+0x190/0x1c0)
[] (tick_broadcast_on_off+0x190/0x1c0) from [] (clockevents_notify+0x58/0x1ac)
[] (clockevents_notify+0x58/0x1ac) from [] (cpuidle_setup_broadcast_timer+0x20/0x24)
[] (cpuidle_setup_broadcast_timer+0x20/0x24) from [] (generic_smp_call_function_single_interrupt+0)
[] (generic_smp_call_function_single_interrupt+0xe0/0x130) from [] (handle_IPI+0x88/0x118)
[] (handle_IPI+0x88/0x118) from [] (gic_handle_irq+0x58/0x60)
[] (gic_handle_irq+0x58/0x60) from [] (__irq_svc+0x44/0x78)
Exception stack(0xef099fa0 to 0xef099fe8)
9fa0: 00000001 ef092100 00000000 ef092100 ef098000 00000015 c0399f2c c0579d74
9fc0: 0000406a 413fc090 00000000 00000000 00000000 ef099fe8 c00666ec c000f46c
9fe0: 20000113 ffffffff
[] (__irq_svc+0x44/0x78) from [] (arch_cpu_idle+0x34/0x3c)
[] (arch_cpu_idle+0x34/0x3c) from [] (cpu_startup_entry+0xa8/0x10c)
[] (cpu_startup_entry+0xa8/0x10c) from [<000085a4>] (0x85a4)

We are in interrupt context (an IPI) and calling clk_get_rate() from the
set_mode function, which in turn ends up taking a mutex. Even if that does
not hang, it is a potential kernel deadlock: calling clk_get_rate() from
interrupt context is not allowed. To avoid such calls, the timer input
frequency is stored in the driver's data struct, which makes it accessible
to the driver in any context.

[dlezcano] completed the changelog with the WARN trace and added a more
detailed description. Tested on zynq zc702.
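The fix follows the common pattern of caching a clock rate where clk_get_rate()
is safe to call (driver init and the clk rate-change notifier, both sleeping
contexts) and only reading the cached value from atomic context. Below is a
minimal sketch of that pattern, not part of the patch itself; the foo_* names
are hypothetical stand-ins for the driver's own structures and callbacks.

#include <linux/kernel.h>
#include <linux/clk.h>
#include <linux/notifier.h>

/* Hypothetical driver data, mirroring the idea of ttc_timer::freq. */
struct foo_timer {
	struct clk *clk;
	unsigned long freq;		/* cached input clock rate */
	struct notifier_block rate_nb;
};

/* Sleeping context (probe/init): querying the clock rate is allowed here. */
static int foo_timer_init(struct foo_timer *t)
{
	t->freq = clk_get_rate(t->clk);	/* may take the clk prepare mutex */
	return clk_notifier_register(t->clk, &t->rate_nb);
}

/* Clock notifier callback: also a sleeping context, so refresh the cache. */
static int foo_rate_change_cb(struct notifier_block *nb, unsigned long event,
			      void *data)
{
	struct clk_notifier_data *ndata = data;
	struct foo_timer *t = container_of(nb, struct foo_timer, rate_nb);

	if (event == POST_RATE_CHANGE)
		t->freq = ndata->new_rate;
	return NOTIFY_OK;
}

/* Atomic context (e.g. a clockevent set_mode callback): use the cache only. */
static unsigned long foo_ticks_per_jiffy(struct foo_timer *t)
{
	return t->freq / HZ;		/* no clk_get_rate() here */
}

The driver's actual change does the same thing with ttc_timer::freq, filled in
at setup time, refreshed in the rate-change notifier, and read in
ttc_set_mode(), as the diff below shows.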
Acked-by: Daniel Lezcano
Tested-by: Daniel Lezcano
Signed-off-by: Soren Brinkmann
Signed-off-by: Daniel Lezcano
---
 drivers/clocksource/cadence_ttc_timer.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/clocksource/cadence_ttc_timer.c b/drivers/clocksource/cadence_ttc_timer.c
index b2bb3a4bc205..a92350b55d32 100644
--- a/drivers/clocksource/cadence_ttc_timer.c
+++ b/drivers/clocksource/cadence_ttc_timer.c
@@ -67,11 +67,13 @@
  * struct ttc_timer - This definition defines local timer structure
  *
  * @base_addr: Base address of timer
+ * @freq: Timer input clock frequency
  * @clk: Associated clock source
  * @clk_rate_change_nb Notifier block for clock rate changes
  */
 struct ttc_timer {
         void __iomem *base_addr;
+        unsigned long freq;
         struct clk *clk;
         struct notifier_block clk_rate_change_nb;
 };
@@ -196,9 +198,8 @@ static void ttc_set_mode(enum clock_event_mode mode,
 
         switch (mode) {
         case CLOCK_EVT_MODE_PERIODIC:
-                ttc_set_interval(timer,
-                                 DIV_ROUND_CLOSEST(clk_get_rate(ttce->ttc.clk),
-                                                   PRESCALE * HZ));
+                ttc_set_interval(timer, DIV_ROUND_CLOSEST(ttce->ttc.freq,
+                                                PRESCALE * HZ));
                 break;
         case CLOCK_EVT_MODE_ONESHOT:
         case CLOCK_EVT_MODE_UNUSED:
@@ -273,6 +274,8 @@ static void __init ttc_setup_clocksource(struct clk *clk, void __iomem *base)
                 return;
         }
 
+        ttccs->ttc.freq = clk_get_rate(ttccs->ttc.clk);
+
         ttccs->ttc.clk_rate_change_nb.notifier_call =
                 ttc_rate_change_clocksource_cb;
         ttccs->ttc.clk_rate_change_nb.next = NULL;
@@ -298,16 +301,14 @@ static void __init ttc_setup_clocksource(struct clk *clk, void __iomem *base)
         __raw_writel(CNT_CNTRL_RESET,
                      ttccs->ttc.base_addr + TTC_CNT_CNTRL_OFFSET);
 
-        err = clocksource_register_hz(&ttccs->cs,
-                                      clk_get_rate(ttccs->ttc.clk) / PRESCALE);
+        err = clocksource_register_hz(&ttccs->cs, ttccs->ttc.freq / PRESCALE);
         if (WARN_ON(err)) {
                 kfree(ttccs);
                 return;
         }
 
         ttc_sched_clock_val_reg = base + TTC_COUNT_VAL_OFFSET;
-        setup_sched_clock(ttc_sched_clock_read, 16,
-                          clk_get_rate(ttccs->ttc.clk) / PRESCALE);
+        setup_sched_clock(ttc_sched_clock_read, 16, ttccs->ttc.freq / PRESCALE);
 }
 
 static int ttc_rate_change_clockevent_cb(struct notifier_block *nb,
@@ -334,6 +335,9 @@ static int ttc_rate_change_clockevent_cb(struct notifier_block *nb,
                                 ndata->new_rate / PRESCALE);
                 local_irq_restore(flags);
 
+                /* update cached frequency */
+                ttc->freq = ndata->new_rate;
+
                 /* fall through */
         }
         case PRE_RATE_CHANGE:
@@ -367,6 +371,7 @@ static void __init ttc_setup_clockevent(struct clk *clk,
         if (clk_notifier_register(ttcce->ttc.clk,
                                 &ttcce->ttc.clk_rate_change_nb))
                 pr_warn("Unable to register clock notifier.\n");
+        ttcce->ttc.freq = clk_get_rate(ttcce->ttc.clk);
 
         ttcce->ttc.base_addr = base;
         ttcce->ce.name = "ttc_clockevent";
@@ -396,7 +401,7 @@ static void __init ttc_setup_clockevent(struct clk *clk,
         }
 
         clockevents_config_and_register(&ttcce->ce,
-                        clk_get_rate(ttcce->ttc.clk) / PRESCALE, 1, 0xfffe);
+                        ttcce->ttc.freq / PRESCALE, 1, 0xfffe);
 }
 
 /**

From 9722c2dac708e9468cc0dc30218ef76946ffbc9d Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Mon, 6 Jan 2014 11:39:12 +0000
Subject: [PATCH 2/2] sched: Calculate effective load even if local weight is 0

Thomas Hellstrom bisected a regression where erratic 3D performance is
experienced on virtual machines as measured by glxgears. It identified
commit 58d081b5 ("sched/numa: Avoid overloading CPUs on a preferred NUMA
node") as the problem which had modified the behaviour of effective_load.

Effective load calculates the difference to the system-wide load if a
scheduling entity was moved to another CPU.
The task group is not heavier as a result of the move but overall system
load can increase/decrease as a result of the change. Commit 58d081b5
("sched/numa: Avoid overloading CPUs on a preferred NUMA node") changed
effective_load to make it suitable for calculating if a particular NUMA
node was compute overloaded. To reduce the cost of the function, it
assumed that a current sched entity weight of 0 was uninteresting but
that is not the case.

wake_affine() uses a weight of 0 for sync wakeups on the grounds that it
is assuming the waking task will sleep and not contribute to load in the
near future. In this case, we still want to calculate the effective load
of the sched entity hierarchy. As effective_load is no longer used by
task_numa_compare since commit fb13c7ee ("sched/numa: Use a system-wide
search to find swap/migration candidates"), this patch simply restores
the historical behaviour.

Reported-and-tested-by: Thomas Hellstrom
Signed-off-by: Rik van Riel
[ Wrote changelog]
Signed-off-by: Mel Gorman
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/20140106113912.GC6178@suse.de
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c7395d97e4cb..e64b0794060e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3923,7 +3923,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
         struct sched_entity *se = tg->se[cpu];
 
-        if (!tg->parent || !wl)                 /* the trivial, non-cgroup case */
+        if (!tg->parent)                        /* the trivial, non-cgroup case */
                 return wl;
 
         for_each_sched_entity(se) {
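Why the "|| !wl" shortcut loses information in the sync case: wake_affine()
passes wl == 0 for sync wakeups, but the group weight adjustment wg can still
be non-zero (in the code of that era, as I recall, the waking task's weight
was subtracted from the group). A non-zero wg changes the group's share of its
parent even when no entity weight is added, so the hierarchy walk can return a
non-zero load delta. Below is a small, self-contained numeric sketch of one
level of that share recomputation, with invented weights; the real
effective_load() iterates over the whole hierarchy and clamps to MIN_SHARES.

#include <stdio.h>

/*
 * One level of the share recomputation done by effective_load():
 * W = wg + total group weight, w = this cpu's cfs_rq weight + wl,
 * the group's new share is shares * w / W, and the returned delta is
 * new share minus current share.  Illustrative only.
 */
static long one_level(long wl, long wg, long shares,
		      long cfs_rq_weight, long tg_weight, long cur_share)
{
	long W = wg + tg_weight;
	long w = cfs_rq_weight + wl;
	long new_share = (W > 0 && w < W) ? w * shares / W : shares;

	return new_share - cur_share;
}

int main(void)
{
	/* A sync waker of weight 1024 is assumed to sleep: wl = 0, wg = -1024. */
	long delta = one_level(0, -1024, 1024, 2048, 4096, 512);

	/* Prints a non-zero delta (170 here), which the "!wl" shortcut discarded. */
	printf("effective load delta with wl == 0: %ld\n", delta);
	return 0;
}

With the shortcut removed, the walk runs even for wl == 0 and the sync
adjustment is reflected in the load comparison again, which is what the
one-line change above restores.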