From b93e0b8fa819c3d5641794ed9a07e643416aa0fd Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 23 May 2014 18:10:21 +0200
Subject: [PATCH 1/6] irq_work: Split raised and lazy lists

An irq work can be handled from two places: from the tick if the work
carries the "lazy" flag and the tick is periodic, or from a self IPI.

We currently merge all these works into a single list and use a per-CPU
latch to avoid raising a self-IPI when one is already pending.

Now we could do away with this ugly latch if the list were only made of
non-lazy works. Just enqueueing a work on the empty list would be enough
to know if we need to raise an IPI or not.

Also we are going to implement remote irq work queuing. The per-CPU
latch would then need to become atomic in the global scope. That's too
bad because, here as well, just enqueueing a work on an empty list of
non-lazy works would be enough to know if we need to raise an IPI or not.

So let's take a way out of this: split the works into two distinct lists,
one for the works that can be handled by the next tick and another one
for those handled by the IPI. Just checking whether the latter is empty
when we queue a new work is enough to know if we need to raise an IPI.

Suggested-by: Peter Zijlstra
Acked-by: Peter Zijlstra
Cc: Andrew Morton
Cc: Ingo Molnar
Cc: Kevin Hilman
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Viresh Kumar
Signed-off-by: Frederic Weisbecker
---
 kernel/irq_work.c | 53 +++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index a82170e2fa78..126f254614bf 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -19,8 +19,8 @@
 #include


-static DEFINE_PER_CPU(struct llist_head, irq_work_list);
-static DEFINE_PER_CPU(int, irq_work_raised);
+static DEFINE_PER_CPU(struct llist_head, raised_list);
+static DEFINE_PER_CPU(struct llist_head, lazy_list);

 /*
  * Claim the entry so that no one else will poke at it.
@@ -70,15 +70,13 @@ bool irq_work_queue(struct irq_work *work)
         /* Queue the entry and raise the IPI if needed. */
         preempt_disable();

-        llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
-
-        /*
-         * If the work is not "lazy" or the tick is stopped, raise the irq
-         * work interrupt (if supported by the arch), otherwise, just wait
-         * for the next tick.
-         */
-        if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
-                if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
+        /* If the work is "lazy", handle it from next tick if any */
+        if (work->flags & IRQ_WORK_LAZY) {
+                if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) &&
+                    tick_nohz_tick_stopped())
+                        arch_irq_work_raise();
+        } else {
+                if (llist_add(&work->llnode, &__get_cpu_var(raised_list)))
                         arch_irq_work_raise();
         }

@@ -90,10 +88,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue);

 bool irq_work_needs_cpu(void)
 {
-        struct llist_head *this_list;
+        struct llist_head *raised, *lazy;

-        this_list = &__get_cpu_var(irq_work_list);
-        if (llist_empty(this_list))
+        raised = &__get_cpu_var(raised_list);
+        lazy = &__get_cpu_var(lazy_list);
+        if (llist_empty(raised) && llist_empty(lazy))
                 return false;

         /* All work should have been flushed before going offline */
@@ -102,28 +101,18 @@ bool irq_work_needs_cpu(void)
         return true;
 }

-static void __irq_work_run(void)
+static void irq_work_run_list(struct llist_head *list)
 {
         unsigned long flags;
         struct irq_work *work;
-        struct llist_head *this_list;
         struct llist_node *llnode;

-        /*
-         * Reset the "raised" state right before we check the list because
-         * an NMI may enqueue after we find the list empty from the runner.
-         */
-        __this_cpu_write(irq_work_raised, 0);
-        barrier();
-
-        this_list = &__get_cpu_var(irq_work_list);
-        if (llist_empty(this_list))
-                return;
-
         BUG_ON(!irqs_disabled());

-        llnode = llist_del_all(this_list);
+        if (llist_empty(list))
+                return;
+
+        llnode = llist_del_all(list);
         while (llnode != NULL) {
                 work = llist_entry(llnode, struct irq_work, llnode);

@@ -148,6 +137,12 @@ static void __irq_work_run(void)
         }
 }

+static void __irq_work_run(void)
+{
+        irq_work_run_list(&__get_cpu_var(raised_list));
+        irq_work_run_list(&__get_cpu_var(lazy_list));
+}
+
 /*
  * Run the irq_work entries on this cpu. Requires to be ran from hardirq
  * context with local IRQs disabled.
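The key property the new scheme relies on is that llist_add() returns true when it installs the first node on an empty list, so the enqueue operation itself tells the caller whether an IPI must be raised. Below is a minimal user-space model of that idea (C11 atomics; illustrative only, not kernel code, and all names are invented):

/*
 * Toy model of "push returns whether the list was empty", mirroring the
 * semantics of the kernel's llist_add() that replaces the "raised" latch.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node { struct node *next; };
struct list { _Atomic(struct node *) first; };

/* Lock-free push; returns true if the list was empty beforehand. */
static bool list_push(struct list *l, struct node *n)
{
        struct node *first = atomic_load(&l->first);

        do {
                n->next = first;
        } while (!atomic_compare_exchange_weak(&l->first, &first, n));

        return first == NULL;
}

int main(void)
{
        struct list raised = { NULL };
        struct node a, b;

        if (list_push(&raised, &a))
                printf("first enqueue: would raise the self-IPI\n");
        if (!list_push(&raised, &b))
                printf("second enqueue: IPI already pending, nothing to do\n");
        return 0;
}

A concurrent context that pushes onto a non-empty list sees false and knows an IPI is already on its way, which is exactly the state the removed irq_work_raised latch used to track.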
From 478850160636c4f0b2558451df0e42f8c5a10939 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 8 May 2014 01:37:48 +0200
Subject: [PATCH 2/6] irq_work: Implement remote queueing

irq work currently only supports local callbacks. However its code
is mostly ready to run remote callbacks and we have some potential users.

The full nohz subsystem currently open codes its own remote irq work
on top of the scheduler IPI when it wants a CPU to re-evaluate its next
tick. However this ad hoc solution bloats the scheduler IPI.

Let's just extend the irq work subsystem to support remote queuing on
top of the generic SMP IPI to handle this kind of user. This shouldn't
add noticeable overhead.

Suggested-by: Peter Zijlstra
Acked-by: Peter Zijlstra
Cc: Andrew Morton
Cc: Eric Dumazet
Cc: Ingo Molnar
Cc: Kevin Hilman
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Viresh Kumar
Signed-off-by: Frederic Weisbecker
---
 include/linux/irq_work.h |  5 +++++
 kernel/irq_work.c        | 25 ++++++++++++++++++++++++-
 kernel/smp.c             |  9 +++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 19ae05d4b8ec..bf9422c3aefe 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -33,6 +33,11 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
 #define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), }

 bool irq_work_queue(struct irq_work *work);
+
+#ifdef CONFIG_SMP
+bool irq_work_queue_on(struct irq_work *work, int cpu);
+#endif
+
 void irq_work_run(void);
 void irq_work_sync(struct irq_work *work);

diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 126f254614bf..4b0a890a304a 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include
 #include

@@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void)
         */
 }

+#ifdef CONFIG_SMP
 /*
- * Enqueue the irq_work @entry unless it's already pending
+ * Enqueue the irq_work @work on @cpu unless it's already pending
  * somewhere.
  *
  * Can be re-enqueued while the callback is still in progress.
  */
+bool irq_work_queue_on(struct irq_work *work, int cpu)
+{
+        /* All work should have been flushed before going offline */
+        WARN_ON_ONCE(cpu_is_offline(cpu));
+
+        /* Arch remote IPI send/receive backend aren't NMI safe */
+        WARN_ON_ONCE(in_nmi());
+
+        /* Only queue if not already pending */
+        if (!irq_work_claim(work))
+                return false;
+
+        if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
+                arch_send_call_function_single_ipi(cpu);
+
+        return true;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue_on);
+#endif
+
+/* Enqueue the irq work @work on the current CPU */
 bool irq_work_queue(struct irq_work *work)
 {
         /* Only queue if not already pending */
diff --git a/kernel/smp.c b/kernel/smp.c
index 306f8180b0d5..a1812d184aed 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -3,6 +3,7 @@
  *
  * (C) Jens Axboe 2008
  */
+#include
 #include
 #include
 #include
@@ -210,6 +211,14 @@ void generic_smp_call_function_single_interrupt(void)
                 csd->func(csd->info);
                 csd_unlock(csd);
         }
+
+        /*
+         * Handle irq works queued remotely by irq_work_queue_on().
+         * Smp functions above are typically synchronous so they
+         * better run first since some other CPUs may be busy waiting
+         * for them.
+         */
+        irq_work_run();
 }
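For context, here is a sketch of how a user of the new interface might look (a hypothetical module; demo_* names are invented, and it assumes an SMP kernel since irq_work_queue_on() is only declared under CONFIG_SMP):

#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/smp.h>
#include <linux/cpumask.h>

static void demo_work_fn(struct irq_work *work)
{
        /* Runs from the IPI handler on the target CPU, IRQs disabled */
        pr_info("irq work ran on CPU %d\n", smp_processor_id());
}

static DEFINE_IRQ_WORK(demo_work, demo_work_fn);

static int __init demo_init(void)
{
        unsigned int target;

        /* Pick any other online CPU and queue the work there */
        target = cpumask_any_but(cpu_online_mask, smp_processor_id());
        if (target < nr_cpu_ids)
                irq_work_queue_on(&demo_work, target);

        return 0;
}

static void __exit demo_exit(void)
{
        irq_work_sync(&demo_work);      /* wait for any pending callback */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The callback then runs on the target from generic_smp_call_function_single_interrupt(), after the pending smp function calls, as the kernel/smp.c hunk above arranges.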
From 3d36aebc2e78923095575df954f3f3b430ac0a30 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 4 Jun 2014 16:17:33 +0200
Subject: [PATCH 3/6] nohz: Support nohz full remote kick

Remotely kicking a full nohz CPU in order to make it re-evaluate its
next tick is currently implemented using the scheduler IPI. However
this bloats a scheduler fast path with an off-topic feature.

The scheduler IPI was abused here for its cool "callable
anywhere/anytime" properties. But now that the irq work subsystem can
queue remote callbacks, it's a perfect fit to safely queue IPIs when
interrupts are disabled, without worrying about concurrent callers.

So let's implement the remote kick on top of irq work. This is going to
be used when a new event requires the next tick to be recalculated:
more than one task competing on the CPU, a timer armed, ...

Acked-by: Peter Zijlstra
Cc: Andrew Morton
Cc: Ingo Molnar
Cc: Kevin Hilman
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Viresh Kumar
Signed-off-by: Frederic Weisbecker
---
 include/linux/tick.h     |  9 ++++++++-
 kernel/time/tick-sched.c | 10 ++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index b84773cb9f4c..8a4987f2294a 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -181,7 +181,13 @@ static inline bool tick_nohz_full_cpu(int cpu)

 extern void tick_nohz_init(void);
 extern void __tick_nohz_full_check(void);
-extern void tick_nohz_full_kick(void);
+extern void tick_nohz_full_kick_cpu(int cpu);
+
+static inline void tick_nohz_full_kick(void)
+{
+        tick_nohz_full_kick_cpu(smp_processor_id());
+}
+
 extern void tick_nohz_full_kick_all(void);
 extern void __tick_nohz_task_switch(struct task_struct *tsk);
 #else
@@ -189,6 +195,7 @@ static inline void tick_nohz_init(void) { }
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
 static inline void __tick_nohz_full_check(void) { }
+static inline void tick_nohz_full_kick_cpu(int cpu) { }
 static inline void tick_nohz_full_kick(void) { }
 static inline void tick_nohz_full_kick_all(void) { }
 static inline void __tick_nohz_task_switch(struct task_struct *tsk) { }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6558b7ac112d..3d63944a3eca 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -224,13 +224,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
 };

 /*
- * Kick the current CPU if it's full dynticks in order to force it to
+ * Kick the CPU if it's full dynticks in order to force it to
  * re-evaluate its dependency on the tick and restart it if necessary.
  */
-void tick_nohz_full_kick(void)
+void tick_nohz_full_kick_cpu(int cpu)
 {
-        if (tick_nohz_full_cpu(smp_processor_id()))
-                irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+        if (!tick_nohz_full_cpu(cpu))
+                return;
+
+        irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
 }

 static void nohz_full_kick_ipi(void *info)
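The per-CPU kick pattern this builds on can be sketched as follows (illustrative only; demo_* names are invented, the real per-CPU work being nohz_full_kick_work):

#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/smp.h>

static void demo_kick_work_fn(struct irq_work *work)
{
        /*
         * Runs in IRQ context on the kicked CPU; on return, the irq_exit()
         * path gives the nohz code a chance to re-evaluate the tick.
         */
}

static DEFINE_PER_CPU(struct irq_work, demo_kick_work) = {
        .func = demo_kick_work_fn,
};

/*
 * Kick @cpu: safe with interrupts disabled, and concurrent callers
 * collapse into a single pending IPI thanks to irq_work_claim().
 */
static void demo_kick_cpu(int cpu)
{
        irq_work_queue_on(&per_cpu(demo_kick_work, cpu), cpu);
}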
From 53c5fa16b4c843f1df91f7498e3c7bf95e0eaefa Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 4 Jun 2014 16:20:21 +0200
Subject: [PATCH 4/6] nohz: Switch to nohz full remote kick on timer enqueue

When a new timer is enqueued on a full dynticks target, that CPU must
re-evaluate the next tick to handle the timer correctly.

This is currently performed through the scheduler IPI. Meanwhile this
happens at the cost of off-topic workarounds in that fast path to make
it call irq_exit().

As we plan to remove this hack from the scheduler IPI, let's use the
nohz full kick instead. Pretty much any IPI fits for that job as long
as it calls irq_exit(). The nohz full kick just happens to be handy
and readily available here.

If it turns out to be overkill in the future, we can still turn that
timer kick into an empty IPI.

Acked-by: Peter Zijlstra
Cc: Andrew Morton
Cc: Ingo Molnar
Cc: Kevin Hilman
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Viresh Kumar
Signed-off-by: Frederic Weisbecker
---
 kernel/sched/core.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bdf01b494fe..feb54965e16f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -684,10 +684,16 @@ static void wake_up_idle_cpu(int cpu)

 static bool wake_up_full_nohz_cpu(int cpu)
 {
+        /*
+         * We just need the target to call irq_exit() and re-evaluate
+         * the next tick. The nohz full kick at least implies that.
+         * If needed we can still optimize that later with an
+         * empty IRQ.
+         */
         if (tick_nohz_full_cpu(cpu)) {
                 if (cpu != smp_processor_id() ||
                     tick_nohz_tick_stopped())
-                        smp_send_reschedule(cpu);
+                        tick_nohz_full_kick_cpu(cpu);
                 return true;
         }
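To summarize the reasoning, a simplified model of the post-patch decision (not the exact kernel code): any IPI whose handler returns through irq_exit() gives the target a chance to re-evaluate its tick, and the nohz full kick is such an IPI.

/* Illustrative rewrite of the decision taken by wake_up_full_nohz_cpu() */
static bool demo_wake_up_full_nohz_cpu(int cpu)
{
        if (!tick_nohz_full_cpu(cpu))
                return false;   /* not full dynticks: caller falls back */

        /* The local CPU with a running tick needs no kick */
        if (cpu != smp_processor_id() || tick_nohz_tick_stopped())
                tick_nohz_full_kick_cpu(cpu);   /* irq work based IPI */

        return true;
}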
From fd2ac4f4a65a7f34b0bc6433fcca1192d7ba8b8e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Tue, 18 Mar 2014 21:12:53 +0100
Subject: [PATCH 5/6] nohz: Use nohz own full kick on 2nd task enqueue

Now that we have a nohz full remote kick based on irq work, let's use
it to notify a CPU that it's exiting single task mode.

This unbloats the scheduler IPI a bit, which the nohz code was abusing
for its cool "callable anywhere/anytime" properties.

Acked-by: Peter Zijlstra
Cc: Andrew Morton
Cc: Ingo Molnar
Cc: Kevin Hilman
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Viresh Kumar
Signed-off-by: Frederic Weisbecker
---
 kernel/sched/core.c  | 5 +----
 kernel/sched/sched.h | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index feb54965e16f..13f5857a15ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1574,9 +1574,7 @@ void scheduler_ipi(void)
         */
         preempt_fold_need_resched();

-        if (llist_empty(&this_rq()->wake_list)
-                        && !tick_nohz_full_cpu(smp_processor_id())
-                        && !got_nohz_idle_kick())
+        if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
                 return;

         /*
@@ -1593,7 +1591,6 @@ void scheduler_ipi(void)
         * somewhat pessimize the simple resched case.
         */
         irq_enter();
-        tick_nohz_full_check();
         sched_ttwu_pending();

         /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 31cc02ebc54e..599a72aff5ea 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1223,7 +1223,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
                 if (tick_nohz_full_cpu(rq->cpu)) {
                         /* Order rq->nr_running write against the IPI */
                         smp_wmb();
-                        smp_send_reschedule(rq->cpu);
+                        tick_nohz_full_kick_cpu(rq->cpu);
                 }
         }
 #endif
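A simplified model of the notification path after this patch (illustrative only, not the kernel code; the next patch revisits the explicit barrier):

/*
 * Crossing the 1 -> 2 running-task boundary on a full dynticks CPU now
 * uses the irq work based kick instead of the scheduler IPI.
 */
static inline void demo_note_second_task(struct rq *rq, unsigned int prev_nr)
{
        if (prev_nr < 2 && rq->nr_running >= 2 && tick_nohz_full_cpu(rq->cpu)) {
                /* Order the rq->nr_running write against the IPI */
                smp_wmb();
                tick_nohz_full_kick_cpu(rq->cpu);
        }
}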
From 3882ec643997757824cd5f25180cd8a787b9dbe1 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Tue, 18 Mar 2014 22:54:04 +0100
Subject: [PATCH 6/6] nohz: Use IPI implicit full barrier against rq->nr_running r/w

A full dynticks CPU is allowed to stop its tick when a single task runs.
Meanwhile when a new task gets enqueued, the CPU must be notified so that
it can restart its tick to maintain local fairness and other accounting
details.

This notification is performed by way of an IPI. Then when the target
receives the IPI, we expect it to see the new value of rq->nr_running.

Hence the following ordering scenario:

       CPU 0                        CPU 1

   write rq->nr_running            get IPI
   smp_wmb()                       smp_rmb()
   send IPI                        read rq->nr_running

But Paul McKenney says that nowadays IPIs imply a full barrier on all
architectures. So we can safely remove this pair and rely on the
implicit barriers that come along with IPI send/receive. Let's just
comment on this new assumption.

Acked-by: Peter Zijlstra
Cc: Andrew Morton
Cc: Ingo Molnar
Cc: Kevin Hilman
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Viresh Kumar
Signed-off-by: Frederic Weisbecker
---
 kernel/sched/core.c  |  9 +++++----
 kernel/sched/sched.h | 10 ++++++++--
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13f5857a15ba..7f3063c153d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -740,10 +740,11 @@ bool sched_can_stop_tick(void)

         rq = this_rq();

-        /* Make sure rq->nr_running update is visible after the IPI */
-        smp_rmb();
-
-        /* More than one running task need preemption */
+        /*
+         * More than one running task need preemption.
+         * nr_running update is assumed to be visible
+         * after IPI is sent from wakers.
+         */
         if (rq->nr_running > 1)
                 return false;

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 599a72aff5ea..eb8567610295 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1221,8 +1221,14 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 #ifdef CONFIG_NO_HZ_FULL
         if (prev_nr < 2 && rq->nr_running >= 2) {
                 if (tick_nohz_full_cpu(rq->cpu)) {
-                        /* Order rq->nr_running write against the IPI */
-                        smp_wmb();
+                        /*
+                         * Tick is needed if more than one task runs on a CPU.
+                         * Send the target an IPI to kick it out of nohz mode.
+                         *
+                         * We assume that IPI implies full memory barrier and the
+                         * new value of rq->nr_running is visible on reception
+                         * from the target.
+                         */
                         tick_nohz_full_kick_cpu(rq->cpu);
                 }
         }
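The ordering requirement itself can be modeled in user space with C11 atomics (illustrative only; the release/acquire pair below stands in for the old smp_wmb()/smp_rmb() pairing, which the IPI send/receive path is now assumed to provide implicitly):

#include <stdatomic.h>

static _Atomic int nr_running;          /* models rq->nr_running */
static atomic_bool kick_pending;        /* models the IPI */

static void waker_enqueue_second_task(void)     /* runs on CPU 0 */
{
        atomic_fetch_add_explicit(&nr_running, 1, memory_order_relaxed);
        /* "send IPI": orders the nr_running store before the notification */
        atomic_store_explicit(&kick_pending, 1, memory_order_release);
}

static int target_sees_new_nr_running(void)     /* runs on CPU 1 */
{
        /* "get IPI": pairs with the release above */
        if (!atomic_load_explicit(&kick_pending, memory_order_acquire))
                return 0;
        /* The prior nr_running update is guaranteed to be visible here */
        return atomic_load_explicit(&nr_running, memory_order_relaxed) > 1;
}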