From 2e672ab2d491713541963afca3a5967ccc2376e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Oct 2017 16:54:49 -0700 Subject: [PATCH 01/72] rcu: Avoid ->dynticks_nmi_nesting store tearing NMIs can nest, and store tearing could in theory happen on carries from one byte to the next. This commit therefore adds the WRITE_ONCE() macros preventing this. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f9c0ca2ccf0c..c5d960f86cf8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1103,7 +1103,8 @@ void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); incby = 1; } - rdtp->dynticks_nmi_nesting += incby; + WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ + rdtp->dynticks_nmi_nesting + incby); barrier(); } @@ -1135,12 +1136,13 @@ void rcu_nmi_exit(void) * leave it in non-RCU-idle state. */ if (rdtp->dynticks_nmi_nesting != 1) { - rdtp->dynticks_nmi_nesting -= 2; + WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ + rdtp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - rdtp->dynticks_nmi_nesting = 0; + WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ rcu_dynticks_eqs_enter(); } From a0eb22bf64a755bb162b421120b9fbe7d012b85f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Oct 2017 19:45:10 -0700 Subject: [PATCH 02/72] rcu: Reduce dyntick-idle state space Both extended-quiescent-state entry and exit first update the nesting counter and then adjust the dyntick-idle state. This means that there are four states: (1) Both nesting and dyntick idle indicate idle, (2) Nesting indicates idle but dyntick idle does not, (3) Nesting indicates non-idle and dyntick idle does not, and (4) Both nesting and dyntick idle indicate non-idle. This commit simplifies the state space by eliminating #3, reversing the order of updates on exit from extended quiescent state. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c5d960f86cf8..49f661bb8ffe 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -928,21 +928,21 @@ void rcu_irq_exit_irqson(void) * we really have exited idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_eqs_exit_common(long long oldval, int user) +static void rcu_eqs_exit_common(long long newval, int user) { RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) rcu_dynticks_task_exit(); rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, newval); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on exit: not idle task"), - oldval, rdtp->dynticks_nesting); + rdtp->dynticks_nesting, newval); rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, @@ -967,8 +967,8 @@ static void rcu_eqs_exit(bool user) rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; } else { __this_cpu_inc(disable_rcu_irq_enter); + rcu_eqs_exit_common(DYNTICK_TASK_EXIT_IDLE, user); rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_eqs_exit_common(oldval, user); __this_cpu_dec(disable_rcu_irq_enter); } } @@ -1037,7 +1037,7 @@ void rcu_user_exit(void) void rcu_irq_enter(void) { struct rcu_dynticks *rdtp; - long long oldval; + long long newval; lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); @@ -1046,14 +1046,13 @@ void rcu_irq_enter(void) if (rdtp->dynticks_nmi_nesting) return; - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting++; - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting == 0); - if (oldval) - trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); + newval = rdtp->dynticks_nesting + 1; + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && newval == 0); + if (rdtp->dynticks_nesting) + trace_rcu_dyntick(TPS("++="), rdtp->dynticks_nesting, newval); else - rcu_eqs_exit_common(oldval, true); + rcu_eqs_exit_common(newval, true); + rdtp->dynticks_nesting++; } /* From fd581a91ac16187625ec509414d08d37827472c4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Oct 2017 21:56:20 -0700 Subject: [PATCH 03/72] rcu: Move rcu_nmi_{enter,exit}() to prepare for consolidation This is a code-motion-only commit that prepares to define rcu_irq_enter() in terms of rcu_nmi_enter() and rcu_irq_exit() in terms of rcu_irq_exit(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 150 +++++++++++++++++++++++----------------------- 1 file changed, 75 insertions(+), 75 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 49f661bb8ffe..419f3c38e1b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -866,6 +866,44 @@ void rcu_user_enter(void) } #endif /* CONFIG_NO_HZ_FULL */ +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * + * If we are returning from the outermost NMI handler that interrupted an + * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting + * to let the RCU grace-period handling know that the CPU is back to + * being RCU-idle. + * + * If you add or remove a call to rcu_nmi_exit(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void rcu_nmi_exit(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + /* + * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. + * (We are exiting an NMI handler, so RCU better be paying attention + * to us!) + */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); + + /* + * If the nesting level is not 1, the CPU wasn't RCU-idle, so + * leave it in non-RCU-idle state. + */ + if (rdtp->dynticks_nmi_nesting != 1) { + WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ + rdtp->dynticks_nmi_nesting - 2); + return; + } + + /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ + WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + rcu_dynticks_eqs_enter(); +} + /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * @@ -1012,6 +1050,43 @@ void rcu_user_exit(void) } #endif /* CONFIG_NO_HZ_FULL */ +/** + * rcu_nmi_enter - inform RCU of entry to NMI context + * + * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and + * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know + * that the CPU is active. This implementation permits nested NMIs, as + * long as the nesting level does not overflow an int. (You will probably + * run out of stack space first.) + * + * If you add or remove a call to rcu_nmi_enter(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void rcu_nmi_enter(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int incby = 2; + + /* Complain about underflow. */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); + + /* + * If idle from RCU viewpoint, atomically increment ->dynticks + * to mark non-idle and increment ->dynticks_nmi_nesting by one. + * Otherwise, increment ->dynticks_nmi_nesting by two. This means + * if ->dynticks_nmi_nesting is equal to one, we are guaranteed + * to be in the outermost NMI handler that interrupted an RCU-idle + * period (observation due to Andy Lutomirski). + */ + if (rcu_dynticks_curr_cpu_in_eqs()) { + rcu_dynticks_eqs_exit(); + incby = 1; + } + WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ + rdtp->dynticks_nmi_nesting + incby); + barrier(); +} + /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * @@ -1070,81 +1145,6 @@ void rcu_irq_enter_irqson(void) local_irq_restore(flags); } -/** - * rcu_nmi_enter - inform RCU of entry to NMI context - * - * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and - * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know - * that the CPU is active. This implementation permits nested NMIs, as - * long as the nesting level does not overflow an int. (You will probably - * run out of stack space first.) - * - * If you add or remove a call to rcu_nmi_enter(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_nmi_enter(void) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int incby = 2; - - /* Complain about underflow. */ - WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); - - /* - * If idle from RCU viewpoint, atomically increment ->dynticks - * to mark non-idle and increment ->dynticks_nmi_nesting by one. - * Otherwise, increment ->dynticks_nmi_nesting by two. This means - * if ->dynticks_nmi_nesting is equal to one, we are guaranteed - * to be in the outermost NMI handler that interrupted an RCU-idle - * period (observation due to Andy Lutomirski). - */ - if (rcu_dynticks_curr_cpu_in_eqs()) { - rcu_dynticks_eqs_exit(); - incby = 1; - } - WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ - rdtp->dynticks_nmi_nesting + incby); - barrier(); -} - -/** - * rcu_nmi_exit - inform RCU of exit from NMI context - * - * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting - * to let the RCU grace-period handling know that the CPU is back to - * being RCU-idle. - * - * If you add or remove a call to rcu_nmi_exit(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_nmi_exit(void) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - /* - * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. - * (We are exiting an NMI handler, so RCU better be paying attention - * to us!) - */ - WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); - - /* - * If the nesting level is not 1, the CPU wasn't RCU-idle, so - * leave it in non-RCU-idle state. - */ - if (rdtp->dynticks_nmi_nesting != 1) { - WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ - rdtp->dynticks_nmi_nesting - 2); - return; - } - - /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - rcu_dynticks_eqs_enter(); -} - /** * rcu_is_watching - see if RCU thinks that the current CPU is idle * From 6136d6e48a0138f6be5bb3427dbeb0ba07a546a4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Oct 2017 08:28:04 -0700 Subject: [PATCH 04/72] rcu: Clamp ->dynticks_nmi_nesting at eqs entry/exit In preparation for merging dyntick-idle irq handling into the NMI algorithm, clamp ->dynticks_nmi_nesting value to allow for interrupts that enter but never leave and vice versa. It is important that the clamping happen outside of the extended quiescent state. Otherwise, there will be short windows where irqs and NMIs fail to convince RCU to start watching. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/tree.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 59c471de342a..f4a411964c41 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -56,6 +56,8 @@ #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ DYNTICK_TASK_FLAG) +#define DYNTICK_IRQ_NONIDLE ((INT_MAX / 2) + 1) + /* * Grace-period counter management. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 419f3c38e1b6..142cdd4a50c9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -818,6 +818,7 @@ static void rcu_eqs_enter(bool user) struct rcu_dynticks *rdtp; rdtp = this_cpu_ptr(&rcu_dynticks); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) @@ -1008,6 +1009,7 @@ static void rcu_eqs_exit(bool user) rcu_eqs_exit_common(DYNTICK_TASK_EXIT_IDLE, user); rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; __this_cpu_dec(disable_rcu_irq_enter); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } } From 58721f5da4bcd5187566f4159a4fc88f70bf74f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Oct 2017 10:42:22 -0700 Subject: [PATCH 05/72] rcu: Define rcu_irq_{enter,exit}() in terms of rcu_nmi_{enter,exit}() RCU currently uses two different mechanisms for tracking irqs and NMIs. This is unnecessary complexity: Given that NMIs can nest and given that RCU's tracking handles such nesting, the NMI tracking mechanism can also be used to track irqs. This commit therefore defines rcu_irq_enter() in terms of rcu_nmi_enter() and rcu_irq_exit() in terms of rcu_nmi_exit(). Unfortunately, callers must still distinguish between the irq and NMI functions because additional actions are taken when an irq interrupts idle or nohz_full usermode execution, and these actions cannot always be taken from NMI handlers. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 59 +++++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 38 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 142cdd4a50c9..fde0e840563f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -266,6 +266,7 @@ void rcu_bh_qs(void) static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, + .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; @@ -914,8 +915,8 @@ void rcu_nmi_exit(void) * * This code assumes that the idle loop never does anything that might * result in unbalanced calls to irq_enter() and irq_exit(). If your - * architecture violates this assumption, RCU will give you what you - * deserve, good and hard. But very infrequently and irreproducibly. + * architecture's idle loop violates this assumption, RCU will give you what + * you deserve, good and hard. But very infrequently and irreproducibly. * * Use things like work queues to work around this limitation. * @@ -926,23 +927,14 @@ void rcu_nmi_exit(void) */ void rcu_irq_exit(void) { - struct rcu_dynticks *rdtp; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); lockdep_assert_irqs_disabled(); - rdtp = this_cpu_ptr(&rcu_dynticks); - - /* Page faults can happen in NMI handlers, so check... */ - if (rdtp->dynticks_nmi_nesting) - return; - - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting < 1); - if (rdtp->dynticks_nesting <= 1) { - rcu_eqs_enter_common(true); - } else { - trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); - rdtp->dynticks_nesting--; - } + if (rdtp->dynticks_nmi_nesting == 1) + rcu_prepare_for_idle(); + rcu_nmi_exit(); + if (rdtp->dynticks_nmi_nesting == 0) + rcu_dynticks_task_enter(); } /* @@ -1097,12 +1089,12 @@ void rcu_nmi_enter(void) * sections can occur. The caller must have disabled interrupts. * * Note that the Linux kernel is fully capable of entering an interrupt - * handler that it never exits, for example when doing upcalls to - * user mode! This code assumes that the idle loop never does upcalls to - * user mode. If your architecture does do upcalls from the idle loop (or - * does anything else that results in unbalanced calls to the irq_enter() - * and irq_exit() functions), RCU will give you what you deserve, good - * and hard. But very infrequently and irreproducibly. + * handler that it never exits, for example when doing upcalls to user mode! + * This code assumes that the idle loop never does upcalls to user mode. + * If your architecture's idle loop does do upcalls to user mode (or does + * anything else that results in unbalanced calls to the irq_enter() and + * irq_exit() functions), RCU will give you what you deserve, good and hard. + * But very infrequently and irreproducibly. * * Use things like work queues to work around this limitation. * @@ -1113,23 +1105,14 @@ void rcu_nmi_enter(void) */ void rcu_irq_enter(void) { - struct rcu_dynticks *rdtp; - long long newval; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); lockdep_assert_irqs_disabled(); - rdtp = this_cpu_ptr(&rcu_dynticks); - - /* Page faults can happen in NMI handlers, so check... */ - if (rdtp->dynticks_nmi_nesting) - return; - - newval = rdtp->dynticks_nesting + 1; - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && newval == 0); - if (rdtp->dynticks_nesting) - trace_rcu_dyntick(TPS("++="), rdtp->dynticks_nesting, newval); - else - rcu_eqs_exit_common(newval, true); - rdtp->dynticks_nesting++; + if (rdtp->dynticks_nmi_nesting == 0) + rcu_dynticks_task_exit(); + rcu_nmi_enter(); + if (rdtp->dynticks_nmi_nesting == 1) + rcu_cleanup_after_idle(); } /* From 51a1fd30f13090be7750fed86cf3728afaf4e394 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Oct 2017 14:43:40 -0700 Subject: [PATCH 06/72] rcu: Make ->dynticks_nesting be a simple counter Now that ->dynticks_nesting counts only process-level dyntick-idle entry and exit, there is no need for the elaborate segmented counter with its guard fields and overflow checking. This commit therefore makes ->dynticks_nesting be a simple counter. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 27 +-------------------------- kernel/rcu/tree.c | 40 ++++++++++++++++++++-------------------- kernel/rcu/tree.h | 1 - 3 files changed, 21 insertions(+), 47 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index f4a411964c41..afe0559d1867 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -30,32 +30,7 @@ #define RCU_TRACE(stmt) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -/* - * Process-level increment to ->dynticks_nesting field. This allows for - * architectures that use half-interrupts and half-exceptions from - * process context. - * - * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH - * that counts the number of process-based reasons why RCU cannot - * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE - * is the value used to increment or decrement this field. - * - * The rest of the bits could in principle be used to count interrupts, - * but this would mean that a negative-one value in the interrupt - * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field. - * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK - * that is set to DYNTICK_TASK_FLAG upon initial exit from idle. - * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon - * initial exit from idle. - */ -#define DYNTICK_TASK_NEST_WIDTH 7 -#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1) -#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1) -#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2) -#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3) -#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ - DYNTICK_TASK_FLAG) - +/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */ #define DYNTICK_IRQ_NONIDLE ((INT_MAX / 2) + 1) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fde0e840563f..d123474fe829 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -265,7 +265,7 @@ void rcu_bh_qs(void) #endif static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, + .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; @@ -813,6 +813,10 @@ static void rcu_eqs_enter_common(bool user) /* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. + * + * We crowbar the ->dynticks_nmi_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. */ static void rcu_eqs_enter(bool user) { @@ -821,11 +825,11 @@ static void rcu_eqs_enter(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); - if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + rdtp->dynticks_nesting == 0); + if (rdtp->dynticks_nesting == 1) rcu_eqs_enter_common(user); else - rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; + rdtp->dynticks_nesting--; } /** @@ -836,10 +840,6 @@ static void rcu_eqs_enter(bool user) * critical sections can occur in irq handlers in idle, a possibility * handled by irq_enter() and irq_exit().) * - * We crowbar the ->dynticks_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. - * * If you add or remove a call to rcu_idle_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ @@ -984,6 +984,10 @@ static void rcu_eqs_exit_common(long long newval, int user) /* * Exit an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. + * + * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to + * allow for the possibility of usermode upcalls messing up our count of + * interrupt nesting level during the busy period that is just now starting. */ static void rcu_eqs_exit(bool user) { @@ -994,12 +998,12 @@ static void rcu_eqs_exit(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); - if (oldval & DYNTICK_TASK_NEST_MASK) { - rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; + if (oldval) { + rdtp->dynticks_nesting++; } else { __this_cpu_inc(disable_rcu_irq_enter); - rcu_eqs_exit_common(DYNTICK_TASK_EXIT_IDLE, user); - rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rcu_eqs_exit_common(1, user); + rdtp->dynticks_nesting = 1; __this_cpu_dec(disable_rcu_irq_enter); WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } @@ -1011,11 +1015,6 @@ static void rcu_eqs_exit(bool user) * Exit idle mode, in other words, -enter- the mode in which RCU * read-side critical sections can occur. * - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to - * allow for the possibility of usermode upcalls messing up our count - * of interrupt nesting level during the busy period that is just - * now starting. - * * If you add or remove a call to rcu_idle_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ @@ -1219,7 +1218,8 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); */ static int rcu_is_cpu_rrupt_from_idle(void) { - return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; + return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 0 && + __this_cpu_read(rcu_dynticks.dynticks_nmi_nesting) <= 1; } /* @@ -3709,7 +3709,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); rdp->cpu = cpu; rdp->rsp = rsp; @@ -3738,7 +3738,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ !init_nocb_callback_list(rdp)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ - rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rdp->dynticks->dynticks_nesting = 1; rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 46a5d1991450..dbd7e3753bed 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -39,7 +39,6 @@ */ struct rcu_dynticks { long long dynticks_nesting; /* Track irq/process nesting level. */ - /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ From 844ccdd7dce2c1a6ea9b437fcf8c3265b136e4a5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Oct 2017 16:51:47 -0700 Subject: [PATCH 07/72] rcu: Eliminate rcu_irq_enter_disabled() Now that the irq path uses the rcu_nmi_{enter,exit}() algorithm, rcu_irq_enter() and rcu_irq_exit() may be used from any context. There is thus no need for rcu_irq_enter_disabled() and for the checks using it. This commit therefore eliminates rcu_irq_enter_disabled(). Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 1 - include/linux/rcutree.h | 1 - include/linux/tracepoint.h | 5 +---- kernel/rcu/tree.c | 22 ++-------------------- kernel/trace/trace.c | 11 ----------- 5 files changed, 3 insertions(+), 37 deletions(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index b3dbf9502fd0..ce9beec35e34 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -111,7 +111,6 @@ static inline void rcu_cpu_stall_reset(void) { } static inline void rcu_idle_enter(void) { } static inline void rcu_idle_exit(void) { } static inline void rcu_irq_enter(void) { } -static inline bool rcu_irq_enter_disabled(void) { return false; } static inline void rcu_irq_exit_irqson(void) { } static inline void rcu_irq_enter_irqson(void) { } static inline void rcu_irq_exit(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 37d6fd3b7ff8..fd996cdf1833 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -85,7 +85,6 @@ void rcu_irq_enter(void); void rcu_irq_exit(void); void rcu_irq_enter_irqson(void); void rcu_irq_exit_irqson(void); -bool rcu_irq_enter_disabled(void); void exit_rcu(void); diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index a26ffbe09e71..c94f466d57ef 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -137,11 +137,8 @@ extern void syscall_unregfunc(void); \ if (!(cond)) \ return; \ - if (rcucheck) { \ - if (WARN_ON_ONCE(rcu_irq_enter_disabled())) \ - return; \ + if (rcucheck) \ rcu_irq_enter_irqson(); \ - } \ rcu_read_lock_sched_notrace(); \ it_func_ptr = rcu_dereference_sched((tp)->funcs); \ if (it_func_ptr) { \ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d123474fe829..444aa2b3f24d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -270,20 +270,6 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; -/* - * There's a few places, currently just in the tracing infrastructure, - * that uses rcu_irq_enter() to make sure RCU is watching. But there's - * a small location where that will not even work. In those cases - * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter() - * can be called. - */ -static DEFINE_PER_CPU(bool, disable_rcu_irq_enter); - -bool rcu_irq_enter_disabled(void) -{ - return this_cpu_read(disable_rcu_irq_enter); -} - /* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state. @@ -792,10 +778,8 @@ static void rcu_eqs_enter_common(bool user) do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); - __this_cpu_inc(disable_rcu_irq_enter); - rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */ - rcu_dynticks_eqs_enter(); /* After this, tracing works again. */ - __this_cpu_dec(disable_rcu_irq_enter); + rdtp->dynticks_nesting = 0; + rcu_dynticks_eqs_enter(); rcu_dynticks_task_enter(); /* @@ -1001,10 +985,8 @@ static void rcu_eqs_exit(bool user) if (oldval) { rdtp->dynticks_nesting++; } else { - __this_cpu_inc(disable_rcu_irq_enter); rcu_eqs_exit_common(1, user); rdtp->dynticks_nesting = 1; - __this_cpu_dec(disable_rcu_irq_enter); WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73e67b68c53b..dbce1be3bab8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2682,17 +2682,6 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, if (unlikely(in_nmi())) return; - /* - * It is possible that a function is being traced in a - * location that RCU is not watching. A call to - * rcu_irq_enter() will make sure that it is, but there's - * a few internal rcu functions that could be traced - * where that wont work either. In those cases, we just - * do nothing. - */ - if (unlikely(rcu_irq_enter_disabled())) - return; - rcu_irq_enter_irqson(); __ftrace_trace_stack(buffer, flags, skip, pc, NULL); rcu_irq_exit_irqson(); From bd2b879a1ca55486fdb9dcac691bfd3dd79c83d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Oct 2017 12:29:01 -0700 Subject: [PATCH 08/72] rcu: Add tracing to irq/NMI dyntick-idle transitions Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 14 ++++++++------ kernel/rcu/tree.c | 6 ++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 59d40c454aa0..4674b21247f7 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -421,16 +421,18 @@ TRACE_EVENT(rcu_fqs, /* * Tracepoint for dyntick-idle entry/exit events. These take a string - * as argument: "Start" for entering dyntick-idle mode, "End" for - * leaving it, "--=" for events moving towards idle, and "++=" for events - * moving away from idle. "Error on entry: not idle task" and "Error on - * exit: not idle task" indicate that a non-idle task is erroneously + * as argument: "Start" for entering dyntick-idle mode, "Startirq" for + * entering it from irq/NMI, "End" for leaving it, "Endirq" for leaving it + * to irq/NMI, "--=" for events moving towards idle, and "++=" for events + * moving away from idle. "Error on entry: not idle task" and "Error + * on exit: not idle task" indicate that a non-idle task is erroneously * toying with the idle loop. * * These events also take a pair of numbers, which indicate the nesting * depth before and after the event of interest. Note that task-related - * events use the upper bits of each number, while interrupt-related - * events use the lower bits. + * and interrupt-related events use two separate counters, and that the + * "++=" and "--=" events for irq/NMI will change the counter by two, + * otherwise by one. */ TRACE_EVENT(rcu_dyntick, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 444aa2b3f24d..d069ba2d8412 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -880,12 +880,15 @@ void rcu_nmi_exit(void) * leave it in non-RCU-idle state. */ if (rdtp->dynticks_nmi_nesting != 1) { + trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, + rdtp->dynticks_nmi_nesting - 2); WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ rdtp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ + trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0); WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ rcu_dynticks_eqs_enter(); } @@ -1057,6 +1060,9 @@ void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); incby = 1; } + trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), + rdtp->dynticks_nmi_nesting, + rdtp->dynticks_nmi_nesting + incby); WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdtp->dynticks_nmi_nesting + incby); barrier(); From 84585aa8b6ad24e5bdfba9db4a320a6aeed192ab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Oct 2017 15:55:16 -0700 Subject: [PATCH 09/72] rcu: Shrink ->dynticks_{nmi_,}nesting from long long to long Because the ->dynticks_nesting field now only contains the process-based nesting level instead of a value encoding both the process nesting level and the irq "nesting" level, we no longer need a long long, even on 32-bit systems. This commit therefore changes both the ->dynticks_nesting and ->dynticks_nmi_nesting fields to long. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 8 ++++---- kernel/rcu/rcu.h | 2 +- kernel/rcu/tree.c | 6 +++--- kernel/rcu/tree.h | 4 ++-- kernel/rcu/tree_plugin.h | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 4674b21247f7..b0a48231ea0e 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -436,14 +436,14 @@ TRACE_EVENT(rcu_fqs, */ TRACE_EVENT(rcu_dyntick, - TP_PROTO(const char *polarity, long long oldnesting, long long newnesting), + TP_PROTO(const char *polarity, long oldnesting, long newnesting), TP_ARGS(polarity, oldnesting, newnesting), TP_STRUCT__entry( __field(const char *, polarity) - __field(long long, oldnesting) - __field(long long, newnesting) + __field(long, oldnesting) + __field(long, newnesting) ), TP_fast_assign( @@ -452,7 +452,7 @@ TRACE_EVENT(rcu_dyntick, __entry->newnesting = newnesting; ), - TP_printk("%s %llx %llx", __entry->polarity, + TP_printk("%s %lx %lx", __entry->polarity, __entry->oldnesting, __entry->newnesting) ); diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index afe0559d1867..6334f2c1abd0 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -31,7 +31,7 @@ #endif /* #else #ifdef CONFIG_RCU_TRACE */ /* Offset to allow for unmatched rcu_irq_{enter,exit}(). */ -#define DYNTICK_IRQ_NONIDLE ((INT_MAX / 2) + 1) +#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d069ba2d8412..92de3bacda07 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -946,7 +946,7 @@ void rcu_irq_exit_irqson(void) * we really have exited idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_eqs_exit_common(long long newval, int user) +static void rcu_eqs_exit_common(long newval, int user) { RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) @@ -979,7 +979,7 @@ static void rcu_eqs_exit_common(long long newval, int user) static void rcu_eqs_exit(bool user) { struct rcu_dynticks *rdtp; - long long oldval; + long oldval; lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); @@ -1043,7 +1043,7 @@ void rcu_user_exit(void) void rcu_nmi_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int incby = 2; + long incby = 2; /* Complain about underflow. */ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index dbd7e3753bed..6488a3b0e729 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -38,8 +38,8 @@ * Dynticks per-CPU state. */ struct rcu_dynticks { - long long dynticks_nesting; /* Track irq/process nesting level. */ - int dynticks_nmi_nesting; /* Track NMI nesting level. */ + long dynticks_nesting; /* Track process nesting level. */ + long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index db85ca3975f1..e94e754464cd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1687,7 +1687,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum; - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%ld softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], From dec98900eae1e22467182e58688abe5fae98bd5f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Oct 2017 16:24:29 -0700 Subject: [PATCH 10/72] rcu: Add ->dynticks field to rcu_dyntick trace event Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 13 ++++++++----- kernel/rcu/tree.c | 16 +++++++--------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index b0a48231ea0e..d103de9f8c10 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -436,24 +436,27 @@ TRACE_EVENT(rcu_fqs, */ TRACE_EVENT(rcu_dyntick, - TP_PROTO(const char *polarity, long oldnesting, long newnesting), + TP_PROTO(const char *polarity, long oldnesting, long newnesting, atomic_t dynticks), - TP_ARGS(polarity, oldnesting, newnesting), + TP_ARGS(polarity, oldnesting, newnesting, dynticks), TP_STRUCT__entry( __field(const char *, polarity) __field(long, oldnesting) __field(long, newnesting) + __field(int, dynticks) ), TP_fast_assign( __entry->polarity = polarity; __entry->oldnesting = oldnesting; __entry->newnesting = newnesting; + __entry->dynticks = atomic_read(&dynticks); ), - TP_printk("%s %lx %lx", __entry->polarity, - __entry->oldnesting, __entry->newnesting) + TP_printk("%s %lx %lx %#3x", __entry->polarity, + __entry->oldnesting, __entry->newnesting, + __entry->dynticks & 0xfff) ); /* @@ -801,7 +804,7 @@ TRACE_EVENT(rcu_barrier, grplo, grphi, gp_tasks) do { } \ while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) -#define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0) +#define trace_rcu_dyntick(polarity, oldnesting, newnesting, dyntick) do { } while (0) #define trace_rcu_prep_idle(reason) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen_lazy, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen_lazy, qlen) \ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 92de3bacda07..5febb76809f6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -761,13 +761,13 @@ static void rcu_eqs_enter_common(bool user) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); + trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0); + trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0, rdtp->dynticks); rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, @@ -880,15 +880,14 @@ void rcu_nmi_exit(void) * leave it in non-RCU-idle state. */ if (rdtp->dynticks_nmi_nesting != 1) { - trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, - rdtp->dynticks_nmi_nesting - 2); + trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, rdtp->dynticks_nmi_nesting - 2, rdtp->dynticks); WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ rdtp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0); + trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0, rdtp->dynticks); WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ rcu_dynticks_eqs_enter(); } @@ -953,14 +952,13 @@ static void rcu_eqs_exit_common(long newval, int user) rcu_dynticks_task_exit(); rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, newval); + trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, newval, rdtp->dynticks); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - trace_rcu_dyntick(TPS("Error on exit: not idle task"), - rdtp->dynticks_nesting, newval); + trace_rcu_dyntick(TPS("Error on exit: not idle task"), rdtp->dynticks_nesting, newval, rdtp->dynticks); rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, @@ -1062,7 +1060,7 @@ void rcu_nmi_enter(void) } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdtp->dynticks_nmi_nesting, - rdtp->dynticks_nmi_nesting + incby); + rdtp->dynticks_nmi_nesting + incby, rdtp->dynticks); WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdtp->dynticks_nmi_nesting + incby); barrier(); From 914955e18ca09fc404d7fc3614bb04c96a03692c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Oct 2017 13:50:57 -0700 Subject: [PATCH 11/72] rcu: Stop duplicating lockdep checks in RCU's idle-entry code The three RCU_LOCKDEP_WARN() calls in rcu_eqs_enter_common() are redundant with other lockdep checks, so this commit removes them. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5febb76809f6..80cada11f544 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -781,17 +781,6 @@ static void rcu_eqs_enter_common(bool user) rdtp->dynticks_nesting = 0; rcu_dynticks_eqs_enter(); rcu_dynticks_task_enter(); - - /* - * It is illegal to enter an extended quiescent state while - * in an RCU read-side critical section. - */ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), - "Illegal idle entry in RCU read-side critical section."); - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), - "Illegal idle entry in RCU-bh read-side critical section."); - RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), - "Illegal idle entry in RCU-sched read-side critical section."); } /* From 2342172fd6c148506456862d795c7f155baf6797 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Oct 2017 15:03:10 -0700 Subject: [PATCH 12/72] rcu: Avoid ->dynticks_nesting store tearing Although ->dynticks_nesting is updated only by process level, it is accessed from hardirq to check for interrupt-from-idle quiescent states. Store tearing is thus possible, so this commit applies WRITE_ONCE() to ->dynticks_nesting stores. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 80cada11f544..b2ded4d436c6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -778,7 +778,7 @@ static void rcu_eqs_enter_common(bool user) do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); - rdtp->dynticks_nesting = 0; + WRITE_ONCE(rdtp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ rcu_dynticks_eqs_enter(); rcu_dynticks_task_enter(); } @@ -976,7 +976,7 @@ static void rcu_eqs_exit(bool user) rdtp->dynticks_nesting++; } else { rcu_eqs_exit_common(1, user); - rdtp->dynticks_nesting = 1; + WRITE_ONCE(rdtp->dynticks_nesting, 1); WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } } @@ -3713,7 +3713,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ !init_nocb_callback_list(rdp)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ - rdp->dynticks->dynticks_nesting = 1; + rdp->dynticks->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ From 215bba9f59e35e64b9936da62632b2fa3ede647c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Oct 2017 16:37:03 -0700 Subject: [PATCH 13/72] rcu: Fold rcu_eqs_enter_common() into rcu_eqs_enter() There is now only one call to rcu_eqs_enter_common() and there is no other reason to keep it separate. This commit therefore inlines it into its sole call site, saving a few lines of code in the process. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 43 ++++++++++++++++--------------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b2ded4d436c6..5c8a5796c71f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -749,16 +749,27 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * rcu_eqs_enter_common - current CPU is entering an extended quiescent state + * Enter an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. * - * Enter idle, doing appropriate accounting. The caller must have - * disabled interrupts. + * We crowbar the ->dynticks_nmi_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. */ -static void rcu_eqs_enter_common(bool user) +static void rcu_eqs_enter(bool user) { struct rcu_state *rsp; struct rcu_data *rdp; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_dynticks *rdtp; + + rdtp = this_cpu_ptr(&rcu_dynticks); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + rdtp->dynticks_nesting == 0); + if (rdtp->dynticks_nesting != 1) { + rdtp->dynticks_nesting--; + return; + } lockdep_assert_irqs_disabled(); trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks); @@ -783,28 +794,6 @@ static void rcu_eqs_enter_common(bool user) rcu_dynticks_task_enter(); } -/* - * Enter an RCU extended quiescent state, which can be either the - * idle loop or adaptive-tickless usermode execution. - * - * We crowbar the ->dynticks_nmi_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. - */ -static void rcu_eqs_enter(bool user) -{ - struct rcu_dynticks *rdtp; - - rdtp = this_cpu_ptr(&rcu_dynticks); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting == 0); - if (rdtp->dynticks_nesting == 1) - rcu_eqs_enter_common(user); - else - rdtp->dynticks_nesting--; -} - /** * rcu_idle_enter - inform RCU that current CPU is entering idle * From 9dd238e28640d5514bbd0ff2d425f32409981d85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Oct 2017 16:56:26 -0700 Subject: [PATCH 14/72] rcu: Fold rcu_eqs_exit_common() into rcu_eqs_exit() There is now only one call to rcu_eqs_exit_common() and there is no other reason to keep it separate. This commit therefore inlines it into its sole call site, saving a few lines of code in the process. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 50 +++++++++++++++++------------------------------ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5c8a5796c71f..46a8e06bf03e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -916,34 +916,6 @@ void rcu_irq_exit_irqson(void) local_irq_restore(flags); } -/* - * rcu_eqs_exit_common - current CPU moving away from extended quiescent state - * - * If the new value of the ->dynticks_nesting counter was previously zero, - * we really have exited idle, and must do the appropriate accounting. - * The caller must have disabled interrupts. - */ -static void rcu_eqs_exit_common(long newval, int user) -{ - RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) - - rcu_dynticks_task_exit(); - rcu_dynticks_eqs_exit(); - rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, newval, rdtp->dynticks); - if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); - - trace_rcu_dyntick(TPS("Error on exit: not idle task"), rdtp->dynticks_nesting, newval, rdtp->dynticks); - rcu_ftrace_dump(DUMP_ORIG); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } -} - /* * Exit an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. @@ -963,11 +935,25 @@ static void rcu_eqs_exit(bool user) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { rdtp->dynticks_nesting++; - } else { - rcu_eqs_exit_common(1, user); - WRITE_ONCE(rdtp->dynticks_nesting, 1); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + return; } + rcu_dynticks_task_exit(); + rcu_dynticks_eqs_exit(); + rcu_cleanup_after_idle(); + trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, rdtp->dynticks); + if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !user && !is_idle_task(current)) { + struct task_struct *idle __maybe_unused = + idle_task(smp_processor_id()); + + trace_rcu_dyntick(TPS("Error on exit: not idle task"), rdtp->dynticks_nesting, 1, rdtp->dynticks); + rcu_ftrace_dump(DUMP_ORIG); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ + } + WRITE_ONCE(rdtp->dynticks_nesting, 1); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } /** From e68bbb266dcfed201d8d54a2828ef820d747f083 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Oct 2017 19:55:31 -0700 Subject: [PATCH 15/72] rcu: Simplify rcu_eqs_{enter,exit}() non-idle task debug code The code that checks for non-idle non-nohz_idle-usermode tasks invoking rcu_eqs_enter() and rcu_eqs_exit() prints a considerable quantity of helpful information. However, these checks fire rarely, so the extra complexity is no longer worth it. This commit therefore replaces this debug code with simple WARN_ON_ONCE() statements. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 12 +++++------- kernel/rcu/tree.c | 24 ++---------------------- 2 files changed, 7 insertions(+), 29 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index d103de9f8c10..adf47c635c8e 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -424,15 +424,13 @@ TRACE_EVENT(rcu_fqs, * as argument: "Start" for entering dyntick-idle mode, "Startirq" for * entering it from irq/NMI, "End" for leaving it, "Endirq" for leaving it * to irq/NMI, "--=" for events moving towards idle, and "++=" for events - * moving away from idle. "Error on entry: not idle task" and "Error - * on exit: not idle task" indicate that a non-idle task is erroneously - * toying with the idle loop. + * moving away from idle. * * These events also take a pair of numbers, which indicate the nesting - * depth before and after the event of interest. Note that task-related - * and interrupt-related events use two separate counters, and that the - * "++=" and "--=" events for irq/NMI will change the counter by two, - * otherwise by one. + * depth before and after the event of interest, and a third number that is + * the ->dynticks counter. Note that task-related and interrupt-related + * events use two separate counters, and that the "++=" and "--=" events + * for irq/NMI will change the counter by two, otherwise by one. */ TRACE_EVENT(rcu_dyntick, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 46a8e06bf03e..4d374d2bc925 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -773,17 +773,7 @@ static void rcu_eqs_enter(bool user) lockdep_assert_irqs_disabled(); trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks); - if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); - - trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0, rdtp->dynticks); - rcu_ftrace_dump(DUMP_ORIG); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); do_nocb_deferred_wakeup(rdp); @@ -941,17 +931,7 @@ static void rcu_eqs_exit(bool user) rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, rdtp->dynticks); - if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); - - trace_rcu_dyntick(TPS("Error on exit: not idle task"), rdtp->dynticks_nesting, 1, rdtp->dynticks); - rcu_ftrace_dump(DUMP_ORIG); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdtp->dynticks_nesting, 1); WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } From 3af3999b9a325d462c9353389b7507c4b7bc5428 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Oct 2017 13:48:14 -0700 Subject: [PATCH 16/72] doc: Update dyntick-idle design documentation for NMI/irq consolidation Signed-off-by: Paul E. McKenney --- .../Data-Structures/Data-Structures.html | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 38d6d800761f..1ac011de606e 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -1182,8 +1182,8 @@ CPU (and from tracing) unless otherwise stated. Its fields are as follows:
-  1   int dynticks_nesting;
-  2   int dynticks_nmi_nesting;
+  1   long dynticks_nesting;
+  2   long dynticks_nmi_nesting;
   3   atomic_t dynticks;
   4   bool rcu_need_heavy_qs;
   5   unsigned long rcu_qs_ctr;
@@ -1191,15 +1191,31 @@ Its fields are as follows:
 

The ->dynticks_nesting field counts the -nesting depth of normal interrupts. -In addition, this counter is incremented when exiting dyntick-idle -mode and decremented when entering it. +nesting depth of process execution, so that in normal circumstances +this counter has value zero or one. +NMIs, irqs, and tracers are counted by the ->dynticks_nmi_nesting +field. +Because NMIs cannot be masked, changes to this variable have to be +undertaken carefully using an algorithm provided by Andy Lutomirski. +The initial transition from idle adds one, and nested transitions +add two, so that a nesting level of five is represented by a +->dynticks_nmi_nesting value of nine. This counter can therefore be thought of as counting the number of reasons why this CPU cannot be permitted to enter dyntick-idle -mode, aside from non-maskable interrupts (NMIs). -NMIs are counted by the ->dynticks_nmi_nesting -field, except that NMIs that interrupt non-dyntick-idle execution -are not counted. +mode, aside from process-level transitions. + +

However, it turns out that when running in non-idle kernel context, +the Linux kernel is fully capable of entering interrupt handlers that +never exit and perhaps also vice versa. +Therefore, whenever the ->dynticks_nesting field is +incremented up from zero, the ->dynticks_nmi_nesting field +is set to a large positive number, and whenever the +->dynticks_nesting field is decremented down to zero, +the the ->dynticks_nmi_nesting field is set to zero. +Assuming that the number of misnested interrupts is not sufficient +to overflow the counter, this approach corrects the +->dynticks_nmi_nesting field every time the corresponding +CPU enters the idle loop from process context.

The ->dynticks field counts the corresponding CPU's transitions to and from dyntick-idle mode, so that this counter @@ -1231,14 +1247,16 @@ in response.   Quick Quiz: - Why not just count all NMIs? - Wouldn't that be simpler and less error prone? + Why not simply combine the ->dynticks_nesting + and ->dynticks_nmi_nesting counters into a + single counter that just counts the number of reasons that + the corresponding CPU is non-idle? Answer: - It seems simpler only until you think hard about how to go about - updating the rcu_dynticks structure's - ->dynticks field. + Because this would fail in the presence of interrupts whose + handlers never return and of handlers that manage to return + from a made-up interrupt.   From 584c005951866792d552f21f7445e8104ce10f9c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 12 Oct 2017 18:12:57 -0400 Subject: [PATCH 17/72] tracing, rcu: Remove no longer used trace event rcu_prep_idle Commit c0f4dfd4f90 ("rcu: Make RCU_FAST_NO_HZ take advantage of numbered callbacks") removed the only instances of trace_rcu_prep_idle, but did not remove the TRACE_EVENT() that creates it. As defined trace events take up memory within the kernel even when they are not used, this is a waste of space. Remove the obsolete event. Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 40 -------------------------------------- 1 file changed, 40 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index adf47c635c8e..9bafeaf4e0e0 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -457,45 +457,6 @@ TRACE_EVENT(rcu_dyntick, __entry->dynticks & 0xfff) ); -/* - * Tracepoint for RCU preparation for idle, the goal being to get RCU - * processing done so that the current CPU can shut off its scheduling - * clock and enter dyntick-idle mode. One way to accomplish this is - * to drain all RCU callbacks from this CPU, and the other is to have - * done everything RCU requires for the current grace period. In this - * latter case, the CPU will be awakened at the end of the current grace - * period in order to process the remainder of its callbacks. - * - * These tracepoints take a string as argument: - * - * "No callbacks": Nothing to do, no callbacks on this CPU. - * "In holdoff": Nothing to do, holding off after unsuccessful attempt. - * "Begin holdoff": Attempt failed, don't retry until next jiffy. - * "Dyntick with callbacks": Entering dyntick-idle despite callbacks. - * "Dyntick with lazy callbacks": Entering dyntick-idle w/lazy callbacks. - * "More callbacks": Still more callbacks, try again to clear them out. - * "Callbacks drained": All callbacks processed, off to dyntick idle! - * "Timer": Timer fired to cause CPU to continue processing callbacks. - * "Demigrate": Timer fired on wrong CPU, woke up correct CPU. - * "Cleanup after idle": Idle exited, timer canceled. - */ -TRACE_EVENT(rcu_prep_idle, - - TP_PROTO(const char *reason), - - TP_ARGS(reason), - - TP_STRUCT__entry( - __field(const char *, reason) - ), - - TP_fast_assign( - __entry->reason = reason; - ), - - TP_printk("%s", __entry->reason) -); - /* * Tracepoint for the registration of a single RCU callback function. * The first argument is the type of RCU, the second argument is @@ -803,7 +764,6 @@ TRACE_EVENT(rcu_barrier, while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) #define trace_rcu_dyntick(polarity, oldnesting, newnesting, dyntick) do { } while (0) -#define trace_rcu_prep_idle(reason) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen_lazy, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen_lazy, qlen) \ do { } while (0) From d633198088bd9e358566c470ed182994403acc7a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Oct 2017 13:52:30 -0700 Subject: [PATCH 18/72] srcu: Prohibit call_srcu() use under raw spinlocks Invoking queue_delayed_work() while holding a raw spinlock is forbidden in -rt kernels, which is exactly what __call_srcu() does, indirectly via srcu_funnel_gp_start(). This commit therefore downgrades Tree SRCU's locking from raw to non-raw spinlocks, which works because call_srcu() is not ever called while holding a raw spinlock. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 8 +-- kernel/rcu/srcutree.c | 109 ++++++++++++++++++++++++--------------- 2 files changed, 72 insertions(+), 45 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index a949f4f9e4d7..4eda108abee0 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -40,7 +40,7 @@ struct srcu_data { unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */ /* Update-side state. */ - raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp; + spinlock_t __private lock ____cacheline_internodealigned_in_smp; struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ @@ -58,7 +58,7 @@ struct srcu_data { * Node in SRCU combining tree, similar in function to rcu_data. */ struct srcu_node { - raw_spinlock_t __private lock; + spinlock_t __private lock; unsigned long srcu_have_cbs[4]; /* GP seq for children */ /* having CBs, but only */ /* is > ->srcu_gq_seq. */ @@ -78,7 +78,7 @@ struct srcu_struct { struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ - raw_spinlock_t __private lock; /* Protect counters */ + spinlock_t __private lock; /* Protect counters */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ unsigned int srcu_idx; /* Current rdr array element. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ @@ -107,7 +107,7 @@ struct srcu_struct { #define __SRCU_STRUCT_INIT(name) \ { \ .sda = &name##_srcu_data, \ - .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .srcu_gp_seq_needed = 0 - 1, \ __SRCU_DEP_MAP_INIT(name) \ } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6d5880089ff6..d5cea81378cc 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -53,6 +53,33 @@ static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); static void process_srcu(struct work_struct *work); +/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ +#define spin_lock_rcu_node(p) \ +do { \ + spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) + +#define spin_lock_irq_rcu_node(p) \ +do { \ + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define spin_unlock_irq_rcu_node(p) \ + spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) + +#define spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define spin_unlock_irqrestore_rcu_node(p, flags) \ + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ + /* * Initialize SRCU combining tree. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and @@ -77,7 +104,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) /* Each pass through this loop initializes one srcu_node structure. */ rcu_for_each_node_breadth_first(sp, snp) { - raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); + spin_lock_init(&ACCESS_PRIVATE(snp, lock)); WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { @@ -111,7 +138,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) snp_first = sp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; @@ -170,7 +197,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); + spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -187,7 +214,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *sp) { - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); + spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -210,13 +237,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp) /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - raw_spin_lock_irqsave_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(sp, flags); if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); return; } init_srcu_struct_fields(sp, true); - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -513,7 +540,7 @@ static void srcu_gp_end(struct srcu_struct *sp) mutex_lock(&sp->srcu_cb_mutex); /* End the current grace period. */ - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); idx = rcu_seq_state(sp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(sp); @@ -522,7 +549,7 @@ static void srcu_gp_end(struct srcu_struct *sp) gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) sp->srcu_gp_seq_needed_exp = gpseq; - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -530,7 +557,7 @@ static void srcu_gp_end(struct srcu_struct *sp) idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); rcu_for_each_node_breadth_first(sp, snp) { - raw_spin_lock_irq_rcu_node(snp); + spin_lock_irq_rcu_node(snp); cbs = false; if (snp >= sp->level[rcu_num_lvls - 1]) cbs = snp->srcu_have_cbs[idx] == gpseq; @@ -540,7 +567,7 @@ static void srcu_gp_end(struct srcu_struct *sp) snp->srcu_gp_seq_needed_exp = gpseq; mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; - raw_spin_unlock_irq_rcu_node(snp); + spin_unlock_irq_rcu_node(snp); if (cbs) srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); @@ -548,11 +575,11 @@ static void srcu_gp_end(struct srcu_struct *sp) if (!(gpseq & counter_wrap_check)) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { sdp = per_cpu_ptr(sp->sda, cpu); - raw_spin_lock_irqsave_rcu_node(sdp, flags); + spin_lock_irqsave_rcu_node(sdp, flags); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; - raw_spin_unlock_irqrestore_rcu_node(sdp, flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); } } @@ -560,17 +587,17 @@ static void srcu_gp_end(struct srcu_struct *sp) mutex_unlock(&sp->srcu_cb_mutex); /* Start a new grace period if needed. */ - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (!rcu_seq_state(gpseq) && ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { srcu_gp_start(sp); - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); /* Throttle expedited grace periods: Should be rare! */ srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff ? 0 : SRCU_INTERVAL); } else { - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); } } @@ -590,18 +617,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, if (rcu_seq_done(&sp->srcu_gp_seq, s) || ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) return; - raw_spin_lock_irqsave_rcu_node(snp, flags); + spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); return; } WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); } - raw_spin_lock_irqsave_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(sp, flags); if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) sp->srcu_gp_seq_needed_exp = s; - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -623,12 +650,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, for (; snp != NULL; snp = snp->srcu_parent) { if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) return; /* GP already done and CBs recorded. */ - raw_spin_lock_irqsave_rcu_node(snp, flags); + spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { snp_seq = snp->srcu_have_cbs[idx]; if (snp == sdp->mynode && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); if (snp == sdp->mynode && snp_seq != s) { srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL @@ -644,11 +671,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) snp->srcu_gp_seq_needed_exp = s; - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); } /* Top of tree, must ensure the grace period will be started. */ - raw_spin_lock_irqsave_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(sp, flags); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -667,7 +694,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, queue_delayed_work(system_power_efficient_wq, &sp->work, srcu_get_delay(sp)); } - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -830,7 +857,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rhp->func = func; local_irq_save(flags); sdp = this_cpu_ptr(sp->sda); - raw_spin_lock_rcu_node(sdp); + spin_lock_rcu_node(sdp); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -844,7 +871,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, sdp->srcu_gp_seq_needed_exp = s; needexp = true; } - raw_spin_unlock_irqrestore_rcu_node(sdp, flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); if (needgp) srcu_funnel_gp_start(sp, sdp, s, do_norm); else if (needexp) @@ -900,7 +927,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) /* * Make sure that later code is ordered after the SRCU grace - * period. This pairs with the raw_spin_lock_irq_rcu_node() + * period. This pairs with the spin_lock_irq_rcu_node() * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed * because the current CPU might have been totally uninvolved with * (and thus unordered against) that grace period. @@ -1024,7 +1051,7 @@ void srcu_barrier(struct srcu_struct *sp) */ for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - raw_spin_lock_irq_rcu_node(sdp); + spin_lock_irq_rcu_node(sdp); atomic_inc(&sp->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); @@ -1033,7 +1060,7 @@ void srcu_barrier(struct srcu_struct *sp) debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&sp->srcu_barrier_cpu_cnt); } - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); } /* Remove the initial count, at which point reaching zero can happen. */ @@ -1082,17 +1109,17 @@ static void srcu_advance_state(struct srcu_struct *sp) */ idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(sp); - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); if (idx != SRCU_STATE_IDLE) { mutex_unlock(&sp->srcu_gp_mutex); return; /* Someone else started the grace period. */ @@ -1141,19 +1168,19 @@ static void srcu_invoke_callbacks(struct work_struct *work) sdp = container_of(work, struct srcu_data, work.work); sp = sdp->sp; rcu_cblist_init(&ready_cbs); - raw_spin_lock_irq_rcu_node(sdp); + spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); return; /* Someone else on the job or nothing to do. */ } /* We are on the job! Extract and invoke ready callbacks. */ sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); @@ -1166,13 +1193,13 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ - raw_spin_lock_irq_rcu_node(sdp); + spin_lock_irq_rcu_node(sdp); rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&sp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); if (more) srcu_schedule_cbs_sdp(sdp, 0); } @@ -1185,7 +1212,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { bool pushgp = true; - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ @@ -1195,7 +1222,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* Outstanding request and no GP. Start one. */ srcu_gp_start(sp); } - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); if (pushgp) queue_delayed_work(system_power_efficient_wq, &sp->work, delay); From dac95906003fec1b4801115830cc14ec61c74960 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Oct 2017 11:23:10 -0700 Subject: [PATCH 19/72] torture: Suppress CPU stall warnings during shutdown ftrace dump The torture_shutdown() function directly invokes ftrace_dump(), which can result in RCU CPU stall warnings when the ftrace buffer is large, which it usually is. This commit therefore invoks rcu_ftrace_dump() in place of ftrace_dump(), suppressing RCU CPU stall warnings during this time. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/torture.c b/kernel/torture.c index 637e172835d8..52781e838541 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -47,6 +47,7 @@ #include #include #include +#include "rcu/rcu.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); @@ -500,7 +501,7 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); - ftrace_dump(DUMP_ALL); + rcu_ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } From cf8d8b00518d9228d603fcd17de47c31deb70b8f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Oct 2017 11:00:33 -0700 Subject: [PATCH 20/72] torture: Prepare scripting for shift from %p to %pK Because %p prints "(null)" and %pK prints "0000000000000000" or (on 32-bit systems) "00000000", this commit adjusts torture-test scripting accordingly. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/parse-torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/parse-torture.sh b/tools/testing/selftests/rcutorture/bin/parse-torture.sh index f12c38909b00..5987e50cfeb4 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-torture.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-torture.sh @@ -55,7 +55,7 @@ then exit fi -grep --binary-files=text 'torture:.*ver:' $file | grep --binary-files=text -v '(null)' | sed -e 's/^(initramfs)[^]]*] //' -e 's/^\[[^]]*] //' | +grep --binary-files=text 'torture:.*ver:' $file | egrep --binary-files=text -v '\(null\)|rtc: 000000000* ' | sed -e 's/^(initramfs)[^]]*] //' -e 's/^\[[^]]*] //' | awk ' BEGIN { ver = 0; From a0982dfa03efca6c239c52cabebcea4afb93ea6b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Oct 2017 16:24:28 -0700 Subject: [PATCH 21/72] sched: Stop resched_cpu() from sending IPIs to offline CPUs The rcutorture test suite occasionally provokes a splat due to invoking resched_cpu() on an offline CPU: WARNING: CPU: 2 PID: 8 at /home/paulmck/public_git/linux-rcu/arch/x86/kernel/smp.c:128 native_smp_send_reschedule+0x37/0x40 Modules linked in: CPU: 2 PID: 8 Comm: rcu_preempt Not tainted 4.14.0-rc4+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff902ede9daf00 task.stack: ffff96c50010c000 RIP: 0010:native_smp_send_reschedule+0x37/0x40 RSP: 0018:ffff96c50010fdb8 EFLAGS: 00010096 RAX: 000000000000002e RBX: ffff902edaab4680 RCX: 0000000000000003 RDX: 0000000080000003 RSI: 0000000000000000 RDI: 00000000ffffffff RBP: ffff96c50010fdb8 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000000 R11: 00000000299f36ae R12: 0000000000000001 R13: ffffffff9de64240 R14: 0000000000000001 R15: ffffffff9de64240 FS: 0000000000000000(0000) GS:ffff902edfc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000f7d4c642 CR3: 000000001e0e2000 CR4: 00000000000006e0 Call Trace: resched_curr+0x8f/0x1c0 resched_cpu+0x2c/0x40 rcu_implicit_dynticks_qs+0x152/0x220 force_qs_rnp+0x147/0x1d0 ? sync_rcu_exp_select_cpus+0x450/0x450 rcu_gp_kthread+0x5a9/0x950 kthread+0x142/0x180 ? force_qs_rnp+0x1d0/0x1d0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x27/0x40 Code: 14 01 0f 92 c0 84 c0 74 14 48 8b 05 14 4f f4 00 be fd 00 00 00 ff 90 a0 00 00 00 5d c3 89 fe 48 c7 c7 38 89 ca 9d e8 e5 56 08 00 <0f> ff 5d c3 0f 1f 44 00 00 8b 05 52 9e 37 02 85 c0 75 38 55 48 ---[ end trace 26df9e5df4bba4ac ]--- This splat cannot be generated by expedited grace periods because they always invoke resched_cpu() on the current CPU, which is good because expedited grace periods require that resched_cpu() unconditionally succeed. However, other parts of RCU can tolerate resched_cpu() acting as a no-op, at least as long as it doesn't happen too often. This commit therefore makes resched_cpu() invoke resched_curr() only if the CPU is either online or is the current CPU. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75554f366fd3..c85dfb746f8c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -508,7 +508,8 @@ void resched_cpu(int cpu) unsigned long flags; raw_spin_lock_irqsave(&rq->lock, flags); - resched_curr(rq); + if (cpu_online(cpu) || cpu == smp_processor_id()) + resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } From 2fe2582649aa2355f79acddb86bd4d6c5363eb63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Oct 2017 17:00:18 -0700 Subject: [PATCH 22/72] sched: Stop switched_to_rt() from sending IPIs to offline CPUs The rcutorture test suite occasionally provokes a splat due to invoking rt_mutex_lock() which needs to boost the priority of a task currently sitting on a runqueue that belongs to an offline CPU: WARNING: CPU: 0 PID: 12 at /home/paulmck/public_git/linux-rcu/arch/x86/kernel/smp.c:128 native_smp_send_reschedule+0x37/0x40 Modules linked in: CPU: 0 PID: 12 Comm: rcub/7 Not tainted 4.14.0-rc4+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff9ed3de5f8cc0 task.stack: ffffbbf80012c000 RIP: 0010:native_smp_send_reschedule+0x37/0x40 RSP: 0018:ffffbbf80012fd10 EFLAGS: 00010082 RAX: 000000000000002f RBX: ffff9ed3dd9cb300 RCX: 0000000000000004 RDX: 0000000080000004 RSI: 0000000000000086 RDI: 00000000ffffffff RBP: ffffbbf80012fd10 R08: 000000000009da7a R09: 0000000000007b9d R10: 0000000000000001 R11: ffffffffbb57c2cd R12: 000000000000000d R13: ffff9ed3de5f8cc0 R14: 0000000000000061 R15: ffff9ed3ded59200 FS: 0000000000000000(0000) GS:ffff9ed3dea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000080686f0 CR3: 000000001b9e0000 CR4: 00000000000006f0 Call Trace: resched_curr+0x61/0xd0 switched_to_rt+0x8f/0xa0 rt_mutex_setprio+0x25c/0x410 task_blocks_on_rt_mutex+0x1b3/0x1f0 rt_mutex_slowlock+0xa9/0x1e0 rt_mutex_lock+0x29/0x30 rcu_boost_kthread+0x127/0x3c0 kthread+0x104/0x140 ? rcu_report_unblock_qs_rnp+0x90/0x90 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x22/0x30 Code: f0 00 0f 92 c0 84 c0 74 14 48 8b 05 34 74 c5 00 be fd 00 00 00 ff 90 a0 00 00 00 5d c3 89 fe 48 c7 c7 a0 c6 fc b9 e8 d5 b5 06 00 <0f> ff 5d c3 0f 1f 44 00 00 8b 05 a2 d1 13 02 85 c0 75 38 55 48 But the target task's priority has already been adjusted, so the only purpose of switched_to_rt() invoking resched_curr() is to wake up the CPU running some task that needs to be preempted by the boosted task. But the CPU is offline, which presumably means that the task must be migrated to some other CPU, and that this other CPU will undertake any needed preemption at the time of migration. Because the runqueue lock is held when resched_curr() is invoked, we know that the boosted task cannot go anywhere, so it is not necessary to invoke resched_curr() in this particular case. This commit therefore makes switched_to_rt() refrain from invoking resched_curr() when the target CPU is offline. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4056c19ca3f0..f242f642ef53 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2206,7 +2206,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) queue_push_tasks(rq); #endif /* CONFIG_SMP */ - if (p->prio < rq->curr->prio) + if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } } From ffa53c5863ddb265f9a25729023f4d0409cdacf7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Oct 2017 06:48:39 -0700 Subject: [PATCH 23/72] netfilter: Eliminate cond_resched_rcu_qs() in favor of cond_resched() Now that cond_resched() also provides RCU quiescent states when needed, it can be used in place of cond_resched_rcu_qs(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: "David S. Miller" Cc: --- net/netfilter/nf_conntrack_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 85f643c1e227..4efaa3066c78 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1044,7 +1044,7 @@ static void gc_worker(struct work_struct *work) * we will just continue with next hash slot. */ rcu_read_unlock(); - cond_resched_rcu_qs(); + cond_resched(); } while (++buckets < goal); if (gc_work->exiting) From 50d4fb781287b47c4c2d455e3395783c1f06a3a5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Oct 2017 08:22:18 -0700 Subject: [PATCH 24/72] mm: Eliminate cond_resched_rcu_qs() in favor of cond_resched() Now that cond_resched() also provides RCU quiescent states when needed, it can be used in place of cond_resched_rcu_qs(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Andrew Morton Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka --- mm/mlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mlock.c b/mm/mlock.c index 30472d438794..f7f54fd2e13f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -779,7 +779,7 @@ static int apply_mlockall_flags(int flags) /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); - cond_resched_rcu_qs(); + cond_resched(); } out: return 0; From a7e6425ea5816947d3cb51fbf57351207c074383 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Oct 2017 08:25:02 -0700 Subject: [PATCH 25/72] workqueue: Eliminate cond_resched_rcu_qs() in favor of cond_resched() Now that cond_resched() also provides RCU quiescent states when needed, it can be used in place of cond_resched_rcu_qs(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8fdb710bfdd7..aee7eaab05cb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2135,7 +2135,7 @@ __acquires(&pool->lock) * stop_machine. At the same time, report a quiescent RCU state so * the same condition doesn't freeze RCU. */ - cond_resched_rcu_qs(); + cond_resched(); spin_lock_irq(&pool->lock); From e31d28b6ab8f0431e2288edb02723269f54d1471 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Oct 2017 08:26:32 -0700 Subject: [PATCH 26/72] trace: Eliminate cond_resched_rcu_qs() in favor of cond_resched() Now that cond_resched() also provides RCU quiescent states when needed, it can be used in place of cond_resched_rcu_qs(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Steven Rostedt Cc: Ingo Molnar --- kernel/trace/trace_benchmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 79f838a75077..22fee766081b 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -165,7 +165,7 @@ static int benchmark_event_kthread(void *arg) * this thread will never voluntarily schedule which would * block synchronize_rcu_tasks() indefinitely. */ - cond_resched_rcu_qs(); + cond_resched(); } return 0; From edf22f4ca26babcd8cba4f049c6be53f0e73bcc1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Oct 2017 08:31:12 -0700 Subject: [PATCH 27/72] softirq: Eliminate cond_resched_rcu_qs() in favor of cond_resched() Now that cond_resched() also provides RCU quiescent states when needed, it can be used in place of cond_resched_rcu_qs(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: NeilBrown Cc: Ingo Molnar --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index 2f5e87f1bae2..24d243ef8e71 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -665,7 +665,7 @@ static void run_ksoftirqd(unsigned int cpu) */ __do_softirq(); local_irq_enable(); - cond_resched_rcu_qs(); + cond_resched(); return; } local_irq_enable(); From 388a4c88064e7e62602b4d92ca127f0b0c9b305a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Oct 2017 08:39:34 -0700 Subject: [PATCH 28/72] fs: Eliminate cond_resched_rcu_qs() in favor of cond_resched() Now that cond_resched() also provides RCU quiescent states when needed, it can be used in place of cond_resched_rcu_qs(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Alexander Viro Cc: --- fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/file.c b/fs/file.c index 3b080834b870..fc0eeb812e2c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -391,7 +391,7 @@ static struct fdtable *close_files(struct files_struct * files) struct file * file = xchg(&fdt->fd[i], NULL); if (file) { filp_close(file, files); - cond_resched_rcu_qs(); + cond_resched(); } } i++; From f2b1760aedba1d8394636ba31b9e864e82527528 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Oct 2017 08:42:41 -0700 Subject: [PATCH 29/72] doc: Eliminate cond_resched_rcu_qs() in favor of cond_resched() Now that cond_resched() also provides RCU quiescent states when needed, it can be used in place of cond_resched_rcu_qs(). This commit therefore documents this change. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Data-Structures/Data-Structures.html | 3 ++- .../RCU/Design/Requirements/Requirements.html | 4 ++-- Documentation/RCU/stallwarn.txt | 10 ++++------ 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 38d6d800761f..412466e4967a 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -1097,7 +1097,8 @@ will cause the CPU to disregard the values of its counters on its next exit from idle. Finally, the rcu_qs_ctr_snap field is used to detect cases where a given operation has resulted in a quiescent state -for all flavors of RCU, for example, cond_resched_rcu_qs(). +for all flavors of RCU, for example, cond_resched() +when RCU has indicated a need for quiescent states.

RCU Callback Handling
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 62e847bcdcdd..0372e6c54eef 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2797,7 +2797,7 @@ RCU must avoid degrading real-time response for CPU-bound threads, whether executing in usermode (which is one use case for CONFIG_NO_HZ_FULL=y) or in the kernel. That said, CPU-bound loops in the kernel must execute -cond_resched_rcu_qs() at least once per few tens of milliseconds +cond_resched() at least once per few tens of milliseconds in order to avoid receiving an IPI from RCU.

@@ -3128,7 +3128,7 @@ The solution, in the form of is to have implicit read-side critical sections that are delimited by voluntary context switches, that is, calls to schedule(), -cond_resched_rcu_qs(), and +cond_resched(), and synchronize_rcu_tasks(). In addition, transitions to and from userspace execution also delimit tasks-RCU read-side critical sections. diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index a08f928c8557..4259f95c3261 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -23,12 +23,10 @@ o A CPU looping with preemption disabled. This condition can o A CPU looping with bottom halves disabled. This condition can result in RCU-sched and RCU-bh stalls. -o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the - kernel without invoking schedule(). Note that cond_resched() - does not necessarily prevent RCU CPU stall warnings. Therefore, - if the looping in the kernel is really expected and desirable - behavior, you might need to replace some of the cond_resched() - calls with calls to cond_resched_rcu_qs(). +o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel + without invoking schedule(). If the looping in the kernel is + really expected and desirable behavior, you might need to add + some calls to cond_resched(). o Booting Linux using a console connection that is too slow to keep up with the boot-time console-message rate. For example, From dc259accec0845ddf56e87337c0b211026eca0ae Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 5 Nov 2017 05:51:43 -0800 Subject: [PATCH 30/72] rcu: Account for rcu_all_qs() in cond_resched() If cond_resched() returns false, then it has already invoked rcu_all_qs(). This is also invoked (now redundantly) by rcu_note_voluntary_context_switch(). This commit therefore changes cond_resched_rcu_qs() to invoke rcu_note_voluntary_context_switch_lite() instead of rcu_note_voluntary_context_switch() to avoid the redundant invocation of rcu_all_qs(). Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a6ddc42f87a5..7bd8b5a6db10 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -197,7 +197,7 @@ static inline void exit_tasks_rcu_finish(void) { } #define cond_resched_rcu_qs() \ do { \ if (!cond_resched()) \ - rcu_note_voluntary_context_switch(current); \ + rcu_note_voluntary_context_switch_lite(current); \ } while (0) /* From 40555946447a394889243e4393e312f65d847e1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 09:15:21 -0700 Subject: [PATCH 31/72] doc: READ_ONCE() now implies smp_barrier_depends() This commit updates an example in memory-barriers.txt to account for the fact that READ_ONCE() now implies smp_barrier_depends(). Signed-off-by: Paul E. McKenney [ paulmck: Added MEMORY_BARRIER instructions from DEC Alpha from READ_ONCE(), per David Howells's feedback. ] --- Documentation/memory-barriers.txt | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 479ecec80593..13fd35b6a597 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -227,17 +227,20 @@ There are some minimal guarantees that may be expected of a CPU: (*) On any given CPU, dependent memory accesses will be issued in order, with respect to itself. This means that for: - Q = READ_ONCE(P); smp_read_barrier_depends(); D = READ_ONCE(*Q); + Q = READ_ONCE(P); D = READ_ONCE(*Q); the CPU will issue the following memory operations: Q = LOAD P, D = LOAD *Q - and always in that order. On most systems, smp_read_barrier_depends() - does nothing, but it is required for DEC Alpha. The READ_ONCE() - is required to prevent compiler mischief. Please note that you - should normally use something like rcu_dereference() instead of - open-coding smp_read_barrier_depends(). + and always in that order. However, on DEC Alpha, READ_ONCE() also + emits a memory-barrier instruction, so that a DEC Alpha CPU will + instead issue the following memory operations: + + Q = LOAD P, MEMORY_BARRIER, D = LOAD *Q, MEMORY_BARRIER + + Whether on DEC Alpha or not, the READ_ONCE() also prevents compiler + mischief. (*) Overlapping loads and stores within a particular CPU will appear to be ordered within that CPU. This means that for: From a4bd78ed215873a68869e41fd59543be8ca38e7f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 09:17:49 -0700 Subject: [PATCH 32/72] mn10300: READ_ONCE() now implies smp_read_barrier_depends() Given that READ_ONCE() now implies smp_read_barrier_depends(), there is no need for the open-coded smp_read_barrier_depends() in mn10300_serial_receive_interrupt() and mn10300_serial_poll_get_char(). This commit therefore removes them, but replaces them with comments calling out that carrying dependencies through non-pointers is quite dangerous. Compilers simply know too much about integers. Signed-off-by: Paul E. McKenney Cc: David Howells Cc: Mark Rutland Cc: --- arch/mn10300/kernel/mn10300-serial.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/mn10300/kernel/mn10300-serial.c b/arch/mn10300/kernel/mn10300-serial.c index d7ef1232a82a..4994b570dfd9 100644 --- a/arch/mn10300/kernel/mn10300-serial.c +++ b/arch/mn10300/kernel/mn10300-serial.c @@ -550,7 +550,7 @@ try_again: return; } - smp_read_barrier_depends(); + /* READ_ONCE() enforces dependency, but dangerous through integer!!! */ ch = port->rx_buffer[ix++]; st = port->rx_buffer[ix++]; smp_mb(); @@ -1728,7 +1728,10 @@ static int mn10300_serial_poll_get_char(struct uart_port *_port) if (CIRC_CNT(port->rx_inp, ix, MNSC_BUFFER_SIZE) == 0) return NO_POLL_CHAR; - smp_read_barrier_depends(); + /* + * READ_ONCE() enforces dependency, but dangerous + * through integer!!! + */ ch = port->rx_buffer[ix++]; st = port->rx_buffer[ix++]; smp_mb(); From cb7e125e03274cffa97d74433c876765efffaf6a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 09:26:25 -0700 Subject: [PATCH 33/72] drivers/net/ethernet/qlogic/qed: Fix __qed_spq_block() ordering The __qed_spq_block() function expects an smp_read_barrier_depends() to order a prior READ_ONCE() against a later load that does not depend on the prior READ_ONCE(), an expectation that can fail to be met. This commit therefore replaces the READ_ONCE() with smp_load_acquire() and removes the smp_read_barrier_depends(). Signed-off-by: Paul E. McKenney Cc: Ariel Elior Cc: Cc: --- drivers/net/ethernet/qlogic/qed/qed_spq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c index be48d9abd001..c1237ec58b6c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_spq.c +++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c @@ -97,9 +97,7 @@ static int __qed_spq_block(struct qed_hwfn *p_hwfn, while (iter_cnt--) { /* Validate we receive completion update */ - if (READ_ONCE(comp_done->done) == 1) { - /* Read updated FW return value */ - smp_read_barrier_depends(); + if (smp_load_acquire(&comp_done->done) == 1) { /* ^^^ */ if (p_fw_ret) *p_fw_ret = comp_done->fw_return_code; return 0; From 7088efa9137a15d7d21e3abce73e40c9c8a18d68 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 10:04:27 -0700 Subject: [PATCH 34/72] fs/dcache: Use release-acquire for name/length update The code in __d_alloc() carefully orders filling in the NUL character of the name (and the length, hash, and the name itself) with assigning of the name itself. However, prepend_name() does not order the accesses to the ->name and ->len fields, other than on TSO systems. This commit therefore replaces prepend_name()'s READ_ONCE() of ->name with an smp_load_acquire(), which orders against the subsequent READ_ONCE() of ->len. Because READ_ONCE() now incorporates smp_read_barrier_depends(), prepend_name()'s smp_read_barrier_depends() is removed. Finally, to save a line, the smp_wmb()/store pair in __d_alloc() is replaced by smp_store_release(). Signed-off-by: Paul E. McKenney Cc: Alexander Viro Cc: --- fs/dcache.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 5c7df1df81ff..379dce86f001 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1636,8 +1636,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ - smp_wmb(); - dentry->d_name.name = dname; + smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ dentry->d_lockref.count = 1; dentry->d_flags = 0; @@ -3047,17 +3046,14 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) * retry it again when a d_move() does happen. So any garbage in the buffer * due to mismatched pointer and length will be discarded. * - * Data dependency barrier is needed to make sure that we see that terminating - * NUL. Alpha strikes again, film at 11... + * Load acquire is needed to make sure that we see that terminating NUL. */ static int prepend_name(char **buffer, int *buflen, const struct qstr *name) { - const char *dname = READ_ONCE(name->name); + const char *dname = smp_load_acquire(&name->name); /* ^^^ */ u32 dlen = READ_ONCE(name->len); char *p; - smp_read_barrier_depends(); - *buflen -= dlen + 1; if (*buflen < 0) return -ENAMETOOLONG; From b393e8b33efd2ee08576ceddc10c2b4bfb3b5435 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 10:20:44 -0700 Subject: [PATCH 35/72] percpu: READ_ONCE() now implies smp_read_barrier_depends() Because READ_ONCE() now implies smp_read_barrier_depends(), this commit removes the now-redundant smp_read_barrier_depends() following the READ_ONCE() in __ref_is_percpu(). Signed-off-by: Paul E. McKenney Acked-by: Tejun Heo Cc: Christoph Lameter --- include/linux/percpu-refcount.h | 6 +++--- lib/percpu-refcount.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 6658d9ee5257..864d167a1073 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -139,12 +139,12 @@ static inline bool __ref_is_percpu(struct percpu_ref *ref, * when using it as a pointer, __PERCPU_REF_ATOMIC may be set in * between contaminating the pointer value, meaning that * READ_ONCE() is required when fetching it. + * + * The smp_read_barrier_depends() implied by READ_ONCE() pairs + * with smp_store_release() in __percpu_ref_switch_to_percpu(). */ percpu_ptr = READ_ONCE(ref->percpu_count_ptr); - /* paired with smp_store_release() in __percpu_ref_switch_to_percpu() */ - smp_read_barrier_depends(); - /* * Theoretically, the following could test just ATOMIC; however, * then we'd have to mask off DEAD separately as DEAD may be diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index fe03c6d52761..30e7dd88148b 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -197,10 +197,10 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); /* - * Restore per-cpu operation. smp_store_release() is paired with - * smp_read_barrier_depends() in __ref_is_percpu() and guarantees - * that the zeroing is visible to all percpu accesses which can see - * the following __PERCPU_REF_ATOMIC clearing. + * Restore per-cpu operation. smp_store_release() is paired + * with READ_ONCE() in __ref_is_percpu() and guarantees that the + * zeroing is visible to all percpu accesses which can see the + * following __PERCPU_REF_ATOMIC clearing. */ for_each_possible_cpu(cpu) *per_cpu_ptr(percpu_count, cpu) = 0; From 137f61f6528f2bd552a75c59567d29db2857af97 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 10:30:58 -0700 Subject: [PATCH 36/72] rcu: Adjust read-side accessor comments for READ_ONCE() Now that READ_ONCE() implies smp_read_barrier_depends(), the commit updates now-misleading comments to account for this change. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a6ddc42f87a5..000432b87e5a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -433,12 +433,12 @@ static inline void rcu_preempt_sleep_check(void) { } * @p: The pointer to read * * Return the value of the specified RCU-protected pointer, but omit the - * smp_read_barrier_depends() and keep the READ_ONCE(). This is useful - * when the value of this pointer is accessed, but the pointer is not - * dereferenced, for example, when testing an RCU-protected pointer against - * NULL. Although rcu_access_pointer() may also be used in cases where - * update-side locks prevent the value of the pointer from changing, you - * should instead use rcu_dereference_protected() for this use case. + * lockdep checks for being in an RCU read-side critical section. This is + * useful when the value of this pointer is accessed, but the pointer is + * not dereferenced, for example, when testing an RCU-protected pointer + * against NULL. Although rcu_access_pointer() may also be used in cases + * where update-side locks prevent the value of the pointer from changing, + * you should instead use rcu_dereference_protected() for this use case. * * It is also permissible to use rcu_access_pointer() when read-side * access to the pointer was removed at least one grace period ago, as @@ -521,12 +521,11 @@ static inline void rcu_preempt_sleep_check(void) { } * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit - * both the smp_read_barrier_depends() and the READ_ONCE(). This - * is useful in cases where update-side locks prevent the value of the - * pointer from changing. Please note that this primitive does *not* - * prevent the compiler from repeating this reference or combining it - * with other references, so it should not be used without protection - * of appropriate locks. + * the READ_ONCE(). This is useful in cases where update-side locks + * prevent the value of the pointer from changing. Please note that this + * primitive does *not* prevent the compiler from repeating this reference + * or combining it with other references, so it should not be used without + * protection of appropriate locks. * * This function is only for update-side use. Using this function * when protected only by rcu_read_lock() will result in infrequent From 1ba9c5e6c615e8aca9041e27c40f25569704ae72 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 10:37:22 -0700 Subject: [PATCH 37/72] rtnetlink: Update now-misleading smp_read_barrier_depends() comment Now that READ_ONCE() implies smp_read_barrier_depends(), update the rtnl_dereference() header comment accordingly. Signed-off-by: Paul E. McKenney Cc: "David S. Miller" Cc: Vladislav Yasevich Cc: Mark Rutland Cc: David Ahern Cc: Vlad Yasevich --- include/linux/rtnetlink.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 2032ce2eb20b..1eadec3fc228 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -70,8 +70,7 @@ static inline bool lockdep_rtnl_is_held(void) * @p: The pointer to read, prior to dereferencing * * Return the value of the specified RCU-protected pointer, but omit - * both the smp_read_barrier_depends() and the READ_ONCE(), because - * caller holds RTNL. + * the READ_ONCE(), because caller holds RTNL. */ #define rtnl_dereference(p) \ rcu_dereference_protected(p, lockdep_rtnl_is_held()) From 98b22737847cc015a797567e32d0a4826003afbf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 11:00:32 -0700 Subject: [PATCH 38/72] seqlock: Remove now-redundant smp_read_barrier_depends() READ_ONCE() now implies smp_read_barrier_depends(), so this patch removes the now-redundant smp_read_barrier_depends() from raw_read_seqcount_latch(). Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar --- include/linux/seqlock.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index f189a8a3bbb8..bcf4cf26b8c8 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -278,9 +278,8 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s) static inline int raw_read_seqcount_latch(seqcount_t *s) { - int seq = READ_ONCE(s->sequence); /* Pairs with the first smp_wmb() in raw_write_seqcount_latch() */ - smp_read_barrier_depends(); + int seq = READ_ONCE(s->sequence); /* ^^^ */ return seq; } From 5c6338b4877038d28148fcfe1e7f862970ebaad1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 11:08:53 -0700 Subject: [PATCH 39/72] uprobes: Remove now-redundant smp_read_barrier_depends() Now that READ_ONCE() implies smp_read_barrier_depends(), the get_xol_area() and get_trampoline_vaddr() no longer need their smp_read_barrier_depends() calls, which this commit removes. While we are here, convert the corresponding smp_wmb() to an smp_store_release(). Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin --- kernel/events/uprobes.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 267f6ef91d97..ce6848e46e94 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1167,8 +1167,8 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } ret = 0; - smp_wmb(); /* pairs with get_xol_area() */ - mm->uprobes_state.xol_area = area; + /* pairs with get_xol_area() */ + smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: up_write(&mm->mmap_sem); @@ -1230,8 +1230,8 @@ static struct xol_area *get_xol_area(void) if (!mm->uprobes_state.xol_area) __create_xol_area(0); - area = mm->uprobes_state.xol_area; - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + /* Pairs with xol_add_vma() smp_store_release() */ + area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */ return area; } @@ -1528,8 +1528,8 @@ static unsigned long get_trampoline_vaddr(void) struct xol_area *area; unsigned long trampoline_vaddr = -1; - area = current->mm->uprobes_state.xol_area; - smp_read_barrier_depends(); + /* Pairs with xol_add_vma() smp_store_release() */ + area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ if (area) trampoline_vaddr = area->vaddr; From 548095dea63ffc016d39c35b32c628d033638aca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 11:22:50 -0700 Subject: [PATCH 40/72] locking: Remove smp_read_barrier_depends() from queued_spin_lock_slowpath() Queued spinlocks are not used by DEC Alpha, and furthermore operations such as READ_ONCE() and release/relaxed RMW atomics are being changed to imply smp_read_barrier_depends(). This commit therefore removes the now-redundant smp_read_barrier_depends() from queued_spin_lock_slowpath(), and adjusts the comments accordingly. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar --- kernel/locking/qspinlock.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 294294c71ba4..38ece035039e 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -170,7 +170,7 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) * @tail : The new queue tail code word * Return: The previous queue tail code word * - * xchg(lock, tail) + * xchg(lock, tail), which heads an address dependency * * p,*,* -> n,*,* ; prev = xchg(lock, node) */ @@ -409,13 +409,11 @@ queue: if (old & _Q_TAIL_MASK) { prev = decode_tail(old); /* - * The above xchg_tail() is also a load of @lock which generates, - * through decode_tail(), a pointer. - * - * The address dependency matches the RELEASE of xchg_tail() - * such that the access to @prev must happen after. + * The above xchg_tail() is also a load of @lock which + * generates, through decode_tail(), a pointer. The address + * dependency matches the RELEASE of xchg_tail() such that + * the subsequent access to @prev happens after. */ - smp_read_barrier_depends(); WRITE_ONCE(prev->next, node); From 243d1a7977ae0814aa1ccb8bb87f8a4e0822ca8f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 11:30:11 -0700 Subject: [PATCH 41/72] tracepoint: Remove smp_read_barrier_depends() from comment The comment in tracepoint_add_func() mentions smp_read_barrier_depends(), whose use should be quite restricted. This commit updates the comment to instead mention the smp_store_release() and rcu_dereference_sched() that the current code actually uses. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Acked-by: Steven Rostedt (VMware) Acked-by: Mathieu Desnoyers --- kernel/tracepoint.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 685c50ae6300..671b13457387 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -212,11 +212,10 @@ static int tracepoint_add_func(struct tracepoint *tp, } /* - * rcu_assign_pointer has a smp_wmb() which makes sure that the new - * probe callbacks array is consistent before setting a pointer to it. - * This array is referenced by __DO_TRACE from - * include/linux/tracepoints.h. A matching smp_read_barrier_depends() - * is used. + * rcu_assign_pointer has as smp_store_release() which makes sure + * that the new probe callbacks array is consistent before setting + * a pointer to it. This array is referenced by __DO_TRACE from + * include/linux/tracepoint.h using rcu_dereference_sched(). */ rcu_assign_pointer(tp->funcs, tp_funcs); if (!static_key_enabled(&tp->key)) From 516df050615e4b0fd2dd0448cb5a807208a3837a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 11:39:57 -0700 Subject: [PATCH 42/72] lib/assoc_array: Remove smp_read_barrier_depends() Now that smp_read_barrier_depends() is implied by READ_ONCE(), the several smp_read_barrier_depends() calls may be removed from lib/assoc_array.c. This commit makes this change and marks the READ_ONCE() calls that head address dependencies. Signed-off-by: Paul E. McKenney Cc: Jonathan Corbet Cc: Mark Rutland Cc: Alexander Kuleshov Cc: David Howells --- lib/assoc_array.c | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/lib/assoc_array.c b/lib/assoc_array.c index b77d51da8c73..c6659cb37033 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -38,12 +38,10 @@ begin_node: if (assoc_array_ptr_is_shortcut(cursor)) { /* Descend through a shortcut */ shortcut = assoc_array_ptr_to_shortcut(cursor); - smp_read_barrier_depends(); - cursor = READ_ONCE(shortcut->next_node); + cursor = READ_ONCE(shortcut->next_node); /* Address dependency. */ } node = assoc_array_ptr_to_node(cursor); - smp_read_barrier_depends(); slot = 0; /* We perform two passes of each node. @@ -55,15 +53,12 @@ begin_node: */ has_meta = 0; for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { - ptr = READ_ONCE(node->slots[slot]); + ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ has_meta |= (unsigned long)ptr; if (ptr && assoc_array_ptr_is_leaf(ptr)) { - /* We need a barrier between the read of the pointer - * and dereferencing the pointer - but only if we are - * actually going to dereference it. + /* We need a barrier between the read of the pointer, + * which is supplied by the above READ_ONCE(). */ - smp_read_barrier_depends(); - /* Invoke the callback */ ret = iterator(assoc_array_ptr_to_leaf(ptr), iterator_data); @@ -86,10 +81,8 @@ begin_node: continue_node: node = assoc_array_ptr_to_node(cursor); - smp_read_barrier_depends(); - for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { - ptr = READ_ONCE(node->slots[slot]); + ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ if (assoc_array_ptr_is_meta(ptr)) { cursor = ptr; goto begin_node; @@ -98,16 +91,15 @@ continue_node: finished_node: /* Move up to the parent (may need to skip back over a shortcut) */ - parent = READ_ONCE(node->back_pointer); + parent = READ_ONCE(node->back_pointer); /* Address dependency. */ slot = node->parent_slot; if (parent == stop) return 0; if (assoc_array_ptr_is_shortcut(parent)) { shortcut = assoc_array_ptr_to_shortcut(parent); - smp_read_barrier_depends(); cursor = parent; - parent = READ_ONCE(shortcut->back_pointer); + parent = READ_ONCE(shortcut->back_pointer); /* Address dependency. */ slot = shortcut->parent_slot; if (parent == stop) return 0; @@ -147,7 +139,7 @@ int assoc_array_iterate(const struct assoc_array *array, void *iterator_data), void *iterator_data) { - struct assoc_array_ptr *root = READ_ONCE(array->root); + struct assoc_array_ptr *root = READ_ONCE(array->root); /* Address dependency. */ if (!root) return 0; @@ -194,7 +186,7 @@ assoc_array_walk(const struct assoc_array *array, pr_devel("-->%s()\n", __func__); - cursor = READ_ONCE(array->root); + cursor = READ_ONCE(array->root); /* Address dependency. */ if (!cursor) return assoc_array_walk_tree_empty; @@ -216,11 +208,9 @@ jumped: consider_node: node = assoc_array_ptr_to_node(cursor); - smp_read_barrier_depends(); - slot = segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK); slot &= ASSOC_ARRAY_FAN_MASK; - ptr = READ_ONCE(node->slots[slot]); + ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ pr_devel("consider slot %x [ix=%d type=%lu]\n", slot, level, (unsigned long)ptr & 3); @@ -254,7 +244,6 @@ consider_node: cursor = ptr; follow_shortcut: shortcut = assoc_array_ptr_to_shortcut(cursor); - smp_read_barrier_depends(); pr_devel("shortcut to %d\n", shortcut->skip_to_level); sc_level = level + ASSOC_ARRAY_LEVEL_STEP; BUG_ON(sc_level > shortcut->skip_to_level); @@ -294,7 +283,7 @@ follow_shortcut: } while (sc_level < shortcut->skip_to_level); /* The shortcut matches the leaf's index to this point. */ - cursor = READ_ONCE(shortcut->next_node); + cursor = READ_ONCE(shortcut->next_node); /* Address dependency. */ if (((level ^ sc_level) & ~ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) { level = sc_level; goto jumped; @@ -331,20 +320,18 @@ void *assoc_array_find(const struct assoc_array *array, return NULL; node = result.terminal_node.node; - smp_read_barrier_depends(); /* If the target key is available to us, it's has to be pointed to by * the terminal node. */ for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { - ptr = READ_ONCE(node->slots[slot]); + ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ if (ptr && assoc_array_ptr_is_leaf(ptr)) { /* We need a barrier between the read of the pointer * and dereferencing the pointer - but only if we are * actually going to dereference it. */ leaf = assoc_array_ptr_to_leaf(ptr); - smp_read_barrier_depends(); if (ops->compare_object(leaf, index_key)) return (void *)leaf; } From 08df477434754629303c9e2bfa8d67ecb44f9c20 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 11:51:45 -0700 Subject: [PATCH 43/72] mm/ksm: Remove now-redundant smp_read_barrier_depends() Because READ_ONCE() now implies smp_read_barrier_depends(), the smp_read_barrier_depends() in get_ksm_page() is now redundant. This commit removes it and updates the comments. Signed-off-by: Paul E. McKenney Cc: Andrew Morton Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Michal Hocko Cc: "Kirill A. Shutemov" Cc: Ingo Molnar Cc: "Aneesh Kumar K.V" Cc: Claudio Imbrenda Cc: --- mm/ksm.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index be8f4576f842..c406f75957ad 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -675,15 +675,8 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) expected_mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); again: - kpfn = READ_ONCE(stable_node->kpfn); + kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ page = pfn_to_page(kpfn); - - /* - * page is computed from kpfn, so on most architectures reading - * page->mapping is naturally ordered after reading node->kpfn, - * but on Alpha we need to be more careful. - */ - smp_read_barrier_depends(); if (READ_ONCE(page->mapping) != expected_mapping) goto stale; From 4be2b04e43fd3d8164d7aeb1808e47fbeb0c0de0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 12:09:04 -0700 Subject: [PATCH 44/72] netfilter: Remove now-redundant smp_read_barrier_depends() READ_ONCE() now implies smp_read_barrier_depends(), which means that the instances in arpt_do_table(), ipt_do_table(), and ip6t_do_table() are now redundant. This commit removes them and adjusts the comments. Signed-off-by: Paul E. McKenney Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: "David S. Miller" Cc: Cc: Cc: --- net/ipv4/netfilter/arp_tables.c | 7 +------ net/ipv4/netfilter/ip_tables.c | 7 +------ net/ipv6/netfilter/ip6_tables.c | 7 +------ 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index f88221aebc9d..d242c2d29161 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -202,13 +202,8 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = table->private; + private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); - /* - * Ensure we load private-> members after we've fetched the base - * pointer. - */ - smp_read_barrier_depends(); table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4cbe5e80f3bf..46866cc24a84 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -260,13 +260,8 @@ ipt_do_table(struct sk_buff *skb, WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); - private = table->private; + private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); - /* - * Ensure we load private-> members after we've fetched the base - * pointer. - */ - smp_read_barrier_depends(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index f06e25065a34..ac1db84722a7 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -282,12 +282,7 @@ ip6t_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = table->private; - /* - * Ensure we load private-> members after we've fetched the base - * pointer. - */ - smp_read_barrier_depends(); + private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; From d963007c7210deebef48c5e57aa4ca4cf9c059cd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 12:19:27 -0700 Subject: [PATCH 45/72] keyring: Remove now-redundant smp_read_barrier_depends() Now that the associative-array library properly heads dependency chains, the various smp_read_barrier_depends() calls in security/keys/keyring.c are no longer needed. This commit therefore removes them. Signed-off-by: Paul E. McKenney Cc: David Howells Cc: "Serge E. Hallyn" Cc: Cc: Reviewed-by: James Morris --- security/keys/keyring.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/security/keys/keyring.c b/security/keys/keyring.c index d0bccebbd3b5..41bcf57e96f2 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -713,7 +713,6 @@ descend_to_keyring: * doesn't contain any keyring pointers. */ shortcut = assoc_array_ptr_to_shortcut(ptr); - smp_read_barrier_depends(); if ((shortcut->index_key[0] & ASSOC_ARRAY_FAN_MASK) != 0) goto not_this_keyring; @@ -723,8 +722,6 @@ descend_to_keyring: } node = assoc_array_ptr_to_node(ptr); - smp_read_barrier_depends(); - ptr = node->slots[0]; if (!assoc_array_ptr_is_meta(ptr)) goto begin_node; @@ -736,7 +733,6 @@ descend_to_node: kdebug("descend"); if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); - smp_read_barrier_depends(); ptr = READ_ONCE(shortcut->next_node); BUG_ON(!assoc_array_ptr_is_node(ptr)); } @@ -744,7 +740,6 @@ descend_to_node: begin_node: kdebug("begin_node"); - smp_read_barrier_depends(); slot = 0; ascend_to_node: /* Go through the slots in a node */ @@ -792,14 +787,12 @@ ascend_to_node: if (ptr && assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); - smp_read_barrier_depends(); ptr = READ_ONCE(shortcut->back_pointer); slot = shortcut->parent_slot; } if (!ptr) goto not_this_keyring; node = assoc_array_ptr_to_node(ptr); - smp_read_barrier_depends(); slot++; /* If we've ascended to the root (zero backpointer), we must have just From adf90eb49055636fc35aede54174456ac3520f27 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Nov 2017 09:04:22 -0800 Subject: [PATCH 46/72] drivers/infiniband: Remove now-redundant smp_read_barrier_depends() The smp_read_barrier_depends() does nothing at all except on DEC Alpha, and no current DEC Alpha systems use Infiniband: lkml.kernel.org/r/20171023085921.jwbntptn6ictbnvj@tower This commit therefore makes Infiniband depend on !ALPHA and removes the now-ineffective invocations of smp_read_barrier_depends() from the InfiniBand driver. Please note that this patch should not be construed as my saying that InfiniBand's memory ordering is correct, but rather that this patch does not in any way affect InfiniBand's correctness. In other words, the result of applying this patch is bug-for-bug compatible with the original. Signed-off-by: Paul E. McKenney Cc: Doug Ledford Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Cree Cc: Andrea Parri Cc: Cc: [ paulmck: Removed drivers/dma/ioat/dma.c per Jason Gunthorpe's feedback. ] Acked-by: Jason Gunthorpe --- drivers/infiniband/Kconfig | 1 + drivers/infiniband/hw/hfi1/rc.c | 4 ---- drivers/infiniband/hw/hfi1/ruc.c | 1 - drivers/infiniband/hw/hfi1/sdma.c | 1 - drivers/infiniband/hw/hfi1/uc.c | 2 -- drivers/infiniband/hw/hfi1/ud.c | 2 -- drivers/infiniband/hw/qib/qib_rc.c | 3 --- drivers/infiniband/hw/qib/qib_ruc.c | 1 - drivers/infiniband/hw/qib/qib_uc.c | 2 -- drivers/infiniband/hw/qib/qib_ud.c | 2 -- drivers/infiniband/sw/rdmavt/qp.c | 1 - 11 files changed, 1 insertion(+), 19 deletions(-) diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 98ac46ed7214..3bb6e35b0bbf 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -4,6 +4,7 @@ menuconfig INFINIBAND depends on NET depends on INET depends on m || IPV6 != m + depends on !ALPHA select IRQ_POLL ---help--- Core support for InfiniBand (IB). Make sure to also select diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index fd01a760259f..f527bcda4650 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -302,7 +302,6 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; /* We are in the error state, flush the work request. */ - smp_read_barrier_depends(); /* see post_one_send() */ if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ @@ -346,7 +345,6 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) newreq = 0; if (qp->s_cur == qp->s_tail) { /* Check if send work queue is empty. */ - smp_read_barrier_depends(); /* see post_one_send() */ if (qp->s_tail == READ_ONCE(qp->s_head)) { clear_ahg(qp); goto bail; @@ -900,7 +898,6 @@ void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, } /* Ensure s_rdma_ack_cnt changes are committed */ - smp_read_barrier_depends(); if (qp->s_rdma_ack_cnt) { hfi1_queue_rc_ack(qp, is_fecn); return; @@ -1562,7 +1559,6 @@ static void rc_rcv_resp(struct hfi1_packet *packet) trace_hfi1_ack(qp, psn); /* Ignore invalid responses. */ - smp_read_barrier_depends(); /* see post_one_send */ if (cmp_psn(psn, READ_ONCE(qp->s_next_psn)) >= 0) goto ack_done; diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 2c7fc6e331ea..13b994738f41 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -362,7 +362,6 @@ static void ruc_loopback(struct rvt_qp *sqp) sqp->s_flags |= RVT_S_BUSY; again: - smp_read_barrier_depends(); /* see post_one_send() */ if (sqp->s_last == READ_ONCE(sqp->s_head)) goto clr_busy; wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 31c8f89b5fc8..61c130dbed10 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -553,7 +553,6 @@ static void sdma_hw_clean_up_task(unsigned long opaque) static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde) { - smp_read_barrier_depends(); /* see sdma_update_tail() */ return sde->tx_ring[sde->tx_head & sde->sdma_mask]; } diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 991bbee04821..132b63e787d1 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -79,7 +79,6 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; /* We are in the error state, flush the work request. */ - smp_read_barrier_depends(); /* see post_one_send() */ if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ @@ -119,7 +118,6 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) RVT_PROCESS_NEXT_SEND_OK)) goto bail; /* Check if send work queue is empty. */ - smp_read_barrier_depends(); /* see post_one_send() */ if (qp->s_cur == READ_ONCE(qp->s_head)) { clear_ahg(qp); goto bail; diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index beb5091eccca..deb184574395 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -486,7 +486,6 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; /* We are in the error state, flush the work request. */ - smp_read_barrier_depends(); /* see post_one_send */ if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ @@ -500,7 +499,6 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } /* see post_one_send() */ - smp_read_barrier_depends(); if (qp->s_cur == READ_ONCE(qp->s_head)) goto bail; diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 8f5754fb8579..1a785c37ad0a 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -246,7 +246,6 @@ int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags) if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; /* We are in the error state, flush the work request. */ - smp_read_barrier_depends(); /* see post_one_send() */ if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ @@ -293,7 +292,6 @@ int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags) newreq = 0; if (qp->s_cur == qp->s_tail) { /* Check if send work queue is empty. */ - smp_read_barrier_depends(); /* see post_one_send() */ if (qp->s_tail == READ_ONCE(qp->s_head)) goto bail; /* @@ -1340,7 +1338,6 @@ static void qib_rc_rcv_resp(struct qib_ibport *ibp, goto ack_done; /* Ignore invalid responses. */ - smp_read_barrier_depends(); /* see post_one_send */ if (qib_cmp24(psn, READ_ONCE(qp->s_next_psn)) >= 0) goto ack_done; diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index 9a37e844d4c8..4662cc7bde92 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -367,7 +367,6 @@ static void qib_ruc_loopback(struct rvt_qp *sqp) sqp->s_flags |= RVT_S_BUSY; again: - smp_read_barrier_depends(); /* see post_one_send() */ if (sqp->s_last == READ_ONCE(sqp->s_head)) goto clr_busy; wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index bddcc37ace44..70c58b88192c 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -60,7 +60,6 @@ int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags) if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; /* We are in the error state, flush the work request. */ - smp_read_barrier_depends(); /* see post_one_send() */ if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ @@ -90,7 +89,6 @@ int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags) RVT_PROCESS_NEXT_SEND_OK)) goto bail; /* Check if send work queue is empty. */ - smp_read_barrier_depends(); /* see post_one_send() */ if (qp->s_cur == READ_ONCE(qp->s_head)) goto bail; /* diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index 15962ed193ce..386c3c4da0c7 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -252,7 +252,6 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; /* We are in the error state, flush the work request. */ - smp_read_barrier_depends(); /* see post_one_send */ if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ @@ -266,7 +265,6 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) } /* see post_one_send() */ - smp_read_barrier_depends(); if (qp->s_cur == READ_ONCE(qp->s_head)) goto bail; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 9177df60742a..eae84c216e2f 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1684,7 +1684,6 @@ static inline int rvt_qp_is_avail( /* non-reserved operations */ if (likely(qp->s_avail)) return 0; - smp_read_barrier_depends(); /* see rc.c */ slast = READ_ONCE(qp->s_last); if (qp->s_head >= slast) avail = qp->s_size - (qp->s_head - slast); From 98c1ec7cefaadbf65680d116c3d8612b93a841a0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Dec 2017 17:04:39 -0800 Subject: [PATCH 47/72] drivers/dma/ioat: Remove now-redundant smp_read_barrier_depends() Now that READ_ONCE() implies smp_read_barrier_depends(), the __cleanup() and ioat_abort_descs() functions no longer need their smp_read_barrier_depends() calls, which this commit removes. It is actually not entirely clear why this driver ever included smp_read_barrier_depends() given that it appears to be x86-only and given that smp_read_barrier_depends() has no effect whatsoever except on DEC Alpha. Signed-off-by: Paul E. McKenney Cc: Vinod Koul Cc: Dan Williams Cc: --- drivers/dma/ioat/dma.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index 58d4ccd33672..8b5b23a8ace9 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -597,7 +597,6 @@ static void __cleanup(struct ioatdma_chan *ioat_chan, dma_addr_t phys_complete) for (i = 0; i < active && !seen_current; i++) { struct dma_async_tx_descriptor *tx; - smp_read_barrier_depends(); prefetch(ioat_get_ring_ent(ioat_chan, idx + i + 1)); desc = ioat_get_ring_ent(ioat_chan, idx + i); dump_desc_dbg(ioat_chan, desc); @@ -715,7 +714,6 @@ static void ioat_abort_descs(struct ioatdma_chan *ioat_chan) for (i = 1; i < active; i++) { struct dma_async_tx_descriptor *tx; - smp_read_barrier_depends(); prefetch(ioat_get_ring_ent(ioat_chan, idx + i + 1)); desc = ioat_get_ring_ent(ioat_chan, idx + i); From 9ad3c143d7d6942c66f27bc6c18f5df638f70aff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Nov 2017 09:20:40 -0800 Subject: [PATCH 48/72] doc: De-emphasize smp_read_barrier_depends This commit keeps only the historical and low-level discussion of smp_read_barrier_depends(). Signed-off-by: Paul E. McKenney [ paulmck: Adjusted to allow for David Howells feedback on prior commit. ] --- Documentation/RCU/Design/Requirements/Requirements.html | 3 ++- Documentation/RCU/rcu_dereference.txt | 6 +----- Documentation/RCU/whatisRCU.txt | 3 +-- Documentation/circular-buffers.txt | 3 +-- Documentation/memory-barriers.txt | 7 +++++-- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 62e847bcdcdd..571c3d75510f 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -581,7 +581,8 @@ This guarantee was only partially premeditated. DYNIX/ptx used an explicit memory barrier for publication, but had nothing resembling rcu_dereference() for subscription, nor did it have anything resembling the smp_read_barrier_depends() -that was later subsumed into rcu_dereference(). +that was later subsumed into rcu_dereference() and later +still into READ_ONCE(). The need for these operations made itself known quite suddenly at a late-1990s meeting with the DEC Alpha architects, back in the days when DEC was still a free-standing company. diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt index 1acb26b09b48..ab96227bad42 100644 --- a/Documentation/RCU/rcu_dereference.txt +++ b/Documentation/RCU/rcu_dereference.txt @@ -122,11 +122,7 @@ o Be very careful about comparing pointers obtained from Note that if checks for being within an RCU read-side critical section are not required and the pointer is never dereferenced, rcu_access_pointer() should be used in place - of rcu_dereference(). The rcu_access_pointer() primitive - does not require an enclosing read-side critical section, - and also omits the smp_read_barrier_depends() included in - rcu_dereference(), which in turn should provide a small - performance gain in some CPUs (e.g., the DEC Alpha). + of rcu_dereference(). o The comparison is against a pointer that references memory that was initialized "a long time ago." The reason diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index df62466da4e0..a27fbfb0efb8 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -600,8 +600,7 @@ don't forget about them when submitting patches making use of RCU!] #define rcu_dereference(p) \ ({ \ - typeof(p) _________p1 = p; \ - smp_read_barrier_depends(); \ + typeof(p) _________p1 = READ_ONCE(p); \ (_________p1); \ }) diff --git a/Documentation/circular-buffers.txt b/Documentation/circular-buffers.txt index d4628174b7c5..53e51caa3347 100644 --- a/Documentation/circular-buffers.txt +++ b/Documentation/circular-buffers.txt @@ -220,8 +220,7 @@ before it writes the new tail pointer, which will erase the item. Note the use of READ_ONCE() and smp_load_acquire() to read the opposition index. This prevents the compiler from discarding and -reloading its cached value - which some compilers will do across -smp_read_barrier_depends(). This isn't strictly needed if you can +reloading its cached value. This isn't strictly needed if you can be sure that the opposition index will _only_ be used the once. The smp_load_acquire() additionally forces the CPU to order against subsequent memory references. Similarly, smp_store_release() is used diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 13fd35b6a597..a863009849a3 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1818,7 +1818,7 @@ The Linux kernel has eight basic CPU memory barriers: GENERAL mb() smp_mb() WRITE wmb() smp_wmb() READ rmb() smp_rmb() - DATA DEPENDENCY read_barrier_depends() smp_read_barrier_depends() + DATA DEPENDENCY READ_ONCE() All memory barriers except the data dependency barriers imply a compiler @@ -2867,7 +2867,10 @@ access depends on a read, not all do, so it may not be relied on. Other CPUs may also have split caches, but must coordinate between the various cachelets for normal memory accesses. The semantics of the Alpha removes the -need for coordination in the absence of memory barriers. +need for hardware coordination in the absence of memory barriers, which +permitted Alpha to sport higher CPU clock rates back in the day. However, +please note that smp_read_barrier_depends() should not be used except in +Alpha arch-specific code and within the READ_ONCE() macro. CACHE COHERENCY VS DMA From 632a5c1c8fd281b82b1d70bdb1d692cba3b9ffd3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Nov 2017 09:23:24 -0800 Subject: [PATCH 49/72] genetlink: Remove smp_read_barrier_depends() from comment Now that smp_read_barrier_depends() has been de-emphasized, the less said about it, the better. Signed-off-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Mark Rutland Cc: Kate Stewart Cc: Ingo Molnar Cc: Philippe Ombredanne Cc: Greg Kroah-Hartman --- include/linux/genetlink.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h index ecc2928e8046..bc738504ab4a 100644 --- a/include/linux/genetlink.h +++ b/include/linux/genetlink.h @@ -31,8 +31,7 @@ extern wait_queue_head_t genl_sk_destructing_waitq; * @p: The pointer to read, prior to dereferencing * * Return the value of the specified RCU-protected pointer, but omit - * both the smp_read_barrier_depends() and the READ_ONCE(), because - * caller holds genl mutex. + * the READ_ONCE(), because caller holds genl mutex. */ #define genl_dereference(p) \ rcu_dereference_protected(p, lockdep_genl_is_held()) From dfe1b4427835f995aed593d063e6fcdf78b00823 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Nov 2017 09:26:06 -0800 Subject: [PATCH 50/72] netlink: Remove smp_read_barrier_depends() from comment Now that smp_read_barrier_depends() has been de-emphasized, the less said about it, the better. Signed-off-by: Paul E. McKenney Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: Cc: --- include/linux/netfilter/nfnetlink.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 495ba4dd9da5..34551f8aaf9d 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -67,8 +67,7 @@ static inline bool lockdep_nfnl_is_held(__u8 subsys_id) * @ss: The nfnetlink subsystem ID * * Return the value of the specified RCU-protected pointer, but omit - * both the smp_read_barrier_depends() and the READ_ONCE(), because - * caller holds the NFNL subsystem mutex. + * the READ_ONCE(), because caller holds the NFNL subsystem mutex. */ #define nfnl_dereference(p, ss) \ rcu_dereference_protected(p, lockdep_nfnl_is_held(ss)) From 91db2592e463157d8f4755f56230fb04d0308c4e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Nov 2017 09:37:35 -0800 Subject: [PATCH 51/72] checkpatch: Add warnings for {smp_,}read_barrier_depends() Now that both smp_read_barrier_depends() and read_barrier_depends() are being de-emphasized, warn if any are added. Signed-off-by: Paul E. McKenney Cc: Andy Whitcroft Cc: Joe Perches [ paulmck: Skipped checking files and handled whitespace per Joe Perches. ] --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 95cda3ecc66b..9a384bfe2bd5 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5586,6 +5586,12 @@ sub process { } } +# check for smp_read_barrier_depends and read_barrier_depends + if (!$file && $line =~ /\b(smp_|)read_barrier_depends\s*\(/) { + WARN("READ_BARRIER_DEPENDS", + "$1read_barrier_depends should only be used in READ_ONCE or DEC Alpha code\n" . $herecurr); + } + # check of hardware specific defines if ($line =~ m@^.\s*\#\s*if.*\b(__i386__|__powerpc64__|__sun__|__s390x__)\b@ && $realfile !~ m@include/asm-@) { CHK("ARCH_DEFINES", From 3a5db0b108e0a40f08c2bcff6a675dbf632b91e0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Nov 2017 09:45:10 -0800 Subject: [PATCH 52/72] drivers/vhost: Remove now-redundant read_barrier_depends() Because READ_ONCE() now implies read_barrier_depends(), the read_barrier_depends() in next_desc() is now redundant. This commit therefore removes it and the related comments. Signed-off-by: Paul E. McKenney Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Cc: Cc: --- drivers/vhost/vhost.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 33ac2b186b85..78b5940a415a 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1877,12 +1877,7 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc) return -1U; /* Check they're not leading us off end of descriptors. */ - next = vhost16_to_cpu(vq, desc->next); - /* Make sure compiler knows to grab that: we don't want it changing! */ - /* We will use the result as an index in an array, so most - * architectures only need a compiler barrier here. */ - read_barrier_depends(); - + next = vhost16_to_cpu(vq, READ_ONCE(desc->next)); return next; } From 9122caf99b85c0f16938419547d5a9a84ae287a4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 12 Oct 2017 18:16:41 -0400 Subject: [PATCH 53/72] tracing, rcu: Hide trace event rcu_nocb_wake when not used The trace event rcu_nocb_wake is only used when CONFIG_RCU_NOCB_CPU is defined. But the trace event is defined regardless. As defined trace events take up memory, it is a waste to have it defined when not used. Surround the trace event with an #ifdef to have it only defined when it is used. Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 59d40c454aa0..dbca79ea0677 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -243,6 +243,7 @@ TRACE_EVENT(rcu_exp_funnel_lock, __entry->grphi, __entry->gpevent) ); +#ifdef CONFIG_RCU_NOCB_CPU /* * Tracepoint for RCU no-CBs CPU callback handoffs. This event is intended * to assist debugging of these handoffs. @@ -285,6 +286,7 @@ TRACE_EVENT(rcu_nocb_wake, TP_printk("%s %d %s", __entry->rcuname, __entry->cpu, __entry->reason) ); +#endif /* * Tracepoint for tasks blocking within preemptible-RCU read-side From efd88b02bb9e6b8b73a20ea611e5d07ed6d4af34 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Oct 2017 14:52:41 -0700 Subject: [PATCH 54/72] rcu: Add comment giving debug strategy for double call_rcu() The following statement has for some reason proven non-intuitive: WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); This commit therefore adds a comment that states that this warning usually triggers in response to a double call_rcu(), which is sort of like a double free. The comment also suggests building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y to track down the double call_rcu(). Reported-by: David Howells Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f9c0ca2ccf0c..1bdc0481aaf1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2789,6 +2789,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->n_force_qs_snap = rsp->n_force_qs; } else if (count < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = count; + + /* + * The following usually indicates a double call_rcu(). To track + * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. + */ WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); local_irq_restore(flags); From 84b12b752f41cd3d25d75692c2145d816e42926c Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Fri, 17 Nov 2017 21:40:15 +0600 Subject: [PATCH 55/72] rcu: Remove have_rcu_nocb_mask from tree_plugin.h Currently have_rcu_nocb_mask is used to avoid double allocation of rcu_nocb_mask during boot up. Due to different representation of cpumask_var_t on different kernel config CPUMASK=y(or n) it was okay. But now we have a helper cpumask_available(), which can be utilized to check whether rcu_nocb_mask has been allocated or not without using a variable. Removing the variable also reduces vmlinux size. Unpatched version: text data bss dec hex filename 13050393 7852470 14543408 35446271 21cddff vmlinux Patched version: text data bss dec hex filename 13050390 7852438 14543408 35446236 21cdddc vmlinux Signed-off-by: Rakib Mullick Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index db85ca3975f1..13a8e08f1998 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -61,7 +61,6 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work); #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ -static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ @@ -1752,7 +1751,6 @@ static void increment_cpu_stall_ticks(void) static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - have_rcu_nocb_mask = true; cpulist_parse(str, rcu_nocb_mask); return 1; } @@ -1801,7 +1799,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) /* Is the specified CPU a no-CBs CPU? */ bool rcu_is_nocb_cpu(int cpu) { - if (have_rcu_nocb_mask) + if (cpumask_available(rcu_nocb_mask)) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } @@ -2295,14 +2293,13 @@ void __init rcu_init_nohz(void) need_rcu_nocb_mask = true; #endif /* #if defined(CONFIG_NO_HZ_FULL) */ - if (!have_rcu_nocb_mask && need_rcu_nocb_mask) { + if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); return; } - have_rcu_nocb_mask = true; } - if (!have_rcu_nocb_mask) + if (!cpumask_available(rcu_nocb_mask)) return; #if defined(CONFIG_NO_HZ_FULL) @@ -2428,7 +2425,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ struct rcu_data *rdp_prev = NULL; - if (!have_rcu_nocb_mask) + if (!cpumask_available(rcu_nocb_mask)) return; if (ls == -1) { ls = int_sqrt(nr_cpu_ids); From cc1321c96f855525fbd847fec130f000daa1bb1f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Oct 2017 11:05:03 -0700 Subject: [PATCH 56/72] torture: Reduce #ifdefs for preempt_schedule() This commit adds a torture_preempt_schedule() that is nothingness in !PREEMPT builds and is preempt_schedule() otherwise. Then torture_preempt_schedule() is used to eliminate several ugly #ifdefs, both in rcutorture and in locktorture. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 6 ++++++ kernel/locking/locktorture.c | 24 ++++++------------------ kernel/rcu/rcutorture.c | 4 +--- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/include/linux/torture.h b/include/linux/torture.h index a45702eb3e7b..907d266aaddc 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -96,4 +96,10 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); #define torture_stop_kthread(n, tp) \ _torture_stop_kthread("Stopping " #n " task", &(tp)) +#ifdef CONFIG_PREEMPT +#define torture_preempt_schedule() preempt_schedule() +#else +#define torture_preempt_schedule() +#endif + #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f24582d4dad3..617cea2520b3 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -130,10 +130,8 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp) if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_lock_busted_write_unlock(void) @@ -179,10 +177,8 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp) if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) @@ -352,10 +348,8 @@ static void torture_mutex_delay(struct torture_random_state *trsp) mdelay(longdelay_ms * 5); else mdelay(longdelay_ms / 5); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_mutex_unlock(void) __releases(torture_mutex) @@ -507,10 +501,8 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp) if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) @@ -547,10 +539,8 @@ static void torture_rwsem_write_delay(struct torture_random_state *trsp) mdelay(longdelay_ms * 10); else mdelay(longdelay_ms / 10); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_rwsem_up_write(void) __releases(torture_rwsem) @@ -574,10 +564,8 @@ static void torture_rwsem_read_delay(struct torture_random_state *trsp) mdelay(longdelay_ms * 2); else mdelay(longdelay_ms / 2); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_rwsem_up_read(void) __releases(torture_rwsem) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 74f6b0146b98..e7d3cce84214 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -315,11 +315,9 @@ static void rcu_read_delay(struct torture_random_state *rrsp) } if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!preempt_count() && !(torture_random(rrsp) % (nrealreaders * 20000))) - preempt_schedule(); /* No QS if preempt_disable() in effect */ -#endif + torture_preempt_schedule(); /* QS only if preemptible. */ } static void rcu_torture_read_unlock(int idx) __releases(RCU) From e8302739aa2204d52dacf9e9619cb6e755fa997a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Oct 2017 11:23:42 -0700 Subject: [PATCH 57/72] rcutorture: Preempt RCU-preempt readers more vigorously This commit attempts to make a very rare rcutorture failure happen more often by increasing the fraction of RCU-preempt read-side critical sections that are preempted. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index e7d3cce84214..1074ecc3f72f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -316,7 +316,7 @@ static void rcu_read_delay(struct torture_random_state *rrsp) if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); if (!preempt_count() && - !(torture_random(rrsp) % (nrealreaders * 20000))) + !(torture_random(rrsp) % (nrealreaders * 500))) torture_preempt_schedule(); /* QS only if preemptible. */ } From 2adfa4210f8f35cdfb4e08318cc06b99752964c2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Nov 2017 19:17:20 +0900 Subject: [PATCH 58/72] rcutorture/configinit: Fix build directory error message The 'configinit.sh' script checks the format of optional argument for the build directory, printing an error message if the format is not valid. However, the error message uses the wrong variable, indicating an empty string even though the user entered a non-empty (but erroneous) string. This commit fixes the script to use the correct variable. Fixes: c87b9c601ac8 ("rcutorture: Add KVM-based test framework") Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/configinit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/configinit.sh b/tools/testing/selftests/rcutorture/bin/configinit.sh index 51f66a7ce876..c15f270e121d 100755 --- a/tools/testing/selftests/rcutorture/bin/configinit.sh +++ b/tools/testing/selftests/rcutorture/bin/configinit.sh @@ -51,7 +51,7 @@ then mkdir $builddir fi else - echo Bad build directory: \"$builddir\" + echo Bad build directory: \"$buildloc\" exit 2 fi fi From 3a0b3bbbff0f69c59e753dddf97e4e334b7fa997 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Nov 2017 19:17:21 +0900 Subject: [PATCH 59/72] rcutorture: Remove unused script, config2frag.sh The 'config2frag.sh' script is not used, so this commit removes it. Fixes: c87b9c601ac8 ("rcutorture: Add KVM-based test framework") Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/config2frag.sh | 25 ------------------- 1 file changed, 25 deletions(-) delete mode 100755 tools/testing/selftests/rcutorture/bin/config2frag.sh diff --git a/tools/testing/selftests/rcutorture/bin/config2frag.sh b/tools/testing/selftests/rcutorture/bin/config2frag.sh deleted file mode 100755 index 56f51ae13d73..000000000000 --- a/tools/testing/selftests/rcutorture/bin/config2frag.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -# Usage: config2frag.sh < .config > configfrag -# -# Converts the "# CONFIG_XXX is not set" to "CONFIG_XXX=n" so that the -# resulting file becomes a legitimate Kconfig fragment. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, you can access it online at -# http://www.gnu.org/licenses/gpl-2.0.html. -# -# Copyright (C) IBM Corporation, 2013 -# -# Authors: Paul E. McKenney - -LANG=C sed -e 's/^# CONFIG_\([a-zA-Z0-9_]*\) is not set$/CONFIG_\1=n/' From e5ed531dca4f569397ee5df60cd8ea2684c9aeff Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Nov 2017 19:17:22 +0900 Subject: [PATCH 60/72] rcutorture/kvm.sh: Remove unused variable, `alldone` The variable `alldone` is defined but not used within an awk script. This commit therefore removes it. Fixes:53954671033d ("rcutorture: Do better bin packing") Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index ccd49e958fd2..7eb8d14e2aab 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -238,7 +238,6 @@ BEGIN { } END { - alldone = 0; batch = 0; nc = -1; From 8dcd6f3fe206c0bb8996e59386a04027b1c2fb9b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Nov 2017 19:17:23 +0900 Subject: [PATCH 61/72] rcutorture/kvm.sh: Use consistent help text for --qemu-args The '--qemu-args' option's help text is wrongly copied from '--qemu-cmd' option and its argument type description message format is inconsistent with other arguments. This commit fixes the usage and type messages to be consistent with others. Fixes: e9ce640001c6 ("rcutorture: Add --qemu-args argument to kvm.sh") Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 7eb8d14e2aab..64d96fc3dd62 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -70,7 +70,7 @@ usage () { echo " --kmake-arg kernel-make-arguments" echo " --mac nn:nn:nn:nn:nn:nn" echo " --no-initrd" - echo " --qemu-args qemu-system-..." + echo " --qemu-args qemu-arguments" echo " --qemu-cmd qemu-system-..." echo " --results absolute-pathname" echo " --torture rcu" @@ -150,7 +150,7 @@ do TORTURE_INITRD=""; export TORTURE_INITRD ;; --qemu-args|--qemu-arg) - checkarg --qemu-args "-qemu args" $# "$2" '^-' '^error' + checkarg --qemu-args "(qemu arguments)" $# "$2" '^-' '^error' TORTURE_QEMU_ARG="$2" shift ;; From 512e3bd0b554eb25d8816ab3954e0f39c98e8183 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Nov 2017 19:17:24 +0900 Subject: [PATCH 62/72] rcutorture/kvm.sh: Support execution from any directory The 'kvm.sh' rcutorture script requires that it be invoked from the top of Linux-kernel source tree. It is just a subtle restriction, but users using it for the first time could forget the restriction and be confused. Moreover, it makes commands a little longer, which can be frustrating. This commit therefore lets users invoke the script from any location. Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 64d96fc3dd62..d2a4fd94de6a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -1,8 +1,7 @@ #!/bin/bash # # Run a series of 14 tests under KVM. These are not particularly -# well-selected or well-tuned, but are the current set. Run from the -# top level of the source tree. +# well-selected or well-tuned, but are the current set. # # Edit the definitions below to set the locations of the various directories, # as well as the test duration. @@ -34,6 +33,8 @@ T=${TMPDIR-/tmp}/kvm.sh.$$ trap 'rm -rf $T' 0 mkdir $T +cd `dirname $scriptname`/../../../../../ + dur=$((30*60)) dryrun="" KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM From 81394e3f6df8f72895354fe29a1ef60cb0765a78 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Nov 2017 19:17:25 +0900 Subject: [PATCH 63/72] rcutorture/kvm-recheck-*: Improve result directory readability check The kvm-recheck-(lock|rcu|rcuperf).sh scripts check whether the user-specified results directory exists. If not, it prints out error message that says the specified directory is unreadable. To make the message more precise, this commit adds a readability check. Fixes: 2193e1604eac ("rcutorture: Abstract kvm-recheck.sh") Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh | 2 +- tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh | 2 +- tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh index 43f764098e50..2de92f43ee8c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh @@ -23,7 +23,7 @@ # Authors: Paul E. McKenney i="$1" -if test -d $i +if test -d "$i" -a -r "$i" then : else diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh index 559e01ac86be..9e34656bf659 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh @@ -23,7 +23,7 @@ # Authors: Paul E. McKenney i="$1" -if test -d $i +if test -d "$i" -a -r "$i" then : else diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh index 8f3121afc716..6138fd94abfe 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh @@ -23,7 +23,7 @@ # Authors: Paul E. McKenney i="$1" -if test -d $i +if test -d "$i" -a -r "$i" then : else From fa48beb5f485a82a15f777198c770feb6d01c794 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Nov 2017 19:17:27 +0900 Subject: [PATCH 64/72] rcutorture: Simplify logging Both the 'kvm.sh' and 'kvm-test-1-run.sh' scripts log messages by printing the message to 'stdout' and then also printing it into the log file. Generation of the message thus occurs twice, once for 'stdout' and once for the log file. Moreover, many of the messages contain 'date' output, which results in date being invoked twice (once for stdout print, once for log file write). As a result, the date information in stdout and log file can differ, which could cause confusion. This commit therefore simplifies the logging procedure by using 'tee'. Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- .../rcutorture/bin/kvm-test-1-run.sh | 4 +-- tools/testing/selftests/rcutorture/bin/kvm.sh | 32 +++++++------------ 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index ab14b97c942c..0406c67378cb 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -154,9 +154,7 @@ cpu_count=`configfrag_boot_cpus "$boot_args" "$config_template" "$cpu_count"` vcpus=`identify_qemu_vcpus` if test $cpu_count -gt $vcpus then - echo CPU count limited from $cpu_count to $vcpus - touch $resdir/Warnings - echo CPU count limited from $cpu_count to $vcpus >> $resdir/Warnings + echo CPU count limited from $cpu_count to $vcpus | tee -a $resdir/Warnings cpu_count=$vcpus fi qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`" diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index d2a4fd94de6a..7d1f607f0f76 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -331,8 +331,7 @@ awk < $T/cfgcpu.pack \ # Dump out the scripting required to run one test batch. function dump(first, pastlast, batchnum) { - print "echo ----Start batch " batchnum ": `date`"; - print "echo ----Start batch " batchnum ": `date` >> " rd "/log"; + print "echo ----Start batch " batchnum ": `date` | tee -a " rd "log"; print "needqemurun=" jn=1 for (j = first; j < pastlast; j++) { @@ -349,21 +348,18 @@ function dump(first, pastlast, batchnum) ovf = "-ovf"; else ovf = ""; - print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. `date`"; - print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. `date` >> " rd "/log"; + print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. `date` | tee -a " rd "log"; print "rm -f " builddir ".*"; print "touch " builddir ".wait"; print "mkdir " builddir " > /dev/null 2>&1 || :"; print "mkdir " rd cfr[jn] " || :"; print "kvm-test-1-run.sh " CONFIGDIR cf[j], builddir, rd cfr[jn], dur " \"" TORTURE_QEMU_ARG "\" \"" TORTURE_BOOTARGS "\" > " rd cfr[jn] "/kvm-test-1-run.sh.out 2>&1 &" - print "echo ", cfr[jn], cpusr[jn] ovf ": Waiting for build to complete. `date`"; - print "echo ", cfr[jn], cpusr[jn] ovf ": Waiting for build to complete. `date` >> " rd "/log"; + print "echo ", cfr[jn], cpusr[jn] ovf ": Waiting for build to complete. `date` | tee -a " rd "log"; print "while test -f " builddir ".wait" print "do" print "\tsleep 1" print "done" - print "echo ", cfr[jn], cpusr[jn] ovf ": Build complete. `date`"; - print "echo ", cfr[jn], cpusr[jn] ovf ": Build complete. `date` >> " rd "/log"; + print "echo ", cfr[jn], cpusr[jn] ovf ": Build complete. `date` | tee -a " rd "log"; jn++; } for (j = 1; j < jn; j++) { @@ -371,8 +367,7 @@ function dump(first, pastlast, batchnum) print "rm -f " builddir ".ready" print "if test -f \"" rd cfr[j] "/builtkernel\"" print "then" - print "\techo ----", cfr[j], cpusr[j] ovf ": Kernel present. `date`"; - print "\techo ----", cfr[j], cpusr[j] ovf ": Kernel present. `date` >> " rd "/log"; + print "\techo ----", cfr[j], cpusr[j] ovf ": Kernel present. `date` | tee -a " rd "log"; print "\tneedqemurun=1" print "fi" } @@ -386,31 +381,26 @@ function dump(first, pastlast, batchnum) njitter = ja[1]; if (TORTURE_BUILDONLY && njitter != 0) { njitter = 0; - print "echo Build-only run, so suppressing jitter >> " rd "/log" + print "echo Build-only run, so suppressing jitter | tee -a " rd "log" } if (TORTURE_BUILDONLY) { print "needqemurun=" } print "if test -n \"$needqemurun\"" print "then" - print "\techo ---- Starting kernels. `date`"; - print "\techo ---- Starting kernels. `date` >> " rd "/log"; + print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; for (j = 0; j < njitter; j++) print "\tjitter.sh " j " " dur " " ja[2] " " ja[3] "&" print "\twait" - print "\techo ---- All kernel runs complete. `date`"; - print "\techo ---- All kernel runs complete. `date` >> " rd "/log"; + print "\techo ---- All kernel runs complete. `date` | tee -a " rd "log"; print "else" print "\twait" - print "\techo ---- No kernel runs. `date`"; - print "\techo ---- No kernel runs. `date` >> " rd "/log"; + print "\techo ---- No kernel runs. `date` | tee -a " rd "log"; print "fi" for (j = 1; j < jn; j++) { builddir=KVM "/b" j - print "echo ----", cfr[j], cpusr[j] ovf ": Build/run results:"; - print "echo ----", cfr[j], cpusr[j] ovf ": Build/run results: >> " rd "/log"; - print "cat " rd cfr[j] "/kvm-test-1-run.sh.out"; - print "cat " rd cfr[j] "/kvm-test-1-run.sh.out >> " rd "/log"; + print "echo ----", cfr[j], cpusr[j] ovf ": Build/run results: | tee -a " rd "log"; + print "cat " rd cfr[j] "/kvm-test-1-run.sh.out | tee -a " rd "log"; } } From feef2d286a098c3510322d5c1348432899489214 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Nov 2017 19:17:28 +0900 Subject: [PATCH 65/72] rcutorture: Simplify functions.sh include path Inclusions of 'functions.sh' from 'kvm-test-1-run.sh' and 'kvm-recheck*.sh' use its absolute path. Because the directory containing 'functions.sh' is already in PATH, the full path is unnecessary. This commit therefore simplifies the inclusions to use the short relative path. Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh | 2 +- .../selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh | 2 +- tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh | 2 +- tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 2 +- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh index 9e34656bf659..c2e1bb6d0cba 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh @@ -30,7 +30,7 @@ else echo Unreadable results directory: $i exit 1 fi -. tools/testing/selftests/rcutorture/bin/functions.sh +. functions.sh configfile=`echo $i | sed -e 's/^.*\///'` ngps=`grep ver: $i/console.log 2> /dev/null | tail -1 | sed -e 's/^.* ver: //' -e 's/ .*$//'` diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh index f79b0e9e84fc..963f71289d22 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh @@ -26,7 +26,7 @@ # Authors: Paul E. McKenney i="$1" -. tools/testing/selftests/rcutorture/bin/functions.sh +. functions.sh if test "`grep -c 'rcu_exp_grace_period.*start' < $i/console.log`" -lt 100 then diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh index 6138fd94abfe..ccebf772fa1e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh @@ -31,7 +31,7 @@ else exit 1 fi PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH -. tools/testing/selftests/rcutorture/bin/functions.sh +. functions.sh if kvm-recheck-rcuperf-ftrace.sh $i then diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index f659346d3358..f7e988f369dd 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -25,7 +25,7 @@ # Authors: Paul E. McKenney PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH -. tools/testing/selftests/rcutorture/bin/functions.sh +. functions.sh for rd in "$@" do firsttime=1 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 0406c67378cb..1b78a12740e5 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -42,7 +42,7 @@ T=${TMPDIR-/tmp}/kvm-test-1-run.sh.$$ trap 'rm -rf $T' 0 mkdir $T -. $KVM/bin/functions.sh +. functions.sh . $CONFIGFRAG/ver_functions.sh config_template=${1} From af0695d3fcbf8ac387eb48d1356d1956c6af7fd9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Nov 2017 19:17:26 +0900 Subject: [PATCH 66/72] rcutorture/kvm-build.sh: Skip build directory check Check for build-directory existence and write permissions are provided in both 'kvm-test-1-run.sh' an 'kvm-build.sh'. Because the 'kvm-build.sh' is dependent on 'kvm-test-1-run.sh' ('kvm-build.sh' uses variables that defined from its caller.), these checks are unnecessarily duplicated. This commit therefore removes the check in from the 'kvm-build.sh' script. Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-build.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh index fb66d0173638..34d126734cde 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh @@ -29,11 +29,6 @@ then exit 1 fi builddir=${2} -if test -z "$builddir" -o ! -d "$builddir" -o ! -w "$builddir" -then - echo "kvm-build.sh :$builddir: Not a writable directory, cannot build into it" - exit 1 -fi T=${TMPDIR-/tmp}/test-linux.sh.$$ trap 'rm -rf $T' 0 From 8f9dd8317386b0bcb20cf0bfc832ba2ea67f44d1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 7 Nov 2017 14:10:03 -0800 Subject: [PATCH 67/72] torture: Place all torture-test modules in one MAINTAINERS group There is some confusion about where patches to kernel/torture.c and kernel/locking/locktorture.c should be sent. This commit therefore updates MAINTAINERS appropriately. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra (Intel) --- MAINTAINERS | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index aa71ab52fd76..bdeb64f8bf54 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8194,6 +8194,7 @@ F: arch/*/include/asm/rwsem.h F: include/linux/seqlock.h F: lib/locking*.[ch] F: kernel/locking/ +X: kernel/locking/locktorture.c LOGICAL DISK MANAGER SUPPORT (LDM, Windows 2000/XP/Vista Dynamic Disks) M: "Richard Russon (FlatCap)" @@ -11451,15 +11452,6 @@ L: linux-wireless@vger.kernel.org S: Orphan F: drivers/net/wireless/ray* -RCUTORTURE MODULE -M: Josh Triplett -M: "Paul E. McKenney" -L: linux-kernel@vger.kernel.org -S: Supported -T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git -F: Documentation/RCU/torture.txt -F: kernel/rcu/rcutorture.c - RCUTORTURE TEST FRAMEWORK M: "Paul E. McKenney" M: Josh Triplett @@ -13748,6 +13740,18 @@ L: platform-driver-x86@vger.kernel.org S: Maintained F: drivers/platform/x86/topstar-laptop.c +TORTURE-TEST MODULES +M: Davidlohr Bueso +M: "Paul E. McKenney" +M: Josh Triplett +L: linux-kernel@vger.kernel.org +S: Supported +T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git +F: Documentation/RCU/torture.txt +F: kernel/torture.c +F: kernel/rcu/rcutorture.c +F: kernel/locking/locktorture.c + TOSHIBA ACPI EXTRAS DRIVER M: Azael Avalos L: platform-driver-x86@vger.kernel.org From f2f762608f45353b0b8c37507824f95bb716c3d5 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 15 May 2017 02:07:22 -0700 Subject: [PATCH 68/72] locking/locktorture: Fix rwsem reader_delay We should account for nreader threads, not writers in this callback. Could even trigger a div by 0 if the user explicitly disables writers. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 617cea2520b3..a307a79e6b0b 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -560,7 +560,7 @@ static void torture_rwsem_read_delay(struct torture_random_state *trsp) /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + (cxt.nrealreaders_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 2); else mdelay(longdelay_ms / 2); From 2ce77d16db4240dd2e422fc0a5c26d3e2ec03446 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 15 May 2017 02:07:23 -0700 Subject: [PATCH 69/72] locking/locktorture: Fix num reader/writer corner cases Things can explode for locktorture if the user does combinations of nwriters_stress=0 nreaders_stress=0. Fix this by not assuming we always want to torture writer threads. Reported-by: Jeremy Linton Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney Reviewed-by: Jeremy Linton Tested-by: Jeremy Linton --- kernel/locking/locktorture.c | 74 +++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index a307a79e6b0b..2a1fc2a58910 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -703,8 +703,7 @@ static void __torture_print_stats(char *page, { bool fail = 0; int i, n_stress; - long max = 0; - long min = statp[0].n_lock_acquired; + long max = 0, min = statp ? statp[0].n_lock_acquired : 0; long long sum = 0; n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; @@ -811,7 +810,7 @@ static void lock_torture_cleanup(void) * such, only perform the underlying torture-specific cleanups, * and avoid anything related to locktorture. */ - if (!cxt.lwsa) + if (!cxt.lwsa && !cxt.lrsa) goto end; if (writer_tasks) { @@ -886,6 +885,13 @@ static int __init lock_torture_init(void) firsterr = -EINVAL; goto unwind; } + + if (nwriters_stress == 0 && nreaders_stress == 0) { + pr_alert("lock-torture: must run at least one locking thread\n"); + firsterr = -EINVAL; + goto unwind; + } + if (cxt.cur_ops->init) cxt.cur_ops->init(); @@ -909,17 +915,19 @@ static int __init lock_torture_init(void) #endif /* Initialize the statistics so that each run gets its own numbers. */ + if (nwriters_stress) { + lock_is_write_held = 0; + cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); + if (cxt.lwsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } - lock_is_write_held = 0; - cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); - if (cxt.lwsa == NULL) { - VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < cxt.nrealwriters_stress; i++) { - cxt.lwsa[i].n_lock_fail = 0; - cxt.lwsa[i].n_lock_acquired = 0; + for (i = 0; i < cxt.nrealwriters_stress; i++) { + cxt.lwsa[i].n_lock_fail = 0; + cxt.lwsa[i].n_lock_acquired = 0; + } } if (cxt.cur_ops->readlock) { @@ -936,19 +944,21 @@ static int __init lock_torture_init(void) cxt.nrealreaders_stress = cxt.nrealwriters_stress; } - lock_is_read_held = 0; - cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); - if (cxt.lrsa == NULL) { - VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); - firsterr = -ENOMEM; - kfree(cxt.lwsa); - cxt.lwsa = NULL; - goto unwind; - } + if (nreaders_stress) { + lock_is_read_held = 0; + cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); + if (cxt.lrsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); + firsterr = -ENOMEM; + kfree(cxt.lwsa); + cxt.lwsa = NULL; + goto unwind; + } - for (i = 0; i < cxt.nrealreaders_stress; i++) { - cxt.lrsa[i].n_lock_fail = 0; - cxt.lrsa[i].n_lock_acquired = 0; + for (i = 0; i < cxt.nrealreaders_stress; i++) { + cxt.lrsa[i].n_lock_fail = 0; + cxt.lrsa[i].n_lock_acquired = 0; + } } } @@ -978,12 +988,14 @@ static int __init lock_torture_init(void) goto unwind; } - writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), - GFP_KERNEL); - if (writer_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); - firsterr = -ENOMEM; - goto unwind; + if (nwriters_stress) { + writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), + GFP_KERNEL); + if (writer_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } } if (cxt.cur_ops->readlock) { From 4ced3314fd3a73dabac4e8a41747883eff36c3e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Nov 2017 15:01:02 -0800 Subject: [PATCH 70/72] torture: Make stutter less vulnerable to compilers and races The stutter_wait() function repeatedly fetched stutter_pause_test, and should really just fetch it once on each pass. The races should be harmless, but why have the races? Also, the whole point of the value "2" for stutter_pause_test is to get everyone to start at very nearly the same time, but the value "2" was the first jiffy of the stutter rather than the last jiffy of the stutter. This commit rearranges the code to be more sensible. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/torture.c b/kernel/torture.c index 52781e838541..3bcbd4fbfe18 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -573,18 +573,21 @@ static int stutter; */ void stutter_wait(const char *title) { + int spt; + cond_resched_rcu_qs(); - while (READ_ONCE(stutter_pause_test) || - (torture_runnable && !READ_ONCE(*torture_runnable))) { - if (stutter_pause_test) - if (READ_ONCE(stutter_pause_test) == 1) - schedule_timeout_interruptible(1); - else - while (READ_ONCE(stutter_pause_test)) - cond_resched(); - else + spt = READ_ONCE(stutter_pause_test); + while (spt || (torture_runnable && !READ_ONCE(*torture_runnable))) { + if (spt == 1) { + schedule_timeout_interruptible(1); + } else if (spt == 2) { + while (READ_ONCE(stutter_pause_test)) + cond_resched(); + } else { schedule_timeout_interruptible(round_jiffies_relative(HZ)); + } torture_shutdown_absorb(title); + spt = READ_ONCE(stutter_pause_test); } } EXPORT_SYMBOL_GPL(stutter_wait); @@ -597,17 +600,15 @@ static int torture_stutter(void *arg) { VERBOSE_TOROUT_STRING("torture_stutter task started"); do { - if (!torture_must_stop()) { - if (stutter > 1) { - schedule_timeout_interruptible(stutter - 1); - WRITE_ONCE(stutter_pause_test, 2); - } - schedule_timeout_interruptible(1); + if (!torture_must_stop() && stutter > 1) { WRITE_ONCE(stutter_pause_test, 1); + schedule_timeout_interruptible(stutter - 1); + WRITE_ONCE(stutter_pause_test, 2); + schedule_timeout_interruptible(1); } + WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) schedule_timeout_interruptible(stutter); - WRITE_ONCE(stutter_pause_test, 0); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); From a2f2577d96ad060b65eb909dd39b57d676754119 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Nov 2017 20:19:17 -0800 Subject: [PATCH 71/72] torture: Eliminate torture_runnable and perf_runnable The purpose of torture_runnable is to allow rcutorture and locktorture to be started and stopped via sysfs when they are built into the kernel (as in not compiled as loadable modules). However, the 0444 permissions for both instances of torture_runnable prevent this use case from ever being put into practice. Given that there have been no complaints about this deficiency, it is reasonable to conclude that no one actually makes use of this sysfs capability. The perf_runnable module parameter for rcuperf is in the same situation. This commit therefore removes both torture_runnable instances as well as perf_runnable. Reported-by: Thomas Gleixner Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 9 --------- Documentation/locking/locktorture.txt | 5 ----- include/linux/torture.h | 2 +- kernel/locking/locktorture.c | 6 +----- kernel/rcu/rcuperf.c | 6 +----- kernel/rcu/rcutorture.c | 6 +----- kernel/torture.c | 6 ++---- .../selftests/rcutorture/configs/lock/ver_functions.sh | 1 - .../selftests/rcutorture/configs/rcu/ver_functions.sh | 1 - .../rcutorture/configs/rcuperf/ver_functions.sh | 1 - 10 files changed, 6 insertions(+), 37 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6571fbfdb2a1..66d471f0b92e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2049,9 +2049,6 @@ This tests the locking primitive's ability to transition abruptly to and from idle. - locktorture.torture_runnable= [BOOT] - Start locktorture running at boot time. - locktorture.torture_type= [KNL] Specify the locking implementation to test. @@ -3459,9 +3456,6 @@ the same as for rcuperf.nreaders. N, where N is the number of CPUs - rcuperf.perf_runnable= [BOOT] - Start rcuperf running at boot time. - rcuperf.perf_type= [KNL] Specify the RCU implementation to test. @@ -3595,9 +3589,6 @@ Test RCU's dyntick-idle handling. See also the rcutorture.shuffle_interval parameter. - rcutorture.torture_runnable= [BOOT] - Start rcutorture running at boot time. - rcutorture.torture_type= [KNL] Specify the RCU implementation to test. diff --git a/Documentation/locking/locktorture.txt b/Documentation/locking/locktorture.txt index a2ef3a929bf1..6a8df4cd19bf 100644 --- a/Documentation/locking/locktorture.txt +++ b/Documentation/locking/locktorture.txt @@ -57,11 +57,6 @@ torture_type Type of lock to torture. By default, only spinlocks will o "rwsem_lock": read/write down() and up() semaphore pairs. -torture_runnable Start locktorture at boot time in the case where the - module is built into the kernel, otherwise wait for - torture_runnable to be set via sysfs before starting. - By default it will begin once the module is loaded. - ** Torture-framework (RCU + locking) ** diff --git a/include/linux/torture.h b/include/linux/torture.h index 907d266aaddc..66272862070b 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -79,7 +79,7 @@ void stutter_wait(const char *title); int torture_stutter_init(int s); /* Initialization and cleanup. */ -bool torture_init_begin(char *ttype, bool v, int *runnable); +bool torture_init_begin(char *ttype, bool v); void torture_init_end(void); bool torture_cleanup_begin(void); void torture_cleanup_end(void); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 2a1fc2a58910..6850ffd69125 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -77,10 +77,6 @@ struct lock_stress_stats { long n_lock_acquired; }; -int torture_runnable = IS_ENABLED(MODULE); -module_param(torture_runnable, int, 0444); -MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); - /* Forward reference. */ static void lock_torture_cleanup(void); @@ -866,7 +862,7 @@ static int __init lock_torture_init(void) &percpu_rwsem_lock_ops, }; - if (!torture_init_begin(torture_type, verbose, &torture_runnable)) + if (!torture_init_begin(torture_type, verbose)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 1f87a02c3399..d1ebdf9868bb 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -106,10 +106,6 @@ static int rcu_perf_writer_state; #define MAX_MEAS 10000 #define MIN_MEAS 100 -static int perf_runnable = IS_ENABLED(MODULE); -module_param(perf_runnable, int, 0444); -MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); - /* * Operations vector for selecting different types of tests. */ @@ -646,7 +642,7 @@ rcu_perf_init(void) &tasks_ops, }; - if (!torture_init_begin(perf_type, verbose, &perf_runnable)) + if (!torture_init_begin(perf_type, verbose)) return -EBUSY; /* Process args and tell the world that the perf'er is on the job. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1074ecc3f72f..308e6fdbced8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -187,10 +187,6 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -static int torture_runnable = IS_ENABLED(MODULE); -module_param(torture_runnable, int, 0444); -MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); - #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) #define rcu_can_boost() 1 #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ @@ -1729,7 +1725,7 @@ rcu_torture_init(void) &sched_ops, &tasks_ops, }; - if (!torture_init_begin(torture_type, verbose, &torture_runnable)) + if (!torture_init_begin(torture_type, verbose)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ diff --git a/kernel/torture.c b/kernel/torture.c index 3bcbd4fbfe18..572576ad9f58 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -61,7 +61,6 @@ static bool verbose; #define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */ static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); -static int *torture_runnable; #ifdef CONFIG_HOTPLUG_CPU @@ -577,7 +576,7 @@ void stutter_wait(const char *title) cond_resched_rcu_qs(); spt = READ_ONCE(stutter_pause_test); - while (spt || (torture_runnable && !READ_ONCE(*torture_runnable))) { + while (spt) { if (spt == 1) { schedule_timeout_interruptible(1); } else if (spt == 2) { @@ -649,7 +648,7 @@ static void torture_stutter_cleanup(void) * The runnable parameter points to a flag that controls whether or not * the test is currently runnable. If there is no such flag, pass in NULL. */ -bool torture_init_begin(char *ttype, bool v, int *runnable) +bool torture_init_begin(char *ttype, bool v) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { @@ -661,7 +660,6 @@ bool torture_init_begin(char *ttype, bool v, int *runnable) } torture_type = ttype; verbose = v; - torture_runnable = runnable; fullstop = FULLSTOP_DONTSTOP; return true; } diff --git a/tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh index 252aae618984..80eb646e1319 100644 --- a/tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh @@ -38,6 +38,5 @@ per_version_boot_params () { echo $1 `locktorture_param_onoff "$1" "$2"` \ locktorture.stat_interval=15 \ locktorture.shutdown_secs=$3 \ - locktorture.torture_runnable=1 \ locktorture.verbose=1 } diff --git a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh index ffb85ed786fa..24ec91041957 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh @@ -51,7 +51,6 @@ per_version_boot_params () { `rcutorture_param_n_barrier_cbs "$1"` \ rcutorture.stat_interval=15 \ rcutorture.shutdown_secs=$3 \ - rcutorture.torture_runnable=1 \ rcutorture.test_no_idle_hz=1 \ rcutorture.verbose=1 } diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh index 34f2a1b35ee5..b9603115d7c7 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh @@ -46,7 +46,6 @@ rcuperf_param_nwriters () { per_version_boot_params () { echo $1 `rcuperf_param_nreaders "$1"` \ `rcuperf_param_nwriters "$1"` \ - rcuperf.perf_runnable=1 \ rcuperf.shutdown=1 \ rcuperf.verbose=1 } From 29d3939084583b26a5487be64b9523e61468f1be Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Nov 2017 22:07:59 -0800 Subject: [PATCH 72/72] torture: Save a line in stutter_wait(): while -> for Signed-off-by: Paul E. McKenney --- kernel/torture.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/torture.c b/kernel/torture.c index 572576ad9f58..37b94012a3f8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -576,7 +576,7 @@ void stutter_wait(const char *title) cond_resched_rcu_qs(); spt = READ_ONCE(stutter_pause_test); - while (spt) { + for (; spt; spt = READ_ONCE(stutter_pause_test)) { if (spt == 1) { schedule_timeout_interruptible(1); } else if (spt == 2) { @@ -586,7 +586,6 @@ void stutter_wait(const char *title) schedule_timeout_interruptible(round_jiffies_relative(HZ)); } torture_shutdown_absorb(title); - spt = READ_ONCE(stutter_pause_test); } } EXPORT_SYMBOL_GPL(stutter_wait);