From 01b4c39901e087ceebae2733857248de81476bd8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Jul 2019 15:22:59 +0200 Subject: [PATCH 01/42] nohz: Add TICK_DEP_BIT_RCU If a nohz_full CPU is looping in the kernel, the scheduling-clock tick might nevertheless remain disabled. In !PREEMPT kernels, this can prevent RCU's attempts to enlist the aid of that CPU's executions of cond_resched(), which can in turn result in an arbitrarily delayed grace period and thus an OOM. RCU therefore needs a way to enable a holdout nohz_full CPU's scheduler-clock interrupt. This commit therefore provides a new TICK_DEP_BIT_RCU value which RCU can pass to tick_dep_set_cpu() and friends to force on the scheduler-clock interrupt for a specified CPU or task. In some cases, rcutorture needs to turn on the scheduler-clock tick, so this commit also exports the relevant symbols to GPL-licensed modules. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/tick.h | 7 ++++++- include/trace/events/timer.h | 3 ++- kernel/time/tick-sched.c | 7 +++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/tick.h b/include/linux/tick.h index f92a10b5e112..39eb44564058 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -108,7 +108,8 @@ enum tick_dep_bits { TICK_DEP_BIT_POSIX_TIMER = 0, TICK_DEP_BIT_PERF_EVENTS = 1, TICK_DEP_BIT_SCHED = 2, - TICK_DEP_BIT_CLOCK_UNSTABLE = 3 + TICK_DEP_BIT_CLOCK_UNSTABLE = 3, + TICK_DEP_BIT_RCU = 4 }; #define TICK_DEP_MASK_NONE 0 @@ -116,6 +117,7 @@ enum tick_dep_bits { #define TICK_DEP_MASK_PERF_EVENTS (1 << TICK_DEP_BIT_PERF_EVENTS) #define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) #define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) +#define TICK_DEP_MASK_RCU (1 << TICK_DEP_BIT_RCU) #ifdef CONFIG_NO_HZ_COMMON extern bool tick_nohz_enabled; @@ -268,6 +270,9 @@ static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } +static inline void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { } +static inline void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { } + static inline void tick_dep_set(enum tick_dep_bits bit) { } static inline void tick_dep_clear(enum tick_dep_bits bit) { } static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) { } diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index b7a904825e7d..295517f109d7 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -367,7 +367,8 @@ TRACE_EVENT(itimer_expire, tick_dep_name(POSIX_TIMER) \ tick_dep_name(PERF_EVENTS) \ tick_dep_name(SCHED) \ - tick_dep_name_end(CLOCK_UNSTABLE) + tick_dep_name(CLOCK_UNSTABLE) \ + tick_dep_name_end(RCU) #undef tick_dep_name #undef tick_dep_mask_name diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 955851748dc3..d1b0a84b6112 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -198,6 +198,11 @@ static bool check_tick_dependency(atomic_t *dep) return true; } + if (val & TICK_DEP_MASK_RCU) { + trace_tick_stop(0, TICK_DEP_MASK_RCU); + return true; + } + return false; } @@ -324,6 +329,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) preempt_enable(); } } +EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { @@ -331,6 +337,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) atomic_andnot(BIT(bit), &ts->tick_dep_mask); } +EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* * Set a per-task tick dependency. Posix CPU timers need this in order to elapse From ae9e557b5be2e285f48ee945d9c8faf75d4f6a66 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 9 Aug 2019 16:29:42 -0700 Subject: [PATCH 02/42] time: Export tick start/stop functions for rcutorture It turns out that rcutorture needs to ensure that the scheduling-clock interrupt is enabled in CONFIG_NO_HZ_FULL kernels before starting on CPU-bound in-kernel processing. This commit therefore exports tick_nohz_dep_set_task(), tick_nohz_dep_clear_task(), and tick_nohz_full_setup() to GPL kernel modules. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/time/tick-sched.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d1b0a84b6112..1ffdb4ba1ded 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -172,6 +172,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; bool tick_nohz_full_running; +EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) @@ -351,11 +352,13 @@ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) */ tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); } +EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } +EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse @@ -404,6 +407,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } +EXPORT_SYMBOL_GPL(tick_nohz_full_setup); static int tick_nohz_cpu_down(unsigned int cpu) { From 6a949b7af82db7eb1e52caaed122eab1cf63acee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 28 Jul 2019 11:50:56 -0700 Subject: [PATCH 03/42] rcu: Force on tick when invoking lots of callbacks Callback invocation can run for a significant time period, and within CONFIG_NO_HZ_FULL=y kernels, this period will be devoid of scheduler-clock interrupts. In-kernel execution without such interrupts can cause all manner of malfunction, with RCU CPU stall warnings being but one result. This commit therefore forces scheduling-clock interrupts on whenever more than a few RCU callbacks are invoked. Because offloaded callback invocation can be preempted, this forcing is withdrawn on each context switch. This in turn requires that the loop invoking RCU callbacks reiterate the forcing periodically. [ paulmck: Apply Joel Fernandes TICK_DEP_MASK_RCU->TICK_DEP_BIT_RCU fix. ] [ paulmck: Remove NO_HZ_FULL check per Frederic Weisbecker feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81105141b6a8..238f93b4b0a4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2151,6 +2151,7 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_unlock_irqrestore(rdp, flags); /* Invoke callbacks. */ + tick_dep_set_task(current, TICK_DEP_BIT_RCU); rhp = rcu_cblist_dequeue(&rcl); for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { debug_rcu_head_unqueue(rhp); @@ -2217,6 +2218,7 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Re-invoke RCU core processing if there are callbacks remaining. */ if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_core(); + tick_dep_clear_task(current, TICK_DEP_BIT_RCU); } /* From d38e6dc6ed0dfef8d323354031a1ee1a7cfdedc1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 28 Jul 2019 12:00:48 -0700 Subject: [PATCH 04/42] rcutorture: Force on tick for readers and callback flooders Readers and callback flooders in the rcutorture stress-test suite run for extended time periods by design. They do take pains to relinquish the CPU from time to time, but in some cases this relies on the scheduler being active, which in turn relies on the scheduler-clock interrupt firing from time to time. This commit therefore forces scheduling-clock interrupts within these loops. While in the area, this commit also prevents rcu_torture_reader()'s occasional timed sleeps from delaying shutdown. [ paulmck: Apply Joel Fernandes TICK_DEP_MASK_RCU->TICK_DEP_BIT_RCU fix. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3c9feca1eab1..ab61f5c1353b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -44,6 +44,7 @@ #include #include #include +#include #include "rcu.h" @@ -1363,15 +1364,15 @@ rcu_torture_reader(void *arg) set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) timer_setup_on_stack(&t, rcu_torture_timer, 0); - + tick_dep_set_task(current, TICK_DEP_BIT_RCU); do { if (irqreader && cur_ops->irq_capable) { if (!timer_pending(&t)) mod_timer(&t, jiffies + 1); } - if (!rcu_torture_one_read(&rand)) + if (!rcu_torture_one_read(&rand) && !torture_must_stop()) schedule_timeout_interruptible(HZ); - if (time_after(jiffies, lastsleep)) { + if (time_after(jiffies, lastsleep) && !torture_must_stop()) { schedule_timeout_interruptible(1); lastsleep = jiffies + 10; } @@ -1383,6 +1384,7 @@ rcu_torture_reader(void *arg) del_timer_sync(&t); destroy_timer_on_stack(&t); } + tick_dep_clear_task(current, TICK_DEP_BIT_RCU); torture_kthread_stopping("rcu_torture_reader"); return 0; } @@ -1729,10 +1731,10 @@ static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) // Real call_rcu() floods hit userspace, so emulate that. if (need_resched() || (iter & 0xfff)) schedule(); - } else { - // No userspace emulation: CB invocation throttles call_rcu() - cond_resched(); + return; } + // No userspace emulation: CB invocation throttles call_rcu() + cond_resched(); } /* @@ -1865,6 +1867,7 @@ static void rcu_torture_fwd_prog_cr(void) cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); rcu_launder_gp_seq_start = gps; + tick_dep_set_task(current, TICK_DEP_BIT_RCU); while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { @@ -1911,6 +1914,7 @@ static void rcu_torture_fwd_prog_cr(void) rcu_torture_fwd_cb_hist(); } schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ + tick_dep_clear_task(current, TICK_DEP_BIT_RCU); WRITE_ONCE(rcu_fwd_cb_nodelay, false); } From 366237e7b0833faa2d8da7a8d7d7da8c3ca802e5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jul 2019 08:01:01 -0700 Subject: [PATCH 05/42] stop_machine: Provide RCU quiescent state in multi_cpu_stop() When multi_cpu_stop() loops waiting for other tasks, it can trigger an RCU CPU stall warning. This can be misleading because what is instead needed is information on whatever task is blocking multi_cpu_stop(). This commit therefore inserts an RCU quiescent state into the multi_cpu_stop() function's waitloop. Signed-off-by: Paul E. McKenney --- include/linux/rcutree.h | 1 + kernel/rcu/tree.c | 2 +- kernel/stop_machine.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 18b1ed9864b0..c5147de885ec 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -37,6 +37,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); +void rcu_momentary_dyntick_idle(void); unsigned long get_state_synchronize_rcu(void); void cond_synchronize_rcu(unsigned long oldstate); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 238f93b4b0a4..a5c296d202ae 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -364,7 +364,7 @@ bool rcu_eqs_special_set(int cpu) * * The caller must have disabled interrupts and must not be idle. */ -static void __maybe_unused rcu_momentary_dyntick_idle(void) +void rcu_momentary_dyntick_idle(void) { int special; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c7031a22aa7b..34c4f117d8c7 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -233,6 +233,7 @@ static int multi_cpu_stop(void *data) */ touch_nmi_watchdog(); } + rcu_momentary_dyntick_idle(); } while (curstate != MULTI_STOP_EXIT); local_irq_restore(flags); From 96926686deab853bcacf887501f4ed958e38666b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Aug 2019 15:12:47 -0700 Subject: [PATCH 06/42] rcu: Make CPU-hotplug removal operations enable tick CPU-hotplug removal operations run the multi_cpu_stop() function, which relies on the scheduler to gain control from whatever is running on the various online CPUs, including any nohz_full CPUs running long loops in kernel-mode code. Lack of the scheduler-clock interrupt on such CPUs can delay multi_cpu_stop() for several minutes and can also result in RCU CPU stall warnings. This commit therefore causes CPU-hotplug removal operations to enable the scheduler-clock interrupt on all online CPUs. [ paulmck: Apply Joel Fernandes TICK_DEP_MASK_RCU->TICK_DEP_BIT_RCU fix. ] [ paulmck: Apply simplifications suggested by Frederic Weisbecker. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a5c296d202ae..7c67ea561b36 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2101,6 +2101,9 @@ int rcutree_dead_cpu(unsigned int cpu) rcu_boost_kthread_setaffinity(rnp, -1); /* Do any needed no-CB deferred wakeups from this CPU. */ do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu)); + + // Stop-machine done, so allow nohz_full to disable tick. + tick_dep_clear(TICK_DEP_BIT_RCU); return 0; } @@ -3085,6 +3088,9 @@ int rcutree_online_cpu(unsigned int cpu) return 0; /* Too early in boot for scheduler work. */ sync_sched_exp_online_cleanup(cpu); rcutree_affinity_setting(cpu, -1); + + // Stop-machine done, so allow nohz_full to disable tick. + tick_dep_clear(TICK_DEP_BIT_RCU); return 0; } @@ -3105,6 +3111,9 @@ int rcutree_offline_cpu(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcutree_affinity_setting(cpu, cpu); + + // nohz_full CPUs need the tick for stop-machine to work quickly + tick_dep_set(TICK_DEP_BIT_RCU); return 0; } From 79ba7ff5a9925f5c170f51ed7a96d1475eb6c27f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 Aug 2019 13:17:35 -0700 Subject: [PATCH 07/42] rcutorture: Emulate dyntick aspect of userspace nohz_full sojourn During an actual call_rcu() flood, there would be frequent trips to userspace (in-kernel call_rcu() floods must be otherwise housebroken). Userspace execution on nohz_full CPUs implies an RCU dyntick idle/not-idle transition pair, so this commit adds emulation of that pair. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 1 + kernel/rcu/rcutorture.c | 11 +++++++++++ kernel/rcu/tree.c | 1 + 3 files changed, 13 insertions(+) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 9bf1dfe7781f..37b6f0c2b79d 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -84,6 +84,7 @@ static inline void rcu_scheduler_starting(void) { } #endif /* #else #ifndef CONFIG_SRCU */ static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_is_watching(void) { return true; } +static inline void rcu_momentary_dyntick_idle(void) { } /* Avoid RCU read-side critical sections leaking across. */ static inline void rcu_all_qs(void) { barrier(); } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ab61f5c1353b..49ad88765ed2 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1761,6 +1761,11 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) kfree(rfcp); freed++; rcu_torture_fwd_prog_cond_resched(freed); + if (tick_nohz_full_enabled()) { + local_irq_save(flags); + rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } } return freed; } @@ -1835,6 +1840,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) static void rcu_torture_fwd_prog_cr(void) { unsigned long cver; + unsigned long flags; unsigned long gps; int i; long n_launders; @@ -1894,6 +1900,11 @@ static void rcu_torture_fwd_prog_cr(void) } cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); + if (tick_nohz_full_enabled()) { + local_irq_save(flags); + rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } } stoppedat = jiffies; n_launders_cb_snap = READ_ONCE(n_launders_cb); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7c67ea561b36..66354ef776aa 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -375,6 +375,7 @@ void rcu_momentary_dyntick_idle(void) WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); rcu_preempt_deferred_qs(current); } +EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); /** * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle From ac5f636130c2014eb535f30951460b75db6cbe04 Mon Sep 17 00:00:00 2001 From: Ethan Hansen <1ethanhansen@gmail.com> Date: Thu, 1 Aug 2019 14:00:40 -0700 Subject: [PATCH 08/42] rcu: Remove unused function rcutorture_record_progress() The function rcutorture_record_progress() is declared in rcu.h, but is never used. This commit therefore removes rcutorture_record_progress() to clean code. Signed-off-by: Ethan Hansen <1ethanhansen@gmail.com> Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 8fd4f82c9b3d..aeec70fda82c 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -455,7 +455,6 @@ enum rcutorture_type { #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq); -void rcutorture_record_progress(unsigned long vernum); void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, @@ -468,7 +467,6 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, *flags = 0; *gp_seq = 0; } -static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, From c5d3c8ca22d487ad9a18da9a7722a76e9bdbf4fd Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Fri, 2 Aug 2019 09:46:56 +0800 Subject: [PATCH 09/42] locktorture: Replace strncmp() with str_has_prefix() The strncmp() function is error-prone because it is easy to get the length wrong, especially if the string is subject to change, especially given the need to account for the terminating nul byte. This commit therefore substitutes the newly introduced str_has_prefix(), which does not require a separately specified length. Signed-off-by: Chuhong Yuan Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index c513031cd7e3..8dd900247205 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -889,16 +889,16 @@ static int __init lock_torture_init(void) cxt.nrealwriters_stress = 2 * num_online_cpus(); #ifdef CONFIG_DEBUG_MUTEXES - if (strncmp(torture_type, "mutex", 5) == 0) + if (str_has_prefix(torture_type, "mutex")) cxt.debug_lock = true; #endif #ifdef CONFIG_DEBUG_RT_MUTEXES - if (strncmp(torture_type, "rtmutex", 7) == 0) + if (str_has_prefix(torture_type, "rtmutex")) cxt.debug_lock = true; #endif #ifdef CONFIG_DEBUG_SPINLOCK - if ((strncmp(torture_type, "spin", 4) == 0) || - (strncmp(torture_type, "rw_lock", 7) == 0)) + if ((str_has_prefix(torture_type, "spin")) || + (str_has_prefix(torture_type, "rw_lock"))) cxt.debug_lock = true; #endif From 9f8ba55d49cef46da63f7863ec544e2b2b7eda66 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Aug 2019 20:18:25 -0700 Subject: [PATCH 10/42] rcutorture: Remove CONFIG_HOTPLUG_CPU=n from scenarios A number of mainstream CPU families are no longer capable of building kernels having CONFIG_SMP=y and CONFIG_HOTPLUG_CPU=n, so this commit removes this combination from the rcutorture scenarios having it. People wishing to try out this combination may still do so using the "--kconfig CONFIG_HOTPLUG_CPU=n CONFIG_SUSPEND=n CONFIG_HIBERNATION=n" argument to the tools/testing/selftests/rcutorture/bin/kvm.sh script that is used to run rcutorture. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TASKS03 | 3 --- tools/testing/selftests/rcutorture/configs/rcu/TREE02 | 3 --- tools/testing/selftests/rcutorture/configs/rcu/TREE04 | 3 --- tools/testing/selftests/rcutorture/configs/rcu/TREE06 | 3 --- tools/testing/selftests/rcutorture/configs/rcu/TREE08 | 3 --- tools/testing/selftests/rcutorture/configs/rcu/TREE09 | 3 --- tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL | 3 --- tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 1 - 8 files changed, 22 deletions(-) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 index 28568b72a31b..ea4399020c6c 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 @@ -1,8 +1,5 @@ CONFIG_SMP=y CONFIG_NR_CPUS=2 -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n CONFIG_PREEMPT_NONE=n CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 b/tools/testing/selftests/rcutorture/configs/rcu/TREE02 index 35e639e39366..65daee4fbf5a 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02 @@ -9,9 +9,6 @@ CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n CONFIG_RCU_TRACE=n -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=3 CONFIG_RCU_FANOUT_LEAF=3 CONFIG_RCU_NOCB_CPU=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 24c9f6012e35..f6d6a40c0576 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -9,9 +9,6 @@ CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y CONFIG_RCU_FAST_NO_HZ=y CONFIG_RCU_TRACE=y -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=4 CONFIG_RCU_FANOUT_LEAF=3 CONFIG_DEBUG_LOCK_ALLOC=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 b/tools/testing/selftests/rcutorture/configs/rcu/TREE06 index 05a4eec3f27b..bf4980d606b5 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06 @@ -9,9 +9,6 @@ CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n CONFIG_RCU_TRACE=n -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=6 CONFIG_RCU_FANOUT_LEAF=6 CONFIG_RCU_NOCB_CPU=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 b/tools/testing/selftests/rcutorture/configs/rcu/TREE08 index fb1c763c10c5..c810c5276a89 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08 @@ -9,9 +9,6 @@ CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n CONFIG_RCU_TRACE=n -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=3 CONFIG_RCU_FANOUT_LEAF=2 CONFIG_RCU_NOCB_CPU=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 index 6710e749d9de..8523a7515cbf 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 @@ -8,9 +8,6 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_TRACE=n -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_RCU_BOOST=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL index 4d8eb5bfb6f6..5d546efa68e8 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL @@ -6,9 +6,6 @@ CONFIG_PREEMPT=n CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index af6fca03602f..1b96d68473b8 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -6,7 +6,6 @@ Kconfig Parameters: CONFIG_DEBUG_LOCK_ALLOC -- Do three, covering CONFIG_PROVE_LOCKING & not. CONFIG_DEBUG_OBJECTS_RCU_HEAD -- Do one. -CONFIG_HOTPLUG_CPU -- Do half. (Every second.) CONFIG_HZ_PERIODIC -- Do one. CONFIG_NO_HZ_IDLE -- Do those not otherwise specified. (Groups of two.) CONFIG_NO_HZ_FULL -- Do two, one with partial CPU enablement. From b3ffb206ddd7f07d83bafd10e1b403df57055af4 Mon Sep 17 00:00:00 2001 From: Ethan Hansen <1ethanhansen@gmail.com> Date: Wed, 7 Aug 2019 17:27:32 -0700 Subject: [PATCH 11/42] rcu: Remove unused variable rcu_perf_writer_state The variable rcu_perf_writer_state is declared and initialized, but is never actually referenced. Remove it to clean code. Signed-off-by: Ethan Hansen <1ethanhansen@gmail.com> [ paulmck: Also removed unused macros assigned to that variable. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 5a879d073c1c..5f884d560384 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -109,15 +109,6 @@ static unsigned long b_rcu_perf_writer_started; static unsigned long b_rcu_perf_writer_finished; static DEFINE_PER_CPU(atomic_t, n_async_inflight); -static int rcu_perf_writer_state; -#define RTWS_INIT 0 -#define RTWS_ASYNC 1 -#define RTWS_BARRIER 2 -#define RTWS_EXP_SYNC 3 -#define RTWS_SYNC 4 -#define RTWS_IDLE 5 -#define RTWS_STOPPING 6 - #define MAX_MEAS 10000 #define MIN_MEAS 100 @@ -404,25 +395,20 @@ retry: if (!rhp) rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { - rcu_perf_writer_state = RTWS_ASYNC; atomic_inc(this_cpu_ptr(&n_async_inflight)); cur_ops->async(rhp, rcu_perf_async_cb); rhp = NULL; } else if (!kthread_should_stop()) { - rcu_perf_writer_state = RTWS_BARRIER; cur_ops->gp_barrier(); goto retry; } else { kfree(rhp); /* Because we are stopping. */ } } else if (gp_exp) { - rcu_perf_writer_state = RTWS_EXP_SYNC; cur_ops->exp_sync(); } else { - rcu_perf_writer_state = RTWS_SYNC; cur_ops->sync(); } - rcu_perf_writer_state = RTWS_IDLE; t = ktime_get_mono_fast_ns(); *wdp = t - *wdp; i_max = i; @@ -463,10 +449,8 @@ retry: rcu_perf_wait_shutdown(); } while (!torture_must_stop()); if (gp_async) { - rcu_perf_writer_state = RTWS_BARRIER; cur_ops->gp_barrier(); } - rcu_perf_writer_state = RTWS_STOPPING; writer_n_durations[me] = i_max; torture_kthread_stopping("rcu_perf_writer"); return 0; From 8b5ddf8b99dc42241d1d413c6685bce18275c40e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Aug 2019 12:02:40 -0700 Subject: [PATCH 12/42] rcutorture: Separate warnings for each failure type Currently, each of six different types of failure triggers a single WARN_ON_ONCE(), and it is then necessary to stare at the rcu_torture_stats(), Reader Pipe, and Reader Batch lines looking for inappropriately non-zero values. This can be annoying and error-prone, so this commit provides a separate WARN_ON_ONCE() for each of the six error conditions and adds short comments to each to ease error identification. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3c9feca1eab1..5ac467293803 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1442,15 +1442,18 @@ rcu_torture_stats_print(void) n_rcu_torture_barrier_error); pr_alert("%s%s ", torture_type, TORTURE_FLAG); - if (atomic_read(&n_rcu_torture_mberror) != 0 || - n_rcu_torture_barrier_error != 0 || - n_rcu_torture_boost_ktrerror != 0 || - n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_failure != 0 || + if (atomic_read(&n_rcu_torture_mberror) || + n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror || + n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure || i > 1) { pr_cont("%s", "!!! "); atomic_inc(&n_rcu_torture_error); - WARN_ON_ONCE(1); + WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror)); + WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier() + WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread + WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio + WARN_ON_ONCE(n_rcu_torture_boost_failure); // RCU boost failed + WARN_ON_ONCE(i > 1); // Too-short grace period } pr_cont("Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) From fbbd5e358cecb5fa490550ace66463517a7577e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Aug 2019 11:43:53 -0700 Subject: [PATCH 13/42] rcutorture: Make in-kernel-loop testing more brutal The rcu_torture_fwd_prog_nr() tests the ability of RCU to tolerate in-kernel busy loops. It invokes rcu_torture_fwd_prog_cond_resched() within its delay loop, which, in PREEMPT && NO_HZ_FULL kernels results in the occasional direct call to schedule(). Now, this direct call to schedule() is appropriate for call_rcu() flood testing, in which either the kernel should restrain itself or userspace transitions will supply the needed restraint. But in pure in-kernel loops, the occasional cond_resched() should do the job. This commit therefore makes rcu_torture_fwd_prog_nr() use cond_resched() instead of rcu_torture_fwd_prog_cond_resched() in order to increase the brutality of this aspect of rcutorture testing. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5ac467293803..df1caa93ee63 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1806,7 +1806,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) udelay(10); cur_ops->readunlock(idx); if (!fwd_progress_need_resched || need_resched()) - rcu_torture_fwd_prog_cond_resched(1); + cond_resched(); } (*tested_tries)++; if (!time_before(jiffies, stopat) && From 67d64918a163fd62cf3b668d69133b723c48ed96 Mon Sep 17 00:00:00 2001 From: "Wolfgang M. Reimer" Date: Mon, 16 Sep 2019 16:54:04 +0200 Subject: [PATCH 14/42] locking: locktorture: Do not include rwlock.h directly Including rwlock.h directly will cause kernel builds to fail if CONFIG_PREEMPT_RT is defined. The correct header file (rwlock_rt.h OR rwlock.h) will be included by spinlock.h which is included by locktorture.c anyway. Remove the include of linux/rwlock.h. Signed-off-by: Wolfgang M. Reimer Signed-off-by: Sebastian Andrzej Siewior Acked-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8dd900247205..99475a66c94f 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include From daebf24a8e8c6064cba3a330db9fe9376a137d2c Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 6 Sep 2019 16:57:22 -0400 Subject: [PATCH 15/42] tools/memory-model: Fix data race detection for unordered store and load Currently the Linux Kernel Memory Model gives an incorrect response for the following litmus test: C plain-WWC {} P0(int *x) { WRITE_ONCE(*x, 2); } P1(int *x, int *y) { int r1; int r2; int r3; r1 = READ_ONCE(*x); if (r1 == 2) { smp_rmb(); r2 = *x; } smp_rmb(); r3 = READ_ONCE(*x); WRITE_ONCE(*y, r3 - 1); } P2(int *x, int *y) { int r4; r4 = READ_ONCE(*y); if (r4 > 0) WRITE_ONCE(*x, 1); } exists (x=2 /\ 1:r2=2 /\ 2:r4=1) The memory model says that the plain read of *x in P1 races with the WRITE_ONCE(*x) in P2. The problem is that we have a write W and a read R related by neither fre or rfe, but rather W ->coe W' ->rfe R, where W' is an intermediate write (the WRITE_ONCE() in P0). In this situation there is no particular ordering between W and R, so either a wr-vis link from W to R or an rw-xbstar link from R to W would prove that the accesses aren't concurrent. But the LKMM only looks for a wr-vis link, which is equivalent to assuming that W must execute before R. This is not necessarily true on non-multicopy-atomic systems, as the WWC pattern demonstrates. This patch changes the LKMM to accept either a wr-vis or a reverse rw-xbstar link as a proof of non-concurrency. Signed-off-by: Alan Stern Acked-by: Andrea Parri Signed-off-by: Paul E. McKenney --- tools/memory-model/linux-kernel.cat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat index ea2ff4b94074..2a9b4fe4a84e 100644 --- a/tools/memory-model/linux-kernel.cat +++ b/tools/memory-model/linux-kernel.cat @@ -197,7 +197,7 @@ empty (wr-incoh | rw-incoh | ww-incoh) as plain-coherence (* Actual races *) let ww-nonrace = ww-vis & ((Marked * W) | rw-xbstar) & ((W * Marked) | wr-vis) let ww-race = (pre-race & co) \ ww-nonrace -let wr-race = (pre-race & (co? ; rf)) \ wr-vis +let wr-race = (pre-race & (co? ; rf)) \ wr-vis \ rw-xbstar^-1 let rw-race = (pre-race & fr) \ rw-xbstar flag ~empty (ww-race | wr-race | rw-race) as data-race From 3321ea12907abd477ff7e9bf5f365524b8f1f2fc Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 1 Oct 2019 13:39:47 -0400 Subject: [PATCH 16/42] tools/memory-model/Documentation: Fix typos in explanation.txt This patch fixes a few minor typos and improves word usage in a few places in the Linux Kernel Memory Model's explanation.txt file. Signed-off-by: Alan Stern Reviewed-by: Joel Fernandes (Google) Acked-by: Andrea Parri Signed-off-by: Paul E. McKenney --- tools/memory-model/Documentation/explanation.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt index 488f11f6c588..1b5264559cd6 100644 --- a/tools/memory-model/Documentation/explanation.txt +++ b/tools/memory-model/Documentation/explanation.txt @@ -206,7 +206,7 @@ goes like this: P0 stores 1 to buf before storing 1 to flag, since it executes its instructions in order. - Since an instruction (in this case, P1's store to flag) cannot + Since an instruction (in this case, P0's store to flag) cannot execute before itself, the specified outcome is impossible. However, real computer hardware almost never follows the Sequential @@ -419,7 +419,7 @@ example: The object code might call f(5) either before or after g(6); the memory model cannot assume there is a fixed program order relation -between them. (In fact, if the functions are inlined then the +between them. (In fact, if the function calls are inlined then the compiler might even interleave their object code.) @@ -499,7 +499,7 @@ different CPUs (external reads-from, or rfe). For our purposes, a memory location's initial value is treated as though it had been written there by an imaginary initial store that -executes on a separate CPU before the program runs. +executes on a separate CPU before the main program runs. Usage of the rf relation implicitly assumes that loads will always read from a single store. It doesn't apply properly in the presence @@ -955,7 +955,7 @@ atomic update. This is what the LKMM's "atomic" axiom says. THE PRESERVED PROGRAM ORDER RELATION: ppo ----------------------------------------- -There are many situations where a CPU is obligated to execute two +There are many situations where a CPU is obliged to execute two instructions in program order. We amalgamate them into the ppo (for "preserved program order") relation, which links the po-earlier instruction to the po-later instruction and is thus a sub-relation of @@ -1572,7 +1572,7 @@ and there are events X, Y and a read-side critical section C such that: 2. X comes "before" Y in some sense (including rfe, co and fr); - 2. Y is po-before Z; + 3. Y is po-before Z; 4. Z is the rcu_read_unlock() event marking the end of C; From ddc82999f02580f93f9be2b8fb3b10f6139fb281 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 1 Oct 2019 13:40:11 -0400 Subject: [PATCH 17/42] tools/memory-model/Documentation: Put redefinition of rcu-fence into explanation.txt This patch updates the Linux Kernel Memory Model's explanation.txt file to incorporate the introduction of the rcu-order relation and the redefinition of rcu-fence made by commit 15aa25cbf0cc ("tools/memory-model: Change definition of rcu-fence"). Signed-off-by: Alan Stern Acked-by: Andrea Parri Signed-off-by: Paul E. McKenney --- .../Documentation/explanation.txt | 53 +++++++++++++------ 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt index 1b5264559cd6..ecf6cccea5c3 100644 --- a/tools/memory-model/Documentation/explanation.txt +++ b/tools/memory-model/Documentation/explanation.txt @@ -27,7 +27,7 @@ Explanation of the Linux-Kernel Memory Consistency Model 19. AND THEN THERE WAS ALPHA 20. THE HAPPENS-BEFORE RELATION: hb 21. THE PROPAGATES-BEFORE RELATION: pb - 22. RCU RELATIONS: rcu-link, rcu-gp, rcu-rscsi, rcu-fence, and rb + 22. RCU RELATIONS: rcu-link, rcu-gp, rcu-rscsi, rcu-order, rcu-fence, and rb 23. LOCKING 24. ODDS AND ENDS @@ -1425,8 +1425,8 @@ they execute means that it cannot have cycles. This requirement is the content of the LKMM's "propagation" axiom. -RCU RELATIONS: rcu-link, rcu-gp, rcu-rscsi, rcu-fence, and rb -------------------------------------------------------------- +RCU RELATIONS: rcu-link, rcu-gp, rcu-rscsi, rcu-order, rcu-fence, and rb +------------------------------------------------------------------------ RCU (Read-Copy-Update) is a powerful synchronization mechanism. It rests on two concepts: grace periods and read-side critical sections. @@ -1536,29 +1536,29 @@ Z's CPU before Z begins but doesn't propagate to some other CPU until after X ends.) Similarly, X ->rcu-rscsi Y ->rcu-link Z says that X is the end of a critical section which starts before Z begins. -The LKMM goes on to define the rcu-fence relation as a sequence of +The LKMM goes on to define the rcu-order relation as a sequence of rcu-gp and rcu-rscsi links separated by rcu-link links, in which the number of rcu-gp links is >= the number of rcu-rscsi links. For example: X ->rcu-gp Y ->rcu-link Z ->rcu-rscsi T ->rcu-link U ->rcu-gp V -would imply that X ->rcu-fence V, because this sequence contains two +would imply that X ->rcu-order V, because this sequence contains two rcu-gp links and one rcu-rscsi link. (It also implies that -X ->rcu-fence T and Z ->rcu-fence V.) On the other hand: +X ->rcu-order T and Z ->rcu-order V.) On the other hand: X ->rcu-rscsi Y ->rcu-link Z ->rcu-rscsi T ->rcu-link U ->rcu-gp V -does not imply X ->rcu-fence V, because the sequence contains only +does not imply X ->rcu-order V, because the sequence contains only one rcu-gp link but two rcu-rscsi links. -The rcu-fence relation is important because the Grace Period Guarantee -means that rcu-fence acts kind of like a strong fence. In particular, -E ->rcu-fence F implies not only that E begins before F ends, but also -that any write po-before E will propagate to every CPU before any -instruction po-after F can execute. (However, it does not imply that -E must execute before F; in fact, each synchronize_rcu() fence event -is linked to itself by rcu-fence as a degenerate case.) +The rcu-order relation is important because the Grace Period Guarantee +means that rcu-order links act kind of like strong fences. In +particular, E ->rcu-order F implies not only that E begins before F +ends, but also that any write po-before E will propagate to every CPU +before any instruction po-after F can execute. (However, it does not +imply that E must execute before F; in fact, each synchronize_rcu() +fence event is linked to itself by rcu-order as a degenerate case.) To prove this in full generality requires some intellectual effort. We'll consider just a very simple case: @@ -1585,7 +1585,26 @@ G's CPU before G starts must propagate to every CPU before C starts. In particular, the write propagates to every CPU before F finishes executing and hence before any instruction po-after F can execute. This sort of reasoning can be extended to handle all the situations -covered by rcu-fence. +covered by rcu-order. + +The rcu-fence relation is a simple extension of rcu-order. While +rcu-order only links certain fence events (calls to synchronize_rcu(), +rcu_read_lock(), or rcu_read_unlock()), rcu-fence links any events +that are separated by an rcu-order link. This is analogous to the way +the strong-fence relation links events that are separated by an +smp_mb() fence event (as mentioned above, rcu-order links act kind of +like strong fences). Written symbolically, X ->rcu-fence Y means +there are fence events E and F such that: + + X ->po E ->rcu-order F ->po Y. + +From the discussion above, we see this implies not only that X +executes before Y, but also (if X is a store) that X propagates to +every CPU before Y executes. Thus rcu-fence is sort of a +"super-strong" fence: Unlike the original strong fences (smp_mb() and +synchronize_rcu()), rcu-fence is able to link events on different +CPUs. (Perhaps this fact should lead us to say that rcu-fence isn't +really a fence at all!) Finally, the LKMM defines the RCU-before (rb) relation in terms of rcu-fence. This is done in essentially the same way as the pb @@ -1596,7 +1615,7 @@ before F, just as E ->pb F does (and for much the same reasons). Putting this all together, the LKMM expresses the Grace Period Guarantee by requiring that the rb relation does not contain a cycle. Equivalently, this "rcu" axiom requires that there are no events E -and F with E ->rcu-link F ->rcu-fence E. Or to put it a third way, +and F with E ->rcu-link F ->rcu-order E. Or to put it a third way, the axiom requires that there are no cycles consisting of rcu-gp and rcu-rscsi alternating with rcu-link, where the number of rcu-gp links is >= the number of rcu-rscsi links. @@ -1750,7 +1769,7 @@ addition to normal RCU. The ideas involved are much the same as above, with new relations srcu-gp and srcu-rscsi added to represent SRCU grace periods and read-side critical sections. There is a restriction on the srcu-gp and srcu-rscsi links that can appear in an -rcu-fence sequence (the srcu-rscsi links must be paired with srcu-gp +rcu-order sequence (the srcu-rscsi links must be paired with srcu-gp links having the same SRCU domain with proper nesting); the details are relatively unimportant. From c58a80170169305f7a088e8ebb61231e3095b5cd Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 1 Oct 2019 13:40:19 -0400 Subject: [PATCH 18/42] tools/memory-model/Documentation: Add plain accesses and data races to explanation.txt This patch updates the Linux Kernel Memory Model's explanation.txt file by adding a section devoted to the model's handling of plain accesses and data-race detection. Signed-off-by: Alan Stern Acked-by: Andrea Parri Signed-off-by: Paul E. McKenney --- .../Documentation/explanation.txt | 539 +++++++++++++++++- 1 file changed, 534 insertions(+), 5 deletions(-) diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt index ecf6cccea5c3..e91a2eb19592 100644 --- a/tools/memory-model/Documentation/explanation.txt +++ b/tools/memory-model/Documentation/explanation.txt @@ -29,7 +29,8 @@ Explanation of the Linux-Kernel Memory Consistency Model 21. THE PROPAGATES-BEFORE RELATION: pb 22. RCU RELATIONS: rcu-link, rcu-gp, rcu-rscsi, rcu-order, rcu-fence, and rb 23. LOCKING - 24. ODDS AND ENDS + 24. PLAIN ACCESSES AND DATA RACES + 25. ODDS AND ENDS @@ -42,8 +43,7 @@ linux-kernel.bell and linux-kernel.cat files that make up the formal version of the model; they are extremely terse and their meanings are far from clear. -This document describes the ideas underlying the LKMM, but excluding -the modeling of bare C (or plain) shared memory accesses. It is meant +This document describes the ideas underlying the LKMM. It is meant for people who want to understand how the model was designed. It does not go into the details of the code in the .bell and .cat files; rather, it explains in English what the code expresses symbolically. @@ -857,7 +857,7 @@ outlined above. These restrictions involve the necessity of maintaining cache coherence and the fact that a CPU can't operate on a value before it knows what that value is, among other things. -The formal version of the LKMM is defined by five requirements, or +The formal version of the LKMM is defined by six requirements, or axioms: Sequential consistency per variable: This requires that the @@ -877,10 +877,14 @@ axioms: grace periods obey the rules of RCU, in particular, the Grace-Period Guarantee. + Plain-coherence: This requires that plain memory accesses + (those not using READ_ONCE(), WRITE_ONCE(), etc.) must obey + the operational model's rules regarding cache coherence. + The first and second are quite common; they can be found in many memory models (such as those for C11/C++11). The "happens-before" and "propagation" axioms have analogs in other memory models as well. The -"rcu" axiom is specific to the LKMM. +"rcu" and "plain-coherence" axioms are specific to the LKMM. Each of these axioms is discussed below. @@ -1915,6 +1919,521 @@ architectures supported by the Linux kernel, albeit for various differing reasons. +PLAIN ACCESSES AND DATA RACES +----------------------------- + +In the LKMM, memory accesses such as READ_ONCE(x), atomic_inc(&y), +smp_load_acquire(&z), and so on are collectively referred to as +"marked" accesses, because they are all annotated with special +operations of one kind or another. Ordinary C-language memory +accesses such as x or y = 0 are simply called "plain" accesses. + +Early versions of the LKMM had nothing to say about plain accesses. +The C standard allows compilers to assume that the variables affected +by plain accesses are not concurrently read or written by any other +threads or CPUs. This leaves compilers free to implement all manner +of transformations or optimizations of code containing plain accesses, +making such code very difficult for a memory model to handle. + +Here is just one example of a possible pitfall: + + int a = 6; + int *x = &a; + + P0() + { + int *r1; + int r2 = 0; + + r1 = x; + if (r1 != NULL) + r2 = READ_ONCE(*r1); + } + + P1() + { + WRITE_ONCE(x, NULL); + } + +On the face of it, one would expect that when this code runs, the only +possible final values for r2 are 6 and 0, depending on whether or not +P1's store to x propagates to P0 before P0's load from x executes. +But since P0's load from x is a plain access, the compiler may decide +to carry out the load twice (for the comparison against NULL, then again +for the READ_ONCE()) and eliminate the temporary variable r1. The +object code generated for P0 could therefore end up looking rather +like this: + + P0() + { + int r2 = 0; + + if (x != NULL) + r2 = READ_ONCE(*x); + } + +And now it is obvious that this code runs the risk of dereferencing a +NULL pointer, because P1's store to x might propagate to P0 after the +test against NULL has been made but before the READ_ONCE() executes. +If the original code had said "r1 = READ_ONCE(x)" instead of "r1 = x", +the compiler would not have performed this optimization and there +would be no possibility of a NULL-pointer dereference. + +Given the possibility of transformations like this one, the LKMM +doesn't try to predict all possible outcomes of code containing plain +accesses. It is instead content to determine whether the code +violates the compiler's assumptions, which would render the ultimate +outcome undefined. + +In technical terms, the compiler is allowed to assume that when the +program executes, there will not be any data races. A "data race" +occurs when two conflicting memory accesses execute concurrently; +two memory accesses "conflict" if: + + they access the same location, + + they occur on different CPUs (or in different threads on the + same CPU), + + at least one of them is a plain access, + + and at least one of them is a store. + +The LKMM tries to determine whether a program contains two conflicting +accesses which may execute concurrently; if it does then the LKMM says +there is a potential data race and makes no predictions about the +program's outcome. + +Determining whether two accesses conflict is easy; you can see that +all the concepts involved in the definition above are already part of +the memory model. The hard part is telling whether they may execute +concurrently. The LKMM takes a conservative attitude, assuming that +accesses may be concurrent unless it can prove they cannot. + +If two memory accesses aren't concurrent then one must execute before +the other. Therefore the LKMM decides two accesses aren't concurrent +if they can be connected by a sequence of hb, pb, and rb links +(together referred to as xb, for "executes before"). However, there +are two complicating factors. + +If X is a load and X executes before a store Y, then indeed there is +no danger of X and Y being concurrent. After all, Y can't have any +effect on the value obtained by X until the memory subsystem has +propagated Y from its own CPU to X's CPU, which won't happen until +some time after Y executes and thus after X executes. But if X is a +store, then even if X executes before Y it is still possible that X +will propagate to Y's CPU just as Y is executing. In such a case X +could very well interfere somehow with Y, and we would have to +consider X and Y to be concurrent. + +Therefore when X is a store, for X and Y to be non-concurrent the LKMM +requires not only that X must execute before Y but also that X must +propagate to Y's CPU before Y executes. (Or vice versa, of course, if +Y executes before X -- then Y must propagate to X's CPU before X +executes if Y is a store.) This is expressed by the visibility +relation (vis), where X ->vis Y is defined to hold if there is an +intermediate event Z such that: + + X is connected to Z by a possibly empty sequence of + cumul-fence links followed by an optional rfe link (if none of + these links are present, X and Z are the same event), + +and either: + + Z is connected to Y by a strong-fence link followed by a + possibly empty sequence of xb links, + +or: + + Z is on the same CPU as Y and is connected to Y by a possibly + empty sequence of xb links (again, if the sequence is empty it + means Z and Y are the same event). + +The motivations behind this definition are straightforward: + + cumul-fence memory barriers force stores that are po-before + the barrier to propagate to other CPUs before stores that are + po-after the barrier. + + An rfe link from an event W to an event R says that R reads + from W, which certainly means that W must have propagated to + R's CPU before R executed. + + strong-fence memory barriers force stores that are po-before + the barrier, or that propagate to the barrier's CPU before the + barrier executes, to propagate to all CPUs before any events + po-after the barrier can execute. + +To see how this works out in practice, consider our old friend, the MP +pattern (with fences and statement labels, but without the conditional +test): + + int buf = 0, flag = 0; + + P0() + { + X: WRITE_ONCE(buf, 1); + smp_wmb(); + W: WRITE_ONCE(flag, 1); + } + + P1() + { + int r1; + int r2 = 0; + + Z: r1 = READ_ONCE(flag); + smp_rmb(); + Y: r2 = READ_ONCE(buf); + } + +The smp_wmb() memory barrier gives a cumul-fence link from X to W, and +assuming r1 = 1 at the end, there is an rfe link from W to Z. This +means that the store to buf must propagate from P0 to P1 before Z +executes. Next, Z and Y are on the same CPU and the smp_rmb() fence +provides an xb link from Z to Y (i.e., it forces Z to execute before +Y). Therefore we have X ->vis Y: X must propagate to Y's CPU before Y +executes. + +The second complicating factor mentioned above arises from the fact +that when we are considering data races, some of the memory accesses +are plain. Now, although we have not said so explicitly, up to this +point most of the relations defined by the LKMM (ppo, hb, prop, +cumul-fence, pb, and so on -- including vis) apply only to marked +accesses. + +There are good reasons for this restriction. The compiler is not +allowed to apply fancy transformations to marked accesses, and +consequently each such access in the source code corresponds more or +less directly to a single machine instruction in the object code. But +plain accesses are a different story; the compiler may combine them, +split them up, duplicate them, eliminate them, invent new ones, and +who knows what else. Seeing a plain access in the source code tells +you almost nothing about what machine instructions will end up in the +object code. + +Fortunately, the compiler isn't completely free; it is subject to some +limitations. For one, it is not allowed to introduce a data race into +the object code if the source code does not already contain a data +race (if it could, memory models would be useless and no multithreaded +code would be safe!). For another, it cannot move a plain access past +a compiler barrier. + +A compiler barrier is a kind of fence, but as the name implies, it +only affects the compiler; it does not necessarily have any effect on +how instructions are executed by the CPU. In Linux kernel source +code, the barrier() function is a compiler barrier. It doesn't give +rise directly to any machine instructions in the object code; rather, +it affects how the compiler generates the rest of the object code. +Given source code like this: + + ... some memory accesses ... + barrier(); + ... some other memory accesses ... + +the barrier() function ensures that the machine instructions +corresponding to the first group of accesses will all end po-before +any machine instructions corresponding to the second group of accesses +-- even if some of the accesses are plain. (Of course, the CPU may +then execute some of those accesses out of program order, but we +already know how to deal with such issues.) Without the barrier() +there would be no such guarantee; the two groups of accesses could be +intermingled or even reversed in the object code. + +The LKMM doesn't say much about the barrier() function, but it does +require that all fences are also compiler barriers. In addition, it +requires that the ordering properties of memory barriers such as +smp_rmb() or smp_store_release() apply to plain accesses as well as to +marked accesses. + +This is the key to analyzing data races. Consider the MP pattern +again, now using plain accesses for buf: + + int buf = 0, flag = 0; + + P0() + { + U: buf = 1; + smp_wmb(); + X: WRITE_ONCE(flag, 1); + } + + P1() + { + int r1; + int r2 = 0; + + Y: r1 = READ_ONCE(flag); + if (r1) { + smp_rmb(); + V: r2 = buf; + } + } + +This program does not contain a data race. Although the U and V +accesses conflict, the LKMM can prove they are not concurrent as +follows: + + The smp_wmb() fence in P0 is both a compiler barrier and a + cumul-fence. It guarantees that no matter what hash of + machine instructions the compiler generates for the plain + access U, all those instructions will be po-before the fence. + Consequently U's store to buf, no matter how it is carried out + at the machine level, must propagate to P1 before X's store to + flag does. + + X and Y are both marked accesses. Hence an rfe link from X to + Y is a valid indicator that X propagated to P1 before Y + executed, i.e., X ->vis Y. (And if there is no rfe link then + r1 will be 0, so V will not be executed and ipso facto won't + race with U.) + + The smp_rmb() fence in P1 is a compiler barrier as well as a + fence. It guarantees that all the machine-level instructions + corresponding to the access V will be po-after the fence, and + therefore any loads among those instructions will execute + after the fence does and hence after Y does. + +Thus U's store to buf is forced to propagate to P1 before V's load +executes (assuming V does execute), ruling out the possibility of a +data race between them. + +This analysis illustrates how the LKMM deals with plain accesses in +general. Suppose R is a plain load and we want to show that R +executes before some marked access E. We can do this by finding a +marked access X such that R and X are ordered by a suitable fence and +X ->xb* E. If E was also a plain access, we would also look for a +marked access Y such that X ->xb* Y, and Y and E are ordered by a +fence. We describe this arrangement by saying that R is +"post-bounded" by X and E is "pre-bounded" by Y. + +In fact, we go one step further: Since R is a read, we say that R is +"r-post-bounded" by X. Similarly, E would be "r-pre-bounded" or +"w-pre-bounded" by Y, depending on whether E was a store or a load. +This distinction is needed because some fences affect only loads +(i.e., smp_rmb()) and some affect only stores (smp_wmb()); otherwise +the two types of bounds are the same. And as a degenerate case, we +say that a marked access pre-bounds and post-bounds itself (e.g., if R +above were a marked load then X could simply be taken to be R itself.) + +The need to distinguish between r- and w-bounding raises yet another +issue. When the source code contains a plain store, the compiler is +allowed to put plain loads of the same location into the object code. +For example, given the source code: + + x = 1; + +the compiler is theoretically allowed to generate object code that +looks like: + + if (x != 1) + x = 1; + +thereby adding a load (and possibly replacing the store entirely). +For this reason, whenever the LKMM requires a plain store to be +w-pre-bounded or w-post-bounded by a marked access, it also requires +the store to be r-pre-bounded or r-post-bounded, so as to handle cases +where the compiler adds a load. + +(This may be overly cautious. We don't know of any examples where a +compiler has augmented a store with a load in this fashion, and the +Linux kernel developers would probably fight pretty hard to change a +compiler if it ever did this. Still, better safe than sorry.) + +Incidentally, the other tranformation -- augmenting a plain load by +adding in a store to the same location -- is not allowed. This is +because the compiler cannot know whether any other CPUs might perform +a concurrent load from that location. Two concurrent loads don't +constitute a race (they can't interfere with each other), but a store +does race with a concurrent load. Thus adding a store might create a +data race where one was not already present in the source code, +something the compiler is forbidden to do. Augmenting a store with a +load, on the other hand, is acceptable because doing so won't create a +data race unless one already existed. + +The LKMM includes a second way to pre-bound plain accesses, in +addition to fences: an address dependency from a marked load. That +is, in the sequence: + + p = READ_ONCE(ptr); + r = *p; + +the LKMM says that the marked load of ptr pre-bounds the plain load of +*p; the marked load must execute before any of the machine +instructions corresponding to the plain load. This is a reasonable +stipulation, since after all, the CPU can't perform the load of *p +until it knows what value p will hold. Furthermore, without some +assumption like this one, some usages typical of RCU would count as +data races. For example: + + int a = 1, b; + int *ptr = &a; + + P0() + { + b = 2; + rcu_assign_pointer(ptr, &b); + } + + P1() + { + int *p; + int r; + + rcu_read_lock(); + p = rcu_dereference(ptr); + r = *p; + rcu_read_unlock(); + } + +(In this example the rcu_read_lock() and rcu_read_unlock() calls don't +really do anything, because there aren't any grace periods. They are +included merely for the sake of good form; typically P0 would call +synchronize_rcu() somewhere after the rcu_assign_pointer().) + +rcu_assign_pointer() performs a store-release, so the plain store to b +is definitely w-post-bounded before the store to ptr, and the two +stores will propagate to P1 in that order. However, rcu_dereference() +is only equivalent to READ_ONCE(). While it is a marked access, it is +not a fence or compiler barrier. Hence the only guarantee we have +that the load of ptr in P1 is r-pre-bounded before the load of *p +(thus avoiding a race) is the assumption about address dependencies. + +This is a situation where the compiler can undermine the memory model, +and a certain amount of care is required when programming constructs +like this one. In particular, comparisons between the pointer and +other known addresses can cause trouble. If you have something like: + + p = rcu_dereference(ptr); + if (p == &x) + r = *p; + +then the compiler just might generate object code resembling: + + p = rcu_dereference(ptr); + if (p == &x) + r = x; + +or even: + + rtemp = x; + p = rcu_dereference(ptr); + if (p == &x) + r = rtemp; + +which would invalidate the memory model's assumption, since the CPU +could now perform the load of x before the load of ptr (there might be +a control dependency but no address dependency at the machine level). + +Finally, it turns out there is a situation in which a plain write does +not need to be w-post-bounded: when it is separated from the +conflicting access by a fence. At first glance this may seem +impossible. After all, to be conflicting the second access has to be +on a different CPU from the first, and fences don't link events on +different CPUs. Well, normal fences don't -- but rcu-fence can! +Here's an example: + + int x, y; + + P0() + { + WRITE_ONCE(x, 1); + synchronize_rcu(); + y = 3; + } + + P1() + { + rcu_read_lock(); + if (READ_ONCE(x) == 0) + y = 2; + rcu_read_unlock(); + } + +Do the plain stores to y race? Clearly not if P1 reads a non-zero +value for x, so let's assume the READ_ONCE(x) does obtain 0. This +means that the read-side critical section in P1 must finish executing +before the grace period in P0 does, because RCU's Grace-Period +Guarantee says that otherwise P0's store to x would have propagated to +P1 before the critical section started and so would have been visible +to the READ_ONCE(). (Another way of putting it is that the fre link +from the READ_ONCE() to the WRITE_ONCE() gives rise to an rcu-link +between those two events.) + +This means there is an rcu-fence link from P1's "y = 2" store to P0's +"y = 3" store, and consequently the first must propagate from P1 to P0 +before the second can execute. Therefore the two stores cannot be +concurrent and there is no race, even though P1's plain store to y +isn't w-post-bounded by any marked accesses. + +Putting all this material together yields the following picture. For +two conflicting stores W and W', where W ->co W', the LKMM says the +stores don't race if W can be linked to W' by a + + w-post-bounded ; vis ; w-pre-bounded + +sequence. If W is plain then they also have to be linked by an + + r-post-bounded ; xb* ; w-pre-bounded + +sequence, and if W' is plain then they also have to be linked by a + + w-post-bounded ; vis ; r-pre-bounded + +sequence. For a conflicting load R and store W, the LKMM says the two +accesses don't race if R can be linked to W by an + + r-post-bounded ; xb* ; w-pre-bounded + +sequence or if W can be linked to R by a + + w-post-bounded ; vis ; r-pre-bounded + +sequence. For the cases involving a vis link, the LKMM also accepts +sequences in which W is linked to W' or R by a + + strong-fence ; xb* ; {w and/or r}-pre-bounded + +sequence with no post-bounding, and in every case the LKMM also allows +the link simply to be a fence with no bounding at all. If no sequence +of the appropriate sort exists, the LKMM says that the accesses race. + +There is one more part of the LKMM related to plain accesses (although +not to data races) we should discuss. Recall that many relations such +as hb are limited to marked accesses only. As a result, the +happens-before, propagates-before, and rcu axioms (which state that +various relation must not contain a cycle) doesn't apply to plain +accesses. Nevertheless, we do want to rule out such cycles, because +they don't make sense even for plain accesses. + +To this end, the LKMM imposes three extra restrictions, together +called the "plain-coherence" axiom because of their resemblance to the +rules used by the operational model to ensure cache coherence (that +is, the rules governing the memory subsystem's choice of a store to +satisfy a load request and its determination of where a store will +fall in the coherence order): + + If R and W conflict and it is possible to link R to W by one + of the xb* sequences listed above, then W ->rfe R is not + allowed (i.e., a load cannot read from a store that it + executes before, even if one or both is plain). + + If W and R conflict and it is possible to link W to R by one + of the vis sequences listed above, then R ->fre W is not + allowed (i.e., if a store is visible to a load then the load + must read from that store or one coherence-after it). + + If W and W' conflict and it is possible to link W to W' by one + of the vis sequences listed above, then W' ->co W is not + allowed (i.e., if one store is visible to a second then the + second must come after the first in the coherence order). + +This is the extent to which the LKMM deals with plain accesses. +Perhaps it could say more (for example, plain accesses might +contribute to the ppo relation), but at the moment it seems that this +minimal, conservative approach is good enough. + + ODDS AND ENDS ------------- @@ -1962,6 +2481,16 @@ treated as READ_ONCE() and rcu_assign_pointer() is treated as smp_store_release() -- which is basically how the Linux kernel treats them. +Although we said that plain accesses are not linked by the ppo +relation, they do contribute to it indirectly. Namely, when there is +an address dependency from a marked load R to a plain store W, +followed by smp_wmb() and then a marked store W', the LKMM creates a +ppo link from R to W'. The reasoning behind this is perhaps a little +shaky, but essentially it says there is no way to generate object code +for this source code in which W' could execute before R. Just as with +pre-bounding by address dependencies, it is possible for the compiler +to undermine this relation if sufficient care is not taken. + There are a few oddball fences which need special treatment: smp_mb__before_atomic(), smp_mb__after_atomic(), and smp_mb__after_spinlock(). The LKMM uses fence events with special From 66e4c33b51bc515ca803c0948cf1525b53ffd631 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 Aug 2019 16:14:00 -0700 Subject: [PATCH 19/42] rcu: Force tick on for nohz_full CPUs not reaching quiescent states CPUs running for long time periods in the kernel in nohz_full mode might leave the scheduling-clock interrupt disabled for then full duration of their in-kernel execution. This can (among other things) delay grace periods. This commit therefore forces the tick back on for any nohz_full CPU that is failing to pass through a quiescent state upon return from interrupt, which the resched_cpu() will induce. Reported-by: Joel Fernandes [ paulmck: Clear ->rcu_forced_tick as reported by Joel Fernandes testing. ] [ paulmck: Apply Joel Fernandes TICK_DEP_MASK_RCU->TICK_DEP_BIT_RCU fix. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 38 +++++++++++++++++++++++++++++++------- kernel/rcu/tree.h | 1 + 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 66354ef776aa..9fda33864ede 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -651,6 +651,12 @@ static __always_inline void rcu_nmi_exit_common(bool irq) */ if (rdp->dynticks_nmi_nesting != 1) { trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks); + if (tick_nohz_full_cpu(rdp->cpu) && + rdp->dynticks_nmi_nesting == 2 && + rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + rdp->rcu_forced_tick = true; + tick_dep_set_cpu(rdp->cpu, TICK_DEP_MASK_RCU); + } WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); return; @@ -886,6 +892,18 @@ void rcu_irq_enter_irqson(void) local_irq_restore(flags); } +/* + * If the scheduler-clock interrupt was enabled on a nohz_full CPU + * in order to get to a quiescent state, disable it. + */ +void rcu_disable_tick_upon_qs(struct rcu_data *rdp) +{ + if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { + tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + rdp->rcu_forced_tick = false; + } +} + /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * @@ -1980,6 +1998,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) if (!offloaded) needwake = rcu_accelerate_cbs(rnp, rdp); + rcu_disable_tick_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ if (needwake) @@ -2265,6 +2284,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) int cpu; unsigned long flags; unsigned long mask; + struct rcu_data *rdp; struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) { @@ -2289,8 +2309,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { - if (f(per_cpu_ptr(&rcu_data, cpu))) + rdp = per_cpu_ptr(&rcu_data, cpu); + if (f(rdp)) { mask |= bit; + rcu_disable_tick_upon_qs(rdp); + } } } if (mask != 0) { @@ -2318,7 +2341,7 @@ void rcu_force_quiescent_state(void) rnp = __this_cpu_read(rcu_data.mynode); for (; rnp != NULL; rnp = rnp->parent) { ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) || - !raw_spin_trylock(&rnp->fqslock); + !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); if (ret) @@ -2851,7 +2874,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) { if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { rcu_barrier_trace(TPS("LastCB"), -1, - rcu_state.barrier_sequence); + rcu_state.barrier_sequence); complete(&rcu_state.barrier_completion); } else { rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence); @@ -2875,7 +2898,7 @@ static void rcu_barrier_func(void *unused) } else { debug_rcu_head_unqueue(&rdp->barrier_head); rcu_barrier_trace(TPS("IRQNQ"), -1, - rcu_state.barrier_sequence); + rcu_state.barrier_sequence); } rcu_nocb_unlock(rdp); } @@ -2902,7 +2925,7 @@ void rcu_barrier(void) /* Did someone else do our work for us? */ if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { rcu_barrier_trace(TPS("EarlyExit"), -1, - rcu_state.barrier_sequence); + rcu_state.barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rcu_state.barrier_mutex); return; @@ -2934,11 +2957,11 @@ void rcu_barrier(void) continue; if (rcu_segcblist_n_cbs(&rdp->cblist)) { rcu_barrier_trace(TPS("OnlineQ"), cpu, - rcu_state.barrier_sequence); + rcu_state.barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); } else { rcu_barrier_trace(TPS("OnlineNQ"), cpu, - rcu_state.barrier_sequence); + rcu_state.barrier_sequence); } } put_online_cpus(); @@ -3160,6 +3183,7 @@ void rcu_cpu_starting(unsigned int cpu) rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ + rcu_disable_tick_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c612f306fe89..055c31781d3a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -181,6 +181,7 @@ struct rcu_data { atomic_t dynticks; /* Even value for idle, else odd. */ bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ + bool rcu_forced_tick; /* Forced tick to provide QS. */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ From b200a0489517d9e5a52e983183e890f573454ebd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Aug 2019 13:24:49 -0700 Subject: [PATCH 20/42] rcu: Force nohz_full tick on upon irq enter instead of exit There is interrupt-exit code that forces on the tick for nohz_full CPUs failing to respond to the current grace period in a timely fashion. However, this code must compare ->dynticks_nmi_nesting to the value 2 in the interrupt-exit fastpath. This commit therefore moves this code to the interrupt-entry fastpath, where a lighter-weight comparison to zero may be used. Reported-by: Joel Fernandes [ paulmck: Apply Joel Fernandes TICK_DEP_MASK_RCU->TICK_DEP_BIT_RCU fix. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9fda33864ede..8dc878406c71 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -651,12 +651,6 @@ static __always_inline void rcu_nmi_exit_common(bool irq) */ if (rdp->dynticks_nmi_nesting != 1) { trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks); - if (tick_nohz_full_cpu(rdp->cpu) && - rdp->dynticks_nmi_nesting == 2 && - rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - rdp->rcu_forced_tick = true; - tick_dep_set_cpu(rdp->cpu, TICK_DEP_MASK_RCU); - } WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); return; @@ -831,6 +825,11 @@ static __always_inline void rcu_nmi_enter_common(bool irq) rcu_cleanup_after_idle(); incby = 1; + } else if (tick_nohz_full_cpu(rdp->cpu) && + rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && + rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + rdp->rcu_forced_tick = true; + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, From 516e5ae0c94016294d3ef175454215b235d03945 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 5 Sep 2019 10:26:41 -0700 Subject: [PATCH 21/42] rcu: Reset CPU hints when reporting a quiescent state In some cases, tracing shows that need_heavy_qs is still set even though urgent_qs was cleared upon reporting of a quiescent state. One such case is when the softirq reports that a CPU has passed quiescent state. Commit 671a63517cf9 ("rcu: Avoid unnecessary softirq when system is idle") fixed a bug where core_needs_qs was not being cleared. In order to avoid running into similar situations with the urgent-grace-period flags, this commit causes rcu_disable_urgency_upon_qs(), previously rcu_disable_tick_upon_qs(), to clear the urgency hints, ->rcu_urgent_qs and ->rcu_need_heavy_qs. Note that it is possible for CPUs to go offline with these urgency hints still set. This is handled because rcu_disable_urgency_upon_qs() is also invoked during the online process. Because these hints can be cleared both by the corresponding CPU and by the grace-period kthread, this commit also adds a number of READ_ONCE() and WRITE_ONCE() calls. Tested overnight with rcutorture running for 60 minutes on all configurations of RCU. Signed-off-by: "Joel Fernandes (Google)" [ paulmck: Clear urgency flags in rcu_disable_urgency_upon_qs(). ] [ paulmck: Remove ->core_needs_qs from the set cleared at quiescent state. ] [ paulmck: Make rcu_disable_urgency_upon_qs static per kbuild test robot. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8dc878406c71..82caca305cae 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -827,7 +827,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq) incby = 1; } else if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { rdp->rcu_forced_tick = true; tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); } @@ -892,11 +892,14 @@ void rcu_irq_enter_irqson(void) } /* - * If the scheduler-clock interrupt was enabled on a nohz_full CPU - * in order to get to a quiescent state, disable it. + * If any sort of urgency was applied to the current CPU (for example, + * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order + * to get to a quiescent state, disable it. */ -void rcu_disable_tick_upon_qs(struct rcu_data *rdp) +static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) { + WRITE_ONCE(rdp->rcu_urgent_qs, false); + WRITE_ONCE(rdp->rcu_need_heavy_qs, false); if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU); rdp->rcu_forced_tick = false; @@ -1997,7 +2000,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) if (!offloaded) needwake = rcu_accelerate_cbs(rnp, rdp); - rcu_disable_tick_upon_qs(rdp); + rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ if (needwake) @@ -2311,7 +2314,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) rdp = per_cpu_ptr(&rcu_data, cpu); if (f(rdp)) { mask |= bit; - rcu_disable_tick_upon_qs(rdp); + rcu_disable_urgency_upon_qs(rdp); } } } @@ -3182,7 +3185,7 @@ void rcu_cpu_starting(unsigned int cpu) rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ - rcu_disable_tick_upon_qs(rdp); + rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { From ed93dfc6bc0084485ccad1ff6bd2ea81ab2c03cd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Sep 2019 14:09:56 -0700 Subject: [PATCH 22/42] rcu: Confine ->core_needs_qs accesses to the corresponding CPU Commit 671a63517cf9 ("rcu: Avoid unnecessary softirq when system is idle") fixed a bug that could result in an indefinite number of unnecessary invocations of the RCU_SOFTIRQ handler at the trailing edge of a scheduler-clock interrupt. However, the fix introduced off-CPU stores to ->core_needs_qs. These writes did not conflict with the on-CPU stores because the CPU's leaf rcu_node structure's ->lock was held across all such stores. However, the loads from ->core_needs_qs were not promoted to READ_ONCE() and, worse yet, the code loading from ->core_needs_qs was written assuming that it was only ever updated by the corresponding CPU. So operation has been robust, but only by luck. This situation is therefore an accident waiting to happen. This commit therefore takes a different approach. Instead of clearing ->core_needs_qs from the grace-period kthread's force-quiescent-state processing, it modifies the rcu_pending() function to suppress the rcu_sched_clock_irq() function's call to invoke_rcu_core() if there is no grace period in progress. This avoids the infinite needless RCU_SOFTIRQ handlers while still keeping all accesses to ->core_needs_qs local to the corresponding CPU. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 82caca305cae..0c8046bc5ec7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1989,7 +1989,6 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; - rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { @@ -2819,6 +2818,7 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu); */ static int rcu_pending(void) { + bool gp_in_progress; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -2834,7 +2834,8 @@ static int rcu_pending(void) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) + gp_in_progress = rcu_gp_in_progress(); + if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress) return 1; /* Does this CPU have callbacks ready to invoke? */ @@ -2842,8 +2843,7 @@ static int rcu_pending(void) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ - if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist) && + if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) && (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) || !rcu_segcblist_is_offloaded(&rdp->cblist)) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) From dd7dafd1ad50aa9ed7958235431f243ea131ee7d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 14 Sep 2019 03:39:22 -0700 Subject: [PATCH 23/42] rcu: Make kernel-mode nohz_full CPUs invoke the RCU core processing If a nohz_full CPU is idle or executing in userspace, it makes good sense to keep it out of RCU core processing. After all, the RCU grace-period kthread can see its quiescent states and all of its callbacks are offloaded, so there is nothing for RCU core processing to do. However, if a nohz_full CPU is executing in kernel space, the RCU grace-period kthread cannot do anything for it, so such a CPU must report its own quiescent states. This commit therefore makes nohz_full CPUs skip RCU core processing only if the scheduler-clock interrupt caught them in idle or in userspace. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0c8046bc5ec7..4e6ae2699d2e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -497,7 +497,7 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next module_param(rcu_kick_kthreads, bool, 0644); static void force_qs_rnp(int (*f)(struct rcu_data *rdp)); -static int rcu_pending(void); +static int rcu_pending(int user); /* * Return the number of RCU GPs completed thus far for debug & stats. @@ -2267,7 +2267,7 @@ void rcu_sched_clock_irq(int user) __this_cpu_write(rcu_data.rcu_urgent_qs, false); } rcu_flavor_sched_clock_irq(user); - if (rcu_pending()) + if (rcu_pending(user)) invoke_rcu_core(); trace_rcu_utilization(TPS("End scheduler-tick")); @@ -2816,7 +2816,7 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu); * CPU-local state are performed first. However, we must check for CPU * stalls first, else we might not get a chance. */ -static int rcu_pending(void) +static int rcu_pending(int user) { bool gp_in_progress; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -2829,8 +2829,8 @@ static int rcu_pending(void) if (rcu_nocb_need_deferred_wakeup(rdp)) return 1; - /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ - if (rcu_nohz_full_cpu()) + /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */ + if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu()) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ From 8e6af017f4b1da9cdd2b55ce83853df8e167b4d3 Mon Sep 17 00:00:00 2001 From: Ethan Hansen <1ethanhansen@gmail.com> Date: Fri, 2 Aug 2019 13:37:58 -0700 Subject: [PATCH 24/42] rcu: Remove unused function hlist_bl_del_init_rcu() The function hlist_bl_del_init_rcu() is declared in rculist_bl.h, but never used. This commit therefore removes it. Signed-off-by: Ethan Hansen <1ethanhansen@gmail.com> Signed-off-by: Paul E. McKenney --- include/linux/rculist_bl.h | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h index 66e73ec1aa99..0b952d06eb0b 100644 --- a/include/linux/rculist_bl.h +++ b/include/linux/rculist_bl.h @@ -24,34 +24,6 @@ static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); } -/** - * hlist_bl_del_init_rcu - deletes entry from hash list with re-initialization - * @n: the element to delete from the hash list. - * - * Note: hlist_bl_unhashed() on the node returns true after this. It is - * useful for RCU based read lockfree traversal if the writer side - * must know if the list entry is still hashed or already unhashed. - * - * In particular, it means that we can not poison the forward pointers - * that may still be used for walking the hash list and we can only - * zero the pprev pointer so list_unhashed() will return true after - * this. - * - * The caller must take whatever precautions are necessary (such as - * holding appropriate locks) to avoid racing with another - * list-mutation primitive, such as hlist_bl_add_head_rcu() or - * hlist_bl_del_rcu(), running on this same list. However, it is - * perfectly legal to run concurrently with the _rcu list-traversal - * primitives, such as hlist_bl_for_each_entry_rcu(). - */ -static inline void hlist_bl_del_init_rcu(struct hlist_bl_node *n) -{ - if (!hlist_bl_unhashed(n)) { - __hlist_bl_del(n); - n->pprev = NULL; - } -} - /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. From 1d24dd4e01fb6b928cf679e3e415ddff7016fa96 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Thu, 8 Aug 2019 10:32:58 +0800 Subject: [PATCH 25/42] rcu: Several rcu_segcblist functions can be static None of rcu_segcblist_set_len(), rcu_segcblist_add_len(), or rcu_segcblist_xchg_len() are used outside of kernel/rcu/rcu_segcblist.c. This commit therefore makes them static. Fixes: eda669a6a2c5 ("rcu/nocb: Atomic ->len field in rcu_segcblist structure") Signed-off-by: kbuild test robot [ paulmck: "Fixes:" updated per Stephen Rothwell feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 495c58ce1640..cbc87b804db9 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -88,7 +88,7 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) } /* Set the length of an rcu_segcblist structure. */ -void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) +static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) { #ifdef CONFIG_RCU_NOCB_CPU atomic_long_set(&rsclp->len, v); @@ -104,7 +104,7 @@ void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) * This increase is fully ordered with respect to the callers accesses * both before and after. */ -void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) +static void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) { #ifdef CONFIG_RCU_NOCB_CPU smp_mb__before_atomic(); /* Up to the caller! */ @@ -134,7 +134,7 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp) * with the actual number of callbacks on the structure. This exchange is * fully ordered with respect to the callers accesses both before and after. */ -long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v) +static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v) { #ifdef CONFIG_RCU_NOCB_CPU return atomic_long_xchg(&rsclp->len, v); From 5a6446626d7e062b47a5cc1cf27d5060e60bb0d9 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 15 Aug 2019 10:18:42 -0400 Subject: [PATCH 26/42] workqueue: Convert for_each_wq to use built-in list check Because list_for_each_entry_rcu() can now check for holding a lock as well as for being in an RCU read-side critical section, this commit replaces the workqueue_sysfs_unregister() function's use of assert_rcu_or_wq_mutex() and list_for_each_entry_rcu() with list_for_each_entry_rcu() augmented with a lockdep_is_held() optional argument. Acked-by: Tejun Heo Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/workqueue.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc2e09a8ea61..e501c79e283a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -364,11 +364,6 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); !lockdep_is_held(&wq_pool_mutex), \ "RCU or wq_pool_mutex should be held") -#define assert_rcu_or_wq_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ - !lockdep_is_held(&wq->mutex), \ - "RCU or wq->mutex should be held") - #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex) && \ @@ -425,9 +420,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * ignored. */ #define for_each_pwq(pwq, wq) \ - list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ - if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ - else + list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \ + lockdep_is_held(&(wq->mutex))) #ifdef CONFIG_DEBUG_OBJECTS_WORK From 05ef9e9eb3dade21413680f41eb0170778e8ae2b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 15 Aug 2019 22:59:14 -0400 Subject: [PATCH 27/42] rcu: Ensure that ->rcu_urgent_qs is set before resched IPI The RCU-specific resched_cpu() function sends a resched IPI to the specified CPU, which can be used to force the tick on for a given nohz_full CPU. This is needed when this nohz_full CPU is looping in the kernel while blocking the current grace period. However, for the tick to actually be forced on in all cases, that CPU's rcu_data structure's ->rcu_urgent_qs flag must be set beforehand. This commit therefore causes rcu_implicit_dynticks_qs() to set this flag prior to invoking resched_cpu() on a holdout nohz_full CPU. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81105141b6a8..0d83b1944e19 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1073,6 +1073,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (tick_nohz_full_cpu(rdp->cpu) && time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) { + WRITE_ONCE(*ruqp, true); resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); } From 7eb54685c63cc9185a48d0d9c1ad25a34d4e1da0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Aug 2019 16:55:21 -0700 Subject: [PATCH 28/42] rcu: Remove obsolete descriptions for rcu_barrier tracepoint Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 694bd040cf51..afa898532ab8 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -713,8 +713,6 @@ TRACE_EVENT_RCU(rcu_torture_read, * "Begin": rcu_barrier() started. * "EarlyExit": rcu_barrier() piggybacked, thus early exit. * "Inc1": rcu_barrier() piggyback check counter incremented. - * "OfflineNoCB": rcu_barrier() found callback on never-online CPU - * "OnlineNoCB": rcu_barrier() found online no-CBs CPU. * "OnlineQ": rcu_barrier() found online CPU with callbacks. * "OnlineNQ": rcu_barrier() found online CPU, no callbacks. * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. From d01f86206864e429839f6a4aeb90064f0c043ed9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Aug 2019 10:29:06 -0700 Subject: [PATCH 29/42] rcu: Update descriptions for rcu_nocb_wake tracepoint Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index afa898532ab8..4609f2ef7767 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -258,20 +258,27 @@ TRACE_EVENT_RCU(rcu_exp_funnel_lock, * the number of the offloaded CPU are extracted. The third and final * argument is a string as follows: * - * "WakeEmpty": Wake rcuo kthread, first CB to empty list. - * "WakeEmptyIsDeferred": Wake rcuo kthread later, first CB to empty list. - * "WakeOvf": Wake rcuo kthread, CB list is huge. - * "WakeOvfIsDeferred": Wake rcuo kthread later, CB list is huge. - * "WakeNot": Don't wake rcuo kthread. - * "WakeNotPoll": Don't wake rcuo kthread because it is polling. - * "DeferredWake": Carried out the "IsDeferred" wakeup. - * "Poll": Start of new polling cycle for rcu_nocb_poll. - * "Sleep": Sleep waiting for GP for !rcu_nocb_poll. - * "CBSleep": Sleep waiting for CBs for !rcu_nocb_poll. - * "WokeEmpty": rcuo kthread woke to find empty list. - * "WokeNonEmpty": rcuo kthread woke to find non-empty list. - * "WaitQueue": Enqueue partially done, timed wait for it to complete. - * "WokeQueue": Partial enqueue now complete. + * "AlreadyAwake": The to-be-awakened rcuo kthread is already awake. + * "Bypass": rcuo GP kthread sees non-empty ->nocb_bypass. + * "CBSleep": rcuo CB kthread sleeping waiting for CBs. + * "Check": rcuo GP kthread checking specified CPU for work. + * "DeferredWake": Timer expired or polled check, time to wake. + * "DoWake": The to-be-awakened rcuo kthread needs to be awakened. + * "EndSleep": Done waiting for GP for !rcu_nocb_poll. + * "FirstBQ": New CB to empty ->nocb_bypass (->cblist maybe non-empty). + * "FirstBQnoWake": FirstBQ plus rcuo kthread need not be awakened. + * "FirstBQwake": FirstBQ plus rcuo kthread must be awakened. + * "FirstQ": New CB to empty ->cblist (->nocb_bypass maybe non-empty). + * "NeedWaitGP": rcuo GP kthread must wait on a grace period. + * "Poll": Start of new polling cycle for rcu_nocb_poll. + * "Sleep": Sleep waiting for GP for !rcu_nocb_poll. + * "Timer": Deferred-wake timer expired. + * "WakeEmptyIsDeferred": Wake rcuo kthread later, first CB to empty list. + * "WakeEmpty": Wake rcuo kthread, first CB to empty list. + * "WakeNot": Don't wake rcuo kthread. + * "WakeNotPoll": Don't wake rcuo kthread because it is polling. + * "WakeOvfIsDeferred": Wake rcuo kthread later, CB list is huge. + * "WokeEmpty": rcuo CB kthread woke to find empty list. */ TRACE_EVENT_RCU(rcu_nocb_wake, From 7cc0fffde6e4ff76be20d41a3577012fe584a559 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Aug 2019 10:34:25 -0700 Subject: [PATCH 30/42] rcu: Update descriptions for rcu_future_grace_period tracepoint Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 4609f2ef7767..66122602bd08 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -93,16 +93,16 @@ TRACE_EVENT_RCU(rcu_grace_period, * the data from the rcu_node structure, other than rcuname, which comes * from the rcu_state structure, and event, which is one of the following: * - * "Startleaf": Request a grace period based on leaf-node data. + * "Cleanup": Clean up rcu_node structure after previous GP. + * "CleanupMore": Clean up, and another GP is needed. + * "EndWait": Complete wait. + * "NoGPkthread": The RCU grace-period kthread has not yet started. * "Prestarted": Someone beat us to the request * "Startedleaf": Leaf node marked for future GP. * "Startedleafroot": All nodes from leaf to root marked for future GP. * "Startedroot": Requested a nocb grace period based on root-node data. - * "NoGPkthread": The RCU grace-period kthread has not yet started. + * "Startleaf": Request a grace period based on leaf-node data. * "StartWait": Start waiting for the requested grace period. - * "EndWait": Complete wait. - * "Cleanup": Clean up rcu_node structure after previous GP. - * "CleanupMore": Clean up, and another GP is needed. */ TRACE_EVENT_RCU(rcu_future_grace_period, From b8889c9c89a2655a231dfed93cc9bdca0930ea67 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 23 Sep 2019 17:26:34 +0300 Subject: [PATCH 31/42] rcu: Fix uninitialized variable in nocb_gp_wait() We never set this to false. This probably doesn't affect most people's runtime because GCC will automatically initialize it to false at certain common optimization levels. But that behavior is related to a bug in GCC and obviously should not be relied on. Fixes: 5d6742b37727 ("rcu/nocb: Use rcu_segcblist for no-CBs CPUs") Signed-off-by: Dan Carpenter Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2defc7fe74c3..fa08d55f7040 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1946,7 +1946,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) int __maybe_unused cpu = my_rdp->cpu; unsigned long cur_gp_seq; unsigned long flags; - bool gotcbs; + bool gotcbs = false; unsigned long j = jiffies; bool needwait_gp = false; // This prevents actual uninitialized use. bool needwake; From 36b5dae64513b7ce3a0e0f6cb469e0f74bacad45 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Sep 2019 10:10:31 -0700 Subject: [PATCH 32/42] rcu: Suppress levelspread uninitialized messages New tools bring new warnings, and with v5.3 comes: kernel/rcu/srcutree.c: warning: 'levelspread[]' may be used uninitialized in this function [-Wuninitialized]: => 121:34 This commit suppresses this warning by initializing the full array to INT_MIN, which will result in failures should any out-of-bounds references appear. Reported-by: Michael Ellerman Reported-by: Geert Uytterhoeven Signed-off-by: Paul E. McKenney Reviewed-by: Geert Uytterhoeven --- kernel/rcu/rcu.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 8fd4f82c9b3d..b64c707f6065 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -299,6 +299,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) { int i; + for (i = 0; i < RCU_NUM_LVLS; i++) + levelspread[i] = INT_MIN; if (rcu_fanout_exact) { levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; for (i = rcu_num_lvls - 2; i >= 0; i--) From a63fc6b75cca984c71f095282e0227a390ba88f3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Sep 2019 15:05:11 -0700 Subject: [PATCH 33/42] rcu: Upgrade rcu_swap_protected() to rcu_replace_pointer() Although the rcu_swap_protected() macro follows the example of swap(), the interactions with RCU make its update of its argument somewhat counter-intuitive. This commit therefore introduces an rcu_replace_pointer() that returns the old value of the RCU pointer instead of doing the argument update. Once all the uses of rcu_swap_protected() are updated to instead use rcu_replace_pointer(), rcu_swap_protected() will be removed. Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds [ paulmck: From rcu_replace() to rcu_replace_pointer() per Ingo Molnar. ] Signed-off-by: Paul E. McKenney Cc: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Shane M Seymour Cc: Martin K. Petersen --- include/linux/rcupdate.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 75a2eded7aa2..185dd9736863 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -382,6 +382,24 @@ do { \ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ } while (0) +/** + * rcu_replace_pointer() - replace an RCU pointer, returning its old value + * @rcu_ptr: RCU pointer, whose old value is returned + * @ptr: regular pointer + * @c: the lockdep conditions under which the dereference will take place + * + * Perform a replacement, where @rcu_ptr is an RCU-annotated + * pointer and @c is the lockdep argument that is passed to the + * rcu_dereference_protected() call used to read that pointer. The old + * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr. + */ +#define rcu_replace_pointer(rcu_ptr, ptr, c) \ +({ \ + typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \ + rcu_assign_pointer((rcu_ptr), (ptr)); \ + __tmp; \ +}) + /** * rcu_swap_protected() - swap an RCU and a regular pointer * @rcu_ptr: RCU pointer From 12e78e6902134c9e49b2481c2515555e6f7b12dc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Sep 2019 15:15:35 -0700 Subject: [PATCH 34/42] x86/kvm/pmu: Replace rcu_swap_protected() with rcu_replace_pointer() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit replaces the use of rcu_swap_protected() with the more intuitively appealing rcu_replace_pointer() as a step towards removing rcu_swap_protected(). Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds [ paulmck: From rcu_replace() to rcu_replace_pointer() per Ingo Molnar. ] Signed-off-by: Paul E. McKenney Acked-by: Paolo Bonzini Cc: "Radim Krčmář" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Cc: --- arch/x86/kvm/pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 46875bbd0419..5ddb05a26a1b 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -416,8 +416,8 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) *filter = tmp; mutex_lock(&kvm->lock); - rcu_swap_protected(kvm->arch.pmu_event_filter, filter, - mutex_is_locked(&kvm->lock)); + filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, + mutex_is_locked(&kvm->lock)); mutex_unlock(&kvm->lock); synchronize_srcu_expedited(&kvm->srcu); From 1feace5d6a4a1acf44dde2bfb5c36cc0b1cf559c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Sep 2019 15:22:15 -0700 Subject: [PATCH 35/42] drm/i915: Replace rcu_swap_protected() with rcu_replace_pointer() This commit replaces the use of rcu_swap_protected() with the more intuitively appealing rcu_replace_pointer() as a step towards removing rcu_swap_protected(). Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds [ paulmck: From rcu_replace() to rcu_replace_pointer() per Ingo Molnar. ] Signed-off-by: Paul E. McKenney Reviewed-by: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: David Airlie Cc: Daniel Vetter Cc: Chris Wilson Cc: Tvrtko Ursulin Cc: Cc: --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 1cdfe05514c3..3f3e803dfd5a 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -1629,7 +1629,7 @@ replace: i915_gem_context_set_user_engines(ctx); else i915_gem_context_clear_user_engines(ctx); - rcu_swap_protected(ctx->engines, set.engines, 1); + set.engines = rcu_replace_pointer(ctx->engines, set.engines, 1); mutex_unlock(&ctx->engines_mutex); call_rcu(&set.engines->rcu, free_engines_rcu); From c0eaf15cd5d39e79feb81a122975df0bb5a1c106 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Sep 2019 15:26:28 -0700 Subject: [PATCH 36/42] drivers/scsi: Replace rcu_swap_protected() with rcu_replace_pointer() This commit replaces the use of rcu_swap_protected() with the more intuitively appealing rcu_replace_pointer() as a step towards removing rcu_swap_protected(). Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds [ paulmck: From rcu_replace() to rcu_replace_pointer() per Ingo Molnar. ] Signed-off-by: Paul E. McKenney Acked-by: "Martin K. Petersen" Cc: "James E.J. Bottomley" Cc: Cc: --- drivers/scsi/scsi.c | 4 ++-- drivers/scsi/scsi_sysfs.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 1f5b5c8a7f72..7a1b6c76f263 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -434,8 +434,8 @@ static void scsi_update_vpd_page(struct scsi_device *sdev, u8 page, return; mutex_lock(&sdev->inquiry_mutex); - rcu_swap_protected(*sdev_vpd_buf, vpd_buf, - lockdep_is_held(&sdev->inquiry_mutex)); + vpd_buf = rcu_replace_pointer(*sdev_vpd_buf, vpd_buf, + lockdep_is_held(&sdev->inquiry_mutex)); mutex_unlock(&sdev->inquiry_mutex); if (vpd_buf) diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 64c96c7828ee..5adfcaba7a4e 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -466,10 +466,10 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work) sdev->request_queue = NULL; mutex_lock(&sdev->inquiry_mutex); - rcu_swap_protected(sdev->vpd_pg80, vpd_pg80, - lockdep_is_held(&sdev->inquiry_mutex)); - rcu_swap_protected(sdev->vpd_pg83, vpd_pg83, - lockdep_is_held(&sdev->inquiry_mutex)); + vpd_pg80 = rcu_replace_pointer(sdev->vpd_pg80, vpd_pg80, + lockdep_is_held(&sdev->inquiry_mutex)); + vpd_pg83 = rcu_replace_pointer(sdev->vpd_pg83, vpd_pg83, + lockdep_is_held(&sdev->inquiry_mutex)); mutex_unlock(&sdev->inquiry_mutex); if (vpd_pg83) From 62860da7082e4f2440c6bc96e4710d9c8bfb916b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Sep 2019 15:28:28 -0700 Subject: [PATCH 37/42] fs/afs: Replace rcu_swap_protected() with rcu_replace_pointer() This commit replaces the use of rcu_swap_protected() with the more intuitively appealing rcu_replace_pointer() as a step towards removing rcu_swap_protected(). Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds [ paulmck: From rcu_replace() to rcu_replace_pointer() per Ingo Molnar. ] Signed-off-by: Paul E. McKenney Acked-by: David Howells Cc: Cc: --- fs/afs/vl_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c index 21eb0c0be912..8fea54eba0c2 100644 --- a/fs/afs/vl_list.c +++ b/fs/afs/vl_list.c @@ -279,8 +279,8 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, struct afs_addr_list *old = addrs; write_lock(&server->lock); - rcu_swap_protected(server->addresses, old, - lockdep_is_held(&server->lock)); + old = rcu_replace_pointer(server->addresses, old, + lockdep_is_held(&server->lock)); write_unlock(&server->lock); afs_put_addrlist(old); } From 6092f7263f7e56dacf72e383ed7ba9cbabec30e5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Sep 2019 15:37:04 -0700 Subject: [PATCH 38/42] bpf/cgroup: Replace rcu_swap_protected() with rcu_replace_pointer() This commit replaces the use of rcu_swap_protected() with the more intuitively appealing rcu_replace_pointer() as a step towards removing rcu_swap_protected(). Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds [ paulmck: From rcu_replace() to rcu_replace_pointer() per Ingo Molnar. ] Signed-off-by: Paul E. McKenney Acked-by: Andrii Nakryiko Acked-by: Song Liu Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Martin KaFai Lau Cc: Yonghong Song Cc: Cc: --- kernel/bpf/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ddd8addcdb5c..c684cf424849 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -180,8 +180,8 @@ static void activate_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, struct bpf_prog_array *old_array) { - rcu_swap_protected(cgrp->bpf.effective[type], old_array, - lockdep_is_held(&cgroup_mutex)); + old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array, + lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array */ From e3f0d761fcaec5d445c9280d6e09087dc32828d2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Sep 2019 15:42:28 -0700 Subject: [PATCH 39/42] net/core: Replace rcu_swap_protected() with rcu_replace_pointer() This commit replaces the use of rcu_swap_protected() with the more intuitively appealing rcu_replace_pointer() as a step towards removing rcu_swap_protected(). Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds [ paulmck: From rcu_replace() to rcu_replace_pointer() per Ingo Molnar. ] Signed-off-by: Paul E. McKenney Cc: "David S. Miller" Cc: Jiri Pirko Cc: Eric Dumazet Cc: Ido Schimmel Cc: Petr Machata Cc: Paolo Abeni Cc: --- net/core/dev.c | 4 ++-- net/core/sock_reuseport.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index bf3ed413abaf..c5d8882d100f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1288,8 +1288,8 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) } mutex_lock(&ifalias_mutex); - rcu_swap_protected(dev->ifalias, new_alias, - mutex_is_locked(&ifalias_mutex)); + new_alias = rcu_replace_pointer(dev->ifalias, new_alias, + mutex_is_locked(&ifalias_mutex)); mutex_unlock(&ifalias_mutex); if (new_alias) diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index f3ceec93f392..f19f179538b9 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -356,8 +356,8 @@ int reuseport_detach_prog(struct sock *sk) spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); - rcu_swap_protected(reuse->prog, old_prog, - lockdep_is_held(&reuseport_lock)); + old_prog = rcu_replace_pointer(reuse->prog, old_prog, + lockdep_is_held(&reuseport_lock)); spin_unlock_bh(&reuseport_lock); if (!old_prog) From b685b534bf1586a01b32280d1da39febaf270039 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Sep 2019 15:53:02 -0700 Subject: [PATCH 40/42] net/netfilter: Replace rcu_swap_protected() with rcu_replace_pointer() This commit replaces the use of rcu_swap_protected() with the more intuitively appealing rcu_replace_pointer() as a step towards removing rcu_swap_protected(). Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds [ paulmck: From rcu_replace() to rcu_replace_pointer() per Ingo Molnar. ] Signed-off-by: Paul E. McKenney Acked-by: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: "David S. Miller" Cc: Cc: Cc: --- net/netfilter/nf_tables_api.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d481f9baca2f..337997464240 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1461,8 +1461,9 @@ static void nft_chain_stats_replace(struct nft_trans *trans) if (!nft_trans_chain_stats(trans)) return; - rcu_swap_protected(chain->stats, nft_trans_chain_stats(trans), - lockdep_commit_lock_is_held(trans->ctx.net)); + nft_trans_chain_stats(trans) = + rcu_replace_pointer(chain->stats, nft_trans_chain_stats(trans), + lockdep_commit_lock_is_held(trans->ctx.net)); if (!nft_trans_chain_stats(trans)) static_branch_inc(&nft_counters_enabled); From 445d3749315f34229dcfc3efd82796f97fc72e92 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Sep 2019 16:09:18 -0700 Subject: [PATCH 41/42] net/sched: Replace rcu_swap_protected() with rcu_replace_pointer() This commit replaces the use of rcu_swap_protected() with the more intuitively appealing rcu_replace_pointer() as a step towards removing rcu_swap_protected(). Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds [ paulmck: From rcu_replace() to rcu_replace_pointer() per Ingo Molnar. ] Signed-off-by: Paul E. McKenney Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Cc: Cc: --- net/sched/act_api.c | 2 +- net/sched/act_csum.c | 4 ++-- net/sched/act_ct.c | 3 ++- net/sched/act_ctinfo.c | 4 ++-- net/sched/act_ife.c | 2 +- net/sched/act_mirred.c | 4 ++-- net/sched/act_mpls.c | 2 +- net/sched/act_police.c | 6 +++--- net/sched/act_sample.c | 4 ++-- net/sched/act_skbedit.c | 4 ++-- net/sched/act_tunnel_key.c | 4 ++-- net/sched/act_vlan.c | 2 +- 12 files changed, 21 insertions(+), 20 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2558f00f6b3e..3d51573d86c9 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -88,7 +88,7 @@ struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action, struct tcf_chain *goto_chain) { a->tcfa_action = action; - rcu_swap_protected(a->goto_chain, goto_chain, 1); + goto_chain = rcu_replace_pointer(a->goto_chain, goto_chain, 1); return goto_chain; } EXPORT_SYMBOL(tcf_action_set_ctrlact); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index d3cfad88dc3a..87dddbaa2031 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -101,8 +101,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(p->params, params_new, - lockdep_is_held(&p->tcf_lock)); + params_new = rcu_replace_pointer(p->params, params_new, + lockdep_is_held(&p->tcf_lock)); spin_unlock_bh(&p->tcf_lock); if (goto_ch) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index fcc46025e790..2d5ab233349e 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -722,7 +722,8 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, spin_lock_bh(&c->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(c->params, params, lockdep_is_held(&c->tcf_lock)); + params = rcu_replace_pointer(c->params, params, + lockdep_is_held(&c->tcf_lock)); spin_unlock_bh(&c->tcf_lock); if (goto_ch) diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 0dbcfd1dca7b..c59981897846 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -257,8 +257,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); - rcu_swap_protected(ci->params, cp_new, - lockdep_is_held(&ci->tcf_lock)); + cp_new = rcu_replace_pointer(ci->params, cp_new, + lockdep_is_held(&ci->tcf_lock)); spin_unlock_bh(&ci->tcf_lock); if (goto_ch) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 3a31e241c647..2ea2e164e3bd 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -594,7 +594,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, spin_lock_bh(&ife->tcf_lock); /* protected by tcf_lock when modifying existing action */ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(ife->params, p, 1); + p = rcu_replace_pointer(ife->params, p, 1); if (exists) spin_unlock_bh(&ife->tcf_lock); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 9ce073a05414..0c2f737b72f2 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -178,8 +178,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, goto put_chain; } mac_header_xmit = dev_is_mac_header_xmit(dev); - rcu_swap_protected(m->tcfm_dev, dev, - lockdep_is_held(&m->tcf_lock)); + dev = rcu_replace_pointer(m->tcfm_dev, dev, + lockdep_is_held(&m->tcf_lock)); if (dev) dev_put(dev); m->tcfm_mac_header_xmit = mac_header_xmit; diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index e168df0e008a..5b3031c3e81d 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -258,7 +258,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, spin_lock_bh(&m->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(m->mpls_p, p, lockdep_is_held(&m->tcf_lock)); + p = rcu_replace_pointer(m->mpls_p, p, lockdep_is_held(&m->tcf_lock)); spin_unlock_bh(&m->tcf_lock); if (goto_ch) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 89c04c52af3d..caa91cf8791b 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -191,9 +191,9 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, police->tcfp_ptoks = new->tcfp_mtu_ptoks; spin_unlock_bh(&police->tcfp_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(police->params, - new, - lockdep_is_held(&police->tcf_lock)); + new = rcu_replace_pointer(police->params, + new, + lockdep_is_held(&police->tcf_lock)); spin_unlock_bh(&police->tcf_lock); if (goto_ch) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 514456a0b9a8..4deeaf268693 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -102,8 +102,8 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); s->rate = rate; s->psample_group_num = psample_group_num; - rcu_swap_protected(s->psample_group, psample_group, - lockdep_is_held(&s->tcf_lock)); + psample_group = rcu_replace_pointer(s->psample_group, psample_group, + lockdep_is_held(&s->tcf_lock)); if (tb[TCA_SAMPLE_TRUNC_SIZE]) { s->truncate = true; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 6a8d3337c577..c38cc3945763 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -206,8 +206,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(d->params, params_new, - lockdep_is_held(&d->tcf_lock)); + params_new = rcu_replace_pointer(d->params, params_new, + lockdep_is_held(&d->tcf_lock)); spin_unlock_bh(&d->tcf_lock); if (params_new) kfree_rcu(params_new, rcu); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2f83a79f76aa..20d7ca49f7cb 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -381,8 +381,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, spin_lock_bh(&t->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(t->params, params_new, - lockdep_is_held(&t->tcf_lock)); + params_new = rcu_replace_pointer(t->params, params_new, + lockdep_is_held(&t->tcf_lock)); spin_unlock_bh(&t->tcf_lock); tunnel_key_release_params(params_new); if (goto_ch) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 08aaf719a70f..7aca1f0ecc21 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -220,7 +220,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, spin_lock_bh(&v->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); + p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); spin_unlock_bh(&v->tcf_lock); if (goto_ch) From a60a5746004d7dbb68cbccd4c16d0529e2b2d1d9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 Oct 2019 15:07:09 -0700 Subject: [PATCH 42/42] security/safesetid: Replace rcu_swap_protected() with rcu_replace_pointer() This commit replaces the use of rcu_swap_protected() with the more intuitively appealing rcu_replace_pointer() as a step towards removing rcu_swap_protected(). Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds Reported-by: Reported-by: kbuild test robot [ paulmck: From rcu_replace() to rcu_replace_pointer() per Ingo Molnar. ] Signed-off-by: Paul E. McKenney Cc: Micah Morton Cc: James Morris Cc: "Serge E. Hallyn" Cc: --- security/safesetid/securityfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/safesetid/securityfs.c b/security/safesetid/securityfs.c index 74a13d432ed8..f8bc574cea9c 100644 --- a/security/safesetid/securityfs.c +++ b/security/safesetid/securityfs.c @@ -179,8 +179,8 @@ out_free_rule: * doesn't currently exist, just use a spinlock for now. */ mutex_lock(&policy_update_lock); - rcu_swap_protected(safesetid_setuid_rules, pol, - lockdep_is_held(&policy_update_lock)); + pol = rcu_replace_pointer(safesetid_setuid_rules, pol, + lockdep_is_held(&policy_update_lock)); mutex_unlock(&policy_update_lock); err = len;