From 88a411c07b6fedcfc97b8dc51ae18540bd2beda0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 3 Apr 2008 09:06:13 +0200
Subject: [PATCH 1/3] seqlock: livelock fix

Thomas Gleixner debugged a particularly ugly seqlock related livelock:
do not process the seq-read section if we know it beforehand that the
test at the end of the section will fail ...

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/seqlock.h | 46 ++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 26e4925bc35b..632205ccc25d 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -85,23 +85,29 @@ static inline int write_tryseqlock(seqlock_t *sl)
 /* Start of read calculation -- fetch last complete writer token */
 static __always_inline unsigned read_seqbegin(const seqlock_t *sl)
 {
-	unsigned ret = sl->sequence;
+	unsigned ret;
+
+repeat:
+	ret = sl->sequence;
 	smp_rmb();
+	if (unlikely(ret & 1)) {
+		cpu_relax();
+		goto repeat;
+	}
+
 	return ret;
 }
 
-/* Test if reader processed invalid data.
- * If initial values is odd, 
- *	then writer had already started when section was entered
- * If sequence value changed
- *	then writer changed data while in section
- *    
- * Using xor saves one conditional branch.
+/*
+ * Test if reader processed invalid data.
+ *
+ * If sequence value changed then writer changed data while in section.
  */
-static __always_inline int read_seqretry(const seqlock_t *sl, unsigned iv)
+static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start)
 {
 	smp_rmb();
-	return (iv & 1) | (sl->sequence ^ iv);
+
+	return (sl->sequence != start);
 }
 
 
@@ -122,20 +128,26 @@ typedef struct seqcount {
 /* Start of read using pointer to a sequence counter only.  */
 static inline unsigned read_seqcount_begin(const seqcount_t *s)
 {
-	unsigned ret = s->sequence;
+	unsigned ret;
+
+repeat:
+	ret = s->sequence;
 	smp_rmb();
+	if (unlikely(ret & 1)) {
+		cpu_relax();
+		goto repeat;
+	}
 	return ret;
 }
 
-/* Test if reader processed invalid data.
- * Equivalent to: iv is odd or sequence number has changed.
- *                (iv & 1) || (*s != iv)
- * Using xor saves one conditional branch.
+/*
+ * Test if reader processed invalid data because sequence number has changed.
  */
-static inline int read_seqcount_retry(const seqcount_t *s, unsigned iv)
+static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
 {
 	smp_rmb();
-	return (iv & 1) | (s->sequence ^ iv);
+
+	return s->sequence != start;
 }
 
 

From 126e01bf92dfc5f0ba91e88be02c473e1506d7d9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Apr 2008 00:25:08 +0200
Subject: [PATCH 2/3] softlockup: fix NOHZ wakeup

David Miller reported:

|--------------->
the following commit:

| commit 27ec4407790d075c325e1f4da0a19c56953cce23
| Author: Ingo Molnar <mingo@elte.hu>
| Date:   Thu Feb 28 21:00:21 2008 +0100
|
|     sched: make cpu_clock() globally synchronous
|
|     Alexey Zaytsev reported (and bisected) that the introduction of
|     cpu_clock() in printk made the timestamps jump back and forth.
|
|     Make cpu_clock() more reliable while still keeping it fast when it's
|     called frequently.
|
|     Signed-off-by: Ingo Molnar <mingo@elte.hu>

causes watchdog triggers when a cpu exits NOHZ state when it has been
there for >= the soft lockup threshold, for example here are some
messages from a 128 cpu Niagara2 box:

[  168.106406] BUG: soft lockup - CPU#11 stuck for 128s! [dd:3239]
[  168.989592] BUG: soft lockup - CPU#21 stuck for 86s! [swapper:0]
[  168.999587] BUG: soft lockup - CPU#29 stuck for 91s! [make:4511]
[  168.999615] BUG: soft lockup - CPU#2 stuck for 85s! [swapper:0]
[  169.020514] BUG: soft lockup - CPU#37 stuck for 91s! [swapper:0]
[  169.020514] BUG: soft lockup - CPU#45 stuck for 91s! [sh:4515]
[  169.020515] BUG: soft lockup - CPU#69 stuck for 92s! [swapper:0]
[  169.020515] BUG: soft lockup - CPU#77 stuck for 92s! [swapper:0]
[  169.020515] BUG: soft lockup - CPU#61 stuck for 92s! [swapper:0]
[  169.112554] BUG: soft lockup - CPU#85 stuck for 92s! [swapper:0]
[  169.112554] BUG: soft lockup - CPU#101 stuck for 92s! [swapper:0]
[  169.112554] BUG: soft lockup - CPU#109 stuck for 92s! [swapper:0]
[  169.112554] BUG: soft lockup - CPU#117 stuck for 92s! [swapper:0]
[  169.171483] BUG: soft lockup - CPU#40 stuck for 80s! [dd:3239]
[  169.331483] BUG: soft lockup - CPU#13 stuck for 86s! [swapper:0]
[  169.351500] BUG: soft lockup - CPU#43 stuck for 101s! [dd:3239]
[  169.531482] BUG: soft lockup - CPU#9 stuck for 129s! [mkdir:4565]
[  169.595754] BUG: soft lockup - CPU#20 stuck for 93s! [swapper:0]
[  169.626787] BUG: soft lockup - CPU#52 stuck for 93s! [swapper:0]
[  169.626787] BUG: soft lockup - CPU#84 stuck for 92s! [swapper:0]
[  169.636812] BUG: soft lockup - CPU#116 stuck for 94s! [swapper:0]

It's simple enough to trigger this by doing a 10 minute sleep after a
fresh bootup then starting a parallel kernel build.

I suspect this might be reintroducing a problem we've had and fixed
before, see the thread:

http://marc.info/?l=linux-kernel&m=119546414004065&w=2
<---------------|

touch the softlockup watchdog when exiting NOHZ state - we are
obviously not locked up.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-sched.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d358d4e3a958..b854a895591e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -393,6 +393,7 @@ void tick_nohz_restart_sched_tick(void)
 		sub_preempt_count(HARDIRQ_OFFSET);
 	}
 
+	touch_softlockup_watchdog();
 	/*
 	 * Cancel the scheduled timer and restore the tick
 	 */

From 3f5087a2bae5d1ce10a3d698dec8f879a96f5419 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Apr 2008 00:25:08 +0200
Subject: [PATCH 3/3] sched: fix share (re)distribution

fix __aggregate_redistribute_shares() related lockup reported by
David S. Miller.

The problem this code tries to solve is 'accurately' calculating the 'fair'
share of the group weight for each cpu. The current code falls back to a global
group rebalance in case the sched_domain's span it looks at has no shares, but
does have tasks.

The reason it gets stuck here, is because its inherently racy - if someone
steals the last task after we compute the agg->rq_weight, but before we
rebalance, we'll never get out of the loop.

We could of course go fix that, but while looking at this issue I found that
this 'fallback' wasn't nearly as rare as I'd hoped it to be. In fact its quite
common - and given it walks the whole machine, thats very bad.

The new approach is simple (why didn't I think of it before?), we set the
aggregate shares to the full task group weight, and each larger sched domain
that encounters an aggregate shares larger than the weight, clips it (it
already re-distributes anyway).

This nicely converges to the desired global picture where the sum of all
shares equals the task group weight.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 47 ++---------------------------------------------
 1 file changed, 2 insertions(+), 45 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 0014b03adaca..85e1721594f0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1656,42 +1656,6 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
 	aggregate(tg, sd)->task_weight = task_weight;
 }
 
-/*
- * Redistribute tg->shares amongst all tg->cfs_rq[]s.
- */
-static void __aggregate_redistribute_shares(struct task_group *tg)
-{
-	int i, max_cpu = smp_processor_id();
-	unsigned long rq_weight = 0;
-	unsigned long shares, max_shares = 0, shares_rem = tg->shares;
-
-	for_each_possible_cpu(i)
-		rq_weight += tg->cfs_rq[i]->load.weight;
-
-	for_each_possible_cpu(i) {
-		/*
-		 * divide shares proportional to the rq_weights.
-		 */
-		shares = tg->shares * tg->cfs_rq[i]->load.weight;
-		shares /= rq_weight + 1;
-
-		tg->cfs_rq[i]->shares = shares;
-
-		if (shares > max_shares) {
-			max_shares = shares;
-			max_cpu = i;
-		}
-		shares_rem -= shares;
-	}
-
-	/*
-	 * Ensure it all adds up to tg->shares; we can loose a few
-	 * due to rounding down when computing the per-cpu shares.
-	 */
-	if (shares_rem)
-		tg->cfs_rq[max_cpu]->shares += shares_rem;
-}
-
 /*
  * Compute the weight of this group on the given cpus.
  */
@@ -1701,18 +1665,11 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
 	unsigned long shares = 0;
 	int i;
 
-again:
 	for_each_cpu_mask(i, sd->span)
 		shares += tg->cfs_rq[i]->shares;
 
-	/*
-	 * When the span doesn't have any shares assigned, but does have
-	 * tasks to run do a machine wide rebalance (should be rare).
-	 */
-	if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) {
-		__aggregate_redistribute_shares(tg);
-		goto again;
-	}
+	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+		shares = tg->shares;
 
 	aggregate(tg, sd)->shares = shares;
 }