From 810b38179e9e4d4f57b4b733767bb08f8291a965 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 29 Feb 2008 15:21:01 -0500
Subject: [PATCH 1/6] sched: retain vruntime

Kei Tokunaga reported an interactivity problem when moving tasks
between control groups.

Tasks would retain their old vruntime when moved between groups, which
can cause funny lags.

Re-set the vruntime on group move to fit within the new tree.

Reported-by: Kei Tokunaga
Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h |  4 ++++
 kernel/sched.c        |  5 +++++
 kernel/sched_fair.c   | 14 ++++++++++++++
 3 files changed, 23 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9ae4030067a9..11d8e9a74eff 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -899,6 +899,10 @@ struct sched_class {
 			     int running);
 	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
 			     int oldprio, int running);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	void (*moved_group) (struct task_struct *p);
+#endif
 };
 
 struct load_weight {
diff --git a/kernel/sched.c b/kernel/sched.c
index dcd553cc4ee8..0b949c4e73ad 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7625,6 +7625,11 @@ void sched_move_task(struct task_struct *tsk)
 
 	set_task_rq(tsk, task_cpu(tsk));
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (tsk->sched_class->moved_group)
+		tsk->sched_class->moved_group(tsk);
+#endif
+
 	if (on_rq) {
 		if (unlikely(running))
 			tsk->sched_class->set_curr_task(rq);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3df4d46994ca..e2a530515619 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1353,6 +1353,16 @@ static void set_curr_task_fair(struct rq *rq)
 		set_next_entity(cfs_rq_of(se), se);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void moved_group_fair(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+	update_curr(cfs_rq);
+	place_entity(cfs_rq, &p->se, 1);
+}
+#endif
+
 /*
  * All the scheduling class methods:
  */
@@ -1381,6 +1391,10 @@ static const struct sched_class fair_sched_class = {
 
 	.prio_changed		= prio_changed_fair,
 	.switched_to		= switched_to_fair,
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	.moved_group		= moved_group_fair,
+#endif
 };
 
 #ifdef CONFIG_SCHED_DEBUG
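The fix above works because a vruntime is only meaningful relative to the
min_vruntime of the tree it is queued on; carrying the old value into
another group's tree gives the task an arbitrary lead or lag there. As a
rough stand-alone illustration of that re-basing idea (the toy_rq,
toy_task and toy_move_task names are invented for this note, and the
simple offset-preserving policy shown here is only a simplification; the
patch itself re-seeds the entity via place_entity() on the destination
cfs_rq):

/*
 * Toy user-space model of why vruntime must be re-based on a group move.
 * Not kernel code; it only preserves the task's offset against
 * min_vruntime to show the idea of fitting into the new tree.
 */
#include <stdio.h>

struct toy_rq {
	unsigned long long min_vruntime;
};

struct toy_task {
	unsigned long long vruntime;
};

static void toy_move_task(struct toy_task *t, const struct toy_rq *from,
			  const struct toy_rq *to)
{
	/* offset of the task within its old tree, carried into the new one */
	unsigned long long offset = t->vruntime - from->min_vruntime;

	t->vruntime = to->min_vruntime + offset;
}

int main(void)
{
	struct toy_rq a = { .min_vruntime = 1000000ULL };
	struct toy_rq b = { .min_vruntime = 9000000ULL };
	struct toy_task t = { .vruntime = 1000500ULL };	/* queued on a */

	printf("stale vruntime would be %llu on b (min_vruntime %llu)\n",
	       t.vruntime, b.min_vruntime);
	toy_move_task(&t, &a, &b);
	printf("re-based vruntime is    %llu on b\n", t.vruntime);
	return 0;
}

The second line of output shows the moved task landing next to the
destination queue's min_vruntime instead of keeping its stale value.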
From 6fa46fa526f2cab9ce21fa5e39501553a40d196d Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Wed, 5 Mar 2008 10:00:12 -0500
Subject: [PATCH 2/6] sched: balance RT task resched only on runqueue

Sripathi Kodi reported a crash in the -rt kernel:

  https://bugzilla.redhat.com/show_bug.cgi?id=435674

This is due to a code path that can reschedule a task without holding
the task's runqueue lock: the RT balancing code pulls RT tasks to the
current run queue and then reschedules the current task.

There's a slight chance that pulling the RT tasks will release the
current runqueue's lock and retake it (in double_lock_balance). While
the runqueue lock is released, the current task can migrate to another
runqueue.

In the prio_changed_rt code, after the pull, if the current task is of
lesser priority than one of the RT tasks pulled, resched_task is called
on the current task. If the current task had migrated in that small
window, resched_task will be called without holding the runqueue lock
for the runqueue that the task is on. This race condition also exists
in the mainline kernel, and this patch adds a check to make sure the
task hasn't migrated before calling resched_task.

Signed-off-by: Steven Rostedt
Tested-by: Sripathi Kodi
Acked-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
---
 kernel/sched_rt.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 76e828517541..0a6d2e516420 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1107,9 +1107,11 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
 		pull_rt_task(rq);
 		/*
 		 * If there's a higher priority task waiting to run
-		 * then reschedule.
+		 * then reschedule. Note, the above pull_rt_task
+		 * can release the rq lock and p could migrate.
+		 * Only reschedule if p is still on the same runqueue.
 		 */
-		if (p->prio > rq->rt.highest_prio)
+		if (p->prio > rq->rt.highest_prio && rq->curr == p)
 			resched_task(p);
 #else
 		/* For UP simply resched on drop of prio */

From 150d8bede7f85eb00d8f4d628e6b0bae68739e3b Mon Sep 17 00:00:00 2001
From: Pavel Roskin
Date: Wed, 5 Mar 2008 16:56:37 -0500
Subject: [PATCH 3/6] sched: export task_nice

The API is trivial, and so is the implementation.

Signed-off-by: Pavel Roskin
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 0b949c4e73ad..63a469f8853d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4422,7 +4422,7 @@ int task_nice(const struct task_struct *p)
 {
 	return TASK_NICE(p);
 }
-EXPORT_SYMBOL_GPL(task_nice);
+EXPORT_SYMBOL(task_nice);
 
 /**
  * idle_cpu - is a given cpu idle currently?

From 1868f958eb56fc41c5985c8732e564a400c5fdf5 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Fri, 7 Mar 2008 09:35:06 +0800
Subject: [PATCH 4/6] sched: fix the wrong time slice value for SCHED_FIFO tasks

sys_sched_rr_get_interval() returns the wrong time slice value for
SCHED_FIFO tasks. The time slice for SCHED_FIFO tasks should be 0.

Signed-off-by: Miao Xie
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 63a469f8853d..5b13e4b0e009 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5100,7 +5100,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
 	time_slice = 0;
 	if (p->policy == SCHED_RR) {
 		time_slice = DEF_TIMESLICE;
-	} else {
+	} else if (p->policy != SCHED_FIFO) {
 		struct sched_entity *se = &p->se;
 		unsigned long flags;
 		struct rq *rq;
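A quick way to observe the behaviour patch 4 corrects is to query the
time slice from user space under each policy. The sketch below is not
part of the series; it assumes a kernel with the fix applied and enough
privilege to switch to a realtime policy (otherwise the
sched_setscheduler() calls fail and only the SCHED_OTHER line prints):

/*
 * Report the time slice seen by sched_rr_get_interval() under several
 * scheduling policies.  After the fix, SCHED_FIFO reports 0 and
 * SCHED_RR reports the RR quantum.
 */
#include <sched.h>
#include <stdio.h>
#include <time.h>

static void report(const char *name)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == 0)
		printf("%s: time slice %ld.%09ld s\n",
		       name, (long)ts.tv_sec, ts.tv_nsec);
	else
		perror("sched_rr_get_interval");
}

int main(void)
{
	struct sched_param sp = { .sched_priority = 1 };

	report("SCHED_OTHER");

	if (sched_setscheduler(0, SCHED_FIFO, &sp) == 0)
		report("SCHED_FIFO");	/* expected: 0 after the fix */
	else
		perror("sched_setscheduler(SCHED_FIFO)");

	if (sched_setscheduler(0, SCHED_RR, &sp) == 0)
		report("SCHED_RR");	/* expected: the RR quantum */
	else
		perror("sched_setscheduler(SCHED_RR)");

	return 0;
}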
From 2692a2406b9262bbb101708815be99ec2988e48b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 27 Feb 2008 12:00:46 +0100
Subject: [PATCH 5/6] sched: rt-group: fixup schedulability constraints calculation

It was only possible to configure the rt-group scheduling parameters
beyond the default value in a very small range. That's because
div64_64() has a different calling convention than do_div() :/

Fix a few untidies while we are here: sysctl_sched_rt_period may
overflow due to that multiplication, so cast to u64 first. Also, the
RUNTIME_INF juggling makes little sense, although it's an effective NOP.

Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 5b13e4b0e009..b8ee864c7481 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7726,9 +7726,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 	if (runtime == RUNTIME_INF)
 		return 1ULL << 16;
 
-	runtime *= (1ULL << 16);
-	div64_64(runtime, period);
-	return runtime;
+	return div64_64(runtime << 16, period);
 }
 
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
@@ -7757,18 +7755,16 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 	u64 rt_runtime, rt_period;
 	int err = 0;
 
-	rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
+	rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
 	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
 	if (rt_runtime_us == -1)
-		rt_runtime = rt_period;
+		rt_runtime = RUNTIME_INF;
 
 	mutex_lock(&rt_constraints_mutex);
 	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
 		err = -EINVAL;
 		goto unlock;
 	}
-	if (rt_runtime_us == -1)
-		rt_runtime = RUNTIME_INF;
 	tg->rt_runtime = rt_runtime;
  unlock:
 	mutex_unlock(&rt_constraints_mutex);

From 521f1a2489c41f8b1181b0a8eb52e1c34284d50b Mon Sep 17 00:00:00 2001
From: Dhaval Giani
Date: Thu, 28 Feb 2008 15:21:56 +0530
Subject: [PATCH 6/6] sched: don't allow rt_runtime_us to be zero for groups having rt tasks

This patch checks whether rt_runtime_us can be set to 0. If there is a
realtime task in the group, we don't want to set rt_runtime_us to 0, or
bad things will happen: that task won't get any CPU time despite being
TASK_RUNNING.

Signed-off-by: Dhaval Giani
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index b8ee864c7481..52b98675acb2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7750,6 +7750,17 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 	return total + to_ratio(period, runtime) < global_ratio;
 }
 
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+	struct task_struct *g, *p;
+	do_each_thread(g, p) {
+		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+			return 1;
+	} while_each_thread(g, p);
+	return 0;
+}
+
 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
 	u64 rt_runtime, rt_period;
@@ -7761,12 +7772,18 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 		rt_runtime = RUNTIME_INF;
 
 	mutex_lock(&rt_constraints_mutex);
+	read_lock(&tasklist_lock);
+	if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) {
+		err = -EBUSY;
+		goto unlock;
+	}
 	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
 		err = -EINVAL;
 		goto unlock;
 	}
 	tg->rt_runtime = rt_runtime;
  unlock:
+	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return err;
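To make the fixed-point arithmetic behind patch 5 concrete, here is a
small stand-alone sketch of the corrected to_ratio() calculation. Plain
64-bit division stands in for the kernel's div64_64(), and the 950 ms
runtime in a 1 s period is an example value chosen for this note, not
taken from the patches:

/*
 * to_ratio() expresses runtime/period in 16.16 fixed point.  The old
 * code discarded the quotient, because div64_64() returns its result
 * rather than updating the dividend in place the way do_div() does;
 * shifting and dividing in one expression, as the patch does, yields
 * the intended ratio.
 */
#include <stdio.h>
#include <stdint.h>

static unsigned long to_ratio(uint64_t period, uint64_t runtime)
{
	return (unsigned long)((runtime << 16) / period);
}

int main(void)
{
	uint64_t period  = 1000000ULL * 1000;	/* 1 s, in ns */
	uint64_t runtime =  950000ULL * 1000;	/* 950 ms, in ns */
	unsigned long ratio = to_ratio(period, runtime);

	printf("ratio = %lu / 65536 (~%.3f of one CPU)\n",
	       ratio, ratio / 65536.0);
	return 0;
}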