sched: Move the loadavg code to a more obvious location

I could not find the loadavg code... turns out it was hidden in a file
called proc.c. It further got mingled up with the cruft per rq load
indexes (which we really want to get rid of).

Move the per rq load indexes into the fair.c load-balance code (that's
the only thing that uses them) and rename proc.c to loadavg.c so we
can find it again.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
[ Did minor cleanups to the code. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Peter Zijlstra authored on 2015-04-14 13:19:42 +02:00; committed by Ingo Molnar
Parent: bb2ebf0886
Commit: 3289bdb429
6 changed files with 222 additions and 219 deletions

include/linux/sched.h

@@ -173,7 +173,12 @@ extern unsigned long nr_iowait_cpu(int cpu);
 extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void update_cpu_load_nohz(void);
+#else
+static inline void update_cpu_load_nohz(void) { }
+#endif

 extern unsigned long get_parent_ip(unsigned long addr);

kernel/sched/Makefile

@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif

-obj-y += core.o proc.o clock.o cputime.o
+obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
 obj-y += wait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o

kernel/sched/core.c

@@ -2397,9 +2397,9 @@ unsigned long nr_iowait_cpu(int cpu)

 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
 {
-	struct rq *this = this_rq();
-	*nr_waiters = atomic_read(&this->nr_iowait);
-	*load = this->cpu_load[0];
+	struct rq *rq = this_rq();
+	*nr_waiters = atomic_read(&rq->nr_iowait);
+	*load = rq->load.weight;
 }

 #ifdef CONFIG_SMP
@@ -2497,6 +2497,7 @@ void scheduler_tick(void)
 	update_rq_clock(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
 	update_cpu_load_active(rq);
+	calc_global_load_tick(rq);
 	raw_spin_unlock(&rq->lock);

 	perf_event_task_tick();

kernel/sched/fair.c

@@ -4323,6 +4323,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }

 #ifdef CONFIG_SMP
+
+/*
+ * per rq 'load' arrray crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT 7
+static const unsigned char
+		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+					{0, 0, 0, 0, 0, 0, 0, 0},
+					{64, 32, 8, 0, 0, 0, 0, 0},
+					{96, 72, 40, 12, 1, 0, 0},
+					{112, 98, 75, 43, 15, 1, 0},
+					{120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+	int j = 0;
+
+	if (!missed_updates)
+		return load;
+
+	if (missed_updates >= degrade_zero_ticks[idx])
+		return 0;
+
+	if (idx == 1)
+		return load >> missed_updates;
+
+	while (missed_updates) {
+		if (missed_updates % 2)
+			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+		missed_updates >>= 1;
+		j++;
+	}
+	return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+			      unsigned long pending_updates)
+{
+	int i, scale;
+
+	this_rq->nr_load_updates++;
+
+	/* Update our load: */
+	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+		unsigned long old_load, new_load;
+
+		/* scale is effectively 1 << i now, and >> i divides by scale */
+
+		old_load = this_rq->cpu_load[i];
+		old_load = decay_load_missed(old_load, pending_updates - 1, i);
+		new_load = this_load;
+		/*
+		 * Round up the averaging division if load is increasing. This
+		 * prevents us from getting stuck on 9 if the load is 10, for
+		 * example.
+		 */
+		if (new_load > old_load)
+			new_load += scale - 1;
+
+		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+	}
+
+	sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+static void update_idle_cpu_load(struct rq *this_rq)
+{
+	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+	unsigned long load = this_rq->cfs.runnable_load_avg;
+	unsigned long pending_updates;
+
+	/*
+	 * bail if there's load or we're actually up-to-date.
+	 */
+	if (load || curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	this_rq->last_load_update_tick = curr_jiffies;
+	__update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+	struct rq *this_rq = this_rq();
+	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+	unsigned long pending_updates;
+
+	if (curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	raw_spin_lock(&this_rq->lock);
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	if (pending_updates) {
+		this_rq->last_load_update_tick = curr_jiffies;
+		/*
+		 * We were idle, this means load 0, the current load might be
+		 * !0 due to remote wakeups and the sort.
+		 */
+		__update_cpu_load(this_rq, 0, pending_updates);
+	}
+	raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+	unsigned long load = this_rq->cfs.runnable_load_avg;
+	/*
+	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+	 */
+	this_rq->last_load_update_tick = jiffies;
+	__update_cpu_load(this_rq, load, 1);
+}
+
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(const int cpu)
 {
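The comment block above describes how decay_load_missed() walks the set bits of the missed-tick count against the precomputed degrade_factor table instead of looping once per tick. The following is a minimal, standalone userspace sketch (not part of the patch) that compares the table-driven decay against the naive per-tick decay it approximates. DEGRADE_SHIFT, the two tables and the decay loop are copied from the hunk above; naive_decay() and the main() driver are hypothetical helpers added here purely for illustration, and the two columns should only track each other to within the 128-point resolution of the table.

#include <stdio.h>

#define DEGRADE_SHIFT    7
#define CPU_LOAD_IDX_MAX 5

static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
static const unsigned char degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
	{0, 0, 0, 0, 0, 0, 0, 0},
	{64, 32, 8, 0, 0, 0, 0, 0},
	{96, 72, 40, 12, 1, 0, 0},
	{112, 98, 75, 43, 15, 1, 0},
	{120, 112, 98, 76, 45, 16, 2} };

/* Same bit-walking algorithm as decay_load_missed() in the hunk above. */
static unsigned long decay_load_missed(unsigned long load,
				       unsigned long missed, int idx)
{
	int j = 0;

	if (!missed)
		return load;
	if (missed >= degrade_zero_ticks[idx])
		return 0;
	if (idx == 1)
		return load >> missed;

	while (missed) {
		if (missed % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

/* Naive reference: apply load = load * (2^idx - 1) / 2^idx, n times. */
static unsigned long naive_decay(unsigned long load, unsigned long n, int idx)
{
	while (n--)
		load = load * ((1UL << idx) - 1) >> idx;
	return load;
}

int main(void)
{
	unsigned long load = 1024;

	for (int idx = 1; idx < CPU_LOAD_IDX_MAX; idx++)
		for (unsigned long n = 1; n < 16; n++)
			printf("idx=%d n=%2lu fast=%4lu naive=%4lu\n", idx, n,
			       decay_load_missed(load, n, idx),
			       naive_decay(load, n, idx));
	return 0;
}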

kernel/sched/proc.c → kernel/sched/loadavg.c

@@ -1,7 +1,9 @@
 /*
- * kernel/sched/proc.c
+ * kernel/sched/loadavg.c
  *
- * Kernel load calculations, forked from sched/core.c
+ * This file contains the magic bits required to compute the global loadavg
+ * figure. Its a silly number but people think its important. We go through
+ * great pains to make it work on big machines and tickless kernels.
  */

 #include <linux/export.h>
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq)
 	long nr_active, delta = 0;

 	nr_active = this_rq->nr_running;
-	nr_active += (long) this_rq->nr_uninterruptible;
+	nr_active += (long)this_rq->nr_uninterruptible;

 	if (nr_active != this_rq->calc_load_active) {
 		delta = nr_active - this_rq->calc_load_active;
@@ -186,6 +188,7 @@ void calc_load_enter_idle(void)
 	delta = calc_load_fold_active(this_rq);
 	if (delta) {
 		int idx = calc_load_write_idx();
+
 		atomic_long_add(delta, &calc_load_idle[idx]);
 	}
 }
@@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
 {
 	unsigned long result = 1UL << frac_bits;

-	if (n) for (;;) {
-		if (n & 1) {
-			result *= x;
-			result += 1UL << (frac_bits - 1);
-			result >>= frac_bits;
+	if (n) {
+		for (;;) {
+			if (n & 1) {
+				result *= x;
+				result += 1UL << (frac_bits - 1);
+				result >>= frac_bits;
+			}
+			n >>= 1;
+			if (!n)
+				break;
+			x *= x;
+			x += 1UL << (frac_bits - 1);
+			x >>= frac_bits;
 		}
-		n >>= 1;
-		if (!n)
-			break;
-		x *= x;
-		x += 1UL << (frac_bits - 1);
-		x >>= frac_bits;
 	}

 	return result;
@@ -285,7 +290,6 @@ static unsigned long
 calc_load_n(unsigned long load, unsigned long exp,
 	    unsigned long active, unsigned int n)
 {
-
 	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
 }
@@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { }
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
  * CPUs have updated calc_load_tasks.
+ *
+ * Called from the global timer code.
  */
 void calc_global_load(unsigned long ticks)
 {
@@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks)
 }

 /*
- * Called from update_cpu_load() to periodically update this CPU's
+ * Called from scheduler_tick() to periodically update this CPU's
  * active count.
  */
-static void calc_load_account_active(struct rq *this_rq)
+void calc_global_load_tick(struct rq *this_rq)
 {
 	long delta;
@@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq)
 	this_rq->calc_load_update += LOAD_FREQ;
 }
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT 7
-static const unsigned char
-		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
-		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
-					{0, 0, 0, 0, 0, 0, 0, 0},
-					{64, 32, 8, 0, 0, 0, 0, 0},
-					{96, 72, 40, 12, 1, 0, 0},
-					{112, 98, 75, 43, 15, 1, 0},
-					{120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
-	int j = 0;
-
-	if (!missed_updates)
-		return load;
-
-	if (missed_updates >= degrade_zero_ticks[idx])
-		return 0;
-
-	if (idx == 1)
-		return load >> missed_updates;
-
-	while (missed_updates) {
-		if (missed_updates % 2)
-			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
-		missed_updates >>= 1;
-		j++;
-	}
-	return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
-			      unsigned long pending_updates)
-{
-	int i, scale;
-
-	this_rq->nr_load_updates++;
-
-	/* Update our load: */
-	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
-	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
-		unsigned long old_load, new_load;
-
-		/* scale is effectively 1 << i now, and >> i divides by scale */
-
-		old_load = this_rq->cpu_load[i];
-		old_load = decay_load_missed(old_load, pending_updates - 1, i);
-		new_load = this_load;
-		/*
-		 * Round up the averaging division if load is increasing. This
-		 * prevents us from getting stuck on 9 if the load is 10, for
-		 * example.
-		 */
-		if (new_load > old_load)
-			new_load += scale - 1;
-
-		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
-	}
-
-	sched_avg_update(this_rq);
-}
-
-#ifdef CONFIG_SMP
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
-	return rq->cfs.runnable_load_avg;
-}
-#else
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
-	return rq->load.weight;
-}
-#endif
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-	unsigned long load = get_rq_runnable_load(this_rq);
-	unsigned long pending_updates;
-
-	/*
-	 * bail if there's load or we're actually up-to-date.
-	 */
-	if (load || curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	this_rq->last_load_update_tick = curr_jiffies;
-	__update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
-	struct rq *this_rq = this_rq();
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-	unsigned long pending_updates;
-
-	if (curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	raw_spin_lock(&this_rq->lock);
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	if (pending_updates) {
-		this_rq->last_load_update_tick = curr_jiffies;
-		/*
-		 * We were idle, this means load 0, the current load might be
-		 * !0 due to remote wakeups and the sort.
-		 */
-		__update_cpu_load(this_rq, 0, pending_updates);
-	}
-	raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ */
-
-/*
- * Called from scheduler_tick()
- */
-void update_cpu_load_active(struct rq *this_rq)
-{
-	unsigned long load = get_rq_runnable_load(this_rq);
-	/*
-	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
-	 */
-	this_rq->last_load_update_tick = jiffies;
-	__update_cpu_load(this_rq, load, 1);
-
-	calc_load_account_active(this_rq);
-}
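The calc_load_n() path above folds n missed LOAD_FREQ (5 second) periods into a single avenrun update by raising the per-period decay factor to the n-th power with fixed_power_int(), instead of stepping once per period. Below is a minimal standalone userspace sketch of that idea, assuming the kernel's usual loadavg fixed-point constants (FSHIFT = 11, EXP_1 = 1884 for the 1-minute average); fixed_power_int() is copied from the new version in the hunk above, while calc_load() here is a simplified stand-in (the kernel helper differs in rounding details) and main() is illustrative only.

#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed-point */
#define EXP_1	1884			/* 1/exp(5sec/1min) in fixed-point */

/* Same exponentiation-by-squaring as fixed_power_int() in the hunk above. */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) {
		for (;;) {
			if (n & 1) {
				result *= x;
				result += 1UL << (frac_bits - 1);
				result >>= frac_bits;
			}
			n >>= 1;
			if (!n)
				break;
			x *= x;
			x += 1UL << (frac_bits - 1);
			x >>= frac_bits;
		}
	}
	return result;
}

/* Simplified single-period update: load = load*exp + active*(1-exp). */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long load = 3 * FIXED_1;	/* loadavg of 3.00 */
	unsigned long active = 0;		/* machine went idle */
	unsigned int n = 12;			/* one minute of missed periods */

	/* Step one 5s period at a time... */
	unsigned long stepped = load;
	for (unsigned int i = 0; i < n; i++)
		stepped = calc_load(stepped, EXP_1, active);

	/* ...or fold all n periods into one update, as calc_load_n() does. */
	unsigned long folded = calc_load(load, fixed_power_int(EXP_1, FSHIFT, n), active);

	printf("stepped: %lu.%02lu  folded: %lu.%02lu\n",
	       stepped >> FSHIFT, (stepped & (FIXED_1 - 1)) * 100 / FIXED_1,
	       folded >> FSHIFT, (folded & (FIXED_1 - 1)) * 100 / FIXED_1);
	return 0;
}

Stepping twelve times and folding once land on nearly the same value (they differ only by accumulated rounding), which is what lets calc_global_nohz() catch up after long tickless idle stretches without looping over every missed period.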

kernel/sched/sched.h

@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running;
 extern unsigned long calc_load_update;
 extern atomic_long_t calc_load_tasks;

+extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq);
+
+#ifdef CONFIG_SMP
 extern void update_cpu_load_active(struct rq *this_rq);
+#else
+static inline void update_cpu_load_active(struct rq *this_rq) { }
+#endif

 /*
  * Helpers for converting nanosecond timing to jiffy resolution
@@ -1298,8 +1304,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);

 unsigned long to_ratio(u64 period, u64 runtime);

-extern void update_idle_cpu_load(struct rq *this_rq);
-
 extern void init_task_runnable_average(struct task_struct *p);

 static inline void add_nr_running(struct rq *rq, unsigned count)