|
Chuck Ebbert |
b218718 |
From: Venkatesh Pallipadi <venki@google.com>
|
|
Chuck Ebbert |
b218718 |
Date: Tue, 18 May 2010 01:14:43 +0000 (-0700)
|
|
Chuck Ebbert |
b218718 |
Subject: sched: Avoid side-effect of tickless idle on update_cpu_load
|
|
Chuck Ebbert |
b218718 |
X-Git-Tag: v2.6.36-rc1~531^2~22
|
|
Chuck Ebbert |
b218718 |
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=fdf3e95d3916f18bf8703fb065499fdbc4dfe34c
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
sched: Avoid side-effect of tickless idle on update_cpu_load
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
tickless idle has a negative side effect on update_cpu_load(), which
|
|
Chuck Ebbert |
b218718 |
in turn can affect load balancing behavior.
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
update_cpu_load() is supposed to be called every tick, to keep track
|
|
Chuck Ebbert |
b218718 |
of various load indices. With tickless idle, there are no scheduler
|
|
Chuck Ebbert |
b218718 |
ticks called on the idle CPUs. Idle CPUs may still do load balancing
|
|
Chuck Ebbert |
b218718 |
(with idle_load_balance CPU) using the stale cpu_load. It will also
|
|
Chuck Ebbert |
b218718 |
cause problems when all CPUs go idle for a while and become active
|
|
Chuck Ebbert |
b218718 |
again. In this case loads would not degrade as expected.
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
This is how the rq->nr_load_updates change looks under different
|
|
Chuck Ebbert |
b218718 |
conditions:
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
<cpu_num> <nr_load_updates change>
|
|
Chuck Ebbert |
b218718 |
All CPUS idle for 10 seconds (HZ=1000)
|
|
Chuck Ebbert |
b218718 |
0 1621
|
|
Chuck Ebbert |
b218718 |
10 496
|
|
Chuck Ebbert |
b218718 |
11 139
|
|
Chuck Ebbert |
b218718 |
12 875
|
|
Chuck Ebbert |
b218718 |
13 1672
|
|
Chuck Ebbert |
b218718 |
14 12
|
|
Chuck Ebbert |
b218718 |
15 21
|
|
Chuck Ebbert |
b218718 |
1 1472
|
|
Chuck Ebbert |
b218718 |
2 2426
|
|
Chuck Ebbert |
b218718 |
3 1161
|
|
Chuck Ebbert |
b218718 |
4 2108
|
|
Chuck Ebbert |
b218718 |
5 1525
|
|
Chuck Ebbert |
b218718 |
6 701
|
|
Chuck Ebbert |
b218718 |
7 249
|
|
Chuck Ebbert |
b218718 |
8 766
|
|
Chuck Ebbert |
b218718 |
9 1967
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
One CPU busy rest idle for 10 seconds
|
|
Chuck Ebbert |
b218718 |
0 10003
|
|
Chuck Ebbert |
b218718 |
10 601
|
|
Chuck Ebbert |
b218718 |
11 95
|
|
Chuck Ebbert |
b218718 |
12 966
|
|
Chuck Ebbert |
b218718 |
13 1597
|
|
Chuck Ebbert |
b218718 |
14 114
|
|
Chuck Ebbert |
b218718 |
15 98
|
|
Chuck Ebbert |
b218718 |
1 3457
|
|
Chuck Ebbert |
b218718 |
2 93
|
|
Chuck Ebbert |
b218718 |
3 6679
|
|
Chuck Ebbert |
b218718 |
4 1425
|
|
Chuck Ebbert |
b218718 |
5 1479
|
|
Chuck Ebbert |
b218718 |
6 595
|
|
Chuck Ebbert |
b218718 |
7 193
|
|
Chuck Ebbert |
b218718 |
8 633
|
|
Chuck Ebbert |
b218718 |
9 1687
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
All CPUs busy for 10 seconds
|
|
Chuck Ebbert |
b218718 |
0 10026
|
|
Chuck Ebbert |
b218718 |
10 10026
|
|
Chuck Ebbert |
b218718 |
11 10026
|
|
Chuck Ebbert |
b218718 |
12 10026
|
|
Chuck Ebbert |
b218718 |
13 10025
|
|
Chuck Ebbert |
b218718 |
14 10025
|
|
Chuck Ebbert |
b218718 |
15 10025
|
|
Chuck Ebbert |
b218718 |
1 10026
|
|
Chuck Ebbert |
b218718 |
2 10026
|
|
Chuck Ebbert |
b218718 |
3 10026
|
|
Chuck Ebbert |
b218718 |
4 10026
|
|
Chuck Ebbert |
b218718 |
5 10026
|
|
Chuck Ebbert |
b218718 |
6 10026
|
|
Chuck Ebbert |
b218718 |
7 10026
|
|
Chuck Ebbert |
b218718 |
8 10026
|
|
Chuck Ebbert |
b218718 |
9 10026
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
That is update_cpu_load works properly only when all CPUs are busy.
|
|
Chuck Ebbert |
b218718 |
If all are idle, all the CPUs get way lower updates. And when few
|
|
Chuck Ebbert |
b218718 |
CPUs are busy and the rest are idle, only the busy and ilb CPUs do proper
|
|
Chuck Ebbert |
b218718 |
updates and rest of the idle CPUs will do lower updates.
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
The patch keeps track of when a last update was done and fixes up
|
|
Chuck Ebbert |
b218718 |
the load avg based on current time.
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
On one of my test system SPECjbb with warehouse 1..numcpus, patch
|
|
Chuck Ebbert |
b218718 |
improves throughput numbers by ~1% (average of 6 runs). On another
|
|
Chuck Ebbert |
b218718 |
test system (with different domain hierarchy) there is no noticeable
|
|
Chuck Ebbert |
b218718 |
change in perf.
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
|
|
Chuck Ebbert |
b218718 |
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
|
Chuck Ebbert |
b218718 |
Cc: Thomas Gleixner <tglx@linutronix.de>
|
|
Chuck Ebbert |
b218718 |
LKML-Reference: <AANLkTilLtDWQsAUrIxJ6s04WTgmw9GuOODc5AOrYsaR5@mail.gmail.com>
|
|
Chuck Ebbert |
b218718 |
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
|
Chuck Ebbert |
b218718 |
---
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
diff --git a/kernel/sched.c b/kernel/sched.c
|
|
Chuck Ebbert |
b218718 |
index f37a961..a757f6b 100644
|
|
Chuck Ebbert |
b218718 |
--- a/kernel/sched.c
|
|
Chuck Ebbert |
b218718 |
+++ b/kernel/sched.c
|
|
Chuck Ebbert |
b218718 |
@@ -457,6 +457,7 @@ struct rq {
|
|
Chuck Ebbert |
b218718 |
unsigned long nr_running;
|
|
Chuck Ebbert |
b218718 |
#define CPU_LOAD_IDX_MAX 5
|
|
Chuck Ebbert |
b218718 |
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
|
|
Chuck Ebbert |
b218718 |
+ unsigned long last_load_update_tick;
|
|
Chuck Ebbert |
b218718 |
#ifdef CONFIG_NO_HZ
|
|
Chuck Ebbert |
b218718 |
u64 nohz_stamp;
|
|
Chuck Ebbert |
b218718 |
unsigned char in_nohz_recently;
|
|
Chuck Ebbert |
b218718 |
@@ -1803,6 +1804,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
|
|
Chuck Ebbert |
b218718 |
static void calc_load_account_idle(struct rq *this_rq);
|
|
Chuck Ebbert |
b218718 |
static void update_sysctl(void);
|
|
Chuck Ebbert |
b218718 |
static int get_update_sysctl_factor(void);
|
|
Chuck Ebbert |
b218718 |
+static void update_cpu_load(struct rq *this_rq);
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
|
|
Chuck Ebbert |
b218718 |
{
|
|
Chuck Ebbert |
b218718 |
@@ -3050,23 +3052,102 @@ static void calc_load_account_active(struct rq *this_rq)
|
|
Chuck Ebbert |
b218718 |
}
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
/*
|
|
Chuck Ebbert |
b218718 |
+ * The exact cpuload at various idx values, calculated at every tick would be
|
|
Chuck Ebbert |
b218718 |
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
|
|
Chuck Ebbert |
b218718 |
+ *
|
|
Chuck Ebbert |
b218718 |
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
|
|
Chuck Ebbert |
b218718 |
+ * on nth tick when cpu may be busy, then we have:
|
|
Chuck Ebbert |
b218718 |
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
|
|
Chuck Ebbert |
b218718 |
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
|
|
Chuck Ebbert |
b218718 |
+ *
|
|
Chuck Ebbert |
b218718 |
+ * decay_load_missed() below does efficient calculation of
|
|
Chuck Ebbert |
b218718 |
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
|
|
Chuck Ebbert |
b218718 |
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
|
|
Chuck Ebbert |
b218718 |
+ *
|
|
Chuck Ebbert |
b218718 |
+ * The calculation is approximated on a 128 point scale.
|
|
Chuck Ebbert |
b218718 |
+ * degrade_zero_ticks is the number of ticks after which load at any
|
|
Chuck Ebbert |
b218718 |
+ * particular idx is approximated to be zero.
|
|
Chuck Ebbert |
b218718 |
+ * degrade_factor is a precomputed table, a row for each load idx.
|
|
Chuck Ebbert |
b218718 |
+ * Each column corresponds to degradation factor for a power of two ticks,
|
|
Chuck Ebbert |
b218718 |
+ * based on 128 point scale.
|
|
Chuck Ebbert |
b218718 |
+ * Example:
|
|
Chuck Ebbert |
b218718 |
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
|
|
Chuck Ebbert |
b218718 |
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
|
|
Chuck Ebbert |
b218718 |
+ *
|
|
Chuck Ebbert |
b218718 |
+ * With this power of 2 load factors, we can degrade the load n times
|
|
Chuck Ebbert |
b218718 |
+ * by looking at 1 bits in n and doing as many mult/shift instead of
|
|
Chuck Ebbert |
b218718 |
+ * n mult/shifts needed by the exact degradation.
|
|
Chuck Ebbert |
b218718 |
+ */
|
|
Chuck Ebbert |
b218718 |
+#define DEGRADE_SHIFT 7
|
|
Chuck Ebbert |
b218718 |
+static const unsigned char
|
|
Chuck Ebbert |
b218718 |
+ degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
|
|
Chuck Ebbert |
b218718 |
+static const unsigned char
|
|
Chuck Ebbert |
b218718 |
+ degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
|
|
Chuck Ebbert |
b218718 |
+ {0, 0, 0, 0, 0, 0, 0, 0},
|
|
Chuck Ebbert |
b218718 |
+ {64, 32, 8, 0, 0, 0, 0, 0},
|
|
Chuck Ebbert |
b218718 |
+ {96, 72, 40, 12, 1, 0, 0},
|
|
Chuck Ebbert |
b218718 |
+ {112, 98, 75, 43, 15, 1, 0},
|
|
Chuck Ebbert |
b218718 |
+ {120, 112, 98, 76, 45, 16, 2} };
|
|
Chuck Ebbert |
b218718 |
+
|
|
Chuck Ebbert |
b218718 |
+/*
|
|
Chuck Ebbert |
b218718 |
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
|
|
Chuck Ebbert |
b218718 |
+ * would be when CPU is idle and so we just decay the old load without
|
|
Chuck Ebbert |
b218718 |
+ * adding any new load.
|
|
Chuck Ebbert |
b218718 |
+ */
|
|
Chuck Ebbert |
b218718 |
+static unsigned long
|
|
Chuck Ebbert |
b218718 |
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
|
|
Chuck Ebbert |
b218718 |
+{
|
|
Chuck Ebbert |
b218718 |
+ int j = 0;
|
|
Chuck Ebbert |
b218718 |
+
|
|
Chuck Ebbert |
b218718 |
+ if (!missed_updates)
|
|
Chuck Ebbert |
b218718 |
+ return load;
|
|
Chuck Ebbert |
b218718 |
+
|
|
Chuck Ebbert |
b218718 |
+ if (missed_updates >= degrade_zero_ticks[idx])
|
|
Chuck Ebbert |
b218718 |
+ return 0;
|
|
Chuck Ebbert |
b218718 |
+
|
|
Chuck Ebbert |
b218718 |
+ if (idx == 1)
|
|
Chuck Ebbert |
b218718 |
+ return load >> missed_updates;
|
|
Chuck Ebbert |
b218718 |
+
|
|
Chuck Ebbert |
b218718 |
+ while (missed_updates) {
|
|
Chuck Ebbert |
b218718 |
+ if (missed_updates % 2)
|
|
Chuck Ebbert |
b218718 |
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
|
|
Chuck Ebbert |
b218718 |
+
|
|
Chuck Ebbert |
b218718 |
+ missed_updates >>= 1;
|
|
Chuck Ebbert |
b218718 |
+ j++;
|
|
Chuck Ebbert |
b218718 |
+ }
|
|
Chuck Ebbert |
b218718 |
+ return load;
|
|
Chuck Ebbert |
b218718 |
+}
|
|
Chuck Ebbert |
b218718 |
+
|
|
Chuck Ebbert |
b218718 |
+/*
|
|
Chuck Ebbert |
b218718 |
* Update rq->cpu_load[] statistics. This function is usually called every
|
|
Chuck Ebbert |
b218718 |
- * scheduler tick (TICK_NSEC).
|
|
Chuck Ebbert |
b218718 |
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
|
|
Chuck Ebbert |
b218718 |
+ * every tick. We fix it up based on jiffies.
|
|
Chuck Ebbert |
b218718 |
*/
|
|
Chuck Ebbert |
b218718 |
static void update_cpu_load(struct rq *this_rq)
|
|
Chuck Ebbert |
b218718 |
{
|
|
Chuck Ebbert |
b218718 |
unsigned long this_load = this_rq->load.weight;
|
|
Chuck Ebbert |
b218718 |
+ unsigned long curr_jiffies = jiffies;
|
|
Chuck Ebbert |
b218718 |
+ unsigned long pending_updates;
|
|
Chuck Ebbert |
b218718 |
int i, scale;
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
this_rq->nr_load_updates++;
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
+ /* Avoid repeated calls on same jiffy, when moving in and out of idle */
|
|
Chuck Ebbert |
b218718 |
+ if (curr_jiffies == this_rq->last_load_update_tick)
|
|
Chuck Ebbert |
b218718 |
+ return;
|
|
Chuck Ebbert |
b218718 |
+
|
|
Chuck Ebbert |
b218718 |
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
|
|
Chuck Ebbert |
b218718 |
+ this_rq->last_load_update_tick = curr_jiffies;
|
|
Chuck Ebbert |
b218718 |
+
|
|
Chuck Ebbert |
b218718 |
/* Update our load: */
|
|
Chuck Ebbert |
b218718 |
- for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
|
|
Chuck Ebbert |
b218718 |
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
|
|
Chuck Ebbert |
b218718 |
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
|
|
Chuck Ebbert |
b218718 |
unsigned long old_load, new_load;
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
/* scale is effectively 1 << i now, and >> i divides by scale */
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
old_load = this_rq->cpu_load[i];
|
|
Chuck Ebbert |
b218718 |
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
|
|
Chuck Ebbert |
b218718 |
new_load = this_load;
|
|
Chuck Ebbert |
b218718 |
/*
|
|
Chuck Ebbert |
b218718 |
* Round up the averaging division if load is increasing. This
|
|
Chuck Ebbert |
b218718 |
@@ -3074,9 +3155,15 @@ static void update_cpu_load(struct rq *this_rq)
|
|
Chuck Ebbert |
b218718 |
* example.
|
|
Chuck Ebbert |
b218718 |
*/
|
|
Chuck Ebbert |
b218718 |
if (new_load > old_load)
|
|
Chuck Ebbert |
b218718 |
- new_load += scale-1;
|
|
Chuck Ebbert |
b218718 |
- this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
|
|
Chuck Ebbert |
b218718 |
+ new_load += scale - 1;
|
|
Chuck Ebbert |
b218718 |
+
|
|
Chuck Ebbert |
b218718 |
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
|
|
Chuck Ebbert |
b218718 |
}
|
|
Chuck Ebbert |
b218718 |
+}
|
|
Chuck Ebbert |
b218718 |
+
|
|
Chuck Ebbert |
b218718 |
+static void update_cpu_load_active(struct rq *this_rq)
|
|
Chuck Ebbert |
b218718 |
+{
|
|
Chuck Ebbert |
b218718 |
+ update_cpu_load(this_rq);
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
calc_load_account_active(this_rq);
|
|
Chuck Ebbert |
b218718 |
}
|
|
Chuck Ebbert |
b218718 |
@@ -3464,7 +3551,7 @@ void scheduler_tick(void)
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
raw_spin_lock(&rq->lock);
|
|
Chuck Ebbert |
b218718 |
update_rq_clock(rq);
|
|
Chuck Ebbert |
b218718 |
- update_cpu_load(rq);
|
|
Chuck Ebbert |
b218718 |
+ update_cpu_load_active(rq);
|
|
Chuck Ebbert |
b218718 |
curr->sched_class->task_tick(rq, curr, 0);
|
|
Chuck Ebbert |
b218718 |
raw_spin_unlock(&rq->lock);
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
@@ -7688,6 +7775,9 @@ void __init sched_init(void)
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
|
|
Chuck Ebbert |
b218718 |
rq->cpu_load[j] = 0;
|
|
Chuck Ebbert |
b218718 |
+
|
|
Chuck Ebbert |
b218718 |
+ rq->last_load_update_tick = jiffies;
|
|
Chuck Ebbert |
b218718 |
+
|
|
Chuck Ebbert |
b218718 |
#ifdef CONFIG_SMP
|
|
Chuck Ebbert |
b218718 |
rq->sd = NULL;
|
|
Chuck Ebbert |
b218718 |
rq->rd = NULL;
|
|
Chuck Ebbert |
b218718 |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
|
Chuck Ebbert |
b218718 |
index eed35ed..22b8b4f 100644
|
|
Chuck Ebbert |
b218718 |
--- a/kernel/sched_fair.c
|
|
Chuck Ebbert |
b218718 |
+++ b/kernel/sched_fair.c
|
|
Chuck Ebbert |
b218718 |
@@ -3420,9 +3420,12 @@ static void run_rebalance_domains(struct softirq_action *h)
|
|
Chuck Ebbert |
b218718 |
if (need_resched())
|
|
Chuck Ebbert |
b218718 |
break;
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
+ rq = cpu_rq(balance_cpu);
|
|
Chuck Ebbert |
b218718 |
+ raw_spin_lock_irq(&rq->lock);
|
|
Chuck Ebbert |
b218718 |
+ update_cpu_load(rq);
|
|
Chuck Ebbert |
b218718 |
+ raw_spin_unlock_irq(&rq->lock);
|
|
Chuck Ebbert |
b218718 |
rebalance_domains(balance_cpu, CPU_IDLE);
|
|
Chuck Ebbert |
b218718 |
|
|
Chuck Ebbert |
b218718 |
- rq = cpu_rq(balance_cpu);
|
|
Chuck Ebbert |
b218718 |
if (time_after(this_rq->next_balance, rq->next_balance))
|
|
Chuck Ebbert |
b218718 |
this_rq->next_balance = rq->next_balance;
|
|
Chuck Ebbert |
b218718 |
}
|