From: Venkatesh Pallipadi <venki@google.com>
Date: Sat, 22 May 2010 00:09:41 +0000 (-0700)
Subject: sched: Change nohz idle load balancing logic to push model
X-Git-Tag: v2.6.36-rc1~531^2~21
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=83cd4fe27ad8446619b2e030b171b858501de87d

sched: Change nohz idle load balancing logic to push model

In the new push model, all idle CPUs indeed go into nohz mode. There is
still the concept of an idle load balancer (performing the load balancing
on behalf of all the idle CPUs in the system). A busy CPU kicks the nohz
balancer when any of the nohz CPUs needs idle load balancing; the kicked
CPU then does the idle load balancing on behalf of all idle CPUs instead
of the normal idle balance.

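Condensed, the kick path wired up by the sched_fair.c hunks below looks
like this (a sketch only; the call_single_data/IPI plumbing and the
config #ifdefs are elided):

	/* on a busy CPU, from the scheduler tick: */
	if (nohz_kick_needed(rq, cpu))		/* extra load to push out? */
		nohz_balancer_kick(cpu);	/* IPI raises SCHED_SOFTIRQ on the ilb CPU */

	/* on the kicked idle CPU, from the SCHED_SOFTIRQ handler: */
	nohz_idle_balance(this_cpu, idle);	/* balance on behalf of all idle CPUs */
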
This addresses the following two problems with the current nohz ilb logic:
* The idle load balancer continued to have periodic ticks during idle and
  woke up frequently, even though it did not have any rebalancing to do on
  behalf of any of the idle CPUs.
* On x86, on CPUs whose APIC timer stops in idle, this periodic wakeup can
  result in an additional periodic interrupt on the CPU doing the timer
  broadcast.

Also, currently we are migrating unpinned timers from an idle cpu to the
cpu doing idle load balancing (when all the cpus in the system are idle,
there is no idle load balancing cpu and timers get added to the same idle
cpu where the request was made; so the existing optimization works only on
a semi-idle system).

In a semi-idle system, we no longer have periodic ticks on the idle load
balancer CPU. Using that cpu will add more delays to the timers than
intended (as that cpu's timer base may not be up to date wrt jiffies etc).
This was causing mysterious slowdowns during boot etc.

For now, in the semi-idle case, use the nearest busy cpu for migrating
timers from an idle cpu. This is good for power savings anyway.

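The hrtimer.c and timer.c hunks below both reduce to the same pattern
(a condensed sketch; only the target selection is shown):

	/* choosing a base cpu for an unpinned timer armed from an idle cpu */
	int cpu = smp_processor_id();
	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
		cpu = get_nohz_timer_target();	/* nearest busy cpu, else this cpu */
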
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

[ backported for 2.6.35 ]

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c2d4316..a3e5b1c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -271,13 +271,10 @@ extern int runqueue_is_locked(int cpu);
 
 extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
-extern int select_nohz_load_balancer(int cpu);
-extern int get_nohz_load_balancer(void);
+extern void select_nohz_load_balancer(int stop_tick);
+extern int get_nohz_timer_target(void);
 #else
-static inline int select_nohz_load_balancer(int cpu)
-{
-	return 0;
-}
+static inline void select_nohz_load_balancer(int stop_tick) { }
 #endif
 
 /*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e99..e934339 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 static int hrtimer_get_target(int this_cpu, int pinned)
 {
 #ifdef CONFIG_NO_HZ
-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
-		int preferred_cpu = get_nohz_load_balancer();
-
-		if (preferred_cpu >= 0)
-			return preferred_cpu;
-	}
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
+		return get_nohz_timer_target();
 #endif
 	return this_cpu;
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index a757f6b..132950b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -460,7 +460,7 @@ struct rq {
 	unsigned long last_load_update_tick;
 #ifdef CONFIG_NO_HZ
 	u64 nohz_stamp;
-	unsigned char in_nohz_recently;
+	unsigned char nohz_balance_kick;
 #endif
 	unsigned int skip_clock_update;
 
@@ -1195,6 +1195,27 @@ static void resched_cpu(int cpu)
 
 #ifdef CONFIG_NO_HZ
 /*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu.  This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+	int cpu = smp_processor_id();
+	int i;
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd))
+			if (!idle_cpu(i))
+				return i;
+	}
+	return cpu;
+}
+/*
  * When add_timer_on() enqueues a timer into the timer wheel of an
  * idle CPU then this timer might expire before the next timer event
  * which is scheduled to wake up that CPU. In case of a completely
@@ -7791,6 +7812,10 @@ void __init sched_init(void)
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+		rq->nohz_balance_kick = 0;
+		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
 #endif
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
@@ -7835,8 +7860,11 @@ void __init sched_init(void)
 	zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
-	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+	atomic_set(&nohz.load_balancer, nr_cpu_ids);
+	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
 #endif
 	/* May be allocated at isolcpus cmdline parse time */
 	if (cpu_isolated_map == NULL)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 22b8b4f..6ee2e0a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3091,13 +3091,40 @@ out_unlock:
 }
 
 #ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+	raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+	csd->func = trigger_sched_softirq;
+	csd->info = NULL;
+	csd->flags = 0;
+	csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ *   entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ *   it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ *   needed, they will kick the idle load balancer, which then does idle
+ *   load balancing for all the idle CPUs.
+ */
 static struct {
 	atomic_t load_balancer;
-	cpumask_var_t cpu_mask;
-	cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
-	.load_balancer = ATOMIC_INIT(-1),
-};
+	atomic_t first_pick_cpu;
+	atomic_t second_pick_cpu;
+	cpumask_var_t idle_cpus_mask;
+	cpumask_var_t grp_idle_mask;
+	unsigned long next_balance;     /* in jiffy units */
+} nohz ____cacheline_aligned;
 
 int get_nohz_load_balancer(void)
 {
@@ -3151,17 +3178,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
  */
 static inline int is_semi_idle_group(struct sched_group *ilb_group)
 {
-	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+	cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
 					sched_group_cpus(ilb_group));
 
 	/*
 	 * A sched_group is semi-idle when it has atleast one busy cpu
 	 * and atleast one idle cpu.
 	 */
-	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+	if (cpumask_empty(nohz.grp_idle_mask))
 		return 0;
 
-	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+	if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
 		return 0;
 
 	return 1;
@@ -3194,7 +3221,7 @@ static int find_new_ilb(int cpu)
 	 * Optimize for the case when we have no idle CPUs or only one
 	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
 	 */
-	if (cpumask_weight(nohz.cpu_mask) < 2)
+	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
 		goto out_done;
 
 	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3202,7 +3229,7 @@ static int find_new_ilb(int cpu)
 
 		do {
 			if (is_semi_idle_group(ilb_group))
-				return cpumask_first(nohz.ilb_grp_nohz_mask);
+				return cpumask_first(nohz.grp_idle_mask);
 
 			ilb_group = ilb_group->next;
 
@@ -3210,98 +3237,116 @@ static int find_new_ilb(int cpu)
 	}
 
 out_done:
-	return cpumask_first(nohz.cpu_mask);
+	return nr_cpu_ids;
 }
 #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
 {
-	return cpumask_first(nohz.cpu_mask);
+	return nr_cpu_ids;
 }
 #endif
 
 /*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+	int ilb_cpu;
+
+	nohz.next_balance++;
+
+	ilb_cpu = get_nohz_load_balancer();
+
+	if (ilb_cpu >= nr_cpu_ids) {
+		ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+		if (ilb_cpu >= nr_cpu_ids)
+			return;
+	}
+
+	if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+		struct call_single_data *cp;
+
+		cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+		cp = &per_cpu(remote_sched_softirq_cb, cpu);
+		__smp_call_function_single(ilb_cpu, cp, 0);
+	}
+	return;
+}
+
+/*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * load balancing on behalf of all those cpus.
  *
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
+ * When the ilb owner becomes busy, we will not have new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
  *
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ * ilb owner CPU in future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
  */
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
 {
 	int cpu = smp_processor_id();
 
 	if (stop_tick) {
-		cpu_rq(cpu)->in_nohz_recently = 1;
-
 		if (!cpu_active(cpu)) {
 			if (atomic_read(&nohz.load_balancer) != cpu)
-				return 0;
+				return;
 
 			/*
 			 * If we are going offline and still the leader,
 			 * give up!
 			 */
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+					   nr_cpu_ids) != cpu)
 				BUG();
 
-			return 0;
+			return;
 		}
 
-		cpumask_set_cpu(cpu, nohz.cpu_mask);
+		cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 
-		/* time for ilb owner also to sleep */
-		if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
-			if (atomic_read(&nohz.load_balancer) == cpu)
-				atomic_set(&nohz.load_balancer, -1);
-			return 0;
-		}
+		if (atomic_read(&nohz.first_pick_cpu) == cpu)
+			atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+		if (atomic_read(&nohz.second_pick_cpu) == cpu)
+			atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
 
-		if (atomic_read(&nohz.load_balancer) == -1) {
-			/* make me the ilb owner */
-			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
-				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+		if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
 			int new_ilb;
 
-			if (!(sched_smt_power_savings ||
-						sched_mc_power_savings))
-				return 1;
+			/* make me the ilb owner */
+			if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+					   cpu) != nr_cpu_ids)
+				return;
+
 			/*
 			 * Check to see if there is a more power-efficient
 			 * ilb.
 			 */
 			new_ilb = find_new_ilb(cpu);
 			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-				atomic_set(&nohz.load_balancer, -1);
+				atomic_set(&nohz.load_balancer, nr_cpu_ids);
 				resched_cpu(new_ilb);
-				return 0;
+				return;
 			}
-			return 1;
+			return;
 		}
 	} else {
-		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
-			return 0;
+		if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+			return;
 
-		cpumask_clear_cpu(cpu, nohz.cpu_mask);
+		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
 
 		if (atomic_read(&nohz.load_balancer) == cpu)
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+					   nr_cpu_ids) != cpu)
 				BUG();
 	}
-	return 0;
+	return;
 }
 #endif
 
@@ -3383,11 +3428,101 @@ out:
 		rq->next_balance = next_balance;
 }
 
+#ifdef CONFIG_NO_HZ
 /*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
  * rebalancing for all the cpus for whom scheduler ticks are stopped.
  */
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+	struct rq *this_rq = cpu_rq(this_cpu);
+	struct rq *rq;
+	int balance_cpu;
+
+	if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+		return;
+
+	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+		if (balance_cpu == this_cpu)
+			continue;
+
+		/*
+		 * If this cpu gets work to do, stop the load balancing
+		 * work being done for other cpus. Next load
+		 * balancing owner will pick it up.
+		 */
+		if (need_resched()) {
+			this_rq->nohz_balance_kick = 0;
+			break;
+		}
+
+		raw_spin_lock_irq(&this_rq->lock);
+		update_cpu_load(this_rq);
+		raw_spin_unlock_irq(&this_rq->lock);
+
+		rebalance_domains(balance_cpu, CPU_IDLE);
+
+		rq = cpu_rq(balance_cpu);
+		if (time_after(this_rq->next_balance, rq->next_balance))
+			this_rq->next_balance = rq->next_balance;
+	}
+	nohz.next_balance = this_rq->next_balance;
+	this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is the one of the busy CPUs. It will kick
+ *   idle load balancer when it has more than one process active. This
+ *   eliminates the need for idle load balancing altogether when we have
+ *   only one running process in the system (common case).
+ * - If there are more than one busy CPU, idle load balancer may have
+ *   to run for active_load_balance to happen (i.e., two busy CPUs are
+ *   SMT or core siblings and can run better if they move to different
+ *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ *   which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+	unsigned long now = jiffies;
+	int ret;
+	int first_pick_cpu, second_pick_cpu;
+
+	if (time_before(now, nohz.next_balance))
+		return 0;
+
+	if (!rq->nr_running)
+		return 0;
+
+	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+	second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+	if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+	    second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+		return 0;
+
+	ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+	if (ret == nr_cpu_ids || ret == cpu) {
+		atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+		if (rq->nr_running > 1)
+			return 1;
+	} else {
+		ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+		if (ret == nr_cpu_ids || ret == cpu) {
+			if (rq->nr_running)
+				return 1;
+		}
+	}
+	return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
 static void run_rebalance_domains(struct softirq_action *h)
 {
 	int this_cpu = smp_processor_id();
@@ -3397,40 +3532,12 @@ static void run_rebalance_domains(struct softirq_action *h)
 
 	rebalance_domains(this_cpu, idle);
 
-#ifdef CONFIG_NO_HZ
 	/*
-	 * If this cpu is the owner for idle load balancing, then do the
+	 * If this cpu has a pending nohz_balance_kick, then do the
 	 * balancing on behalf of the other idle cpus whose ticks are
 	 * stopped.
 	 */
-	if (this_rq->idle_at_tick &&
-	    atomic_read(&nohz.load_balancer) == this_cpu) {
-		struct rq *rq;
-		int balance_cpu;
-
-		for_each_cpu(balance_cpu, nohz.cpu_mask) {
-			if (balance_cpu == this_cpu)
-				continue;
-
-			/*
-			 * If this cpu gets work to do, stop the load balancing
-			 * work being done for other cpus. Next load
-			 * balancing owner will pick it up.
-			 */
-			if (need_resched())
-				break;
-
-			rq = cpu_rq(balance_cpu);
-			raw_spin_lock_irq(&rq->lock);
-			update_cpu_load(rq);
-			raw_spin_unlock_irq(&rq->lock);
-			rebalance_domains(balance_cpu, CPU_IDLE);
-
-			if (time_after(this_rq->next_balance, rq->next_balance))
-				this_rq->next_balance = rq->next_balance;
-		}
-	}
-#endif
+	nohz_idle_balance(this_cpu, idle);
 }
 
 static inline int on_null_domain(int cpu)
@@ -3440,57 +3547,17 @@ static inline int on_null_domain(int cpu)
 
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
  */
 static inline void trigger_load_balance(struct rq *rq, int cpu)
 {
-#ifdef CONFIG_NO_HZ
-	/*
-	 * If we were in the nohz mode recently and busy at the current
-	 * scheduler tick, then check if we need to nominate new idle
-	 * load balancer.
-	 */
-	if (rq->in_nohz_recently && !rq->idle_at_tick) {
-		rq->in_nohz_recently = 0;
-
-		if (atomic_read(&nohz.load_balancer) == cpu) {
-			cpumask_clear_cpu(cpu, nohz.cpu_mask);
-			atomic_set(&nohz.load_balancer, -1);
-		}
-
-		if (atomic_read(&nohz.load_balancer) == -1) {
-			int ilb = find_new_ilb(cpu);
-
-			if (ilb < nr_cpu_ids)
-				resched_cpu(ilb);
-		}
-	}
-
-	/*
-	 * If this cpu is idle and doing idle load balancing for all the
-	 * cpus with ticks stopped, is it time for that to stop?
-	 */
-	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
-		resched_cpu(cpu);
-		return;
-	}
-
-	/*
-	 * If this cpu is idle and the idle load balancing is done by
-	 * someone else, then no need raise the SCHED_SOFTIRQ
-	 */
-	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-	    cpumask_test_cpu(cpu, nohz.cpu_mask))
-		return;
-#endif
 	/* Don't need to rebalance while attached to NULL domain */
 	if (time_after_eq(jiffies, rq->next_balance) &&
 	    likely(!on_null_domain(cpu)))
 		raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+	else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+		nohz_balancer_kick(cpu);
+#endif
 }
 
 static void rq_online_fair(struct rq *rq)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1d7b9bc..5f171f0 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -408,13 +408,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 		 * the scheduler tick in nohz_restart_sched_tick.
 		 */
 		if (!ts->tick_stopped) {
-			if (select_nohz_load_balancer(1)) {
-				/*
-				 * sched tick not stopped!
-				 */
-				cpumask_clear_cpu(cpu, nohz_cpu_mask);
-				goto out;
-			}
+			select_nohz_load_balancer(1);
 
 			ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
 			ts->tick_stopped = 1;
diff --git a/kernel/timer.c b/kernel/timer.c
index ee305c8..48d6aec 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -679,12 +679,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 	cpu = smp_processor_id();
 
 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
-		int preferred_cpu = get_nohz_load_balancer();
-
-		if (preferred_cpu >= 0)
-			cpu = preferred_cpu;
-	}
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
+		cpu = get_nohz_timer_target();
 #endif
 	new_base = per_cpu(tvec_bases, cpu);