From b218718b2b345b73aa274b30f83f544b1e5d9817 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Sep 21 2010 13:27:25 +0000 Subject: Scheduler fixes for Bugzilla #635813 and #633037 --- diff --git a/kernel.spec b/kernel.spec index 316d1c0..62004ac 100644 --- a/kernel.spec +++ b/kernel.spec @@ -742,6 +742,13 @@ Patch12540: irda-correctly-clean-up-self-ias_obj-on-irda_bind-failure.patch Patch12550: keys-fix-bug-in-keyctl_session_to_parent-if-parent-has-no-session-keyring.patch Patch12551: keys-fix-rcu-no-lock-warning-in-keyctl_session_to_parent.patch +Patch12560: sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch +Patch12565: sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch +Patch12570: sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch +Patch12575: sched-15-update-rq-clock-for-nohz-balanced-cpus.patch +Patch12580: sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch +Patch12585: sched-25-move-sched_avg_update-to-update_cpu_load.patch + %endif BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root @@ -1371,6 +1378,14 @@ ApplyPatch irda-correctly-clean-up-self-ias_obj-on-irda_bind-failure.patch ApplyPatch keys-fix-bug-in-keyctl_session_to_parent-if-parent-has-no-session-keyring.patch ApplyPatch keys-fix-rcu-no-lock-warning-in-keyctl_session_to_parent.patch +# Scheduler fixes (#635813 and #633037) +ApplyPatch sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch +ApplyPatch sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch +ApplyPatch sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch +ApplyPatch sched-15-update-rq-clock-for-nohz-balanced-cpus.patch +ApplyPatch sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch +ApplyPatch sched-25-move-sched_avg_update-to-update_cpu_load.patch + # END OF PATCH APPLICATIONS %endif @@ -1957,7 +1972,10 @@ fi # and build. %changelog -* Mon Sep 20 2010 Chuck Ebbert 2.6.35.5-29 +* Tue Sep 21 2010 Chuck Ebbert 2.6.35.5-29 +- Scheduler fixes for Bugzilla #635813 and #633037 + +* Mon Sep 20 2010 Chuck Ebbert - Linux 2.6.35.5 - Drop merged patches: 01-compat-make-compat_alloc_user_space-incorporate-the-access_ok-check.patch diff --git a/sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch b/sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch new file mode 100644 index 0000000..7c15122 --- /dev/null +++ b/sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch @@ -0,0 +1,55 @@ +From: Stanislaw Gruszka +Date: Tue, 14 Sep 2010 14:35:14 +0000 (+0200) +Subject: sched: Fix user time incorrectly accounted as system time on 32-bit +X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Fx86%2Flinux-2.6-tip.git;a=commitdiff_plain;h=e75e863dd5c7d96b91ebbd241da5328fc38a78cc + +sched: Fix user time incorrectly accounted as system time on 32-bit + +We have 32-bit variable overflow possibility when multiply in +task_times() and thread_group_times() functions. When the +overflow happens then the scaled utime value becomes erroneously +small and the scaled stime becomes i erroneously big. 
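A minimal userspace sketch of the overflow (not part of the patch; the tick
counts are invented and cputime_t is assumed to be a 32-bit unsigned type, as
on 32-bit x86 without CONFIG_VIRT_CPU_ACCOUNTING):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t cputime_t;		/* assumption: 32-bit cputime_t */

int main(void)
{
	cputime_t utime = 70000, stime = 30000;	/* hypothetical tick counts */
	cputime_t rtime = 90000;		/* scaled sum_exec_runtime  */
	cputime_t total = utime + stime;

	/* Old pattern: the multiply is done in 32 bits and wraps before
	 * the widening cast, so the scaled utime comes out far too small. */
	uint64_t bad = (uint64_t)(rtime * utime);
	bad /= total;				/* stands in for do_div()   */

	/* Fixed pattern: widen to 64 bits first, then multiply. */
	uint64_t good = rtime;
	good *= utime;
	good /= total;

	printf("wrapped: %llu, correct: %llu\n",
	       (unsigned long long)bad, (unsigned long long)good);
	return 0;
}

With these numbers the wrapped result is about 20050 while the correct scaled
utime is 63000 -- the "utime too small, stime too big" symptom reported below.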
+ +Reported here: + + https://bugzilla.redhat.com/show_bug.cgi?id=633037 + https://bugzilla.kernel.org/show_bug.cgi?id=16559 + +Reported-by: Michael Chapman +Reported-by: Ciriaco Garcia de Celis +Signed-off-by: Stanislaw Gruszka +Signed-off-by: Peter Zijlstra +Cc: Hidetoshi Seto +Cc: # 2.6.32.19+ (partially) and 2.6.33+ +LKML-Reference: <20100914143513.GB8415@redhat.com> +Signed-off-by: Ingo Molnar +--- + +diff --git a/kernel/sched.c b/kernel/sched.c +index ed09d4f..dc85ceb 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -3513,9 +3513,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) + rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + + if (total) { +- u64 temp; ++ u64 temp = rtime; + +- temp = (u64)(rtime * utime); ++ temp *= utime; + do_div(temp, total); + utime = (cputime_t)temp; + } else +@@ -3546,9 +3546,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); + + if (total) { +- u64 temp; ++ u64 temp = rtime; + +- temp = (u64)(rtime * cputime.utime); ++ temp *= cputime.utime; + do_div(temp, total); + utime = (cputime_t)temp; + } else diff --git a/sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch b/sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch new file mode 100644 index 0000000..ea7e48e --- /dev/null +++ b/sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch @@ -0,0 +1,276 @@ +From: Venkatesh Pallipadi +Date: Tue, 18 May 2010 01:14:43 +0000 (-0700) +Subject: sched: Avoid side-effect of tickless idle on update_cpu_load +X-Git-Tag: v2.6.36-rc1~531^2~22 +X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=fdf3e95d3916f18bf8703fb065499fdbc4dfe34c + +sched: Avoid side-effect of tickless idle on update_cpu_load + +tickless idle has a negative side effect on update_cpu_load(), which +in turn can affect load balancing behavior. + +update_cpu_load() is supposed to be called every tick, to keep track +of various load indicies. With tickless idle, there are no scheduler +ticks called on the idle CPUs. Idle CPUs may still do load balancing +(with idle_load_balance CPU) using the stale cpu_load. It will also +cause problems when all CPUs go idle for a while and become active +again. In this case loads would not degrade as expected. + +This is how rq->nr_load_updates change looks like under different +conditions: + + +All CPUS idle for 10 seconds (HZ=1000) +0 1621 +10 496 +11 139 +12 875 +13 1672 +14 12 +15 21 +1 1472 +2 2426 +3 1161 +4 2108 +5 1525 +6 701 +7 249 +8 766 +9 1967 + +One CPU busy rest idle for 10 seconds +0 10003 +10 601 +11 95 +12 966 +13 1597 +14 114 +15 98 +1 3457 +2 93 +3 6679 +4 1425 +5 1479 +6 595 +7 193 +8 633 +9 1687 + +All CPUs busy for 10 seconds +0 10026 +10 10026 +11 10026 +12 10026 +13 10025 +14 10025 +15 10025 +1 10026 +2 10026 +3 10026 +4 10026 +5 10026 +6 10026 +7 10026 +8 10026 +9 10026 + +That is update_cpu_load works properly only when all CPUs are busy. +If all are idle, all the CPUs get way lower updates. And when few +CPUs are busy and rest are idle, only busy and ilb CPU does proper +updates and rest of the idle CPUs will do lower updates. + +The patch keeps track of when a last update was done and fixes up +the load avg based on current time. + +On one of my test system SPECjbb with warehouse 1..numcpus, patch +improves throughput numbers by ~1% (average of 6 runs). 
On another +test system (with different domain hierarchy) there is no noticable +change in perf. + +Signed-off-by: Venkatesh Pallipadi +Signed-off-by: Peter Zijlstra +Cc: Thomas Gleixner +LKML-Reference: +Signed-off-by: Ingo Molnar +--- + +diff --git a/kernel/sched.c b/kernel/sched.c +index f37a961..a757f6b 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -457,6 +457,7 @@ struct rq { + unsigned long nr_running; + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; ++ unsigned long last_load_update_tick; + #ifdef CONFIG_NO_HZ + u64 nohz_stamp; + unsigned char in_nohz_recently; +@@ -1803,6 +1804,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) + static void calc_load_account_idle(struct rq *this_rq); + static void update_sysctl(void); + static int get_update_sysctl_factor(void); ++static void update_cpu_load(struct rq *this_rq); + + static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) + { +@@ -3050,23 +3052,102 @@ static void calc_load_account_active(struct rq *this_rq) + } + + /* ++ * The exact cpuload at various idx values, calculated at every tick would be ++ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load ++ * ++ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called ++ * on nth tick when cpu may be busy, then we have: ++ * load = ((2^idx - 1) / 2^idx)^(n-1) * load ++ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load ++ * ++ * decay_load_missed() below does efficient calculation of ++ * load = ((2^idx - 1) / 2^idx)^(n-1) * load ++ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load ++ * ++ * The calculation is approximated on a 128 point scale. ++ * degrade_zero_ticks is the number of ticks after which load at any ++ * particular idx is approximated to be zero. ++ * degrade_factor is a precomputed table, a row for each load idx. ++ * Each column corresponds to degradation factor for a power of two ticks, ++ * based on 128 point scale. ++ * Example: ++ * row 2, col 3 (=12) says that the degradation at load idx 2 after ++ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). ++ * ++ * With this power of 2 load factors, we can degrade the load n times ++ * by looking at 1 bits in n and doing as many mult/shift instead of ++ * n mult/shifts needed by the exact degradation. ++ */ ++#define DEGRADE_SHIFT 7 ++static const unsigned char ++ degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; ++static const unsigned char ++ degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { ++ {0, 0, 0, 0, 0, 0, 0, 0}, ++ {64, 32, 8, 0, 0, 0, 0, 0}, ++ {96, 72, 40, 12, 1, 0, 0}, ++ {112, 98, 75, 43, 15, 1, 0}, ++ {120, 112, 98, 76, 45, 16, 2} }; ++ ++/* ++ * Update cpu_load for any missed ticks, due to tickless idle. The backlog ++ * would be when CPU is idle and so we just decay the old load without ++ * adding any new load. ++ */ ++static unsigned long ++decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) ++{ ++ int j = 0; ++ ++ if (!missed_updates) ++ return load; ++ ++ if (missed_updates >= degrade_zero_ticks[idx]) ++ return 0; ++ ++ if (idx == 1) ++ return load >> missed_updates; ++ ++ while (missed_updates) { ++ if (missed_updates % 2) ++ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; ++ ++ missed_updates >>= 1; ++ j++; ++ } ++ return load; ++} ++ ++/* + * Update rq->cpu_load[] statistics. This function is usually called every +- * scheduler tick (TICK_NSEC). ++ * scheduler tick (TICK_NSEC). 
With tickless idle this will not be called ++ * every tick. We fix it up based on jiffies. + */ + static void update_cpu_load(struct rq *this_rq) + { + unsigned long this_load = this_rq->load.weight; ++ unsigned long curr_jiffies = jiffies; ++ unsigned long pending_updates; + int i, scale; + + this_rq->nr_load_updates++; + ++ /* Avoid repeated calls on same jiffy, when moving in and out of idle */ ++ if (curr_jiffies == this_rq->last_load_update_tick) ++ return; ++ ++ pending_updates = curr_jiffies - this_rq->last_load_update_tick; ++ this_rq->last_load_update_tick = curr_jiffies; ++ + /* Update our load: */ +- for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { ++ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ ++ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; ++ old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This +@@ -3074,9 +3155,15 @@ static void update_cpu_load(struct rq *this_rq) + * example. + */ + if (new_load > old_load) +- new_load += scale-1; +- this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; ++ new_load += scale - 1; ++ ++ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } ++} ++ ++static void update_cpu_load_active(struct rq *this_rq) ++{ ++ update_cpu_load(this_rq); + + calc_load_account_active(this_rq); + } +@@ -3464,7 +3551,7 @@ void scheduler_tick(void) + + raw_spin_lock(&rq->lock); + update_rq_clock(rq); +- update_cpu_load(rq); ++ update_cpu_load_active(rq); + curr->sched_class->task_tick(rq, curr, 0); + raw_spin_unlock(&rq->lock); + +@@ -7688,6 +7775,9 @@ void __init sched_init(void) + + for (j = 0; j < CPU_LOAD_IDX_MAX; j++) + rq->cpu_load[j] = 0; ++ ++ rq->last_load_update_tick = jiffies; ++ + #ifdef CONFIG_SMP + rq->sd = NULL; + rq->rd = NULL; +diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c +index eed35ed..22b8b4f 100644 +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -3420,9 +3420,12 @@ static void run_rebalance_domains(struct softirq_action *h) + if (need_resched()) + break; + ++ rq = cpu_rq(balance_cpu); ++ raw_spin_lock_irq(&rq->lock); ++ update_cpu_load(rq); ++ raw_spin_unlock_irq(&rq->lock); + rebalance_domains(balance_cpu, CPU_IDLE); + +- rq = cpu_rq(balance_cpu); + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; + } diff --git a/sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch b/sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch new file mode 100644 index 0000000..622e9f1 --- /dev/null +++ b/sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch @@ -0,0 +1,651 @@ +From: Venkatesh Pallipadi +Date: Sat, 22 May 2010 00:09:41 +0000 (-0700) +Subject: sched: Change nohz idle load balancing logic to push model +X-Git-Tag: v2.6.36-rc1~531^2~21 +X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=83cd4fe27ad8446619b2e030b171b858501de87d + +sched: Change nohz idle load balancing logic to push model + +In the new push model, all idle CPUs indeed go into nohz mode. There is +still the concept of idle load balancer (performing the load balancing +on behalf of all the idle cpu's in the system). Busy CPU kicks the nohz +balancer when any of the nohz CPUs need idle load balancing. 
+The kickee CPU does the idle load balancing on behalf of all idle CPUs +instead of the normal idle balance. + +This addresses the below two problems with the current nohz ilb logic: +* the idle load balancer continued to have periodic ticks during idle and + wokeup frequently, even though it did not have any rebalancing to do on + behalf of any of the idle CPUs. +* On x86 and CPUs that have APIC timer stoppage on idle CPUs, this + periodic wakeup can result in a periodic additional interrupt on a CPU + doing the timer broadcast. + +Also currently we are migrating the unpinned timers from an idle to the cpu +doing idle load balancing (when all the cpus in the system are idle, +there is no idle load balancing cpu and timers get added to the same idle cpu +where the request was made. So the existing optimization works only on semi idle +system). + +And In semi idle system, we no longer have periodic ticks on the idle load +balancer CPU. Using that cpu will add more delays to the timers than intended +(as that cpu's timer base may not be uptodate wrt jiffies etc). This was +causing mysterious slowdowns during boot etc. + +For now, in the semi idle case, use the nearest busy cpu for migrating timers +from an idle cpu. This is good for power-savings anyway. + +Signed-off-by: Venkatesh Pallipadi +Signed-off-by: Suresh Siddha +Signed-off-by: Peter Zijlstra +Cc: Thomas Gleixner +LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com> +Signed-off-by: Ingo Molnar +--- + +[ backported for 2.6.35 ] + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index c2d4316..a3e5b1c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -271,13 +271,10 @@ extern int runqueue_is_locked(int cpu); + + extern cpumask_var_t nohz_cpu_mask; + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) +-extern int select_nohz_load_balancer(int cpu); +-extern int get_nohz_load_balancer(void); ++extern void select_nohz_load_balancer(int stop_tick); ++extern int get_nohz_timer_target(void); + #else +-static inline int select_nohz_load_balancer(int cpu) +-{ +- return 0; +-} ++static inline void select_nohz_load_balancer(int stop_tick) { } + #endif + + /* +diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c +index 5c69e99..e934339 100644 +--- a/kernel/hrtimer.c ++++ b/kernel/hrtimer.c +@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + static int hrtimer_get_target(int this_cpu, int pinned) + { + #ifdef CONFIG_NO_HZ +- if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { +- int preferred_cpu = get_nohz_load_balancer(); +- +- if (preferred_cpu >= 0) +- return preferred_cpu; +- } ++ if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) ++ return get_nohz_timer_target(); + #endif + return this_cpu; + } +diff --git a/kernel/sched.c b/kernel/sched.c +index a757f6b..132950b 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -460,7 +460,7 @@ struct rq { + unsigned long last_load_update_tick; + #ifdef CONFIG_NO_HZ + u64 nohz_stamp; +- unsigned char in_nohz_recently; ++ unsigned char nohz_balance_kick; + #endif + unsigned int skip_clock_update; + +@@ -1195,6 +1195,27 @@ static void resched_cpu(int cpu) + + #ifdef CONFIG_NO_HZ + /* ++ * In the semi idle case, use the nearest busy cpu for migrating timers ++ * from an idle cpu. This is good for power-savings. 
++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle cpu will add more delays to the timers than intended ++ * (as that cpu's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int cpu = smp_processor_id(); ++ int i; ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) { ++ for_each_cpu(i, sched_domain_span(sd)) ++ if (!idle_cpu(i)) ++ return i; ++ } ++ return cpu; ++} ++/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely +@@ -7791,6 +7812,10 @@ void __init sched_init(void) + rq->idle_stamp = 0; + rq->avg_idle = 2*sysctl_sched_migration_cost; + rq_attach_root(rq, &def_root_domain); ++#ifdef CONFIG_NO_HZ ++ rq->nohz_balance_kick = 0; ++ init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); ++#endif + #endif + init_rq_hrtick(rq); + atomic_set(&rq->nr_iowait, 0); +@@ -7835,8 +7860,11 @@ void __init sched_init(void) + zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); + #ifdef CONFIG_SMP + #ifdef CONFIG_NO_HZ +- zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); +- alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); ++ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); ++ alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); ++ atomic_set(&nohz.load_balancer, nr_cpu_ids); ++ atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); ++ atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); + #endif + /* May be allocated at isolcpus cmdline parse time */ + if (cpu_isolated_map == NULL) +diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c +index 22b8b4f..6ee2e0a 100644 +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -3091,13 +3091,40 @@ out_unlock: + } + + #ifdef CONFIG_NO_HZ ++ ++static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); ++ ++static void trigger_sched_softirq(void *data) ++{ ++ raise_softirq_irqoff(SCHED_SOFTIRQ); ++} ++ ++static inline void init_sched_softirq_csd(struct call_single_data *csd) ++{ ++ csd->func = trigger_sched_softirq; ++ csd->info = NULL; ++ csd->flags = 0; ++ csd->priv = 0; ++} ++ ++/* ++ * idle load balancing details ++ * - One of the idle CPUs nominates itself as idle load_balancer, while ++ * entering idle. ++ * - This idle load balancer CPU will also go into tickless mode when ++ * it is idle, just like all other idle CPUs ++ * - When one of the busy CPUs notice that there may be an idle rebalancing ++ * needed, they will kick the idle load balancer, which then does idle ++ * load balancing for all the idle CPUs. ++ */ + static struct { + atomic_t load_balancer; +- cpumask_var_t cpu_mask; +- cpumask_var_t ilb_grp_nohz_mask; +-} nohz ____cacheline_aligned = { +- .load_balancer = ATOMIC_INIT(-1), +-}; ++ atomic_t first_pick_cpu; ++ atomic_t second_pick_cpu; ++ cpumask_var_t idle_cpus_mask; ++ cpumask_var_t grp_idle_mask; ++ unsigned long next_balance; /* in jiffy units */ ++} nohz ____cacheline_aligned; + + int get_nohz_load_balancer(void) + { +@@ -3151,17 +3178,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) + */ + static inline int is_semi_idle_group(struct sched_group *ilb_group) + { +- cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, ++ cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has atleast one busy cpu + * and atleast one idle cpu. 
+ */ +- if (cpumask_empty(nohz.ilb_grp_nohz_mask)) ++ if (cpumask_empty(nohz.grp_idle_mask)) + return 0; + +- if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) ++ if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +@@ -3194,7 +3221,7 @@ static int find_new_ilb(int cpu) + * Optimize for the case when we have no idle CPUs or only one + * idle CPU. Don't walk the sched_domain hierarchy in such cases + */ +- if (cpumask_weight(nohz.cpu_mask) < 2) ++ if (cpumask_weight(nohz.idle_cpus_mask) < 2) + goto out_done; + + for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { +@@ -3202,7 +3229,7 @@ static int find_new_ilb(int cpu) + + do { + if (is_semi_idle_group(ilb_group)) +- return cpumask_first(nohz.ilb_grp_nohz_mask); ++ return cpumask_first(nohz.grp_idle_mask); + + ilb_group = ilb_group->next; + +@@ -3210,98 +3237,116 @@ static int find_new_ilb(int cpu) + } + + out_done: +- return cpumask_first(nohz.cpu_mask); ++ return nr_cpu_ids; + } + #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ + static inline int find_new_ilb(int call_cpu) + { +- return cpumask_first(nohz.cpu_mask); ++ return nr_cpu_ids; + } + #endif + + /* ++ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the ++ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle ++ * CPU (if there is one). ++ */ ++static void nohz_balancer_kick(int cpu) ++{ ++ int ilb_cpu; ++ ++ nohz.next_balance++; ++ ++ ilb_cpu = get_nohz_load_balancer(); ++ ++ if (ilb_cpu >= nr_cpu_ids) { ++ ilb_cpu = cpumask_first(nohz.idle_cpus_mask); ++ if (ilb_cpu >= nr_cpu_ids) ++ return; ++ } ++ ++ if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { ++ struct call_single_data *cp; ++ ++ cpu_rq(ilb_cpu)->nohz_balance_kick = 1; ++ cp = &per_cpu(remote_sched_softirq_cb, cpu); ++ __smp_call_function_single(ilb_cpu, cp, 0); ++ } ++ return; ++} ++ ++/* + * This routine will try to nominate the ilb (idle load balancing) + * owner among the cpus whose ticks are stopped. ilb owner will do the idle +- * load balancing on behalf of all those cpus. If all the cpus in the system +- * go into this tickless mode, then there will be no ilb owner (as there is +- * no need for one) and all the cpus will sleep till the next wakeup event +- * arrives... +- * +- * For the ilb owner, tick is not stopped. And this tick will be used +- * for idle load balancing. ilb owner will still be part of +- * nohz.cpu_mask.. ++ * load balancing on behalf of all those cpus. + * +- * While stopping the tick, this cpu will become the ilb owner if there +- * is no other owner. And will be the owner till that cpu becomes busy +- * or if all cpus in the system stop their ticks at which point +- * there is no need for ilb owner. ++ * When the ilb owner becomes busy, we will not have new ilb owner until some ++ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick ++ * idle load balancing by kicking one of the idle CPUs. + * +- * When the ilb owner becomes busy, it nominates another owner, during the +- * next busy scheduler_tick() ++ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this ++ * ilb owner CPU in future (when there is a need for idle load balancing on ++ * behalf of all idle CPUs). 
+ */ +-int select_nohz_load_balancer(int stop_tick) ++void select_nohz_load_balancer(int stop_tick) + { + int cpu = smp_processor_id(); + + if (stop_tick) { +- cpu_rq(cpu)->in_nohz_recently = 1; +- + if (!cpu_active(cpu)) { + if (atomic_read(&nohz.load_balancer) != cpu) +- return 0; ++ return; + + /* + * If we are going offline and still the leader, + * give up! + */ +- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) ++ if (atomic_cmpxchg(&nohz.load_balancer, cpu, ++ nr_cpu_ids) != cpu) + BUG(); + +- return 0; ++ return; + } + +- cpumask_set_cpu(cpu, nohz.cpu_mask); ++ cpumask_set_cpu(cpu, nohz.idle_cpus_mask); + +- /* time for ilb owner also to sleep */ +- if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { +- if (atomic_read(&nohz.load_balancer) == cpu) +- atomic_set(&nohz.load_balancer, -1); +- return 0; +- } ++ if (atomic_read(&nohz.first_pick_cpu) == cpu) ++ atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); ++ if (atomic_read(&nohz.second_pick_cpu) == cpu) ++ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + +- if (atomic_read(&nohz.load_balancer) == -1) { +- /* make me the ilb owner */ +- if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) +- return 1; +- } else if (atomic_read(&nohz.load_balancer) == cpu) { ++ if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { + int new_ilb; + +- if (!(sched_smt_power_savings || +- sched_mc_power_savings)) +- return 1; ++ /* make me the ilb owner */ ++ if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, ++ cpu) != nr_cpu_ids) ++ return; ++ + /* + * Check to see if there is a more power-efficient + * ilb. + */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { +- atomic_set(&nohz.load_balancer, -1); ++ atomic_set(&nohz.load_balancer, nr_cpu_ids); + resched_cpu(new_ilb); +- return 0; ++ return; + } +- return 1; ++ return; + } + } else { +- if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) +- return 0; ++ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) ++ return; + +- cpumask_clear_cpu(cpu, nohz.cpu_mask); ++ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + + if (atomic_read(&nohz.load_balancer) == cpu) +- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) ++ if (atomic_cmpxchg(&nohz.load_balancer, cpu, ++ nr_cpu_ids) != cpu) + BUG(); + } +- return 0; ++ return; + } + #endif + +@@ -3383,11 +3428,101 @@ out: + rq->next_balance = next_balance; + } + ++#ifdef CONFIG_NO_HZ + /* +- * run_rebalance_domains is triggered when needed from the scheduler tick. +- * In CONFIG_NO_HZ case, the idle load balance owner will do the ++ * In CONFIG_NO_HZ case, the idle balance kickee will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. + */ ++static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) ++{ ++ struct rq *this_rq = cpu_rq(this_cpu); ++ struct rq *rq; ++ int balance_cpu; ++ ++ if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) ++ return; ++ ++ for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { ++ if (balance_cpu == this_cpu) ++ continue; ++ ++ /* ++ * If this cpu gets work to do, stop the load balancing ++ * work being done for other cpus. Next load ++ * balancing owner will pick it up. 
++ */ ++ if (need_resched()) { ++ this_rq->nohz_balance_kick = 0; ++ break; ++ } ++ ++ raw_spin_lock_irq(&this_rq->lock); ++ update_cpu_load(this_rq); ++ raw_spin_unlock_irq(&this_rq->lock); ++ ++ rebalance_domains(balance_cpu, CPU_IDLE); ++ ++ rq = cpu_rq(balance_cpu); ++ if (time_after(this_rq->next_balance, rq->next_balance)) ++ this_rq->next_balance = rq->next_balance; ++ } ++ nohz.next_balance = this_rq->next_balance; ++ this_rq->nohz_balance_kick = 0; ++} ++ ++/* ++ * Current heuristic for kicking the idle load balancer ++ * - first_pick_cpu is the one of the busy CPUs. It will kick ++ * idle load balancer when it has more than one process active. This ++ * eliminates the need for idle load balancing altogether when we have ++ * only one running process in the system (common case). ++ * - If there are more than one busy CPU, idle load balancer may have ++ * to run for active_load_balance to happen (i.e., two busy CPUs are ++ * SMT or core siblings and can run better if they move to different ++ * physical CPUs). So, second_pick_cpu is the second of the busy CPUs ++ * which will kick idle load balancer as soon as it has any load. ++ */ ++static inline int nohz_kick_needed(struct rq *rq, int cpu) ++{ ++ unsigned long now = jiffies; ++ int ret; ++ int first_pick_cpu, second_pick_cpu; ++ ++ if (time_before(now, nohz.next_balance)) ++ return 0; ++ ++ if (!rq->nr_running) ++ return 0; ++ ++ first_pick_cpu = atomic_read(&nohz.first_pick_cpu); ++ second_pick_cpu = atomic_read(&nohz.second_pick_cpu); ++ ++ if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && ++ second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) ++ return 0; ++ ++ ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); ++ if (ret == nr_cpu_ids || ret == cpu) { ++ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); ++ if (rq->nr_running > 1) ++ return 1; ++ } else { ++ ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); ++ if (ret == nr_cpu_ids || ret == cpu) { ++ if (rq->nr_running) ++ return 1; ++ } ++ } ++ return 0; ++} ++#else ++static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } ++#endif ++ ++/* ++ * run_rebalance_domains is triggered when needed from the scheduler tick. ++ * Also triggered for nohz idle balancing (with nohz_balancing_kick set). ++ */ + static void run_rebalance_domains(struct softirq_action *h) + { + int this_cpu = smp_processor_id(); +@@ -3397,40 +3532,12 @@ static void run_rebalance_domains(struct softirq_action *h) + + rebalance_domains(this_cpu, idle); + +-#ifdef CONFIG_NO_HZ + /* +- * If this cpu is the owner for idle load balancing, then do the ++ * If this cpu has a pending nohz_balance_kick, then do the + * balancing on behalf of the other idle cpus whose ticks are + * stopped. + */ +- if (this_rq->idle_at_tick && +- atomic_read(&nohz.load_balancer) == this_cpu) { +- struct rq *rq; +- int balance_cpu; +- +- for_each_cpu(balance_cpu, nohz.cpu_mask) { +- if (balance_cpu == this_cpu) +- continue; +- +- /* +- * If this cpu gets work to do, stop the load balancing +- * work being done for other cpus. Next load +- * balancing owner will pick it up. 
+- */ +- if (need_resched()) +- break; +- +- rq = cpu_rq(balance_cpu); +- raw_spin_lock_irq(&rq->lock); +- update_cpu_load(rq); +- raw_spin_unlock_irq(&rq->lock); +- rebalance_domains(balance_cpu, CPU_IDLE); +- +- if (time_after(this_rq->next_balance, rq->next_balance)) +- this_rq->next_balance = rq->next_balance; +- } +- } +-#endif ++ nohz_idle_balance(this_cpu, idle); + } + + static inline int on_null_domain(int cpu) +@@ -3440,57 +3547,17 @@ static inline int on_null_domain(int cpu) + + /* + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. +- * +- * In case of CONFIG_NO_HZ, this is the place where we nominate a new +- * idle load balancing owner or decide to stop the periodic load balancing, +- * if the whole system is idle. + */ + static inline void trigger_load_balance(struct rq *rq, int cpu) + { +-#ifdef CONFIG_NO_HZ +- /* +- * If we were in the nohz mode recently and busy at the current +- * scheduler tick, then check if we need to nominate new idle +- * load balancer. +- */ +- if (rq->in_nohz_recently && !rq->idle_at_tick) { +- rq->in_nohz_recently = 0; +- +- if (atomic_read(&nohz.load_balancer) == cpu) { +- cpumask_clear_cpu(cpu, nohz.cpu_mask); +- atomic_set(&nohz.load_balancer, -1); +- } +- +- if (atomic_read(&nohz.load_balancer) == -1) { +- int ilb = find_new_ilb(cpu); +- +- if (ilb < nr_cpu_ids) +- resched_cpu(ilb); +- } +- } +- +- /* +- * If this cpu is idle and doing idle load balancing for all the +- * cpus with ticks stopped, is it time for that to stop? +- */ +- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && +- cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { +- resched_cpu(cpu); +- return; +- } +- +- /* +- * If this cpu is idle and the idle load balancing is done by +- * someone else, then no need raise the SCHED_SOFTIRQ +- */ +- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && +- cpumask_test_cpu(cpu, nohz.cpu_mask)) +- return; +-#endif + /* Don't need to rebalance while attached to NULL domain */ + if (time_after_eq(jiffies, rq->next_balance) && + likely(!on_null_domain(cpu))) + raise_softirq(SCHED_SOFTIRQ); ++#ifdef CONFIG_NO_HZ ++ else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) ++ nohz_balancer_kick(cpu); ++#endif + } + + static void rq_online_fair(struct rq *rq) +diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c +index 1d7b9bc..5f171f0 100644 +--- a/kernel/time/tick-sched.c ++++ b/kernel/time/tick-sched.c +@@ -408,13 +408,7 @@ void tick_nohz_stop_sched_tick(int inidle) + * the scheduler tick in nohz_restart_sched_tick. + */ + if (!ts->tick_stopped) { +- if (select_nohz_load_balancer(1)) { +- /* +- * sched tick not stopped! 
+- */ +- cpumask_clear_cpu(cpu, nohz_cpu_mask); +- goto out; +- } ++ select_nohz_load_balancer(1); + + ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); + ts->tick_stopped = 1; +diff --git a/kernel/timer.c b/kernel/timer.c +index ee305c8..48d6aec 100644 +--- a/kernel/timer.c ++++ b/kernel/timer.c +@@ -679,12 +679,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, + cpu = smp_processor_id(); + + #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) +- if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { +- int preferred_cpu = get_nohz_load_balancer(); +- +- if (preferred_cpu >= 0) +- cpu = preferred_cpu; +- } ++ if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) ++ cpu = get_nohz_timer_target(); + #endif + new_base = per_cpu(tvec_bases, cpu); + diff --git a/sched-15-update-rq-clock-for-nohz-balanced-cpus.patch b/sched-15-update-rq-clock-for-nohz-balanced-cpus.patch new file mode 100644 index 0000000..7c5432e --- /dev/null +++ b/sched-15-update-rq-clock-for-nohz-balanced-cpus.patch @@ -0,0 +1,28 @@ +From: Suresh Siddha +Date: Fri, 9 Jul 2010 13:19:54 +0000 (+0200) +Subject: sched: Update rq->clock for nohz balanced cpus +X-Git-Tag: v2.6.36-rc1~531^2~5 +X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=5343bdb8fd076f16edc9d113a9e35e2a1d1f4966 + +sched: Update rq->clock for nohz balanced cpus + +Suresh spotted that we don't update the rq->clock in the nohz +load-balancer path. + +Signed-off-by: Peter Zijlstra +LKML-Reference: <1278626014.2834.74.camel@sbs-t61.sc.intel.com> +Signed-off-by: Ingo Molnar +--- + +diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c +index b4da534..e44a591 100644 +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -3596,6 +3596,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) + } + + raw_spin_lock_irq(&this_rq->lock); ++ update_rq_clock(this_rq); + update_cpu_load(this_rq); + raw_spin_unlock_irq(&this_rq->lock); + diff --git a/sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch b/sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch new file mode 100644 index 0000000..466dd2d --- /dev/null +++ b/sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch @@ -0,0 +1,38 @@ +From: Peter Zijlstra +Date: Thu, 19 Aug 2010 11:31:43 +0000 (+0200) +Subject: sched: Fix rq->clock synchronization when migrating tasks +X-Git-Tag: v2.6.36-rc3~25^2~1 +X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=861d034ee814917a83bd5de4b26e3b8336ddeeb8 + +sched: Fix rq->clock synchronization when migrating tasks + +sched_fork() -- we do task placement in ->task_fork_fair() ensure we + update_rq_clock() so we work with current time. We leave the vruntime + in relative state, so the time delay until wake_up_new_task() doesn't + matter. + +wake_up_new_task() -- Since task_fork_fair() left p->vruntime in + relative state we can safely migrate, the activate_task() on the + remote rq will call update_rq_clock() and causes the clock to be + synced (enough). 
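A toy standalone analogue of the pattern this fix (and sched-15 above) relies
on -- invented types, not kernel code: refresh the runqueue clock under its
lock before any placement or accounting step that reads it, because on a
tickless CPU the clock may be far behind the current time.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct toy_rq {
	pthread_mutex_t lock;
	uint64_t clock_ns;			/* stands in for rq->clock */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Analogue of update_rq_clock(); only meaningful with the lock held. */
static void toy_update_rq_clock(struct toy_rq *rq)
{
	rq->clock_ns = now_ns();
}

/* Lock, refresh the clock, then do the work that reads it. */
static void toy_place_task(struct toy_rq *rq)
{
	pthread_mutex_lock(&rq->lock);
	toy_update_rq_clock(rq);
	printf("placing task at clock %llu ns\n",
	       (unsigned long long)rq->clock_ns);
	pthread_mutex_unlock(&rq->lock);
}

int main(void)
{
	struct toy_rq rq = { .lock = PTHREAD_MUTEX_INITIALIZER, .clock_ns = 0 };

	toy_place_task(&rq);
	return 0;
}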
+ +Tested-by: Jack Daniel +Tested-by: Philby John +Signed-off-by: Peter Zijlstra +LKML-Reference: <1281002322.1923.1708.camel@laptop> +Signed-off-by: Ingo Molnar +--- + +diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c +index 806d1b2..ab661eb 100644 +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -3752,6 +3752,8 @@ static void task_fork_fair(struct task_struct *p) + + raw_spin_lock_irqsave(&rq->lock, flags); + ++ update_rq_clock(rq); ++ + if (unlikely(task_cpu(p) != this_cpu)) + __set_task_cpu(p, this_cpu); + diff --git a/sched-25-move-sched_avg_update-to-update_cpu_load.patch b/sched-25-move-sched_avg_update-to-update_cpu_load.patch new file mode 100644 index 0000000..556c8ce --- /dev/null +++ b/sched-25-move-sched_avg_update-to-update_cpu_load.patch @@ -0,0 +1,58 @@ +From: Suresh Siddha +Date: Mon, 23 Aug 2010 20:42:51 +0000 (-0700) +Subject: sched: Move sched_avg_update() to update_cpu_load() +X-Git-Tag: v2.6.36-rc4~8^2~1 +X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=da2b71edd8a7db44fe1746261410a981f3e03632 + +sched: Move sched_avg_update() to update_cpu_load() + +Currently sched_avg_update() (which updates rt_avg stats in the rq) +is getting called from scale_rt_power() (in the load balance context) +which doesn't take rq->lock. + +Fix it by moving the sched_avg_update() to more appropriate +update_cpu_load() where the CFS load gets updated as well. + +Signed-off-by: Suresh Siddha +Signed-off-by: Peter Zijlstra +LKML-Reference: <1282596171.2694.3.camel@sbsiddha-MOBL3> +Signed-off-by: Ingo Molnar +--- + +diff --git a/kernel/sched.c b/kernel/sched.c +index 09b574e..ed09d4f 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -1294,6 +1294,10 @@ static void resched_task(struct task_struct *p) + static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) + { + } ++ ++static void sched_avg_update(struct rq *rq) ++{ ++} + #endif /* CONFIG_SMP */ + + #if BITS_PER_LONG == 32 +@@ -3182,6 +3186,8 @@ static void update_cpu_load(struct rq *this_rq) + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } ++ ++ sched_avg_update(this_rq); + } + + static void update_cpu_load_active(struct rq *this_rq) +diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c +index ab661eb..f53ec75 100644 +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -2268,8 +2268,6 @@ unsigned long scale_rt_power(int cpu) + struct rq *rq = cpu_rq(cpu); + u64 total, available; + +- sched_avg_update(rq); +- + total = sched_avg_period() + (rq->clock - rq->age_stamp); + available = total - rq->rt_avg; +
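As a closing note on the series: the decay_load_missed() scheme introduced by
sched-05 above is the least obvious piece, so here is a small standalone
program (hypothetical load values; the tables are copied from that patch) that
reproduces the power-of-two decay and compares it with the exact per-tick
decay it approximates.

#include <math.h>
#include <stdio.h>

#define DEGRADE_SHIFT		7
#define CPU_LOAD_IDX_MAX	5

static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] =
	{0, 8, 32, 64, 128};
/* Column j holds the 128-scale degradation factor for 2^j missed ticks. */
static const unsigned char degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
	{0, 0, 0, 0, 0, 0, 0, 0},
	{64, 32, 8, 0, 0, 0, 0, 0},
	{96, 72, 40, 12, 1, 0, 0, 0},
	{112, 98, 75, 43, 15, 1, 0, 0},
	{120, 112, 98, 76, 45, 16, 2, 0},
};

/* Same logic as the kernel function: walk the set bits of missed_updates
 * and apply one precomputed factor per bit instead of one multiply per tick. */
static unsigned long decay_load_missed(unsigned long load,
				       unsigned long missed_updates, int idx)
{
	int j = 0;

	if (!missed_updates)
		return load;
	if (missed_updates >= degrade_zero_ticks[idx])
		return 0;
	if (idx == 1)
		return load >> missed_updates;

	while (missed_updates) {
		if (missed_updates % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed_updates >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	unsigned long load = 1000, missed = 8;
	int idx = 2;

	/* Exact decay after 'missed' ticks is load * ((2^idx - 1) / 2^idx)^missed. */
	double exact = load * pow((double)((1 << idx) - 1) / (1 << idx), missed);

	printf("fast approximation: %lu, exact: %.1f\n",
	       decay_load_missed(load, missed, idx), exact);
	return 0;
}

Compiled with -lm, this prints "fast approximation: 93, exact: 100.1": the
12/128 table entry for idx 2 and 8 missed ticks is a coarse but cheap stand-in
for the exact (3/4)^8 factor, as the comment in the patch itself notes.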