From 63f01241176d7cbc976385aec32f0a209b0bc36a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 14:48:10 -0500 Subject: sched: Remove unlikely() from rt_policy() in sched.c The rt_policy() has an unlikely() that the policy it is checking is of RT priority (SCHED_FIFO or SCHED_RR). According to the annotated branch profiler it is incorrect most of the time: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 36667 654674 94 rt_policy sched.c 126 This makes sense because the rt_policy() is used by the sched_set_scheduler() and nice(). Although users may use sys_nice a bit, all RT users use the sched_set_scheduler() to set their RT priority, including kernel threads. The above numbers were from a normal desktop computer running firefox, evolution, xchat and was part of a distcc compile farm. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index dc91a4d09ac3..269a0450281c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -123,7 +123,7 @@ static inline int rt_policy(int policy) { - if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) + if (policy == SCHED_FIFO || policy == SCHED_RR) return 1; return 0; } -- cgit v1.2.3 From e69c634190dc724ef2d845ace8d783031d3e492e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 17:10:31 -0500 Subject: sched: Remove unlikely() from ttwu_post_activation The unlikely() used in ttwu_post_activation() tests if the rq->idle_stamp is set. But since this is for a wakeup, and wakeups happen when tasks block on IO, and blocking tasks on IO may put the system into idle, this can actually be a common occurrence. 
Running the annotated branch profiler on an average desktop running firefox, evolution, xchat and distcc, the report shows: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 34884862 146110926 80 ttwu_post_activation sched.c 2309 80% of the time, this unlikely is incorrect. Best not to assume what the result is, and just remove the branch annotation. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 269a0450281c..6d24b2e8d82d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2458,7 +2458,7 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); - if (unlikely(rq->idle_stamp)) { + if (rq->idle_stamp) { u64 delta = rq->clock - rq->idle_stamp; u64 max = 2*sysctl_sched_migration_cost; -- cgit v1.2.3 From f01114cb59d670e9b4f2c335930dd57db96e9360 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 May 2011 12:26:55 +0200 Subject: sched: Fix cross-cpu clock sync on remote wakeups Markus reported that commit 317f394160e ("sched: Move the second half of ttwu() to the remote cpu") caused some accounting funnies on his AMD Phenom II X4, such as weird 'top' results. It turns out that this is due to non-synced TSC and the queued remote wakeups stopped coupling the two relevant cpu clocks, which leads to wakeups seeing time jumps, which in turn lead to skewed runtime stats. Add an explicit call to sched_clock_cpu() to couple the per-cpu clocks to restore the normal flow of time. 
Reported-and-tested-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1306835745.2353.3.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cbb3a0eee58e..49cc70b152cf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2600,6 +2600,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; } -- cgit v1.2.3 From f339b9dc1f03591761d5d930800db24bc0eda1e1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 May 2011 10:49:20 +0200 Subject: sched: Fix schedstat.nr_wakeups_migrate While looking over the code I found that with the ttwu rework the nr_wakeups_migrate test broke since we now switch cpus prior to calling ttwu_stat(), hence the test is always true. Cure this by passing the migration state in wake_flags. 
Also move the whole test under CONFIG_SMP, it's hard to migrate tasks on UP :-) Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-pwwxl7gdqs5676f1d4cx6pj7@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 49cc70b152cf..2fe98ed474da 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2447,6 +2447,10 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } rcu_read_unlock(); } + + if (wake_flags & WF_MIGRATED) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); + #endif /* CONFIG_SMP */ schedstat_inc(rq, ttwu_count); @@ -2455,9 +2459,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_SYNC) schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (cpu != task_cpu(p)) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - #endif /* CONFIG_SCHEDSTATS */ } @@ -2675,8 +2676,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (task_cpu(p) != cpu) + if (task_cpu(p) != cpu) { + wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); + } #endif /* CONFIG_SMP */ ttwu_queue(p, cpu); -- cgit v1.2.3 From 6c6c54e1807faf116724451ef2bd14993780470a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jun 2011 17:37:07 +0200 Subject: sched: Fix/clarify set_task_cpu() locking rules Sergey reported a CONFIG_PROVE_RCU warning in push_rt_task where set_task_cpu() was called with both relevant rq->locks held, which should be sufficient for running tasks since holding its rq->lock will serialize against sched_move_task(). Update the comments and fix the task_group() lockdep test. 
Reported-and-tested-by: Sergey Senozhatsky Cc: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1307115427.2353.3456.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 2fe98ed474da..3f2e502d609b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -605,10 +605,10 @@ static inline int cpu_of(struct rq *rq) /* * Return the group to which this tasks belongs. * - * We use task_subsys_state_check() and extend the RCU verification - * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() - * holds that lock for each task it moves into the cgroup. Therefore - * by holding that lock, we pin the task to the current cgroup. + * We use task_subsys_state_check() and extend the RCU verification with + * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each + * task it moves into the cgroup. Therefore by holding either of those locks, + * we pin the task to the current cgroup. */ static inline struct task_group *task_group(struct task_struct *p) { @@ -616,7 +616,8 @@ static inline struct task_group *task_group(struct task_struct *p) struct cgroup_subsys_state *css; css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock)); + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); return autogroup_task_group(p, tg); @@ -2200,6 +2201,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); #ifdef CONFIG_LOCKDEP + /* + * The caller should hold either p->pi_lock or rq->lock, when changing + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. + * + * sched_move_task() holds both and thus holding either pins the cgroup, + * see set_task_rq(). 
+ * + * Furthermore, all task_rq users should acquire both locks, see + * task_rq_lock(). + */ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || lockdep_is_held(&task_rq(p)->lock))); #endif -- cgit v1.2.3 From 2da8c8bc44b572cbf623629ff736608dc7968436 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Jun 2011 22:53:39 +0200 Subject: sched: Remove pointless in_atomic() definition check It's really supposed to be defined here. If it's not then we actually want the build to crash so that we know it, and not keep it silent. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index fd18f395a1bf..01d9536aaa8e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8028,7 +8028,6 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { -#ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || @@ -8050,7 +8049,6 @@ void __might_sleep(const char *file, int line, int preempt_offset) if (irqs_disabled()) print_irqtrace_events(current); dump_stack(); -#endif } EXPORT_SYMBOL(__might_sleep); #endif -- cgit v1.2.3 From bdd4e85dc36cdbcfc1608a5b2a17c80a9db8986a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2011 01:13:27 +0200 Subject: sched: Isolate preempt counting in its own config option Create a new CONFIG_PREEMPT_COUNT that handles the inc/dec of preempt count offset independently. So that the offset can be updated by preempt_disable() and preempt_enable() even without the need for CONFIG_PREEMPT being set. This prepares to make CONFIG_DEBUG_SPINLOCK_SLEEP working with !CONFIG_PREEMPT where it currently doesn't detect code that sleeps inside explicit preemption disabled sections. Signed-off-by: Frederic Weisbecker Acked-by: Paul E. 
McKenney Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 01d9536aaa8e..90ad7cf2b290 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2843,7 +2843,7 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif -- cgit v1.2.3 From d902db1eb60387040fe541573083e47469db50ac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2011 19:31:56 +0200 Subject: sched: Generalize sleep inside spinlock detection The sleeping inside spinlock detection is actually used for more general sleeping inside atomic sections debugging: preemption disabled, rcu read side critical sections, interrupts, interrupt disabled, etc... Change the name of the config and its help section to reflect its more general role. Signed-off-by: Frederic Weisbecker Acked-by: Paul E. 
McKenney Acked-by: Randy Dunlap Cc: Peter Zijlstra Cc: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 90ad7cf2b290..a5f318b8d659 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8018,7 +8018,7 @@ void __init sched_init(void) scheduler_running = 1; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); -- cgit v1.2.3 From cd62287e364c0d15d517c6ced4e4808b54711475 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sat, 4 Jun 2011 15:03:20 +0200 Subject: sched, cgroups: Fix MIN_SHARES on 64-bit boxen Commit c8b28116 ("sched: Increase SCHED_LOAD_SCALE resolution") intended to have no user-visible effect, but allows setting cpu.shares to < MIN_SHARES, which the user then sees. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Nikhil Rao Link: http://lkml.kernel.org/r/1307192600.8618.3.camel@marge.simson.net Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3f2e502d609b..9769c756ad66 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -292,8 +292,8 @@ static DEFINE_SPINLOCK(task_group_lock); * (The default weight is 1024 - so there's no practical * limitation from this.) 
*/ -#define MIN_SHARES 2 -#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) static int root_task_group_load = ROOT_TASK_GROUP_LOAD; #endif @@ -8450,10 +8450,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (!tg->se[0]) return -EINVAL; - if (shares < MIN_SHARES) - shares = MIN_SHARES; - else if (shares > MAX_SHARES) - shares = MAX_SHARES; + shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); mutex_lock(&shares_mutex); if (tg->shares == shares) -- cgit v1.2.3 From 307bf9803f25a8a3f53c1012110fb74e2f893eb0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jun 2011 15:08:55 +0200 Subject: sched: Simplify mutex_spin_on_owner() It does not make sense to rcu_read_lock/unlock() in every loop iteration while spinning on the mutex. Move the rcu protection outside the loop. Also simplify the return path to always check for lock->owner == NULL which meets the requirements of both owner changed and need_resched() caused loop exits. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1106101458350.11814@ionos Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 59252754fbe0..e355ee72e83f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4306,11 +4306,8 @@ EXPORT_SYMBOL(schedule); static inline bool owner_running(struct mutex *lock, struct task_struct *owner) { - bool ret = false; - - rcu_read_lock(); if (lock->owner != owner) - goto fail; + return false; /* * Ensure we emit the owner->on_cpu, dereference _after_ checking @@ -4320,11 +4317,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner) */ barrier(); - ret = owner->on_cpu; -fail: - rcu_read_unlock(); - - return ret; + return owner->on_cpu; } /* @@ -4336,21 +4329,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) if (!sched_feat(OWNER_SPIN)) return 0; + rcu_read_lock(); while (owner_running(lock, owner)) { if (need_resched()) - return 0; + break; arch_mutex_cpu_relax(); } + rcu_read_unlock(); /* - * If the owner changed to another task there is likely - * heavy contention, stop spinning. + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. */ - if (lock->owner) - return 0; - - return 1; + return lock->owner == NULL; } #endif -- cgit v1.2.3 From a8b0ca17b80e92faab46ee7179ba9e99ccb61233 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jun 2011 14:41:57 +0200 Subject: perf: Remove the nmi parameter from the swevent and overflow interface The nmi parameter indicated if we could do wakeups from the current context, if not, we would set some state and self-IPI and let the resulting interrupt do the wakeup. 
For the various event classes: - hardware: nmi=0; PMI is in fact an NMI or we run irq_work_run from the PMI-tail (ARM etc.) - tracepoint: nmi=0; since tracepoint could be from NMI context. - software: nmi=[0,1]; some, like the schedule thing cannot perform wakeups, and hence need 0. As one can see, there is very little nmi=1 usage, and the down-side of not using it is that on some platforms some software events can have a jiffy delay in wakeup (when arch_irq_work_raise isn't implemented). The up-side however is that we can remove the nmi parameter and save a bunch of conditionals in fast paths. Signed-off-by: Peter Zijlstra Cc: Michael Cree Cc: Will Deacon Cc: Deng-Cheng Zhu Cc: Anton Blanchard Cc: Eric B Munson Cc: Heiko Carstens Cc: Paul Mundt Cc: David S. Miller Cc: Frederic Weisbecker Cc: Jason Wessel Cc: Don Zickus Link: http://lkml.kernel.org/n/tip-agjev8eu666tvknpb3iaj0fg@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3f2e502d609b..d08d110b8976 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2220,7 +2220,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); } __set_task_cpu(p, new_cpu); -- cgit v1.2.3 From 732375c6a5a4cc825b676c922d547aba96b8ce15 Mon Sep 17 00:00:00 2001 From: Dima Zavin Date: Thu, 7 Jul 2011 17:27:59 -0700 Subject: plist: Remove the need to supply locks to plist heads This was legacy code brought over from the RT tree and is no longer necessary. 
Signed-off-by: Dima Zavin Acked-by: Thomas Gleixner Cc: Daniel Walker Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andi Kleen Cc: Lai Jiangshan Link: http://lkml.kernel.org/r/1310084879-10351-2-git-send-email-dima@android.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3f2e502d609b..71bc127e96ba 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7781,7 +7781,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; - plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); + plist_head_init(&rt_rq->pushable_tasks); #endif rt_rq->rt_time = 0; @@ -7986,7 +7986,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_MUTEXES - plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); + plist_head_init(&init_task.pi_waiters); #endif /* -- cgit v1.2.3 From d8bf4ca9ca9576548628344c9725edd3786e90b1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 8 Jul 2011 14:39:41 +0200 Subject: rcu: treewide: Do not use rcu_read_lock_held when calling rcu_dereference_check Since ca5ecddf (rcu: define __rcu address space modifier for sparse) rcu_dereference_check use rcu_read_lock_held as a part of condition automatically so callers do not have to do that as well. Signed-off-by: Michal Hocko Acked-by: Paul E. 
McKenney Signed-off-by: Jiri Kosina --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3f2e502d609b..71e5a25a8a58 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -581,7 +581,6 @@ static inline int cpu_of(struct rq *rq) #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ - rcu_read_lock_held() || \ lockdep_is_held(&sched_domains_mutex)) /* -- cgit v1.2.3 From e6e6685accfa81f509fadfc9624bc7c3862d75c4 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 11 Jul 2011 15:28:17 -0400 Subject: KVM guest: Steal time accounting This patch accounts steal time time in account_process_tick. If one or more tick is considered stolen in the current accounting cycle, user/system accounting is skipped. Idle is fine, since the hypervisor does not report steal time if the guest is halted. Accounting steal time from the core scheduler give us the advantage of direct acess to the runqueue data. In a later opportunity, it can be used to tweak cpu power and make the scheduler aware of the time it lost. 
[avi: <asm/paravirt.h> doesn't exist on many archs] Signed-off-by: Glauber Costa Acked-by: Rik van Riel Acked-by: Peter Zijlstra Tested-by: Eric B Munson CC: Jeremy Fitzhardinge CC: Anthony Liguori Signed-off-by: Avi Kivity --- kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3f2e502d609b..f98a28b19b2a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,6 +75,9 @@ #include <asm/tlb.h> #include <asm/irq_regs.h> #include <asm/mutex.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif #include "sched_cpupri.h" #include "workqueue_sched.h" @@ -528,6 +531,9 @@ struct rq { #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif /* calc_load related fields */ unsigned long calc_load_update; @@ -1953,6 +1959,18 @@ void account_system_vtime(struct task_struct *curr) } EXPORT_SYMBOL_GPL(account_system_vtime); +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) +{ + if (unlikely(steal > NSEC_PER_SEC)) + return div_u64(steal, TICK_NSEC); + + return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif + static void update_rq_clock_task(struct rq *rq, s64 delta) { s64 irq_delta; @@ -3845,6 +3863,25 @@ void account_idle_time(cputime_t cputime) cpustat->idle = cputime64_add(cpustat->idle, cputime64); } +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT + if (static_branch(&paravirt_steal_enabled)) { + u64 steal, st = 0; + + steal = paravirt_steal_clock(smp_processor_id()); + steal -= this_rq()->prev_steal_time; + + st = steal_ticks(steal); + this_rq()->prev_steal_time += st * TICK_NSEC; + + account_steal_time(st); + return st; + } +#endif + return false; +} + #ifndef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -3876,6 +3913,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, cputime64_t tmp = 
cputime_to_cputime64(cputime_one_jiffy); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + if (steal_account_process_tick()) + return; + if (irqtime_account_hi_update()) { cpustat->irq = cputime64_add(cpustat->irq, tmp); } else if (irqtime_account_si_update()) { @@ -3929,6 +3969,9 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } + if (steal_account_process_tick()) + return; + if (user_tick) account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) -- cgit v1.2.3 From 095c0aa83e52d6c3dd7168610746703921f570af Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 11 Jul 2011 15:28:18 -0400 Subject: sched: adjust scheduler cpu power for stolen time This patch makes update_rq_clock() aware of steal time. The mechanism of operation is not different from irq_time, and follows the same principles. This lives in a CONFIG option itself, and can be compiled out independently of the rest of steal time reporting. The effect of disabling it is that the scheduler will still report steal time (that cannot be disabled), but won't use this information for cpu power adjustments. Every time update_rq_clock_task() is invoked, we query information about how much time was stolen since last call, and feed it into sched_rt_avg_update(). Although steal time reporting in account_process_tick() keeps track of the last time we read the steal clock, in prev_steal_time, this patch does it independently using another field, prev_steal_time_rq. This is because otherwise, information about time accounted in update_process_tick() would never reach us in update_rq_clock(). 
Signed-off-by: Glauber Costa Acked-by: Rik van Riel Acked-by: Peter Zijlstra Tested-by: Eric B Munson CC: Jeremy Fitzhardinge CC: Anthony Liguori Signed-off-by: Avi Kivity --- kernel/sched.c | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f98a28b19b2a..b35ac50b26c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -534,6 +534,9 @@ struct rq { #ifdef CONFIG_PARAVIRT u64 prev_steal_time; #endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif /* calc_load related fields */ unsigned long calc_load_update; @@ -1973,8 +1976,14 @@ static inline u64 steal_ticks(u64 steal) static void update_rq_clock_task(struct rq *rq, s64 delta) { - s64 irq_delta; - +/* + * In theory, the compile should just see 0 here, and optimize out the call + * to sched_rt_avg_update. But I don't trust it... + */ +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + s64 steal = 0, irq_delta = 0; +#endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; /* @@ -1997,12 +2006,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + if (static_branch((&paravirt_steal_rq_enabled))) { + u64 st; + + steal = paravirt_steal_clock(cpu_of(rq)); + steal -= rq->prev_steal_time_rq; + + if (unlikely(steal > delta)) + steal = delta; + + st = steal_ticks(steal); + steal = st * TICK_NSEC; + + rq->prev_steal_time_rq += steal; + + delta -= steal; + } +#endif + rq->clock_task += delta; - if (irq_delta && sched_feat(NONIRQ_POWER)) - sched_rt_avg_update(rq, irq_delta); +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) + sched_rt_avg_update(rq, irq_delta + steal); +#endif } +#ifdef 
CONFIG_IRQ_TIME_ACCOUNTING static int irqtime_account_hi_update(void) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; @@ -2037,12 +2069,7 @@ static int irqtime_account_si_update(void) #define sched_clock_irqtime (0) -static void update_rq_clock_task(struct rq *rq, s64 delta) -{ - rq->clock_task += delta; -} - -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif #include "sched_idletask.c" #include "sched_fair.c" -- cgit v1.2.3 From c64be78ffb415278d7d32d6f55de95c73dcc19a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Jul 2011 16:28:50 +0200 Subject: sched: Fix 32bit race Commit 3fe1698b7fe0 ("sched: Deal with non-atomic min_vruntime reads on 32bit") forgot to initialize min_vruntime_copy which could lead to an infinite while loop in task_waking_fair() under some circumstances (early boot, lucky timing). [ This bug was also reported by others that blamed it on the RCU initialization problems ] Reported-and-tested-by: Bruno Wolff III Signed-off-by: Peter Zijlstra Reviewed-by: Paul E. McKenney Signed-off-by: Linus Torvalds --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9769c756ad66..3dc716f6d8ad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7757,6 +7757,9 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) #endif #endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); +#ifndef CONFIG_64BIT + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif } static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) -- cgit v1.2.3 From 9c3f75cbd144014bea6af866a154cc2e73ab2287 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jul 2011 13:00:06 +0200 Subject: sched: Break out cpu_power from the sched_group structure In order to prepare for non-unique sched_groups per domain, we need to carry the cpu_power elsewhere, so put a level of indirection in. 
Reported-and-tested-by: Anton Blanchard Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-qkho2byuhe4482fuknss40ad@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3dc716f6d8ad..36c10d25d4cd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6557,7 +6557,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->cpu_power) { + if (!group->sgp->power) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -6581,9 +6581,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->cpu_power != SCHED_POWER_SCALE) { + if (group->sgp->power != SCHED_POWER_SCALE) { printk(KERN_CONT " (cpu_power = %d)", - group->cpu_power); + group->sgp->power); } group = group->next; @@ -6777,8 +6777,10 @@ static struct root_domain *alloc_rootdomain(void) static void free_sched_domain(struct rcu_head *rcu) { struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - if (atomic_dec_and_test(&sd->groups->ref)) + if (atomic_dec_and_test(&sd->groups->ref)) { + kfree(sd->groups->sgp); kfree(sd->groups); + } kfree(sd); } @@ -6945,6 +6947,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; + struct sched_group_power **__percpu sgp; }; struct s_data { @@ -6981,8 +6984,10 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (child) cpu = cpumask_first(sched_domain_span(child)); - if (sg) + if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); + (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); + } return cpu; } @@ -7020,7 +7025,7 @@ 
build_sched_groups(struct sched_domain *sd) continue; cpumask_clear(sched_group_cpus(sg)); - sg->cpu_power = 0; + sg->sgp->power = 0; for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) @@ -7185,6 +7190,7 @@ static void claim_allocations(int cpu, struct sched_domain *sd) if (cpu == cpumask_first(sched_group_cpus(sg))) { WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); *per_cpu_ptr(sdd->sg, cpu) = NULL; + *per_cpu_ptr(sdd->sgp, cpu) = NULL; } } @@ -7234,9 +7240,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sg) return -ENOMEM; + sdd->sgp = alloc_percpu(struct sched_group_power *); + if (!sdd->sgp) + return -ENOMEM; + for_each_cpu(j, cpu_map) { struct sched_domain *sd; struct sched_group *sg; + struct sched_group_power *sgp; sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); @@ -7251,6 +7262,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) return -ENOMEM; *per_cpu_ptr(sdd->sg, j) = sg; + + sgp = kzalloc_node(sizeof(struct sched_group_power), + GFP_KERNEL, cpu_to_node(j)); + if (!sgp) + return -ENOMEM; + + *per_cpu_ptr(sdd->sgp, j) = sgp; } } @@ -7268,9 +7286,11 @@ static void __sdt_free(const struct cpumask *cpu_map) for_each_cpu(j, cpu_map) { kfree(*per_cpu_ptr(sdd->sd, j)); kfree(*per_cpu_ptr(sdd->sg, j)); + kfree(*per_cpu_ptr(sdd->sgp, j)); } free_percpu(sdd->sd); free_percpu(sdd->sg); + free_percpu(sdd->sgp); } } -- cgit v1.2.3 From e3589f6c81e4764d32a25d2a2a0afe54fa344f5c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jul 2011 10:35:52 +0200 Subject: sched: Allow for overlapping sched_domain spans Allow for sched_domain spans that overlap by giving such domains their own sched_group list instead of sharing the sched_groups amongst each-other. This is needed for machines with more than 16 nodes, because sched_domain_node_span() will generate a node mask from the 16 nearest nodes without regard if these masks have any overlap. 
Currently sched_domains have a sched_group that maps to their child sched_domain span, and since there is no overlap we share the sched_group between the sched_domains of the various CPUs. If however there is overlap, we would need to link the sched_group list in different ways for each cpu, and hence sharing isn't possible. In order to solve this, allocate private sched_groups for each CPU's sched_domain but have the sched_groups share a sched_group_power structure such that we can uniquely track the power. Reported-and-tested-by: Anton Blanchard Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-08bxqw9wis3qti9u5inifh3y@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 128 insertions(+), 29 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 36c10d25d4cd..921adf6f6fad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6774,10 +6774,36 @@ static struct root_domain *alloc_rootdomain(void) return rd; } +static void free_sched_groups(struct sched_group *sg, int free_sgp) +{ + struct sched_group *tmp, *first; + + if (!sg) + return; + + first = sg; + do { + tmp = sg->next; + + if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) + kfree(sg->sgp); + + kfree(sg); + sg = tmp; + } while (sg != first); +} + static void free_sched_domain(struct rcu_head *rcu) { struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - if (atomic_dec_and_test(&sd->groups->ref)) { + + /* + * If its an overlapping domain it has private groups, iterate and + * nuke them all. 
+ */ + if (sd->flags & SD_OVERLAP) { + free_sched_groups(sd->groups, 1); + } else if (atomic_dec_and_test(&sd->groups->ref)) { kfree(sd->groups->sgp); kfree(sd->groups); } @@ -6967,15 +6993,73 @@ struct sched_domain_topology_level; typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +#define SDTL_OVERLAP 0x01 + struct sched_domain_topology_level { sched_domain_init_f init; sched_domain_mask_f mask; + int flags; struct sd_data data; }; -/* - * Assumes the sched_domain tree is fully constructed - */ +static int +build_overlap_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered = sched_domains_tmpmask; + struct sd_data *sdd = sd->private; + struct sched_domain *child; + int i; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct cpumask *sg_span; + + if (cpumask_test_cpu(i, covered)) + continue; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(i)); + + if (!sg) + goto fail; + + sg_span = sched_group_cpus(sg); + + child = *per_cpu_ptr(sdd->sd, i); + if (child->child) { + child = child->child; + cpumask_copy(sg_span, sched_domain_span(child)); + } else + cpumask_set_cpu(i, sg_span); + + cpumask_or(covered, covered, sg_span); + + sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); + atomic_inc(&sg->sgp->ref); + + if (cpumask_test_cpu(cpu, sg_span)) + groups = sg; + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + last->next = first; + } + sd->groups = groups; + + return 0; + +fail: + free_sched_groups(first, 0); + + return -ENOMEM; +} + static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); @@ -6987,23 +7071,21 @@ static int get_group(int cpu, struct 
sd_data *sdd, struct sched_group **sg) if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); + atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ } return cpu; } /* - * build_sched_groups takes the cpumask we wish to span, and a pointer - * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids - * (due to the fact that we keep track of groups covered with a struct cpumask). - * * build_sched_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. + * + * Assumes the sched_domain tree is fully constructed */ -static void -build_sched_groups(struct sched_domain *sd) +static int +build_sched_groups(struct sched_domain *sd, int cpu) { struct sched_group *first = NULL, *last = NULL; struct sd_data *sdd = sd->private; @@ -7011,6 +7093,12 @@ build_sched_groups(struct sched_domain *sd) struct cpumask *covered; int i; + get_group(cpu, sdd, &sd->groups); + atomic_inc(&sd->groups->ref); + + if (cpu != cpumask_first(sched_domain_span(sd))) + return 0; + lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; @@ -7042,6 +7130,8 @@ build_sched_groups(struct sched_domain *sd) last = sg; } last->next = first; + + return 0; } /* @@ -7056,12 +7146,17 @@ build_sched_groups(struct sched_domain *sd) */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { - WARN_ON(!sd || !sd->groups); + struct sched_group *sg = sd->groups; - if (cpu != group_first_cpu(sd->groups)) - return; + WARN_ON(!sd || !sg); + + do { + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + sg = sg->next; + } while (sg != sd->groups); - sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); + if (cpu != group_first_cpu(sg)) + return; update_group_power(sd, cpu); } @@ -7182,16 +7277,15 @@ static enum s_alloc 
__visit_domain_allocation_hell(struct s_data *d, static void claim_allocations(int cpu, struct sched_domain *sd) { struct sd_data *sdd = sd->private; - struct sched_group *sg = sd->groups; WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; - if (cpu == cpumask_first(sched_group_cpus(sg))) { - WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) *per_cpu_ptr(sdd->sgp, cpu) = NULL; - } } #ifdef CONFIG_SCHED_SMT @@ -7216,7 +7310,7 @@ static struct sched_domain_topology_level default_topology[] = { #endif { sd_init_CPU, cpu_cpu_mask, }, #ifdef CONFIG_NUMA - { sd_init_NODE, cpu_node_mask, }, + { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, { sd_init_ALLNODES, cpu_allnodes_mask, }, #endif { NULL, }, @@ -7284,7 +7378,9 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { - kfree(*per_cpu_ptr(sdd->sd, j)); + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); kfree(*per_cpu_ptr(sdd->sg, j)); kfree(*per_cpu_ptr(sdd->sgp, j)); } @@ -7336,8 +7432,11 @@ static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_topology_level *tl; sd = NULL; - for (tl = sched_domain_topology; tl->init; tl++) + for (tl = sched_domain_topology; tl->init; tl++) { sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); + if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + sd->flags |= SD_OVERLAP; + } while (sd->child) sd = sd->child; @@ -7349,13 +7448,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { sd->span_weight = cpumask_weight(sched_domain_span(sd)); - get_group(i, sd->private, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (i != 
cpumask_first(sched_domain_span(sd))) - continue; - - build_sched_groups(sd); + if (sd->flags & SD_OVERLAP) { + if (build_overlap_sched_groups(sd, i)) + goto error; + } else { + if (build_sched_groups(sd, i)) + goto error; + } } } -- cgit v1.2.3 From d110235d2c331c4f79e0879f51104be79e17a469 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Jul 2011 18:42:57 +0200 Subject: sched: Avoid creating superfluous NUMA domains on non-NUMA systems When creating sched_domains, stop when we've covered the entire target span instead of continuing to create domains, only to later find they're redundant and throw them away again. This keeps single-node systems from touching funny NUMA sched_domain creation code and reduces the risks of the new SD_OVERLAP code. Requested-by: Linus Torvalds Signed-off-by: Peter Zijlstra Cc: Anton Blanchard Cc: mahesh@linux.vnet.ibm.com Cc: benh@kernel.crashing.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1311180177.29152.57.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 921adf6f6fad..14168c49a154 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7436,6 +7436,8 @@ static int build_sched_domains(const struct cpumask *cpu_map, sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; + if (cpumask_equal(cpu_map, sched_domain_span(sd))) + break; } while (sd->child) -- cgit v1.2.3 From c5d753a55ac92e09816d410cd17093813f1a904b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Jul 2011 15:07:25 -0700 Subject: sched: Add irq_{enter,exit}() to scheduler_ipi() Ensure scheduler_ipi() calls irq_{enter,exit} when it does some actual work. Traditionally we never did any actual work from the resched IPI and all magic happened in the return from interrupt path.
Now that we do do some work, we need to ensure irq_{enter,exit} are called so that we don't confuse things. This affects things like timekeeping, NO_HZ and RCU, basically everything with a hook in irq_enter/exit. Explicit examples of things going wrong are: sched_clock_cpu() -- has a callback when leaving NO_HZ state to take a new reading from GTOD and TSC. Without this callback, time is stuck in the past. RCU -- needs in_irq() to work in order to avoid some nasty deadlocks Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/sched.c | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9769c756ad66..1930ee19d98b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2544,13 +2544,9 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static void sched_ttwu_pending(void) +static void sched_ttwu_do_pending(struct task_struct *list) { struct rq *rq = this_rq(); - struct task_struct *list = xchg(&rq->wake_list, NULL); - - if (!list) - return; raw_spin_lock(&rq->lock); @@ -2563,9 +2559,45 @@ static void sched_ttwu_pending(void) raw_spin_unlock(&rq->lock); } +#ifdef CONFIG_HOTPLUG_CPU + +static void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + sched_ttwu_do_pending(list); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + void scheduler_ipi(void) { - sched_ttwu_pending(); + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + /* + * Not all reschedule IPI handlers call irq_enter/irq_exit, since + * traditionally all their work was done from the interrupt return + * path. Now that we actually do some work, we need to make sure + * we do call them. + * + * Some archs already do call them, luckily irq_enter/exit nest + * properly. 
+ * + * Arguably we should visit all archs and update all handlers, + * however a fair share of IPIs are still resched only so this would + * somewhat pessimize the simple resched case. + */ + irq_enter(); + sched_ttwu_do_pending(list); + irq_exit(); } static void ttwu_queue_remote(struct task_struct *p, int cpu) -- cgit v1.2.3 From 9763b67fb9f3050c6da739105888327587c30c4d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Jul 2011 13:09:25 +0200 Subject: sched, cgroup: Optimize load_balance_fair() Use for_each_leaf_cfs_rq() instead of list_for_each_entry_rcu(); this ensures that load_balance_fair() only iterates over those task_groups that actually have tasks on busiest, and that we iterate bottom-up, trying to move light groups before the heavier ones. No idea if it will actually work out to be beneficial in practice; does anybody have a cgroup workload that might show a difference one way or the other? [ Also move update_h_load to sched_fair.c, losing #ifdef-ery ] Signed-off-by: Peter Zijlstra Reviewed-by: Paul Turner Link: http://lkml.kernel.org/r/1310557009.2586.28.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b0e7ad796d3b..474f341d6f91 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1568,38 +1568,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return rq->avg_load_per_task; } -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* - * Compute the cpu's hierarchical load factor for each task group. - * This needs to be done in a top-down fashion because the load of a child - * group is a fraction of its parents load.
- */ -static int tg_load_down(struct task_group *tg, void *data) -{ - unsigned long load; - long cpu = (long)data; - - if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->load.weight; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; - } - - tg->cfs_rq[cpu]->h_load = load; - - return 0; -} - -static void update_h_load(long cpu) -{ - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); -} - -#endif - #ifdef CONFIG_PREEMPT static void double_rq_lock(struct rq *rq1, struct rq *rq2); -- cgit v1.2.3 From 5f817d676b7b7ac4a29f5ed93063ae7a24550c12 Mon Sep 17 00:00:00 2001 From: Jan Schoenherr Date: Wed, 13 Jul 2011 20:13:31 +0200 Subject: sched: Fix (harmless) typo 'CONFG_FAIR_GROUP_SCHED' This patch fixes a typo located in a comment. Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-2-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 474f341d6f91..3b3826ebe793 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8362,7 +8362,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); raw_spin_unlock_irqrestore(&rq->lock, flags); } -#else /* !CONFG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) { } -- cgit v1.2.3 From 99bc52429f11d1f4f81495ac8237085aaeb6bccf Mon Sep 17 00:00:00 2001 From: Bianca Lutz Date: Wed, 13 Jul 2011 20:13:36 +0200 Subject: sched: Do not attempt to destroy uninitialized rt_bandwidth If a task group is to be created and alloc_fair_sched_group() fails, then the rt_bandwidth of the corresponding task group is not yet initialized. 
The caller, sched_create_group(), starts a cleanup procedure that calls free_rt_sched_group(), which unconditionally destroys the not-yet-initialized rt_bandwidth. This crashes or hangs the system in lock_hrtimer_base(): UP systems dereference a NULL pointer, while SMP systems loop endlessly on a condition that cannot become true. This patch simply avoids the destruction of rt_bandwidth when the initialization code path was not reached. (This was discovered by accident with a custom kernel modification.) Signed-off-by: Bianca Lutz Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-7-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3b3826ebe793..f107204db53f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8383,7 +8383,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; - destroy_rt_bandwidth(&tg->rt_bandwidth); + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); for_each_possible_cpu(i) { if (tg->rt_rq) -- cgit v1.2.3 From 26a148eb9c790149750f7e77da0d96029443d400 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Fri, 15 Jul 2011 11:41:31 +0100 Subject: sched: Reorder root_domain to remove 64 bit alignment padding Reorder root_domain to remove 8 bytes of alignment padding on 64-bit builds; this shrinks the size from 1736 to 1728 bytes, therefore using one fewer cacheline.
Signed-off-by: Richard Kennedy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310726492.1977.5.camel@castor.rsk Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f107204db53f..e3f0bac05270 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -422,6 +422,7 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; + atomic_t rto_count; struct rcu_head rcu; cpumask_var_t span; cpumask_var_t online; @@ -431,7 +432,6 @@ struct root_domain { * one runnable RT task. */ cpumask_var_t rto_mask; - atomic_t rto_count; struct cpupri cpupri; }; -- cgit v1.2.3 From acb5a9ba3bd7cd8b3264f67a3789a9587d3b935b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Thu, 14 Jul 2011 18:32:43 +0200 Subject: sched: Separate group-scheduling code more clearly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up cfs/rt runqueue initialization by moving group scheduling related code into the corresponding functions. Also, keep group scheduling as an add-on, so that things are only done additionally, i.e., remove the init_*_rq() calls from init_tg_*_entry(). (This removes a redundant initialization during sched_init().) In case of group scheduling rt_rq->highest_prio.curr is now initialized twice, but adding another #ifdef does not seem worth it. Signed-off-by: Jan H.
Schönherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310661163-16606-1-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e3f0bac05270..6fdf7ffbebc6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7859,17 +7859,10 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +static void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; INIT_LIST_HEAD(&cfs_rq->tasks); -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->rq = rq; - /* allow initial update_cfs_load() to truncate */ -#ifdef CONFIG_SMP - cfs_rq->load_stamp = 1; -#endif -#endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; @@ -7889,13 +7882,9 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO; -#ifdef CONFIG_SMP rt_rq->highest_prio.next = MAX_RT_PRIO; -#endif -#endif -#ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); @@ -7905,11 +7894,6 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; raw_spin_lock_init(&rt_rq->rt_runtime_lock); - -#ifdef CONFIG_RT_GROUP_SCHED - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7918,11 +7902,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); - tg->cfs_rq[cpu] = cfs_rq; - init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = 
tg; + cfs_rq->rq = rq; +#ifdef CONFIG_SMP + /* allow initial update_cfs_load() to truncate */ + cfs_rq->load_stamp = 1; +#endif + tg->cfs_rq[cpu] = cfs_rq; tg->se[cpu] = se; + /* se could be NULL for root_task_group */ if (!se) return; @@ -7945,12 +7935,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - tg->rt_rq[cpu] = rt_rq; - init_rt_rq(rt_rq, rq); + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; rt_rq->tg = tg; - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + tg->rt_rq[cpu] = rt_rq; tg->rt_se[cpu] = rt_se; + if (!rt_se) return; @@ -8032,7 +8024,7 @@ void __init sched_init(void) rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; - init_cfs_rq(&rq->cfs, rq); + init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = root_task_group_load; @@ -8335,6 +8327,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; + init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } @@ -8425,6 +8418,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; + init_rt_rq(rt_rq, cpu_rq(i)); + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } -- cgit v1.2.3