From 0793a61d4df8daeac6492dbf8d2f3e5713caae5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Dec 2008 20:12:29 +0100 Subject: performance counters: core code Implement the core kernel bits of Performance Counters subsystem. The Linux Performance Counter subsystem provides an abstraction of performance counter hardware capabilities. It provides per task and per CPU counters, and it provides event capabilities on top of those. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. The special file descriptor is opened via the perf_counter_open() system call: int perf_counter_open(u32 hw_event_type, u32 hw_event_period, u32 record_type, pid_t pid, int cpu); The syscall returns the new fd. The fd can be used via the normal VFS system calls: read() can be used to read the counter, fcntl() can be used to set the blocking mode, etc. Multiple counters can be kept open at a time, and the counters can be poll()ed. See more details in Documentation/perf-counters.txt. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b7480fb5c3dc..254d56de2548 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2212,6 +2212,27 @@ static int sched_balance_self(int cpu, int flag) #endif /* CONFIG_SMP */ +/** + * task_oncpu_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + */ +void task_oncpu_function_call(struct task_struct *p, + void (*func) (void *info), void *info) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if (task_curr(p)) + smp_call_function_single(cpu, func, info, 1); + preempt_enable(); +} + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2534,6 +2555,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { fire_sched_out_preempt_notifiers(prev, next); + perf_counter_task_sched_out(prev, cpu_of(rq)); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -2574,6 +2596,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); + perf_counter_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP if (current->sched_class->post_schedule) @@ -4296,6 +4319,7 @@ void scheduler_tick(void) rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif + perf_counter_task_tick(curr, cpu); } #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ -- cgit v1.2.3 From 6c594c21fcb02c662f11c97be4d7d2b73060a205 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:34:15 +0100 Subject: perfcounters: add task migrations counter Impact: add new feature, new sw counter Add a counter that counts the number of cross-CPU migrations a task is suffering. Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5c3f4106314e..382cfdb5e38d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1852,12 +1852,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.sleep_start -= clock_offset; if (p->se.block_start) p->se.block_start -= clock_offset; +#endif if (old_cpu != new_cpu) { - schedstat_inc(p, se.nr_migrations); + p->se.nr_migrations++; +#ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); - } #endif + } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; @@ -2375,6 +2377,7 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; -- cgit v1.2.3 From aa9c4c0f967fdb482ea95e8473ec3d201e6e0781 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:10:57 +0100 Subject: perfcounters: fix task clock counter Impact: fix per task clock counter precision Signed-off-by: Ingo Molnar --- kernel/sched.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 382cfdb5e38d..4d84ff4c8774 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -638,7 +638,7 @@ static inline int cpu_of(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -static inline void update_rq_clock(struct rq *rq) +inline void update_rq_clock(struct rq *rq) { rq->clock = sched_clock_cpu(cpu_of(rq)); } @@ -969,6 +969,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } +void curr_rq_lock_irq_save(unsigned long *flags) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_save(*flags); + rq = cpu_rq(smp_processor_id()); + spin_lock(&rq->lock); +} + +void curr_rq_unlock_irq_restore(unsigned long *flags) + __releases(rq->lock) +{ + struct rq *rq; + + rq = cpu_rq(smp_processor_id()); + spin_unlock(&rq->lock); + local_irq_restore(*flags); +} + void task_rq_unlock_wait(struct task_struct *p) { struct rq *rq = task_rq(p); @@ -2558,7 +2578,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { fire_sched_out_preempt_notifiers(prev, next); - perf_counter_task_sched_out(prev, cpu_of(rq)); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -4089,6 +4108,29 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); +/* + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. + */ +unsigned long long __task_delta_exec(struct task_struct *p, int update) +{ + s64 delta_exec; + struct rq *rq; + + rq = task_rq(p); + WARN_ON_ONCE(!runqueue_is_locked()); + WARN_ON_ONCE(!task_current(rq, p)); + + if (update) + update_rq_clock(rq); + + delta_exec = rq->clock - p->se.exec_start; + + WARN_ON_ONCE(delta_exec < 0); + + return delta_exec; +} + /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. @@ -4316,13 +4358,13 @@ void scheduler_tick(void) update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); + perf_counter_task_tick(curr, cpu); spin_unlock(&rq->lock); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif - perf_counter_task_tick(curr, cpu); } #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ @@ -4512,6 +4554,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); + perf_counter_task_sched_out(prev, cpu); rq->nr_switches++; rq->curr = next; -- cgit v1.2.3 From 23a185ca8abbeef64b6ffc33059b1d630e43ec10 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 9 Feb 2009 22:42:47 +1100 Subject: perf_counters: make software counters work as per-cpu counters Impact: kernel crash fix Yanmin Zhang reported that using a PERF_COUNT_TASK_CLOCK software counter as a per-cpu counter would reliably crash the system, because it calls __task_delta_exec with a null pointer. The page fault, context switch and cpu migration counters also won't function correctly as per-cpu counters since they reference the current task. This fixes the problem by redirecting the task_clock counter to the cpu_clock counter when used as a per-cpu counter, and by implementing per-cpu page fault, context switch and cpu migration counters. Along the way, this: - Initializes counter->ctx earlier, in perf_counter_alloc, so that sw_perf_counter_init can use it - Adds code to kernel/sched.c to count task migrations into each cpu, in rq->nr_migrations_in - Exports the per-cpu context switch and task migration counts via new functions added to kernel/sched.c - Makes sure that if sw_perf_counter_init fails, we don't try to initialize the counter as a hardware counter. Since the user has passed a negative, non-raw event type, they clearly don't intend for it to be interpreted as a hardware event. Reported-by: "Zhang Yanmin" Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8db1a4cf2082..173768f142ad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -558,6 +558,7 @@ struct rq { struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; + u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -1908,6 +1909,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #endif if (old_cpu != new_cpu) { p->se.nr_migrations++; + new_rq->nr_migrations_in++; #ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); @@ -2810,6 +2812,21 @@ unsigned long nr_active(void) return running + uninterruptible; } +/* + * Externally visible per-cpu scheduler statistics: + * cpu_nr_switches(cpu) - number of context switches on that cpu + * cpu_nr_migrations(cpu) - number of migrations into that cpu + */ +u64 cpu_nr_switches(int cpu) +{ + return cpu_rq(cpu)->nr_switches; +} + +u64 cpu_nr_migrations(int cpu) +{ + return cpu_rq(cpu)->nr_migrations_in; +} + /* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). -- cgit v1.2.3 From 7fd7d83d49914f03aefffba6aee09032fcd54cce Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:24:03 -0800 Subject: x86/pvops: replace arch_enter_lazy_cpu_mode with arch_start_context_switch Impact: simplification, prepare for later changes Make lazy cpu mode more specific to context switching, so that it makes sense to do more context-switch specific things in the callbacks. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5757e03cfac0..7530fdd7c982 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2746,7 +2746,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * combine the page table reload and the switch backend into * one hypercall. */ - arch_enter_lazy_cpu_mode(); + arch_start_context_switch(); if (unlikely(!mm)) { next->active_mm = oldmm; -- cgit v1.2.3 From 224101ed69d3fbb486868e0f6e0f9fa37302efb4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Feb 2009 11:18:57 -0800 Subject: x86/paravirt: finish change from lazy cpu to context switch start/end Impact: fix lazy context switch API Pass the previous and next tasks into the context switch start end calls, so that the called functions can properly access the task state (esp in end_context_switch, in which the next task is not yet completely current). Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7530fdd7c982..133762aece50 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2746,7 +2746,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * combine the page table reload and the switch backend into * one hypercall. */ - arch_start_context_switch(); + arch_start_context_switch(prev); if (unlikely(!mm)) { next->active_mm = oldmm; -- cgit v1.2.3 From 4a0deca657f3dbb8a707b5dc8f173beec01e7ed2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:12 +0100 Subject: perf_counter: generic context switch event Impact: cleanup Use the generic software events for context switches. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.283522645@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 39e708602169..f76e3c0188a2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2900,14 +2900,8 @@ unsigned long nr_active(void) /* * Externally visible per-cpu scheduler statistics: - * cpu_nr_switches(cpu) - number of context switches on that cpu * cpu_nr_migrations(cpu) - number of migrations into that cpu */ -u64 cpu_nr_switches(int cpu) -{ - return cpu_rq(cpu)->nr_switches; -} - u64 cpu_nr_migrations(int cpu) { return cpu_rq(cpu)->nr_migrations_in; -- cgit v1.2.3 From 849691a6cd40270ff5f4a8846d5f6bf8df663ffc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:12 +0200 Subject: perf_counter: remove rq->lock usage Now that all the task runtime clock users are gone, remove the ugly rq->lock usage from perf counters, which solves the nasty deadlock seen when a software task clock counter was read from an NMI overflow context. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.531137582@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f76e3c0188a2..0de2f814fb18 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -997,26 +997,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } -void curr_rq_lock_irq_save(unsigned long *flags) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_save(*flags); - rq = cpu_rq(smp_processor_id()); - spin_lock(&rq->lock); -} - -void curr_rq_unlock_irq_restore(unsigned long *flags) - __releases(rq->lock) -{ - struct rq *rq; - - rq = cpu_rq(smp_processor_id()); - spin_unlock(&rq->lock); - local_irq_restore(*flags); -} - void task_rq_unlock_wait(struct task_struct *p) { struct rq *rq = task_rq(p); -- cgit v1.2.3 From a26b89f05d194413c7238e0bea071054f6b5d3c8 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:34 +0200 Subject: sched, hw-branch-tracer: add wait_task_context_switch() function to sched.h Add a function to wait until some other task has been switched out at least once. This differs from wait_task_inactive() subtly, in that the latter will wait until the task has left the CPU. Signed-off-by: Markus Metzger Cc: markus.t.metzger@gmail.com Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144549.794157000@intel.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 6cc1fd5d5072..f91bc8141dc3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2002,6 +2002,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) return 1; } +/* + * wait_task_context_switch - wait for a thread to complete at least one + * context switch. + * + * @p must not be current. + */ +void wait_task_context_switch(struct task_struct *p) +{ + unsigned long nvcsw, nivcsw, flags; + int running; + struct rq *rq; + + nvcsw = p->nvcsw; + nivcsw = p->nivcsw; + for (;;) { + /* + * The runqueue is assigned before the actual context + * switch. We need to take the runqueue lock. + * + * We could check initially without the lock but it is + * very likely that we need to take the lock in every + * iteration. + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); + task_rq_unlock(rq, &flags); + + if (likely(!running)) + break; + /* + * The switch count is incremented before the actual + * context switch. We thus wait for two switches to be + * sure at least one completed. + */ + if ((p->nvcsw - nvcsw) > 1) + break; + if ((p->nivcsw - nivcsw) > 1) + break; + + cpu_relax(); + } +} + /* * wait_task_inactive - wait for a thread to unschedule. * -- cgit v1.2.3 From f711f6090a81cbd396b63de90f415d33f563af9b Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 14 Apr 2009 10:25:30 +0530 Subject: sched: Nominate idle load balancer from a semi-idle package. Currently the nomination of idle-load balancer is done by choosing the first idle cpu in the nohz.cpu_mask. This may not be power-efficient, since such an idle cpu could come from a completely idle core/package thereby preventing the whole core/package from being in a low-power state. For eg, consider a quad-core dual package system. The cpu numbering need not be sequential and can something like [0, 2, 4, 6] and [1, 3, 5, 7]. With sched_mc/smt_power_savings and the power-aware IRQ balance, we try to keep as fewer Packages/Cores active. But the current idle load balancer logic goes against this by choosing the first_cpu in the nohz.cpu_mask and not taking the system topology into consideration. Improve the algorithm to nominate the idle load balancer from a semi idle cores/packages thereby increasing the probability of the cores/packages being in deeper sleep states for longer duration. The algorithm is activated only when sched_mc/smt_power_savings != 0. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra LKML-Reference: <20090414045530.7645.12175.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 118 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5724508c3b66..b0fefa300b40 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4240,10 +4240,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) static struct { atomic_t load_balancer; cpumask_var_t cpu_mask; + cpumask_var_t ilb_grp_nohz_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), }; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu: The cpu whose lowest level of sched domain is to + * be returned. + * @flag: The flag to check for the lowest sched_domain + * for the given cpu. + * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) + if (sd && (sd->flags & flag)) + break; + + return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu: The cpu whose domains we're iterating over. + * @sd: variable holding the value of the power_savings_sd + * for cpu. + * @flag: The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. + */ +#define for_each_flag_domain(cpu, sd, flag) \ + for (sd = lowest_flag_domain(cpu, flag); \ + (sd && (sd->flags & flag)); sd = sd->parent) + +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group: group to be checked for semi-idleness + * + * Returns: 1 if the group is semi-idle. 0 otherwise. + * + * We define a sched_group to be semi idle if it has atleast one idle-CPU + * and atleast one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not. + */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ + cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has atleast one busy cpu + * and atleast one idle cpu. + */ + if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + return 0; + + if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +} +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu: The cpu which is nominating a new idle_load_balancer. + * + * Returns: Returns the id of the idle load balancer if it exists, + * Else, returns >= nr_cpu_ids. + * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job. + */ +static int find_new_ilb(int cpu) +{ + struct sched_domain *sd; + struct sched_group *ilb_group; + + /* + * Have idle load balancer selection from semi-idle packages only + * when power-aware load balancing is enabled + */ + if (!(sched_smt_power_savings || sched_mc_power_savings)) + goto out_done; + + /* + * Optimize for the case when we have no idle CPUs or only one + * idle CPU. Don't walk the sched_domain hierarchy in such cases + */ + if (cpumask_weight(nohz.cpu_mask) < 2) + goto out_done; + + for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { + ilb_group = sd->groups; + + do { + if (is_semi_idle_group(ilb_group)) + return cpumask_first(nohz.ilb_grp_nohz_mask); + + ilb_group = ilb_group->next; + + } while (ilb_group != sd->groups); + } + +out_done: + return cpumask_first(nohz.cpu_mask); +} +#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ + return first_cpu(nohz.cpu_mask); +} +#endif + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle @@ -4468,15 +4584,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) } if (atomic_read(&nohz.load_balancer) == -1) { - /* - * simple selection for now: Nominate the - * first cpu in the nohz list to be the next - * ilb owner. - * - * TBD: Traverse the sched domains and nominate - * the nearest cpu in the nohz.cpu_mask. - */ - int ilb = cpumask_first(nohz.cpu_mask); + int ilb = find_new_ilb(cpu); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -9051,6 +9159,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ alloc_bootmem_cpumask_var(&nohz.cpu_mask); + alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask); #endif alloc_bootmem_cpumask_var(&cpu_isolated_map); #endif /* SMP */ -- cgit v1.2.3 From e790fb0ba64bfec158e1219d899cb588275d12ab Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 14 Apr 2009 10:25:35 +0530 Subject: sched: Nominate a power-efficient ilb in select_nohz_balancer() The CPU that first goes idle becomes the idle-load-balancer and remains that until either it picks up a task or till all the CPUs of the system goes idle. Optimize this further to allow it to relinquish it's post once all it's siblings in the power-aware sched_domain go idle, thereby allowing the whole package-core to go idle. While relinquising the post, nominate another an idle-load balancer from a semi-idle core/package. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra LKML-Reference: <20090414045535.7645.31641.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b0fefa300b40..36d213bca473 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4414,8 +4414,24 @@ int select_nohz_load_balancer(int stop_tick) /* make me the ilb owner */ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) + } else if (atomic_read(&nohz.load_balancer) == cpu) { + int new_ilb; + + if (!(sched_smt_power_savings || + sched_mc_power_savings)) + return 1; + /* + * Check to see if there is a more power-efficient + * ilb. + */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { + atomic_set(&nohz.load_balancer, -1); + resched_cpu(new_ilb); + return 0; + } return 1; + } } else { if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; -- cgit v1.2.3 From 78ddb08feb7d4fbe3c0a9931804c51ee58be4023 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 14 Apr 2009 16:53:05 +0200 Subject: wait: don't use __wake_up_common() '777c6c5 wait: prevent exclusive waiter starvation' made __wake_up_common() global to be used from abort_exclusive_wait(). It was needed to do a wake-up with the waitqueue lock held while passing down a key to the wake-up function. Since '4ede816 epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key()' there is an appropriate wrapper for this case: __wake_up_locked_key(). Use it here and make __wake_up_common() private to the scheduler again. Signed-off-by: Johannes Weiner Cc: Andrew Morton Cc: Peter Zijlstra LKML-Reference: <1239720785-19661-1-git-send-email-hannes@cmpxchg.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 36d213bca473..92b4b56ad093 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5345,7 +5345,7 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { wait_queue_t *curr, *next; -- cgit v1.2.3 From a8d154b009168337494fbf345671bab74d3e4b8b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 09:36:00 -0400 Subject: tracing: create automated trace defines This patch lowers the number of places a developer must modify to add new tracepoints. The current method to add a new tracepoint into an existing system is to write the trace point macro in the trace header with one of the macros TRACE_EVENT, TRACE_FORMAT or DECLARE_TRACE, then they must add the same named item into the C file with the macro DEFINE_TRACE(name) and then add the trace point. This change cuts out the needing to add the DEFINE_TRACE(name). Every file that uses the tracepoint must still include the trace/.h file, but the one C file must also add a define before the including of that file. #define CREATE_TRACE_POINTS #include This will cause the trace/mytrace.h file to also produce the C code necessary to implement the trace point. Note, if more than one trace/.h is used to create the C code it is best to list them all together. #define CREATE_TRACE_POINTS #include #include #include Thanks to Mathieu Desnoyers and Christoph Hellwig for coming up with the cleaner solution of the define above the includes over my first design to have the C code include a "special" header. This patch converts sched, irq and lockdep and skb to use this new method. Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neil Horman Cc: Zhao Lei Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Signed-off-by: Steven Rostedt --- kernel/sched.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5724508c3b66..e6d4518d47e0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -72,13 +72,15 @@ #include #include #include -#include #include #include #include "sched_cpupri.h" +#define CREATE_TRACE_POINTS +#include + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -118,12 +120,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -DEFINE_TRACE(sched_wait_task); -DEFINE_TRACE(sched_wakeup); -DEFINE_TRACE(sched_wakeup_new); -DEFINE_TRACE(sched_switch); -DEFINE_TRACE(sched_migrate_task); - #ifdef CONFIG_SMP static void double_rq_lock(struct rq *rq1, struct rq *rq2); -- cgit v1.2.3 From ad8d75fff811a6a230f7f43b05a6483099349533 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 19:39:12 -0400 Subject: tracing/events: move trace point headers into include/trace/events Impact: clean up Create a sub directory in include/trace called events to keep the trace point headers in their own separate directory. Only headers that declare trace points should be defined in this directory. Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neil Horman Cc: Zhao Lei Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Signed-off-by: Steven Rostedt --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e6d4518d47e0..9f7ffd00b6ea 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -79,7 +79,7 @@ #include "sched_cpupri.h" #define CREATE_TRACE_POINTS -#include +#include /* * Convert user-nice values [ -20 ... 0 ... 19 ] -- cgit v1.2.3 From 13318a7186d8e0ae08c996ea4111a945e7789772 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Apr 2009 09:59:10 +0800 Subject: sched: use group_first_cpu() instead of cpumask_first(sched_group_cpus()) Impact: cleanup This patch changes cpumask_first(sched_group_cpus()) to group_first_cpu() for maintainability. Signed-off-by: Miao Xie Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 92b4b56ad093..7601ceebf7ce 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7995,7 +7995,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) struct sched_domain *sd; sd = &per_cpu(phys_domains, j).sd; - if (j != cpumask_first(sched_group_cpus(sd->groups))) { + if (j != group_first_cpu(sd->groups)) { /* * Only add "power" once for each * physical package. @@ -8073,7 +8073,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != cpumask_first(sched_group_cpus(sd->groups))) + if (cpu != group_first_cpu(sd->groups)) return; child = sd->child; -- cgit v1.2.3 From f1f9b3b1795da8625e0e6096813c9d18d4a344ce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 20:38:21 +0200 Subject: perfcounters, sched: remove __task_delta_exec() This function was left orphan by the latest round of sw-counter cleanups. [ Impact: remove unused kernel function ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b66a08c2480e..a69278eef425 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4546,29 +4546,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); -/* - * Return any ns on the sched_clock that have not yet been banked in - * @p in case that task is currently running. - */ -unsigned long long __task_delta_exec(struct task_struct *p, int update) -{ - s64 delta_exec; - struct rq *rq; - - rq = task_rq(p); - WARN_ON_ONCE(!runqueue_is_locked()); - WARN_ON_ONCE(!task_current(rq, p)); - - if (update) - update_rq_clock(rq); - - delta_exec = rq->clock - p->se.exec_start; - - WARN_ON_ONCE(delta_exec < 0); - - return delta_exec; -} - /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. -- cgit v1.2.3 From ff743345bf7685a207868048a70e23164c4785e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:26 +0100 Subject: sched: remove extra call overhead for schedule() Lai Jiangshan's patch reminded me that I promised Nick to remove that extra call overhead in schedule(). Signed-off-by: Peter Zijlstra LKML-Reference: <20090313112300.927414207@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7601ceebf7ce..797f6fdabadf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5131,13 +5131,15 @@ pick_next_task(struct rq *rq) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched __schedule(void) +asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; +need_resched: + preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -5194,15 +5196,9 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -} -asmlinkage void __sched schedule(void) -{ -need_resched: - preempt_disable(); - __schedule(); preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (need_resched()) goto need_resched; } EXPORT_SYMBOL(schedule); -- cgit v1.2.3 From 6e29ec5701e9d44fa02b96c1c5c45f7516182b65 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 21 Apr 2009 08:40:49 +0530 Subject: sched: Replace first_cpu() with cpumask_first() in ILB nomination code Stephen Rothwell reported this build warning: > kernel/sched.c: In function 'find_new_ilb': > kernel/sched.c:4355: warning: passing argument 1 of '__first_cpu' from incompatible pointer type > > Possibly caused by commit f711f6090a81cbd396b63de90f415d33f563af9b > ("sched: Nominate idle load balancer from a semi-idle package") from > the sched tree. Should this call to first_cpu be cpumask_first? For !(CONFIG_SCHED_MC || CONFIG_SCHED_SMT), find_new_ilb() nominates the Idle load balancer as the first cpu from the nohz.cpu_mask. This code uses the older API first_cpu(). Replace it with cpumask_first(), which is the correct API here. [ Impact: cleanup, address build warning ] Reported-by: Stephen Rothwell Signed-off-by: Gautham R Shenoy Cc: Rusty Russell LKML-Reference: <20090421031049.GA4140@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 797f6fdabadf..54d67b94f1a9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4356,7 +4356,7 @@ out_done: #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) { - return first_cpu(nohz.cpu_mask); + return cpumask_first(nohz.cpu_mask); } #endif -- cgit v1.2.3 From 50fa610a3b6ba7cf91d7a92229177dfaff2b81a1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 28 Apr 2009 15:01:38 +0100 Subject: sched: Document memory barriers implied by sleep/wake-up primitives Add a section to the memory barriers document to note the implied memory barriers of sleep primitives (set_current_state() and wrappers) and wake-up primitives (wake_up() and co.). Also extend the in-code comments on the wake_up() functions to note these implied barriers. [ Impact: add documentation ] Signed-off-by: David Howells Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <20090428140138.1192.94723.stgit@warthog.procyon.org.uk> Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b902e587a3a0..fd0c2cee3f35 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2458,6 +2458,17 @@ out: return success; } +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ int wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_ALL, 0); @@ -5241,6 +5252,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5279,6 +5293,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5315,6 +5332,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * awakened in the same order in which they were queued. * * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete(struct completion *x) { @@ -5332,6 +5352,9 @@ EXPORT_SYMBOL(complete); * @x: holds the state of this particular completion * * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete_all(struct completion *x) { -- cgit v1.2.3 From 0d905bca23aca5c86a10ee101bcd3b1abbd40b25 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 19:13:30 +0200 Subject: perf_counter: initialize the per-cpu context earlier percpu scheduling for perfcounters wants to take the context lock, but that lock first needs to be initialized. Currently it is an early_initcall() - but that is too late, the task tick runs much sooner than that. Call it explicitly from the scheduler init sequence instead. [ Impact: fix access-before-init crash ] LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 2f600e30dcf0..a728976a3a6c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -8996,7 +8997,7 @@ void __init sched_init(void) * 1024) and two child groups A0 and A1 (of weight 1024 each), * then A0's share of the cpu resource is: * - * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% * * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). @@ -9097,6 +9098,8 @@ void __init sched_init(void) alloc_bootmem_cpumask_var(&cpu_isolated_map); #endif /* SMP */ + perf_counter_init(); + scheduler_running = 1; } -- cgit v1.2.3 From 60aa605dfce2976e54fa76e805ab0f221372d4d9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:21 +0200 Subject: sched: rt: document the risk of small values in the bandwidth settings Thomas noted that we should disallow sysctl_sched_rt_runtime == 0 for (!RT_GROUP) since the root group always has some RT tasks in it. Further, update the documentation to inspire clue. [ Impact: exclude corner-case sysctl_sched_rt_runtime value ] Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra LKML-Reference: <20090505155436.863098054@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 54d67b94f1a9..2a43a581ead3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9917,6 +9917,13 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; + /* + * There's always some RT tasks in the root group + * -- migration, kstopmachine etc.. + */ + if (sysctl_sched_rt_runtime == 0) + return -EBUSY; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; -- cgit v1.2.3 From de1d7286060430e79a1d50ad6e5fee8fe863c5f6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 5 May 2009 16:49:59 +0800 Subject: tracepoint: trace_sched_migrate_task(): remove parameter The orig_cpu parameter in trace_sched_migrate_task() is not necessary, it can be got by using task_cpu(p) in the probe. [ Impact: micro-optimization ] Signed-off-by: Mathieu Desnoyers [ modified from Mathieu's patch. The original patch is at: http://marc.info/?l=linux-kernel&m=123791201716239&w=2 ] Signed-off-by: Xiao Guangrong Cc: fweisbec@gmail.com Cc: rostedt@goodmis.org Cc: Li Zefan Cc: zhaolei@cn.fujitsu.com Cc: laijs@cn.fujitsu.com LKML-Reference: <49FFFDB7.1050402@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9f7ffd00b6ea..9cdedbd181ce 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1954,7 +1954,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; - trace_sched_migrate_task(p, task_cpu(p), new_cpu); + trace_sched_migrate_task(p, new_cpu); #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) -- cgit v1.2.3 From aa47b7e0f89b9998dad4d1667447e8cb7703ff4e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 4 May 2009 01:38:05 -0700 Subject: sched: emit thread info flags with stack trace When a thread is oom killed and fails to exit, it's helpful to know which threads have access to memory reserves if the machine livelocks. This is done by testing for the TIF_MEMDIE thread info flag and should be displayed alongside stack traces to identify tasks that have access to such reserves but are still stuck allocating pages, for instance. It would probably be helpful in other cases as well, so all thread info flags are emitted when showing a task. ( v2: fix warning reported by Stephen Rothwell ) [ Impact: extend debug printout info ] Signed-off-by: David Rientjes Cc: Peter Zijlstra Cc: Stephen Rothwell LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 2a43a581ead3..5aa63f50c696 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,8 +6610,9 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent)); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent), + (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); } -- cgit v1.2.3 From 5c333864a6ba811052d52ef14fbed056b9ac3512 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Thu, 16 Apr 2009 12:14:37 +0530 Subject: timers: Identifying the existing pinned timers * Arun R Bharadwaj [2009-04-16 12:11:36]: The following pinned hrtimers have been identified and marked: 1)sched_rt_period_timer 2)tick_sched_timer 3)stack_trace_timer_fn [ tglx: fixup the hrtimer pinned mode ] Signed-off-by: Arun R Bharadwaj Signed-off-by: Thomas Gleixner --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b902e587a3a0..9c5b4d3f97ab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -244,7 +244,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) hard = hrtimer_get_expires(&rt_b->rt_period_timer); delta = ktime_to_ns(ktime_sub(hard, soft)); __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, - HRTIMER_MODE_ABS, 0); + HRTIMER_MODE_ABS_PINNED, 0); } spin_unlock(&rt_b->rt_runtime_lock); } @@ -1154,7 +1154,7 @@ static __init void init_hrtick(void) static void hrtick_start(struct rq *rq, u64 delay) { __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, - HRTIMER_MODE_REL, 0); + HRTIMER_MODE_REL_PINNED, 0); } static inline void init_hrtick(void) -- cgit v1.2.3 From cd1bb94b4a0531e8211a3774f17de831f8285f76 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Thu, 16 Apr 2009 12:15:34 +0530 Subject: timers: /proc/sys sysctl hook to enable timer migration * Arun R Bharadwaj [2009-04-16 12:11:36]: This patch creates the /proc/sys sysctl interface at /proc/sys/kernel/timer_migration Timer migration is enabled by default. To disable timer migration, when CONFIG_SCHED_DEBUG = y, echo 0 > /proc/sys/kernel/timer_migration Signed-off-by: Arun R Bharadwaj Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9c5b4d3f97ab..7f1dd56af863 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8731,6 +8731,8 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ +const_debug unsigned int sysctl_timer_migration = 1; + int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || -- cgit v1.2.3 From eea08f32adb3f97553d49a4f79a119833036000a Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Thu, 16 Apr 2009 12:16:41 +0530 Subject: timers: Logic to move non pinned timers * Arun R Bharadwaj [2009-04-16 12:11:36]: This patch migrates all non pinned timers and hrtimers to the current idle load balancer, from all the idle CPUs. Timers firing on busy CPUs are not migrated. While migrating hrtimers, care should be taken to check if migrating a hrtimer would result in a latency or not. So we compare the expiry of the hrtimer with the next timer interrupt on the target cpu and migrate the hrtimer only if it expires *after* the next interrupt on the target cpu. So, added a clockevents_get_next_event() helper function to return the next_event on the target cpu's clock_event_device. [ tglx: cleanups and simplifications ] Signed-off-by: Arun R Bharadwaj Signed-off-by: Thomas Gleixner --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7f1dd56af863..9fe3774a0fd3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4244,6 +4244,11 @@ static struct { .load_balancer = ATOMIC_INIT(-1), }; +int get_nohz_load_balancer(void) +{ + return atomic_read(&nohz.load_balancer); +} + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle -- cgit v1.2.3 From dce48a84adf1806676319f6f480e30a6daa012f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 11 Apr 2009 10:43:41 +0200 Subject: sched, timers: move calc_load() to scheduler Dimitri Sivanich noticed that xtime_lock is held write locked across calc_load() which iterates over all online CPUs. That can cause long latencies for xtime_lock readers on large SMP systems. The load average calculation is an rough estimate anyway so there is no real need to protect the readers vs. the update. It's not a problem when the avenrun array is updated while a reader copies the values. Instead of iterating over all online CPUs let the scheduler_tick code update the number of active tasks shortly before the avenrun update happens. The avenrun update itself is handled by the CPU which calls do_timer(). [ Impact: reduce xtime_lock write locked section ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- kernel/sched.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 74 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8908d190a348..f4eb88153bd1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -630,6 +630,10 @@ struct rq { struct list_head migration_queue; #endif + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; @@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif +static void calc_load_account_active(struct rq *this_rq); + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -2856,19 +2862,57 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_active(void) +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) { - unsigned long i, running = 0, uninterruptible = 0; + load *= exp; + load += active * (FIXED_1 - exp); + return load >> FSHIFT; +} - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; - } +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(void) +{ + unsigned long upd = calc_load_update + 10; + long active; + + if (time_before(jiffies, upd)) + return; - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - return running + uninterruptible; + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; +} + +/* + * Either called from update_cpu_load() or from a cpu going idle + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long nr_active, delta; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + atomic_long_add(delta, &calc_load_tasks); + } } /* @@ -2899,6 +2943,11 @@ static void update_cpu_load(struct rq *this_rq) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } + + if (time_after_eq(jiffies, this_rq->calc_load_update)) { + this_rq->calc_load_update += LOAD_FREQ; + calc_load_account_active(this_rq); + } } #ifdef CONFIG_SMP @@ -7091,6 +7140,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu) } } + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. + */ +static void calc_global_load_remove(struct rq *rq) +{ + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +} #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -7325,6 +7382,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); + rq->calc_load_update = calc_load_update; + rq->calc_load_active = 0; if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7364,7 +7423,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); - + calc_global_load_remove(rq); /* * No need to migrate the tasks: it was best-effort if * they didn't take sched_hotcpu_mutex. Just wake up @@ -9059,6 +9118,8 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->nr_running = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -9166,6 +9227,9 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + + calc_load_update = jiffies + LOAD_FREQ; + /* * During early bootup we pretend to be a normal task: */ -- cgit v1.2.3 From 2d02494f5a90f2e4b3c4c6acc85ec94674cdc431 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 2 May 2009 20:08:52 +0200 Subject: sched, timers: cleanup avenrun users avenrun is an rough estimate so we don't have to worry about consistency of the three avenrun values. Remove the xtime lock dependency and provide a function to scale the values. Cleanup the users. [ Impact: cleanup ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- kernel/sched.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f4eb88153bd1..497c09ba61e7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2868,6 +2868,21 @@ static unsigned long calc_load_update; unsigned long avenrun[3]; EXPORT_SYMBOL(avenrun); +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { -- cgit v1.2.3 From 4200efd9acda4accf24640f1e77d24fdcdb524df Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 May 2009 09:22:19 +0200 Subject: sched: properly define the sched_group::cpumask and sched_domain::span fields Properly document the variable-size structure tricks we are doing wrt. struct sched_group and sched_domain, and use the field[0] GCC extension instead of defining a vla array. Dont use unions for this, as pointed out by Linus. [ Impact: cleanup, un-confuse Sparse and LLVM ] Reported-by: Jeff Garzik Acked-by: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 497c09ba61e7..228acae8821f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7948,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * The cpus mask in sched_group and sched_domain hangs off the end. - * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space - * for nr_cpu_ids < CONFIG_NR_CPUS. + * + * ( See the the comments in include/linux/sched.h:struct sched_group + * and struct sched_domain. ) */ struct static_sched_group { struct sched_group sg; -- cgit v1.2.3 From 564c2b210add41df9a3a5aaa365c1d97cff6110d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:27:22 +1000 Subject: perf_counter: Optimize context switch between identical inherited contexts When monitoring a process and its descendants with a set of inherited counters, we can often get the situation in a context switch where both the old (outgoing) and new (incoming) process have the same set of counters, and their values are ultimately going to be added together. In that situation it doesn't matter which set of counters are used to count the activity for the new process, so there is really no need to go through the process of reading the hardware counters and updating the old task's counters and then setting up the PMU for the new task. This optimizes the context switch in this situation. Instead of scheduling out the perf_counter_context for the old task and scheduling in the new context, we simply transfer the old context to the new task and keep using it without interruption. The new context gets transferred to the old task. This means that both tasks still have a valid perf_counter_context, so no special case is introduced when the old task gets scheduled in again, either on this CPU or another CPU. The equivalence of contexts is detected by keeping a pointer in each cloned context pointing to the context it was cloned from. To cope with the situation where a context is changed by adding or removing counters after it has been cloned, we also keep a generation number on each context which is incremented every time a context is changed. When a context is cloned we take a copy of the parent's generation number, and two cloned contexts are equivalent only if they have the same parent and the same generation number. In order that the parent context pointer remains valid (and is not reused), we increment the parent context's reference count for each context cloned from it. Since we don't have individual fds for the counters in a cloned context, the only thing that can make two clones of a given parent different after they have been cloned is enabling or disabling all counters with prctl. To account for this, we keep a count of the number of enabled counters in each context. Two contexts must have the same number of enabled counters to be considered equivalent. Here are some measurements of the context switch time as measured with the lat_ctx benchmark from lmbench, comparing the times obtained with and without this patch series: -----Unmodified----- With this patch series Counters: none 2 HW 4H+4S none 2 HW 4H+4S 2 processes: Average 3.44 6.45 11.24 3.12 3.39 3.60 St dev 0.04 0.04 0.13 0.05 0.17 0.19 8 processes: Average 6.45 8.79 14.00 5.57 6.23 7.57 St dev 1.27 1.04 0.88 1.42 1.46 1.42 32 processes: Average 5.56 8.43 13.78 5.28 5.55 7.15 St dev 0.41 0.47 0.53 0.54 0.57 0.81 The numbers are the mean and standard deviation of 20 runs of lat_ctx. The "none" columns are lat_ctx run directly without any counters. The "2 HW" columns are with lat_ctx run under perfstat, counting cycles and instructions. The "4H+4S" columns are lat_ctx run under perfstat with 4 hardware counters and 4 software counters (cycles, instructions, cache references, cache misses, task clock, context switch, cpu migrations, and page faults). [ Impact: performance optimization of counter context-switches ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 419a39d0988f..4c0d58bce6b2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5091,7 +5091,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_counter_task_sched_out(prev, cpu); + perf_counter_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; -- cgit v1.2.3 From e220d2dcb944c5c488b6855d15ec66d76900514f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:55 +0200 Subject: perf_counter: Fix dynamic irq_period logging We call perf_adjust_freq() from perf_counter_task_tick() which is is called under the rq->lock causing lock recursion. However, it's no longer required to be called under the rq->lock, so remove it from under it. Also, fix up some related comments. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.476197912@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 4c0d58bce6b2..ad079f07c9c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4875,9 +4875,10 @@ void scheduler_tick(void) update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - perf_counter_task_tick(curr, cpu); spin_unlock(&rq->lock); + perf_counter_task_tick(curr, cpu); + #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); -- cgit v1.2.3 From f38b082081bf69a06fffb8b32a175999e2320c5b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 2 Jun 2009 21:05:16 +1000 Subject: perf_counter: Initialize per-cpu context earlier on cpu up This arranges for perf_counter's notifier for cpu hotplug operations to be called earlier than the migration notifier in sched.c by increasing its priority to 20, compared to the 10 for the migration notifier. The reason for doing this is that a subsequent commit to convert the cpu migration counter to use the generic swcounter infrastructure will add a call into the perf_counter subsystem when tasks get migrated. Therefore the perf_counter subsystem needs a chance to initialize its per-cpu data for the new cpu before it can get called from the migration code. This also adds a comment to the migration notifier noting that its priority needs to be lower than that of the perf_counter notifier. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <18981.1900.792795.836858@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ad079f07c9c8..3226cc132e9f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7319,8 +7319,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -/* Register at highest priority so that task migration (migrate_all_tasks) - * happens before everything else. +/* + * Register at high priority so that task migration (migrate_all_tasks) + * happens before everything else. This has to be lower priority than + * the notifier in the perf_counter subsystem, though. */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, -- cgit v1.2.3 From 3f731ca60afc29f5bcdb5fd2a04391466313a9ac Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:52:30 +1000 Subject: perf_counter: Fix cpu migration counter This fixes the cpu migration software counter to count correctly even when contexts get swapped from one task to another. Previously the cpu migration counts reported by perf stat were bogus, ranging from negative to several thousand for a single "lat_ctx 2 8 32" run. With this patch the cpu migration count reported for "lat_ctx 2 8 32" is almost always between 35 and 44. This fixes the problem by adding a call into the perf_counter code from set_task_cpu when tasks are migrated. This enables us to use the generic swcounter code (with some modifications) for the cpu migration counter. This modifies the swcounter code to allow a NULL regs pointer to be passed in to perf_swcounter_ctx_event() etc. The cpu migration counter does this because there isn't necessarily a pt_regs struct for the task available. In this case, the counter will not have interrupt capability - but the migration counter didn't have interrupt capability before, so this is no loss. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.35006.819769.416327@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3226cc132e9f..8d43347a0c0d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1977,6 +1977,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif + perf_counter_task_migration(p, new_cpu); } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; -- cgit v1.2.3 From 36b7b6d465489c4754c4fd66fcec6086eba87896 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 10 Jun 2009 23:42:36 +0300 Subject: sched: use kzalloc() instead of the bootmem allocator Now that kmem_cache_init() happens before sched_init(), we should use kzalloc() and not the bootmem allocator. Signed-off-by: Pekka Enberg --- kernel/sched.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 14c447ae5d53..a9ff9533355b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -68,7 +68,6 @@ #include #include #include -#include #include #include #include @@ -7782,21 +7781,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) { + gfp_t gfp = GFP_KERNEL; + memset(rd, 0, sizeof(*rd)); - if (bootmem) { - alloc_bootmem_cpumask_var(&def_root_domain.span); - alloc_bootmem_cpumask_var(&def_root_domain.online); - alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); - cpupri_init(&rd->cpupri, true); - return 0; - } + if (bootmem) + gfp = GFP_NOWAIT; - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->span, gfp)) goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->online, gfp)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->rto_mask, gfp)) goto free_online; if (cpupri_init(&rd->cpupri, false) != 0) @@ -9123,7 +9119,7 @@ void __init sched_init(void) * we use alloc_bootmem(). */ if (alloc_size) { - ptr = (unsigned long)alloc_bootmem(alloc_size); + ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.se = (struct sched_entity **)ptr; -- cgit v1.2.3 From 4bdddf8ff9bbb8aa7b4d7847586202bd25842c90 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 11 Jun 2009 08:35:27 +0300 Subject: sched: use alloc_cpumask_var() instead of alloc_bootmem_cpumask_var() Slab is initialized when sched_init() runs now so lets use alloc_cpumask_var(). Cc: Ingo Molnar Cc: Linus Torvalds Cc: Yinghai Lu Signed-off-by: Pekka Enberg --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a9ff9533355b..12cc09cf5147 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9310,13 +9310,13 @@ void __init sched_init(void) current->sched_class = &fair_sched_class; /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - alloc_bootmem_cpumask_var(&nohz_cpu_mask); + alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - alloc_bootmem_cpumask_var(&nohz.cpu_mask); - alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask); + alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - alloc_bootmem_cpumask_var(&cpu_isolated_map); + alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ scheduler_running = 1; -- cgit v1.2.3 From 0fb530291621c8b8a1b16abeeab05d9262489f71 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 11 Jun 2009 08:41:22 +0300 Subject: sched: use slab in cpupri_init() Lets not use the bootmem allocator in cpupri_init() as slab is already up when it is run. Cc: Ingo Molnar Cc: Linus Torvalds Cc: Yinghai Lu Signed-off-by: Pekka Enberg --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 12cc09cf5147..dcf2dc28931a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7795,7 +7795,7 @@ static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) if (!alloc_cpumask_var(&rd->rto_mask, gfp)) goto free_online; - if (cpupri_init(&rd->cpupri, false) != 0) + if (cpupri_init(&rd->cpupri, bootmem) != 0) goto free_rto_mask; return 0; -- cgit v1.2.3 From b43e352139f51216a8c56b0bd5fc3d4e05c65619 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:00 -0600 Subject: sched: export kick_process lguest needs kick_process: wake_up_process() does nothing if a process is running, which isn't sufficient (we need it in the kernel). And lguest support is usually modular. Signed-off-by: Rusty Russell Cc: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f04aa9664504..8ec9d13140be 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2192,6 +2192,7 @@ void kick_process(struct task_struct *p) smp_send_reschedule(cpu); preempt_enable(); } +EXPORT_SYMBOL_GPL(kick_process); /* * Return a low guess at the load of a migration-source cpu weighted -- cgit v1.2.3 From fd5e1b5dbaa8b4aacc0e251d74182eda37062194 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 15 Jun 2009 13:34:19 +0800 Subject: sched: Remove unneeded __ref tag Those two functions no longer call alloc_bootmmem_cpumask_var(), so no need to tag them with __init_refok. Signed-off-by: Li Zefan Acked-by: Pekka Enberg LKML-Reference: <4A35DD5B.9050106@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8fb88a906aaa..00567959ab17 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7828,7 +7828,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) free_rootdomain(old_rd); } -static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd, bool bootmem) { gfp_t gfp = GFP_KERNEL; -- cgit v1.2.3 From 371cbb387e33651b4c1326457116568ff01ac422 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 17 Jun 2009 16:27:45 -0700 Subject: kthreads: simplify migration_thread() exit path Now that kthread_stop() can be used even if the task has already exited, we can kill the "wait_to_die:" loop in migration_thread(). But we must pin rq->migration_thread after creation. Actually, I don't think CPU_UP_CANCELED or CPU_DEAD should wait for ->migration_thread exit. Perhaps we can simplify this code a bit more. migration_call() can set ->should_stop and forget about this thread. But we need a new helper in kthred.c for that. Signed-off-by: Oleg Nesterov Cc: Christoph Hellwig Cc: "Eric W. Biederman" Cc: Ingo Molnar Cc: Pavel Emelyanov Cc: Rusty Russell Cc: Vitaliy Gusev Signed-off-by: Linus Torvalds --- kernel/sched.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8fb88a906aaa..247fd0fedd0b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7045,7 +7045,7 @@ static int migration_thread(void *data) if (cpu_is_offline(cpu)) { spin_unlock_irq(&rq->lock); - goto wait_to_die; + break; } if (rq->active_balance) { @@ -7071,16 +7071,7 @@ static int migration_thread(void *data) complete(&req->done); } __set_current_state(TASK_RUNNING); - return 0; -wait_to_die: - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); return 0; } @@ -7494,6 +7485,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq = task_rq_lock(p, &flags); __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); task_rq_unlock(rq, &flags); + get_task_struct(p); cpu_rq(cpu)->migration_thread = p; break; @@ -7524,6 +7516,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) kthread_bind(cpu_rq(cpu)->migration_thread, cpumask_any(cpu_online_mask)); kthread_stop(cpu_rq(cpu)->migration_thread); + put_task_struct(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; @@ -7533,6 +7526,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); + put_task_struct(rq->migration_thread); rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ spin_lock_irq(&rq->lock); -- cgit v1.2.3 From e5289d4a181fb6c0b7a7607649af2ffdc491335c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 Jun 2009 13:22:51 +0200 Subject: perf_counter: Simplify and fix task migration counting The task migrations counter was causing rare and hard to decypher memory corruptions under load. After a day of debugging and bisection we found that the problem was introduced with: 3f731ca: perf_counter: Fix cpu migration counter Turning them off fixes the crashes. Incidentally, the whole perf_counter_task_migration() logic can be done simpler as well, by injecting a proper sw-counter event. This cleanup also fixed the crashes. The precise failure mode is not completely clear yet, but we are clearly not unhappy about having a fix ;-) Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8fb88a906aaa..f46540b359c0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1978,7 +1978,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif - perf_counter_task_migration(p, new_cpu); + perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + 1, 1, NULL, 0); } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; -- cgit v1.2.3