diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/acct.c | 23 | ||||
-rw-r--r-- | kernel/audit.c | 11 | ||||
-rw-r--r-- | kernel/marker.c | 31 | ||||
-rw-r--r-- | kernel/printk.c | 83 | ||||
-rw-r--r-- | kernel/relay.c | 5 | ||||
-rw-r--r-- | kernel/sched.c | 28 | ||||
-rw-r--r-- | kernel/sched_debug.c | 1 | ||||
-rw-r--r-- | kernel/sched_fair.c | 205 | ||||
-rw-r--r-- | kernel/time/clocksource.c | 14 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 4 |
10 files changed, 236 insertions, 169 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index 521dfa53cb99..91e1cfd734d2 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -58,6 +58,7 @@ #include <asm/uaccess.h> #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ +#include <linux/pid_namespace.h> /* * These constants control the amount of freespace that suspend and @@ -74,7 +75,7 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(struct file *); +static void do_acct_process(struct pid_namespace *ns, struct file *); /* * This structure is used so that all the data protected by lock @@ -86,6 +87,7 @@ struct acct_glbs { volatile int active; volatile int needcheck; struct file *file; + struct pid_namespace *ns; struct timer_list timer; }; @@ -175,9 +177,11 @@ out: static void acct_file_reopen(struct file *file) { struct file *old_acct = NULL; + struct pid_namespace *old_ns = NULL; if (acct_globals.file) { old_acct = acct_globals.file; + old_ns = acct_globals.ns; del_timer(&acct_globals.timer); acct_globals.active = 0; acct_globals.needcheck = 0; @@ -185,6 +189,7 @@ static void acct_file_reopen(struct file *file) } if (file) { acct_globals.file = file; + acct_globals.ns = get_pid_ns(task_active_pid_ns(current)); acct_globals.needcheck = 0; acct_globals.active = 1; /* It's been deleted if it was used before so this is safe */ @@ -196,8 +201,9 @@ static void acct_file_reopen(struct file *file) if (old_acct) { mnt_unpin(old_acct->f_path.mnt); spin_unlock(&acct_globals.lock); - do_acct_process(old_acct); + do_acct_process(old_ns, old_acct); filp_close(old_acct, NULL); + put_pid_ns(old_ns); spin_lock(&acct_globals.lock); } } @@ -419,7 +425,7 @@ static u32 encode_float(u64 value) /* * do_acct_process does all actual work. Caller holds the reference to file. */ -static void do_acct_process(struct file *file) +static void do_acct_process(struct pid_namespace *ns, struct file *file) { struct pacct_struct *pacct = ¤t->signal->pacct; acct_t ac; @@ -481,8 +487,10 @@ static void do_acct_process(struct file *file) ac.ac_gid16 = current->gid; #endif #if ACCT_VERSION==3 - ac.ac_pid = current->tgid; - ac.ac_ppid = current->real_parent->tgid; + ac.ac_pid = task_tgid_nr_ns(current, ns); + rcu_read_lock(); + ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); + rcu_read_unlock(); #endif spin_lock_irq(¤t->sighand->siglock); @@ -578,6 +586,7 @@ void acct_collect(long exitcode, int group_dead) void acct_process(void) { struct file *file = NULL; + struct pid_namespace *ns; /* * accelerate the common fastpath: @@ -592,8 +601,10 @@ void acct_process(void) return; } get_file(file); + ns = get_pid_ns(acct_globals.ns); spin_unlock(&acct_globals.lock); - do_acct_process(file); + do_acct_process(ns, file); fput(file); + put_pid_ns(ns); } diff --git a/kernel/audit.c b/kernel/audit.c index 10c4930c2bbf..be55cb503633 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -78,9 +78,13 @@ static int audit_default; /* If auditing cannot proceed, audit_failure selects what happens. */ static int audit_failure = AUDIT_FAIL_PRINTK; -/* If audit records are to be written to the netlink socket, audit_pid - * contains the (non-zero) pid. */ +/* + * If audit records are to be written to the netlink socket, audit_pid + * contains the pid of the auditd process and audit_nlk_pid contains + * the pid to use to send netlink messages to that process. + */ int audit_pid; +static int audit_nlk_pid; /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in @@ -350,7 +354,7 @@ static int kauditd_thread(void *dummy) wake_up(&audit_backlog_wait); if (skb) { if (audit_pid) { - int err = netlink_unicast(audit_sock, skb, audit_pid, 0); + int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); if (err < 0) { BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); @@ -626,6 +630,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sid, 1); audit_pid = new_pid; + audit_nlk_pid = NETLINK_CB(skb).pid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) err = audit_set_rate_limit(status_get->rate_limit, diff --git a/kernel/marker.c b/kernel/marker.c index 48a4ea5afffd..041c33e3e95c 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -104,18 +104,18 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, char ptype; /* - * disabling preemption to make sure the teardown of the callbacks can - * be done correctly when they are in modules and they insure RCU read - * coherency. + * preempt_disable does two things : disabling preemption to make sure + * the teardown of the callbacks can be done correctly when they are in + * modules and they insure RCU read coherency. */ preempt_disable(); - ptype = ACCESS_ONCE(mdata->ptype); + ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; /* Must read the ptype before ptr. They are not data dependant, * so we put an explicit smp_rmb() here. */ smp_rmb(); - func = ACCESS_ONCE(mdata->single.func); + func = mdata->single.func; /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. */ smp_rmb(); @@ -133,7 +133,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = ACCESS_ONCE(mdata->multi); + multi = mdata->multi; for (i = 0; multi[i].func; i++) { va_start(args, fmt); multi[i].func(multi[i].probe_private, call_private, fmt, @@ -161,13 +161,13 @@ void marker_probe_cb_noarg(const struct marker *mdata, char ptype; preempt_disable(); - ptype = ACCESS_ONCE(mdata->ptype); + ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; /* Must read the ptype before ptr. They are not data dependant, * so we put an explicit smp_rmb() here. */ smp_rmb(); - func = ACCESS_ONCE(mdata->single.func); + func = mdata->single.func; /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. */ smp_rmb(); @@ -183,7 +183,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = ACCESS_ONCE(mdata->multi); + multi = mdata->multi; for (i = 0; multi[i].func; i++) multi[i].func(multi[i].probe_private, call_private, fmt, &args); @@ -551,9 +551,9 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, /* * Disable a marker and its probe callback. - * Note: only after a synchronize_sched() issued after setting elem->call to the - * empty function insures that the original callback is not used anymore. This - * insured by preemption disabling around the call site. + * Note: only waiting an RCU period after setting elem->call to the empty + * function insures that the original callback is not used anymore. This insured + * by preempt_disable around the call site. */ static void disable_marker(struct marker *elem) { @@ -565,8 +565,8 @@ static void disable_marker(struct marker *elem) elem->ptype = 0; /* single probe */ /* * Leave the private data and id there, because removal is racy and - * should be done only after a synchronize_sched(). These are never used - * until the next initialization anyway. + * should be done only after an RCU period. These are never used until + * the next initialization anyway. */ } @@ -601,9 +601,6 @@ void marker_update_probe_range(struct marker *begin, /* * Update probes, removing the faulty probes. - * Issues a synchronize_sched() when no reference to the module passed - * as parameter is found in the probes so the probe module can be - * safely unloaded from now on. * * Internal callback only changed before the first probe is connected to it. * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 diff --git a/kernel/printk.c b/kernel/printk.c index 9adc2a473e6e..c46a20a19a15 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -616,6 +616,40 @@ asmlinkage int printk(const char *fmt, ...) /* cpu currently holding logbuf_lock */ static volatile unsigned int printk_cpu = UINT_MAX; +/* + * Can we actually use the console at this time on this cpu? + * + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. + */ +static inline int can_use_console(unsigned int cpu) +{ + return cpu_online(cpu) || have_callable_console(); +} + +/* + * Try to get console ownership to actually show the kernel + * messages from a 'printk'. Return true (and with the + * console_semaphore held, and 'console_locked' set) if it + * is successful, false otherwise. + * + * This gets called with the 'logbuf_lock' spinlock held and + * interrupts disabled. It should return with 'lockbuf_lock' + * released but interrupts still disabled. + */ +static int acquire_console_semaphore_for_printk(unsigned int cpu) +{ + int retval = 0; + + if (can_use_console(cpu)) + retval = !try_acquire_console_sem(); + printk_cpu = UINT_MAX; + spin_unlock(&logbuf_lock); + return retval; +} + const char printk_recursion_bug_msg [] = KERN_CRIT "BUG: recent printk recursion!\n"; static int printk_recursion_bug; @@ -725,43 +759,22 @@ asmlinkage int vprintk(const char *fmt, va_list args) log_level_unknown = 1; } - if (!down_trylock(&console_sem)) { - /* - * We own the drivers. We can drop the spinlock and - * let release_console_sem() print the text, maybe ... - */ - console_locked = 1; - printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); + /* + * Try to acquire and then immediately release the + * console semaphore. The release will do all the + * actual magic (print out buffers, wake up klogd, + * etc). + * + * The acquire_console_semaphore_for_printk() function + * will release 'logbuf_lock' regardless of whether it + * actually gets the semaphore or not. + */ + if (acquire_console_semaphore_for_printk(this_cpu)) + release_console_sem(); - /* - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. - */ - if (cpu_online(smp_processor_id()) || have_callable_console()) { - console_may_schedule = 0; - release_console_sem(); - } else { - /* Release by hand to avoid flushing the buffer. */ - console_locked = 0; - up(&console_sem); - } - lockdep_on(); - raw_local_irq_restore(flags); - } else { - /* - * Someone else owns the drivers. We drop the spinlock, which - * allows the semaphore holder to proceed and to call the - * console drivers with the output which we just produced. - */ - printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); - lockdep_on(); + lockdep_on(); out_restore_irqs: - raw_local_irq_restore(flags); - } + raw_local_irq_restore(flags); preempt_enable(); return printed_len; diff --git a/kernel/relay.c b/kernel/relay.c index d080b9d161a7..4c035a8a248c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1066,7 +1066,7 @@ static int subbuf_splice_actor(struct file *in, unsigned int flags, int *nonpad_ret) { - unsigned int pidx, poff, total_len, subbuf_pages, ret; + unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; struct rchan_buf *rbuf = in->private_data; unsigned int subbuf_size = rbuf->chan->subbuf_size; uint64_t pos = (uint64_t) *ppos; @@ -1097,8 +1097,9 @@ static int subbuf_splice_actor(struct file *in, subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; + nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); - for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) { + for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; unsigned int cur_pos = read_start + total_len; diff --git a/kernel/sched.c b/kernel/sched.c index d1ad69b270ca..28c73f07efb2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -594,18 +594,14 @@ enum { SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, SCHED_FEAT_WAKEUP_PREEMPT = 2, SCHED_FEAT_START_DEBIT = 4, - SCHED_FEAT_TREE_AVG = 8, - SCHED_FEAT_APPROX_AVG = 16, - SCHED_FEAT_HRTICK = 32, - SCHED_FEAT_DOUBLE_TICK = 64, + SCHED_FEAT_HRTICK = 8, + SCHED_FEAT_DOUBLE_TICK = 16, }; const_debug unsigned int sysctl_sched_features = SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | SCHED_FEAT_START_DEBIT * 1 | - SCHED_FEAT_TREE_AVG * 0 | - SCHED_FEAT_APPROX_AVG * 0 | SCHED_FEAT_HRTICK * 1 | SCHED_FEAT_DOUBLE_TICK * 0; @@ -1396,6 +1392,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; + /* + * Buddy candidates are cache hot: + */ + if (&p->se == cfs_rq_of(&p->se)->next) + return 1; + if (p->sched_class != &fair_sched_class) return 0; @@ -1855,10 +1857,11 @@ out_activate: schedstat_inc(p, se.nr_wakeups_remote); update_rq_clock(rq); activate_task(rq, p, 1); - check_preempt_curr(rq, p); success = 1; out_running: + check_preempt_curr(rq, p); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -1892,6 +1895,8 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.last_wakeup = 0; + p->se.avg_overlap = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -3877,7 +3882,7 @@ need_resched_nonpreemptible: if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) { + signal_pending(prev))) { prev->state = TASK_RUNNING; } else { deactivate_task(rq, prev, 1); @@ -6802,6 +6807,10 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ */ static cpumask_t fallback_doms; +void __attribute__((weak)) arch_update_cpu_topology(void) +{ +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -6811,6 +6820,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) { int err; + arch_update_cpu_topology(); ndoms_cur = 1; doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms_cur) @@ -6915,7 +6925,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static int arch_reinit_sched_domains(void) +int arch_reinit_sched_domains(void) { int err; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4b5e24cf2f4a..ef358ba07683 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_start); PN(se.vruntime); PN(se.sum_exec_runtime); + PN(se.avg_overlap); nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f2cc59080efa..86a93376282c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; /* * SCHED_OTHER wake-up granularity. - * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity = 10000000UL; +unsigned int sysctl_sched_wakeup_granularity = 5000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -302,11 +302,6 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) return vslice; } -static u64 sched_vslice(struct cfs_rq *cfs_rq) -{ - return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); -} - static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { return __sched_vslice(cfs_rq->load.weight + se->load.weight, @@ -504,15 +499,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } else vruntime = cfs_rq->min_vruntime; - if (sched_feat(TREE_AVG)) { - struct sched_entity *last = __pick_last_entity(cfs_rq); - if (last) { - vruntime += last->vruntime; - vruntime >>= 1; - } - } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) - vruntime += sched_vslice(cfs_rq)/2; - /* * The 'current' period is already promised to the current tasks, * however the extra weight of the new task will slow them down a @@ -556,6 +542,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) account_entity_enqueue(cfs_rq, se); } +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + +static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (!se->last_wakeup) + return; + + update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); + se->last_wakeup = 0; +} + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -566,6 +567,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { + update_avg_stats(cfs_rq, se); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -980,96 +982,121 @@ static inline int wake_idle(int cpu, struct task_struct *p) #endif #ifdef CONFIG_SMP -static int select_task_rq_fair(struct task_struct *p, int sync) + +static const struct sched_class fair_sched_class; + +static int +wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, + struct task_struct *p, int prev_cpu, int this_cpu, int sync, + int idx, unsigned long load, unsigned long this_load, + unsigned int imbalance) { - int cpu, this_cpu; - struct rq *rq; - struct sched_domain *sd, *this_sd = NULL; - int new_cpu; + struct task_struct *curr = this_rq->curr; + unsigned long tl = this_load; + unsigned long tl_per_task; + + if (!(this_sd->flags & SD_WAKE_AFFINE)) + return 0; + + /* + * If the currently running task will sleep within + * a reasonable amount of time then attract this newly + * woken task: + */ + if (sync && curr->sched_class == &fair_sched_class) { + if (curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + return 1; + } + + schedstat_inc(p, se.nr_wakeups_affine_attempts); + tl_per_task = cpu_avg_load_per_task(this_cpu); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) + tl -= current->se.load.weight; + + if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || + 100*(tl + p->se.load.weight) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(p, se.nr_wakeups_affine); - cpu = task_cpu(p); - rq = task_rq(p); - this_cpu = smp_processor_id(); - new_cpu = cpu; + return 1; + } + return 0; +} - if (cpu == this_cpu) - goto out_set_cpu; +static int select_task_rq_fair(struct task_struct *p, int sync) +{ + struct sched_domain *sd, *this_sd = NULL; + int prev_cpu, this_cpu, new_cpu; + unsigned long load, this_load; + struct rq *rq, *this_rq; + unsigned int imbalance; + int idx; + + prev_cpu = task_cpu(p); + rq = task_rq(p); + this_cpu = smp_processor_id(); + this_rq = cpu_rq(this_cpu); + new_cpu = prev_cpu; + /* + * 'this_sd' is the first domain that both + * this_cpu and prev_cpu are present in: + */ for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpu_isset(prev_cpu, sd->span)) { this_sd = sd; break; } } if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) - goto out_set_cpu; + goto out; /* * Check for affine wakeup and passive balancing possibilities. */ - if (this_sd) { - int idx = this_sd->wake_idx; - unsigned int imbalance; - unsigned long load, this_load; - - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - - load = source_load(cpu, idx); - this_load = target_load(this_cpu, idx); - - new_cpu = this_cpu; /* Wake to this CPU if we can */ - - if (this_sd->flags & SD_WAKE_AFFINE) { - unsigned long tl = this_load; - unsigned long tl_per_task; - - /* - * Attract cache-cold tasks on sync wakeups: - */ - if (sync && !task_hot(p, rq->clock, this_sd)) - goto out_set_cpu; - - schedstat_inc(p, se.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) - tl -= current->se.load.weight; - - if ((tl <= load && - tl + target_load(cpu, idx) <= tl_per_task) || - 100*(tl + p->se.load.weight) <= imbalance*load) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. - */ - schedstat_inc(this_sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); - goto out_set_cpu; - } - } + if (!this_sd) + goto out; - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - goto out_set_cpu; - } + idx = this_sd->wake_idx; + + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + + load = source_load(prev_cpu, idx); + this_load = target_load(this_cpu, idx); + + if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + load, this_load, imbalance)) + return this_cpu; + + if (prev_cpu == this_cpu) + goto out; + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); + schedstat_inc(p, se.nr_wakeups_passive); + return this_cpu; } } - new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ -out_set_cpu: +out: return wake_idle(new_cpu, p); } #endif /* CONFIG_SMP */ @@ -1092,6 +1119,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) return; } + se->last_wakeup = se->sum_exec_runtime; + if (unlikely(se == pse)) + return; + cfs_rq_of(pse)->next = pse; /* diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 548c436a776b..278534bbca95 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -141,13 +141,8 @@ static void clocksource_watchdog(unsigned long data) } if (!list_empty(&watchdog_list)) { - /* Cycle through CPUs to check if the CPUs stay synchronized to - * each other. */ - int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); - if (next_cpu >= NR_CPUS) - next_cpu = first_cpu(cpu_online_map); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); + __mod_timer(&watchdog_timer, + watchdog_timer.expires + WATCHDOG_INTERVAL); } spin_unlock(&watchdog_lock); } @@ -169,7 +164,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (!started && watchdog) { watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, first_cpu(cpu_online_map)); + add_timer(&watchdog_timer); } } else { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) @@ -190,8 +185,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); + add_timer(&watchdog_timer); } } } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 671af612b768..a3fa587c350c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -191,8 +191,12 @@ static void change_clocksource(void) tick_clock_notify(); + /* + * We're holding xtime lock and waking up klogd would deadlock + * us on enqueue. So no printing! printk(KERN_INFO "Time: %s clocksource has been installed.\n", clock->name); + */ } #else static inline void change_clocksource(void) { } |