From 781b0203423c228b100aaaf169c77b2b556f8a49 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sat, 4 Jul 2015 09:06:32 +0200
Subject: sched, sysctl: Delete an unnecessary check before
 unregister_sysctl_table()

The unregister_sysctl_table() function tests whether its argument is NULL
and then returns immediately. Thus the test around the call is not needed.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/5597877E.3060503@users.sourceforge.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b4bad10081..48be7dc3d497 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5311,8 +5311,7 @@ static void register_sched_domain_sysctl(void)
 /* may be called multiple times per register */
 static void unregister_sched_domain_sysctl(void)
 {
-	if (sd_sysctl_header)
-		unregister_sysctl_table(sd_sysctl_header);
+	unregister_sysctl_table(sd_sysctl_header);
 	sd_sysctl_header = NULL;
 	if (sd_ctl_dir[0].child)
 		sd_free_ctl_entry(&sd_ctl_dir[0].child);
-- 
cgit v1.2.3


From fbd705a0c6184580d0e2fbcbd47a37b6e5822511 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 9 Jun 2015 11:13:36 +0200
Subject: sched: Introduce the 'trace_sched_waking' tracepoint

Mathieu reported that since 317f394160e9 ("sched: Move the second half
of ttwu() to the remote cpu") trace_sched_wakeup() can happen out of
context of the waker.

This is a problem when you want to analyse wakeup paths because it is
now very hard to correlate the wakeup event to whoever issued the
wakeup.

OTOH trace_sched_wakeup() is issued at the point where we set
p->state = TASK_RUNNING, which is right were we hand the task off to
the scheduler, so this is an important point when looking at
scheduling behaviour, up to here its been the wakeup path everything
hereafter is due to scheduler policy.

To bridge this gap, introduce a second tracepoint: trace_sched_waking.
It is guaranteed to be called in the waker context.

Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Francis Giraldeau <francis.giraldeau@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150609091336.GQ3644@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 48be7dc3d497..fa5826cc612f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1654,9 +1654,9 @@ static void
 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	check_preempt_curr(rq, p, wake_flags);
-	trace_sched_wakeup(p, true);
-
 	p->state = TASK_RUNNING;
+	trace_sched_wakeup(p);
+
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken) {
 		/*
@@ -1874,6 +1874,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	if (!(p->state & state))
 		goto out;
 
+	trace_sched_waking(p);
+
 	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
 
@@ -1949,6 +1951,8 @@ static void try_to_wake_up_local(struct task_struct *p)
 	if (!(p->state & TASK_NORMAL))
 		goto out;
 
+	trace_sched_waking(p);
+
 	if (!task_on_rq_queued(p))
 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
@@ -2307,7 +2311,7 @@ void wake_up_new_task(struct task_struct *p)
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
-	trace_sched_wakeup_new(p, true);
+	trace_sched_wakeup_new(p);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken)
-- 
cgit v1.2.3


From fe32d3cd5e8eb0f82e459763374aa80797023403 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Wed, 15 Jul 2015 12:52:04 +0300
Subject: sched/preempt: Fix cond_resched_lock() and cond_resched_softirq()

These functions check should_resched() before unlocking spinlock/bh-enable:
preempt_count always non-zero => should_resched() always returns false.
cond_resched_lock() worked iff spin_needbreak is set.

This patch adds argument "preempt_offset" to should_resched().

preempt_count offset constants for that:

  PREEMPT_DISABLE_OFFSET  - offset after preempt_disable()
  PREEMPT_LOCK_OFFSET     - offset after spin_lock()
  SOFTIRQ_DISABLE_OFFSET  - offset after local_bh_distable()
  SOFTIRQ_LOCK_OFFSET     - offset after spin_lock_bh()

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Graf <agraf@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: bdb438065890 ("sched: Extract the basic add/sub preempt_count modifiers")
Link: http://lkml.kernel.org/r/20150715095204.12246.98268.stgit@buzz
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa5826cc612f..f5fad2b12baf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4496,7 +4496,7 @@ SYSCALL_DEFINE0(sched_yield)
 
 int __sched _cond_resched(void)
 {
-	if (should_resched()) {
+	if (should_resched(0)) {
 		preempt_schedule_common();
 		return 1;
 	}
@@ -4514,7 +4514,7 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int __cond_resched_lock(spinlock_t *lock)
 {
-	int resched = should_resched();
+	int resched = should_resched(PREEMPT_LOCK_OFFSET);
 	int ret = 0;
 
 	lockdep_assert_held(lock);
@@ -4536,7 +4536,7 @@ int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
-	if (should_resched()) {
+	if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
 		local_bh_enable();
 		preempt_schedule_common();
 		local_bh_disable();
-- 
cgit v1.2.3


From 9d89c257dfb9c51a532d69397f6eed75e5168c35 Mon Sep 17 00:00:00 2001
From: Yuyang Du <yuyang.du@intel.com>
Date: Wed, 15 Jul 2015 08:04:37 +0800
Subject: sched/fair: Rewrite runnable load and utilization average tracking

The idea of runnable load average (let runnable time contribute to weight)
was proposed by Paul Turner and Ben Segall, and it is still followed by
this rewrite. This rewrite aims to solve the following issues:

1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is
   updated at the granularity of an entity at a time, which results in the
   cfs_rq's load average is stale or partially updated: at any time, only
   one entity is up to date, all other entities are effectively lagging
   behind. This is undesirable.

   To illustrate, if we have n runnable entities in the cfs_rq, as time
   elapses, they certainly become outdated:

     t0: cfs_rq { e1_old, e2_old, ..., en_old }

   and when we update:

     t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }

     t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }

     ...

   We solve this by combining all runnable entities' load averages together
   in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based
   on the fact that if we regard the update as a function, then:

   w * update(e) = update(w * e) and

   update(e1) + update(e2) = update(e1 + e2), then

   w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)

   therefore, by this rewrite, we have an entirely updated cfs_rq at the
   time we update it:

     t1: update cfs_rq { e1_new, e2_new, ..., en_new }

     t2: update cfs_rq { e1_new, e2_new, ..., en_new }

     ...

2. cfs_rq's load average is different between top rq->cfs_rq and other
   task_group's per CPU cfs_rqs in whether or not blocked_load_average
   contributes to the load.

   The basic idea behind runnable load average (the same for utilization)
   is that the blocked state is taken into account as opposed to only
   accounting for the currently runnable state. Therefore, the average
   should include both the runnable/running and blocked load averages.
   This rewrite does that.

   In addition, we also combine runnable/running and blocked averages
   of all entities into the cfs_rq's average, and update it together at
   once. This is based on the fact that:

     update(runnable) + update(blocked) = update(runnable + blocked)

   This significantly reduces the code as we don't need to separately
   maintain/update runnable/running load and blocked load.

3. How task_group entities' share is calculated is complex and imprecise.

   We reduce the complexity in this rewrite to allow a very simple rule:
   the task_group's load_avg is aggregated from its per CPU cfs_rqs's
   load_avgs. Then group entity's weight is simply proportional to its
   own cfs_rq's load_avg / task_group's load_avg. To illustrate,

   if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then,

   task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then

   cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share

To sum up, this rewrite in principle is equivalent to the current one, but
fixes the issues described above. Turns out, it significantly reduces the
code complexity and hence increases clarity and efficiency. In addition,
the new averages are more smooth/continuous (no spurious spikes and valleys)
and updated more consistently and quickly to reflect the load dynamics.

As a result, we have less load tracking overhead, better performance,
and especially better power efficiency due to more balanced load.

Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f5fad2b12baf..3981526539c5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2020,9 +2020,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
-#ifdef CONFIG_SMP
-	p->se.avg.decay_count		= 0;
-#endif
 	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
-- 
cgit v1.2.3


From 540247fb5ddf6d2364f90387fa1f8f428d15e683 Mon Sep 17 00:00:00 2001
From: Yuyang Du <yuyang.du@intel.com>
Date: Wed, 15 Jul 2015 08:04:39 +0800
Subject: sched/fair: Init cfs_rq's sched_entity load average

The runnable load and utilization averages of cfs_rq's sched_entity
were not initiated. Like done to a task, give new cfs_rq' sched_entity
start values to heavy its load in infant time.

Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-5-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3981526539c5..5ca9ae0b7e31 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2304,7 +2304,7 @@ void wake_up_new_task(struct task_struct *p)
 #endif
 
 	/* Initialize new task's runnable average */
-	init_task_runnable_average(p);
+	init_entity_runnable_average(&p->se);
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
-- 
cgit v1.2.3


From 3c8e4793556981a7f532599959aa3303968056f0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 29 Jul 2015 17:31:47 +0200
Subject: sched: Remove finish_arch_switch()

One less arch hook..

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ca9ae0b7e31..b11f6240709b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2470,7 +2470,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	vtime_task_switch(prev);
-	finish_arch_switch(prev);
 	perf_event_task_sched_in(prev, current);
 	finish_lock_switch(rq, prev);
 	finish_arch_post_lock_switch();
-- 
cgit v1.2.3


From e237882b8f83dd1a0eece1608bcb689d4f4b221b Mon Sep 17 00:00:00 2001
From: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Date: Mon, 10 Aug 2015 20:20:48 -0500
Subject: sched/numa: Fix NUMA_DIRECT topology identification

Systems which have all nodes at a distance of at most 1 hop should be
identified as 'NUMA_DIRECT'.

However, the scheduler incorrectly identifies it as 'NUMA_BACKPLANE'.
This is because 'n' is assigned to sched_max_numa_distance but the
code (mis)interprets it to mean 'number of hops'.

Rik had actually used sched_domains_numa_levels for detecting a
'NUMA_DIRECT' topology:

  http://marc.info/?l=linux-kernel&m=141279712429834&w=2

But that was changed when he removed the hops table in the
subsequent version:

  http://marc.info/?l=linux-kernel&m=141353106106771&w=2

Fixing the issue here.

Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1439256048-3748-1-git-send-email-Aravind.Gopalakrishnan@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b11f6240709b..ea6d74345e60 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6444,8 +6444,10 @@ static void init_numa_topology_type(void)
 
 	n = sched_max_numa_distance;
 
-	if (n <= 1)
+	if (sched_domains_numa_levels <= 1) {
 		sched_numa_topology_type = NUMA_DIRECT;
+		return;
+	}
 
 	for_each_online_node(a) {
 		for_each_online_node(b) {
-- 
cgit v1.2.3


From 25834c73f93af7f0712c98ca4593691592e6b360 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 May 2015 17:43:34 +0200
Subject: sched: Fix a race between __kthread_bind() and sched_setaffinity()

Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY
without locks, a caller might observe an old value and race with the
set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo
it:

	__kthread_bind()
	  do_set_cpus_allowed()
						<SYSCALL>
						  sched_setaffinity()
						    if (p->flags & PF_NO_SETAFFINITIY)
						    set_cpus_allowed_ptr()
	  p->flags |= PF_NO_SETAFFINITY

Fix the bug by putting everything under the regular scheduler locks.

This also closes a hole in the serialization of task_struct::{nr_,}cpus_allowed.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dedekind1@gmail.com
Cc: juri.lelli@arm.com
Cc: mgorman@suse.de
Cc: riel@redhat.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/20150515154833.545640346@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ea6d74345e60..2e3b983da836 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1153,6 +1153,8 @@ static int migration_cpu_stop(void *data)
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
+	lockdep_assert_held(&p->pi_lock);
+
 	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 
@@ -1169,7 +1171,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check)
 {
 	unsigned long flags;
 	struct rq *rq;
@@ -1178,6 +1181,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 
 	rq = task_rq_lock(p, &flags);
 
+	/*
+	 * Must re-check here, to close a race against __kthread_bind(),
+	 * sched_setaffinity() is not guaranteed to observe the flag.
+	 */
+	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (cpumask_equal(&p->cpus_allowed, new_mask))
 		goto out;
 
@@ -1214,6 +1226,11 @@ out:
 
 	return ret;
 }
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	return __set_cpus_allowed_ptr(p, new_mask, false);
+}
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -1595,6 +1612,15 @@ static void update_avg(u64 *avg, u64 sample)
 	s64 diff = sample - *avg;
 	*avg += diff >> 3;
 }
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+					 const struct cpumask *new_mask, bool check)
+{
+	return set_cpus_allowed_ptr(p, new_mask);
+}
+
 #endif /* CONFIG_SMP */
 
 static void
@@ -4340,7 +4366,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	}
 #endif
 again:
-	retval = set_cpus_allowed_ptr(p, new_mask);
+	retval = __set_cpus_allowed_ptr(p, new_mask, true);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -4865,7 +4891,8 @@ void init_idle(struct task_struct *idle, int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	raw_spin_lock_irqsave(&rq->lock, flags);
+	raw_spin_lock_irqsave(&idle->pi_lock, flags);
+	raw_spin_lock(&rq->lock);
 
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
@@ -4891,7 +4918,8 @@ void init_idle(struct task_struct *idle, int cpu)
 #if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
-- 
cgit v1.2.3


From c5b2803840817115e9b568d5054e5007ae36176b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 May 2015 17:43:35 +0200
Subject: sched: Make sched_class::set_cpus_allowed() unconditional

Give every class a set_cpus_allowed() method, this enables some small
optimization in the RT,DL implementation by avoiding a double
cpumask_weight() call.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dedekind1@gmail.com
Cc: juri.lelli@arm.com
Cc: mgorman@suse.de
Cc: riel@redhat.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/20150515154833.614517487@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e3b983da836..740f90bdc67b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1151,17 +1151,22 @@ static int migration_cpu_stop(void *data)
 	return 0;
 }
 
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+/*
+ * sched_class::set_cpus_allowed must do the below, but is not required to
+ * actually call this function.
+ */
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
 {
-	lockdep_assert_held(&p->pi_lock);
-
-	if (p->sched_class->set_cpus_allowed)
-		p->sched_class->set_cpus_allowed(p, new_mask);
-
 	cpumask_copy(&p->cpus_allowed, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	lockdep_assert_held(&p->pi_lock);
+	p->sched_class->set_cpus_allowed(p, new_mask);
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
-- 
cgit v1.2.3


From 6c37067e27867db172b988cc11b9ff921175dee5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 May 2015 17:43:36 +0200
Subject: sched: Change the sched_class::set_cpus_allowed() calling context

Change the calling context of sched_class::set_cpus_allowed() such
that we can assume the task is inactive.

This allows us to easily make changes that affect accounting done by
enqueue/dequeue. This does in fact completely remove
set_cpus_allowed_rt() and greatly reduces set_cpus_allowed_dl().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dedekind1@gmail.com
Cc: juri.lelli@arm.com
Cc: mgorman@suse.de
Cc: riel@redhat.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/20150515154833.667516139@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 740f90bdc67b..9917c962be99 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1163,8 +1163,31 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
+	struct rq *rq = task_rq(p);
+	bool queued, running;
+
 	lockdep_assert_held(&p->pi_lock);
+
+	queued = task_on_rq_queued(p);
+	running = task_current(rq, p);
+
+	if (queued) {
+		/*
+		 * Because __kthread_bind() calls this on blocked tasks without
+		 * holding rq->lock.
+		 */
+		lockdep_assert_held(&rq->lock);
+		dequeue_task(rq, p, 0);
+	}
+	if (running)
+		put_prev_task(rq, p);
+
 	p->sched_class->set_cpus_allowed(p, new_mask);
+
+	if (running)
+		p->sched_class->set_curr_task(rq);
+	if (queued)
+		enqueue_task(rq, p, 0);
 }
 
 /*
-- 
cgit v1.2.3