From f4ecea309d3e17ba5e90082308125ad23bd5701b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 29 Jul 2015 17:28:11 -0700
Subject: rcu: Use rsp->expedited_wq instead of sync_rcu_preempt_exp_wq

Now that there is an ->expedited_wq waitqueue in each rcu_state structure,
there is no need for the sync_rcu_preempt_exp_wq global variable.  This
commit therefore substitutes ->expedited_wq for sync_rcu_preempt_exp_wq.
It also initializes ->expedited_wq only once at boot instead of at the
start of each expedited grace period.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 775d36cc0050..53d66ebb4811 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3556,7 +3556,6 @@ void synchronize_sched_expedited(void)
 	rcu_exp_gp_seq_start(rsp);
 
 	/* Stop each CPU that is online, non-idle, and not us. */
-	init_waitqueue_head(&rsp->expedited_wq);
 	atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */
 	for_each_online_cpu(cpu) {
 		struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
@@ -4179,6 +4178,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 	}
 
 	init_waitqueue_head(&rsp->gp_wq);
+	init_waitqueue_head(&rsp->expedited_wq);
 	rnp = rsp->level[rcu_num_lvls - 1];
 	for_each_possible_cpu(i) {
 		while (i > rnp->grphi)
-- 
cgit v1.2.3


From 7922cd0e562cb2b8da2c8a0afda2e1c9bb4a94e2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 31 Jul 2015 13:34:32 -0700
Subject: rcu: Move rcu_report_exp_rnp() to allow consolidation

This is a nearly pure code-movement commit, moving rcu_report_exp_rnp(),
sync_rcu_preempt_exp_done(), and rcu_preempted_readers_exp() so
that later commits can make synchronize_sched_expedited() use them.
The non-code-movement portion of this commit tags rcu_report_exp_rnp()
as __maybe_unused to avoid build errors when CONFIG_PREEMPT=n.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 53d66ebb4811..59af27d8bc6a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3379,6 +3379,72 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
 	return rcu_seq_done(&rsp->expedited_sequence, s);
 }
 
+/*
+ * Return non-zero if there are any tasks in RCU read-side critical
+ * sections blocking the current preemptible-RCU expedited grace period.
+ * If there is no preemptible-RCU expedited grace period currently in
+ * progress, returns zero unconditionally.
+ */
+static int rcu_preempted_readers_exp(struct rcu_node *rnp)
+{
+	return rnp->exp_tasks != NULL;
+}
+
+/*
+ * return non-zero if there is no RCU expedited grace period in progress
+ * for the specified rcu_node structure, in other words, if all CPUs and
+ * tasks covered by the specified rcu_node structure have done their bit
+ * for the current expedited grace period.  Works only for preemptible
+ * RCU -- other RCU implementation use other means.
+ *
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
+ */
+static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+{
+	return !rcu_preempted_readers_exp(rnp) &&
+	       READ_ONCE(rnp->expmask) == 0;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period.  This event is reported either to the rcu_node structure on
+ * which the task was queued or to one of that rcu_node structure's ancestors,
+ * recursively up the tree.  (Calm down, calm down, we do the recursion
+ * iteratively!)
+ *
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
+ */
+static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
+					      struct rcu_node *rnp, bool wake)
+{
+	unsigned long flags;
+	unsigned long mask;
+
+	raw_spin_lock_irqsave(&rnp->lock, flags);
+	smp_mb__after_unlock_lock();
+	for (;;) {
+		if (!sync_rcu_preempt_exp_done(rnp)) {
+			raw_spin_unlock_irqrestore(&rnp->lock, flags);
+			break;
+		}
+		if (rnp->parent == NULL) {
+			raw_spin_unlock_irqrestore(&rnp->lock, flags);
+			if (wake) {
+				smp_mb(); /* EGP done before wake_up(). */
+				wake_up(&rsp->expedited_wq);
+			}
+			break;
+		}
+		mask = rnp->grpmask;
+		raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
+		rnp = rnp->parent;
+		raw_spin_lock(&rnp->lock); /* irqs already disabled */
+		smp_mb__after_unlock_lock();
+		rnp->expmask &= ~mask;
+	}
+}
+
 /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
 static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
 			       struct rcu_data *rdp,
-- 
cgit v1.2.3


From b9585e940a0d78770cda8f9aebf81b17b4d19e6d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 31 Jul 2015 16:04:45 -0700
Subject: rcu: Consolidate tree setup for synchronize_rcu_expedited()

This commit replaces sync_rcu_preempt_exp_init1(() and
sync_rcu_preempt_exp_init2() with sync_exp_reset_tree_hotplug()
and sync_exp_reset_tree(), which will also be used by
synchronize_sched_expedited(), and sync_rcu_exp_select_nodes(), which
contains code specific to synchronize_rcu_expedited().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 85 insertions(+), 1 deletion(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 59af27d8bc6a..8526896afea7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3379,6 +3379,87 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
 	return rcu_seq_done(&rsp->expedited_sequence, s);
 }
 
+/*
+ * Reset the ->expmaskinit values in the rcu_node tree to reflect any
+ * recent CPU-online activity.  Note that these masks are not cleared
+ * when CPUs go offline, so they reflect the union of all CPUs that have
+ * ever been online.  This means that this function normally takes its
+ * no-work-to-do fastpath.
+ */
+static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
+{
+	bool done;
+	unsigned long flags;
+	unsigned long mask;
+	unsigned long oldmask;
+	int ncpus = READ_ONCE(rsp->ncpus);
+	struct rcu_node *rnp;
+	struct rcu_node *rnp_up;
+
+	/* If no new CPUs onlined since last time, nothing to do. */
+	if (likely(ncpus == rsp->ncpus_snap))
+		return;
+	rsp->ncpus_snap = ncpus;
+
+	/*
+	 * Each pass through the following loop propagates newly onlined
+	 * CPUs for the current rcu_node structure up the rcu_node tree.
+	 */
+	rcu_for_each_leaf_node(rsp, rnp) {
+		raw_spin_lock_irqsave(&rnp->lock, flags);
+		smp_mb__after_unlock_lock();
+		if (rnp->expmaskinit == rnp->expmaskinitnext) {
+			raw_spin_unlock_irqrestore(&rnp->lock, flags);
+			continue;  /* No new CPUs, nothing to do. */
+		}
+
+		/* Update this node's mask, track old value for propagation. */
+		oldmask = rnp->expmaskinit;
+		rnp->expmaskinit = rnp->expmaskinitnext;
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+		/* If was already nonzero, nothing to propagate. */
+		if (oldmask)
+			continue;
+
+		/* Propagate the new CPU up the tree. */
+		mask = rnp->grpmask;
+		rnp_up = rnp->parent;
+		done = false;
+		while (rnp_up) {
+			raw_spin_lock_irqsave(&rnp_up->lock, flags);
+			smp_mb__after_unlock_lock();
+			if (rnp_up->expmaskinit)
+				done = true;
+			rnp_up->expmaskinit |= mask;
+			raw_spin_unlock_irqrestore(&rnp_up->lock, flags);
+			if (done)
+				break;
+			mask = rnp_up->grpmask;
+			rnp_up = rnp_up->parent;
+		}
+	}
+}
+
+/*
+ * Reset the ->expmask values in the rcu_node tree in preparation for
+ * a new expedited grace period.
+ */
+static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_node *rnp;
+
+	sync_exp_reset_tree_hotplug(rsp);
+	rcu_for_each_node_breadth_first(rsp, rnp) {
+		raw_spin_lock_irqsave(&rnp->lock, flags);
+		smp_mb__after_unlock_lock();
+		WARN_ON_ONCE(rnp->expmask);
+		rnp->expmask = rnp->expmaskinit;
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+	}
+}
+
 /*
  * Return non-zero if there are any tasks in RCU read-side critical
  * sections blocking the current preemptible-RCU expedited grace period.
@@ -3971,7 +4052,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	rdp->beenonline = 1;	 /* We have now been online. */
 	rdp->qlen_last_fqs_check = 0;
 	rdp->n_force_qs_snap = rsp->n_force_qs;
 	rdp->blimit = blimit;
@@ -3993,6 +4073,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
 	smp_mb__after_unlock_lock();
 	rnp->qsmaskinitnext |= mask;
+	rnp->expmaskinitnext |= mask;
+	if (!rdp->beenonline)
+		WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
+	rdp->beenonline = true;	 /* We have now been online. */
 	rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
 	rdp->completed = rnp->completed;
 	rdp->passed_quiesce = false;
-- 
cgit v1.2.3


From 8203d6d0ee784cfb2ebf89053f7fe399abc867d7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 2 Aug 2015 13:53:17 -0700
Subject: rcu: Use single-stage IPI algorithm for RCU expedited grace period

The current preemptible-RCU expedited grace-period algorithm invokes
synchronize_sched_expedited() to enqueue all tasks currently running
in a preemptible-RCU read-side critical section, then waits for all the
->blkd_tasks lists to drain.  This works, but results in both an IPI and
a double context switch even on CPUs that do not happen to be running
in a preemptible RCU read-side critical section.

This commit implements a new algorithm that causes less OS jitter.
This new algorithm IPIs all online CPUs that are not idle (from an
RCU perspective), but refrains from self-IPIs.  If a CPU receiving
this IPI is not in a preemptible RCU read-side critical section (or
is just now exiting one), it pushes quiescence up the rcu_node tree,
otherwise, it sets a flag that will be handled by the upcoming outermost
rcu_read_unlock(), which will then push quiescence up the tree.

The expedited grace period must of course wait on any pre-existing blocked
readers, and newly blocked readers must be queued carefully based on
the state of both the normal and the expedited grace periods.  This
new queueing approach also avoids the need to update boost state,
courtesy of the fact that blocked tasks are no longer ever migrated to
the root rcu_node structure.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 75 ++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 55 insertions(+), 20 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8526896afea7..6e92bf4337bd 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3461,18 +3461,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
 }
 
 /*
- * Return non-zero if there are any tasks in RCU read-side critical
- * sections blocking the current preemptible-RCU expedited grace period.
- * If there is no preemptible-RCU expedited grace period currently in
- * progress, returns zero unconditionally.
- */
-static int rcu_preempted_readers_exp(struct rcu_node *rnp)
-{
-	return rnp->exp_tasks != NULL;
-}
-
-/*
- * return non-zero if there is no RCU expedited grace period in progress
+ * Return non-zero if there is no RCU expedited grace period in progress
  * for the specified rcu_node structure, in other words, if all CPUs and
  * tasks covered by the specified rcu_node structure have done their bit
  * for the current expedited grace period.  Works only for preemptible
@@ -3482,7 +3471,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp)
  */
 static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 {
-	return !rcu_preempted_readers_exp(rnp) &&
+	return rnp->exp_tasks == NULL &&
 	       READ_ONCE(rnp->expmask) == 0;
 }
 
@@ -3494,19 +3483,21 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
  * recursively up the tree.  (Calm down, calm down, we do the recursion
  * iteratively!)
  *
- * Caller must hold the root rcu_node's exp_funnel_mutex.
+ * Caller must hold the root rcu_node's exp_funnel_mutex and the
+ * specified rcu_node structure's ->lock.
  */
-static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
-					      struct rcu_node *rnp, bool wake)
+static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+				 bool wake, unsigned long flags)
+	__releases(rnp->lock)
 {
-	unsigned long flags;
 	unsigned long mask;
 
-	raw_spin_lock_irqsave(&rnp->lock, flags);
-	smp_mb__after_unlock_lock();
 	for (;;) {
 		if (!sync_rcu_preempt_exp_done(rnp)) {
-			raw_spin_unlock_irqrestore(&rnp->lock, flags);
+			if (!rnp->expmask)
+				rcu_initiate_boost(rnp, flags);
+			else
+				raw_spin_unlock_irqrestore(&rnp->lock, flags);
 			break;
 		}
 		if (rnp->parent == NULL) {
@@ -3522,10 +3513,54 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
 		rnp = rnp->parent;
 		raw_spin_lock(&rnp->lock); /* irqs already disabled */
 		smp_mb__after_unlock_lock();
+		WARN_ON_ONCE(!(rnp->expmask & mask));
 		rnp->expmask &= ~mask;
 	}
 }
 
+/*
+ * Report expedited quiescent state for specified node.  This is a
+ * lock-acquisition wrapper function for __rcu_report_exp_rnp().
+ *
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
+ */
+static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
+					      struct rcu_node *rnp, bool wake)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rnp->lock, flags);
+	smp_mb__after_unlock_lock();
+	__rcu_report_exp_rnp(rsp, rnp, wake, flags);
+}
+
+/*
+ * Report expedited quiescent state for multiple CPUs, all covered by the
+ * specified leaf rcu_node structure.  Caller must hold the root
+ * rcu_node's exp_funnel_mutex.
+ */
+static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
+				    unsigned long mask, bool wake)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rnp->lock, flags);
+	smp_mb__after_unlock_lock();
+	WARN_ON_ONCE((rnp->expmask & mask) != mask);
+	rnp->expmask &= ~mask;
+	__rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
+}
+
+/*
+ * Report expedited quiescent state for specified rcu_data (CPU).
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
+ */
+static void __maybe_unused rcu_report_exp_rdp(struct rcu_state *rsp,
+					      struct rcu_data *rdp, bool wake)
+{
+	rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
+}
+
 /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
 static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
 			       struct rcu_data *rdp,
-- 
cgit v1.2.3


From bce5fa12aad148e15efd9bc0015dc4898b6e723b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 5 Aug 2015 16:03:54 -0700
Subject: rcu: Move synchronize_sched_expedited() to combining tree

Currently, synchronize_sched_expedited() uses a single global counter
to track the number of remaining context switches that the current
expedited grace period must wait on.  This is problematic on large
systems, where the resulting memory contention can be pathological.
This commit therefore makes synchronize_sched_expedited() instead use
the combining tree in the same manner as synchronize_rcu_expedited(),
keeping memory contention down to a dull roar.

This commit creates a temporary function sync_sched_exp_select_cpus()
that is very similar to sync_rcu_exp_select_cpus().  A later commit
will consolidate these two functions, which becomes possible when
synchronize_sched_expedited() switches from stop_one_cpu_nowait() to
smp_call_function_single().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 123 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 82 insertions(+), 41 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6e92bf4337bd..d2cdcada6fe0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3642,19 +3642,77 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
 	struct rcu_data *rdp = data;
 	struct rcu_state *rsp = rdp->rsp;
 
-	/* We are here: If we are last, do the wakeup. */
-	rdp->exp_done = true;
-	if (atomic_dec_and_test(&rsp->expedited_need_qs))
-		wake_up(&rsp->expedited_wq);
+	/* Report the quiescent state. */
+	rcu_report_exp_rdp(rsp, rdp, true);
 	return 0;
 }
 
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_sched_exp_select_cpus(struct rcu_state *rsp)
+{
+	int cpu;
+	unsigned long flags;
+	unsigned long mask;
+	unsigned long mask_ofl_test;
+	unsigned long mask_ofl_ipi;
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+
+	sync_exp_reset_tree(rsp);
+	rcu_for_each_leaf_node(rsp, rnp) {
+		raw_spin_lock_irqsave(&rnp->lock, flags);
+		smp_mb__after_unlock_lock();
+
+		/* Each pass checks a CPU for identity, offline, and idle. */
+		mask_ofl_test = 0;
+		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
+			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+			if (raw_smp_processor_id() == cpu ||
+			    cpu_is_offline(cpu) ||
+			    !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+				mask_ofl_test |= rdp->grpmask;
+		}
+		mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
+
+		/*
+		 * Need to wait for any blocked tasks as well.  Note that
+		 * additional blocking tasks will also block the expedited
+		 * GP until such time as the ->expmask bits are cleared.
+		 */
+		if (rcu_preempt_has_tasks(rnp))
+			rnp->exp_tasks = rnp->blkd_tasks.next;
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+		/* IPI the remaining CPUs for expedited quiescent state. */
+		mask = 1;
+		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+			if (!(mask_ofl_ipi & mask))
+				continue;
+			rdp = per_cpu_ptr(rsp->rda, cpu);
+			stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
+					    rdp, &rdp->exp_stop_work);
+			mask_ofl_ipi &= ~mask;
+		}
+		/* Report quiescent states for those that went offline. */
+		mask_ofl_test |= mask_ofl_ipi;
+		if (mask_ofl_test)
+			rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+	}
+}
+
 static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 {
 	int cpu;
 	unsigned long jiffies_stall;
 	unsigned long jiffies_start;
-	struct rcu_data *rdp;
+	unsigned long mask;
+	struct rcu_node *rnp;
+	struct rcu_node *rnp_root = rcu_get_root(rsp);
 	int ret;
 
 	jiffies_stall = rcu_jiffies_till_stall_check();
@@ -3663,33 +3721,36 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 	for (;;) {
 		ret = wait_event_interruptible_timeout(
 				rsp->expedited_wq,
-				!atomic_read(&rsp->expedited_need_qs),
+				sync_rcu_preempt_exp_done(rnp_root),
 				jiffies_stall);
 		if (ret > 0)
 			return;
 		if (ret < 0) {
 			/* Hit a signal, disable CPU stall warnings. */
 			wait_event(rsp->expedited_wq,
-				   !atomic_read(&rsp->expedited_need_qs));
+				   sync_rcu_preempt_exp_done(rnp_root));
 			return;
 		}
 		pr_err("INFO: %s detected expedited stalls on CPUs: {",
 		       rsp->name);
-		for_each_online_cpu(cpu) {
-			rdp = per_cpu_ptr(rsp->rda, cpu);
-
-			if (rdp->exp_done)
-				continue;
-			pr_cont(" %d", cpu);
+		rcu_for_each_leaf_node(rsp, rnp) {
+			mask = 1;
+			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+				if (!(rnp->expmask & mask))
+					continue;
+				pr_cont(" %d", cpu);
+			}
+			mask <<= 1;
 		}
 		pr_cont(" } %lu jiffies s: %lu\n",
 			jiffies - jiffies_start, rsp->expedited_sequence);
-		for_each_online_cpu(cpu) {
-			rdp = per_cpu_ptr(rsp->rda, cpu);
-
-			if (rdp->exp_done)
-				continue;
-			dump_cpu_task(cpu);
+		rcu_for_each_leaf_node(rsp, rnp) {
+			mask = 1;
+			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+				if (!(rnp->expmask & mask))
+					continue;
+				dump_cpu_task(cpu);
+			}
 		}
 		jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
 	}
@@ -3713,7 +3774,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
  */
 void synchronize_sched_expedited(void)
 {
-	int cpu;
 	unsigned long s;
 	struct rcu_node *rnp;
 	struct rcu_state *rsp = &rcu_sched_state;
@@ -3736,27 +3796,8 @@ void synchronize_sched_expedited(void)
 	}
 
 	rcu_exp_gp_seq_start(rsp);
-
-	/* Stop each CPU that is online, non-idle, and not us. */
-	atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */
-	for_each_online_cpu(cpu) {
-		struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
-		rdp->exp_done = false;
-
-		/* Skip our CPU and any idle CPUs. */
-		if (raw_smp_processor_id() == cpu ||
-		    !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
-			continue;
-		atomic_inc(&rsp->expedited_need_qs);
-		stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
-				    rdp, &rdp->exp_stop_work);
-	}
-
-	/* Remove extra count and, if necessary, wait for CPUs to stop. */
-	if (!atomic_dec_and_test(&rsp->expedited_need_qs))
-		synchronize_sched_expedited_wait(rsp);
+	sync_sched_exp_select_cpus(rsp);
+	synchronize_sched_expedited_wait(rsp);
 
 	rcu_exp_gp_seq_end(rsp);
 	mutex_unlock(&rnp->exp_funnel_mutex);
-- 
cgit v1.2.3


From 97c668b8e983b722e2ed765b98b05f644aff1b13 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 6 Aug 2015 11:31:51 -0700
Subject: rcu: Rename qs_pending to core_needs_qs

An upcoming commit needs to invert the sense of the ->passed_quiesce
rcu_data structure field, so this commit is taking this opportunity
to clarify things a bit by renaming ->qs_pending to ->core_needs_qs.

So if !rdp->core_needs_qs, then this CPU need not concern itself with
quiescent states, in particular, it need not acquire its leaf rcu_node
structure's ->lock to check.  Otherwise, it needs to report the next
quiescent state.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d2cdcada6fe0..7c158ffc7769 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1746,7 +1746,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 		trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
 		rdp->passed_quiesce = 0;
 		rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
-		rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
+		rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
 		zero_cpu_stall_ticks(rdp);
 		WRITE_ONCE(rdp->gpwrap, false);
 	}
@@ -2357,7 +2357,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 	if ((rnp->qsmask & mask) == 0) {
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	} else {
-		rdp->qs_pending = 0;
+		rdp->core_needs_qs = 0;
 
 		/*
 		 * This GP can't end until cpu checks in, so all of our
@@ -2388,7 +2388,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 	 * Does this CPU still need to do its part for current grace period?
 	 * If no, return and let the other CPUs do their part as well.
 	 */
-	if (!rdp->qs_pending)
+	if (!rdp->core_needs_qs)
 		return;
 
 	/*
@@ -3828,10 +3828,10 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	/* Is the RCU core waiting for a quiescent state from this CPU? */
 	if (rcu_scheduler_fully_active &&
-	    rdp->qs_pending && !rdp->passed_quiesce &&
+	    rdp->core_needs_qs && !rdp->passed_quiesce &&
 	    rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
-		rdp->n_rp_qs_pending++;
-	} else if (rdp->qs_pending &&
+		rdp->n_rp_core_needs_qs++;
+	} else if (rdp->core_needs_qs &&
 		   (rdp->passed_quiesce ||
 		    rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
 		rdp->n_rp_report_qs++;
@@ -4157,7 +4157,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->completed = rnp->completed;
 	rdp->passed_quiesce = false;
 	rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
-	rdp->qs_pending = false;
+	rdp->core_needs_qs = false;
 	trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
-- 
cgit v1.2.3


From 0d43eb34f9aabcddf41c99b7af2d0ced33e9a3cc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 6 Aug 2015 14:17:29 -0700
Subject: rcu: Invert passed_quiesce and rename to cpu_no_qs

This commit inverts the sense of the rcu_data structure's ->passed_quiesce
field and renames it to ->cpu_no_qs.  This will allow a later commit to
use an "aggregate OR" operation to test expedited as well as normal grace
periods without added overhead.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7c158ffc7769..31e7021ced4d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -245,21 +245,21 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
  */
 void rcu_sched_qs(void)
 {
-	if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) {
+	if (__this_cpu_read(rcu_sched_data.cpu_no_qs)) {
 		trace_rcu_grace_period(TPS("rcu_sched"),
 				       __this_cpu_read(rcu_sched_data.gpnum),
 				       TPS("cpuqs"));
-		__this_cpu_write(rcu_sched_data.passed_quiesce, 1);
+		__this_cpu_write(rcu_sched_data.cpu_no_qs, false);
 	}
 }
 
 void rcu_bh_qs(void)
 {
-	if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) {
+	if (__this_cpu_read(rcu_bh_data.cpu_no_qs)) {
 		trace_rcu_grace_period(TPS("rcu_bh"),
 				       __this_cpu_read(rcu_bh_data.gpnum),
 				       TPS("cpuqs"));
-		__this_cpu_write(rcu_bh_data.passed_quiesce, 1);
+		__this_cpu_write(rcu_bh_data.cpu_no_qs, false);
 	}
 }
 
@@ -1744,7 +1744,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 		 */
 		rdp->gpnum = rnp->gpnum;
 		trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
-		rdp->passed_quiesce = 0;
+		rdp->cpu_no_qs = true;
 		rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
 		rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
 		zero_cpu_stall_ticks(rdp);
@@ -2337,7 +2337,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 	rnp = rdp->mynode;
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	smp_mb__after_unlock_lock();
-	if ((rdp->passed_quiesce == 0 &&
+	if ((rdp->cpu_no_qs &&
 	     rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
 	    rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
 	    rdp->gpwrap) {
@@ -2348,7 +2348,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 		 * We will instead need a new quiescent state that lies
 		 * within the current grace period.
 		 */
-		rdp->passed_quiesce = 0;	/* need qs for new gp. */
+		rdp->cpu_no_qs = true;	/* need qs for new gp. */
 		rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
@@ -2395,7 +2395,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 	 * Was there a quiescent state since the beginning of the grace
 	 * period? If no, then exit and wait for the next call.
 	 */
-	if (!rdp->passed_quiesce &&
+	if (rdp->cpu_no_qs &&
 	    rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
 		return;
 
@@ -3828,11 +3828,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	/* Is the RCU core waiting for a quiescent state from this CPU? */
 	if (rcu_scheduler_fully_active &&
-	    rdp->core_needs_qs && !rdp->passed_quiesce &&
+	    rdp->core_needs_qs && rdp->cpu_no_qs &&
 	    rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
 		rdp->n_rp_core_needs_qs++;
 	} else if (rdp->core_needs_qs &&
-		   (rdp->passed_quiesce ||
+		   (!rdp->cpu_no_qs ||
 		    rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
 		rdp->n_rp_report_qs++;
 		return 1;
@@ -4155,7 +4155,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->beenonline = true;	 /* We have now been online. */
 	rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
 	rdp->completed = rnp->completed;
-	rdp->passed_quiesce = false;
+	rdp->cpu_no_qs = true;
 	rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
 	rdp->core_needs_qs = false;
 	trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
-- 
cgit v1.2.3


From 5b74c458906fc4a62f932ee8bb801d29baf15fec Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 6 Aug 2015 15:16:57 -0700
Subject: rcu: Make ->cpu_no_qs be a union for aggregate OR

This commit converts the rcu_data structure's ->cpu_no_qs field
to a union.  The bytewise side of this union allows individual access
to indications as to whether this CPU needs to find a quiescent state
for a normal (.norm) and/or expedited (.exp) grace period.  The setwise
side of the union allows testing whether or not a quiescent state is
needed at all, for either type of grace period.

For now, only .norm is used.  A later commit will introduce the expedited
usage.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 31e7021ced4d..3e2875b38eae 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -245,21 +245,21 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
  */
 void rcu_sched_qs(void)
 {
-	if (__this_cpu_read(rcu_sched_data.cpu_no_qs)) {
+	if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
 		trace_rcu_grace_period(TPS("rcu_sched"),
 				       __this_cpu_read(rcu_sched_data.gpnum),
 				       TPS("cpuqs"));
-		__this_cpu_write(rcu_sched_data.cpu_no_qs, false);
+		__this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
 	}
 }
 
 void rcu_bh_qs(void)
 {
-	if (__this_cpu_read(rcu_bh_data.cpu_no_qs)) {
+	if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
 		trace_rcu_grace_period(TPS("rcu_bh"),
 				       __this_cpu_read(rcu_bh_data.gpnum),
 				       TPS("cpuqs"));
-		__this_cpu_write(rcu_bh_data.cpu_no_qs, false);
+		__this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
 	}
 }
 
@@ -1744,7 +1744,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 		 */
 		rdp->gpnum = rnp->gpnum;
 		trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
-		rdp->cpu_no_qs = true;
+		rdp->cpu_no_qs.b.norm = true;
 		rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
 		rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
 		zero_cpu_stall_ticks(rdp);
@@ -2337,7 +2337,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 	rnp = rdp->mynode;
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	smp_mb__after_unlock_lock();
-	if ((rdp->cpu_no_qs &&
+	if ((rdp->cpu_no_qs.b.norm &&
 	     rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
 	    rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
 	    rdp->gpwrap) {
@@ -2348,7 +2348,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 		 * We will instead need a new quiescent state that lies
 		 * within the current grace period.
 		 */
-		rdp->cpu_no_qs = true;	/* need qs for new gp. */
+		rdp->cpu_no_qs.b.norm = true;	/* need qs for new gp. */
 		rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
@@ -2395,7 +2395,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 	 * Was there a quiescent state since the beginning of the grace
 	 * period? If no, then exit and wait for the next call.
 	 */
-	if (rdp->cpu_no_qs &&
+	if (rdp->cpu_no_qs.b.norm &&
 	    rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
 		return;
 
@@ -3828,11 +3828,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	/* Is the RCU core waiting for a quiescent state from this CPU? */
 	if (rcu_scheduler_fully_active &&
-	    rdp->core_needs_qs && rdp->cpu_no_qs &&
+	    rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
 	    rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
 		rdp->n_rp_core_needs_qs++;
 	} else if (rdp->core_needs_qs &&
-		   (!rdp->cpu_no_qs ||
+		   (!rdp->cpu_no_qs.b.norm ||
 		    rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
 		rdp->n_rp_report_qs++;
 		return 1;
@@ -4155,7 +4155,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->beenonline = true;	 /* We have now been online. */
 	rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
 	rdp->completed = rnp->completed;
-	rdp->cpu_no_qs = true;
+	rdp->cpu_no_qs.b.norm = true;
 	rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
 	rdp->core_needs_qs = false;
 	trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
-- 
cgit v1.2.3


From b6a4ae766e3133a4db73fabc81e522d1601cb623 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Wed, 29 Jul 2015 13:29:38 +0800
Subject: rcu: Use rcu_callback_t in call_rcu*() and friends

As we now have rcu_callback_t typedefs as the type of rcu callbacks, we
should use it in call_rcu*() and friends as the type of parameters. This
could save us a few lines of code and make it clear which function
requires an rcu callbacks rather than other callbacks as its argument.

Besides, this can also help cscope to generate a better database for
code reading.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 kernel/rcu/tree.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 775d36cc0050..b9d9e0249e2f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3017,7 +3017,7 @@ static void rcu_leak_callback(struct rcu_head *rhp)
  * is expected to specify a CPU.
  */
 static void
-__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
+__call_rcu(struct rcu_head *head, rcu_callback_t func,
 	   struct rcu_state *rsp, int cpu, bool lazy)
 {
 	unsigned long flags;
@@ -3088,7 +3088,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 /*
  * Queue an RCU-sched callback for invocation after a grace period.
  */
-void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
 {
 	__call_rcu(head, func, &rcu_sched_state, -1, 0);
 }
@@ -3097,7 +3097,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
 /*
  * Queue an RCU callback for invocation after a quicker grace period.
  */
-void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
 {
 	__call_rcu(head, func, &rcu_bh_state, -1, 0);
 }
@@ -3111,7 +3111,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
  * function may only be called from __kfree_rcu().
  */
 void kfree_call_rcu(struct rcu_head *head,
-		    void (*func)(struct rcu_head *rcu))
+		    rcu_callback_t func)
 {
 	__call_rcu(head, func, rcu_state_p, -1, 1);
 }
-- 
cgit v1.2.3


From bb73c52bad3666997ed2ec83c0c80c3f8ef55008 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Thu, 30 Jul 2015 16:55:38 -0700
Subject: rcu: Don't disable preemption for Tiny and Tree RCU readers

Because preempt_disable() maps to barrier() for non-debug builds,
it forces the compiler to spill and reload registers.  Because Tree
RCU and Tiny RCU now only appear in CONFIG_PREEMPT=n builds, these
barrier() instances generate needless extra code for each instance of
rcu_read_lock() and rcu_read_unlock().  This extra code slows down Tree
RCU and bloats Tiny RCU.

This commit therefore removes the preempt_disable() and preempt_enable()
from the non-preemptible implementations of __rcu_read_lock() and
__rcu_read_unlock(), respectively.  However, for debug purposes,
preempt_disable() and preempt_enable() are still invoked if
CONFIG_PREEMPT_COUNT=y, because this allows detection of sleeping inside
atomic sections in non-preemptible kernels.

However, Tiny and Tree RCU operates by coalescing all RCU read-side
critical sections on a given CPU that lie between successive quiescent
states.  It is therefore necessary to compensate for removing barriers
from __rcu_read_lock() and __rcu_read_unlock() by adding them to a
couple of the RCU functions invoked during quiescent states, namely to
rcu_all_qs() and rcu_note_context_switch().  However, note that the latter
is more paranoia than necessity, at least until link-time optimizations
become more aggressive.

This is based on an earlier patch by Paul E. McKenney, fixing
a bug encountered in kernels built with CONFIG_PREEMPT=n and
CONFIG_PREEMPT_COUNT=y.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b9d9e0249e2f..93c0f23c3e45 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -337,12 +337,14 @@ static void rcu_momentary_dyntick_idle(void)
  */
 void rcu_note_context_switch(void)
 {
+	barrier(); /* Avoid RCU read-side critical sections leaking down. */
 	trace_rcu_utilization(TPS("Start context switch"));
 	rcu_sched_qs();
 	rcu_preempt_note_context_switch();
 	if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
 		rcu_momentary_dyntick_idle();
 	trace_rcu_utilization(TPS("End context switch"));
+	barrier(); /* Avoid RCU read-side critical sections leaking up. */
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 
@@ -353,12 +355,19 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
  * RCU flavors in desperate need of a quiescent state, which will normally
  * be none of them).  Either way, do a lightweight quiescent state for
  * all RCU flavors.
+ *
+ * The barrier() calls are redundant in the common case when this is
+ * called externally, but just in case this is called from within this
+ * file.
+ *
  */
 void rcu_all_qs(void)
 {
+	barrier(); /* Avoid RCU read-side critical sections leaking down. */
 	if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
 		rcu_momentary_dyntick_idle();
 	this_cpu_inc(rcu_qs_ctr);
+	barrier(); /* Avoid RCU read-side critical sections leaking up. */
 }
 EXPORT_SYMBOL_GPL(rcu_all_qs);
 
-- 
cgit v1.2.3


From ee968ac61d5a3440b931375d81113e0eedfcb249 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 31 Jul 2015 08:28:35 -0700
Subject: rcu: Eliminate panic when silly boot-time fanout specified

This commit loosens rcutree.rcu_fanout_leaf range checks
and replaces a panic() with a fallback to compile-time values.
This fallback is accompanied by a WARN_ON(), and both occur when the
rcutree.rcu_fanout_leaf value is too small to accommodate the number of
CPUs.  For example, given the current four-level limit for the rcu_node
tree, a system with more than 16 CPUs built with CONFIG_FANOUT=2 must
have rcutree.rcu_fanout_leaf larger than 2.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 kernel/rcu/tree.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 93c0f23c3e45..713eb92314b4 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4230,13 +4230,12 @@ static void __init rcu_init_geometry(void)
 		rcu_fanout_leaf, nr_cpu_ids);
 
 	/*
-	 * The boot-time rcu_fanout_leaf parameter is only permitted
-	 * to increase the leaf-level fanout, not decrease it.  Of course,
-	 * the leaf-level fanout cannot exceed the number of bits in
-	 * the rcu_node masks.  Complain and fall back to the compile-
-	 * time values if these limits are exceeded.
+	 * The boot-time rcu_fanout_leaf parameter must be at least two
+	 * and cannot exceed the number of bits in the rcu_node masks.
+	 * Complain and fall back to the compile-time values if this
+	 * limit is exceeded.
 	 */
-	if (rcu_fanout_leaf < RCU_FANOUT_LEAF ||
+	if (rcu_fanout_leaf < 2 ||
 	    rcu_fanout_leaf > sizeof(unsigned long) * 8) {
 		rcu_fanout_leaf = RCU_FANOUT_LEAF;
 		WARN_ON(1);
@@ -4253,10 +4252,13 @@ static void __init rcu_init_geometry(void)
 
 	/*
 	 * The tree must be able to accommodate the configured number of CPUs.
-	 * If this limit is exceeded than we have a serious problem elsewhere.
+	 * If this limit is exceeded, fall back to the compile-time values.
 	 */
-	if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1])
-		panic("rcu_init_geometry: rcu_capacity[] is too small");
+	if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
+		rcu_fanout_leaf = RCU_FANOUT_LEAF;
+		WARN_ON(1);
+		return;
+	}
 
 	/* Calculate the number of levels in the tree. */
 	for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
-- 
cgit v1.2.3


From 77f81fe08ebd99d7e0eefde42ddac06a675bc4ad Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Wed, 9 Sep 2015 12:09:49 -0700
Subject: rcu: Finish folding ->fqs_state into ->gp_state

Commit commit 4cdfc175c25c89ee ("rcu: Move quiescent-state forcing
into kthread") started the process of folding the old ->fqs_state into
->gp_state, but did not complete it.  This situation does not cause
any malfunction, but can result in extremely confusing trace output.
This commit completes this task of eliminating ->fqs_state in favor
of ->gp_state.

The old ->fqs_state was also used to decide when to collect dyntick-idle
snapshots.  For this purpose, we add a boolean variable into the kthread,
which is set on the first call to rcu_gp_fqs() for a given grace period
and clear otherwise.

Signed-off-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 kernel/rcu/tree.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 713eb92314b4..4d296b0fb987 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -98,7 +98,7 @@ struct rcu_state sname##_state = { \
 	.level = { &sname##_state.node[0] }, \
 	.rda = &sname##_data, \
 	.call = cr, \
-	.fqs_state = RCU_GP_IDLE, \
+	.gp_state = RCU_GP_IDLE, \
 	.gpnum = 0UL - 300UL, \
 	.completed = 0UL - 300UL, \
 	.orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
@@ -1936,16 +1936,15 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
 /*
  * Do one round of quiescent-state forcing.
  */
-static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
+static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
 {
-	int fqs_state = fqs_state_in;
 	bool isidle = false;
 	unsigned long maxj;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	WRITE_ONCE(rsp->gp_activity, jiffies);
 	rsp->n_force_qs++;
-	if (fqs_state == RCU_SAVE_DYNTICK) {
+	if (first_time) {
 		/* Collect dyntick-idle snapshots. */
 		if (is_sysidle_rcu_state(rsp)) {
 			isidle = true;
@@ -1954,7 +1953,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 		force_qs_rnp(rsp, dyntick_save_progress_counter,
 			     &isidle, &maxj);
 		rcu_sysidle_report_gp(rsp, isidle, maxj);
-		fqs_state = RCU_FORCE_QS;
 	} else {
 		/* Handle dyntick-idle and offline CPUs. */
 		isidle = true;
@@ -1968,7 +1966,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 			   READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
 		raw_spin_unlock_irq(&rnp->lock);
 	}
-	return fqs_state;
 }
 
 /*
@@ -2032,7 +2029,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 	/* Declare grace period done. */
 	WRITE_ONCE(rsp->completed, rsp->gpnum);
 	trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
-	rsp->fqs_state = RCU_GP_IDLE;
+	rsp->gp_state = RCU_GP_IDLE;
 	rdp = this_cpu_ptr(rsp->rda);
 	/* Advance CBs to reduce false positives below. */
 	needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
@@ -2050,7 +2047,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
  */
 static int __noreturn rcu_gp_kthread(void *arg)
 {
-	int fqs_state;
+	bool first_gp_fqs;
 	int gf;
 	unsigned long j;
 	int ret;
@@ -2082,7 +2079,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 		}
 
 		/* Handle quiescent-state forcing. */
-		fqs_state = RCU_SAVE_DYNTICK;
+		first_gp_fqs = true;
 		j = jiffies_till_first_fqs;
 		if (j > HZ) {
 			j = HZ;
@@ -2110,7 +2107,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
 				trace_rcu_grace_period(rsp->name,
 						       READ_ONCE(rsp->gpnum),
 						       TPS("fqsstart"));
-				fqs_state = rcu_gp_fqs(rsp, fqs_state);
+				rcu_gp_fqs(rsp, first_gp_fqs);
+				first_gp_fqs = false;
 				trace_rcu_grace_period(rsp->name,
 						       READ_ONCE(rsp->gpnum),
 						       TPS("fqsend"));
-- 
cgit v1.2.3


From 6587a23b6b9bdb47205ec96c703e5bf8a2d39701 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 6 Aug 2015 16:50:39 -0700
Subject: rcu: Switch synchronize_sched_expedited() to IPI

This commit switches synchronize_sched_expedited() from stop_one_cpu_nowait()
to smp_call_function_single(), thus moving from an IPI and a pair of
context switches to an IPI and a single pass through the scheduler.
Of course, if the scheduler actually does decide to switch to a different
task, there will still be a pair of context switches, but there would
likely have been a pair of context switches anyway, just a bit later.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3e2875b38eae..a42168911941 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -161,6 +161,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
 static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
+static void rcu_report_exp_rdp(struct rcu_state *rsp,
+			       struct rcu_data *rdp, bool wake);
 
 /* rcuc/rcub kthread realtime priority */
 #ifdef CONFIG_RCU_KTHREAD_PRIO
@@ -250,6 +252,12 @@ void rcu_sched_qs(void)
 				       __this_cpu_read(rcu_sched_data.gpnum),
 				       TPS("cpuqs"));
 		__this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
+		if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
+			__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
+			rcu_report_exp_rdp(&rcu_sched_state,
+					   this_cpu_ptr(&rcu_sched_data),
+					   true);
+		}
 	}
 }
 
@@ -3555,8 +3563,8 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
  * Report expedited quiescent state for specified rcu_data (CPU).
  * Caller must hold the root rcu_node's exp_funnel_mutex.
  */
-static void __maybe_unused rcu_report_exp_rdp(struct rcu_state *rsp,
-					      struct rcu_data *rdp, bool wake)
+static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
+			       bool wake)
 {
 	rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
 }
@@ -3637,14 +3645,10 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 }
 
 /* Invoked on each online non-idle CPU for expedited quiescent state. */
-static int synchronize_sched_expedited_cpu_stop(void *data)
+static void synchronize_sched_expedited_cpu_stop(void *data)
 {
-	struct rcu_data *rdp = data;
-	struct rcu_state *rsp = rdp->rsp;
-
-	/* Report the quiescent state. */
-	rcu_report_exp_rdp(rsp, rdp, true);
-	return 0;
+	__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
+	resched_cpu(smp_processor_id());
 }
 
 /*
@@ -3659,6 +3663,7 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp)
 	unsigned long mask_ofl_test;
 	unsigned long mask_ofl_ipi;
 	struct rcu_data *rdp;
+	int ret;
 	struct rcu_node *rnp;
 
 	sync_exp_reset_tree(rsp);
@@ -3694,9 +3699,9 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp)
 			if (!(mask_ofl_ipi & mask))
 				continue;
 			rdp = per_cpu_ptr(rsp->rda, cpu);
-			stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
-					    rdp, &rdp->exp_stop_work);
-			mask_ofl_ipi &= ~mask;
+			ret = smp_call_function_single(cpu, synchronize_sched_expedited_cpu_stop, NULL, 0);
+			if (!ret)
+				mask_ofl_ipi &= ~mask;
 		}
 		/* Report quiescent states for those that went offline. */
 		mask_ofl_test |= mask_ofl_ipi;
@@ -4201,6 +4206,9 @@ int rcu_cpu_notify(struct notifier_block *self,
 			rcu_cleanup_dying_cpu(rsp);
 		break;
 	case CPU_DYING_IDLE:
+		/* QS for any half-done expedited RCU-sched GP. */
+		rcu_sched_qs();
+
 		for_each_rcu_flavor(rsp) {
 			rcu_cleanup_dying_idle_cpu(cpu, rsp);
 		}
-- 
cgit v1.2.3


From 83c2c735e78da1a0d994911f730f6e1d36c88d7a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 6 Aug 2015 20:43:02 -0700
Subject: rcu: Stop silencing lockdep false positive for expedited grace
 periods

This reverts commit af859beaaba4 (rcu: Silence lockdep false positive
for expedited grace periods).  Because synchronize_rcu_expedited()
no longer invokes synchronize_sched_expedited(), ->exp_funnel_mutex
acquisition is no longer nested, so the false positive no longer happens.
This commit therefore removes the extra lockdep data structures, as they
are no longer needed.
---
 kernel/rcu/tree.c | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a42168911941..15b19baaade5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -71,7 +71,6 @@ MODULE_ALIAS("rcutree");
 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
 static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
-static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS];
 
 /*
  * In order to export the rcu_state name to the tracing tools, it
@@ -4095,7 +4094,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
 static void __init
 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
-	static struct lock_class_key rcu_exp_sched_rdp_class;
 	unsigned long flags;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
@@ -4111,10 +4109,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 	mutex_init(&rdp->exp_funnel_mutex);
 	rcu_boot_init_nocb_percpu_data(rdp);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
-	if (rsp == &rcu_sched_state)
-		lockdep_set_class_and_name(&rdp->exp_funnel_mutex,
-					   &rcu_exp_sched_rdp_class,
-					   "rcu_data_exp_sched");
 }
 
 /*
@@ -4340,7 +4334,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 	static const char * const buf[] = RCU_NODE_NAME_INIT;
 	static const char * const fqs[] = RCU_FQS_NAME_INIT;
 	static const char * const exp[] = RCU_EXP_NAME_INIT;
-	static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT;
 	static u8 fl_mask = 0x1;
 
 	int levelcnt[RCU_NUM_LVLS];		/* # nodes in each level. */
@@ -4400,14 +4393,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 			INIT_LIST_HEAD(&rnp->blkd_tasks);
 			rcu_init_one_nocb(rnp);
 			mutex_init(&rnp->exp_funnel_mutex);
-			if (rsp == &rcu_sched_state)
-				lockdep_set_class_and_name(
-					&rnp->exp_funnel_mutex,
-					&rcu_exp_sched_class[i], exp_sched[i]);
-			else
-				lockdep_set_class_and_name(
-					&rnp->exp_funnel_mutex,
-					&rcu_exp_class[i], exp[i]);
+			lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
+						   &rcu_exp_class[i], exp[i]);
 		}
 	}
 
-- 
cgit v1.2.3


From 807226e2fbb504d82cd504b7b6114896db41ef63 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 7 Aug 2015 12:03:45 -0700
Subject: rcu: Stop excluding CPU hotplug in synchronize_sched_expedited()

Now that synchronize_sched_expedited() uses IPIs, a hook in
rcu_sched_qs(), and the ->expmask field in the rcu_node combining
tree, it is no longer necessary to exclude CPU hotplug.  Any
races with CPU hotplug will be detected when attempting to send
the IPI.  This commit therefore removes the code excluding
CPU hotplug operations.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 15b19baaade5..7e7df5deadf9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3785,19 +3785,9 @@ void synchronize_sched_expedited(void)
 	/* Take a snapshot of the sequence number.  */
 	s = rcu_exp_gp_seq_snap(rsp);
 
-	if (!try_get_online_cpus()) {
-		/* CPU hotplug operation in flight, fall back to normal GP. */
-		wait_rcu_gp(call_rcu_sched);
-		atomic_long_inc(&rsp->expedited_normal);
-		return;
-	}
-	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
-
 	rnp = exp_funnel_lock(rsp, s);
-	if (rnp == NULL) {
-		put_online_cpus();
+	if (rnp == NULL)
 		return;  /* Someone else did our work for us. */
-	}
 
 	rcu_exp_gp_seq_start(rsp);
 	sync_sched_exp_select_cpus(rsp);
@@ -3805,8 +3795,6 @@ void synchronize_sched_expedited(void)
 
 	rcu_exp_gp_seq_end(rsp);
 	mutex_unlock(&rnp->exp_funnel_mutex);
-
-	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
-- 
cgit v1.2.3


From 66fe6cbee44e3f78d05882195a6a38e30b97b936 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 15 Aug 2015 17:20:58 -0700
Subject: rcu: Prepare for consolidating expedited CPU selection

This commit brings sync_sched_exp_select_cpus() into alignment with
sync_rcu_exp_select_cpus(), as a first step towards consolidating them
into one function.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7e7df5deadf9..ae582e3d4136 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3661,7 +3661,6 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp)
 	unsigned long mask;
 	unsigned long mask_ofl_test;
 	unsigned long mask_ofl_ipi;
-	struct rcu_data *rdp;
 	int ret;
 	struct rcu_node *rnp;
 
@@ -3697,7 +3696,6 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp)
 		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
 			if (!(mask_ofl_ipi & mask))
 				continue;
-			rdp = per_cpu_ptr(rsp->rda, cpu);
 			ret = smp_call_function_single(cpu, synchronize_sched_expedited_cpu_stop, NULL, 0);
 			if (!ret)
 				mask_ofl_ipi &= ~mask;
-- 
cgit v1.2.3


From dcdb8807ba0f5127d4dc67daeeb154edf50a93d1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 15 Aug 2015 19:00:31 -0700
Subject: rcu: Consolidate expedited CPU selection

Now that sync_sched_exp_select_cpus() and sync_rcu_exp_select_cpus()
are identical aside from the the argument to smp_call_function_single(),
this commit consolidates them with a functional argument.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ae582e3d4136..f44f4b30c68a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3654,7 +3654,8 @@ static void synchronize_sched_expedited_cpu_stop(void *data)
  * Select the nodes that the upcoming expedited grace period needs
  * to wait for.
  */
-static void sync_sched_exp_select_cpus(struct rcu_state *rsp)
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+				     smp_call_func_t func)
 {
 	int cpu;
 	unsigned long flags;
@@ -3696,7 +3697,7 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp)
 		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
 			if (!(mask_ofl_ipi & mask))
 				continue;
-			ret = smp_call_function_single(cpu, synchronize_sched_expedited_cpu_stop, NULL, 0);
+			ret = smp_call_function_single(cpu, func, rsp, 0);
 			if (!ret)
 				mask_ofl_ipi &= ~mask;
 		}
@@ -3788,7 +3789,7 @@ void synchronize_sched_expedited(void)
 		return;  /* Someone else did our work for us. */
 
 	rcu_exp_gp_seq_start(rsp);
-	sync_sched_exp_select_cpus(rsp);
+	sync_rcu_exp_select_cpus(rsp, synchronize_sched_expedited_cpu_stop);
 	synchronize_sched_expedited_wait(rsp);
 
 	rcu_exp_gp_seq_end(rsp);
-- 
cgit v1.2.3


From 74611ecb0fc4c850a8f89a744ce99cbf0dd43cb2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 18 Aug 2015 10:20:43 -0700
Subject: rcu: Add online/offline info to expedited stall warning message

This commit makes the RCU CPU stall warning message print online/offline
indications immediately after the CPU number.  A "O" indicates global
offline, a "." global online, and a "o" indicates RCU believes that the
CPU is offline for the current grace period and "." otherwise, and an
"N" indicates that RCU believes that the CPU will be offline for the
next grace period, and "." otherwise, all right after the CPU number.
So for CPU 10, you would normally see "10-...:" indicating that everything
believes that the CPU is online.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f44f4b30c68a..3d033fee0dcb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3737,11 +3737,18 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 		pr_err("INFO: %s detected expedited stalls on CPUs: {",
 		       rsp->name);
 		rcu_for_each_leaf_node(rsp, rnp) {
+			(void)rcu_print_task_exp_stall(rnp);
 			mask = 1;
 			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+				struct rcu_data *rdp;
+
 				if (!(rnp->expmask & mask))
 					continue;
-				pr_cont(" %d", cpu);
+				rdp = per_cpu_ptr(rsp->rda, cpu);
+				pr_cont(" %d-%c%c%c", cpu,
+					"O."[cpu_online(cpu)],
+					"o."[!!(rdp->grpmask & rnp->expmaskinit)],
+					"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
 			}
 			mask <<= 1;
 		}
-- 
cgit v1.2.3


From c58656382e5f2919b05913584f2c54b4f841bc9f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 18 Aug 2015 11:25:48 -0700
Subject: rcu: Add tasks to expedited stall-warning messages

This commit adds task-print ability to the expedited RCU CPU stall
warning messages in preparation for adding stall warnings to
synchornize_rcu_expedited().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3d033fee0dcb..b4f71ffa299c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3734,7 +3734,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 				   sync_rcu_preempt_exp_done(rnp_root));
 			return;
 		}
-		pr_err("INFO: %s detected expedited stalls on CPUs: {",
+		pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
 		       rsp->name);
 		rcu_for_each_leaf_node(rsp, rnp) {
 			(void)rcu_print_task_exp_stall(rnp);
-- 
cgit v1.2.3


From 338b0f760e84676130c6e4d8268cb8c923b38c8c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 3 Sep 2015 00:45:02 -0700
Subject: rcu: Better hotplug handling for synchronize_sched_expedited()

Earlier versions of synchronize_sched_expedited() can prematurely end
grace periods due to the fact that a CPU marked as cpu_is_offline()
can still be using RCU read-side critical sections during the time that
CPU makes its last pass through the scheduler and into the idle loop
and during the time that a given CPU is in the process of coming online.
This commit therefore eliminates this window by adding additional
interaction with the CPU-hotplug operations.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 6 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b4f71ffa299c..42b317e13776 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -246,17 +246,23 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
  */
 void rcu_sched_qs(void)
 {
+	unsigned long flags;
+
 	if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
 		trace_rcu_grace_period(TPS("rcu_sched"),
 				       __this_cpu_read(rcu_sched_data.gpnum),
 				       TPS("cpuqs"));
 		__this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
+		if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+			return;
+		local_irq_save(flags);
 		if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
 			__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
 			rcu_report_exp_rdp(&rcu_sched_state,
 					   this_cpu_ptr(&rcu_sched_data),
 					   true);
 		}
+		local_irq_restore(flags);
 	}
 }
 
@@ -3553,7 +3559,10 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	smp_mb__after_unlock_lock();
-	WARN_ON_ONCE((rnp->expmask & mask) != mask);
+	if (!(rnp->expmask & mask)) {
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
 	rnp->expmask &= ~mask;
 	__rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
 }
@@ -3644,12 +3653,37 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 }
 
 /* Invoked on each online non-idle CPU for expedited quiescent state. */
-static void synchronize_sched_expedited_cpu_stop(void *data)
+static void sync_sched_exp_handler(void *data)
 {
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+	struct rcu_state *rsp = data;
+
+	rdp = this_cpu_ptr(rsp->rda);
+	rnp = rdp->mynode;
+	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
+	    __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+		return;
 	__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
 	resched_cpu(smp_processor_id());
 }
 
+/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
+static void sync_sched_exp_online_cleanup(int cpu)
+{
+	struct rcu_data *rdp;
+	int ret;
+	struct rcu_node *rnp;
+	struct rcu_state *rsp = &rcu_sched_state;
+
+	rdp = per_cpu_ptr(rsp->rda, cpu);
+	rnp = rdp->mynode;
+	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
+		return;
+	ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
+	WARN_ON_ONCE(ret);
+}
+
 /*
  * Select the nodes that the upcoming expedited grace period needs
  * to wait for.
@@ -3677,7 +3711,6 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 
 			if (raw_smp_processor_id() == cpu ||
-			    cpu_is_offline(cpu) ||
 			    !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
 				mask_ofl_test |= rdp->grpmask;
 		}
@@ -3697,9 +3730,28 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
 			if (!(mask_ofl_ipi & mask))
 				continue;
+retry_ipi:
 			ret = smp_call_function_single(cpu, func, rsp, 0);
-			if (!ret)
+			if (!ret) {
 				mask_ofl_ipi &= ~mask;
+			} else {
+				/* Failed, raced with offline. */
+				raw_spin_lock_irqsave(&rnp->lock, flags);
+				if (cpu_online(cpu) &&
+				    (rnp->expmask & mask)) {
+					raw_spin_unlock_irqrestore(&rnp->lock,
+								   flags);
+					schedule_timeout_uninterruptible(1);
+					if (cpu_online(cpu) &&
+					    (rnp->expmask & mask))
+						goto retry_ipi;
+					raw_spin_lock_irqsave(&rnp->lock,
+							      flags);
+				}
+				if (!(rnp->expmask & mask))
+					mask_ofl_ipi &= ~mask;
+				raw_spin_unlock_irqrestore(&rnp->lock, flags);
+			}
 		}
 		/* Report quiescent states for those that went offline. */
 		mask_ofl_test |= mask_ofl_ipi;
@@ -3796,7 +3848,7 @@ void synchronize_sched_expedited(void)
 		return;  /* Someone else did our work for us. */
 
 	rcu_exp_gp_seq_start(rsp);
-	sync_rcu_exp_select_cpus(rsp, synchronize_sched_expedited_cpu_stop);
+	sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
 	synchronize_sched_expedited_wait(rsp);
 
 	rcu_exp_gp_seq_end(rsp);
@@ -4183,6 +4235,7 @@ int rcu_cpu_notify(struct notifier_block *self,
 		break;
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
+		sync_sched_exp_online_cleanup(cpu);
 		rcu_boost_kthread_setaffinity(rnp, -1);
 		break;
 	case CPU_DOWN_PREPARE:
@@ -4195,7 +4248,10 @@ int rcu_cpu_notify(struct notifier_block *self,
 		break;
 	case CPU_DYING_IDLE:
 		/* QS for any half-done expedited RCU-sched GP. */
-		rcu_sched_qs();
+		preempt_disable();
+		rcu_report_exp_rdp(&rcu_sched_state,
+				   this_cpu_ptr(rcu_sched_state.rda), true);
+		preempt_enable();
 
 		for_each_rcu_flavor(rsp) {
 			rcu_cleanup_dying_idle_cpu(cpu, rsp);
-- 
cgit v1.2.3