Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r--	kernel/rcu/tree.c	681
1 file changed, 399 insertions(+), 282 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 65137bc28b2b..9f75f25cc5d9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -70,6 +70,8 @@ MODULE_ALIAS("rcutree");  static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];  static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS];  /*   * In order to export the rcu_state name to the tracing tools, it @@ -124,13 +126,8 @@ module_param(rcu_fanout_exact, bool, 0444);  static int rcu_fanout_leaf = RCU_FANOUT_LEAF;  module_param(rcu_fanout_leaf, int, 0444);  int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; -static int num_rcu_lvl[] = {  /* Number of rcu_nodes at specified level. */ -	NUM_RCU_LVL_0, -	NUM_RCU_LVL_1, -	NUM_RCU_LVL_2, -	NUM_RCU_LVL_3, -	NUM_RCU_LVL_4, -}; +/* Number of rcu_nodes at specified level. */ +static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;  int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */  /* @@ -649,12 +646,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user)  	 * It is illegal to enter an extended quiescent state while  	 * in an RCU read-side critical section.  	 */ -	rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), -			   "Illegal idle entry in RCU read-side critical section."); -	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), -			   "Illegal idle entry in RCU-bh read-side critical section."); -	rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), -			   "Illegal idle entry in RCU-sched read-side critical section."); +	RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), +			 "Illegal idle entry in RCU read-side critical section."); +	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), +			 "Illegal idle entry in RCU-bh read-side critical section."); +	RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), +			 "Illegal idle entry in RCU-sched read-side critical section.");  }  /* @@ -701,7 +698,7 @@ void rcu_idle_enter(void)  }  EXPORT_SYMBOL_GPL(rcu_idle_enter); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_NO_HZ_FULL  /**   * rcu_user_enter - inform RCU that we are resuming userspace.   * @@ -714,7 +711,7 @@ void rcu_user_enter(void)  {  	rcu_eqs_enter(1);  } -#endif /* CONFIG_RCU_USER_QS */ +#endif /* CONFIG_NO_HZ_FULL */  /**   * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -828,7 +825,7 @@ void rcu_idle_exit(void)  }  EXPORT_SYMBOL_GPL(rcu_idle_exit); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_NO_HZ_FULL  /**   * rcu_user_exit - inform RCU that we are exiting userspace.   * @@ -839,7 +836,7 @@ void rcu_user_exit(void)  {  	rcu_eqs_exit(1);  } -#endif /* CONFIG_RCU_USER_QS */ +#endif /* CONFIG_NO_HZ_FULL */  /**   * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -978,9 +975,9 @@ bool notrace rcu_is_watching(void)  {  	bool ret; -	preempt_disable(); +	preempt_disable_notrace();  	ret = __rcu_is_watching(); -	preempt_enable(); +	preempt_enable_notrace();  	return ret;  }  EXPORT_SYMBOL_GPL(rcu_is_watching); @@ -1178,9 +1175,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)  	j = jiffies;  	gpa = READ_ONCE(rsp->gp_activity);  	if (j - gpa > 2 * HZ) -		pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n", +		pr_err("%s kthread starved for %ld jiffies! 
g%lu c%lu f%#x s%d ->state=%#lx\n",  		       rsp->name, j - gpa, -		       rsp->gpnum, rsp->completed, rsp->gp_flags); +		       rsp->gpnum, rsp->completed, +		       rsp->gp_flags, rsp->gp_state, +		       rsp->gp_kthread ? rsp->gp_kthread->state : 0);  }  /* @@ -1906,6 +1905,26 @@ static int rcu_gp_init(struct rcu_state *rsp)  }  /* + * Helper function for wait_event_interruptible_timeout() wakeup + * at force-quiescent-state time. + */ +static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) +{ +	struct rcu_node *rnp = rcu_get_root(rsp); + +	/* Someone like call_rcu() requested a force-quiescent-state scan. */ +	*gfp = READ_ONCE(rsp->gp_flags); +	if (*gfp & RCU_GP_FLAG_FQS) +		return true; + +	/* The current grace period has completed. */ +	if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) +		return true; + +	return false; +} + +/*   * Do one round of quiescent-state forcing.   */  static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) @@ -2041,6 +2060,7 @@ static int __noreturn rcu_gp_kthread(void *arg)  			wait_event_interruptible(rsp->gp_wq,  						 READ_ONCE(rsp->gp_flags) &  						 RCU_GP_FLAG_INIT); +			rsp->gp_state = RCU_GP_DONE_GPS;  			/* Locking provides needed memory barrier. */  			if (rcu_gp_init(rsp))  				break; @@ -2068,11 +2088,8 @@ static int __noreturn rcu_gp_kthread(void *arg)  					       TPS("fqswait"));  			rsp->gp_state = RCU_GP_WAIT_FQS;  			ret = wait_event_interruptible_timeout(rsp->gp_wq, -					((gf = READ_ONCE(rsp->gp_flags)) & -					 RCU_GP_FLAG_FQS) || -					(!READ_ONCE(rnp->qsmask) && -					 !rcu_preempt_blocked_readers_cgp(rnp)), -					j); +					rcu_gp_fqs_check_wake(rsp, &gf), j); +			rsp->gp_state = RCU_GP_DOING_FQS;  			/* Locking provides needed memory barriers. */  			/* If grace period done, leave loop. */  			if (!READ_ONCE(rnp->qsmask) && @@ -2110,7 +2127,9 @@ static int __noreturn rcu_gp_kthread(void *arg)  		}  		/* Handle grace-period end. 
*/ +		rsp->gp_state = RCU_GP_CLEANUP;  		rcu_gp_cleanup(rsp); +		rsp->gp_state = RCU_GP_CLEANED;  	}  } @@ -3161,10 +3180,10 @@ static inline int rcu_blocking_is_gp(void)   */  void synchronize_sched(void)  { -	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && -			   !lock_is_held(&rcu_lock_map) && -			   !lock_is_held(&rcu_sched_lock_map), -			   "Illegal synchronize_sched() in RCU-sched read-side critical section"); +	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || +			 lock_is_held(&rcu_lock_map) || +			 lock_is_held(&rcu_sched_lock_map), +			 "Illegal synchronize_sched() in RCU-sched read-side critical section");  	if (rcu_blocking_is_gp())  		return;  	if (rcu_gp_is_expedited()) @@ -3188,10 +3207,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched);   */  void synchronize_rcu_bh(void)  { -	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && -			   !lock_is_held(&rcu_lock_map) && -			   !lock_is_held(&rcu_sched_lock_map), -			   "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); +	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || +			 lock_is_held(&rcu_lock_map) || +			 lock_is_held(&rcu_sched_lock_map), +			 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");  	if (rcu_blocking_is_gp())  		return;  	if (rcu_gp_is_expedited()) @@ -3253,23 +3272,247 @@ void cond_synchronize_rcu(unsigned long oldstate)  }  EXPORT_SYMBOL_GPL(cond_synchronize_rcu); -static int synchronize_sched_expedited_cpu_stop(void *data) +/** + * get_state_synchronize_sched - Snapshot current RCU-sched state + * + * Returns a cookie that is used by a later call to cond_synchronize_sched() + * to determine whether or not a full grace period has elapsed in the + * meantime. + */ +unsigned long get_state_synchronize_sched(void)  {  	/* -	 * There must be a full memory barrier on each affected CPU -	 * between the time that try_stop_cpus() is called and the -	 * time that it returns. -	 * -	 * In the current initial implementation of cpu_stop, the -	 * above condition is already met when the control reaches -	 * this point and the following smp_mb() is not strictly -	 * necessary.  Do smp_mb() anyway for documentation and -	 * robustness against future implementation changes. +	 * Any prior manipulation of RCU-protected data must happen +	 * before the load from ->gpnum. +	 */ +	smp_mb();  /* ^^^ */ + +	/* +	 * Make sure this load happens before the purportedly +	 * time-consuming work between get_state_synchronize_sched() +	 * and cond_synchronize_sched(). +	 */ +	return smp_load_acquire(&rcu_sched_state.gpnum); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_sched); + +/** + * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period + * + * @oldstate: return value from earlier call to get_state_synchronize_sched() + * + * If a full RCU-sched grace period has elapsed since the earlier call to + * get_state_synchronize_sched(), just return.  Otherwise, invoke + * synchronize_sched() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account.  But + * counter wrap is harmless.  If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for one additional grace period should be just fine. + */ +void cond_synchronize_sched(unsigned long oldstate) +{ +	unsigned long newstate; + +	/* +	 * Ensure that this load happens before any RCU-destructive +	 * actions the caller might carry out after we return.  	 */ -	smp_mb(); /* See above comment block. 
*/ +	newstate = smp_load_acquire(&rcu_sched_state.completed); +	if (ULONG_CMP_GE(oldstate, newstate)) +		synchronize_sched(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_sched); + +/* Adjust sequence number for start of update-side operation. */ +static void rcu_seq_start(unsigned long *sp) +{ +	WRITE_ONCE(*sp, *sp + 1); +	smp_mb(); /* Ensure update-side operation after counter increment. */ +	WARN_ON_ONCE(!(*sp & 0x1)); +} + +/* Adjust sequence number for end of update-side operation. */ +static void rcu_seq_end(unsigned long *sp) +{ +	smp_mb(); /* Ensure update-side operation before counter increment. */ +	WRITE_ONCE(*sp, *sp + 1); +	WARN_ON_ONCE(*sp & 0x1); +} + +/* Take a snapshot of the update side's sequence number. */ +static unsigned long rcu_seq_snap(unsigned long *sp) +{ +	unsigned long s; + +	smp_mb(); /* Caller's modifications seen first by other CPUs. */ +	s = (READ_ONCE(*sp) + 3) & ~0x1; +	smp_mb(); /* Above access must not bleed into critical section. */ +	return s; +} + +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred. + */ +static bool rcu_seq_done(unsigned long *sp, unsigned long s) +{ +	return ULONG_CMP_GE(READ_ONCE(*sp), s); +} + +/* Wrapper functions for expedited grace periods.  */ +static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +{ +	rcu_seq_start(&rsp->expedited_sequence); +} +static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +{ +	rcu_seq_end(&rsp->expedited_sequence); +	smp_mb(); /* Ensure that consecutive grace periods serialize. */ +} +static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +{ +	return rcu_seq_snap(&rsp->expedited_sequence); +} +static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +{ +	return rcu_seq_done(&rsp->expedited_sequence, s); +} + +/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ +static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, +			       struct rcu_data *rdp, +			       atomic_long_t *stat, unsigned long s) +{ +	if (rcu_exp_gp_seq_done(rsp, s)) { +		if (rnp) +			mutex_unlock(&rnp->exp_funnel_mutex); +		else if (rdp) +			mutex_unlock(&rdp->exp_funnel_mutex); +		/* Ensure test happens before caller kfree(). */ +		smp_mb__before_atomic(); /* ^^^ */ +		atomic_long_inc(stat); +		return true; +	} +	return false; +} + +/* + * Funnel-lock acquisition for expedited grace periods.  Returns a + * pointer to the root rcu_node structure, or NULL if some other + * task did the expedited grace period for us. + */ +static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +{ +	struct rcu_data *rdp; +	struct rcu_node *rnp0; +	struct rcu_node *rnp1 = NULL; + +	/* +	 * First try directly acquiring the root lock in order to reduce +	 * latency in the common case where expedited grace periods are +	 * rare.  We check mutex_is_locked() to avoid pathological levels of +	 * memory contention on ->exp_funnel_mutex in the heavy-load case. +	 */ +	rnp0 = rcu_get_root(rsp); +	if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { +		if (mutex_trylock(&rnp0->exp_funnel_mutex)) { +			if (sync_exp_work_done(rsp, rnp0, NULL, +					       &rsp->expedited_workdone0, s)) +				return NULL; +			return rnp0; +		} +	} + +	/* +	 * Each pass through the following loop works its way +	 * up the rcu_node tree, returning if others have done the +	 * work or otherwise falls through holding the root rnp's +	 * ->exp_funnel_mutex.  
The mapping from CPU to rcu_node structure +	 * can be inexact, as it is just promoting locality and is not +	 * strictly needed for correctness. +	 */ +	rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); +	if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) +		return NULL; +	mutex_lock(&rdp->exp_funnel_mutex); +	rnp0 = rdp->mynode; +	for (; rnp0 != NULL; rnp0 = rnp0->parent) { +		if (sync_exp_work_done(rsp, rnp1, rdp, +				       &rsp->expedited_workdone2, s)) +			return NULL; +		mutex_lock(&rnp0->exp_funnel_mutex); +		if (rnp1) +			mutex_unlock(&rnp1->exp_funnel_mutex); +		else +			mutex_unlock(&rdp->exp_funnel_mutex); +		rnp1 = rnp0; +	} +	if (sync_exp_work_done(rsp, rnp1, rdp, +			       &rsp->expedited_workdone3, s)) +		return NULL; +	return rnp1; +} + +/* Invoked on each online non-idle CPU for expedited quiescent state. */ +static int synchronize_sched_expedited_cpu_stop(void *data) +{ +	struct rcu_data *rdp = data; +	struct rcu_state *rsp = rdp->rsp; + +	/* We are here: If we are last, do the wakeup. */ +	rdp->exp_done = true; +	if (atomic_dec_and_test(&rsp->expedited_need_qs)) +		wake_up(&rsp->expedited_wq);  	return 0;  } +static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +{ +	int cpu; +	unsigned long jiffies_stall; +	unsigned long jiffies_start; +	struct rcu_data *rdp; +	int ret; + +	jiffies_stall = rcu_jiffies_till_stall_check(); +	jiffies_start = jiffies; + +	for (;;) { +		ret = wait_event_interruptible_timeout( +				rsp->expedited_wq, +				!atomic_read(&rsp->expedited_need_qs), +				jiffies_stall); +		if (ret > 0) +			return; +		if (ret < 0) { +			/* Hit a signal, disable CPU stall warnings. */ +			wait_event(rsp->expedited_wq, +				   !atomic_read(&rsp->expedited_need_qs)); +			return; +		} +		pr_err("INFO: %s detected expedited stalls on CPUs: {", +		       rsp->name); +		for_each_online_cpu(cpu) { +			rdp = per_cpu_ptr(rsp->rda, cpu); + +			if (rdp->exp_done) +				continue; +			pr_cont(" %d", cpu); +		} +		pr_cont(" } %lu jiffies s: %lu\n", +			jiffies - jiffies_start, rsp->expedited_sequence); +		for_each_online_cpu(cpu) { +			rdp = per_cpu_ptr(rsp->rda, cpu); + +			if (rdp->exp_done) +				continue; +			dump_cpu_task(cpu); +		} +		jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; +	} +} +  /**   * synchronize_sched_expedited - Brute-force RCU-sched grace period   * @@ -3281,58 +3524,21 @@ static int synchronize_sched_expedited_cpu_stop(void *data)   * restructure your code to batch your updates, and then use a single   * synchronize_sched() instead.   * - * This implementation can be thought of as an application of ticket - * locking to RCU, with sync_sched_expedited_started and - * sync_sched_expedited_done taking on the roles of the halves - * of the ticket-lock word.  Each task atomically increments - * sync_sched_expedited_started upon entry, snapshotting the old value, - * then attempts to stop all the CPUs.  If this succeeds, then each - * CPU will have executed a context switch, resulting in an RCU-sched - * grace period.  We are then done, so we use atomic_cmpxchg() to - * update sync_sched_expedited_done to match our snapshot -- but - * only if someone else has not already advanced past our snapshot. - * - * On the other hand, if try_stop_cpus() fails, we check the value - * of sync_sched_expedited_done.  If it has advanced past our - * initial snapshot, then someone else must have forced a grace period - * some time after we took our snapshot.  In this case, our work is - * done for us, and we can simply return.  
Otherwise, we try again, - * but keep our initial snapshot for purposes of checking for someone - * doing our work for us. - * - * If we fail too many times in a row, we fall back to synchronize_sched(). + * This implementation can be thought of as an application of sequence + * locking to expedited grace periods, but using the sequence counter to + * determine when someone else has already done the work instead of for + * retrying readers.   */  void synchronize_sched_expedited(void)  { -	cpumask_var_t cm; -	bool cma = false;  	int cpu; -	long firstsnap, s, snap; -	int trycount = 0; +	unsigned long s; +	struct rcu_node *rnp;  	struct rcu_state *rsp = &rcu_sched_state; -	/* -	 * If we are in danger of counter wrap, just do synchronize_sched(). -	 * By allowing sync_sched_expedited_started to advance no more than -	 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring -	 * that more than 3.5 billion CPUs would be required to force a -	 * counter wrap on a 32-bit system.  Quite a few more CPUs would of -	 * course be required on a 64-bit system. -	 */ -	if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), -			 (ulong)atomic_long_read(&rsp->expedited_done) + -			 ULONG_MAX / 8)) { -		wait_rcu_gp(call_rcu_sched); -		atomic_long_inc(&rsp->expedited_wrap); -		return; -	} +	/* Take a snapshot of the sequence number.  */ +	s = rcu_exp_gp_seq_snap(rsp); -	/* -	 * Take a ticket.  Note that atomic_inc_return() implies a -	 * full memory barrier. -	 */ -	snap = atomic_long_inc_return(&rsp->expedited_start); -	firstsnap = snap;  	if (!try_get_online_cpus()) {  		/* CPU hotplug operation in flight, fall back to normal GP. */  		wait_rcu_gp(call_rcu_sched); @@ -3341,100 +3547,38 @@ void synchronize_sched_expedited(void)  	}  	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); -	/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ -	cma = zalloc_cpumask_var(&cm, GFP_KERNEL); -	if (cma) { -		cpumask_copy(cm, cpu_online_mask); -		cpumask_clear_cpu(raw_smp_processor_id(), cm); -		for_each_cpu(cpu, cm) { -			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - -			if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1)) -				cpumask_clear_cpu(cpu, cm); -		} -		if (cpumask_weight(cm) == 0) -			goto all_cpus_idle; +	rnp = exp_funnel_lock(rsp, s); +	if (rnp == NULL) { +		put_online_cpus(); +		return;  /* Someone else did our work for us. */  	} -	/* -	 * Each pass through the following loop attempts to force a -	 * context switch on each CPU. -	 */ -	while (try_stop_cpus(cma ? cm : cpu_online_mask, -			     synchronize_sched_expedited_cpu_stop, -			     NULL) == -EAGAIN) { -		put_online_cpus(); -		atomic_long_inc(&rsp->expedited_tryfail); - -		/* Check to see if someone else did our work for us. */ -		s = atomic_long_read(&rsp->expedited_done); -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { -			/* ensure test happens before caller kfree */ -			smp_mb__before_atomic(); /* ^^^ */ -			atomic_long_inc(&rsp->expedited_workdone1); -			free_cpumask_var(cm); -			return; -		} +	rcu_exp_gp_seq_start(rsp); -		/* No joy, try again later.  Or just synchronize_sched(). */ -		if (trycount++ < 10) { -			udelay(trycount * num_online_cpus()); -		} else { -			wait_rcu_gp(call_rcu_sched); -			atomic_long_inc(&rsp->expedited_normal); -			free_cpumask_var(cm); -			return; -		} +	/* Stop each CPU that is online, non-idle, and not us. */ +	init_waitqueue_head(&rsp->expedited_wq); +	atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. 
*/ +	for_each_online_cpu(cpu) { +		struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); +		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); -		/* Recheck to see if someone else did our work for us. */ -		s = atomic_long_read(&rsp->expedited_done); -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { -			/* ensure test happens before caller kfree */ -			smp_mb__before_atomic(); /* ^^^ */ -			atomic_long_inc(&rsp->expedited_workdone2); -			free_cpumask_var(cm); -			return; -		} +		rdp->exp_done = false; -		/* -		 * Refetching sync_sched_expedited_started allows later -		 * callers to piggyback on our grace period.  We retry -		 * after they started, so our grace period works for them, -		 * and they started after our first try, so their grace -		 * period works for us. -		 */ -		if (!try_get_online_cpus()) { -			/* CPU hotplug operation in flight, use normal GP. */ -			wait_rcu_gp(call_rcu_sched); -			atomic_long_inc(&rsp->expedited_normal); -			free_cpumask_var(cm); -			return; -		} -		snap = atomic_long_read(&rsp->expedited_start); -		smp_mb(); /* ensure read is before try_stop_cpus(). */ +		/* Skip our CPU and any idle CPUs. */ +		if (raw_smp_processor_id() == cpu || +		    !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) +			continue; +		atomic_inc(&rsp->expedited_need_qs); +		stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, +				    rdp, &rdp->exp_stop_work);  	} -	atomic_long_inc(&rsp->expedited_stoppedcpus); -all_cpus_idle: -	free_cpumask_var(cm); +	/* Remove extra count and, if necessary, wait for CPUs to stop. */ +	if (!atomic_dec_and_test(&rsp->expedited_need_qs)) +		synchronize_sched_expedited_wait(rsp); -	/* -	 * Everyone up to our most recent fetch is covered by our grace -	 * period.  Update the counter, but only if our work is still -	 * relevant -- which it won't be if someone who started later -	 * than we did already did their update. 
-	 */ -	do { -		atomic_long_inc(&rsp->expedited_done_tries); -		s = atomic_long_read(&rsp->expedited_done); -		if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { -			/* ensure test happens before caller kfree */ -			smp_mb__before_atomic(); /* ^^^ */ -			atomic_long_inc(&rsp->expedited_done_lost); -			break; -		} -	} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); -	atomic_long_inc(&rsp->expedited_done_exit); +	rcu_exp_gp_seq_end(rsp); +	mutex_unlock(&rnp->exp_funnel_mutex);  	put_online_cpus();  } @@ -3571,10 +3715,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp)  	struct rcu_state *rsp = rdp->rsp;  	if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { -		_rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); +		_rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);  		complete(&rsp->barrier_completion);  	} else { -		_rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); +		_rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);  	}  } @@ -3586,7 +3730,7 @@ static void rcu_barrier_func(void *type)  	struct rcu_state *rsp = type;  	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); -	_rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); +	_rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);  	atomic_inc(&rsp->barrier_cpu_count);  	rsp->call(&rdp->barrier_head, rcu_barrier_callback);  } @@ -3599,55 +3743,24 @@ static void _rcu_barrier(struct rcu_state *rsp)  {  	int cpu;  	struct rcu_data *rdp; -	unsigned long snap = READ_ONCE(rsp->n_barrier_done); -	unsigned long snap_done; +	unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); -	_rcu_barrier_trace(rsp, "Begin", -1, snap); +	_rcu_barrier_trace(rsp, "Begin", -1, s);  	/* Take mutex to serialize concurrent rcu_barrier() requests. */  	mutex_lock(&rsp->barrier_mutex); -	/* -	 * Ensure that all prior references, including to ->n_barrier_done, -	 * are ordered before the _rcu_barrier() machinery. -	 */ -	smp_mb();  /* See above block comment. */ - -	/* -	 * Recheck ->n_barrier_done to see if others did our work for us. -	 * This means checking ->n_barrier_done for an even-to-odd-to-even -	 * transition.  The "if" expression below therefore rounds the old -	 * value up to the next even number and adds two before comparing. -	 */ -	snap_done = rsp->n_barrier_done; -	_rcu_barrier_trace(rsp, "Check", -1, snap_done); - -	/* -	 * If the value in snap is odd, we needed to wait for the current -	 * rcu_barrier() to complete, then wait for the next one, in other -	 * words, we need the value of snap_done to be three larger than -	 * the value of snap.  On the other hand, if the value in snap is -	 * even, we only had to wait for the next rcu_barrier() to complete, -	 * in other words, we need the value of snap_done to be only two -	 * greater than the value of snap.  The "(snap + 3) & ~0x1" computes -	 * this for us (thank you, Linus!). -	 */ -	if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { -		_rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); +	/* Did someone else do our work for us? */ +	if (rcu_seq_done(&rsp->barrier_sequence, s)) { +		_rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);  		smp_mb(); /* caller's subsequent code after above check. */  		mutex_unlock(&rsp->barrier_mutex);  		return;  	} -	/* -	 * Increment ->n_barrier_done to avoid duplicate work.  Use -	 * WRITE_ONCE() to prevent the compiler from speculating -	 * the increment to precede the early-exit check. 
-	 */ -	WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); -	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); -	_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); -	smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ +	/* Mark the start of the barrier operation. */ +	rcu_seq_start(&rsp->barrier_sequence); +	_rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);  	/*  	 * Initialize the count to one rather than to zero in order to @@ -3671,10 +3784,10 @@ static void _rcu_barrier(struct rcu_state *rsp)  		if (rcu_is_nocb_cpu(cpu)) {  			if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {  				_rcu_barrier_trace(rsp, "OfflineNoCB", cpu, -						   rsp->n_barrier_done); +						   rsp->barrier_sequence);  			} else {  				_rcu_barrier_trace(rsp, "OnlineNoCB", cpu, -						   rsp->n_barrier_done); +						   rsp->barrier_sequence);  				smp_mb__before_atomic();  				atomic_inc(&rsp->barrier_cpu_count);  				__call_rcu(&rdp->barrier_head, @@ -3682,11 +3795,11 @@ static void _rcu_barrier(struct rcu_state *rsp)  			}  		} else if (READ_ONCE(rdp->qlen)) {  			_rcu_barrier_trace(rsp, "OnlineQ", cpu, -					   rsp->n_barrier_done); +					   rsp->barrier_sequence);  			smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);  		} else {  			_rcu_barrier_trace(rsp, "OnlineNQ", cpu, -					   rsp->n_barrier_done); +					   rsp->barrier_sequence);  		}  	}  	put_online_cpus(); @@ -3698,16 +3811,13 @@ static void _rcu_barrier(struct rcu_state *rsp)  	if (atomic_dec_and_test(&rsp->barrier_cpu_count))  		complete(&rsp->barrier_completion); -	/* Increment ->n_barrier_done to prevent duplicate work. */ -	smp_mb(); /* Keep increment after above mechanism. */ -	WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); -	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); -	_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); -	smp_mb(); /* Keep increment before caller's subsequent code. */ -  	/* Wait for all rcu_barrier_callback() callbacks to be invoked. */  	wait_for_completion(&rsp->barrier_completion); +	/* Mark the end of the barrier operation. */ +	_rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); +	rcu_seq_end(&rsp->barrier_sequence); +  	/* Other rcu_barrier() invocations can now safely proceed. */  	mutex_unlock(&rsp->barrier_mutex);  } @@ -3770,6 +3880,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)  	WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);  	rdp->cpu = cpu;  	rdp->rsp = rsp; +	mutex_init(&rdp->exp_funnel_mutex);  	rcu_boot_init_nocb_percpu_data(rdp);  	raw_spin_unlock_irqrestore(&rnp->lock, flags);  } @@ -3961,22 +4072,22 @@ void rcu_scheduler_starting(void)   * Compute the per-level fanout, either using the exact fanout specified   * or balancing the tree, depending on the rcu_fanout_exact boot parameter.   
*/ -static void __init rcu_init_levelspread(struct rcu_state *rsp) +static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)  {  	int i;  	if (rcu_fanout_exact) { -		rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; +		levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;  		for (i = rcu_num_lvls - 2; i >= 0; i--) -			rsp->levelspread[i] = RCU_FANOUT; +			levelspread[i] = RCU_FANOUT;  	} else {  		int ccur;  		int cprv;  		cprv = nr_cpu_ids;  		for (i = rcu_num_lvls - 1; i >= 0; i--) { -			ccur = rsp->levelcnt[i]; -			rsp->levelspread[i] = (cprv + ccur - 1) / ccur; +			ccur = levelcnt[i]; +			levelspread[i] = (cprv + ccur - 1) / ccur;  			cprv = ccur;  		}  	} @@ -3988,23 +4099,20 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)  static void __init rcu_init_one(struct rcu_state *rsp,  		struct rcu_data __percpu *rda)  { -	static const char * const buf[] = { -		"rcu_node_0", -		"rcu_node_1", -		"rcu_node_2", -		"rcu_node_3" };  /* Match MAX_RCU_LVLS */ -	static const char * const fqs[] = { -		"rcu_node_fqs_0", -		"rcu_node_fqs_1", -		"rcu_node_fqs_2", -		"rcu_node_fqs_3" };  /* Match MAX_RCU_LVLS */ +	static const char * const buf[] = RCU_NODE_NAME_INIT; +	static const char * const fqs[] = RCU_FQS_NAME_INIT; +	static const char * const exp[] = RCU_EXP_NAME_INIT; +	static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT;  	static u8 fl_mask = 0x1; + +	int levelcnt[RCU_NUM_LVLS];		/* # nodes in each level. */ +	int levelspread[RCU_NUM_LVLS];		/* kids/node in each level. */  	int cpustride = 1;  	int i;  	int j;  	struct rcu_node *rnp; -	BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */ +	BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */  	/* Silence gcc 4.8 false positive about array index out of range. */  	if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) @@ -4013,19 +4121,19 @@ static void __init rcu_init_one(struct rcu_state *rsp,  	/* Initialize the level-tracking arrays. */  	for (i = 0; i < rcu_num_lvls; i++) -		rsp->levelcnt[i] = num_rcu_lvl[i]; +		levelcnt[i] = num_rcu_lvl[i];  	for (i = 1; i < rcu_num_lvls; i++) -		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; -	rcu_init_levelspread(rsp); +		rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; +	rcu_init_levelspread(levelspread, levelcnt);  	rsp->flavor_mask = fl_mask;  	fl_mask <<= 1;  	/* Initialize the elements themselves, starting from the leaves. 
*/  	for (i = rcu_num_lvls - 1; i >= 0; i--) { -		cpustride *= rsp->levelspread[i]; +		cpustride *= levelspread[i];  		rnp = rsp->level[i]; -		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { +		for (j = 0; j < levelcnt[i]; j++, rnp++) {  			raw_spin_lock_init(&rnp->lock);  			lockdep_set_class_and_name(&rnp->lock,  						   &rcu_node_class[i], buf[i]); @@ -4045,14 +4153,23 @@ static void __init rcu_init_one(struct rcu_state *rsp,  				rnp->grpmask = 0;  				rnp->parent = NULL;  			} else { -				rnp->grpnum = j % rsp->levelspread[i - 1]; +				rnp->grpnum = j % levelspread[i - 1];  				rnp->grpmask = 1UL << rnp->grpnum;  				rnp->parent = rsp->level[i - 1] + -					      j / rsp->levelspread[i - 1]; +					      j / levelspread[i - 1];  			}  			rnp->level = i;  			INIT_LIST_HEAD(&rnp->blkd_tasks);  			rcu_init_one_nocb(rnp); +			mutex_init(&rnp->exp_funnel_mutex); +			if (rsp == &rcu_sched_state) +				lockdep_set_class_and_name( +					&rnp->exp_funnel_mutex, +					&rcu_exp_sched_class[i], exp_sched[i]); +			else +				lockdep_set_class_and_name( +					&rnp->exp_funnel_mutex, +					&rcu_exp_class[i], exp[i]);  		}  	} @@ -4076,9 +4193,7 @@ static void __init rcu_init_geometry(void)  {  	ulong d;  	int i; -	int j; -	int n = nr_cpu_ids; -	int rcu_capacity[MAX_RCU_LVLS + 1]; +	int rcu_capacity[RCU_NUM_LVLS];  	/*  	 * Initialize any unspecified boot parameters. @@ -4101,47 +4216,49 @@ static void __init rcu_init_geometry(void)  		rcu_fanout_leaf, nr_cpu_ids);  	/* -	 * Compute number of nodes that can be handled an rcu_node tree -	 * with the given number of levels.  Setting rcu_capacity[0] makes -	 * some of the arithmetic easier. -	 */ -	rcu_capacity[0] = 1; -	rcu_capacity[1] = rcu_fanout_leaf; -	for (i = 2; i <= MAX_RCU_LVLS; i++) -		rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; - -	/*  	 * The boot-time rcu_fanout_leaf parameter is only permitted  	 * to increase the leaf-level fanout, not decrease it.  Of course,  	 * the leaf-level fanout cannot exceed the number of bits in -	 * the rcu_node masks.  Finally, the tree must be able to accommodate -	 * the configured number of CPUs.  Complain and fall back to the -	 * compile-time values if these limits are exceeded. +	 * the rcu_node masks.  Complain and fall back to the compile- +	 * time values if these limits are exceeded.  	 */  	if (rcu_fanout_leaf < RCU_FANOUT_LEAF || -	    rcu_fanout_leaf > sizeof(unsigned long) * 8 || -	    n > rcu_capacity[MAX_RCU_LVLS]) { +	    rcu_fanout_leaf > sizeof(unsigned long) * 8) { +		rcu_fanout_leaf = RCU_FANOUT_LEAF;  		WARN_ON(1);  		return;  	} +	/* +	 * Compute number of nodes that can be handled an rcu_node tree +	 * with the given number of levels. +	 */ +	rcu_capacity[0] = rcu_fanout_leaf; +	for (i = 1; i < RCU_NUM_LVLS; i++) +		rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; + +	/* +	 * The tree must be able to accommodate the configured number of CPUs. +	 * If this limit is exceeded than we have a serious problem elsewhere. +	 */ +	if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) +		panic("rcu_init_geometry: rcu_capacity[] is too small"); + +	/* Calculate the number of levels in the tree. */ +	for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { +	} +	rcu_num_lvls = i + 1; +  	/* Calculate the number of rcu_nodes at each level of the tree. 
*/ -	for (i = 1; i <= MAX_RCU_LVLS; i++) -		if (n <= rcu_capacity[i]) { -			for (j = 0; j <= i; j++) -				num_rcu_lvl[j] = -					DIV_ROUND_UP(n, rcu_capacity[i - j]); -			rcu_num_lvls = i; -			for (j = i + 1; j <= MAX_RCU_LVLS; j++) -				num_rcu_lvl[j] = 0; -			break; -		} +	for (i = 0; i < rcu_num_lvls; i++) { +		int cap = rcu_capacity[(rcu_num_lvls - 1) - i]; +		num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap); +	}  	/* Calculate the total number of rcu_node structures. */  	rcu_num_nodes = 0; -	for (i = 0; i <= MAX_RCU_LVLS; i++) +	for (i = 0; i < rcu_num_lvls; i++)  		rcu_num_nodes += num_rcu_lvl[i]; -	rcu_num_nodes -= n;  }  /*  |
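Note on the sequence-counter scheme introduced above (rcu_seq_start(), rcu_seq_end(), rcu_seq_snap(), rcu_seq_done(), and their rcu_exp_gp_seq_*() and ->barrier_sequence users): the counter is odd while an update-side operation is in flight and even otherwise, and rcu_seq_snap() returns the value the counter must reach before a full operation has elapsed after the snapshot. What follows is a minimal userspace sketch of just that counter arithmetic, not kernel code: the kernel's memory barriers and READ_ONCE()/WRITE_ONCE() annotations are omitted, the seq_*() names are illustrative, and ULONG_CMP_GE() is approximated with a wrap-tolerant signed comparison.

/*
 * Minimal userspace sketch (not kernel code) of the counter arithmetic
 * behind rcu_seq_start/end/snap/done in the diff above.  Odd values mean
 * an update-side operation is in progress, even values mean none is.
 * The kernel's memory barriers are deliberately omitted here.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static void seq_start(unsigned long *sp)
{
	(*sp)++;				/* now odd: operation running */
	assert(*sp & 0x1);
}

static void seq_end(unsigned long *sp)
{
	(*sp)++;				/* now even: operation done */
	assert(!(*sp & 0x1));
}

/* Value the counter must reach before a full operation has elapsed. */
static unsigned long seq_snap(unsigned long *sp)
{
	return (*sp + 3) & ~0x1UL;
}

/* Wrap-tolerant "has the counter reached s?", like ULONG_CMP_GE(). */
static bool seq_done(unsigned long *sp, unsigned long s)
{
	return (long)(*sp - s) >= 0;
}

int main(void)
{
	unsigned long seq = 0;	/* stands in for ->expedited_sequence */
	unsigned long s;

	s = seq_snap(&seq);	/* idle snapshot: need seq to reach 2 */
	assert(!seq_done(&seq, s));
	seq_start(&seq);	/* some other task starts an operation... */
	assert(!seq_done(&seq, s));
	seq_end(&seq);		/* ...and finishes it: our request is covered */
	assert(seq_done(&seq, s));

	seq_start(&seq);	/* operation already in flight (seq == 3) */
	s = seq_snap(&seq);	/* must wait for it AND one more (seq == 6) */
	seq_end(&seq);
	assert(!seq_done(&seq, s));	/* the in-flight one was not enough */
	seq_start(&seq);
	seq_end(&seq);
	assert(seq_done(&seq, s));	/* a full later operation has elapsed */
	printf("rcu_seq counter sketch OK\n");
	return 0;
}

In the diff above, this one scheme replaces the old try_stop_cpus()-based ticket protocol in synchronize_sched_expedited() (via ->expedited_sequence) and the hand-rolled ->n_barrier_done counting in _rcu_barrier() (via ->barrier_sequence); exp_funnel_lock() additionally re-checks rcu_exp_gp_seq_done() at every level of the rcu_node tree so a task can drop out as soon as some other task's expedited grace period covers its snapshot.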