Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r--	kernel/rcu/tree.c	324
1 file changed, 272 insertions, 52 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1694a6b57ad8..d91c9156fab2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -43,7 +43,6 @@
 #include <uapi/linux/sched/types.h>
 #include <linux/prefetch.h>
 #include <linux/delay.h>
-#include <linux/stop_machine.h>
 #include <linux/random.h>
 #include <linux/trace_events.h>
 #include <linux/suspend.h>
@@ -55,6 +54,7 @@
 #include <linux/oom.h>
 #include <linux/smpboot.h>
 #include <linux/jiffies.h>
+#include <linux/slab.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/clock.h>
 #include "../time/tick-internal.h"
@@ -84,7 +84,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
 	.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
 	.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
 };
-struct rcu_state rcu_state = {
+static struct rcu_state rcu_state = {
 	.level = { &rcu_state.node[0] },
 	.gp_state = RCU_GP_IDLE,
 	.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
@@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
  * held, but the bit corresponding to the current CPU will be stable
  * in most contexts.
  */
-unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
 {
 	return READ_ONCE(rnp->qsmaskinitnext);
 }
@@ -294,7 +294,7 @@ static void rcu_dynticks_eqs_online(void)
  *
  * No ordering, as we are sampling CPU-local information.
  */
-bool rcu_dynticks_curr_cpu_in_eqs(void)
+static bool rcu_dynticks_curr_cpu_in_eqs(void)
 {
 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
@@ -305,7 +305,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
  * Snapshot the ->dynticks counter with full ordering so as to allow
  * stable comparison of this counter with past and future snapshots.
  */
-int rcu_dynticks_snap(struct rcu_data *rdp)
+static int rcu_dynticks_snap(struct rcu_data *rdp)
 {
 	int snap = atomic_add_return(0, &rdp->dynticks);
 
@@ -529,16 +529,6 @@ static struct rcu_node *rcu_get_root(void)
 }
 
 /*
- * Convert a ->gp_state value to a character string.
- */
-static const char *gp_state_getname(short gs)
-{
-	if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
-		return "???";
-	return gp_state_names[gs];
-}
-
-/*
  * Send along grace-period-related data for rcutorture diagnostics.
  */
 void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
@@ -577,7 +567,7 @@ static void rcu_eqs_enter(bool user)
 	}
 
 	lockdep_assert_irqs_disabled();
-	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks);
+	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	rdp = this_cpu_ptr(&rcu_data);
 	do_nocb_deferred_wakeup(rdp);
@@ -650,14 +640,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
 	 * leave it in non-RCU-idle state.
 	 */
 	if (rdp->dynticks_nmi_nesting != 1) {
-		trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
+		trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
+				  atomic_read(&rdp->dynticks));
 		WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
 			   rdp->dynticks_nmi_nesting - 2);
 		return;
 	}
 
 	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
-	trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks);
+	trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
 	WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
 
 	if (irq)
@@ -744,7 +735,7 @@ static void rcu_eqs_exit(bool user)
 	rcu_dynticks_task_exit();
 	rcu_dynticks_eqs_exit();
 	rcu_cleanup_after_idle();
-	trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks);
+	trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	WRITE_ONCE(rdp->dynticks_nesting, 1);
 	WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
@@ -800,8 +791,8 @@ void rcu_user_exit(void)
  */
 static __always_inline void rcu_nmi_enter_common(bool irq)
 {
-	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 	long incby = 2;
+	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
 	/* Complain about underflow. */
 	WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
@@ -828,12 +819,17 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
 	} else if (tick_nohz_full_cpu(rdp->cpu) &&
 		   rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE &&
 		   READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
-		rdp->rcu_forced_tick = true;
-		tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
+		raw_spin_lock_rcu_node(rdp->mynode);
+		// Recheck under lock.
+		if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
+			rdp->rcu_forced_tick = true;
+			tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
+		}
+		raw_spin_unlock_rcu_node(rdp->mynode);
 	}
 	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
 			  rdp->dynticks_nmi_nesting,
-			  rdp->dynticks_nmi_nesting + incby, rdp->dynticks);
+			  rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
 	WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
 		   rdp->dynticks_nmi_nesting + incby);
 	barrier();
@@ -898,6 +894,7 @@ void rcu_irq_enter_irqson(void)
  */
 static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
 {
+	raw_lockdep_assert_held_rcu_node(rdp->mynode);
 	WRITE_ONCE(rdp->rcu_urgent_qs, false);
 	WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
 	if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
@@ -1934,7 +1931,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 	struct rcu_node *rnp_p;
 
 	raw_lockdep_assert_held_rcu_node(rnp);
-	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
+	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
 	    WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
 	    rnp->qsmask != 0) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2146,7 +2143,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	/* If no callbacks are ready, just return. */
 	if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
 		trace_rcu_batch_start(rcu_state.name,
-				      rcu_segcblist_n_lazy_cbs(&rdp->cblist),
 				      rcu_segcblist_n_cbs(&rdp->cblist), 0);
 		trace_rcu_batch_end(rcu_state.name, 0,
 				    !rcu_segcblist_empty(&rdp->cblist),
@@ -2168,7 +2164,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	if (unlikely(bl > 100))
 		tlimit = local_clock() + rcu_resched_ns;
 	trace_rcu_batch_start(rcu_state.name,
-			      rcu_segcblist_n_lazy_cbs(&rdp->cblist),
 			      rcu_segcblist_n_cbs(&rdp->cblist), bl);
 	rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
 	if (offloaded)
@@ -2179,9 +2174,19 @@
 	tick_dep_set_task(current, TICK_DEP_BIT_RCU);
 	rhp = rcu_cblist_dequeue(&rcl);
 	for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+		rcu_callback_t f;
+
 		debug_rcu_head_unqueue(rhp);
-		if (__rcu_reclaim(rcu_state.name, rhp))
-			rcu_cblist_dequeued_lazy(&rcl);
+
+		rcu_lock_acquire(&rcu_callback_map);
+		trace_rcu_invoke_callback(rcu_state.name, rhp);
+
+		f = rhp->func;
+		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
+		f(rhp);
+
+		rcu_lock_release(&rcu_callback_map);
+
 		/*
 		 * Stop only if limit reached and CPU has something to do.
 		 * Note: The rcl structure counts down from zero.
@@ -2294,7 +2299,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
 		mask = 0;
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		if (rnp->qsmask == 0) {
-			if (!IS_ENABLED(CONFIG_PREEMPTION) ||
+			if (!IS_ENABLED(CONFIG_PREEMPT_RCU) ||
 			    rcu_preempt_blocked_readers_cgp(rnp)) {
 				/*
 				 * No point in scanning bits because they
@@ -2308,14 +2313,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 			continue;
 		}
-		for_each_leaf_node_possible_cpu(rnp, cpu) {
-			unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
-			if ((rnp->qsmask & bit) != 0) {
-				rdp = per_cpu_ptr(&rcu_data, cpu);
-				if (f(rdp)) {
-					mask |= bit;
-					rcu_disable_urgency_upon_qs(rdp);
-				}
+		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
+			rdp = per_cpu_ptr(&rcu_data, cpu);
+			if (f(rdp)) {
+				mask |= rdp->grpmask;
+				rcu_disable_urgency_upon_qs(rdp);
 			}
 		}
 		if (mask != 0) {
@@ -2474,8 +2476,8 @@ static void rcu_cpu_kthread(unsigned int cpu)
 	char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
 	int spincnt;
 
+	trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
 	for (spincnt = 0; spincnt < 10; spincnt++) {
-		trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
 		local_bh_disable();
 		*statusp = RCU_KTHREAD_RUNNING;
 		local_irq_disable();
@@ -2583,7 +2585,7 @@ static void rcu_leak_callback(struct rcu_head *rhp)
  * is expected to specify a CPU.
  */
 static void
-__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
+__call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	unsigned long flags;
 	struct rcu_data *rdp;
@@ -2618,18 +2620,17 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
 		if (rcu_segcblist_empty(&rdp->cblist))
 			rcu_segcblist_init(&rdp->cblist);
 	}
+
 	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
 		return; // Enqueued onto ->nocb_bypass, so just leave.
 	/* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
-	rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
+	rcu_segcblist_enqueue(&rdp->cblist, head);
 	if (__is_kfree_rcu_offset((unsigned long)func))
 		trace_rcu_kfree_callback(rcu_state.name, head,
 					 (unsigned long)func,
-					 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
 					 rcu_segcblist_n_cbs(&rdp->cblist));
 	else
 		trace_rcu_callback(rcu_state.name, head,
-				   rcu_segcblist_n_lazy_cbs(&rdp->cblist),
 				   rcu_segcblist_n_cbs(&rdp->cblist));
 
 	/* Go handle any RCU core processing required. */
@@ -2679,28 +2680,230 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
  */
 void call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
-	__call_rcu(head, func, 0);
+	__call_rcu(head, func);
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
+
+/* Maximum number of jiffies to wait before draining a batch. */
+#define KFREE_DRAIN_JIFFIES (HZ / 50)
+#define KFREE_N_BATCHES 2
+
+/**
+ * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
+ * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
+ * @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @krcp: Pointer to @kfree_rcu_cpu structure
+ */
+
+struct kfree_rcu_cpu_work {
+	struct rcu_work rcu_work;
+	struct rcu_head *head_free;
+	struct kfree_rcu_cpu *krcp;
+};
+
+/**
+ * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
+ * @head: List of kfree_rcu() objects not yet waiting for a grace period
+ * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
+ * @lock: Synchronize access to this structure
+ * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
+ * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
+ * @initialized: The @lock and @rcu_work fields have been initialized
+ *
+ * This is a per-CPU structure.  The reason that it is not included in
+ * the rcu_data structure is to permit this code to be extracted from
+ * the RCU files.  Such extraction could allow further optimization of
+ * the interactions with the slab allocators.
+ */
+struct kfree_rcu_cpu {
+	struct rcu_head *head;
+	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
+	spinlock_t lock;
+	struct delayed_work monitor_work;
+	bool monitor_todo;
+	bool initialized;
+};
+
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+
+/*
+ * This function is invoked in workqueue context after a grace period.
+ * It frees all the objects queued on ->head_free.
+ */
+static void kfree_rcu_work(struct work_struct *work)
+{
+	unsigned long flags;
+	struct rcu_head *head, *next;
+	struct kfree_rcu_cpu *krcp;
+	struct kfree_rcu_cpu_work *krwp;
+
+	krwp = container_of(to_rcu_work(work),
+			    struct kfree_rcu_cpu_work, rcu_work);
+	krcp = krwp->krcp;
+	spin_lock_irqsave(&krcp->lock, flags);
+	head = krwp->head_free;
+	krwp->head_free = NULL;
+	spin_unlock_irqrestore(&krcp->lock, flags);
+
+	// List "head" is now private, so traverse locklessly.
+	for (; head; head = next) {
+		unsigned long offset = (unsigned long)head->func;
+
+		next = head->next;
+		// Potentially optimize with kfree_bulk in future.
+		debug_rcu_head_unqueue(head);
+		rcu_lock_acquire(&rcu_callback_map);
+		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
+
+		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
+			/* Could be optimized with kfree_bulk() in future. */
+			kfree((void *)head - offset);
+		}
+
+		rcu_lock_release(&rcu_callback_map);
+		cond_resched_tasks_rcu_qs();
+	}
+}
+
 /*
- * Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
+ * Schedule the kfree batch RCU work to run in workqueue context after a GP.
+ *
+ * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
+ * timeout has been reached.
+ */
+static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
+{
+	int i;
+	struct kfree_rcu_cpu_work *krwp = NULL;
+
+	lockdep_assert_held(&krcp->lock);
+	for (i = 0; i < KFREE_N_BATCHES; i++)
+		if (!krcp->krw_arr[i].head_free) {
+			krwp = &(krcp->krw_arr[i]);
+			break;
+		}
+
+	// If a previous RCU batch is in progress, we cannot immediately
+	// queue another one, so return false to tell caller to retry.
+	if (!krwp)
+		return false;
+
+	krwp->head_free = krcp->head;
+	krcp->head = NULL;
+	INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
+	queue_rcu_work(system_wq, &krwp->rcu_work);
+	return true;
+}
+
+static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
+					  unsigned long flags)
+{
+	// Attempt to start a new batch.
+	krcp->monitor_todo = false;
+	if (queue_kfree_rcu_work(krcp)) {
+		// Success! Our job is done here.
+		spin_unlock_irqrestore(&krcp->lock, flags);
+		return;
+	}
+
+	// Previous RCU batch still in progress, try again later.
+	krcp->monitor_todo = true;
+	schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+	spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+/*
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
+ */
+static void kfree_rcu_monitor(struct work_struct *work)
+{
+	unsigned long flags;
+	struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
+						 monitor_work.work);
+
+	spin_lock_irqsave(&krcp->lock, flags);
+	if (krcp->monitor_todo)
+		kfree_rcu_drain_unlock(krcp, flags);
+	else
+		spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+/*
+ * Queue a request for lazy invocation of kfree() after a grace period.
+ *
+ * Each kfree_call_rcu() request is added to a batch. The batch will be drained
+ * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch
+ * will be kfree'd in workqueue context. This allows us to:
+ *
+ * 1.	Batch requests together to reduce the number of grace periods during
+ *	heavy kfree_rcu() load.
+ *
+ * 2.	It makes it possible to use kfree_bulk() on a large number of
+ *	kfree_rcu() requests thus reducing cache misses and the per-object
+ *	overhead of kfree().
  */
 void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
-	__call_rcu(head, func, 1);
+	unsigned long flags;
+	struct kfree_rcu_cpu *krcp;
+
+	local_irq_save(flags);	// For safely calling this_cpu_ptr().
+	krcp = this_cpu_ptr(&krc);
+	if (krcp->initialized)
+		spin_lock(&krcp->lock);
+
+	// Queue the object but don't yet schedule the batch.
+	if (debug_rcu_head_queue(head)) {
+		// Probable double kfree_rcu(), just leak.
+		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
+			  __func__, head);
+		goto unlock_return;
+	}
+	head->func = func;
+	head->next = krcp->head;
+	krcp->head = head;
+
+	// Set timer to drain after KFREE_DRAIN_JIFFIES.
+	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+	    !krcp->monitor_todo) {
+		krcp->monitor_todo = true;
+		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+	}
+
+unlock_return:
+	if (krcp->initialized)
+		spin_unlock(&krcp->lock);
+	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(kfree_call_rcu);
 
+void __init kfree_rcu_scheduler_running(void)
+{
+	int cpu;
+	unsigned long flags;
+
+	for_each_online_cpu(cpu) {
+		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+		spin_lock_irqsave(&krcp->lock, flags);
+		if (!krcp->head || krcp->monitor_todo) {
+			spin_unlock_irqrestore(&krcp->lock, flags);
+			continue;
+		}
+		krcp->monitor_todo = true;
+		schedule_delayed_work_on(cpu, &krcp->monitor_work,
+					 KFREE_DRAIN_JIFFIES);
+		spin_unlock_irqrestore(&krcp->lock, flags);
+	}
+}
+
 /*
  * During early boot, any blocking grace-period wait automatically
- * implies a grace period.  Later on, this is never the case for PREEMPT.
+ * implies a grace period.  Later on, this is never the case for PREEMPTION.
  *
- * Howevr, because a context switch is a grace period for !PREEMPT, any
+ * Howevr, because a context switch is a grace period for !PREEMPTION, any
  * blocking grace-period wait automatically implies a grace period if
  * there is only one CPU online at any point time during execution of
  * either synchronize_rcu() or synchronize_rcu_expedited().  It is OK to
@@ -2896,7 +3099,7 @@ static void rcu_barrier_func(void *unused)
 	debug_rcu_head_queue(&rdp->barrier_head);
 	rcu_nocb_lock(rdp);
 	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
-	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
+	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
 		atomic_inc(&rcu_state.barrier_cpu_count);
 	} else {
 		debug_rcu_head_unqueue(&rdp->barrier_head);
@@ -3557,12 +3760,29 @@ static void __init rcu_dump_rcu_node_tree(void)
 struct workqueue_struct *rcu_gp_wq;
 struct workqueue_struct *rcu_par_gp_wq;
 
+static void __init kfree_rcu_batch_init(void)
+{
+	int cpu;
+	int i;
+
+	for_each_possible_cpu(cpu) {
+		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+		spin_lock_init(&krcp->lock);
+		for (i = 0; i < KFREE_N_BATCHES; i++)
+			krcp->krw_arr[i].krcp = krcp;
+		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+		krcp->initialized = true;
+	}
+}
+
 void __init rcu_init(void)
 {
 	int cpu;
 
 	rcu_early_boot_tests();
 
+	kfree_rcu_batch_init();
 	rcu_bootup_announce();
 	rcu_init_geometry();
 	rcu_init_one();
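Note (editorial sketch, not part of the patch): the kfree_rcu() hunks above change only how kfree_call_rcu() defers the free. Requests are now collected on a per-CPU kfree_rcu_cpu list, promoted to one of KFREE_N_BATCHES batches, and kfree'd in workqueue context after a grace period, with the drain scheduled at most every KFREE_DRAIN_JIFFIES. Callers are unaffected and keep using kfree_rcu() as before. The minimal caller-side sketch below uses hypothetical names (struct foo, update_foo(), read_foo()) to illustrate the usage pattern that feeds this batching path.

// Hypothetical caller-side sketch; only the standard RCU/slab APIs are used.
#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	int val;
	struct rcu_head rh;	/* kfree_rcu() needs an embedded rcu_head */
};

static struct foo __rcu *gp;		/* RCU-protected pointer (example) */
static DEFINE_SPINLOCK(gp_lock);	/* serializes updaters */

/* Updater: publish a new version, then hand the old one to kfree_rcu(). */
static int update_foo(int val)
{
	struct foo *new, *old;

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->val = val;

	spin_lock(&gp_lock);
	old = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
	rcu_assign_pointer(gp, new);
	spin_unlock(&gp_lock);

	if (old)
		kfree_rcu(old, rh);	/* ends up on the per-CPU krc batch above */
	return 0;
}

/* Reader: no locks, just an RCU read-side critical section. */
static int read_foo(void)
{
	struct foo *p;
	int val = -1;

	rcu_read_lock();
	p = rcu_dereference(gp);
	if (p)
		val = p->val;
	rcu_read_unlock();
	return val;
}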