Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r--	kernel/rcu/tree.c	169
1 file changed, 124 insertions(+), 45 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index da6f5213fb74..8e78b2430c16 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -156,6 +156,7 @@ static void invoke_rcu_core(void);
 static void rcu_report_exp_rdp(struct rcu_data *rdp);
 static void sync_sched_exp_online_cleanup(int cpu);
 static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
+static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
 
 /* rcuc/rcub kthread realtime priority */
 static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
@@ -648,7 +649,6 @@ static noinstr void rcu_eqs_enter(bool user)
 	instrumentation_begin();
 	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
-	rdp = this_cpu_ptr(&rcu_data);
 	rcu_prepare_for_idle();
 	rcu_preempt_deferred_qs(current);
 
@@ -1077,7 +1077,6 @@ noinstr void rcu_nmi_enter(void)
 	} else if (!in_nmi()) {
 		instrumentation_begin();
 		rcu_irq_enter_check_tick();
-		instrumentation_end();
 	} else  {
 		instrumentation_begin();
 	}
@@ -1672,7 +1671,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
 {
 	bool ret = false;
 	bool need_qs;
-	const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
+	const bool offloaded = rcu_rdp_is_offloaded(rdp);
 
 	raw_lockdep_assert_held_rcu_node(rnp);
 
@@ -2128,7 +2127,7 @@ static void rcu_gp_cleanup(void)
 		needgp = true;
 	}
 	/* Advance CBs to reduce false positives below. */
-	offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
+	offloaded = rcu_rdp_is_offloaded(rdp);
 	if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
 		WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
 		WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
@@ -2327,7 +2326,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
 	unsigned long flags;
 	unsigned long mask;
 	bool needwake = false;
-	const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
+	const bool offloaded = rcu_rdp_is_offloaded(rdp);
 	struct rcu_node *rnp;
 
 	WARN_ON_ONCE(rdp->cpu != smp_processor_id());
@@ -2414,7 +2413,7 @@ int rcutree_dying_cpu(unsigned int cpu)
 
 	blkd = !!(rnp->qsmask & rdp->grpmask);
 	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
-			       blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));
+			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
 	return 0;
 }
 
@@ -2497,7 +2496,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	int div;
 	bool __maybe_unused empty;
 	unsigned long flags;
-	const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
+	const bool offloaded = rcu_rdp_is_offloaded(rdp);
 	struct rcu_head *rhp;
 	struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
 	long bl, count = 0;
@@ -3066,7 +3065,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
 	trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
 
 	/* Go handle any RCU core processing required. */
-	if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
+	if (unlikely(rcu_rdp_is_offloaded(rdp))) {
 		__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
 	} else {
 		__call_rcu_core(rdp, head, flags);
@@ -3229,8 +3228,7 @@ krc_this_cpu_lock(unsigned long *flags)
 static inline void
 krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
 {
-	raw_spin_unlock(&krcp->lock);
-	local_irq_restore(flags);
+	raw_spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
 static inline struct kvfree_rcu_bulk_data *
@@ -3464,7 +3462,7 @@ static void fill_page_cache_func(struct work_struct *work)
 
 	for (i = 0; i < rcu_min_cached_objs; i++) {
 		bnode = (struct kvfree_rcu_bulk_data *)
-			__get_free_page(GFP_KERNEL | __GFP_NOWARN);
+			__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
 
 		if (bnode) {
 			raw_spin_lock_irqsave(&krcp->lock, flags);
@@ -3493,37 +3491,62 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
 	}
 }
 
+// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
+// state specified by flags.  If can_alloc is true, the caller must
+// be schedulable and not be holding any locks or mutexes that might be
+// acquired by the memory allocator or anything that it might invoke.
+// Returns true if ptr was successfully recorded, else the caller must
+// use a fallback.
 static inline bool
-kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
+add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
+	unsigned long *flags, void *ptr, bool can_alloc)
 {
 	struct kvfree_rcu_bulk_data *bnode;
 	int idx;
 
-	if (unlikely(!krcp->initialized))
+	*krcp = krc_this_cpu_lock(flags);
+	if (unlikely(!(*krcp)->initialized))
 		return false;
 
-	lockdep_assert_held(&krcp->lock);
 	idx = !!is_vmalloc_addr(ptr);
 
 	/* Check if a new block is required. */
-	if (!krcp->bkvhead[idx] ||
-			krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
-		bnode = get_cached_bnode(krcp);
-		/* Switch to emergency path. */
+	if (!(*krcp)->bkvhead[idx] ||
+			(*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
+		bnode = get_cached_bnode(*krcp);
+		if (!bnode && can_alloc) {
+			krc_this_cpu_unlock(*krcp, *flags);
+
+			// __GFP_NORETRY - allows a light-weight direct reclaim
+			// what is OK from minimizing of fallback hitting point of
+			// view. Apart of that it forbids any OOM invoking what is
+			// also beneficial since we are about to release memory soon.
+			//
+			// __GFP_NOMEMALLOC - prevents from consuming of all the
+			// memory reserves. Please note we have a fallback path.
+			//
+			// __GFP_NOWARN - it is supposed that an allocation can
+			// be failed under low memory or high memory pressure
+			// scenarios.
+			bnode = (struct kvfree_rcu_bulk_data *)
+				__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+			*krcp = krc_this_cpu_lock(flags);
+		}
+
 		if (!bnode)
 			return false;
 
 		/* Initialize the new block. */
 		bnode->nr_records = 0;
-		bnode->next = krcp->bkvhead[idx];
+		bnode->next = (*krcp)->bkvhead[idx];
 
 		/* Attach it to the head. */
-		krcp->bkvhead[idx] = bnode;
+		(*krcp)->bkvhead[idx] = bnode;
 	}
 
 	/* Finally insert. */
-	krcp->bkvhead[idx]->records
-		[krcp->bkvhead[idx]->nr_records++] = ptr;
+	(*krcp)->bkvhead[idx]->records
+		[(*krcp)->bkvhead[idx]->nr_records++] = ptr;
 
 	return true;
 }
@@ -3561,8 +3584,6 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 		ptr = (unsigned long *) func;
 	}
 
-	krcp = krc_this_cpu_lock(&flags);
-
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(ptr)) {
 		// Probable double kfree_rcu(), just leak.
@@ -3570,12 +3591,11 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 			  __func__, head);
 
 		// Mark as success and leave.
-		success = true;
-		goto unlock_return;
+		return;
 	}
 
 	kasan_record_aux_stack(ptr);
-	success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
+	success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
 	if (!success) {
 		run_page_cache_worker(krcp);
 
@@ -3774,8 +3794,8 @@ EXPORT_SYMBOL_GPL(synchronize_rcu);
  * get_state_synchronize_rcu - Snapshot current RCU state
  *
  * Returns a cookie that is used by a later call to cond_synchronize_rcu()
- * to determine whether or not a full grace period has elapsed in the
- * meantime.
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime.
  */
 unsigned long get_state_synchronize_rcu(void)
 {
@@ -3789,13 +3809,76 @@ unsigned long get_state_synchronize_rcu(void)
 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
 
 /**
+ * start_poll_synchronize_rcu - Snapshot and start RCU grace period
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime.  If the needed grace period
+ * is not already slated to start, notifies RCU core of the need for that
+ * grace period.
+ *
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+unsigned long start_poll_synchronize_rcu(void)
+{
+	unsigned long flags;
+	unsigned long gp_seq = get_state_synchronize_rcu();
+	bool needwake;
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+
+	lockdep_assert_irqs_enabled();
+	local_irq_save(flags);
+	rdp = this_cpu_ptr(&rcu_data);
+	rnp = rdp->mynode;
+	raw_spin_lock_rcu_node(rnp); // irqs already disabled.
+	needwake = rcu_start_this_gp(rnp, rdp, gp_seq);
+	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	if (needwake)
+		rcu_gp_kthread_wake();
+	return gp_seq;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
+
+/**
+ * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period
+ *
+ * @oldstate: return from call to get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ *
+ * If a full RCU grace period has elapsed since the earlier call from
+ * which oldstate was obtained, return @true, otherwise return @false.
+ * If @false is returned, it is the caller's responsibility to invoke this
+ * function later on until it does return @true.  Alternatively, the caller
+ * can explicitly wait for a grace period, for example, by passing @oldstate
+ * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless.  If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!).
+ * Those needing to keep oldstate values for very long time periods
+ * (many hours even on 32-bit systems) should check them occasionally
+ * and either refresh them or set a flag indicating that the grace period
+ * has completed.
+ */
+bool poll_state_synchronize_rcu(unsigned long oldstate)
+{
+	if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) {
+		smp_mb(); /* Ensure GP ends before subsequent accesses. */
+		return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
+
+/**
  * cond_synchronize_rcu - Conditionally wait for an RCU grace period
  *
  * @oldstate: return value from earlier call to get_state_synchronize_rcu()
 *
 * If a full RCU grace period has elapsed since the earlier call to
- * get_state_synchronize_rcu(), just return.  Otherwise, invoke
- * synchronize_rcu() to wait for a full grace period.
+ * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
+ * Otherwise, invoke synchronize_rcu() to wait for a full grace period.
 *
 * Yes, this function does not take counter wrap into account.  But
 * counter wrap is harmless.  If the counter wraps, we have waited for
@@ -3804,10 +3887,8 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
  */
 void cond_synchronize_rcu(unsigned long oldstate)
 {
-	if (!rcu_seq_done(&rcu_state.gp_seq, oldstate))
+	if (!poll_state_synchronize_rcu(oldstate))
 		synchronize_rcu();
-	else
-		smp_mb(); /* Ensure GP ends before subsequent accesses. */
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
 
@@ -3843,13 +3924,13 @@ static int rcu_pending(int user)
 		return 1;
 
 	/* Does this CPU have callbacks ready to invoke? */
-	if (!rcu_segcblist_is_offloaded(&rdp->cblist) &&
+	if (!rcu_rdp_is_offloaded(rdp) &&
 	    rcu_segcblist_ready_cbs(&rdp->cblist))
 		return 1;
 
 	/* Has RCU gone idle with this CPU needing another grace period? */
 	if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
-	    !rcu_segcblist_is_offloaded(&rdp->cblist) &&
+	    !rcu_rdp_is_offloaded(rdp) &&
 	    !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
 		return 1;
 
@@ -3968,7 +4049,7 @@ void rcu_barrier(void)
 	for_each_possible_cpu(cpu) {
 		rdp = per_cpu_ptr(&rcu_data, cpu);
 		if (cpu_is_offline(cpu) &&
-		    !rcu_segcblist_is_offloaded(&rdp->cblist))
+		    !rcu_rdp_is_offloaded(rdp))
 			continue;
 		if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) {
 			rcu_barrier_trace(TPS("OnlineQ"), cpu,
@@ -4083,15 +4164,13 @@ int rcutree_prepare_cpu(unsigned int cpu)
 	rdp->dynticks_nesting = 1;	/* CPU not up, no tearing. */
 	rcu_dynticks_eqs_online();
 	raw_spin_unlock_rcu_node(rnp);		/* irqs remain disabled. */
+
 	/*
-	 * Lock in case the CB/GP kthreads are still around handling
-	 * old callbacks (longer term we should flush all callbacks
-	 * before completing CPU offline)
+	 * Only non-NOCB CPUs that didn't have early-boot callbacks need to be
+	 * (re-)initialized.
 	 */
-	rcu_nocb_lock(rdp);
-	if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */
+	if (!rcu_segcblist_is_enabled(&rdp->cblist))
 		rcu_segcblist_init(&rdp->cblist);  /* Re-enable callbacks. */
-	rcu_nocb_unlock(rdp);
 
 	/*
 	 * Add CPU to leaf rcu_node pending-online bitmask.  Any needed
@@ -4291,7 +4370,7 @@ void rcutree_migrate_callbacks(int cpu)
 	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 	bool needwake;
 
-	if (rcu_segcblist_is_offloaded(&rdp->cblist) ||
+	if (rcu_rdp_is_offloaded(rdp) ||
 	    rcu_segcblist_empty(&rdp->cblist))
 		return;  /* No callbacks to migrate. */
 
@@ -4309,7 +4388,7 @@ void rcutree_migrate_callbacks(int cpu)
 	rcu_segcblist_disable(&rdp->cblist);
 	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
		     !rcu_segcblist_n_cbs(&my_rdp->cblist));
-	if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) {
+	if (rcu_rdp_is_offloaded(my_rdp)) {
 		raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
 		__call_rcu_nocb_wake(my_rdp, true, flags);
 	} else {
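
A note for readers of this patch: together with the existing get_state_synchronize_rcu() and cond_synchronize_rcu(), the newly added start_poll_synchronize_rcu() and poll_state_synchronize_rcu() form a polling grace-period interface. The sketch below is illustrative only and not part of the patch; struct lazy_obj, lazy_obj_retire() and lazy_obj_reclaim() are hypothetical names used to show how a caller might retire an object without blocking and free it once a full grace period has elapsed.

#include <linux/rcupdate.h>
#include <linux/slab.h>

/*
 * Hypothetical example object; only the *_synchronize_rcu() calls below
 * are the interfaces touched by this patch.
 */
struct lazy_obj {
	unsigned long gp_snap;	/* cookie from start_poll_synchronize_rcu() */
	void *data;
};

/*
 * Retire an object: snapshot the grace-period state and ask RCU to start
 * a grace period if one is not already underway, but do not wait for it.
 * Must be called with interrupts enabled.
 */
static void lazy_obj_retire(struct lazy_obj *obj)
{
	obj->gp_snap = start_poll_synchronize_rcu();
}

/*
 * Called later, for example from a periodic scan.  Free the payload only
 * if a full grace period has elapsed since lazy_obj_retire(); otherwise
 * either wait explicitly (if the caller may sleep) or try again later.
 */
static void lazy_obj_reclaim(struct lazy_obj *obj, bool can_sleep)
{
	if (poll_state_synchronize_rcu(obj->gp_snap)) {
		kfree(obj->data);	/* Grace period already elapsed. */
	} else if (can_sleep) {
		cond_synchronize_rcu(obj->gp_snap);	/* Blocks if needed. */
		kfree(obj->data);
	}
	/* else: leave obj queued and poll again on the next scan. */
}

A caller that must never start a grace period itself can instead take the snapshot with get_state_synchronize_rcu() and rely on grace periods driven elsewhere, which is the distinction the new start_poll_synchronize_rcu() kernel-doc above draws.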