Diffstat (limited to 'kernel/sched')
-rw-r--r--   kernel/sched/autogroup.c           5
-rw-r--r--   kernel/sched/core.c              167
-rw-r--r--   kernel/sched/cpufreq_schedutil.c  95
-rw-r--r--   kernel/sched/deadline.c          149
-rw-r--r--   kernel/sched/fair.c              140
-rw-r--r--   kernel/sched/membarrier.c        177
-rw-r--r--   kernel/sched/rt.c                 32
-rw-r--r--   kernel/sched/sched.h             114
-rw-r--r--   kernel/sched/stats.h               6
-rw-r--r--   kernel/sched/topology.c           13
10 files changed, 630 insertions, 268 deletions
| diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index a43df5193538..bb4b9fe026a1 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,13 +1,12 @@  // SPDX-License-Identifier: GPL-2.0 -#include "sched.h" -  #include <linux/proc_fs.h>  #include <linux/seq_file.h> -#include <linux/kallsyms.h>  #include <linux/utsname.h>  #include <linux/security.h>  #include <linux/export.h> +#include "sched.h" +  unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;  static struct autogroup autogroup_default;  static atomic_t autogroup_seq_nr; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a7bf32aabfda..e7c535eee0a6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -508,7 +508,8 @@ void resched_cpu(int cpu)  	unsigned long flags;  	raw_spin_lock_irqsave(&rq->lock, flags); -	resched_curr(rq); +	if (cpu_online(cpu) || cpu == smp_processor_id()) +		resched_curr(rq);  	raw_spin_unlock_irqrestore(&rq->lock, flags);  } @@ -1629,16 +1630,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)  #ifdef CONFIG_SMP  	if (cpu == rq->cpu) { -		schedstat_inc(rq->ttwu_local); -		schedstat_inc(p->se.statistics.nr_wakeups_local); +		__schedstat_inc(rq->ttwu_local); +		__schedstat_inc(p->se.statistics.nr_wakeups_local);  	} else {  		struct sched_domain *sd; -		schedstat_inc(p->se.statistics.nr_wakeups_remote); +		__schedstat_inc(p->se.statistics.nr_wakeups_remote);  		rcu_read_lock();  		for_each_domain(rq->cpu, sd) {  			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { -				schedstat_inc(sd->ttwu_wake_remote); +				__schedstat_inc(sd->ttwu_wake_remote);  				break;  			}  		} @@ -1646,14 +1647,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)  	}  	if (wake_flags & WF_MIGRATED) -		schedstat_inc(p->se.statistics.nr_wakeups_migrate); +		__schedstat_inc(p->se.statistics.nr_wakeups_migrate);  #endif /* CONFIG_SMP */ -	schedstat_inc(rq->ttwu_count); -	schedstat_inc(p->se.statistics.nr_wakeups); +	__schedstat_inc(rq->ttwu_count); +	__schedstat_inc(p->se.statistics.nr_wakeups);  	if (wake_flags & WF_SYNC) -		schedstat_inc(p->se.statistics.nr_wakeups_sync); +		__schedstat_inc(p->se.statistics.nr_wakeups_sync);  }  static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) @@ -2045,7 +2046,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	 * If the owning (remote) CPU is still in the middle of schedule() with  	 * this task as prev, wait until its done referencing the task.  	 * -	 * Pairs with the smp_store_release() in finish_lock_switch(). +	 * Pairs with the smp_store_release() in finish_task().  	 *  	 * This ensures that tasks getting woken will be fully ordered against  	 * their previous state and preserve Program Order. @@ -2460,6 +2461,7 @@ void wake_up_new_task(struct task_struct *p)  	 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,  	 * as we're not fully set-up yet.  	 */ +	p->recent_used_cpu = task_cpu(p);  	__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));  #endif  	rq = __task_rq_lock(p, &rf); @@ -2571,6 +2573,62 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,  #endif /* CONFIG_PREEMPT_NOTIFIERS */ +static inline void prepare_task(struct task_struct *next) +{ +#ifdef CONFIG_SMP +	/* +	 * Claim the task as running, we do this before switching to it +	 * such that any running task will have this set. 
+	 */ +	next->on_cpu = 1; +#endif +} + +static inline void finish_task(struct task_struct *prev) +{ +#ifdef CONFIG_SMP +	/* +	 * After ->on_cpu is cleared, the task can be moved to a different CPU. +	 * We must ensure this doesn't happen until the switch is completely +	 * finished. +	 * +	 * In particular, the load of prev->state in finish_task_switch() must +	 * happen before this. +	 * +	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). +	 */ +	smp_store_release(&prev->on_cpu, 0); +#endif +} + +static inline void +prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) +{ +	/* +	 * Since the runqueue lock will be released by the next +	 * task (which is an invalid locking op but in the case +	 * of the scheduler it's an obvious special-case), so we +	 * do an early lockdep release here: +	 */ +	rq_unpin_lock(rq, rf); +	spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +#ifdef CONFIG_DEBUG_SPINLOCK +	/* this is a valid case when another task releases the spinlock */ +	rq->lock.owner = next; +#endif +} + +static inline void finish_lock_switch(struct rq *rq) +{ +	/* +	 * If we are tracking spinlock dependencies then we have to +	 * fix up the runqueue lock - which gets 'carried over' from +	 * prev into current: +	 */ +	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); +	raw_spin_unlock_irq(&rq->lock); +} +  /**   * prepare_task_switch - prepare to switch tasks   * @rq: the runqueue preparing to switch @@ -2591,7 +2649,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,  	sched_info_switch(rq, prev, next);  	perf_event_task_sched_out(prev, next);  	fire_sched_out_preempt_notifiers(prev, next); -	prepare_lock_switch(rq, next); +	prepare_task(next);  	prepare_arch_switch(next);  } @@ -2646,29 +2704,34 @@ static struct rq *finish_task_switch(struct task_struct *prev)  	 * the scheduled task must drop that reference.  	 *  	 * We must observe prev->state before clearing prev->on_cpu (in -	 * finish_lock_switch), otherwise a concurrent wakeup can get prev +	 * finish_task), otherwise a concurrent wakeup can get prev  	 * running on another CPU and we could rave with its RUNNING -> DEAD  	 * transition, resulting in a double drop.  	 */  	prev_state = prev->state;  	vtime_task_switch(prev);  	perf_event_task_sched_in(prev, current); -	/* -	 * The membarrier system call requires a full memory barrier -	 * after storing to rq->curr, before going back to user-space. -	 * -	 * TODO: This smp_mb__after_unlock_lock can go away if PPC end -	 * up adding a full barrier to switch_mm(), or we should figure -	 * out if a smp_mb__after_unlock_lock is really the proper API -	 * to use. -	 */ -	smp_mb__after_unlock_lock(); -	finish_lock_switch(rq, prev); +	finish_task(prev); +	finish_lock_switch(rq);  	finish_arch_post_lock_switch();  	fire_sched_in_preempt_notifiers(current); -	if (mm) +	/* +	 * When switching through a kernel thread, the loop in +	 * membarrier_{private,global}_expedited() may have observed that +	 * kernel thread and not issued an IPI. It is therefore possible to +	 * schedule between user->kernel->user threads without passing though +	 * switch_mm(). Membarrier requires a barrier after storing to +	 * rq->curr, before returning to userspace, so provide them here: +	 * +	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly +	 *   provided by mmdrop(), +	 * - a sync_core for SYNC_CORE. 
+	 */ +	if (mm) { +		membarrier_mm_sync_core_before_usermode(mm);  		mmdrop(mm); +	}  	if (unlikely(prev_state == TASK_DEAD)) {  		if (prev->sched_class->task_dead)  			prev->sched_class->task_dead(prev); @@ -2772,6 +2835,13 @@ context_switch(struct rq *rq, struct task_struct *prev,  	 */  	arch_start_context_switch(prev); +	/* +	 * If mm is non-NULL, we pass through switch_mm(). If mm is +	 * NULL, we will pass through mmdrop() in finish_task_switch(). +	 * Both of these contain the full memory barrier required by +	 * membarrier after storing to rq->curr, before returning to +	 * user-space. +	 */  	if (!mm) {  		next->active_mm = oldmm;  		mmgrab(oldmm); @@ -2786,14 +2856,7 @@ context_switch(struct rq *rq, struct task_struct *prev,  	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); -	/* -	 * Since the runqueue lock will be released by the next -	 * task (which is an invalid locking op but in the case -	 * of the scheduler it's an obvious special-case), so we -	 * do an early lockdep release here: -	 */ -	rq_unpin_lock(rq, rf); -	spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +	prepare_lock_switch(rq, next, rf);  	/* Here we just switch the register state and the stack. */  	switch_to(prev, next, prev); @@ -3308,6 +3371,9 @@ static void __sched notrace __schedule(bool preempt)  	 * Make sure that signal_pending_state()->signal_pending() below  	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)  	 * done by the caller to avoid the race with signal_wake_up(). +	 * +	 * The membarrier system call requires a full memory barrier +	 * after coming from user-space, before storing to rq->curr.  	 */  	rq_lock(rq, &rf);  	smp_mb__after_spinlock(); @@ -3355,17 +3421,16 @@ static void __sched notrace __schedule(bool preempt)  		/*  		 * The membarrier system call requires each architecture  		 * to have a full memory barrier after updating -		 * rq->curr, before returning to user-space. For TSO -		 * (e.g. x86), the architecture must provide its own -		 * barrier in switch_mm(). For weakly ordered machines -		 * for which spin_unlock() acts as a full memory -		 * barrier, finish_lock_switch() in common code takes -		 * care of this barrier. For weakly ordered machines for -		 * which spin_unlock() acts as a RELEASE barrier (only -		 * arm64 and PowerPC), arm64 has a full barrier in -		 * switch_to(), and PowerPC has -		 * smp_mb__after_unlock_lock() before -		 * finish_lock_switch(). +		 * rq->curr, before returning to user-space. +		 * +		 * Here are the schemes providing that barrier on the +		 * various architectures: +		 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. +		 *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
+		 * - finish_lock_switch() for weakly-ordered +		 *   architectures where spin_unlock is a full barrier, +		 * - switch_to() for arm64 (weakly-ordered, spin_unlock +		 *   is a RELEASE barrier),  		 */  		++*switch_count; @@ -4040,8 +4105,7 @@ recheck:  			return -EINVAL;  	} -	if (attr->sched_flags & -		~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM)) +	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))  		return -EINVAL;  	/* @@ -4108,6 +4172,9 @@ recheck:  	}  	if (user) { +		if (attr->sched_flags & SCHED_FLAG_SUGOV) +			return -EINVAL; +  		retval = security_task_setscheduler(p);  		if (retval)  			return retval; @@ -4163,7 +4230,8 @@ change:  		}  #endif  #ifdef CONFIG_SMP -		if (dl_bandwidth_enabled() && dl_policy(policy)) { +		if (dl_bandwidth_enabled() && dl_policy(policy) && +				!(attr->sched_flags & SCHED_FLAG_SUGOV)) {  			cpumask_t *span = rq->rd->span;  			/* @@ -4293,6 +4361,11 @@ int sched_setattr(struct task_struct *p, const struct sched_attr *attr)  }  EXPORT_SYMBOL_GPL(sched_setattr); +int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) +{ +	return __sched_setscheduler(p, attr, false, true); +} +  /**   * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.   * @p: the task in question. @@ -4799,7 +4872,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,  	ret = sched_getaffinity(pid, mask);  	if (ret == 0) { -		size_t retlen = min_t(size_t, len, cpumask_size()); +		unsigned int retlen = min(len, cpumask_size());  		if (copy_to_user(user_mask_ptr, mask, retlen))  			ret = -EFAULT; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d6717a3331a1..7936f548e071 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -19,8 +19,6 @@  #include "sched.h" -#define SUGOV_KTHREAD_PRIORITY	50 -  struct sugov_tunables {  	struct gov_attr_set attr_set;  	unsigned int rate_limit_us; @@ -60,7 +58,8 @@ struct sugov_cpu {  	u64 last_update;  	/* The fields below are only needed when sharing a policy. */ -	unsigned long util; +	unsigned long util_cfs; +	unsigned long util_dl;  	unsigned long max;  	unsigned int flags; @@ -176,21 +175,28 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,  	return cpufreq_driver_resolve_freq(policy, freq);  } -static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu) +static void sugov_get_util(struct sugov_cpu *sg_cpu)  { -	struct rq *rq = cpu_rq(cpu); -	unsigned long cfs_max; +	struct rq *rq = cpu_rq(sg_cpu->cpu); -	cfs_max = arch_scale_cpu_capacity(NULL, cpu); +	sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); +	sg_cpu->util_cfs = cpu_util_cfs(rq); +	sg_cpu->util_dl  = cpu_util_dl(rq); +} -	*util = min(rq->cfs.avg.util_avg, cfs_max); -	*max = cfs_max; +static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) +{ +	/* +	 * Ideally we would like to set util_dl as min/guaranteed freq and +	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet +	 * ready for such an interface. So, we only do the latter for now. 
+	 */ +	return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max);  } -static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, -				   unsigned int flags) +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time)  { -	if (flags & SCHED_CPUFREQ_IOWAIT) { +	if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) {  		if (sg_cpu->iowait_boost_pending)  			return; @@ -264,7 +270,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,  	unsigned int next_f;  	bool busy; -	sugov_set_iowait_boost(sg_cpu, time, flags); +	sugov_set_iowait_boost(sg_cpu, time);  	sg_cpu->last_update = time;  	if (!sugov_should_update_freq(sg_policy, time)) @@ -272,10 +278,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,  	busy = sugov_cpu_is_busy(sg_cpu); -	if (flags & SCHED_CPUFREQ_RT_DL) { +	if (flags & SCHED_CPUFREQ_RT) {  		next_f = policy->cpuinfo.max_freq;  	} else { -		sugov_get_util(&util, &max, sg_cpu->cpu); +		sugov_get_util(sg_cpu); +		max = sg_cpu->max; +		util = sugov_aggregate_util(sg_cpu);  		sugov_iowait_boost(sg_cpu, &util, &max);  		next_f = get_next_freq(sg_policy, util, max);  		/* @@ -305,23 +313,27 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)  		s64 delta_ns;  		/* -		 * If the CPU utilization was last updated before the previous -		 * frequency update and the time elapsed between the last update -		 * of the CPU utilization and the last frequency update is long -		 * enough, don't take the CPU into account as it probably is -		 * idle now (and clear iowait_boost for it). +		 * If the CFS CPU utilization was last updated before the +		 * previous frequency update and the time elapsed between the +		 * last update of the CPU utilization and the last frequency +		 * update is long enough, reset iowait_boost and util_cfs, as +		 * they are now probably stale. However, still consider the +		 * CPU contribution if it has some DEADLINE utilization +		 * (util_dl).  		 
*/  		delta_ns = time - j_sg_cpu->last_update;  		if (delta_ns > TICK_NSEC) {  			j_sg_cpu->iowait_boost = 0;  			j_sg_cpu->iowait_boost_pending = false; -			continue; +			j_sg_cpu->util_cfs = 0; +			if (j_sg_cpu->util_dl == 0) +				continue;  		} -		if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) +		if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)  			return policy->cpuinfo.max_freq; -		j_util = j_sg_cpu->util;  		j_max = j_sg_cpu->max; +		j_util = sugov_aggregate_util(j_sg_cpu);  		if (j_util * max > j_max * util) {  			util = j_util;  			max = j_max; @@ -338,22 +350,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,  {  	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);  	struct sugov_policy *sg_policy = sg_cpu->sg_policy; -	unsigned long util, max;  	unsigned int next_f; -	sugov_get_util(&util, &max, sg_cpu->cpu); -  	raw_spin_lock(&sg_policy->update_lock); -	sg_cpu->util = util; -	sg_cpu->max = max; +	sugov_get_util(sg_cpu);  	sg_cpu->flags = flags; -	sugov_set_iowait_boost(sg_cpu, time, flags); +	sugov_set_iowait_boost(sg_cpu, time);  	sg_cpu->last_update = time;  	if (sugov_should_update_freq(sg_policy, time)) { -		if (flags & SCHED_CPUFREQ_RT_DL) +		if (flags & SCHED_CPUFREQ_RT)  			next_f = sg_policy->policy->cpuinfo.max_freq;  		else  			next_f = sugov_next_freq_shared(sg_cpu, time); @@ -383,9 +391,9 @@ static void sugov_irq_work(struct irq_work *irq_work)  	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);  	/* -	 * For RT and deadline tasks, the schedutil governor shoots the -	 * frequency to maximum. Special care must be taken to ensure that this -	 * kthread doesn't result in the same behavior. +	 * For RT tasks, the schedutil governor shoots the frequency to maximum. +	 * Special care must be taken to ensure that this kthread doesn't result +	 * in the same behavior.  	 *  	 * This is (mostly) guaranteed by the work_in_progress flag. The flag is  	 * updated only at the end of the sugov_work() function and before that @@ -470,7 +478,20 @@ static void sugov_policy_free(struct sugov_policy *sg_policy)  static int sugov_kthread_create(struct sugov_policy *sg_policy)  {  	struct task_struct *thread; -	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; +	struct sched_attr attr = { +		.size = sizeof(struct sched_attr), +		.sched_policy = SCHED_DEADLINE, +		.sched_flags = SCHED_FLAG_SUGOV, +		.sched_nice = 0, +		.sched_priority = 0, +		/* +		 * Fake (unused) bandwidth; workaround to "fix" +		 * priority inheritance. 
+		 */ +		.sched_runtime	=  1000000, +		.sched_deadline = 10000000, +		.sched_period	= 10000000, +	};  	struct cpufreq_policy *policy = sg_policy->policy;  	int ret; @@ -488,10 +509,10 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)  		return PTR_ERR(thread);  	} -	ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); +	ret = sched_setattr_nocheck(thread, &attr);  	if (ret) {  		kthread_stop(thread); -		pr_warn("%s: failed to set SCHED_FIFO\n", __func__); +		pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);  		return ret;  	} @@ -655,7 +676,7 @@ static int sugov_start(struct cpufreq_policy *policy)  		memset(sg_cpu, 0, sizeof(*sg_cpu));  		sg_cpu->cpu = cpu;  		sg_cpu->sg_policy = sg_policy; -		sg_cpu->flags = SCHED_CPUFREQ_RT; +		sg_cpu->flags = 0;  		sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;  	} diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2473736c7616..9df09782025c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -78,7 +78,7 @@ static inline int dl_bw_cpus(int i)  #endif  static inline -void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)  {  	u64 old = dl_rq->running_bw; @@ -86,10 +86,12 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)  	dl_rq->running_bw += dl_bw;  	SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */  	SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); +	/* kick cpufreq (see the comment in kernel/sched/sched.h). */ +	cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);  }  static inline -void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)  {  	u64 old = dl_rq->running_bw; @@ -98,10 +100,12 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)  	SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */  	if (dl_rq->running_bw > old)  		dl_rq->running_bw = 0; +	/* kick cpufreq (see the comment in kernel/sched/sched.h). 
*/ +	cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);  }  static inline -void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)  {  	u64 old = dl_rq->this_bw; @@ -111,7 +115,7 @@ void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)  }  static inline -void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)  {  	u64 old = dl_rq->this_bw; @@ -123,16 +127,46 @@ void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)  	SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);  } +static inline +void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +	if (!dl_entity_is_special(dl_se)) +		__add_rq_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +	if (!dl_entity_is_special(dl_se)) +		__sub_rq_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +	if (!dl_entity_is_special(dl_se)) +		__add_running_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +	if (!dl_entity_is_special(dl_se)) +		__sub_running_bw(dl_se->dl_bw, dl_rq); +} +  void dl_change_utilization(struct task_struct *p, u64 new_bw)  {  	struct rq *rq; +	BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); +  	if (task_on_rq_queued(p))  		return;  	rq = task_rq(p);  	if (p->dl.dl_non_contending) { -		sub_running_bw(p->dl.dl_bw, &rq->dl); +		sub_running_bw(&p->dl, &rq->dl);  		p->dl.dl_non_contending = 0;  		/*  		 * If the timer handler is currently running and the @@ -144,8 +178,8 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw)  		if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)  			put_task_struct(p);  	} -	sub_rq_bw(p->dl.dl_bw, &rq->dl); -	add_rq_bw(new_bw, &rq->dl); +	__sub_rq_bw(p->dl.dl_bw, &rq->dl); +	__add_rq_bw(new_bw, &rq->dl);  }  /* @@ -217,6 +251,9 @@ static void task_non_contending(struct task_struct *p)  	if (dl_se->dl_runtime == 0)  		return; +	if (dl_entity_is_special(dl_se)) +		return; +  	WARN_ON(hrtimer_active(&dl_se->inactive_timer));  	WARN_ON(dl_se->dl_non_contending); @@ -236,12 +273,12 @@ static void task_non_contending(struct task_struct *p)  	 */  	if (zerolag_time < 0) {  		if (dl_task(p)) -			sub_running_bw(dl_se->dl_bw, dl_rq); +			sub_running_bw(dl_se, dl_rq);  		if (!dl_task(p) || p->state == TASK_DEAD) {  			struct dl_bw *dl_b = dl_bw_of(task_cpu(p));  			if (p->state == TASK_DEAD) -				sub_rq_bw(p->dl.dl_bw, &rq->dl); +				sub_rq_bw(&p->dl, &rq->dl);  			raw_spin_lock(&dl_b->lock);  			__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));  			__dl_clear_params(p); @@ -268,7 +305,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)  		return;  	if (flags & ENQUEUE_MIGRATED) -		add_rq_bw(dl_se->dl_bw, dl_rq); +		add_rq_bw(dl_se, dl_rq);  	if (dl_se->dl_non_contending) {  		dl_se->dl_non_contending = 0; @@ -289,7 +326,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)  		 * when the "inactive timer" fired).  		 * So, add it back.  		 
*/ -		add_running_bw(dl_se->dl_bw, dl_rq); +		add_running_bw(dl_se, dl_rq);  	}  } @@ -1114,7 +1151,9 @@ static void update_curr_dl(struct rq *rq)  {  	struct task_struct *curr = rq->curr;  	struct sched_dl_entity *dl_se = &curr->dl; -	u64 delta_exec; +	u64 delta_exec, scaled_delta_exec; +	int cpu = cpu_of(rq); +	u64 now;  	if (!dl_task(curr) || !on_dl_rq(dl_se))  		return; @@ -1127,34 +1166,58 @@ static void update_curr_dl(struct rq *rq)  	 * natural solution, but the full ramifications of this  	 * approach need further study.  	 */ -	delta_exec = rq_clock_task(rq) - curr->se.exec_start; +	now = rq_clock_task(rq); +	delta_exec = now - curr->se.exec_start;  	if (unlikely((s64)delta_exec <= 0)) {  		if (unlikely(dl_se->dl_yielded))  			goto throttle;  		return;  	} -	/* kick cpufreq (see the comment in kernel/sched/sched.h). */ -	cpufreq_update_util(rq, SCHED_CPUFREQ_DL); -  	schedstat_set(curr->se.statistics.exec_max,  		      max(curr->se.statistics.exec_max, delta_exec));  	curr->se.sum_exec_runtime += delta_exec;  	account_group_exec_runtime(curr, delta_exec); -	curr->se.exec_start = rq_clock_task(rq); +	curr->se.exec_start = now;  	cgroup_account_cputime(curr, delta_exec);  	sched_rt_avg_update(rq, delta_exec); -	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) -		delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); -	dl_se->runtime -= delta_exec; +	if (dl_entity_is_special(dl_se)) +		return; + +	/* +	 * For tasks that participate in GRUB, we implement GRUB-PA: the +	 * spare reclaimed bandwidth is used to clock down frequency. +	 * +	 * For the others, we still need to scale reservation parameters +	 * according to current frequency and CPU maximum capacity. +	 */ +	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) { +		scaled_delta_exec = grub_reclaim(delta_exec, +						 rq, +						 &curr->dl); +	} else { +		unsigned long scale_freq = arch_scale_freq_capacity(cpu); +		unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + +		scaled_delta_exec = cap_scale(delta_exec, scale_freq); +		scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); +	} + +	dl_se->runtime -= scaled_delta_exec;  throttle:  	if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {  		dl_se->dl_throttled = 1; + +		/* If requested, inform the user about runtime overruns. 
*/ +		if (dl_runtime_exceeded(dl_se) && +		    (dl_se->flags & SCHED_FLAG_DL_OVERRUN)) +			dl_se->dl_overrun = 1; +  		__dequeue_task_dl(rq, curr, 0);  		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))  			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); @@ -1204,8 +1267,8 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)  		struct dl_bw *dl_b = dl_bw_of(task_cpu(p));  		if (p->state == TASK_DEAD && dl_se->dl_non_contending) { -			sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); -			sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); +			sub_running_bw(&p->dl, dl_rq_of_se(&p->dl)); +			sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));  			dl_se->dl_non_contending = 0;  		} @@ -1222,7 +1285,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)  	sched_clock_tick();  	update_rq_clock(rq); -	sub_running_bw(dl_se->dl_bw, &rq->dl); +	sub_running_bw(dl_se, &rq->dl);  	dl_se->dl_non_contending = 0;  unlock:  	task_rq_unlock(rq, p, &rf); @@ -1416,8 +1479,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)  		dl_check_constrained_dl(&p->dl);  	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { -		add_rq_bw(p->dl.dl_bw, &rq->dl); -		add_running_bw(p->dl.dl_bw, &rq->dl); +		add_rq_bw(&p->dl, &rq->dl); +		add_running_bw(&p->dl, &rq->dl);  	}  	/* @@ -1457,8 +1520,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)  	__dequeue_task_dl(rq, p, flags);  	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { -		sub_running_bw(p->dl.dl_bw, &rq->dl); -		sub_rq_bw(p->dl.dl_bw, &rq->dl); +		sub_running_bw(&p->dl, &rq->dl); +		sub_rq_bw(&p->dl, &rq->dl);  	}  	/* @@ -1564,7 +1627,7 @@ static void migrate_task_rq_dl(struct task_struct *p)  	 */  	raw_spin_lock(&rq->lock);  	if (p->dl.dl_non_contending) { -		sub_running_bw(p->dl.dl_bw, &rq->dl); +		sub_running_bw(&p->dl, &rq->dl);  		p->dl.dl_non_contending = 0;  		/*  		 * If the timer handler is currently running and the @@ -1576,7 +1639,7 @@ static void migrate_task_rq_dl(struct task_struct *p)  		if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)  			put_task_struct(p);  	} -	sub_rq_bw(p->dl.dl_bw, &rq->dl); +	sub_rq_bw(&p->dl, &rq->dl);  	raw_spin_unlock(&rq->lock);  } @@ -2019,11 +2082,11 @@ retry:  	}  	deactivate_task(rq, next_task, 0); -	sub_running_bw(next_task->dl.dl_bw, &rq->dl); -	sub_rq_bw(next_task->dl.dl_bw, &rq->dl); +	sub_running_bw(&next_task->dl, &rq->dl); +	sub_rq_bw(&next_task->dl, &rq->dl);  	set_task_cpu(next_task, later_rq->cpu); -	add_rq_bw(next_task->dl.dl_bw, &later_rq->dl); -	add_running_bw(next_task->dl.dl_bw, &later_rq->dl); +	add_rq_bw(&next_task->dl, &later_rq->dl); +	add_running_bw(&next_task->dl, &later_rq->dl);  	activate_task(later_rq, next_task, 0);  	ret = 1; @@ -2111,11 +2174,11 @@ static void pull_dl_task(struct rq *this_rq)  			resched = true;  			deactivate_task(src_rq, p, 0); -			sub_running_bw(p->dl.dl_bw, &src_rq->dl); -			sub_rq_bw(p->dl.dl_bw, &src_rq->dl); +			sub_running_bw(&p->dl, &src_rq->dl); +			sub_rq_bw(&p->dl, &src_rq->dl);  			set_task_cpu(p, this_cpu); -			add_rq_bw(p->dl.dl_bw, &this_rq->dl); -			add_running_bw(p->dl.dl_bw, &this_rq->dl); +			add_rq_bw(&p->dl, &this_rq->dl); +			add_running_bw(&p->dl, &this_rq->dl);  			activate_task(this_rq, p, 0);  			dmin = p->dl.deadline; @@ -2224,7 +2287,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)  		task_non_contending(p);  	if (!task_on_rq_queued(p)) -		sub_rq_bw(p->dl.dl_bw, &rq->dl); +		sub_rq_bw(&p->dl, 
&rq->dl);  	/*  	 * We cannot use inactive_task_timer() to invoke sub_running_bw() @@ -2256,7 +2319,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)  	/* If p is not queued we will update its parameters at next wakeup. */  	if (!task_on_rq_queued(p)) { -		add_rq_bw(p->dl.dl_bw, &rq->dl); +		add_rq_bw(&p->dl, &rq->dl);  		return;  	} @@ -2435,6 +2498,9 @@ int sched_dl_overflow(struct task_struct *p, int policy,  	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;  	int cpus, err = -1; +	if (attr->sched_flags & SCHED_FLAG_SUGOV) +		return 0; +  	/* !deadline task may carry old deadline bandwidth */  	if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))  		return 0; @@ -2521,6 +2587,10 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)   */  bool __checkparam_dl(const struct sched_attr *attr)  { +	/* special dl tasks don't actually use any parameter */ +	if (attr->sched_flags & SCHED_FLAG_SUGOV) +		return true; +  	/* deadline != 0 */  	if (attr->sched_deadline == 0)  		return false; @@ -2566,6 +2636,7 @@ void __dl_clear_params(struct task_struct *p)  	dl_se->dl_throttled = 0;  	dl_se->dl_yielded = 0;  	dl_se->dl_non_contending = 0; +	dl_se->dl_overrun = 0;  }  bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 26a71ebcd3c2..5eb3ffc9be84 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -871,7 +871,7 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)  	    likely(wait_start > prev_wait_start))  		wait_start -= prev_wait_start; -	schedstat_set(se->statistics.wait_start, wait_start); +	__schedstat_set(se->statistics.wait_start, wait_start);  }  static inline void @@ -893,17 +893,17 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)  			 * time stamp can be adjusted to accumulate wait time  			 * prior to migration.  			 
*/ -			schedstat_set(se->statistics.wait_start, delta); +			__schedstat_set(se->statistics.wait_start, delta);  			return;  		}  		trace_sched_stat_wait(p, delta);  	} -	schedstat_set(se->statistics.wait_max, +	__schedstat_set(se->statistics.wait_max,  		      max(schedstat_val(se->statistics.wait_max), delta)); -	schedstat_inc(se->statistics.wait_count); -	schedstat_add(se->statistics.wait_sum, delta); -	schedstat_set(se->statistics.wait_start, 0); +	__schedstat_inc(se->statistics.wait_count); +	__schedstat_add(se->statistics.wait_sum, delta); +	__schedstat_set(se->statistics.wait_start, 0);  }  static inline void @@ -928,10 +928,10 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  			delta = 0;  		if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) -			schedstat_set(se->statistics.sleep_max, delta); +			__schedstat_set(se->statistics.sleep_max, delta); -		schedstat_set(se->statistics.sleep_start, 0); -		schedstat_add(se->statistics.sum_sleep_runtime, delta); +		__schedstat_set(se->statistics.sleep_start, 0); +		__schedstat_add(se->statistics.sum_sleep_runtime, delta);  		if (tsk) {  			account_scheduler_latency(tsk, delta >> 10, 1); @@ -945,15 +945,15 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  			delta = 0;  		if (unlikely(delta > schedstat_val(se->statistics.block_max))) -			schedstat_set(se->statistics.block_max, delta); +			__schedstat_set(se->statistics.block_max, delta); -		schedstat_set(se->statistics.block_start, 0); -		schedstat_add(se->statistics.sum_sleep_runtime, delta); +		__schedstat_set(se->statistics.block_start, 0); +		__schedstat_add(se->statistics.sum_sleep_runtime, delta);  		if (tsk) {  			if (tsk->in_iowait) { -				schedstat_add(se->statistics.iowait_sum, delta); -				schedstat_inc(se->statistics.iowait_count); +				__schedstat_add(se->statistics.iowait_sum, delta); +				__schedstat_inc(se->statistics.iowait_count);  				trace_sched_stat_iowait(tsk, delta);  			} @@ -1012,10 +1012,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  		struct task_struct *tsk = task_of(se);  		if (tsk->state & TASK_INTERRUPTIBLE) -			schedstat_set(se->statistics.sleep_start, +			__schedstat_set(se->statistics.sleep_start,  				      rq_clock(rq_of(cfs_rq)));  		if (tsk->state & TASK_UNINTERRUPTIBLE) -			schedstat_set(se->statistics.block_start, +			__schedstat_set(se->statistics.block_start,  				      rq_clock(rq_of(cfs_rq)));  	}  } @@ -3020,9 +3020,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)  		/*  		 * There are a few boundary cases this might miss but it should  		 * get called often enough that that should (hopefully) not be -		 * a real problem -- added to that it only calls on the local -		 * CPU, so if we enqueue remotely we'll miss an update, but -		 * the next tick/schedule should update. +		 * a real problem.  		 
*  		 * It will not get called when we go idle, because the idle  		 * thread is a different class (!fair), nor will the utilization @@ -3091,8 +3089,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)  	return c1 + c2 + c3;  } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) -  /*   * Accumulate the three separate parts of the sum; d1 the remainder   * of the last (incomplete) period, d2 the span of full periods and d3 @@ -3122,7 +3118,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,  	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */  	u64 periods; -	scale_freq = arch_scale_freq_capacity(NULL, cpu); +	scale_freq = arch_scale_freq_capacity(cpu);  	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);  	delta += sa->period_contrib; @@ -5689,28 +5685,38 @@ static int wake_wide(struct task_struct *p)   * soonest. For the purpose of speed we only consider the waking and previous   * CPU.   * - * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or - *			will be) idle. + * wake_affine_idle() - only considers 'now', it check if the waking CPU is + *			cache-affine and is (or	will be) idle.   *   * wake_affine_weight() - considers the weight to reflect the average   *			  scheduling latency of the CPUs. This seems to work   *			  for the overloaded case.   */ - -static bool -wake_affine_idle(struct sched_domain *sd, struct task_struct *p, -		 int this_cpu, int prev_cpu, int sync) +static int +wake_affine_idle(int this_cpu, int prev_cpu, int sync)  { -	if (idle_cpu(this_cpu)) -		return true; +	/* +	 * If this_cpu is idle, it implies the wakeup is from interrupt +	 * context. Only allow the move if cache is shared. Otherwise an +	 * interrupt intensive workload could force all tasks onto one +	 * node depending on the IO topology or IRQ affinity settings. +	 * +	 * If the prev_cpu is idle and cache affine then avoid a migration. +	 * There is no guarantee that the cache hot data from an interrupt +	 * is more important than cache hot data on the prev_cpu and from +	 * a cpufreq perspective, it's better to have higher utilisation +	 * on one CPU. +	 */ +	if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) +		return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;  	if (sync && cpu_rq(this_cpu)->nr_running == 1) -		return true; +		return this_cpu; -	return false; +	return nr_cpumask_bits;  } -static bool +static int  wake_affine_weight(struct sched_domain *sd, struct task_struct *p,  		   int this_cpu, int prev_cpu, int sync)  { @@ -5724,7 +5730,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,  		unsigned long current_load = task_h_load(current);  		if (current_load > this_eff_load) -			return true; +			return this_cpu;  		this_eff_load -= current_load;  	} @@ -5741,36 +5747,36 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,  		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;  	prev_eff_load *= capacity_of(this_cpu); -	return this_eff_load <= prev_eff_load; +	return this_eff_load <= prev_eff_load ? 
this_cpu : nr_cpumask_bits;  }  static int wake_affine(struct sched_domain *sd, struct task_struct *p,  		       int prev_cpu, int sync)  {  	int this_cpu = smp_processor_id(); -	bool affine = false; +	int target = nr_cpumask_bits; -	if (sched_feat(WA_IDLE) && !affine) -		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); +	if (sched_feat(WA_IDLE)) +		target = wake_affine_idle(this_cpu, prev_cpu, sync); -	if (sched_feat(WA_WEIGHT) && !affine) -		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); +	if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits) +		target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);  	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); -	if (affine) { -		schedstat_inc(sd->ttwu_move_affine); -		schedstat_inc(p->se.statistics.nr_wakeups_affine); -	} +	if (target == nr_cpumask_bits) +		return prev_cpu; -	return affine; +	schedstat_inc(sd->ttwu_move_affine); +	schedstat_inc(p->se.statistics.nr_wakeups_affine); +	return target;  } -static inline int task_util(struct task_struct *p); -static int cpu_util_wake(int cpu, struct task_struct *p); +static inline unsigned long task_util(struct task_struct *p); +static unsigned long cpu_util_wake(int cpu, struct task_struct *p);  static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)  { -	return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); +	return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);  }  /* @@ -5950,7 +5956,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this  			}  		} else if (shallowest_idle_cpu == -1) {  			load = weighted_cpuload(cpu_rq(i)); -			if (load < min_load || (load == min_load && i == this_cpu)) { +			if (load < min_load) {  				min_load = load;  				least_loaded_cpu = i;  			} @@ -6191,7 +6197,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t  static int select_idle_sibling(struct task_struct *p, int prev, int target)  {  	struct sched_domain *sd; -	int i; +	int i, recent_used_cpu;  	if (idle_cpu(target))  		return target; @@ -6202,6 +6208,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)  	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))  		return prev; +	/* Check a recently used CPU as a potential idle candidate */ +	recent_used_cpu = p->recent_used_cpu; +	if (recent_used_cpu != prev && +	    recent_used_cpu != target && +	    cpus_share_cache(recent_used_cpu, target) && +	    idle_cpu(recent_used_cpu) && +	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { +		/* +		 * Replace recent_used_cpu with prev as it is a potential +		 * candidate for the next wake. +		 */ +		p->recent_used_cpu = prev; +		return recent_used_cpu; +	} +  	sd = rcu_dereference(per_cpu(sd_llc, target));  	if (!sd)  		return target; @@ -6247,7 +6268,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)   * capacity_orig) as it useful for predicting the capacity required after task   * migrations (scheduler-driven DVFS).   */ -static int cpu_util(int cpu) +static unsigned long cpu_util(int cpu)  {  	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;  	unsigned long capacity = capacity_orig_of(cpu); @@ -6255,7 +6276,7 @@ static int cpu_util(int cpu)  	return (util >= capacity) ? 
capacity : util;  } -static inline int task_util(struct task_struct *p) +static inline unsigned long task_util(struct task_struct *p)  {  	return p->se.avg.util_avg;  } @@ -6264,7 +6285,7 @@ static inline int task_util(struct task_struct *p)   * cpu_util_wake: Compute cpu utilization with any contributions from   * the waking task p removed.   */ -static int cpu_util_wake(int cpu, struct task_struct *p) +static unsigned long cpu_util_wake(int cpu, struct task_struct *p)  {  	unsigned long util, capacity; @@ -6355,8 +6376,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f  		if (cpu == prev_cpu)  			goto pick_cpu; -		if (wake_affine(affine_sd, p, prev_cpu, sync)) -			new_cpu = cpu; +		new_cpu = wake_affine(affine_sd, p, prev_cpu, sync);  	}  	if (sd && !(sd_flag & SD_BALANCE_FORK)) { @@ -6370,9 +6390,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f  	if (!sd) {  pick_cpu: -		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ +		if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */  			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); +			if (want_affine) +				current->recent_used_cpu = cpu; +		}  	} else {  		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);  	} @@ -6449,8 +6472,7 @@ static void task_dead_fair(struct task_struct *p)  }  #endif /* CONFIG_SMP */ -static unsigned long -wakeup_gran(struct sched_entity *curr, struct sched_entity *se) +static unsigned long wakeup_gran(struct sched_entity *se)  {  	unsigned long gran = sysctl_sched_wakeup_granularity; @@ -6492,7 +6514,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)  	if (vdiff <= 0)  		return -1; -	gran = wakeup_gran(curr, se); +	gran = wakeup_gran(se);  	if (vdiff > gran)  		return 1; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 9bcbacba82a8..5d0762633639 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -26,24 +26,110 @@   * Bitmask made from a "or" of all commands within enum membarrier_cmd,   * except MEMBARRIER_CMD_QUERY.   */ +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE +#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	\ +	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ +	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) +#else +#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0 +#endif +  #define MEMBARRIER_CMD_BITMASK	\ -	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED	\ -	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) +	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ +	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ +	| MEMBARRIER_CMD_PRIVATE_EXPEDITED	\ +	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED	\ +	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)  static void ipi_mb(void *info)  {  	smp_mb();	/* IPIs should be serializing but paranoid. */  } -static int membarrier_private_expedited(void) +static int membarrier_global_expedited(void)  {  	int cpu;  	bool fallback = false;  	cpumask_var_t tmpmask; -	if (!(atomic_read(¤t->mm->membarrier_state) -			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) -		return -EPERM; +	if (num_online_cpus() == 1) +		return 0; + +	/* +	 * Matches memory barriers around rq->curr modification in +	 * scheduler. +	 */ +	smp_mb();	/* system call entry is not a mb. */ + +	/* +	 * Expedited membarrier commands guarantee that they won't +	 * block, hence the GFP_NOWAIT allocation flag and fallback +	 * implementation. +	 */ +	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { +		/* Fallback for OOM. 
*/ +		fallback = true; +	} + +	cpus_read_lock(); +	for_each_online_cpu(cpu) { +		struct task_struct *p; + +		/* +		 * Skipping the current CPU is OK even through we can be +		 * migrated at any point. The current CPU, at the point +		 * where we read raw_smp_processor_id(), is ensured to +		 * be in program order with respect to the caller +		 * thread. Therefore, we can skip this CPU from the +		 * iteration. +		 */ +		if (cpu == raw_smp_processor_id()) +			continue; +		rcu_read_lock(); +		p = task_rcu_dereference(&cpu_rq(cpu)->curr); +		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & +				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { +			if (!fallback) +				__cpumask_set_cpu(cpu, tmpmask); +			else +				smp_call_function_single(cpu, ipi_mb, NULL, 1); +		} +		rcu_read_unlock(); +	} +	if (!fallback) { +		preempt_disable(); +		smp_call_function_many(tmpmask, ipi_mb, NULL, 1); +		preempt_enable(); +		free_cpumask_var(tmpmask); +	} +	cpus_read_unlock(); + +	/* +	 * Memory barrier on the caller thread _after_ we finished +	 * waiting for the last IPI. Matches memory barriers around +	 * rq->curr modification in scheduler. +	 */ +	smp_mb();	/* exit from system call is not a mb */ +	return 0; +} + +static int membarrier_private_expedited(int flags) +{ +	int cpu; +	bool fallback = false; +	cpumask_var_t tmpmask; + +	if (flags & MEMBARRIER_FLAG_SYNC_CORE) { +		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) +			return -EINVAL; +		if (!(atomic_read(¤t->mm->membarrier_state) & +		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) +			return -EPERM; +	} else { +		if (!(atomic_read(¤t->mm->membarrier_state) & +		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) +			return -EPERM; +	}  	if (num_online_cpus() == 1)  		return 0; @@ -105,21 +191,69 @@ static int membarrier_private_expedited(void)  	return 0;  } -static void membarrier_register_private_expedited(void) +static int membarrier_register_global_expedited(void)  {  	struct task_struct *p = current;  	struct mm_struct *mm = p->mm; +	if (atomic_read(&mm->membarrier_state) & +	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) +		return 0; +	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); +	if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { +		/* +		 * For single mm user, single threaded process, we can +		 * simply issue a memory barrier after setting +		 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that +		 * no memory access following registration is reordered +		 * before registration. +		 */ +		smp_mb(); +	} else { +		/* +		 * For multi-mm user threads, we need to ensure all +		 * future scheduler executions will observe the new +		 * thread flag state for this mm. +		 */ +		synchronize_sched(); +	} +	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, +		  &mm->membarrier_state); +	return 0; +} + +static int membarrier_register_private_expedited(int flags) +{ +	struct task_struct *p = current; +	struct mm_struct *mm = p->mm; +	int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; + +	if (flags & MEMBARRIER_FLAG_SYNC_CORE) { +		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) +			return -EINVAL; +		state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; +	} +  	/*  	 * We need to consider threads belonging to different thread  	 * groups, which use the same mm. (CLONE_VM but not  	 * CLONE_THREAD).  	 
*/ -	if (atomic_read(&mm->membarrier_state) -			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) -		return; -	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, -			&mm->membarrier_state); +	if (atomic_read(&mm->membarrier_state) & state) +		return 0; +	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); +	if (flags & MEMBARRIER_FLAG_SYNC_CORE) +		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, +			  &mm->membarrier_state); +	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { +		/* +		 * Ensure all future scheduler executions will observe the +		 * new thread flag state for this process. +		 */ +		synchronize_sched(); +	} +	atomic_or(state, &mm->membarrier_state); +	return 0;  }  /** @@ -159,21 +293,28 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)  		int cmd_mask = MEMBARRIER_CMD_BITMASK;  		if (tick_nohz_full_enabled()) -			cmd_mask &= ~MEMBARRIER_CMD_SHARED; +			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;  		return cmd_mask;  	} -	case MEMBARRIER_CMD_SHARED: -		/* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ +	case MEMBARRIER_CMD_GLOBAL: +		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */  		if (tick_nohz_full_enabled())  			return -EINVAL;  		if (num_online_cpus() > 1)  			synchronize_sched();  		return 0; +	case MEMBARRIER_CMD_GLOBAL_EXPEDITED: +		return membarrier_global_expedited(); +	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: +		return membarrier_register_global_expedited();  	case MEMBARRIER_CMD_PRIVATE_EXPEDITED: -		return membarrier_private_expedited(); +		return membarrier_private_expedited(0);  	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: -		membarrier_register_private_expedited(); -		return 0; +		return membarrier_register_private_expedited(0); +	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: +		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); +	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: +		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);  	default:  		return -EINVAL;  	} diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 665ace2fc558..aad49451584e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -951,11 +951,13 @@ static void update_curr_rt(struct rq *rq)  	struct task_struct *curr = rq->curr;  	struct sched_rt_entity *rt_se = &curr->rt;  	u64 delta_exec; +	u64 now;  	if (curr->sched_class != &rt_sched_class)  		return; -	delta_exec = rq_clock_task(rq) - curr->se.exec_start; +	now = rq_clock_task(rq); +	delta_exec = now - curr->se.exec_start;  	if (unlikely((s64)delta_exec <= 0))  		return; @@ -968,7 +970,7 @@ static void update_curr_rt(struct rq *rq)  	curr->se.sum_exec_runtime += delta_exec;  	account_group_exec_runtime(curr, delta_exec); -	curr->se.exec_start = rq_clock_task(rq); +	curr->se.exec_start = now;  	cgroup_account_cputime(curr, delta_exec);  	sched_rt_avg_update(rq, delta_exec); @@ -1907,9 +1909,8 @@ static void push_rt_tasks(struct rq *rq)   * the rt_loop_next will cause the iterator to perform another scan.   *   */ -static int rto_next_cpu(struct rq *rq) +static int rto_next_cpu(struct root_domain *rd)  { -	struct root_domain *rd = rq->rd;  	int next;  	int cpu; @@ -1985,19 +1986,24 @@ static void tell_cpu_to_push(struct rq *rq)  	 * Otherwise it is finishing up and an ipi needs to be sent.  	 
*/  	if (rq->rd->rto_cpu < 0) -		cpu = rto_next_cpu(rq); +		cpu = rto_next_cpu(rq->rd);  	raw_spin_unlock(&rq->rd->rto_lock);  	rto_start_unlock(&rq->rd->rto_loop_start); -	if (cpu >= 0) +	if (cpu >= 0) { +		/* Make sure the rd does not get freed while pushing */ +		sched_get_rd(rq->rd);  		irq_work_queue_on(&rq->rd->rto_push_work, cpu); +	}  }  /* Called from hardirq context */  void rto_push_irq_work_func(struct irq_work *work)  { +	struct root_domain *rd = +		container_of(work, struct root_domain, rto_push_work);  	struct rq *rq;  	int cpu; @@ -2013,18 +2019,20 @@ void rto_push_irq_work_func(struct irq_work *work)  		raw_spin_unlock(&rq->lock);  	} -	raw_spin_lock(&rq->rd->rto_lock); +	raw_spin_lock(&rd->rto_lock);  	/* Pass the IPI to the next rt overloaded queue */ -	cpu = rto_next_cpu(rq); +	cpu = rto_next_cpu(rd); -	raw_spin_unlock(&rq->rd->rto_lock); +	raw_spin_unlock(&rd->rto_lock); -	if (cpu < 0) +	if (cpu < 0) { +		sched_put_rd(rd);  		return; +	}  	/* Try the next RT overloaded CPU */ -	irq_work_queue_on(&rq->rd->rto_push_work, cpu); +	irq_work_queue_on(&rd->rto_push_work, cpu);  }  #endif /* HAVE_RT_PUSH_IPI */ @@ -2212,7 +2220,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)  		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)  			queue_push_tasks(rq);  #endif /* CONFIG_SMP */ -		if (p->prio < rq->curr->prio) +		if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))  			resched_curr(rq);  	}  } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b19552a212de..fb5fc458547f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -156,13 +156,39 @@ static inline int task_has_dl_policy(struct task_struct *p)  	return dl_policy(p->policy);  } +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +/* + * !! For sched_setattr_nocheck() (kernel) only !! + * + * This is actually gross. :( + * + * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE + * tasks, but still be able to sleep. We need this on platforms that cannot + * atomically change clock frequency. Remove once fast switching will be + * available on such platforms. + * + * SUGOV stands for SchedUtil GOVernor. + */ +#define SCHED_FLAG_SUGOV	0x10000000 + +static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) +{ +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +	return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); +#else +	return false; +#endif +} +  /*   * Tells if entity @a should preempt entity @b.   */  static inline bool  dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)  { -	return dl_time_before(a->deadline, b->deadline); +	return dl_entity_is_special(a) || +	       dl_time_before(a->deadline, b->deadline);  }  /* @@ -665,6 +691,8 @@ extern struct mutex sched_domains_mutex;  extern void init_defrootdomain(void);  extern int sched_init_domains(const struct cpumask *cpu_map);  extern void rq_attach_root(struct rq *rq, struct root_domain *rd); +extern void sched_get_rd(struct root_domain *rd); +extern void sched_put_rd(struct root_domain *rd);  #ifdef HAVE_RT_PUSH_IPI  extern void rto_push_irq_work_func(struct irq_work *work); @@ -1328,47 +1356,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)  # define finish_arch_post_lock_switch()	do { } while (0)  #endif -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP -	/* -	 * We can optimise this out completely for !SMP, because the -	 * SMP rebalancing from interrupt is the only thing that cares -	 * here. 
-	 */ -	next->on_cpu = 1; -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP -	/* -	 * After ->on_cpu is cleared, the task can be moved to a different CPU. -	 * We must ensure this doesn't happen until the switch is completely -	 * finished. -	 * -	 * In particular, the load of prev->state in finish_task_switch() must -	 * happen before this. -	 * -	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -	 */ -	smp_store_release(&prev->on_cpu, 0); -#endif -#ifdef CONFIG_DEBUG_SPINLOCK -	/* this is a valid case when another task releases the spinlock */ -	rq->lock.owner = current; -#endif -	/* -	 * If we are tracking spinlock dependencies then we have to -	 * fix up the runqueue lock - which gets 'carried over' from -	 * prev into current: -	 */ -	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - -	raw_spin_unlock_irq(&rq->lock); -} -  /*   * wake flags   */ @@ -1687,17 +1674,17 @@ static inline int hrtick_enabled(struct rq *rq)  #endif /* CONFIG_SCHED_HRTICK */ -#ifdef CONFIG_SMP -extern void sched_avg_update(struct rq *rq); -  #ifndef arch_scale_freq_capacity  static __always_inline -unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +unsigned long arch_scale_freq_capacity(int cpu)  {  	return SCHED_CAPACITY_SCALE;  }  #endif +#ifdef CONFIG_SMP +extern void sched_avg_update(struct rq *rq); +  #ifndef arch_scale_cpu_capacity  static __always_inline  unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) @@ -1711,10 +1698,17 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)  static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)  { -	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); +	rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq));  	sched_avg_update(rq);  }  #else +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) +{ +	return SCHED_CAPACITY_SCALE; +} +#endif  static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }  static inline void sched_avg_update(struct rq *rq) { }  #endif @@ -2096,14 +2090,14 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);   * The way cpufreq is currently arranged requires it to evaluate the CPU   * performance state (frequency/voltage) on a regular basis to prevent it from   * being stuck in a completely inadequate performance level for too long. - * That is not guaranteed to happen if the updates are only triggered from CFS, - * though, because they may not be coming in if RT or deadline tasks are active - * all the time (or there are RT and DL tasks only). + * That is not guaranteed to happen if the updates are only triggered from CFS + * and DL, though, because they may not be coming in if only RT tasks are + * active all the time (or there are RT tasks only).   * - * As a workaround for that issue, this function is called by the RT and DL - * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * As a workaround for that issue, this function is called periodically by the + * RT sched class to trigger extra cpufreq updates to prevent it from stalling,   * but that really is a band-aid.  Going forward it should be replaced with - * solutions targeted more specifically at RT and DL tasks. + * solutions targeted more specifically at RT tasks.   
*/  static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)  { @@ -2125,3 +2119,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}  #else /* arch_scale_freq_capacity */  #define arch_scale_freq_invariant()	(false)  #endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + +static inline unsigned long cpu_util_dl(struct rq *rq) +{ +	return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} + +static inline unsigned long cpu_util_cfs(struct rq *rq) +{ +	return rq->cfs.avg.util_avg; +} + +#endif diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index baf500d12b7c..8e7b58de61e7 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -31,8 +31,11 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)  		rq->rq_sched_info.run_delay += delta;  }  #define schedstat_enabled()		static_branch_unlikely(&sched_schedstats) +#define __schedstat_inc(var)		do { var++; } while (0)  #define schedstat_inc(var)		do { if (schedstat_enabled()) { var++; } } while (0) +#define __schedstat_add(var, amt)	do { var += (amt); } while (0)  #define schedstat_add(var, amt)		do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define __schedstat_set(var, val)		do { var = (val); } while (0)  #define schedstat_set(var, val)		do { if (schedstat_enabled()) { var = (val); } } while (0)  #define schedstat_val(var)		(var)  #define schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0) @@ -48,8 +51,11 @@ static inline void  rq_sched_info_depart(struct rq *rq, unsigned long long delta)  {}  #define schedstat_enabled()		0 +#define __schedstat_inc(var)		do { } while (0)  #define schedstat_inc(var)		do { } while (0) +#define __schedstat_add(var, amt)	do { } while (0)  #define schedstat_add(var, amt)		do { } while (0) +#define __schedstat_set(var, val)	do { } while (0)  #define schedstat_set(var, val)		do { } while (0)  #define schedstat_val(var)		0  #define schedstat_val_or_zero(var)	0 diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 034cbed7f88b..519b024f4e94 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -259,6 +259,19 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)  		call_rcu_sched(&old_rd->rcu, free_rootdomain);  } +void sched_get_rd(struct root_domain *rd) +{ +	atomic_inc(&rd->refcount); +} + +void sched_put_rd(struct root_domain *rd) +{ +	if (!atomic_dec_and_test(&rd->refcount)) +		return; + +	call_rcu_sched(&rd->rcu, free_rootdomain); +} +  static int init_rootdomain(struct root_domain *rd)  {  	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) |
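
The membarrier.c changes above rename MEMBARRIER_CMD_SHARED to MEMBARRIER_CMD_GLOBAL, add the GLOBAL_EXPEDITED and PRIVATE_EXPEDITED_SYNC_CORE command pairs, and keep the rule that a process must register before issuing expedited barriers. For reference, a minimal userspace sketch of that register-then-issue flow for the private expedited command; it assumes uapi headers new enough to define these commands and trims error handling down to perror():

/*
 * Hypothetical usage sketch, not part of the patch: query support,
 * register the mm, then issue a private expedited barrier.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>

static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

	if (mask < 0 || !(mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED)) {
		fprintf(stderr, "private expedited membarrier unsupported\n");
		return 1;
	}

	/* Registration is required once per process before first use. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0)) {
		perror("register");
		return 1;
	}

	/* Orders memory accesses against all running threads of this mm. */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
		perror("membarrier");
		return 1;
	}

	return 0;
}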
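
In cpufreq_schedutil.c, sugov_get_util() now records util_cfs and util_dl separately, sugov_aggregate_util() sums them clamped to the CPU's capacity, and cpu_util_dl() in sched.h turns the DEADLINE running bandwidth into capacity units. A standalone sketch of that arithmetic follows; SCHED_CAPACITY_SCALE and BW_SHIFT mirror the kernel's values as an assumption, and the sample numbers are made up:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024ULL
#define BW_SHIFT		20

/* cpu_util_dl(): scale dl running_bw (fixed point, 1 << BW_SHIFT == 100%). */
static unsigned long long cpu_util_dl(unsigned long long running_bw)
{
	return (running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
}

/* sugov_aggregate_util(): request CFS + DL utilization, capped at capacity. */
static unsigned long long aggregate_util(unsigned long long util_cfs,
					 unsigned long long util_dl,
					 unsigned long long max)
{
	unsigned long long sum = util_cfs + util_dl;

	return sum < max ? sum : max;
}

int main(void)
{
	/* A 10ms/100ms DEADLINE reservation: running_bw is 10% in BW units. */
	unsigned long long running_bw = (1ULL << BW_SHIFT) / 10;
	unsigned long long util_dl = cpu_util_dl(running_bw);
	unsigned long long util = aggregate_util(300, util_dl,
						 SCHED_CAPACITY_SCALE);

	printf("util_dl=%llu aggregated=%llu\n", util_dl, util);
	return 0;
}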
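
deadline.c's update_curr_dl() now charges scaled runtime: unless the task reclaims bandwidth via GRUB, delta_exec is reduced by the current frequency scale and by the CPU capacity scale using cap_scale(), so a reservation corresponds to the same amount of work at any operating point. A self-contained illustration of that scaling, with made-up scale factors (roughly 75% frequency, 80% capacity):

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)
#define cap_scale(v, s)		((v) * (s) >> SCHED_CAPACITY_SHIFT)

int main(void)
{
	unsigned long long delta_exec = 2000000;	/* 2ms of wall clock   */
	unsigned long scale_freq = 768;			/* ~75% of max freq    */
	unsigned long scale_cpu = 819;			/* ~80% capacity CPU   */
	unsigned long long scaled;

	scaled = cap_scale(delta_exec, scale_freq);
	scaled = cap_scale(scaled, scale_cpu);

	/* Only ~1.2ms is charged against dl_se->runtime in this example. */
	printf("delta_exec=%llu scaled=%llu\n", delta_exec, scaled);
	return 0;
}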