diff options
Diffstat (limited to 'kernel/sched/core.c')
| -rw-r--r-- | kernel/sched/core.c | 861 | 
1 files changed, 739 insertions, 122 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a88f4a485c5e..7fea865a810d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -296,8 +296,6 @@ __read_mostly int scheduler_running;   */  int sysctl_sched_rt_runtime = 950000; - -  /*   * __task_rq_lock - lock the rq @p resides on.   */ @@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p)  {  	int prio; -	if (task_has_rt_policy(p)) +	if (task_has_dl_policy(p)) +		prio = MAX_DL_PRIO-1; +	else if (task_has_rt_policy(p))  		prio = MAX_RT_PRIO-1 - p->rt_priority;  	else  		prio = __normal_prio(p); @@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,  		if (prev_class->switched_from)  			prev_class->switched_from(rq, p);  		p->sched_class->switched_to(rq, p); -	} else if (oldprio != p->prio) +	} else if (oldprio != p->prio || dl_task(p))  		p->sched_class->prio_changed(rq, p, oldprio);  } @@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)  	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))  		goto out; +	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);  	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);  out: @@ -1499,8 +1500,7 @@ void scheduler_ipi(void)  	 * TIF_NEED_RESCHED remotely (for the first time) will also send  	 * this IPI.  	 
*/ -	if (tif_need_resched()) -		set_preempt_need_resched(); +	preempt_fold_need_resched();  	if (llist_empty(&this_rq()->wake_list)  			&& !tick_nohz_full_cpu(smp_processor_id()) @@ -1717,6 +1717,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)  	memset(&p->se.statistics, 0, sizeof(p->se.statistics));  #endif +	RB_CLEAR_NODE(&p->dl.rb_node); +	hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	p->dl.dl_runtime = p->dl.runtime = 0; +	p->dl.dl_deadline = p->dl.deadline = 0; +	p->dl.dl_period = 0; +	p->dl.flags = 0; +  	INIT_LIST_HEAD(&p->rt.run_list);  #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1763,12 +1770,34 @@ void set_numabalancing_state(bool enabled)  	numabalancing_enabled = enabled;  }  #endif /* CONFIG_SCHED_DEBUG */ -#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_numa_balancing(struct ctl_table *table, int write, +			 void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	struct ctl_table t; +	int err; +	int state = numabalancing_enabled; + +	if (write && !capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	t = *table; +	t.data = &state; +	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); +	if (err < 0) +		return err; +	if (write) +		set_numabalancing_state(state); +	return err; +} +#endif +#endif  /*   * fork()/clone()-time setup:   */ -void sched_fork(unsigned long clone_flags, struct task_struct *p) +int sched_fork(unsigned long clone_flags, struct task_struct *p)  {  	unsigned long flags;  	int cpu = get_cpu(); @@ -1790,7 +1819,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)  	 * Revert to default priority/policy on fork if requested.  	 
*/  	if (unlikely(p->sched_reset_on_fork)) { -		if (task_has_rt_policy(p)) { +		if (task_has_dl_policy(p) || task_has_rt_policy(p)) {  			p->policy = SCHED_NORMAL;  			p->static_prio = NICE_TO_PRIO(0);  			p->rt_priority = 0; @@ -1807,8 +1836,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)  		p->sched_reset_on_fork = 0;  	} -	if (!rt_prio(p->prio)) +	if (dl_prio(p->prio)) { +		put_cpu(); +		return -EAGAIN; +	} else if (rt_prio(p->prio)) { +		p->sched_class = &rt_sched_class; +	} else {  		p->sched_class = &fair_sched_class; +	}  	if (p->sched_class->task_fork)  		p->sched_class->task_fork(p); @@ -1834,12 +1869,125 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)  	init_task_preempt_count(p);  #ifdef CONFIG_SMP  	plist_node_init(&p->pushable_tasks, MAX_PRIO); +	RB_CLEAR_NODE(&p->pushable_dl_tasks);  #endif  	put_cpu(); +	return 0; +} + +unsigned long to_ratio(u64 period, u64 runtime) +{ +	if (runtime == RUNTIME_INF) +		return 1ULL << 20; + +	/* +	 * Doing this here saves a lot of checks in all +	 * the calling paths, and returning zero seems +	 * safe for them anyway. 
+	 */ +	if (period == 0) +		return 0; + +	return div64_u64(runtime << 20, period); +} + +#ifdef CONFIG_SMP +inline struct dl_bw *dl_bw_of(int i) +{ +	return &cpu_rq(i)->rd->dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ +	struct root_domain *rd = cpu_rq(i)->rd; +	int cpus = 0; + +	for_each_cpu_and(i, rd->span, cpu_active_mask) +		cpus++; + +	return cpus; +} +#else +inline struct dl_bw *dl_bw_of(int i) +{ +	return &cpu_rq(i)->dl.dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ +	return 1; +} +#endif + +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ +	dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ +	dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ +	return dl_b->bw != -1 && +	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;  }  /* + * We must be sure that accepting a new task (or allowing changing the + * parameters of an existing one) is consistent with the bandwidth + * constraints. If yes, this function also accordingly updates the currently + * allocated bandwidth to reflect the new situation. + * + * This function is called while holding p's rq->lock. + */ +static int dl_overflow(struct task_struct *p, int policy, +		       const struct sched_attr *attr) +{ + +	struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); +	u64 period = attr->sched_period; +	u64 runtime = attr->sched_runtime; +	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; +	int cpus, err = -1; + +	if (new_bw == p->dl.dl_bw) +		return 0; + +	/* +	 * Either if a task, enters, leave, or stays -deadline but changes +	 * its parameters, we may need to update accordingly the total +	 * allocated bandwidth of the container. 
+	 */ +	raw_spin_lock(&dl_b->lock); +	cpus = dl_bw_cpus(task_cpu(p)); +	if (dl_policy(policy) && !task_has_dl_policy(p) && +	    !__dl_overflow(dl_b, cpus, 0, new_bw)) { +		__dl_add(dl_b, new_bw); +		err = 0; +	} else if (dl_policy(policy) && task_has_dl_policy(p) && +		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { +		__dl_clear(dl_b, p->dl.dl_bw); +		__dl_add(dl_b, new_bw); +		err = 0; +	} else if (!dl_policy(policy) && task_has_dl_policy(p)) { +		__dl_clear(dl_b, p->dl.dl_bw); +		err = 0; +	} +	raw_spin_unlock(&dl_b->lock); + +	return err; +} + +extern void init_dl_bw(struct dl_bw *dl_b); + +/*   * wake_up_new_task - wake up a newly created task for the first time.   *   * This function will do some initial scheduler statistics housekeeping @@ -2003,6 +2151,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  	if (unlikely(prev_state == TASK_DEAD)) {  		task_numa_free(prev); +		if (prev->sched_class->task_dead) +			prev->sched_class->task_dead(prev); +  		/*  		 * Remove function-return probe instances associated with this  		 * task and put them back on the free list. @@ -2296,7 +2447,7 @@ void scheduler_tick(void)  #ifdef CONFIG_SMP  	rq->idle_balance = idle_cpu(cpu); -	trigger_load_balance(rq, cpu); +	trigger_load_balance(rq);  #endif  	rq_last_tick_reset(rq);  } @@ -2414,10 +2565,10 @@ static inline void schedule_debug(struct task_struct *prev)  {  	/*  	 * Test if we are atomic. Since do_exit() needs to call into -	 * schedule() atomically, we ignore that path for now. -	 * Otherwise, whine if we are scheduling when we should not be. +	 * schedule() atomically, we ignore that path. Otherwise whine +	 * if we are scheduling when we should not.  	 
*/ -	if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) +	if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))  		__schedule_bug(prev);  	rcu_sleep_check(); @@ -2761,11 +2912,11 @@ EXPORT_SYMBOL(sleep_on_timeout);   */  void rt_mutex_setprio(struct task_struct *p, int prio)  { -	int oldprio, on_rq, running; +	int oldprio, on_rq, running, enqueue_flag = 0;  	struct rq *rq;  	const struct sched_class *prev_class; -	BUG_ON(prio < 0 || prio > MAX_PRIO); +	BUG_ON(prio > MAX_PRIO);  	rq = __task_rq_lock(p); @@ -2788,6 +2939,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	}  	trace_sched_pi_setprio(p, prio); +	p->pi_top_task = rt_mutex_get_top_task(p);  	oldprio = p->prio;  	prev_class = p->sched_class;  	on_rq = p->on_rq; @@ -2797,23 +2949,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	if (running)  		p->sched_class->put_prev_task(rq, p); -	if (rt_prio(prio)) +	/* +	 * Boosting condition are: +	 * 1. -rt task is running and holds mutex A +	 *      --> -dl task blocks on mutex A +	 * +	 * 2. -dl task is running and holds mutex A +	 *      --> -dl task blocks on mutex A and could preempt the +	 *          running task +	 */ +	if (dl_prio(prio)) { +		if (!dl_prio(p->normal_prio) || (p->pi_top_task && +			dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { +			p->dl.dl_boosted = 1; +			p->dl.dl_throttled = 0; +			enqueue_flag = ENQUEUE_REPLENISH; +		} else +			p->dl.dl_boosted = 0; +		p->sched_class = &dl_sched_class; +	} else if (rt_prio(prio)) { +		if (dl_prio(oldprio)) +			p->dl.dl_boosted = 0; +		if (oldprio < prio) +			enqueue_flag = ENQUEUE_HEAD;  		p->sched_class = &rt_sched_class; -	else +	} else { +		if (dl_prio(oldprio)) +			p->dl.dl_boosted = 0;  		p->sched_class = &fair_sched_class; +	}  	p->prio = prio;  	if (running)  		p->sched_class->set_curr_task(rq);  	if (on_rq) -		enqueue_task(rq, p, oldprio < prio ? 
ENQUEUE_HEAD : 0); +		enqueue_task(rq, p, enqueue_flag);  	check_class_changed(rq, p, prev_class, oldprio);  out_unlock:  	__task_rq_unlock(rq);  }  #endif +  void set_user_nice(struct task_struct *p, long nice)  {  	int old_prio, delta, on_rq; @@ -2831,9 +3009,9 @@ void set_user_nice(struct task_struct *p, long nice)  	 * The RT priorities are set via sched_setscheduler(), but we still  	 * allow the 'normal' nice value to be set - but as expected  	 * it wont have any effect on scheduling until the task is -	 * SCHED_FIFO/SCHED_RR: +	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:  	 */ -	if (task_has_rt_policy(p)) { +	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {  		p->static_prio = NICE_TO_PRIO(nice);  		goto out_unlock;  	} @@ -2988,22 +3166,95 @@ static struct task_struct *find_process_by_pid(pid_t pid)  	return pid ? find_task_by_vpid(pid) : current;  } -/* Actually do priority change: must hold rq lock. */ +/* + * This function initializes the sched_dl_entity of a newly becoming + * SCHED_DEADLINE task. + * + * Only the static values are considered here, the actual runtime and the + * absolute deadline will be properly calculated when the task is enqueued + * for the first time with its new policy. + */  static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +__setparam_dl(struct task_struct *p, const struct sched_attr *attr) +{ +	struct sched_dl_entity *dl_se = &p->dl; + +	init_dl_task_timer(dl_se); +	dl_se->dl_runtime = attr->sched_runtime; +	dl_se->dl_deadline = attr->sched_deadline; +	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; +	dl_se->flags = attr->sched_flags; +	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); +	dl_se->dl_throttled = 0; +	dl_se->dl_new = 1; +} + +/* Actually do priority change: must hold pi & rq lock. 
*/ +static void __setscheduler(struct rq *rq, struct task_struct *p, +			   const struct sched_attr *attr)  { +	int policy = attr->sched_policy; + +	if (policy == -1) /* setparam */ +		policy = p->policy; +  	p->policy = policy; -	p->rt_priority = prio; + +	if (dl_policy(policy)) +		__setparam_dl(p, attr); +	else if (fair_policy(policy)) +		p->static_prio = NICE_TO_PRIO(attr->sched_nice); + +	/* +	 * __sched_setscheduler() ensures attr->sched_priority == 0 when +	 * !rt_policy. Always setting this ensures that things like +	 * getparam()/getattr() don't report silly values for !rt tasks. +	 */ +	p->rt_priority = attr->sched_priority; +  	p->normal_prio = normal_prio(p); -	/* we are holding p->pi_lock already */  	p->prio = rt_mutex_getprio(p); -	if (rt_prio(p->prio)) + +	if (dl_prio(p->prio)) +		p->sched_class = &dl_sched_class; +	else if (rt_prio(p->prio))  		p->sched_class = &rt_sched_class;  	else  		p->sched_class = &fair_sched_class; +  	set_load_weight(p);  } +static void +__getparam_dl(struct task_struct *p, struct sched_attr *attr) +{ +	struct sched_dl_entity *dl_se = &p->dl; + +	attr->sched_priority = p->rt_priority; +	attr->sched_runtime = dl_se->dl_runtime; +	attr->sched_deadline = dl_se->dl_deadline; +	attr->sched_period = dl_se->dl_period; +	attr->sched_flags = dl_se->flags; +} + +/* + * This function validates the new parameters of a -deadline task. + * We ask for the deadline not being zero, and greater or equal + * than the runtime, as well as the period of being zero or + * greater than deadline. Furthermore, we have to be sure that + * user parameters are above the internal resolution (1us); we + * check sched_runtime only since it is always the smaller one. 
+ */ +static bool +__checkparam_dl(const struct sched_attr *attr) +{ +	return attr && attr->sched_deadline != 0 && +		(attr->sched_period == 0 || +		(s64)(attr->sched_period   - attr->sched_deadline) >= 0) && +		(s64)(attr->sched_deadline - attr->sched_runtime ) >= 0  && +		attr->sched_runtime >= (2 << (DL_SCALE - 1)); +} +  /*   * check the target process has a UID that matches the current process's   */ @@ -3020,10 +3271,12 @@ static bool check_same_owner(struct task_struct *p)  	return match;  } -static int __sched_setscheduler(struct task_struct *p, int policy, -				const struct sched_param *param, bool user) +static int __sched_setscheduler(struct task_struct *p, +				const struct sched_attr *attr, +				bool user)  {  	int retval, oldprio, oldpolicy = -1, on_rq, running; +	int policy = attr->sched_policy;  	unsigned long flags;  	const struct sched_class *prev_class;  	struct rq *rq; @@ -3037,31 +3290,40 @@ recheck:  		reset_on_fork = p->sched_reset_on_fork;  		policy = oldpolicy = p->policy;  	} else { -		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); -		policy &= ~SCHED_RESET_ON_FORK; +		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); -		if (policy != SCHED_FIFO && policy != SCHED_RR && +		if (policy != SCHED_DEADLINE && +				policy != SCHED_FIFO && policy != SCHED_RR &&  				policy != SCHED_NORMAL && policy != SCHED_BATCH &&  				policy != SCHED_IDLE)  			return -EINVAL;  	} +	if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) +		return -EINVAL; +  	/*  	 * Valid priorities for SCHED_FIFO and SCHED_RR are  	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,  	 * SCHED_BATCH and SCHED_IDLE is 0.  	 
*/ -	if (param->sched_priority < 0 || -	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || -	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) +	if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || +	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))  		return -EINVAL; -	if (rt_policy(policy) != (param->sched_priority != 0)) +	if ((dl_policy(policy) && !__checkparam_dl(attr)) || +	    (rt_policy(policy) != (attr->sched_priority != 0)))  		return -EINVAL;  	/*  	 * Allow unprivileged RT tasks to decrease priority:  	 */  	if (user && !capable(CAP_SYS_NICE)) { +		if (fair_policy(policy)) { +			if (attr->sched_nice < TASK_NICE(p) && +			    !can_nice(p, attr->sched_nice)) +				return -EPERM; +		} +  		if (rt_policy(policy)) {  			unsigned long rlim_rtprio =  					task_rlimit(p, RLIMIT_RTPRIO); @@ -3071,8 +3333,8 @@ recheck:  				return -EPERM;  			/* can't increase priority */ -			if (param->sched_priority > p->rt_priority && -			    param->sched_priority > rlim_rtprio) +			if (attr->sched_priority > p->rt_priority && +			    attr->sched_priority > rlim_rtprio)  				return -EPERM;  		} @@ -3120,14 +3382,21 @@ recheck:  	/*  	 * If not changing anything there's no need to proceed further:  	 */ -	if (unlikely(policy == p->policy && (!rt_policy(policy) || -			param->sched_priority == p->rt_priority))) { +	if (unlikely(policy == p->policy)) { +		if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) +			goto change; +		if (rt_policy(policy) && attr->sched_priority != p->rt_priority) +			goto change; +		if (dl_policy(policy)) +			goto change; +  		task_rq_unlock(rq, p, &flags);  		return 0;  	} +change: -#ifdef CONFIG_RT_GROUP_SCHED  	if (user) { +#ifdef CONFIG_RT_GROUP_SCHED  		/*  		 * Do not allow realtime tasks into groups that have no runtime  		 * assigned. 
@@ -3138,8 +3407,24 @@ recheck:  			task_rq_unlock(rq, p, &flags);  			return -EPERM;  		} -	}  #endif +#ifdef CONFIG_SMP +		if (dl_bandwidth_enabled() && dl_policy(policy)) { +			cpumask_t *span = rq->rd->span; + +			/* +			 * Don't allow tasks with an affinity mask smaller than +			 * the entire root_domain to become SCHED_DEADLINE. We +			 * will also fail if there's no bandwidth available. +			 */ +			if (!cpumask_subset(span, &p->cpus_allowed) || +			    rq->rd->dl_bw.bw == 0) { +				task_rq_unlock(rq, p, &flags); +				return -EPERM; +			} +		} +#endif +	}  	/* recheck policy now with rq lock held */  	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { @@ -3147,6 +3432,17 @@ recheck:  		task_rq_unlock(rq, p, &flags);  		goto recheck;  	} + +	/* +	 * If setscheduling to SCHED_DEADLINE (or changing the parameters +	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth +	 * is available. +	 */ +	if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { +		task_rq_unlock(rq, p, &flags); +		return -EBUSY; +	} +  	on_rq = p->on_rq;  	running = task_current(rq, p);  	if (on_rq) @@ -3158,7 +3454,7 @@ recheck:  	oldprio = p->prio;  	prev_class = p->sched_class; -	__setscheduler(rq, p, policy, param->sched_priority); +	__setscheduler(rq, p, attr);  	if (running)  		p->sched_class->set_curr_task(rq); @@ -3173,6 +3469,26 @@ recheck:  	return 0;  } +static int _sched_setscheduler(struct task_struct *p, int policy, +			       const struct sched_param *param, bool check) +{ +	struct sched_attr attr = { +		.sched_policy   = policy, +		.sched_priority = param->sched_priority, +		.sched_nice	= PRIO_TO_NICE(p->static_prio), +	}; + +	/* +	 * Fixup the legacy SCHED_RESET_ON_FORK hack +	 */ +	if (policy & SCHED_RESET_ON_FORK) { +		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; +		policy &= ~SCHED_RESET_ON_FORK; +		attr.sched_policy = policy; +	} + +	return __sched_setscheduler(p, &attr, check); +}  /**   * sched_setscheduler - change the scheduling 
policy and/or RT priority of a thread.   * @p: the task in question. @@ -3186,10 +3502,16 @@ recheck:  int sched_setscheduler(struct task_struct *p, int policy,  		       const struct sched_param *param)  { -	return __sched_setscheduler(p, policy, param, true); +	return _sched_setscheduler(p, policy, param, true);  }  EXPORT_SYMBOL_GPL(sched_setscheduler); +int sched_setattr(struct task_struct *p, const struct sched_attr *attr) +{ +	return __sched_setscheduler(p, attr, true); +} +EXPORT_SYMBOL_GPL(sched_setattr); +  /**   * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.   * @p: the task in question. @@ -3206,7 +3528,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);  int sched_setscheduler_nocheck(struct task_struct *p, int policy,  			       const struct sched_param *param)  { -	return __sched_setscheduler(p, policy, param, false); +	return _sched_setscheduler(p, policy, param, false);  }  static int @@ -3231,6 +3553,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)  	return retval;  } +/* + * Mimics kernel/events/core.c perf_copy_attr(). + */ +static int sched_copy_attr(struct sched_attr __user *uattr, +			   struct sched_attr *attr) +{ +	u32 size; +	int ret; + +	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) +		return -EFAULT; + +	/* +	 * zero the full structure, so that a short copy will be nice. +	 */ +	memset(attr, 0, sizeof(*attr)); + +	ret = get_user(size, &uattr->size); +	if (ret) +		return ret; + +	if (size > PAGE_SIZE)	/* silly large */ +		goto err_size; + +	if (!size)		/* abi compat */ +		size = SCHED_ATTR_SIZE_VER0; + +	if (size < SCHED_ATTR_SIZE_VER0) +		goto err_size; + +	/* +	 * If we're handed a bigger struct than we know of, +	 * ensure all the unknown bits are 0 - i.e. new +	 * user-space does not rely on any kernel feature +	 * extensions we dont know about yet. 
+	 */ +	if (size > sizeof(*attr)) { +		unsigned char __user *addr; +		unsigned char __user *end; +		unsigned char val; + +		addr = (void __user *)uattr + sizeof(*attr); +		end  = (void __user *)uattr + size; + +		for (; addr < end; addr++) { +			ret = get_user(val, addr); +			if (ret) +				return ret; +			if (val) +				goto err_size; +		} +		size = sizeof(*attr); +	} + +	ret = copy_from_user(attr, uattr, size); +	if (ret) +		return -EFAULT; + +	/* +	 * XXX: do we want to be lenient like existing syscalls; or do we want +	 * to be strict and return an error on out-of-bounds values? +	 */ +	attr->sched_nice = clamp(attr->sched_nice, -20, 19); + +out: +	return ret; + +err_size: +	put_user(sizeof(*attr), &uattr->size); +	ret = -E2BIG; +	goto out; +} +  /**   * sys_sched_setscheduler - set/change the scheduler policy and RT priority   * @pid: the pid in question. @@ -3262,6 +3657,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)  }  /** + * sys_sched_setattr - same as above, but with extended sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + */ +SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) +{ +	struct sched_attr attr; +	struct task_struct *p; +	int retval; + +	if (!uattr || pid < 0) +		return -EINVAL; +	retval = sched_copy_attr(uattr, &attr); +	if (retval) +		return retval; + +	rcu_read_lock(); +	retval = -ESRCH; +	p = find_process_by_pid(pid); +	if (p != NULL) +		retval = sched_setattr(p, &attr); +	rcu_read_unlock(); + +	return retval; +} + +/**   * sys_sched_getscheduler - get the policy (scheduling class) of a thread   * @pid: the pid in question.   
* @@ -3316,6 +3738,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)  	if (retval)  		goto out_unlock; +	if (task_has_dl_policy(p)) { +		retval = -EINVAL; +		goto out_unlock; +	}  	lp.sched_priority = p->rt_priority;  	rcu_read_unlock(); @@ -3331,6 +3757,96 @@ out_unlock:  	return retval;  } +static int sched_read_attr(struct sched_attr __user *uattr, +			   struct sched_attr *attr, +			   unsigned int usize) +{ +	int ret; + +	if (!access_ok(VERIFY_WRITE, uattr, usize)) +		return -EFAULT; + +	/* +	 * If we're handed a smaller struct than we know of, +	 * ensure all the unknown bits are 0 - i.e. old +	 * user-space does not get incomplete information. +	 */ +	if (usize < sizeof(*attr)) { +		unsigned char *addr; +		unsigned char *end; + +		addr = (void *)attr + usize; +		end  = (void *)attr + sizeof(*attr); + +		for (; addr < end; addr++) { +			if (*addr) +				goto err_size; +		} + +		attr->size = usize; +	} + +	ret = copy_to_user(uattr, attr, attr->size); +	if (ret) +		return -EFAULT; + +out: +	return ret; + +err_size: +	ret = -E2BIG; +	goto out; +} + +/** + * sys_sched_getattr - similar to sched_getparam, but with sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @size: sizeof(attr) for fwd/bwd comp. 
+ */ +SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, +		unsigned int, size) +{ +	struct sched_attr attr = { +		.size = sizeof(struct sched_attr), +	}; +	struct task_struct *p; +	int retval; + +	if (!uattr || pid < 0 || size > PAGE_SIZE || +	    size < SCHED_ATTR_SIZE_VER0) +		return -EINVAL; + +	rcu_read_lock(); +	p = find_process_by_pid(pid); +	retval = -ESRCH; +	if (!p) +		goto out_unlock; + +	retval = security_task_getscheduler(p); +	if (retval) +		goto out_unlock; + +	attr.sched_policy = p->policy; +	if (p->sched_reset_on_fork) +		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; +	if (task_has_dl_policy(p)) +		__getparam_dl(p, &attr); +	else if (task_has_rt_policy(p)) +		attr.sched_priority = p->rt_priority; +	else +		attr.sched_nice = TASK_NICE(p); + +	rcu_read_unlock(); + +	retval = sched_read_attr(uattr, &attr, size); +	return retval; + +out_unlock: +	rcu_read_unlock(); +	return retval; +} +  long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)  {  	cpumask_var_t cpus_allowed, new_mask; @@ -3375,8 +3891,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)  	if (retval)  		goto out_unlock; +  	cpuset_cpus_allowed(p, cpus_allowed);  	cpumask_and(new_mask, in_mask, cpus_allowed); + +	/* +	 * Since bandwidth control happens on root_domain basis, +	 * if admission test is enabled, we only admit -deadline +	 * tasks allowed to run on all the CPUs in the task's +	 * root_domain. 
+	 */ +#ifdef CONFIG_SMP +	if (task_has_dl_policy(p)) { +		const struct cpumask *span = task_rq(p)->rd->span; + +		if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { +			retval = -EBUSY; +			goto out_unlock; +		} +	} +#endif  again:  	retval = set_cpus_allowed_ptr(p, new_mask); @@ -3653,7 +4187,7 @@ again:  	}  	double_rq_lock(rq, p_rq); -	while (task_rq(p) != p_rq) { +	if (task_rq(p) != p_rq) {  		double_rq_unlock(rq, p_rq);  		goto again;  	} @@ -3742,6 +4276,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)  	case SCHED_RR:  		ret = MAX_USER_RT_PRIO-1;  		break; +	case SCHED_DEADLINE:  	case SCHED_NORMAL:  	case SCHED_BATCH:  	case SCHED_IDLE: @@ -3768,6 +4303,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)  	case SCHED_RR:  		ret = 1;  		break; +	case SCHED_DEADLINE:  	case SCHED_NORMAL:  	case SCHED_BATCH:  	case SCHED_IDLE: @@ -4090,6 +4626,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)  	/* TODO: This is not properly updating schedstats */ +	trace_sched_move_numa(p, curr_cpu, target_cpu);  	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);  } @@ -4514,13 +5051,31 @@ static int sched_cpu_active(struct notifier_block *nfb,  static int sched_cpu_inactive(struct notifier_block *nfb,  					unsigned long action, void *hcpu)  { +	unsigned long flags; +	long cpu = (long)hcpu; +  	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_DOWN_PREPARE: -		set_cpu_active((long)hcpu, false); +		set_cpu_active(cpu, false); + +		/* explicitly allow suspend */ +		if (!(action & CPU_TASKS_FROZEN)) { +			struct dl_bw *dl_b = dl_bw_of(cpu); +			bool overflow; +			int cpus; + +			raw_spin_lock_irqsave(&dl_b->lock, flags); +			cpus = dl_bw_cpus(cpu); +			overflow = __dl_overflow(dl_b, cpus, 0, 0); +			raw_spin_unlock_irqrestore(&dl_b->lock, flags); + +			if (overflow) +				return notifier_from_errno(-EBUSY); +		}  		return NOTIFY_OK; -	default: -		return NOTIFY_DONE;  	} + +	return NOTIFY_DONE;  }  static int __init 
migration_init(void) @@ -4739,6 +5294,8 @@ static void free_rootdomain(struct rcu_head *rcu)  	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);  	cpupri_cleanup(&rd->cpupri); +	cpudl_cleanup(&rd->cpudl); +	free_cpumask_var(rd->dlo_mask);  	free_cpumask_var(rd->rto_mask);  	free_cpumask_var(rd->online);  	free_cpumask_var(rd->span); @@ -4790,8 +5347,14 @@ static int init_rootdomain(struct root_domain *rd)  		goto out;  	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))  		goto free_span; -	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) +	if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))  		goto free_online; +	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) +		goto free_dlo_mask; + +	init_dl_bw(&rd->dl_bw); +	if (cpudl_init(&rd->cpudl) != 0) +		goto free_rto_mask;  	if (cpupri_init(&rd->cpupri) != 0)  		goto free_rto_mask; @@ -4799,6 +5362,8 @@ static int init_rootdomain(struct root_domain *rd)  free_rto_mask:  	free_cpumask_var(rd->rto_mask); +free_dlo_mask: +	free_cpumask_var(rd->dlo_mask);  free_online:  	free_cpumask_var(rd->online);  free_span: @@ -6150,6 +6715,7 @@ void __init sched_init_smp(void)  	free_cpumask_var(non_isolated_cpus);  	init_sched_rt_class(); +	init_sched_dl_class();  }  #else  void __init sched_init_smp(void) @@ -6219,13 +6785,15 @@ void __init sched_init(void)  #endif /* CONFIG_CPUMASK_OFFSTACK */  	} +	init_rt_bandwidth(&def_rt_bandwidth, +			global_rt_period(), global_rt_runtime()); +	init_dl_bandwidth(&def_dl_bandwidth, +			global_rt_period(), global_rt_runtime()); +  #ifdef CONFIG_SMP  	init_defrootdomain();  #endif -	init_rt_bandwidth(&def_rt_bandwidth, -			global_rt_period(), global_rt_runtime()); -  #ifdef CONFIG_RT_GROUP_SCHED  	init_rt_bandwidth(&root_task_group.rt_bandwidth,  			global_rt_period(), global_rt_runtime()); @@ -6249,6 +6817,7 @@ void __init sched_init(void)  		rq->calc_load_update = jiffies + LOAD_FREQ;  		init_cfs_rq(&rq->cfs);  		init_rt_rq(&rq->rt, rq); +		init_dl_rq(&rq->dl, rq);  #ifdef 
CONFIG_FAIR_GROUP_SCHED  		root_task_group.shares = ROOT_TASK_GROUP_LOAD;  		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); @@ -6320,10 +6889,6 @@ void __init sched_init(void)  	INIT_HLIST_HEAD(&init_task.preempt_notifiers);  #endif -#ifdef CONFIG_RT_MUTEXES -	plist_head_init(&init_task.pi_waiters); -#endif -  	/*  	 * The boot idle thread does lazy MMU switching as well:  	 */ @@ -6397,13 +6962,16 @@ EXPORT_SYMBOL(__might_sleep);  static void normalize_task(struct rq *rq, struct task_struct *p)  {  	const struct sched_class *prev_class = p->sched_class; +	struct sched_attr attr = { +		.sched_policy = SCHED_NORMAL, +	};  	int old_prio = p->prio;  	int on_rq;  	on_rq = p->on_rq;  	if (on_rq)  		dequeue_task(rq, p, 0); -	__setscheduler(rq, p, SCHED_NORMAL, 0); +	__setscheduler(rq, p, &attr);  	if (on_rq) {  		enqueue_task(rq, p, 0);  		resched_task(rq->curr); @@ -6433,7 +7001,7 @@ void normalize_rt_tasks(void)  		p->se.statistics.block_start	= 0;  #endif -		if (!rt_task(p)) { +		if (!dl_task(p) && !rt_task(p)) {  			/*  			 * Renice negative nice level userspace  			 * tasks back to 0: @@ -6628,16 +7196,6 @@ void sched_move_task(struct task_struct *tsk)  }  #endif /* CONFIG_CGROUP_SCHED */ -#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) -static unsigned long to_ratio(u64 period, u64 runtime) -{ -	if (runtime == RUNTIME_INF) -		return 1ULL << 20; - -	return div64_u64(runtime << 20, period); -} -#endif -  #ifdef CONFIG_RT_GROUP_SCHED  /*   * Ensure that the real time constraints are schedulable. @@ -6811,24 +7369,13 @@ static long sched_group_rt_period(struct task_group *tg)  	do_div(rt_period_us, NSEC_PER_USEC);  	return rt_period_us;  } +#endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED  static int sched_rt_global_constraints(void)  { -	u64 runtime, period;  	int ret = 0; -	if (sysctl_sched_rt_period <= 0) -		return -EINVAL; - -	runtime = global_rt_runtime(); -	period = global_rt_period(); - -	/* -	 * Sanity check on the sysctl variables. 
-	 */ -	if (runtime > period && runtime != RUNTIME_INF) -		return -EINVAL; -  	mutex_lock(&rt_constraints_mutex);  	read_lock(&tasklist_lock);  	ret = __rt_schedulable(NULL, 0, 0); @@ -6851,17 +7398,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)  static int sched_rt_global_constraints(void)  {  	unsigned long flags; -	int i; - -	if (sysctl_sched_rt_period <= 0) -		return -EINVAL; - -	/* -	 * There's always some RT tasks in the root group -	 * -- migration, kstopmachine etc.. -	 */ -	if (sysctl_sched_rt_runtime == 0) -		return -EBUSY; +	int i, ret = 0;  	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);  	for_each_possible_cpu(i) { @@ -6873,36 +7410,88 @@ static int sched_rt_global_constraints(void)  	}  	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); -	return 0; +	return ret;  }  #endif /* CONFIG_RT_GROUP_SCHED */ -int sched_rr_handler(struct ctl_table *table, int write, -		void __user *buffer, size_t *lenp, -		loff_t *ppos) +static int sched_dl_global_constraints(void)  { -	int ret; -	static DEFINE_MUTEX(mutex); +	u64 runtime = global_rt_runtime(); +	u64 period = global_rt_period(); +	u64 new_bw = to_ratio(period, runtime); +	int cpu, ret = 0; -	mutex_lock(&mutex); -	ret = proc_dointvec(table, write, buffer, lenp, ppos); -	/* make sure that internally we keep jiffies */ -	/* also, writing zero resets timeslice to default */ -	if (!ret && write) { -		sched_rr_timeslice = sched_rr_timeslice <= 0 ? -			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); +	/* +	 * Here we want to check the bandwidth not being set to some +	 * value smaller than the currently allocated bandwidth in +	 * any of the root_domains. +	 * +	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than +	 * cycling on root_domains... Discussion on different/better +	 * solutions is welcome! 
+	 */ +	for_each_possible_cpu(cpu) { +		struct dl_bw *dl_b = dl_bw_of(cpu); + +		raw_spin_lock(&dl_b->lock); +		if (new_bw < dl_b->total_bw) +			ret = -EBUSY; +		raw_spin_unlock(&dl_b->lock); + +		if (ret) +			break;  	} -	mutex_unlock(&mutex); +  	return ret;  } +static void sched_dl_do_global(void) +{ +	u64 new_bw = -1; +	int cpu; + +	def_dl_bandwidth.dl_period = global_rt_period(); +	def_dl_bandwidth.dl_runtime = global_rt_runtime(); + +	if (global_rt_runtime() != RUNTIME_INF) +		new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + +	/* +	 * FIXME: As above... +	 */ +	for_each_possible_cpu(cpu) { +		struct dl_bw *dl_b = dl_bw_of(cpu); + +		raw_spin_lock(&dl_b->lock); +		dl_b->bw = new_bw; +		raw_spin_unlock(&dl_b->lock); +	} +} + +static int sched_rt_global_validate(void) +{ +	if (sysctl_sched_rt_period <= 0) +		return -EINVAL; + +	if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) +		return -EINVAL; + +	return 0; +} + +static void sched_rt_do_global(void) +{ +	def_rt_bandwidth.rt_runtime = global_rt_runtime(); +	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +} +  int sched_rt_handler(struct ctl_table *table, int write,  		void __user *buffer, size_t *lenp,  		loff_t *ppos)  { -	int ret;  	int old_period, old_runtime;  	static DEFINE_MUTEX(mutex); +	int ret;  	mutex_lock(&mutex);  	old_period = sysctl_sched_rt_period; @@ -6911,21 +7500,50 @@ int sched_rt_handler(struct ctl_table *table, int write,  	ret = proc_dointvec(table, write, buffer, lenp, ppos);  	if (!ret && write) { +		ret = sched_rt_global_validate(); +		if (ret) +			goto undo; +  		ret = sched_rt_global_constraints(); -		if (ret) { -			sysctl_sched_rt_period = old_period; -			sysctl_sched_rt_runtime = old_runtime; -		} else { -			def_rt_bandwidth.rt_runtime = global_rt_runtime(); -			def_rt_bandwidth.rt_period = -				ns_to_ktime(global_rt_period()); -		} +		if (ret) +			goto undo; + +		ret = sched_dl_global_constraints(); +		if (ret) +			goto undo; + +		
sched_rt_do_global(); +		sched_dl_do_global(); +	} +	if (0) { +undo: +		sysctl_sched_rt_period = old_period; +		sysctl_sched_rt_runtime = old_runtime;  	}  	mutex_unlock(&mutex);  	return ret;  } +int sched_rr_handler(struct ctl_table *table, int write, +		void __user *buffer, size_t *lenp, +		loff_t *ppos) +{ +	int ret; +	static DEFINE_MUTEX(mutex); + +	mutex_lock(&mutex); +	ret = proc_dointvec(table, write, buffer, lenp, ppos); +	/* make sure that internally we keep jiffies */ +	/* also, writing zero resets timeslice to default */ +	if (!ret && write) { +		sched_rr_timeslice = sched_rr_timeslice <= 0 ? +			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); +	} +	mutex_unlock(&mutex); +	return ret; +} +  #ifdef CONFIG_CGROUP_SCHED  static inline struct task_group *css_tg(struct cgroup_subsys_state *css) @@ -7258,15 +7876,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)  	return ret;  } -static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, -		struct cgroup_map_cb *cb) +static int cpu_stats_show(struct seq_file *sf, void *v)  { -	struct task_group *tg = css_tg(css); +	struct task_group *tg = css_tg(seq_css(sf));  	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; -	cb->fill(cb, "nr_periods", cfs_b->nr_periods); -	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); -	cb->fill(cb, "throttled_time", cfs_b->throttled_time); +	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); +	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); +	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);  	return 0;  } @@ -7320,7 +7937,7 @@ static struct cftype cpu_files[] = {  	},  	{  		.name = "stat", -		.read_map = cpu_stats_show, +		.seq_show = cpu_stats_show,  	},  #endif  #ifdef CONFIG_RT_GROUP_SCHED  |