Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--	kernel/sched/core.c	726
1 file changed, 570 insertions, 156 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2b037f195473..f9a1346a5fa9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -255,7 +255,7 @@ static void __hrtick_restart(struct rq *rq)
 {
 	struct hrtimer *timer = &rq->hrtick_timer;
 
-	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
 }
 
 /*
@@ -314,7 +314,7 @@ void hrtick_start(struct rq *rq, u64 delay)
 	 */
 	delay = max_t(u64, delay, 10000LL);
 	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
-		      HRTIMER_MODE_REL_PINNED);
+		      HRTIMER_MODE_REL_PINNED_HARD);
 }
 
 #endif /* CONFIG_SMP */
@@ -328,7 +328,7 @@ static void hrtick_rq_init(struct rq *rq)
 	rq->hrtick_csd.info = rq;
 #endif
 
-	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 	rq->hrtick_timer.function = hrtick;
 }
 #else	/* CONFIG_SCHED_HRTICK */
@@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load)
 }
 
 #ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates on (fast-path) scheduler's data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
 /* Max allowed minimum utilization */
 unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
 
@@ -798,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
 	return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
 }
 
-static inline unsigned int uclamp_none(int clamp_id)
+static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
 {
 	if (clamp_id == UCLAMP_MIN)
 		return 0;
@@ -814,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se,
 }
 
 static inline unsigned int
-uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
 		  unsigned int clamp_value)
 {
 	/*
@@ -830,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
 	return uclamp_none(UCLAMP_MIN);
 }
 
-static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
 				     unsigned int clamp_value)
 {
 	/* Reset max-clamp retention only on idle exit */
@@ -841,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
 }
 
 static inline
-unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
-				 unsigned int clamp_value)
+enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+				   unsigned int clamp_value)
 {
 	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
 	int bucket_id = UCLAMP_BUCKETS - 1;
@@ -861,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
 	return uclamp_idle_value(rq, clamp_id, clamp_value);
 }
 
+static inline struct uclamp_se
+uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+{
+	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+	struct uclamp_se uc_max;
+
+	/*
+	 * Tasks in autogroups or root task group will be
+	 * restricted by system defaults.
+	 */
+	if (task_group_is_autogroup(task_group(p)))
+		return uc_req;
+	if (task_group(p) == &root_task_group)
+		return uc_req;
+
+	uc_max = task_group(p)->uclamp[clamp_id];
+	if (uc_req.value > uc_max.value || !uc_req.user_defined)
+		return uc_max;
+#endif
+
+	return uc_req;
+}
+
 /*
  * The effective clamp bucket index of a task depends on, by increasing
  * priority:
  * - the task specific clamp value, when explicitly requested from userspace
+ * - the task group effective clamp value, for tasks not either in the root
+ *   group or in an autogroup
  * - the system default clamp value, defined by the sysadmin
  */
 static inline struct uclamp_se
-uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
 {
-	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+	struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
 	struct uclamp_se uc_max = uclamp_default[clamp_id];
 
 	/* System default restrictions always apply */
@@ -880,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
 	return uc_req;
 }
 
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
 {
 	struct uclamp_se uc_eff;
 
@@ -904,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
  * for each bucket when all its RUNNABLE tasks require the same clamp.
  */
 static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
-				    unsigned int clamp_id)
+				    enum uclamp_id clamp_id)
 {
 	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
 	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -942,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
  * enforce the expected state and warn.
  */
 static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
-				    unsigned int clamp_id)
+				    enum uclamp_id clamp_id)
 {
 	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
 	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -981,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
 
 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
 {
-	unsigned int clamp_id;
+	enum uclamp_id clamp_id;
 
 	if (unlikely(!p->sched_class->uclamp_enabled))
 		return;
@@ -996,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
 
 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 {
-	unsigned int clamp_id;
+	enum uclamp_id clamp_id;
 
 	if (unlikely(!p->sched_class->uclamp_enabled))
 		return;
@@ -1005,15 +1043,82 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 		uclamp_rq_dec_id(rq, p, clamp_id);
 }
 
+static inline void
+uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+
+	/*
+	 * Lock the task and the rq where the task is (or was) queued.
+	 *
+	 * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+	 * price to pay to safely serialize util_{min,max} updates with
+	 * enqueues, dequeues and migration operations.
+	 * This is the same locking schema used by __set_cpus_allowed_ptr().
+	 */
+	rq = task_rq_lock(p, &rf);
+
+	/*
+	 * Setting the clamp bucket is serialized by task_rq_lock().
+	 * If the task is not yet RUNNABLE and its task_struct is not
+	 * affecting a valid clamp bucket, the next time it's enqueued,
+	 * it will already see the updated clamp bucket value.
+	 */
+	if (p->uclamp[clamp_id].active) {
+		uclamp_rq_dec_id(rq, p, clamp_id);
+		uclamp_rq_inc_id(rq, p, clamp_id);
+	}
+
+	task_rq_unlock(rq, p, &rf);
+}
+
+static inline void
+uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+			   unsigned int clamps)
+{
+	enum uclamp_id clamp_id;
+	struct css_task_iter it;
+	struct task_struct *p;
+
+	css_task_iter_start(css, 0, &it);
+	while ((p = css_task_iter_next(&it))) {
+		for_each_clamp_id(clamp_id) {
+			if ((0x1 << clamp_id) & clamps)
+				uclamp_update_active(p, clamp_id);
+		}
+	}
+	css_task_iter_end(&it);
+}
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+static void uclamp_update_root_tg(void)
+{
+	struct task_group *tg = &root_task_group;
+
+	uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+		      sysctl_sched_uclamp_util_min, false);
+	uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+		      sysctl_sched_uclamp_util_max, false);
+
+	rcu_read_lock();
+	cpu_util_update_eff(&root_task_group.css);
+	rcu_read_unlock();
+}
+#else
+static void uclamp_update_root_tg(void) { }
+#endif
+
 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 				void __user *buffer, size_t *lenp,
 				loff_t *ppos)
 {
+	bool update_root_tg = false;
 	int old_min, old_max;
-	static DEFINE_MUTEX(mutex);
 	int result;
 
-	mutex_lock(&mutex);
+	mutex_lock(&uclamp_mutex);
 	old_min = sysctl_sched_uclamp_util_min;
 	old_max = sysctl_sched_uclamp_util_max;
 
@@ -1032,23 +1137,30 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 	if (old_min != sysctl_sched_uclamp_util_min) {
 		uclamp_se_set(&uclamp_default[UCLAMP_MIN],
 			      sysctl_sched_uclamp_util_min, false);
+		update_root_tg = true;
 	}
 	if (old_max != sysctl_sched_uclamp_util_max) {
 		uclamp_se_set(&uclamp_default[UCLAMP_MAX],
 			      sysctl_sched_uclamp_util_max, false);
+		update_root_tg = true;
 	}
 
+	if (update_root_tg)
+		uclamp_update_root_tg();
+
 	/*
-	 * Updating all the RUNNABLE task is expensive, keep it simple and do
-	 * just a lazy update at each next enqueue time.
+	 * We update all RUNNABLE tasks only when task groups are in use.
+	 * Otherwise, keep it simple and do just a lazy update at each next
+	 * task enqueue time.
 	 */
+
 	goto done;
 
 undo:
 	sysctl_sched_uclamp_util_min = old_min;
 	sysctl_sched_uclamp_util_max = old_max;
 done:
-	mutex_unlock(&mutex);
+	mutex_unlock(&uclamp_mutex);
 
 	return result;
 }
@@ -1075,7 +1187,7 @@ static int uclamp_validate(struct task_struct *p,
 static void __setscheduler_uclamp(struct task_struct *p,
 				  const struct sched_attr *attr)
 {
-	unsigned int clamp_id;
+	enum uclamp_id clamp_id;
 
 	/*
 	 * On scheduling class change, reset to default clamps for tasks
@@ -1112,7 +1224,7 @@ static void __setscheduler_uclamp(struct task_struct *p,
 
 static void uclamp_fork(struct task_struct *p)
 {
-	unsigned int clamp_id;
+	enum uclamp_id clamp_id;
 
 	for_each_clamp_id(clamp_id)
 		p->uclamp[clamp_id].active = false;
@@ -1134,9 +1246,11 @@ static void uclamp_fork(struct task_struct *p)
 static void __init init_uclamp(void)
 {
 	struct uclamp_se uc_max = {};
-	unsigned int clamp_id;
+	enum uclamp_id clamp_id;
 	int cpu;
 
+	mutex_init(&uclamp_mutex);
+
 	for_each_possible_cpu(cpu) {
 		memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
 		cpu_rq(cpu)->uclamp_flags = 0;
@@ -1149,8 +1263,13 @@ static void __init init_uclamp(void)
 
 	/* System defaults allow max clamp values for both indexes */
 	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
-	for_each_clamp_id(clamp_id)
+	for_each_clamp_id(clamp_id) {
 		uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+		root_task_group.uclamp_req[clamp_id] = uc_max;
+		root_task_group.uclamp[clamp_id] = uc_max;
+#endif
+	}
 }
 
 #else /* CONFIG_UCLAMP_TASK */
@@ -1494,7 +1613,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
 	if (running)
-		set_curr_task(rq, p);
+		set_next_task(rq, p);
 }
 
 /*
@@ -3214,12 +3333,8 @@ static __always_inline struct rq *
 context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next, struct rq_flags *rf)
 {
-	struct mm_struct *mm, *oldmm;
-
 	prepare_task_switch(rq, prev, next);
 
-	mm = next->mm;
-	oldmm = prev->active_mm;
 	/*
 	 * For paravirt, this is coupled with an exit in switch_to to
 	 * combine the page table reload and the switch backend into
@@ -3228,22 +3343,37 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	arch_start_context_switch(prev);
 
 	/*
-	 * If mm is non-NULL, we pass through switch_mm(). If mm is
-	 * NULL, we will pass through mmdrop() in finish_task_switch().
-	 * Both of these contain the full memory barrier required by
-	 * membarrier after storing to rq->curr, before returning to
-	 * user-space.
+	 * kernel -> kernel   lazy + transfer active
+	 *   user -> kernel   lazy + mmgrab() active
+	 *
+	 * kernel ->   user   switch + mmdrop() active
+	 *   user ->   user   switch
 	 */
-	if (!mm) {
-		next->active_mm = oldmm;
-		mmgrab(oldmm);
-		enter_lazy_tlb(oldmm, next);
-	} else
-		switch_mm_irqs_off(oldmm, mm, next);
+	if (!next->mm) {                                // to kernel
+		enter_lazy_tlb(prev->active_mm, next);
+
+		next->active_mm = prev->active_mm;
+		if (prev->mm)                           // from user
+			mmgrab(prev->active_mm);
+		else
+			prev->active_mm = NULL;
+	} else {                                        // to user
+		/*
+		 * sys_membarrier() requires an smp_mb() between setting
+		 * rq->curr and returning to userspace.
+		 *
+		 * The below provides this either through switch_mm(), or in
+		 * case 'prev->active_mm == next->mm' through
+		 * finish_task_switch()'s mmdrop().
+		 */
-	if (!prev->mm) {
-		prev->active_mm = NULL;
-		rq->prev_mm = oldmm;
+		switch_mm_irqs_off(prev->active_mm, next->mm, next);
+
+		if (!prev->mm) {                        // from kernel
+			/* will mmdrop() in finish_task_switch(). */
+			rq->prev_mm = prev->active_mm;
+			prev->active_mm = NULL;
+		}
 	}
 
 	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@ -3486,8 +3616,36 @@ void scheduler_tick(void)
 
 struct tick_work {
 	int			cpu;
+	atomic_t		state;
 	struct delayed_work	work;
 };
+/* Values for ->state, see diagram below. */
+#define TICK_SCHED_REMOTE_OFFLINE	0
+#define TICK_SCHED_REMOTE_OFFLINING	1
+#define TICK_SCHED_REMOTE_RUNNING	2
+
+/*
+ * State diagram for ->state:
+ *
+ *
+ *          TICK_SCHED_REMOTE_OFFLINE
+ *                    |   ^
+ *                    |   |
+ *                    |   | sched_tick_remote()
+ *                    |   |
+ *                    |   |
+ *                    +--TICK_SCHED_REMOTE_OFFLINING
+ *                    |   ^
+ *                    |   |
+ * sched_tick_start() |   | sched_tick_stop()
+ *                    |   |
+ *                    V   |
+ *          TICK_SCHED_REMOTE_RUNNING
+ *
+ *
+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ * and sched_tick_start() are happy to leave the state in RUNNING.
+ */
 
 static struct tick_work __percpu *tick_work_cpu;
 
@@ -3500,6 +3658,7 @@ static void sched_tick_remote(struct work_struct *work)
 	struct task_struct *curr;
 	struct rq_flags rf;
 	u64 delta;
+	int os;
 
 	/*
 	 * Handle the tick only if it appears the remote CPU is running in full
@@ -3513,7 +3672,7 @@ static void sched_tick_remote(struct work_struct *work)
 
 	rq_lock_irq(rq, &rf);
 	curr = rq->curr;
-	if (is_idle_task(curr))
+	if (is_idle_task(curr) || cpu_is_offline(cpu))
 		goto out_unlock;
 
 	update_rq_clock(rq);
@@ -3533,13 +3692,18 @@ out_requeue:
 	/*
 	 * Run the remote tick once per second (1Hz). This arbitrary
 	 * frequency is large enough to avoid overload but short enough
-	 * to keep scheduler internal stats reasonably up to date.
+	 * to keep scheduler internal stats reasonably up to date.  But
+	 * first update state to reflect hotplug activity if required.
 	 */
-	queue_delayed_work(system_unbound_wq, dwork, HZ);
+	os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+	WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+	if (os == TICK_SCHED_REMOTE_RUNNING)
+		queue_delayed_work(system_unbound_wq, dwork, HZ);
 }
 
 static void sched_tick_start(int cpu)
 {
+	int os;
 	struct tick_work *twork;
 
 	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
@@ -3548,15 +3712,20 @@ static void sched_tick_start(int cpu)
 	WARN_ON_ONCE(!tick_work_cpu);
 
 	twork = per_cpu_ptr(tick_work_cpu, cpu);
-	twork->cpu = cpu;
-	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
-	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+	os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+	WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+	if (os == TICK_SCHED_REMOTE_OFFLINE) {
+		twork->cpu = cpu;
+		INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+		queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+	}
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
 static void sched_tick_stop(int cpu)
 {
 	struct tick_work *twork;
+	int os;
 
 	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
 		return;
@@ -3564,7 +3733,10 @@ static void sched_tick_stop(int cpu)
 	WARN_ON_ONCE(!tick_work_cpu);
 
 	twork = per_cpu_ptr(tick_work_cpu, cpu);
-	cancel_delayed_work_sync(&twork->work);
+	/* There cannot be competing actions, but don't rely on stop-machine. */
+	os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+	WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+	/* Don't cancel, as this would mess up the state machine. */
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
@@ -3572,7 +3744,6 @@ int __init sched_tick_offload_init(void)
 {
 	tick_work_cpu = alloc_percpu(struct tick_work);
 	BUG_ON(!tick_work_cpu);
-
 	return 0;
 }
 
@@ -3581,7 +3752,7 @@ static inline void sched_tick_start(int cpu) { }
 static inline void sched_tick_stop(int cpu) { }
 #endif
 
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
 				defined(CONFIG_TRACE_PREEMPT_TOGGLE))
 /*
  * If the value passed in is equal to the current preempt count
@@ -3700,13 +3871,22 @@ static noinline void __schedule_bug(struct task_struct *prev)
 /*
  * Various schedule()-time debugging checks and statistics:
  */
-static inline void schedule_debug(struct task_struct *prev)
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
 {
 #ifdef CONFIG_SCHED_STACK_END_CHECK
 	if (task_stack_end_corrupted(prev))
 		panic("corrupted stack end detected inside scheduler\n");
 #endif
 
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+	if (!preempt && prev->state && prev->non_block_count) {
+		printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
+			prev->comm, prev->pid, prev->non_block_count);
+		dump_stack();
+		add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+	}
+#endif
+
 	if (unlikely(in_atomic_preempt_off())) {
 		__schedule_bug(prev);
 		preempt_count_set(PREEMPT_DISABLED);
@@ -3739,7 +3919,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
 		p = fair_sched_class.pick_next_task(rq, prev, rf);
 		if (unlikely(p == RETRY_TASK))
-			goto again;
+			goto restart;
 
 		/* Assumes fair_sched_class->next == idle_sched_class */
 		if (unlikely(!p))
@@ -3748,14 +3928,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 		return p;
 	}
 
-again:
+restart:
+	/*
+	 * Ensure that we put DL/RT tasks before the pick loop, such that they
+	 * can PULL higher prio tasks when we lower the RQ 'priority'.
+	 */
+	prev->sched_class->put_prev_task(rq, prev, rf);
+	if (!rq->nr_running)
+		newidle_balance(rq, rf);
+
 	for_each_class(class) {
-		p = class->pick_next_task(rq, prev, rf);
-		if (p) {
-			if (unlikely(p == RETRY_TASK))
-				goto again;
+		p = class->pick_next_task(rq, NULL, NULL);
+		if (p)
 			return p;
-		}
 	}
 
 	/* The idle class should always have a runnable task: */
@@ -3782,7 +3967,7 @@ again:
  *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
  *      called on the nearest possible occasion:
  *
- *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
  *
  *         - in syscall or exception context, at the next outmost
  *           preempt_enable(). (this might be as soon as the wake_up()'s
@@ -3791,7 +3976,7 @@ again:
  *         - in IRQ context, return from interrupt-handler to
  *           preemptible context
  *
- *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
  *         then at the next:
 *
 *          - cond_resched() call
@@ -3813,7 +3998,7 @@ static void __sched notrace __schedule(bool preempt)
 	rq = cpu_rq(cpu);
 	prev = rq->curr;
 
-	schedule_debug(prev);
+	schedule_debug(prev, preempt);
 
 	if (sched_feat(HRTICK))
 		hrtick_clear(rq);
@@ -3904,7 +4089,7 @@ void __noreturn do_task_dead(void)
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
-	if (!tsk->state || tsk_is_pi_blocked(tsk))
+	if (!tsk->state)
 		return;
 
 	/*
@@ -3920,6 +4105,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
 		preempt_enable_no_resched();
 	}
 
+	if (tsk_is_pi_blocked(tsk))
+		return;
+
 	/*
 	 * If we are going to sleep and we have plugged IO queued,
 	 * make sure to submit it to avoid deadlocks.
@@ -4033,7 +4221,7 @@ static void __sched notrace preempt_schedule_common(void)
 	} while (need_resched());
 }
 
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
 /*
  * this is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable. Kernel preemptions off return from interrupt
@@ -4105,7 +4293,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 }
 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
 
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
 
 /*
  * this is the entry point to schedule() from kernel preemption
@@ -4273,7 +4461,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 	if (queued)
 		enqueue_task(rq, p, queue_flag);
 	if (running)
-		set_curr_task(rq, p);
+		set_next_task(rq, p);
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
@@ -4340,7 +4528,7 @@ void set_user_nice(struct task_struct *p, long nice)
 			resched_curr(rq);
 	}
 	if (running)
-		set_curr_task(rq, p);
+		set_next_task(rq, p);
 out_unlock:
 	task_rq_unlock(rq, p, &rf);
 }
@@ -4657,6 +4845,9 @@ recheck:
 			return retval;
 	}
 
+	if (pi)
+		cpuset_read_lock();
+
 	/*
 	 * Make sure no PI-waiters arrive (or leave) while we are
 	 * changing the priority of the task:
@@ -4671,8 +4862,8 @@ recheck:
 	 * Changing the policy of the stop threads its a very bad idea:
 	 */
 	if (p == rq->stop) {
-		task_rq_unlock(rq, p, &rf);
-		return -EINVAL;
+		retval = -EINVAL;
+		goto unlock;
 	}
 
 	/*
@@ -4690,8 +4881,8 @@ recheck:
 			goto change;
 
 		p->sched_reset_on_fork = reset_on_fork;
-		task_rq_unlock(rq, p, &rf);
-		return 0;
+		retval = 0;
+		goto unlock;
 	}
 
 change:
@@ -4704,8 +4895,8 @@ change:
 		if (rt_bandwidth_enabled() && rt_policy(policy) &&
 				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
 				!task_group_is_autogroup(task_group(p))) {
-			task_rq_unlock(rq, p, &rf);
-			return -EPERM;
+			retval = -EPERM;
+			goto unlock;
 		}
 #endif
 #ifdef CONFIG_SMP
@@ -4720,8 +4911,8 @@ change:
 			 */
 			if (!cpumask_subset(span, p->cpus_ptr) ||
 			    rq->rd->dl_bw.bw == 0) {
-				task_rq_unlock(rq, p, &rf);
-				return -EPERM;
+				retval = -EPERM;
+				goto unlock;
 			}
 		}
 #endif
@@ -4731,6 +4922,8 @@ change:
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		policy = oldpolicy = -1;
 		task_rq_unlock(rq, p, &rf);
+		if (pi)
+			cpuset_read_unlock();
 		goto recheck;
 	}
 
@@ -4740,8 +4933,8 @@ change:
 	 * is available.
 	 */
 	if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
-		task_rq_unlock(rq, p, &rf);
-		return -EBUSY;
+		retval = -EBUSY;
+		goto unlock;
 	}
 
 	p->sched_reset_on_fork = reset_on_fork;
@@ -4783,7 +4976,7 @@ change:
 		enqueue_task(rq, p, queue_flags);
 	}
 	if (running)
-		set_curr_task(rq, p);
+		set_next_task(rq, p);
 
 	check_class_changed(rq, p, prev_class, oldprio);
 
@@ -4791,14 +4984,22 @@ change:
 	preempt_disable();
 	task_rq_unlock(rq, p, &rf);
 
-	if (pi)
+	if (pi) {
+		cpuset_read_unlock();
 		rt_mutex_adjust_pi(p);
+	}
 
 	/* Run balance callbacks after we've adjusted the PI chain: */
 	balance_callback(rq);
 	preempt_enable();
 
 	return 0;
+
+unlock:
+	task_rq_unlock(rq, p, &rf);
+	if (pi)
+		cpuset_read_unlock();
+	return retval;
 }
 
 static int _sched_setscheduler(struct task_struct *p, int policy,
@@ -4882,10 +5083,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 	rcu_read_lock();
 	retval = -ESRCH;
 	p = find_process_by_pid(pid);
-	if (p != NULL)
-		retval = sched_setscheduler(p, policy, &lparam);
+	if (likely(p))
+		get_task_struct(p);
 	rcu_read_unlock();
 
+	if (likely(p)) {
+		retval = sched_setscheduler(p, policy, &lparam);
+		put_task_struct(p);
+	}
+
 	return retval;
 }
 
@@ -5102,37 +5308,40 @@ out_unlock:
 	return retval;
 }
 
-static int sched_read_attr(struct sched_attr __user *uattr,
-			   struct sched_attr *attr,
-			   unsigned int usize)
+/*
+ * Copy the kernel size attribute structure (which might be larger
+ * than what user-space knows about) to user-space.
+ *
+ * Note that all cases are valid: user-space buffer can be larger or
+ * smaller than the kernel-space buffer. The usual case is that both
+ * have the same size.
+ */
+static int
+sched_attr_copy_to_user(struct sched_attr __user *uattr,
+			struct sched_attr *kattr,
+			unsigned int usize)
 {
-	int ret;
+	unsigned int ksize = sizeof(*kattr);
 
 	if (!access_ok(uattr, usize))
 		return -EFAULT;
 
 	/*
-	 * If we're handed a smaller struct than we know of,
-	 * ensure all the unknown bits are 0 - i.e. old
-	 * user-space does not get uncomplete information.
+	 * sched_getattr() ABI forwards and backwards compatibility:
+	 *
+	 * If usize == ksize then we just copy everything to user-space and all is good.
+	 *
+	 * If usize < ksize then we only copy as much as user-space has space for,
+	 * this keeps ABI compatibility as well. We skip the rest.
+	 *
	 * If usize > ksize then user-space is using a newer version of the ABI,
+	 * which part the kernel doesn't know about. Just ignore it - tooling can
+	 * detect the kernel's knowledge of attributes from the attr->size value
+	 * which is set to ksize in this case.
 	 */
-	if (usize < sizeof(*attr)) {
-		unsigned char *addr;
-		unsigned char *end;
-
-		addr = (void *)attr + usize;
-		end  = (void *)attr + sizeof(*attr);
+	kattr->size = min(usize, ksize);
 
-		for (; addr < end; addr++) {
-			if (*addr)
-				return -EFBIG;
-		}
-
-		attr->size = usize;
-	}
-
-	ret = copy_to_user(uattr, attr, attr->size);
-	if (ret)
+	if (copy_to_user(uattr, kattr, kattr->size))
 		return -EFAULT;
 
 	return 0;
@@ -5142,20 +5351,18 @@ static int sched_read_attr(struct sched_attr __user *uattr,
 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
+ * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
 * @flags: for future extension.
 */
 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
-		unsigned int, size, unsigned int, flags)
+		unsigned int, usize, unsigned int, flags)
 {
-	struct sched_attr attr = {
-		.size = sizeof(struct sched_attr),
-	};
+	struct sched_attr kattr = { };
 	struct task_struct *p;
 	int retval;
 
-	if (!uattr || pid < 0 || size > PAGE_SIZE ||
-	    size < SCHED_ATTR_SIZE_VER0 || flags)
+	if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+	    usize < SCHED_ATTR_SIZE_VER0 || flags)
 		return -EINVAL;
 
 	rcu_read_lock();
@@ -5168,25 +5375,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	if (retval)
 		goto out_unlock;
 
-	attr.sched_policy = p->policy;
+	kattr.sched_policy = p->policy;
 	if (p->sched_reset_on_fork)
-		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+		kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 	if (task_has_dl_policy(p))
-		__getparam_dl(p, &attr);
+		__getparam_dl(p, &kattr);
 	else if (task_has_rt_policy(p))
-		attr.sched_priority = p->rt_priority;
+		kattr.sched_priority = p->rt_priority;
 	else
-		attr.sched_nice = task_nice(p);
+		kattr.sched_nice = task_nice(p);
 
 #ifdef CONFIG_UCLAMP_TASK
-	attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
-	attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+	kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+	kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
 #endif
 
 	rcu_read_unlock();
 
-	retval = sched_read_attr(uattr, &attr, size);
-	return retval;
+	return sched_attr_copy_to_user(uattr, &kattr, usize);
 
 out_unlock:
 	rcu_read_unlock();
@@ -5416,7 +5622,7 @@ SYSCALL_DEFINE0(sched_yield)
 	return 0;
 }
 
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
 int __sched _cond_resched(void)
 {
 	if (should_resched(0)) {
@@ -5433,7 +5639,7 @@ EXPORT_SYMBOL(_cond_resched);
 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
 * call schedule, and on return reacquire the lock.
 *
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
 * operations here to prevent schedule() from being called twice (once via
 * spin_unlock(), once by hand).
 */
@@ -5972,7 +6178,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
 	if (running)
-		set_curr_task(rq, p);
+		set_next_task(rq, p);
 	task_rq_unlock(rq, p, &rf);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -6012,21 +6218,22 @@ static void calc_load_migrate(struct rq *rq)
 		atomic_long_add(delta, &calc_load_tasks);
 }
 
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+static struct task_struct *__pick_migrate_task(struct rq *rq)
 {
-}
+	const struct sched_class *class;
+	struct task_struct *next;
 
-static const struct sched_class fake_sched_class = {
-	.put_prev_task = put_prev_task_fake,
-};
+	for_each_class(class) {
+		next = class->pick_next_task(rq, NULL, NULL);
+		if (next) {
+			next->sched_class->put_prev_task(rq, next, NULL);
+			return next;
+		}
+	}
 
-static struct task_struct fake_task = {
-	/*
-	 * Avoid pull_{rt,dl}_task()
-	 */
-	.prio = MAX_PRIO + 1,
-	.sched_class = &fake_sched_class,
-};
+	/* The idle class should always have a runnable task */
+	BUG();
+}
 
 /*
 * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@ -6069,12 +6276,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
 		if (rq->nr_running == 1)
 			break;
 
-		/*
-		 * pick_next_task() assumes pinned rq->lock:
-		 */
-		next = pick_next_task(rq, &fake_task, rf);
-		BUG_ON(!next);
-		put_prev_task(rq, next);
+		next = __pick_migrate_task(rq);
 
 		/*
 		 * Rules for changing task_struct::cpus_mask are holding
@@ -6371,19 +6573,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
 
 void __init sched_init(void)
 {
-	unsigned long alloc_size = 0, ptr;
+	unsigned long ptr = 0;
 	int i;
 
 	wait_bit_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+	ptr += 2 * nr_cpu_ids * sizeof(void **);
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
-	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+	ptr += 2 * nr_cpu_ids * sizeof(void **);
 #endif
-	if (alloc_size) {
-		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+	if (ptr) {
+		ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.se = (struct sched_entity **)ptr;
@@ -6570,7 +6772,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
 	rcu_sleep_check();
 
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
-	     !is_idle_task(current)) ||
+	     !is_idle_task(current) && !current->non_block_count) ||
 	    system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
 	    oops_in_progress)
 		return;
@@ -6586,8 +6788,8 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
 		"BUG: sleeping function called from invalid context at %s:%d\n",
 			file, line);
 	printk(KERN_ERR
-		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
-			in_atomic(), irqs_disabled(),
+		"in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(), current->non_block_count,
 			current->pid, current->comm);
 
 	if (task_stack_end_corrupted(current))
@@ -6702,7 +6904,7 @@ struct task_struct *curr_task(int cpu)
 
 #ifdef CONFIG_IA64
 /**
- * set_curr_task - set the current task for a given CPU.
+ * ia64_set_curr_task - set the current task for a given CPU.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
@@ -6727,6 +6929,20 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+					    struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+	enum uclamp_id clamp_id;
+
+	for_each_clamp_id(clamp_id) {
+		uclamp_se_set(&tg->uclamp_req[clamp_id],
+			      uclamp_none(clamp_id), false);
+		tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
+	}
+#endif
+}
+
 static void sched_free_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
@@ -6750,6 +6966,8 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
+	alloc_uclamp_sched_group(tg, parent);
+
 	return tg;
 
 err:
@@ -6853,7 +7071,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (queued)
 		enqueue_task(rq, tsk, queue_flags);
 	if (running)
-		set_curr_task(rq, tsk);
+		set_next_task(rq, tsk);
 
 	task_rq_unlock(rq, tsk, &rf);
 }
@@ -6936,10 +7154,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
 #ifdef CONFIG_RT_GROUP_SCHED
 		if (!sched_rt_can_attach(css_tg(css), task))
 			return -EINVAL;
-#else
-		/* We don't support RT-tasks being in separate groups */
-		if (task->sched_class != &fair_sched_class)
-			return -EINVAL;
 #endif
 		/*
 		 * Serialize against wake_up_new_task() such that if its
@@ -6970,6 +7184,178 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
 		sched_move_task(task);
 }
 
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+{
+	struct cgroup_subsys_state *top_css = css;
+	struct uclamp_se *uc_parent = NULL;
+	struct uclamp_se *uc_se = NULL;
+	unsigned int eff[UCLAMP_CNT];
+	enum uclamp_id clamp_id;
+	unsigned int clamps;
+
+	css_for_each_descendant_pre(css, top_css) {
+		uc_parent = css_tg(css)->parent
+			? css_tg(css)->parent->uclamp : NULL;
+
+		for_each_clamp_id(clamp_id) {
+			/* Assume effective clamps matches requested clamps */
+			eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+			/* Cap effective clamps with parent's effective clamps */
+			if (uc_parent &&
+			    eff[clamp_id] > uc_parent[clamp_id].value) {
+				eff[clamp_id] = uc_parent[clamp_id].value;
+			}
+		}
+		/* Ensure protection is always capped by limit */
+		eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+
+		/* Propagate most restrictive effective clamps */
+		clamps = 0x0;
+		uc_se = css_tg(css)->uclamp;
+		for_each_clamp_id(clamp_id) {
+			if (eff[clamp_id] == uc_se[clamp_id].value)
+				continue;
+			uc_se[clamp_id].value = eff[clamp_id];
+			uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+			clamps |= (0x1 << clamp_id);
+		}
+		if (!clamps) {
+			css = css_rightmost_descendant(css);
+			continue;
+		}
+
+		/* Immediately update descendants RUNNABLE tasks */
+		uclamp_update_active_tasks(css, clamps);
+	}
+}
+
+/*
+ * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ * C expression. Since there is no way to convert a macro argument (N) into a
+ * character constant, use two levels of macros.
+ */
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
+
+struct uclamp_request {
+#define UCLAMP_PERCENT_SHIFT	2
+#define UCLAMP_PERCENT_SCALE	(100 * POW10(UCLAMP_PERCENT_SHIFT))
+	s64 percent;
+	u64 util;
+	int ret;
+};
+
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
+{
+	struct uclamp_request req = {
+		.percent = UCLAMP_PERCENT_SCALE,
+		.util = SCHED_CAPACITY_SCALE,
+		.ret = 0,
+	};
+
+	buf = strim(buf);
+	if (strcmp(buf, "max")) {
+		req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+					     &req.percent);
+		if (req.ret)
+			return req;
+		if (req.percent > UCLAMP_PERCENT_SCALE) {
+			req.ret = -ERANGE;
+			return req;
+		}
+
+		req.util = req.percent << SCHED_CAPACITY_SHIFT;
+		req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+	}
+
+	return req;
+}
+
+static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+				size_t nbytes, loff_t off,
+				enum uclamp_id clamp_id)
+{
+	struct uclamp_request req;
+	struct task_group *tg;
+
+	req = capacity_from_percent(buf);
+	if (req.ret)
+		return req.ret;
+
+	mutex_lock(&uclamp_mutex);
+	rcu_read_lock();
+
+	tg = css_tg(of_css(of));
+	if (tg->uclamp_req[clamp_id].value != req.util)
+		uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+
+	/*
+	 * Because of not recoverable conversion rounding we keep track of the
+	 * exact requested value
+	 */
+	tg->uclamp_pct[clamp_id] = req.percent;
+
+	/* Update effective clamps to track the most restrictive value */
+	cpu_util_update_eff(of_css(of));
+
+	rcu_read_unlock();
+	mutex_unlock(&uclamp_mutex);
+
+	return nbytes;
+}
+
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+				    char *buf, size_t nbytes,
+				    loff_t off)
+{
+	return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+}
+
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+				    char *buf, size_t nbytes,
+				    loff_t off)
+{
+	return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+}
+
+static inline void cpu_uclamp_print(struct seq_file *sf,
+				    enum uclamp_id clamp_id)
+{
+	struct task_group *tg;
+	u64 util_clamp;
+	u64 percent;
+	u32 rem;
+
+	rcu_read_lock();
+	tg = css_tg(seq_css(sf));
+	util_clamp = tg->uclamp_req[clamp_id].value;
+	rcu_read_unlock();
+
+	if (util_clamp == SCHED_CAPACITY_SCALE) {
+		seq_puts(sf, "max\n");
+		return;
+	}
+
+	percent = tg->uclamp_pct[clamp_id];
+	percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+	seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+}
+
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+{
+	cpu_uclamp_print(sf, UCLAMP_MIN);
+	return 0;
+}
+
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+{
+	cpu_uclamp_print(sf, UCLAMP_MAX);
+	return 0;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
 				struct cftype *cftype, u64 shareval)
@@ -7315,6 +7701,20 @@ static struct cftype cpu_legacy_files[] = {
 		.write_u64 = cpu_rt_period_write_uint,
 	},
 #endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+	{
+		.name = "uclamp.min",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_uclamp_min_show,
+		.write = cpu_uclamp_min_write,
+	},
+	{
+		.name = "uclamp.max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_uclamp_max_show,
+		.write = cpu_uclamp_max_write,
+	},
+#endif
 	{ }	/* Terminate */
 };
 
@@ -7482,6 +7882,20 @@ static struct cftype cpu_files[] = {
 		.write = cpu_max_write,
 	},
 #endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+	{
+		.name = "uclamp.min",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_uclamp_min_show,
+		.write = cpu_uclamp_min_write,
+	},
+	{
+		.name = "uclamp.max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_uclamp_max_show,
+		.write = cpu_uclamp_max_write,
+	},
+#endif
 	{ }	/* terminate */
 };
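For context, not part of the patch above: the per-task utilization clamps that __setscheduler_uclamp() and sched_getattr() handle in this diff are driven from user space through sched_setattr(). The sketch below is illustrative only; the struct layout mirrors the uapi sched_attr of this kernel generation as I understand it, the flag values are the documented SCHED_FLAG_UTIL_CLAMP_{MIN,MAX} constants, and the chosen clamp values are arbitrary examples.

/* Minimal sketch (assumptions noted above); build with a plain C compiler on Linux. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_FLAG_UTIL_CLAMP_MIN
# define SCHED_FLAG_UTIL_CLAMP_MIN	0x20	/* from uapi linux/sched.h */
# define SCHED_FLAG_UTIL_CLAMP_MAX	0x40
#endif

/* Mirrors uapi struct sched_attr (SCHED_ATTR_SIZE_VER1, 56 bytes). */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* SCHED_DEADLINE fields, unused here */
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;	/* 0..1024 (SCHED_CAPACITY_SCALE) */
	uint32_t sched_util_max;	/* 0..1024 (SCHED_CAPACITY_SCALE) */
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size	  = sizeof(attr);
	attr.sched_policy = 0;		/* SCHED_NORMAL */
	attr.sched_flags  = SCHED_FLAG_UTIL_CLAMP_MIN |
			    SCHED_FLAG_UTIL_CLAMP_MAX;
	attr.sched_util_min = 128;	/* boost: at least ~12.5% of CPU capacity */
	attr.sched_util_max = 512;	/* cap:   at most  ~50%   of CPU capacity */

	/* pid 0 == current task; no glibc wrapper, so go through syscall() */
	if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
		perror("sched_setattr");	/* e.g. EOPNOTSUPP without CONFIG_UCLAMP_TASK */
		return 1;
	}
	puts("util clamps applied; visible again via sched_getattr()");
	return 0;
}

On the cgroup side added by this diff, the same clamps are exposed per task group as cpu.uclamp.min and cpu.uclamp.max, which accept either "max" or a percentage with two decimal places (the format parsed by capacity_from_percent() above); the effective value is then the most restrictive of the task request, the group hierarchy, and the system-wide sysctl defaults.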