Diffstat (limited to 'kernel/sched')

 kernel/sched/core.c              |   3
 kernel/sched/cpufreq_schedutil.c | 119
 kernel/sched/idle.c              | 175
 3 files changed, 210 insertions, 87 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d18804491d9f..c56fb57f2991 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1456,7 +1456,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		 * yield - it could be a while.
 		 */
 		if (unlikely(queued)) {
-			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+			ktime_t to = NSEC_PER_SEC / HZ;
 
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
@@ -5280,6 +5280,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
+	idle->flags |= PF_IDLE;
 
 	kasan_unpoison_task_stack(idle);
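A note on the two core.c hunks above: the bare integer initializer relies on ktime_t being a scalar count of nanoseconds (s64), so NSEC_PER_SEC / HZ is simply one tick expressed in nanoseconds, the same value the removed ktime_set(0, NSEC_PER_SEC/HZ) produced. Setting PF_IDLE in init_idle() marks the per-CPU idle tasks themselves, so "idle" can be recognized as a per-task property instead of by comparing a task against rq->idle. The snippet below is only an illustrative sketch of that kind of flag test, with a made-up helper name; the real consumer of the flag (an is_idle_task() style check) would live in include/linux/sched.h, outside this kernel/sched diffstat.

/*
 * Illustrative sketch, not part of this diff: with PF_IDLE set in
 * init_idle() above (and set temporarily around play_idle() further down),
 * "is this task running idle code?" becomes a cheap per-task flag test.
 */
static inline bool task_runs_idle_code(const struct task_struct *p)
{
	return !!(p->flags & PF_IDLE);
}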
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 69e06898997d..fd4659313640 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -12,11 +12,14 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/cpufreq.h>
+#include <linux/kthread.h>
 #include <linux/slab.h>
 #include <trace/events/power.h>
 
 #include "sched.h"
 
+#define SUGOV_KTHREAD_PRIORITY	50
+
 struct sugov_tunables {
 	struct gov_attr_set attr_set;
 	unsigned int rate_limit_us;
@@ -35,8 +38,10 @@ struct sugov_policy {
 
 	/* The next fields are only needed if fast switch cannot be used. */
 	struct irq_work irq_work;
-	struct work_struct work;
+	struct kthread_work work;
 	struct mutex work_lock;
+	struct kthread_worker worker;
+	struct task_struct *thread;
 	bool work_in_progress;
 
 	bool need_freq_update;
@@ -291,7 +296,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
 	raw_spin_unlock(&sg_policy->update_lock);
 }
 
-static void sugov_work(struct work_struct *work)
+static void sugov_work(struct kthread_work *work)
 {
 	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
 
@@ -308,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work)
 	struct sugov_policy *sg_policy;
 
 	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
-	schedule_work_on(smp_processor_id(), &sg_policy->work);
+
+	/*
+	 * For RT and deadline tasks, the schedutil governor shoots the
+	 * frequency to maximum. Special care must be taken to ensure that this
+	 * kthread doesn't result in the same behavior.
+	 *
+	 * This is (mostly) guaranteed by the work_in_progress flag. The flag is
+	 * updated only at the end of the sugov_work() function and before that
+	 * the schedutil governor rejects all other frequency scaling requests.
+	 *
+	 * There is a very rare case though, where the RT thread yields right
+	 * after the work_in_progress flag is cleared. The effects of that are
+	 * neglected for now.
+	 */
+	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
 }
 
 /************************** sysfs interface ************************/
@@ -371,19 +390,64 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
 		return NULL;
 
 	sg_policy->policy = policy;
-	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
-	INIT_WORK(&sg_policy->work, sugov_work);
-	mutex_init(&sg_policy->work_lock);
 	raw_spin_lock_init(&sg_policy->update_lock);
 	return sg_policy;
 }
 
 static void sugov_policy_free(struct sugov_policy *sg_policy)
 {
-	mutex_destroy(&sg_policy->work_lock);
 	kfree(sg_policy);
 }
 
+static int sugov_kthread_create(struct sugov_policy *sg_policy)
+{
+	struct task_struct *thread;
+	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+	struct cpufreq_policy *policy = sg_policy->policy;
+	int ret;
+
+	/* kthread only required for slow path */
+	if (policy->fast_switch_enabled)
+		return 0;
+
+	kthread_init_work(&sg_policy->work, sugov_work);
+	kthread_init_worker(&sg_policy->worker);
+	thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
+				"sugov:%d",
+				cpumask_first(policy->related_cpus));
+	if (IS_ERR(thread)) {
+		pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+		return PTR_ERR(thread);
+	}
+
+	ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+	if (ret) {
+		kthread_stop(thread);
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		return ret;
+	}
+
+	sg_policy->thread = thread;
+	kthread_bind_mask(thread, policy->related_cpus);
+	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+	mutex_init(&sg_policy->work_lock);
+
+	wake_up_process(thread);
+
+	return 0;
+}
+
+static void sugov_kthread_stop(struct sugov_policy *sg_policy)
+{
+	/* kthread only required for slow path */
+	if (sg_policy->policy->fast_switch_enabled)
+		return;
+
+	kthread_flush_worker(&sg_policy->worker);
+	kthread_stop(sg_policy->thread);
+	mutex_destroy(&sg_policy->work_lock);
+}
+
 static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
 {
 	struct sugov_tunables *tunables;
@@ -416,16 +480,24 @@ static int sugov_init(struct cpufreq_policy *policy)
 	if (policy->governor_data)
 		return -EBUSY;
 
+	cpufreq_enable_fast_switch(policy);
+
 	sg_policy = sugov_policy_alloc(policy);
-	if (!sg_policy)
-		return -ENOMEM;
+	if (!sg_policy) {
+		ret = -ENOMEM;
+		goto disable_fast_switch;
+	}
+
+	ret = sugov_kthread_create(sg_policy);
+	if (ret)
+		goto free_sg_policy;
 
 	mutex_lock(&global_tunables_lock);
 
 	if (global_tunables) {
 		if (WARN_ON(have_governor_per_policy())) {
 			ret = -EINVAL;
-			goto free_sg_policy;
+			goto stop_kthread;
 		}
 		policy->governor_data = sg_policy;
 		sg_policy->tunables = global_tunables;
@@ -437,7 +509,7 @@ static int sugov_init(struct cpufreq_policy *policy)
 	tunables = sugov_tunables_alloc(sg_policy);
 	if (!tunables) {
 		ret = -ENOMEM;
-		goto free_sg_policy;
+		goto stop_kthread;
 	}
 
 	tunables->rate_limit_us = LATENCY_MULTIPLIER;
@@ -454,20 +526,25 @@ static int sugov_init(struct cpufreq_policy *policy)
 	if (ret)
 		goto fail;
 
- out:
+out:
 	mutex_unlock(&global_tunables_lock);
-
-	cpufreq_enable_fast_switch(policy);
 	return 0;
 
- fail:
+fail:
 	policy->governor_data = NULL;
 	sugov_tunables_free(tunables);
 
- free_sg_policy:
+stop_kthread:
+	sugov_kthread_stop(sg_policy);
+
+free_sg_policy:
 	mutex_unlock(&global_tunables_lock);
 	sugov_policy_free(sg_policy);
+
+disable_fast_switch:
+	cpufreq_disable_fast_switch(policy);
+
 	pr_err("initialization failed (error %d)\n", ret);
 	return ret;
 }
@@ -478,8 +555,6 @@ static void sugov_exit(struct cpufreq_policy *policy)
 	struct sugov_tunables *tunables = sg_policy->tunables;
 	unsigned int count;
 
-	cpufreq_disable_fast_switch(policy);
-
 	mutex_lock(&global_tunables_lock);
 
 	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
@@ -489,7 +564,9 @@ static void sugov_exit(struct cpufreq_policy *policy)
 
 	mutex_unlock(&global_tunables_lock);
 
+	sugov_kthread_stop(sg_policy);
 	sugov_policy_free(sg_policy);
+	cpufreq_disable_fast_switch(policy);
 }
 
 static int sugov_start(struct cpufreq_policy *policy)
@@ -535,8 +612,10 @@ static void sugov_stop(struct cpufreq_policy *policy)
 
 	synchronize_sched();
 
-	irq_work_sync(&sg_policy->irq_work);
-	cancel_work_sync(&sg_policy->work);
+	if (!policy->fast_switch_enabled) {
+		irq_work_sync(&sg_policy->irq_work);
+		kthread_cancel_work_sync(&sg_policy->work);
+	}
 }
 
 static void sugov_limits(struct cpufreq_policy *policy)
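The schedutil change above moves the slow-path frequency update out of the system workqueue and into a dedicated SCHED_FIFO kthread_worker, so frequency changes are no longer at the mercy of an arbitrarily loaded workqueue. A stripped-down sketch of the same kthread_worker flow is shown below; the names (my_worker, my_work, my_work_fn, my_worker_start) are invented for illustration, and it assumes the pre-4.11 header layout in which struct sched_param and MAX_USER_RT_PRIO are reachable through <linux/sched.h>.

#include <linux/kthread.h>
#include <linux/sched.h>

static struct kthread_worker my_worker;
static struct kthread_work my_work;

static void my_work_fn(struct kthread_work *work)
{
	/* slow-path processing runs here, in dedicated kthread context */
}

static int my_worker_start(void)
{
	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
	struct task_struct *thread;
	int ret;

	kthread_init_worker(&my_worker);
	kthread_init_work(&my_work, my_work_fn);

	thread = kthread_create(kthread_worker_fn, &my_worker, "my_worker");
	if (IS_ERR(thread))
		return PTR_ERR(thread);

	/* make the worker RT so CFS load cannot starve it */
	ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
	if (ret) {
		kthread_stop(thread);
		return ret;
	}

	wake_up_process(thread);
	return 0;
}

Producers (an irq_work handler, in the schedutil case) then only call kthread_queue_work(&my_worker, &my_work). Teardown mirrors sugov_kthread_stop() above: kthread_flush_worker() to drain pending work, followed by kthread_stop() on the worker task.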
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1d8718d5300d..6a4bae0a649d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -164,11 +164,14 @@ static void cpuidle_idle_call(void)
 	 * timekeeping to prevent timer interrupts from kicking us out of idle
 	 * until a proper wakeup interrupt happens.
 	 */
-	if (idle_should_freeze()) {
-		entered_state = cpuidle_enter_freeze(drv, dev);
-		if (entered_state > 0) {
-			local_irq_enable();
-			goto exit_idle;
+
+	if (idle_should_freeze() || dev->use_deepest_state) {
+		if (idle_should_freeze()) {
+			entered_state = cpuidle_enter_freeze(drv, dev);
+			if (entered_state > 0) {
+				local_irq_enable();
+				goto exit_idle;
+			}
 		}
 
 		next_state = cpuidle_find_deepest_state(drv, dev);
@@ -202,76 +205,65 @@ exit_idle:
  *
  * Called with polling cleared.
  */
-static void cpu_idle_loop(void)
+static void do_idle(void)
 {
-	int cpu = smp_processor_id();
-
-	while (1) {
-		/*
-		 * If the arch has a polling bit, we maintain an invariant:
-		 *
-		 * Our polling bit is clear if we're not scheduled (i.e. if
-		 * rq->curr != rq->idle).  This means that, if rq->idle has
-		 * the polling bit set, then setting need_resched is
-		 * guaranteed to cause the cpu to reschedule.
-		 */
-
-		__current_set_polling();
-		quiet_vmstat();
-		tick_nohz_idle_enter();
+	/*
+	 * If the arch has a polling bit, we maintain an invariant:
+	 *
+	 * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
+	 * rq->idle). This means that, if rq->idle has the polling bit set,
+	 * then setting need_resched is guaranteed to cause the CPU to
+	 * reschedule.
+	 */
 
-		while (!need_resched()) {
-			check_pgt_cache();
-			rmb();
+	__current_set_polling();
+	tick_nohz_idle_enter();
 
-			if (cpu_is_offline(cpu)) {
-				cpuhp_report_idle_dead();
-				arch_cpu_idle_dead();
-			}
+	while (!need_resched()) {
+		check_pgt_cache();
+		rmb();
 
-			local_irq_disable();
-			arch_cpu_idle_enter();
-
-			/*
-			 * In poll mode we reenable interrupts and spin.
-			 *
-			 * Also if we detected in the wakeup from idle
-			 * path that the tick broadcast device expired
-			 * for us, we don't want to go deep idle as we
-			 * know that the IPI is going to arrive right
-			 * away
-			 */
-			if (cpu_idle_force_poll || tick_check_broadcast_expired())
-				cpu_idle_poll();
-			else
-				cpuidle_idle_call();
-
-			arch_cpu_idle_exit();
+		if (cpu_is_offline(smp_processor_id())) {
+			cpuhp_report_idle_dead();
+			arch_cpu_idle_dead();
 		}
 
-		/*
-		 * Since we fell out of the loop above, we know
-		 * TIF_NEED_RESCHED must be set, propagate it into
-		 * PREEMPT_NEED_RESCHED.
-		 *
-		 * This is required because for polling idle loops we will
-		 * not have had an IPI to fold the state for us.
-		 */
-		preempt_set_need_resched();
-		tick_nohz_idle_exit();
-		__current_clr_polling();
+		local_irq_disable();
+		arch_cpu_idle_enter();
 
 		/*
-		 * We promise to call sched_ttwu_pending and reschedule
-		 * if need_resched is set while polling is set.  That
-		 * means that clearing polling needs to be visible
-		 * before doing these things.
+		 * In poll mode we reenable interrupts and spin. Also if we
+		 * detected in the wakeup from idle path that the tick
+		 * broadcast device expired for us, we don't want to go deep
+		 * idle as we know that the IPI is going to arrive right away.
 		 */
-		smp_mb__after_atomic();
-
-		sched_ttwu_pending();
-		schedule_preempt_disabled();
+		if (cpu_idle_force_poll || tick_check_broadcast_expired())
+			cpu_idle_poll();
+		else
+			cpuidle_idle_call();
+		arch_cpu_idle_exit();
 	}
+
+	/*
+	 * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
+	 * be set, propagate it into PREEMPT_NEED_RESCHED.
+	 *
+	 * This is required because for polling idle loops we will not have had
+	 * an IPI to fold the state for us.
+	 */
+	preempt_set_need_resched();
+	tick_nohz_idle_exit();
+	__current_clr_polling();
+
+	/*
+	 * We promise to call sched_ttwu_pending() and reschedule if
+	 * need_resched() is set while polling is set. That means that clearing
+	 * polling needs to be visible before doing these things.
+	 */
+	smp_mb__after_atomic();
+
+	sched_ttwu_pending();
+	schedule_preempt_disabled();
 }
 
 bool cpu_in_idle(unsigned long pc)
@@ -280,6 +272,56 @@ bool cpu_in_idle(unsigned long pc)
 		pc < (unsigned long)__cpuidle_text_end;
 }
 
+struct idle_timer {
+	struct hrtimer timer;
+	int done;
+};
+
+static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
+{
+	struct idle_timer *it = container_of(timer, struct idle_timer, timer);
+
+	WRITE_ONCE(it->done, 1);
+	set_tsk_need_resched(current);
+
+	return HRTIMER_NORESTART;
+}
+
+void play_idle(unsigned long duration_ms)
+{
+	struct idle_timer it;
+
+	/*
+	 * Only FIFO tasks can disable the tick since they don't need the forced
+	 * preemption.
+	 */
+	WARN_ON_ONCE(current->policy != SCHED_FIFO);
+	WARN_ON_ONCE(current->nr_cpus_allowed != 1);
+	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
+	WARN_ON_ONCE(!duration_ms);
+
+	rcu_sleep_check();
+	preempt_disable();
+	current->flags |= PF_IDLE;
+	cpuidle_use_deepest_state(true);
+
+	it.done = 0;
+	hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	it.timer.function = idle_inject_timer_fn;
+	hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
+
+	while (!READ_ONCE(it.done))
+		do_idle();
+
+	cpuidle_use_deepest_state(false);
+	current->flags &= ~PF_IDLE;
+
+	preempt_fold_need_resched();
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(play_idle);
+
 void cpu_startup_entry(enum cpuhp_state state)
 {
 	/*
@@ -299,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state)
 #endif
 	arch_cpu_idle_prepare();
 	cpuhp_online_idle(state);
-	cpu_idle_loop();
+	while (1)
+		do_idle();
 }
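play_idle() above is exported for idle-injection users, and its WARN_ON_ONCE() checks spell out the calling contract: the caller must be a per-CPU bound SCHED_FIFO kernel thread and must pass a non-zero duration. The sketch below shows roughly what such a caller could look like; the function name, the creation details mentioned in the comment and the 50 ms / 950 ms split are invented for illustration.

/*
 * Hedged sketch of a play_idle() caller, not taken from this diff. The
 * thread is assumed to have been created per CPU (e.g. via
 * kthread_create_on_cpu(), making it CPU-bound and PF_NO_SETAFFINITY)
 * and switched to SCHED_FIFO before it first runs, which is what the
 * WARN_ON_ONCE() checks in play_idle() demand.
 */
static int idle_inject_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* force this CPU into idle for roughly 50 ms */
		play_idle(50);

		/* then let normal work run for the rest of the period */
		schedule_timeout_interruptible(msecs_to_jiffies(950));
	}

	return 0;
}

While the injected period runs, play_idle() marks the caller PF_IDLE and calls cpuidle_use_deepest_state(true), so the dev->use_deepest_state branch added to cpuidle_idle_call() above sends the CPU into the deepest available idle state instead of whatever the cpuidle governor would normally pick.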