Diffstat (limited to 'kernel/sched/psi.c')
 kernel/sched/psi.c | 100 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 84 insertions(+), 16 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ee2ecc081422..8ac8b81bfee6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -189,6 +189,7 @@ static void group_init(struct psi_group *group)
 	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
 	mutex_init(&group->avgs_lock);
 	/* Init trigger-related members */
+	atomic_set(&group->poll_scheduled, 0);
 	mutex_init(&group->trigger_lock);
 	INIT_LIST_HEAD(&group->triggers);
 	group->poll_min_period = U32_MAX;
@@ -242,6 +243,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
 			     u32 *pchanged_states)
 {
 	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+	int current_cpu = raw_smp_processor_id();
+	unsigned int tasks[NR_PSI_TASK_COUNTS];
 	u64 now, state_start;
 	enum psi_states s;
 	unsigned int seq;
@@ -256,6 +259,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
 		memcpy(times, groupc->times, sizeof(groupc->times));
 		state_mask = groupc->state_mask;
 		state_start = groupc->state_start;
+		if (cpu == current_cpu)
+			memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
 	} while (read_seqcount_retry(&groupc->seq, seq));
 
 	/* Calculate state time deltas against the previous snapshot */
@@ -280,6 +285,28 @@ static void get_recent_times(struct psi_group *group, int cpu,
 		if (delta)
 			*pchanged_states |= (1 << s);
 	}
+
+	/*
+	 * When collect_percpu_times() from the avgs_work, we don't want to
+	 * re-arm avgs_work when all CPUs are IDLE. But the current CPU running
+	 * this avgs_work is never IDLE, cause avgs_work can't be shut off.
+	 * So for the current CPU, we need to re-arm avgs_work only when
+	 * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs
+	 * we can just check PSI_NONIDLE delta.
+	 */
+	if (current_work() == &group->avgs_work.work) {
+		bool reschedule;
+
+		if (cpu == current_cpu)
+			reschedule = tasks[NR_RUNNING] +
+				     tasks[NR_IOWAIT] +
+				     tasks[NR_MEMSTALL] > 1;
+		else
+			reschedule = *pchanged_states & (1 << PSI_NONIDLE);
+
+		if (reschedule)
+			*pchanged_states |= PSI_STATE_RESCHEDULE;
+	}
 }
 
 static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -415,7 +442,6 @@ static void psi_avgs_work(struct work_struct *work)
 	struct delayed_work *dwork;
 	struct psi_group *group;
 	u32 changed_states;
-	bool nonidle;
 	u64 now;
 
 	dwork = to_delayed_work(work);
@@ -426,7 +452,6 @@ static void psi_avgs_work(struct work_struct *work)
 	now = sched_clock();
 
 	collect_percpu_times(group, PSI_AVGS, &changed_states);
-	nonidle = changed_states & (1 << PSI_NONIDLE);
 	/*
 	 * If there is task activity, periodically fold the per-cpu
 	 * times and feed samples into the running averages. If things
@@ -437,7 +462,7 @@ static void psi_avgs_work(struct work_struct *work)
 	if (now >= group->avg_next_update)
 		group->avg_next_update = update_averages(group, now);
 
-	if (nonidle) {
+	if (changed_states & PSI_STATE_RESCHEDULE) {
 		schedule_delayed_work(dwork, nsecs_to_jiffies(
 				group->avg_next_update - now) + 1);
 	}
@@ -539,10 +564,12 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 			/* Calculate growth since last update */
 			growth = window_update(&t->win, now, total[t->state]);
 
-			if (growth < t->threshold)
-				continue;
+			if (!t->pending_event) {
+				if (growth < t->threshold)
+					continue;
 
-			t->pending_event = true;
+				t->pending_event = true;
+			}
 		}
 		/* Limit event signaling to once per window */
 		if (now < t->last_event_time + t->win.size)
@@ -563,18 +590,17 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 	return now + group->poll_min_period;
 }
 
-/* Schedule polling if it's not already scheduled. */
-static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
+/* Schedule polling if it's not already scheduled or forced. */
+static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
+				   bool force)
 {
 	struct task_struct *task;
 
 	/*
-	 * Do not reschedule if already scheduled.
-	 * Possible race with a timer scheduled after this check but before
-	 * mod_timer below can be tolerated because group->polling_next_update
-	 * will keep updates on schedule.
+	 * atomic_xchg should be called even when !force to provide a
+	 * full memory barrier (see the comment inside psi_poll_work).
 	 */
-	if (timer_pending(&group->poll_timer))
+	if (atomic_xchg(&group->poll_scheduled, 1) && !force)
 		return;
 
 	rcu_read_lock();
@@ -586,12 +612,15 @@ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
 	 */
 	if (likely(task))
 		mod_timer(&group->poll_timer, jiffies + delay);
+	else
+		atomic_set(&group->poll_scheduled, 0);
 
 	rcu_read_unlock();
 }
 
 static void psi_poll_work(struct psi_group *group)
 {
+	bool force_reschedule = false;
 	u32 changed_states;
 	u64 now;
 
@@ -599,6 +628,43 @@ static void psi_poll_work(struct psi_group *group)
 
 	now = sched_clock();
 
+	if (now > group->polling_until) {
+		/*
+		 * We are either about to start or might stop polling if no
+		 * state change was recorded. Resetting poll_scheduled leaves
+		 * a small window for psi_group_change to sneak in and schedule
+		 * an immediate poll_work before we get to rescheduling. One
+		 * potential extra wakeup at the end of the polling window
+		 * should be negligible and polling_next_update still keeps
+		 * updates correctly on schedule.
+		 */
+		atomic_set(&group->poll_scheduled, 0);
+		/*
+		 * A task change can race with the poll worker that is supposed to
+		 * report on it. To avoid missing events, ensure ordering between
+		 * poll_scheduled and the task state accesses, such that if the poll
+		 * worker misses the state update, the task change is guaranteed to
+		 * reschedule the poll worker:
+		 *
+		 * poll worker:
+		 *   atomic_set(poll_scheduled, 0)
+		 *   smp_mb()
+		 *   LOAD states
+		 *
+		 * task change:
+		 *   STORE states
+		 *   if atomic_xchg(poll_scheduled, 1) == 0:
+		 *     schedule poll worker
+		 *
+		 * The atomic_xchg() implies a full barrier.
+		 */
+		smp_mb();
+	} else {
+		/* Polling window is not over, keep rescheduling */
+		force_reschedule = true;
+	}
+
+
 	collect_percpu_times(group, PSI_POLL, &changed_states);
 
 	if (changed_states & group->poll_states) {
@@ -624,7 +690,8 @@ static void psi_poll_work(struct psi_group *group)
 		group->polling_next_update = update_triggers(group, now);
 
 	psi_schedule_poll_work(group,
-		nsecs_to_jiffies(group->polling_next_update - now) + 1);
+		nsecs_to_jiffies(group->polling_next_update - now) + 1,
+		force_reschedule);
 
 out:
 	mutex_unlock(&group->trigger_lock);
@@ -785,7 +852,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	write_seqcount_end(&groupc->seq);
 
 	if (state_mask & group->poll_states)
-		psi_schedule_poll_work(group, 1);
+		psi_schedule_poll_work(group, 1, false);
 
 	if (wake_clock && !delayed_work_pending(&group->avgs_work))
 		schedule_delayed_work(&group->avgs_work, PSI_FREQ);
@@ -939,7 +1006,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
 		write_seqcount_end(&groupc->seq);
 
 		if (group->poll_states & (1 << PSI_IRQ_FULL))
-			psi_schedule_poll_work(group, 1);
+			psi_schedule_poll_work(group, 1, false);
 	} while ((group = group->parent));
 }
 #endif
@@ -1325,6 +1392,7 @@ void psi_trigger_destroy(struct psi_trigger *t)
 		 * can no longer be found through group->poll_task.
		 */
 		kthread_stop(task_to_destroy);
+		atomic_set(&group->poll_scheduled, 0);
 	}
 	kfree(t);
 }
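The get_recent_times() hunk above decides whether psi_avgs_work() should re-arm itself: the CPU that runs the worker always looks non-idle to PSI, so for that CPU the worker is re-armed only when there is activity beyond the worker itself. The snippet below is a standalone userspace illustration of just that decision; the enum values, the psi_cpu_sample struct and should_rearm() are invented for the sketch and are not kernel APIs (the real code reads groupc->tasks under the seqcount and sets PSI_STATE_RESCHEDULE in *pchanged_states).

#include <stdbool.h>
#include <stdio.h>

/* Local stand-ins for the kernel's task counters, defined only for this sketch. */
enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_TASK_COUNTS };

struct psi_cpu_sample {
	unsigned int tasks[NR_TASK_COUNTS];
	bool nonidle_delta;	/* PSI_NONIDLE time moved since the last sample */
};

/*
 * The CPU running avgs_work always appears busy (the worker itself is
 * running), so only re-arm for it when there is other activity: more
 * than one runnable task, or any iowait/memstall task. Other CPUs can
 * simply be judged by their PSI_NONIDLE delta.
 */
static bool should_rearm(const struct psi_cpu_sample *s, bool is_current_cpu)
{
	if (is_current_cpu)
		return s->tasks[NR_RUNNING] +
		       s->tasks[NR_IOWAIT] +
		       s->tasks[NR_MEMSTALL] > 1;
	return s->nonidle_delta;
}

int main(void)
{
	struct psi_cpu_sample only_worker = { .tasks = { 0, 0, 1 } };
	struct psi_cpu_sample busy = { .tasks = { 0, 1, 1 } };

	/* 0: the only runnable task is the worker; 1: real memstall activity. */
	printf("%d %d\n", should_rearm(&only_worker, true),
	       should_rearm(&busy, true));
	return 0;
}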
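The update_triggers() hunk keeps t->pending_event set once growth has crossed the threshold, so an event that arrives while the once-per-window rate limit is in effect is delivered on a later pass instead of being re-evaluated and possibly dropped. A toy model of just that control flow, with a simplified trigger struct invented for the sketch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct trigger {
	uint64_t threshold;
	uint64_t win_size;
	uint64_t last_event_time;
	bool pending_event;
};

static void update_trigger(struct trigger *t, uint64_t now, uint64_t growth)
{
	/* Once pending, the event survives until the rate limit lets it fire. */
	if (!t->pending_event) {
		if (growth < t->threshold)
			return;
		t->pending_event = true;
	}
	/* Limit event signaling to once per window */
	if (now < t->last_event_time + t->win_size)
		return;

	printf("event fired at %llu\n", (unsigned long long)now);
	t->last_event_time = now;
	t->pending_event = false;
}

int main(void)
{
	struct trigger t = { .threshold = 100, .win_size = 1000 };

	t.last_event_time = 0;		/* pretend an event just fired */
	update_trigger(&t, 500, 150);	/* crosses threshold, but rate-limited */
	update_trigger(&t, 1100, 0);	/* no new growth, yet the pending event fires */
	return 0;
}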
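The psi_poll_work() comment above spells out the ordering contract between resetting poll_scheduled and reading the task state. Below is a minimal userspace analogue using C11 atomics, under the assumption that a plain global can stand in for the per-CPU state: poll_scheduled and state are illustrative names, atomic_thread_fence() plays the role of smp_mb(), and the default seq_cst atomic_exchange() mirrors the full barrier implied by the kernel's atomic_xchg().

#include <stdatomic.h>
#include <stdio.h>

static atomic_int poll_scheduled;
static atomic_int state;	/* stands in for the per-CPU task state */

/* Task-change side: publish the new state, then try to arm the worker. */
static void task_change(int new_state)
{
	atomic_store_explicit(&state, new_state, memory_order_relaxed);
	/* seq_cst exchange acts as a full barrier, pairing with the worker's fence. */
	if (atomic_exchange(&poll_scheduled, 1) == 0)
		printf("schedule poll worker\n");
}

/* Poll-worker side: disarm, full fence, then look at the state. */
static void poll_worker(void)
{
	atomic_store_explicit(&poll_scheduled, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	printf("worker sees state=%d\n",
	       atomic_load_explicit(&state, memory_order_relaxed));
}

int main(void)
{
	task_change(1);		/* arms the worker */
	poll_worker();		/* sees state=1 and disarms itself */
	task_change(2);		/* worker disarmed, so this re-schedules it */
	return 0;
}

Either the worker's load sees the new state, or the updater's exchange sees poll_scheduled == 0 and re-schedules the worker; the two barriers rule out the case where both sides miss each other, which is exactly the guarantee the in-kernel comment describes.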