Diffstat (limited to 'kernel/sched/psi.c')
| -rw-r--r-- | kernel/sched/psi.c | 164 | 
1 files changed, 90 insertions, 74 deletions
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 967732c0766c..db27b69fa92a 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -34,7 +34,10 @@
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * (Naturally, the FULL state doesn't exist for the CPU resource.)
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but it does exist at the cgroup level: all non-idle tasks
+ * in a cgroup are delayed on the CPU resource used by others outside of
+ * the cgroup, or throttled by the cgroup's cpu.max configuration.
  *
  *	SOME = nr_delayed_tasks != 0
  *	FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
@@ -59,7 +62,7 @@
  * states, we would have to conclude a CPU SOME pressure number of
  * 100%, since *somebody* is waiting on a runqueue at all
  * times. However, that is clearly not the amount of contention the
- * workload is experiencing: only one out of 256 possible exceution
+ * workload is experiencing: only one out of 256 possible execution
  * threads will be contended at any given time, or about 0.4%.
  *
  * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
@@ -73,7 +76,7 @@
  * we have to base our calculation on the number of non-idle tasks in
  * conjunction with the number of available CPUs, which is the number
  * of potential execution threads. SOME becomes then the proportion of
- * delayed tasks to possibe threads, and FULL is the share of possible
+ * delayed tasks to possible threads, and FULL is the share of possible
  * threads that are unproductive due to delays:
  *
  *	threads = min(nr_nonidle_tasks, nr_cpus)
@@ -216,15 +219,17 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 {
 	switch (state) {
 	case PSI_IO_SOME:
-		return tasks[NR_IOWAIT];
+		return unlikely(tasks[NR_IOWAIT]);
 	case PSI_IO_FULL:
-		return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
+		return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
 	case PSI_MEM_SOME:
-		return tasks[NR_MEMSTALL];
+		return unlikely(tasks[NR_MEMSTALL]);
 	case PSI_MEM_FULL:
-		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
+		return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
 	case PSI_CPU_SOME:
-		return tasks[NR_RUNNING] > tasks[NR_ONCPU];
+		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+	case PSI_CPU_FULL:
+		return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
 	case PSI_NONIDLE:
 		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
 			tasks[NR_RUNNING];
@@ -441,7 +446,7 @@ static void psi_avgs_work(struct work_struct *work)
 	mutex_unlock(&group->avgs_lock);
 }
 
-/* Trigger tracking window manupulations */
+/* Trigger tracking window manipulations */
 static void window_reset(struct psi_window *win, u64 now, u64 value,
 			 u64 prev_growth)
 {
@@ -639,13 +644,10 @@ static void poll_timer_fn(struct timer_list *t)
 	wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu,
-			 bool memstall_tick)
+static void record_times(struct psi_group_cpu *groupc, u64 now)
 {
 	u32 delta;
-	u64 now;
 
-	now = cpu_clock(cpu);
 	delta = now - groupc->state_start;
 	groupc->state_start = now;
 
@@ -659,34 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
 		groupc->times[PSI_MEM_SOME] += delta;
 		if (groupc->state_mask & (1 << PSI_MEM_FULL))
 			groupc->times[PSI_MEM_FULL] += delta;
-		else if (memstall_tick) {
-			u32 sample;
-			/*
-			 * Since we care about lost potential, a
-			 * memstall is FULL when there are no other
-			 * working tasks, but also when the CPU is
-			 * actively reclaiming and nothing productive
-			 * could run even if it were runnable.
-			 *
-			 * When the timer tick sees a reclaiming CPU,
-			 * regardless of runnable tasks, sample a FULL
-			 * tick (or less if it hasn't been a full tick
-			 * since the last state change).
-			 */
-			sample = min(delta, (u32)jiffies_to_nsecs(1));
-			groupc->times[PSI_MEM_FULL] += sample;
-		}
 	}
 
-	if (groupc->state_mask & (1 << PSI_CPU_SOME))
+	if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
 		groupc->times[PSI_CPU_SOME] += delta;
+		if (groupc->state_mask & (1 << PSI_CPU_FULL))
+			groupc->times[PSI_CPU_FULL] += delta;
+	}
 
 	if (groupc->state_mask & (1 << PSI_NONIDLE))
 		groupc->times[PSI_NONIDLE] += delta;
 }
 
 static void psi_group_change(struct psi_group *group, int cpu,
-			     unsigned int clear, unsigned int set,
+			     unsigned int clear, unsigned int set, u64 now,
 			     bool wake_clock)
 {
 	struct psi_group_cpu *groupc;
@@ -706,19 +694,20 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	 */
 	write_seqcount_begin(&groupc->seq);
 
-	record_times(groupc, cpu, false);
+	record_times(groupc, now);
 
 	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
 		if (!(m & (1 << t)))
 			continue;
-		if (groupc->tasks[t] == 0 && !psi_bug) {
+		if (groupc->tasks[t]) {
+			groupc->tasks[t]--;
+		} else if (!psi_bug) {
 			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
 					cpu, t, groupc->tasks[0],
 					groupc->tasks[1], groupc->tasks[2],
 					groupc->tasks[3], clear, set);
 			psi_bug = 1;
 		}
-		groupc->tasks[t]--;
 	}
 
 	for (t = 0; set; set &= ~(1 << t), t++)
@@ -730,6 +719,18 @@ static void psi_group_change(struct psi_group *group, int cpu,
 		if (test_state(groupc->tasks, s))
 			state_mask |= (1 << s);
 	}
+
+	/*
+	 * Since we care about lost potential, a memstall is FULL
+	 * when there are no other working tasks, but also when
+	 * the CPU is actively reclaiming and nothing productive
+	 * could run even if it were runnable. So when the current
+	 * task in a cgroup is in_memstall, the corresponding groupc
+	 * on that cpu is in PSI_MEM_FULL state.
+	 */
+	if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+		state_mask |= (1 << PSI_MEM_FULL);
+
 	groupc->state_mask = state_mask;
 
 	write_seqcount_end(&groupc->seq);
@@ -786,12 +787,14 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 	struct psi_group *group;
 	bool wake_clock = true;
 	void *iter = NULL;
+	u64 now;
 
 	if (!task->pid)
 		return;
 
 	psi_flags_change(task, clear, set);
 
+	now = cpu_clock(cpu);
 	/*
 	 * Periodic aggregation shuts off if there is a period of no
 	 * task changes, so we wake it back up if necessary. However,
@@ -804,7 +807,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 		wake_clock = false;
 
 	while ((group = iterate_groups(task, &iter)))
-		psi_group_change(group, cpu, clear, set, wake_clock);
+		psi_group_change(group, cpu, clear, set, now, wake_clock);
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -813,56 +816,61 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 	struct psi_group *group, *common = NULL;
 	int cpu = task_cpu(prev);
 	void *iter;
+	u64 now = cpu_clock(cpu);
 
 	if (next->pid) {
+		bool identical_state;
+
 		psi_flags_change(next, 0, TSK_ONCPU);
 		/*
-		 * When moving state between tasks, the group that
-		 * contains them both does not change: we can stop
-		 * updating the tree once we reach the first common
-		 * ancestor. Iterate @next's ancestors until we
-		 * encounter @prev's state.
+		 * When switching between tasks that have an identical
+		 * runtime state, the cgroup that contains both tasks
+		 * does not change: we can stop updating the tree once
+		 * we reach the first common ancestor. Iterate @next's
+		 * ancestors only until we encounter @prev's ONCPU.
 		 */
+		identical_state = prev->psi_flags == next->psi_flags;
 		iter = NULL;
 		while ((group = iterate_groups(next, &iter))) {
-			if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+			if (identical_state &&
+			    per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
 				common = group;
 				break;
 			}
 
-			psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
 		}
 	}
 
-	/*
-	 * If this is a voluntary sleep, dequeue will have taken care
-	 * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
-	 * only need to deal with it during preemption.
-	 */
-	if (sleep)
-		return;
-
 	if (prev->pid) {
-		psi_flags_change(prev, TSK_ONCPU, 0);
+		int clear = TSK_ONCPU, set = 0;
 
-		iter = NULL;
-		while ((group = iterate_groups(prev, &iter)) && group != common)
-			psi_group_change(group, cpu, TSK_ONCPU, 0, true);
-	}
-}
+		/*
+		 * When we're going to sleep, psi_dequeue() lets us handle
+		 * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
+		 * with TSK_ONCPU and save walking common ancestors twice.
+		 */
+		if (sleep) {
+			clear |= TSK_RUNNING;
+			if (prev->in_iowait)
+				set |= TSK_IOWAIT;
+		}
 
-void psi_memstall_tick(struct task_struct *task, int cpu)
-{
-	struct psi_group *group;
-	void *iter = NULL;
+		psi_flags_change(prev, clear, set);
 
-	while ((group = iterate_groups(task, &iter))) {
-		struct psi_group_cpu *groupc;
+		iter = NULL;
+		while ((group = iterate_groups(prev, &iter)) && group != common)
+			psi_group_change(group, cpu, clear, set, now, true);
 
-		groupc = per_cpu_ptr(group->pcpu, cpu);
-		write_seqcount_begin(&groupc->seq);
-		record_times(groupc, cpu, true);
-		write_seqcount_end(&groupc->seq);
+		/*
+		 * TSK_ONCPU is handled up to the common ancestor. If we're tasked
+		 * with dequeuing too, finish that for the rest of the hierarchy.
+		 */
+		if (sleep) {
+			clear &= ~TSK_ONCPU;
+			for (; group; group = iterate_groups(prev, &iter))
+				psi_group_change(group, cpu, clear, set, now, true);
+		}
 	}
 }
 
@@ -1018,7 +1026,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 		group->avg_next_update = update_averages(group, now);
 	mutex_unlock(&group->avgs_lock);
 
-	for (full = 0; full < 2 - (res == PSI_CPU); full++) {
+	for (full = 0; full < 2; full++) {
 		unsigned long avg[3];
 		u64 total;
 		int w;
@@ -1054,19 +1062,27 @@ static int psi_cpu_show(struct seq_file *m, void *v)
 	return psi_show(m, &psi_system, PSI_CPU);
 }
 
+static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
+{
+	if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	return single_open(file, psi_show, NULL);
+}
+
 static int psi_io_open(struct inode *inode, struct file *file)
{
-	return single_open(file, psi_io_show, NULL);
+	return psi_open(file, psi_io_show);
 }
 
 static int psi_memory_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, psi_memory_show, NULL);
+	return psi_open(file, psi_memory_show);
 }
 
 static int psi_cpu_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, psi_cpu_show, NULL);
+	return psi_open(file, psi_cpu_show);
 }
 
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
@@ -1346,9 +1362,9 @@ static int __init psi_proc_init(void)
 {
 	if (psi_enable) {
 		proc_mkdir("pressure", NULL);
-		proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
-		proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
-		proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
+		proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
+		proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
+		proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
 	}
 	return 0;
 }
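With psi_show() now looping over both SOME and FULL for every resource, /proc/pressure/cpu gains a "full" line next to "some". A minimal userspace sketch of how that line can be consumed; this program is illustrative only and not part of the patch, and it assumes nothing beyond the existing psi_show() line format (e.g. "full avg10=0.00 avg60=0.00 avg300=0.00 total=0", with total in microseconds):

#include <stdio.h>

int main(void)
{
	char line[256];
	double avg10, avg60, avg300;
	unsigned long long total;
	FILE *f = fopen("/proc/pressure/cpu", "r");

	if (!f) {
		perror("/proc/pressure/cpu");
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		/* "some" and "full" share one format; pick out the new "full" line. */
		if (sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
			   &avg10, &avg60, &avg300, &total) == 4)
			printf("cpu full: avg10=%.2f%% avg60=%.2f%% total=%lluus\n",
			       avg10, avg60, total);
	}

	fclose(f);
	return 0;
}

The same parsing applies to the cgroup2 cpu.pressure file, which is where, per the comment added at the top of the file, CPU FULL is actually meaningful.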
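The new psi_open() gate pairs with the proc entries now being created 0666: reads are unprivileged, but opening a pressure file for writing, which is how userspace registers a trigger, requires CAP_SYS_RESOURCE. Below is a sketch of the trigger sequence as documented in Documentation/accounting/psi.rst; it is not part of the patch, and the threshold/window values are arbitrary examples (150ms of "some" CPU stall within any 1s window, both in microseconds). On a kernel with this change, an unprivileged caller should see the open() fail with EPERM:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "some 150000 1000000";
	struct pollfd fds;

	/* Opening for writing is what psi_open() gates on:
	 * without CAP_SYS_RESOURCE this fails with EPERM. */
	fds.fd = open("/proc/pressure/cpu", O_RDWR | O_NONBLOCK);
	if (fds.fd < 0) {
		perror("open /proc/pressure/cpu");
		return 1;
	}

	if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
		perror("write trigger");
		return 1;
	}

	fds.events = POLLPRI;
	while (poll(&fds, 1, -1) > 0) {
		if (fds.revents & POLLERR) {
			fprintf(stderr, "trigger file went away\n");
			return 1;
		}
		if (fds.revents & POLLPRI)
			printf("CPU pressure threshold crossed\n");
	}
	return 0;
}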