diff options
Diffstat (limited to 'kernel/sched/psi.c')
-rw-r--r-- | kernel/sched/psi.c | 81 |
1 files changed, 46 insertions, 35 deletions
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7b4aa5809c0f..020d58967d4e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -41,7 +41,7 @@ * What it means for a task to be productive is defined differently * for each resource. For IO, productive means a running task. For * memory, productive means a running task that isn't a reclaimer. For - * CPU, productive means an oncpu task. + * CPU, productive means an on-CPU task. * * Naturally, the FULL state doesn't exist for the CPU resource at the * system level, but exist at the cgroup level. At the cgroup level, @@ -49,7 +49,7 @@ * resource which is being used by others outside of the cgroup or * throttled by the cgroup cpu.max configuration. * - * The percentage of wallclock time spent in those compound stall + * The percentage of wall clock time spent in those compound stall * states gives pressure numbers between 0 and 100 for each resource, * where the SOME percentage indicates workload slowdowns and the FULL * percentage indicates reduced CPU utilization: @@ -218,28 +218,32 @@ void __init psi_init(void) group_init(&psi_system); } -static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu) +static u32 test_states(unsigned int *tasks, u32 state_mask) { - switch (state) { - case PSI_IO_SOME: - return unlikely(tasks[NR_IOWAIT]); - case PSI_IO_FULL: - return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]); - case PSI_MEM_SOME: - return unlikely(tasks[NR_MEMSTALL]); - case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && - tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); - case PSI_CPU_SOME: - return unlikely(tasks[NR_RUNNING] > oncpu); - case PSI_CPU_FULL: - return unlikely(tasks[NR_RUNNING] && !oncpu); - case PSI_NONIDLE: - return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || - tasks[NR_RUNNING]; - default: - return false; + const bool oncpu = state_mask & PSI_ONCPU; + + if (tasks[NR_IOWAIT]) { + state_mask |= BIT(PSI_IO_SOME); + if (!tasks[NR_RUNNING]) + state_mask |= BIT(PSI_IO_FULL); + } + + if (tasks[NR_MEMSTALL]) { + state_mask |= BIT(PSI_MEM_SOME); + if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]) + state_mask |= BIT(PSI_MEM_FULL); } + + if (tasks[NR_RUNNING] > oncpu) + state_mask |= BIT(PSI_CPU_SOME); + + if (tasks[NR_RUNNING] && !oncpu) + state_mask |= BIT(PSI_CPU_FULL); + + if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]) + state_mask |= BIT(PSI_NONIDLE); + + return state_mask; } static void get_recent_times(struct psi_group *group, int cpu, @@ -345,7 +349,7 @@ static void collect_percpu_times(struct psi_group *group, /* * Collect the per-cpu time buckets and average them into a - * single time sample that is normalized to wallclock time. + * single time sample that is normalized to wall clock time. * * For averaging, each CPU is weighted by its non-idle time in * the sampling period. This eliminates artifacts from uneven @@ -770,9 +774,9 @@ static void psi_group_change(struct psi_group *group, int cpu, { struct psi_group_cpu *groupc; unsigned int t, m; - enum psi_states s; u32 state_mask; + lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); /* @@ -841,10 +845,7 @@ static void psi_group_change(struct psi_group *group, int cpu, return; } - for (s = 0; s < NR_PSI_STATES; s++) { - if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU)) - state_mask |= (1 << s); - } + state_mask = test_states(groupc->tasks, state_mask); /* * Since we care about lost potential, a memstall is FULL @@ -991,22 +992,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } #ifdef CONFIG_IRQ_TIME_ACCOUNTING -void psi_account_irqtime(struct task_struct *task, u32 delta) +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) { - int cpu = task_cpu(task); + int cpu = task_cpu(curr); struct psi_group *group; struct psi_group_cpu *groupc; - u64 now; + u64 now, irq; + s64 delta; if (static_branch_likely(&psi_disabled)) return; - if (!task->pid) + if (!curr->pid) + return; + + lockdep_assert_rq_held(rq); + group = task_psi_group(curr); + if (prev && task_psi_group(prev) == group) return; now = cpu_clock(cpu); + irq = irq_time_read(cpu); + delta = (s64)(irq - rq->psi_irq_time); + if (delta < 0) + return; + rq->psi_irq_time = irq; - group = task_psi_group(task); do { if (!group->enabled) continue; @@ -1194,7 +1205,7 @@ void psi_cgroup_restart(struct psi_group *group) /* * After we disable psi_group->enabled, we don't actually * stop percpu tasks accounting in each psi_group_cpu, - * instead only stop test_state() loop, record_times() + * instead only stop test_states() loop, record_times() * and averaging worker, see psi_group_change() for details. * * When disable cgroup PSI, this function has nothing to sync @@ -1202,7 +1213,7 @@ void psi_cgroup_restart(struct psi_group *group) * would see !psi_group->enabled and only do task accounting. * * When re-enable cgroup PSI, this function use psi_group_change() - * to get correct state mask from test_state() loop on tasks[], + * to get correct state mask from test_states() loop on tasks[], * and restart groupc->state_start from now, use .clear = .set = 0 * here since no task status really changed. */ |