Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--	kernel/sched/fair.c	89
1 file changed, 88 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6986ea31c984..373ff5f55884 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2928,6 +2928,24 @@ static void reset_ptenuma_scan(struct task_struct *p)
 	p->mm->numa_scan_offset = 0;
 }
 
+static bool vma_is_accessed(struct vm_area_struct *vma)
+{
+	unsigned long pids;
+	/*
+	 * Allow unconditional access for the first two scan passes, so that
+	 * all pages of the VMA get a prot_none fault introduced irrespective
+	 * of accesses. This also avoids task scanning amplifying the
+	 * unfairness of disjoint sets of VMA accesses.
+	 */
+	if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+		return true;
+
+	pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
+	return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
+}
+
+#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
+
 /*
  * The expensive part of numa migration is done from task_work context.
  * Triggered from task_tick_numa().
@@ -3027,6 +3045,45 @@ static void task_numa_work(struct callback_head *work)
 		if (!vma_is_accessible(vma))
 			continue;
 
+		/* Initialise new per-VMA NUMAB state. */
+		if (!vma->numab_state) {
+			vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
+				GFP_KERNEL);
+			if (!vma->numab_state)
+				continue;
+
+			vma->numab_state->next_scan = now +
+				msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+
+			/* The first PID reset happens 4 scan delays after scan start */
+			vma->numab_state->next_pid_reset = vma->numab_state->next_scan +
+				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+		}
+
+		/*
+		 * Scanning the VMAs of short-lived tasks adds more overhead,
+		 * so delay the scan for new VMAs.
+		 */
+		if (mm->numa_scan_seq && time_before(jiffies,
+						vma->numab_state->next_scan))
+			continue;
+
+		/* Do not scan the VMA if the task has not accessed it */
+		if (!vma_is_accessed(vma))
+			continue;
+
+		/*
+		 * Reset access PIDs regularly for old VMAs. Resetting after the
+		 * recent-access check avoids clearing PID info before it is used.
+		 */
+		if (mm->numa_scan_seq &&
+				time_after(jiffies, vma->numab_state->next_pid_reset)) {
+			vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
+				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+			vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
+			vma->numab_state->access_pids[1] = 0;
+		}
+
 		do {
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
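Note on the two NUMA hunks above: the fault path (outside this file) records which tasks touch a VMA by setting one bit per task in a pair of 64-bit PID windows, and task_numa_work() consults that filter through vma_is_accessed() before paying for a scan; the bit index is a 6-bit (ilog2(BITS_PER_LONG)) hash of the PID, and the periodic reset ages the current window into the previous one so stale PIDs drop out after two reset periods. The following is a standalone sketch of that filter, assuming a 64-bit long and re-deriving the kernel's golden-ratio hash_32(); the struct and helper names are illustrative, not kernel API.

/* Standalone model of the per-VMA access-PID filter (not kernel code). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GOLDEN_RATIO_32	0x61C88647u
#define PID_BITS	6	/* ilog2(64): one bucket per bit of a u64 */

/* Same multiplicative hash the kernel's hash_32() uses. */
static unsigned int hash_32(uint32_t val, unsigned int bits)
{
	return (val * GOLDEN_RATIO_32) >> (32 - bits);
}

struct vma_filter {
	uint64_t access_pids[2];	/* [0] = previous window, [1] = current */
};

/* Fault-path side: remember that @pid touched this VMA. */
static void record_access(struct vma_filter *f, uint32_t pid)
{
	f->access_pids[1] |= 1ull << hash_32(pid, PID_BITS);
}

/* Scan-path side: has @pid plausibly accessed this VMA recently? */
static bool is_accessed(const struct vma_filter *f, uint32_t pid)
{
	uint64_t pids = f->access_pids[0] | f->access_pids[1];

	return pids & (1ull << hash_32(pid, PID_BITS));
}

/* Periodic reset: age the current window into the previous one. */
static void reset_window(struct vma_filter *f)
{
	f->access_pids[0] = f->access_pids[1];
	f->access_pids[1] = 0;
}

int main(void)
{
	struct vma_filter f = { { 0, 0 } };

	record_access(&f, 1234);
	printf("1234 accessed: %d\n", is_accessed(&f, 1234));	/* 1 */
	printf("5678 accessed: %d\n", is_accessed(&f, 5678));	/* 0: other bucket */

	reset_window(&f);
	printf("1234 after one reset: %d\n", is_accessed(&f, 1234));	/* still 1 */
	reset_window(&f);
	printf("1234 after two resets: %d\n", is_accessed(&f, 1234));	/* 0 */
	return 0;
}

Because the filter hashes PIDs into only 64 buckets, it can report false positives (two PIDs sharing a bucket), but never false negatives within the two-window horizon, which is the right trade-off for a scan-skipping heuristic.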
@@ -5959,6 +6016,10 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
+
+	/* Add a random offset so that timers interleave */
+	hrtimer_set_expires(&cfs_b->period_timer,
+			    get_random_u32_below(cfs_b->period));
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 	cfs_b->slack_started = false;
@@ -6614,7 +6675,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->stats.nr_wakeups_affine_attempts);
-	if (target == nr_cpumask_bits)
+	if (target != this_cpu)
 		return prev_cpu;
 
 	schedstat_inc(sd->ttwu_move_affine);
@@ -10238,6 +10299,16 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 
 		sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
 				sds->total_capacity;
+
+		/*
+		 * If the local group is more loaded than the average system
+		 * load, don't try to pull any tasks.
+		 */
+		if (local->avg_load >= sds->avg_load) {
+			env->imbalance = 0;
+			return;
+		}
+
 	}
 
 	/*
@@ -11966,6 +12037,18 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
 
 	return delta > 0;
 }
+
+static int task_is_throttled_fair(struct task_struct *p, int cpu)
+{
+	struct cfs_rq *cfs_rq;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	cfs_rq = task_group(p)->cfs_rq[cpu];
+#else
+	cfs_rq = &cpu_rq(cpu)->cfs;
+#endif
+	return throttled_hierarchy(cfs_rq);
+}
 #else
 static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
 #endif
@@ -12592,6 +12675,10 @@ DEFINE_SCHED_CLASS(fair) = {
 	.task_change_group	= task_change_group_fair,
 #endif
 
+#ifdef CONFIG_SCHED_CORE
+	.task_is_throttled	= task_is_throttled_fair,
+#endif
+
 #ifdef CONFIG_UCLAMP_TASK
 	.uclamp_enabled		= 1,
 #endif
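Note on the init_cfs_bandwidth() hunk: when many cgroups share the same bandwidth period (100ms by default), their period timers can expire in lockstep and bunch the throttle/unthrottle work onto the same ticks; seeding each timer with a random offset in [0, period) spreads the expirations across the period. A toy model of that spreading follows; the only kernel facts assumed are get_random_u32_below() (stood in by rand() here) and the default period, the rest is illustrative.

/* Toy model of the period-timer interleave (not kernel code). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PERIOD_NS	100000000ull	/* 100ms, the default CFS period */
#define NR_TIMERS	4

int main(void)
{
	srand(42);	/* fixed seed so the sketch is reproducible */

	for (int i = 0; i < NR_TIMERS; i++) {
		/* stand-in for get_random_u32_below(cfs_b->period) */
		uint64_t offset = (uint64_t)rand() % PERIOD_NS;

		/* each timer keeps its own phase from then on */
		printf("timer %d: first expiry at %llu ns, then every %llu ns\n",
		       i, (unsigned long long)offset,
		       (unsigned long long)PERIOD_NS);
	}
	return 0;
}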
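Note on the calculate_imbalance() hunk: further down the function the pull size is derived from differences such as sds->avg_load - local->avg_load, and those avg_load fields are unsigned long, so a local group already above the system average makes the difference wrap to an enormous value instead of going negative; the early return with env->imbalance = 0 prevents that. A minimal standalone demo of the wrap, with made-up load numbers:

/* Why the guard matters: unsigned subtraction wraps (not kernel code). */
#include <stdio.h>

int main(void)
{
	unsigned long sds_avg_load = 900;	/* system-wide average (made up) */
	unsigned long local_avg_load = 1000;	/* local group above average */
	unsigned long imbalance;

	/* Without the guard: wraps to a huge bogus pull request. */
	imbalance = sds_avg_load - local_avg_load;
	printf("unguarded imbalance: %lu\n", imbalance);

	/* With the guard from the hunk above: pull nothing. */
	if (local_avg_load >= sds_avg_load)
		imbalance = 0;
	printf("guarded imbalance:   %lu\n", imbalance);
	return 0;
}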