diff options
Diffstat (limited to 'kernel/sched/sched.h')
| -rw-r--r-- | kernel/sched/sched.h | 340 |
1 files changed, 304 insertions, 36 deletions
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7b34c7826ca5..eeef1a3086d1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,9 +1,27 @@ #include <linux/sched.h> +#include <linux/sched/autogroup.h> #include <linux/sched/sysctl.h> +#include <linux/sched/topology.h> #include <linux/sched/rt.h> -#include <linux/u64_stats_sync.h> #include <linux/sched/deadline.h> +#include <linux/sched/clock.h> +#include <linux/sched/wake_q.h> +#include <linux/sched/signal.h> +#include <linux/sched/numa_balancing.h> +#include <linux/sched/mm.h> +#include <linux/sched/cpufreq.h> +#include <linux/sched/stat.h> +#include <linux/sched/nohz.h> +#include <linux/sched/debug.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/sched/cputime.h> +#include <linux/sched/init.h> + +#include <linux/u64_stats_sync.h> +#include <linux/kernel_stat.h> #include <linux/binfmts.h> #include <linux/mutex.h> #include <linux/spinlock.h> @@ -12,14 +30,18 @@ #include <linux/tick.h> #include <linux/slab.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif + #include "cpupri.h" #include "cpudeadline.h" #include "cpuacct.h" #ifdef CONFIG_SCHED_DEBUG -#define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else -#define SCHED_WARN_ON(x) ((void)(x)) +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif struct rq; @@ -196,23 +218,25 @@ static inline int dl_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } -extern struct dl_bw *dl_bw_of(int i); - struct dl_bw { raw_spinlock_t lock; u64 bw, total_bw; }; +static inline void __dl_update(struct dl_bw *dl_b, s64 bw); + static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); } static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); } static inline @@ -222,7 +246,22 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } -extern struct mutex sched_domains_mutex; +void dl_change_utilization(struct task_struct *p, u64 new_bw); +extern void init_dl_bw(struct dl_bw *dl_b); +extern int sched_dl_global_validate(void); +extern void sched_dl_do_global(void); +extern int sched_dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr); +extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); +extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); +extern bool __checkparam_dl(const struct sched_attr *attr); +extern void __dl_clear_params(struct task_struct *p); +extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); +extern int dl_task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed); +extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial); +extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED @@ -344,6 +383,11 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); +extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us); +extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us); +extern long sched_group_rt_runtime(struct task_group *tg); +extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_online_group(struct task_group *tg, @@ -536,6 +580,30 @@ struct dl_rq { #else struct dl_bw dl_bw; #endif + /* + * "Active utilization" for this runqueue: increased when a + * task wakes up (becomes TASK_RUNNING) and decreased when a + * task blocks + */ + u64 running_bw; + + /* + * Utilization of the tasks "assigned" to this runqueue (including + * the tasks that are in runqueue and the tasks that executed on this + * CPU and blocked). Increased when a task moves to this runqueue, and + * decreased when the task moves away (migrates, changes scheduling + * policy, or terminates). + * This is needed to compute the "inactive utilization" for the + * runqueue (inactive utilization = this_bw - running_bw). + */ + u64 this_bw; + u64 extra_bw; + + /* + * Inverse of the fraction of CPU utilization that can be reclaimed + * by the GRUB algorithm. + */ + u64 bw_ratio; }; #ifdef CONFIG_SMP @@ -583,6 +651,11 @@ struct root_domain { }; extern struct root_domain def_root_domain; +extern struct mutex sched_domains_mutex; + +extern void init_defrootdomain(void); +extern int sched_init_domains(const struct cpumask *cpu_map); +extern void rq_attach_root(struct rq *rq, struct root_domain *rd); #endif /* CONFIG_SMP */ @@ -644,7 +717,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - unsigned int clock_skip_update; + unsigned int clock_update_flags; u64 clock; u64 clock_task; @@ -768,28 +841,110 @@ static inline u64 __rq_clock_broken(struct rq *rq) return READ_ONCE(rq->clock); } +/* + * rq::clock_update_flags bits + * + * %RQCF_REQ_SKIP - will request skipping of clock update on the next + * call to __schedule(). This is an optimisation to avoid + * neighbouring rq clock updates. + * + * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is + * in effect and calls to update_rq_clock() are being ignored. + * + * %RQCF_UPDATED - is a debug flag that indicates whether a call has been + * made to update_rq_clock() since the last time rq::lock was pinned. + * + * If inside of __schedule(), clock_update_flags will have been + * shifted left (a left shift is a cheap operation for the fast path + * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use, + * + * if (rq-clock_update_flags >= RQCF_UPDATED) + * + * to check if %RQCF_UPADTED is set. It'll never be shifted more than + * one position though, because the next rq_unpin_lock() will shift it + * back. + */ +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 +#define RQCF_UPDATED 0x04 + +static inline void assert_clock_updated(struct rq *rq) +{ + /* + * The only reason for not seeing a clock update since the + * last rq_pin_lock() is if we're currently skipping updates. + */ + SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); +} + static inline u64 rq_clock(struct rq *rq) { lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + return rq->clock_task; } -#define RQCF_REQ_SKIP 0x01 -#define RQCF_ACT_SKIP 0x02 - static inline void rq_clock_skip_update(struct rq *rq, bool skip) { lockdep_assert_held(&rq->lock); if (skip) - rq->clock_skip_update |= RQCF_REQ_SKIP; + rq->clock_update_flags |= RQCF_REQ_SKIP; else - rq->clock_skip_update &= ~RQCF_REQ_SKIP; + rq->clock_update_flags &= ~RQCF_REQ_SKIP; +} + +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +#ifdef CONFIG_SCHED_DEBUG + /* + * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the + * current pin context is stashed here in case it needs to be + * restored in rq_repin_lock(). + */ + unsigned int clock_update_flags; +#endif +}; + +static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) +{ + rf->cookie = lockdep_pin_lock(&rq->lock); + +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + rf->clock_update_flags = 0; +#endif +} + +static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SCHED_DEBUG + if (rq->clock_update_flags > RQCF_ACT_SKIP) + rf->clock_update_flags = RQCF_UPDATED; +#endif + + lockdep_unpin_lock(&rq->lock, rf->cookie); +} + +static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) +{ + lockdep_repin_lock(&rq->lock, rf->cookie); + +#ifdef CONFIG_SCHED_DEBUG + /* + * Restore the value we stashed in @rf for this pin context. + */ + rq->clock_update_flags |= rf->clock_update_flags; +#endif } #ifdef CONFIG_NUMA @@ -803,6 +958,16 @@ extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); #endif +#ifdef CONFIG_NUMA +extern void sched_init_numa(void); +extern void sched_domains_numa_masks_set(unsigned int cpu); +extern void sched_domains_numa_masks_clear(unsigned int cpu); +#else +static inline void sched_init_numa(void) { } +static inline void sched_domains_numa_masks_set(unsigned int cpu) { } +static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } +#endif + #ifdef CONFIG_NUMA_BALANCING /* The regions in numa_faults array from task_struct */ enum numa_faults_stats { @@ -904,7 +1069,11 @@ struct sched_group_capacity { unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ - unsigned long cpumask[0]; /* iteration mask */ +#ifdef CONFIG_SCHED_DEBUG + int id; +#endif + + unsigned long cpumask[0]; /* balance mask */ }; struct sched_group { @@ -925,16 +1094,15 @@ struct sched_group { unsigned long cpumask[0]; }; -static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +static inline struct cpumask *sched_group_span(struct sched_group *sg) { return to_cpumask(sg->cpumask); } /* - * cpumask masking which cpus in the group are allowed to iterate up the domain - * tree. + * See build_balance_mask(). */ -static inline struct cpumask *sched_group_mask(struct sched_group *sg) +static inline struct cpumask *group_balance_mask(struct sched_group *sg) { return to_cpumask(sg->sgc->cpumask); } @@ -945,7 +1113,7 @@ static inline struct cpumask *sched_group_mask(struct sched_group *sg) */ static inline unsigned int group_first_cpu(struct sched_group *group) { - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); } extern int group_balance_cpu(struct sched_group *sg); @@ -969,7 +1137,7 @@ static inline void sched_ttwu_pending(void) { } #endif /* CONFIG_SMP */ #include "stats.h" -#include "auto_group.h" +#include "autogroup.h" #ifdef CONFIG_CGROUP_SCHED @@ -1210,15 +1378,17 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 #define ENQUEUE_MOVE 0x04 +#define ENQUEUE_NOCLOCK 0x08 -#define ENQUEUE_HEAD 0x08 -#define ENQUEUE_REPLENISH 0x10 +#define ENQUEUE_HEAD 0x10 +#define ENQUEUE_REPLENISH 0x20 #ifdef CONFIG_SMP -#define ENQUEUE_MIGRATED 0x20 +#define ENQUEUE_MIGRATED 0x40 #else #define ENQUEUE_MIGRATED 0x00 #endif @@ -1245,7 +1415,7 @@ struct sched_class { */ struct task_struct * (*pick_next_task) (struct rq *rq, struct task_struct *prev, - struct pin_cookie cookie); + struct rq_flags *rf); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP @@ -1299,7 +1469,11 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr) curr->sched_class->set_curr_task(rq); } +#ifdef CONFIG_SMP #define sched_class_highest (&stop_sched_class) +#else +#define sched_class_highest (&dl_sched_class) +#endif #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1344,6 +1518,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) } #endif +extern void schedule_idle(void); + extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); @@ -1361,7 +1537,12 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); +#define BW_SHIFT 20 +#define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); @@ -1501,13 +1682,9 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -struct rq_flags { - unsigned long flags; - struct pin_cookie cookie; -}; - struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); + struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); @@ -1515,7 +1692,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { - lockdep_unpin_lock(&rq->lock, rf->cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); } @@ -1524,11 +1701,67 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - lockdep_unpin_lock(&rq->lock, rf->cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT @@ -1674,6 +1907,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } +extern void set_rq_online (struct rq *rq); +extern void set_rq_offline(struct rq *rq); +extern bool sched_smp_initialized; + #else /* CONFIG_SMP */ /* @@ -1718,7 +1955,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); - #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); @@ -1748,16 +1984,48 @@ extern void nohz_balance_exit_idle(unsigned int cpu); static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif + +#ifdef CONFIG_SMP +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} +#else +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} +#endif + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { - u64 hardirq_time; - u64 softirq_time; + u64 total; + u64 tick_delta; u64 irq_start_time; struct u64_stats_sync sync; }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); +/* + * Returns the irqtime minus the softirq time computed by ksoftirqd. + * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime + * and never move forward. + */ static inline u64 irq_time_read(int cpu) { struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); @@ -1766,7 +2034,7 @@ static inline u64 irq_time_read(int cpu) do { seq = __u64_stats_fetch_begin(&irqtime->sync); - total = irqtime->softirq_time + irqtime->hardirq_time; + total = irqtime->total; } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); return total; |