aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/rt.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/rt.c')
-rw-r--r--kernel/sched/rt.c447
1 files changed, 426 insertions, 21 deletions
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2516b8df6dbb..45caf937ef90 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,7 @@
#include <linux/irq_work.h>
int sched_rr_timeslice = RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -334,7 +335,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total++;
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory++;
update_rt_migration(rt_rq);
@@ -351,7 +352,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total--;
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory--;
update_rt_migration(rt_rq);
@@ -839,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
+ int skip;
+
+ /*
+ * When span == cpu_online_mask, taking each rq->lock
+ * can be time-consuming. Try to avoid it when possible.
+ */
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (skip)
+ continue;
raw_spin_lock(&rq->lock);
if (rt_rq->rt_time) {
@@ -1323,7 +1335,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_rt_entity(rt_se, flags);
- if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1412,7 +1424,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
* will have to sort it out.
*/
if (curr && unlikely(rt_task(curr)) &&
- (tsk_nr_cpus_allowed(curr) < 2 ||
+ (curr->nr_cpus_allowed < 2 ||
curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
@@ -1436,7 +1448,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* Current can't be migrated, useless to reschedule,
* let's hope p can move out.
*/
- if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
+ if (rq->curr->nr_cpus_allowed == 1 ||
!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
return;
@@ -1444,7 +1456,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (tsk_nr_cpus_allowed(p) != 1
+ if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
@@ -1523,7 +1535,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
}
static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;
@@ -1535,9 +1547,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie coo
* disabled avoiding further scheduler activity on it and we're
* being very careful to re-start the picking loop.
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
pull_rt_task(rq);
- lockdep_repin_lock(&rq->lock, cookie);
+ rq_repin_lock(rq, rf);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
* means a dl or stop task can slip in, in which case we need
@@ -1578,7 +1590,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* The previous task needs to be made eligible for pushing
* if it is still active
*/
- if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
+ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1590,7 +1602,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ cpumask_test_cpu(cpu, &p->cpus_allowed))
return 1;
return 0;
}
@@ -1628,7 +1640,7 @@ static int find_lowest_rq(struct task_struct *task)
if (unlikely(!lowest_mask))
return -1;
- if (tsk_nr_cpus_allowed(task) == 1)
+ if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1725,8 +1737,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(lowest_rq->cpu,
- tsk_cpus_allowed(task)) ||
+ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
task_running(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
@@ -1761,7 +1772,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
+ BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!task_on_rq_queued(p));
BUG_ON(!rt_task(p));
@@ -1819,7 +1830,7 @@ retry:
* pushing.
*/
task = pick_next_pushable_task(rq);
- if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ if (task == next_task) {
/*
* The task hasn't migrated, and is still the next
* eligible task, but we failed to find a run-queue
@@ -1927,6 +1938,87 @@ static int find_next_push_cpu(struct rq *rq)
#define RT_PUSH_IPI_EXECUTING 1
#define RT_PUSH_IPI_RESTART 2
+/*
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * On large CPU boxes, there's the case that several CPUs could schedule
+ * a lower priority task at the same time, in which case it will look for
+ * any overloaded CPUs that it could pull a task from. To do this, the runqueue
+ * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
+ * for a single overloaded CPU's runqueue lock can produce a large latency.
+ * (This has actually been observed on large boxes running cyclictest).
+ * Instead of taking the runqueue lock of the overloaded CPU, each of the
+ * CPUs that scheduled a lower priority task simply sends an IPI to the
+ * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
+ * lots of contention. The overloaded CPU will look to push its non-running
+ * RT task off, and if it does, it can then ignore the other IPIs coming
+ * in, and just pass those IPIs off to any other overloaded CPU.
+ *
+ * When a CPU schedules a lower priority task, it only sends an IPI to
+ * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
+ * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
+ * RT overloaded tasks, would cause 100 IPIs to go out at once.
+ *
+ * The overloaded RT CPU, when receiving an IPI, will try to push off its
+ * overloaded RT tasks and then send an IPI to the next CPU that has
+ * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
+ * have completed. Just because a CPU may have pushed off its own overloaded
+ * RT task does not mean it should stop sending the IPI around to other
+ * overloaded CPUs. There may be another RT task waiting to run on one of
+ * those CPUs that are of higher priority than the one that was just
+ * pushed.
+ *
+ * An optimization that could possibly be made is to make a CPU array similar
+ * to the cpupri array mask of all running RT tasks, but for the overloaded
+ * case, then the IPI could be sent to only the CPU with the highest priority
+ * RT task waiting, and that CPU could send off further IPIs to the CPU with
+ * the next highest waiting task. Since the overloaded case is much less likely
+ * to happen, the complexity of this implementation may not be worth it.
+ * Instead, just send an IPI around to all overloaded CPUs.
+ *
+ * The rq->rt.push_flags holds the status of the IPI that is going around.
+ * A run queue can only send out a single IPI at a time. The possible flags
+ * for rq->rt.push_flags are:
+ *
+ * (None or zero): No IPI is going around for the current rq
+ * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
+ * RT_PUSH_IPI_RESTART: The priority of the running task for the rq
+ * has changed, and the IPI should restart
+ * circulating the overloaded CPUs again.
+ *
+ * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
+ * before sending to the next CPU.
+ *
+ * Instead of having all CPUs that schedule a lower priority task send
+ * an IPI to the same "first" CPU in the RT overload mask, they send it
+ * to the next overloaded CPU after their own CPU. This helps distribute
+ * the work when there's more than one overloaded CPU and multiple CPUs
+ * scheduling in lower priority tasks.
+ *
+ * When a rq schedules a lower priority task than what was currently
+ * running, the next CPU with overloaded RT tasks is examined first.
+ * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
+ * priority task, it will send an IPI first to CPU 5, then CPU 5 will
+ * send to CPU 1 if it is still overloaded. CPU 1 will clear the
+ * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
+ *
+ * The first CPU to notice IPI_RESTART is set, will clear that flag and then
+ * send an IPI to the next overloaded CPU after the rq->cpu and not the next
+ * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
+ * schedules a lower priority task, and the IPI_RESTART gets set while the
+ * handling is being done on CPU 5, it will clear the flag and send it back to
+ * CPU 4 instead of CPU 1.
+ *
+ * Note, the above logic can be disabled by turning off the sched_feature
+ * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
+ * taken by the CPU requesting a pull and the waiting RT task will be pulled
+ * by that CPU. This may be fine for machines with few CPUs.
+ */
static void tell_cpu_to_push(struct rq *rq)
{
int cpu;
@@ -2121,9 +2213,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- tsk_nr_cpus_allowed(p) > 1 &&
+ p->nr_cpus_allowed > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
- (tsk_nr_cpus_allowed(rq->curr) < 2 ||
+ (rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio))
push_rt_tasks(rq);
}
@@ -2196,12 +2288,11 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
-#else
+#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio)
resched_curr(rq);
-#endif /* CONFIG_SMP */
}
}
@@ -2246,6 +2337,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
}
}
+#ifdef CONFIG_POSIX_TIMERS
static void watchdog(struct rq *rq, struct task_struct *p)
{
unsigned long soft, hard;
@@ -2267,6 +2359,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
}
}
+#else
+static inline void watchdog(struct rq *rq, struct task_struct *p) { }
+#endif
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
@@ -2354,6 +2449,316 @@ const struct sched_class rt_sched_class = {
.update_curr = update_curr_rt,
};
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * Autogroups do not have RT tasks; see autogroup_create().
+ */
+ if (task_group_is_autogroup(tg))
+ return 0;
+
+ for_each_process_thread(g, p) {
+ if (rt_task(p) && task_group(p) == tg)
+ return 1;
+ }
+
+ return 0;
+}
+
+struct rt_schedulable_data {
+ struct task_group *tg;
+ u64 rt_period;
+ u64 rt_runtime;
+};
+
+static int tg_rt_schedulable(struct task_group *tg, void *data)
+{
+ struct rt_schedulable_data *d = data;
+ struct task_group *child;
+ unsigned long total, sum = 0;
+ u64 period, runtime;
+
+ period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ runtime = tg->rt_bandwidth.rt_runtime;
+
+ if (tg == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ /*
+ * Cannot have more runtime than the period.
+ */
+ if (runtime > period && runtime != RUNTIME_INF)
+ return -EINVAL;
+
+ /*
+ * Ensure we don't starve existing RT tasks.
+ */
+ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+ return -EBUSY;
+
+ total = to_ratio(period, runtime);
+
+ /*
+ * Nobody can have more than the global setting allows.
+ */
+ if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+ return -EINVAL;
+
+ /*
+ * The sum of our children's runtime should not exceed our own.
+ */
+ list_for_each_entry_rcu(child, &tg->children, siblings) {
+ period = ktime_to_ns(child->rt_bandwidth.rt_period);
+ runtime = child->rt_bandwidth.rt_runtime;
+
+ if (child == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ sum += to_ratio(period, runtime);
+ }
+
+ if (sum > total)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+{
+ int ret;
+
+ struct rt_schedulable_data data = {
+ .tg = tg,
+ .rt_period = period,
+ .rt_runtime = runtime,
+ };
+
+ rcu_read_lock();
+ ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int tg_set_rt_bandwidth(struct task_group *tg,
+ u64 rt_period, u64 rt_runtime)
+{
+ int i, err = 0;
+
+ /*
+ * Disallowing the root group RT runtime is BAD, it would disallow the
+ * kernel creating (and or operating) RT threads.
+ */
+ if (tg == &root_task_group && rt_runtime == 0)
+ return -EINVAL;
+
+ /* No period doesn't make any sense. */
+ if (rt_period == 0)
+ return -EINVAL;
+
+ mutex_lock(&rt_constraints_mutex);
+ read_lock(&tasklist_lock);
+ err = __rt_schedulable(tg, rt_period, rt_runtime);
+ if (err)
+ goto unlock;
+
+ raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+ tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
+ tg->rt_bandwidth.rt_runtime = rt_runtime;
+
+ for_each_possible_cpu(i) {
+ struct rt_rq *rt_rq = tg->rt_rq[i];
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = rt_runtime;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+unlock:
+ read_unlock(&tasklist_lock);
+ mutex_unlock(&rt_constraints_mutex);
+
+ return err;
+}
+
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+{
+ u64 rt_runtime, rt_period;
+
+ rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+ if (rt_runtime_us < 0)
+ rt_runtime = RUNTIME_INF;
+
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_runtime(struct task_group *tg)
+{
+ u64 rt_runtime_us;
+
+ if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
+ return -1;
+
+ rt_runtime_us = tg->rt_bandwidth.rt_runtime;
+ do_div(rt_runtime_us, NSEC_PER_USEC);
+ return rt_runtime_us;
+}
+
+int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
+{
+ u64 rt_runtime, rt_period;
+
+ rt_period = rt_period_us * NSEC_PER_USEC;
+ rt_runtime = tg->rt_bandwidth.rt_runtime;
+
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_period(struct task_group *tg)
+{
+ u64 rt_period_us;
+
+ rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ do_div(rt_period_us, NSEC_PER_USEC);
+ return rt_period_us;
+}
+
+static int sched_rt_global_constraints(void)
+{
+ int ret = 0;
+
+ mutex_lock(&rt_constraints_mutex);
+ read_lock(&tasklist_lock);
+ ret = __rt_schedulable(NULL, 0, 0);
+ read_unlock(&tasklist_lock);
+ mutex_unlock(&rt_constraints_mutex);
+
+ return ret;
+}
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+ /* Don't accept realtime tasks when there is no way for them to run */
+ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+ return 0;
+
+ return 1;
+}
+
+#else /* !CONFIG_RT_GROUP_SCHED */
+static int sched_rt_global_constraints(void)
+{
+ unsigned long flags;
+ int i;
+
+ raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+ for_each_possible_cpu(i) {
+ struct rt_rq *rt_rq = &cpu_rq(i)->rt;
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = global_rt_runtime();
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
+
+ return 0;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static int sched_rt_global_validate(void)
+{
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
+ if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+ (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void sched_rt_do_global(void)
+{
+ def_rt_bandwidth.rt_runtime = global_rt_runtime();
+ def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+}
+
+int sched_rt_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int old_period, old_runtime;
+ static DEFINE_MUTEX(mutex);
+ int ret;
+
+ mutex_lock(&mutex);
+ old_period = sysctl_sched_rt_period;
+ old_runtime = sysctl_sched_rt_runtime;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (!ret && write) {
+ ret = sched_rt_global_validate();
+ if (ret)
+ goto undo;
+
+ ret = sched_dl_global_validate();
+ if (ret)
+ goto undo;
+
+ ret = sched_rt_global_constraints();
+ if (ret)
+ goto undo;
+
+ sched_rt_do_global();
+ sched_dl_do_global();
+ }
+ if (0) {
+undo:
+ sysctl_sched_rt_period = old_period;
+ sysctl_sched_rt_runtime = old_runtime;
+ }
+ mutex_unlock(&mutex);
+
+ return ret;
+}
+
+int sched_rr_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ /*
+ * Make sure that internally we keep jiffies.
+ * Also, writing zero resets the timeslice to default:
+ */
+ if (!ret && write) {
+ sched_rr_timeslice =
+ sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
+ msecs_to_jiffies(sysctl_sched_rr_timeslice);
+ }
+ mutex_unlock(&mutex);
+ return ret;
+}
+
#ifdef CONFIG_SCHED_DEBUG
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);