diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.preempt | 42 | ||||
-rw-r--r-- | kernel/events/core.c | 10 | ||||
-rw-r--r-- | kernel/fork.c | 1 | ||||
-rw-r--r-- | kernel/irq/msi.c | 4 | ||||
-rw-r--r-- | kernel/sched/autogroup.c | 2 | ||||
-rw-r--r-- | kernel/sched/core.c | 53 | ||||
-rw-r--r-- | kernel/sched/fair.c | 4 | ||||
-rw-r--r-- | kernel/sched/rt.c | 12 | ||||
-rw-r--r-- | kernel/sched/sched.h | 3 | ||||
-rw-r--r-- | kernel/time/posix-cpu-timers.c | 19 |
10 files changed, 101 insertions, 49 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 60f1bfc3c7b2..ce77f0265660 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,12 +1,23 @@ # SPDX-License-Identifier: GPL-2.0-only +config PREEMPT_NONE_BUILD + bool + +config PREEMPT_VOLUNTARY_BUILD + bool + +config PREEMPT_BUILD + bool + select PREEMPTION + select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK + choice prompt "Preemption Model" - default PREEMPT_NONE_BEHAVIOUR + default PREEMPT_NONE -config PREEMPT_NONE_BEHAVIOUR +config PREEMPT_NONE bool "No Forced Preemption (Server)" - select PREEMPT_NONE if !PREEMPT_DYNAMIC + select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC help This is the traditional Linux preemption model, geared towards throughput. It will still provide good latencies most of the @@ -18,10 +29,10 @@ config PREEMPT_NONE_BEHAVIOUR raw processing power of the kernel, irrespective of scheduling latencies. -config PREEMPT_VOLUNTARY_BEHAVIOUR +config PREEMPT_VOLUNTARY bool "Voluntary Kernel Preemption (Desktop)" depends on !ARCH_NO_PREEMPT - select PREEMPT_VOLUNTARY if !PREEMPT_DYNAMIC + select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC help This option reduces the latency of the kernel by adding more "explicit preemption points" to the kernel code. These new @@ -37,10 +48,10 @@ config PREEMPT_VOLUNTARY_BEHAVIOUR Select this if you are building a kernel for a desktop system. 
-config PREEMPT_BEHAVIOUR +config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" depends on !ARCH_NO_PREEMPT - select PREEMPT + select PREEMPT_BUILD help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -58,7 +69,7 @@ config PREEMPT_BEHAVIOUR config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" - depends on EXPERT && ARCH_SUPPORTS_RT && !PREEMPT_DYNAMIC + depends on EXPERT && ARCH_SUPPORTS_RT select PREEMPTION help This option turns the kernel into a real-time kernel by replacing @@ -75,17 +86,6 @@ config PREEMPT_RT endchoice -config PREEMPT_NONE - bool - -config PREEMPT_VOLUNTARY - bool - -config PREEMPT - bool - select PREEMPTION - select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK - config PREEMPT_COUNT bool @@ -95,8 +95,8 @@ config PREEMPTION config PREEMPT_DYNAMIC bool "Preemption behaviour defined on boot" - depends on HAVE_PREEMPT_DYNAMIC - select PREEMPT + depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT + select PREEMPT_BUILD default y help This option allows to define the preemption model on the kernel diff --git a/kernel/events/core.c b/kernel/events/core.c index f2253ea729a2..523106a506ee 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7154,7 +7154,6 @@ void perf_output_sample(struct perf_output_handle *handle, static u64 perf_virt_to_phys(u64 virt) { u64 phys_addr = 0; - struct page *p = NULL; if (!virt) return 0; @@ -7173,14 +7172,15 @@ static u64 perf_virt_to_phys(u64 virt) * If failed, leave phys_addr as 0. 
*/ if (current->mm != NULL) { + struct page *p; + pagefault_disable(); - if (get_user_page_fast_only(virt, 0, &p)) + if (get_user_page_fast_only(virt, 0, &p)) { phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + put_page(p); + } pagefault_enable(); } - - if (p) - put_page(p); } return phys_addr; diff --git a/kernel/fork.c b/kernel/fork.c index 5de23f3e08bf..3244cc56b697 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2277,6 +2277,7 @@ static __latent_entropy struct task_struct *copy_process( p->pdeath_signal = 0; INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; + clear_posix_cputimers_work(p); #ifdef CONFIG_KRETPROBES p->kretprobe_instances.first = NULL; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6a5ecee6e567..7f350ae59c5f 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -529,10 +529,10 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, /* * Checking the first MSI descriptor is sufficient. MSIX supports - * masking and MSI does so when the maskbit is set. + * masking and MSI does so when the can_mask attribute is set. 
*/ desc = first_msi_entry(dev); - return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit; + return desc->msi_attrib.is_msix || desc->msi_attrib.can_mask; } int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2067080bb235..8629b37d118e 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -31,7 +31,7 @@ static inline void autogroup_destroy(struct kref *kref) ag->tg->rt_se = NULL; ag->tg->rt_rq = NULL; #endif - sched_offline_group(ag->tg); + sched_release_group(ag->tg); sched_destroy_group(ag->tg); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 523fd602ea90..3c9b0fda64ac 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3726,6 +3726,9 @@ out: bool cpus_share_cache(int this_cpu, int that_cpu) { + if (this_cpu == that_cpu) + return true; + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } @@ -6625,13 +6628,13 @@ __setup("preempt=", setup_preempt_mode); static void __init preempt_dynamic_init(void) { if (preempt_dynamic_mode == preempt_dynamic_undefined) { - if (IS_ENABLED(CONFIG_PREEMPT_NONE_BEHAVIOUR)) { + if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { sched_dynamic_update(preempt_dynamic_none); - } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BEHAVIOUR)) { + } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { sched_dynamic_update(preempt_dynamic_voluntary); } else { /* Default static call setting, nothing to do */ - WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_BEHAVIOUR)); + WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); preempt_dynamic_mode = preempt_dynamic_full; pr_info("Dynamic Preempt: full\n"); } @@ -9716,6 +9719,22 @@ static void sched_free_group(struct task_group *tg) kmem_cache_free(task_group_cache, tg); } +static void sched_free_group_rcu(struct rcu_head *rcu) +{ + sched_free_group(container_of(rcu, struct task_group, rcu)); +} + +static void sched_unregister_group(struct task_group *tg) +{ + 
unregister_fair_sched_group(tg); + unregister_rt_sched_group(tg); + /* + * We have to wait for yet another RCU grace period to expire, as + * print_cfs_stats() might run concurrently. + */ + call_rcu(&tg->rcu, sched_free_group_rcu); +} + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(struct task_group *parent) { @@ -9759,25 +9778,35 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) } /* rcu callback to free various structures associated with a task group */ -static void sched_free_group_rcu(struct rcu_head *rhp) +static void sched_unregister_group_rcu(struct rcu_head *rhp) { /* Now it should be safe to free those cfs_rqs: */ - sched_free_group(container_of(rhp, struct task_group, rcu)); + sched_unregister_group(container_of(rhp, struct task_group, rcu)); } void sched_destroy_group(struct task_group *tg) { /* Wait for possible concurrent references to cfs_rqs complete: */ - call_rcu(&tg->rcu, sched_free_group_rcu); + call_rcu(&tg->rcu, sched_unregister_group_rcu); } -void sched_offline_group(struct task_group *tg) +void sched_release_group(struct task_group *tg) { unsigned long flags; - /* End participation in shares distribution: */ - unregister_fair_sched_group(tg); - + /* + * Unlink first, to avoid walk_tg_tree_from() from finding us (via + * sched_cfs_period_timer()). + * + * For this to be effective, we have to wait for all pending users of + * this task group to leave their RCU critical section to ensure no new + * user will see our dying task group any more. Specifically ensure + * that tg_unthrottle_up() won't add decayed cfs_rq's to it. + * + * We therefore defer calling unregister_fair_sched_group() to + * sched_unregister_group() which is guarantied to get called only after the + * current RCU grace period has expired. 
+ */ spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); @@ -9896,7 +9925,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - sched_offline_group(tg); + sched_release_group(tg); } static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) @@ -9906,7 +9935,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) /* * Relies on the RCU grace period between css_released() and this. */ - sched_free_group(tg); + sched_unregister_group(tg); } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 13950beb01a2..6e476f6d9435 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11456,8 +11456,6 @@ void free_fair_sched_group(struct task_group *tg) { int i; - destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); - for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); @@ -11534,6 +11532,8 @@ void unregister_fair_sched_group(struct task_group *tg) struct rq *rq; int cpu; + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + for_each_possible_cpu(cpu) { if (tg->se[cpu]) remove_entity_load_avg(tg->se[cpu]); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index bb945f8faeca..b48baaba2fc2 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -137,13 +137,17 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) return rt_rq->rq; } -void free_rt_sched_group(struct task_group *tg) +void unregister_rt_sched_group(struct task_group *tg) { - int i; - if (tg->rt_se) destroy_rt_bandwidth(&tg->rt_bandwidth); +} + +void free_rt_sched_group(struct task_group *tg) +{ + int i; + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -250,6 +254,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return &rq->rt; } +void unregister_rt_sched_group(struct task_group *tg) { } + void free_rt_sched_group(struct task_group *tg) { } int alloc_rt_sched_group(struct task_group *tg, struct task_group 
*parent) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7f1612d26c18..0e66749486e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -488,6 +488,7 @@ extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); +extern void unregister_rt_sched_group(struct task_group *tg); extern void free_rt_sched_group(struct task_group *tg); extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, @@ -503,7 +504,7 @@ extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_online_group(struct task_group *tg, struct task_group *parent); extern void sched_destroy_group(struct task_group *tg); -extern void sched_offline_group(struct task_group *tg); +extern void sched_release_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 643d412ac623..96b4e7810426 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1159,13 +1159,28 @@ static void posix_cpu_timers_work(struct callback_head *work) } /* + * Clear existing posix CPU timers task work. + */ +void clear_posix_cputimers_work(struct task_struct *p) +{ + /* + * A copied work entry from the old task is not meaningful, clear it. + * N.B. init_task_work will not do this. + */ + memset(&p->posix_cputimers_work.work, 0, + sizeof(p->posix_cputimers_work.work)); + init_task_work(&p->posix_cputimers_work.work, + posix_cpu_timers_work); + p->posix_cputimers_work.scheduled = false; +} + +/* * Initialize posix CPU timers task work in init task. Out of line to * keep the callback static and to avoid header recursion hell. 
*/ void __init posix_cputimers_init_work(void) { - init_task_work(&current->posix_cputimers_work.work, - posix_cpu_timers_work); + clear_posix_cputimers_work(current); } /* |