From 6015b1aca1a233379625385feb01dd014aca60b5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 14 Mar 2023 19:32:38 -0700 Subject: sched_getaffinity: don't assume 'cpumask_size()' is fully initialized The getaffinity() system call uses 'cpumask_size()' to decide how big the CPU mask is - so far so good. It is indeed the allocation size of a cpumask. But the code also assumes that the whole allocation is initialized without actually doing so itself. That's wrong, because we might have fixed-size allocations (making copying and clearing more efficient), but not all of it is then necessarily used if 'nr_cpu_ids' is smaller. Having checked other users of 'cpumask_size()', they all seem to be ok, either using it purely for the allocation size, or explicitly zeroing the cpumask before using the size in bytes to copy it. See for example the ublk_ctrl_get_queue_affinity() function that uses the proper 'zalloc_cpumask_var()' to make sure that the whole mask is cleared, whether the storage is on the stack or if it was an external allocation. Fix this by just zeroing the allocation before using it. Do the same for the compat version of sched_getaffinity(), which had the same logic. Also, for consistency, make sched_getaffinity() use 'cpumask_bits()' to access the bits. For a cpumask_var_t, it ends up being a pointer to the same data either way, but it's just a good idea to treat it like you would a 'cpumask_t'. The compat case already did that. Reported-by: Ryan Roberts Link: https://lore.kernel.org/lkml/7d026744-6bd6-6827-0471-b5e8eae0be3f@arm.com/ Cc: Yury Norov Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index af017e038b48..488655f2319f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8414,14 +8414,14 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, if (len & (sizeof(unsigned long)-1)) return -EINVAL; - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; ret = sched_getaffinity(pid, mask); if (ret == 0) { unsigned int retlen = min(len, cpumask_size()); - if (copy_to_user(user_mask_ptr, mask, retlen)) + if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen)) ret = -EFAULT; else ret = retlen; -- cgit From a53ce18cacb477dd0513c607f187d16f0fa96f71 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 17 Mar 2023 17:08:10 +0100 Subject: sched/fair: Sanitize vruntime of entity being migrated Commit 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed") fixes an overflowing bug, but ignore a case that se->exec_start is reset after a migration. For fixing this case, we delay the reset of se->exec_start after placing the entity which se->exec_start to detect long sleeping task. In order to take into account a possible divergence between the clock_task of 2 rqs, we increase the threshold to around 104 days. 
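[ Editor's note: the "~104 days" figure quoted above can be sanity-checked with a few lines of userspace C. This is an illustration only and not part of the patch; it assumes scale_load_down(NICE_0_LOAD) == 1024, the usual fixed-point value. ]

	#include <stdio.h>

	int main(void)
	{
		/* cutoff used in the patch: 2^63 / scale_load_down(NICE_0_LOAD) */
		unsigned long long cutoff_ns = (1ULL << 63) / 1024;
		double days = (double)cutoff_ns / 1e9 / 3600 / 24;

		printf("%llu ns ~= %.1f days\n", cutoff_ns, days);
		return 0;
	}

This prints 9007199254740992 ns ~= 104.2 days, matching the threshold described above.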
Fixes: 829c1651e9c4 ("sched/fair: sanitize vruntime of entity being placed") Originally-by: Zhang Qiao Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Zhang Qiao Link: https://lore.kernel.org/r/20230317160810.107988-1-vincent.guittot@linaro.org --- kernel/sched/core.c | 3 +++ kernel/sched/fair.c | 55 ++++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 11 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 488655f2319f..0d18c3969f90 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2084,6 +2084,9 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) void activate_task(struct rq *rq, struct task_struct *p, int flags) { + if (task_on_rq_migrating(p)) + flags |= ENQUEUE_MIGRATED; + enqueue_task(rq, p, flags); p->on_rq = TASK_ON_RQ_QUEUED; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a1b1f855b96..6986ea31c984 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4648,11 +4648,33 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) #endif } +static inline bool entity_is_long_sleeper(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + u64 sleep_time; + + if (se->exec_start == 0) + return false; + + cfs_rq = cfs_rq_of(se); + + sleep_time = rq_clock_task(rq_of(cfs_rq)); + + /* Happen while migrating because of clock task divergence */ + if (sleep_time <= se->exec_start) + return false; + + sleep_time -= se->exec_start; + if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) + return true; + + return false; +} + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime = cfs_rq->min_vruntime; - u64 sleep_time; /* * The 'current' period is already promised to the current tasks, @@ -4684,13 +4706,24 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) /* * Pull vruntime of the entity being placed to the base level of - * cfs_rq, to prevent boosting it if placed backwards. If the entity - * slept for a long time, don't even try to compare its vruntime with - * the base as it may be too far off and the comparison may get - * inversed due to s64 overflow. - */ - sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; - if ((s64)sleep_time > 60LL * NSEC_PER_SEC) + * cfs_rq, to prevent boosting it if placed backwards. + * However, min_vruntime can advance much faster than real time, with + * the extreme being when an entity with the minimal weight always runs + * on the cfs_rq. If the waking entity slept for a long time, its + * vruntime difference from min_vruntime may overflow s64 and their + * comparison may get inversed, so ignore the entity's original + * vruntime in that case. + * The maximal vruntime speedup is given by the ratio of normal to + * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES. + * When placing a migrated waking entity, its exec_start has been set + * from a different rq. In order to take into account a possible + * divergence between new and prev rq's clocks task because of irq and + * stolen time, we take an additional margin. + * So, cutting off on the sleep time of + * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days + * should be safe. 
+ */ + if (entity_is_long_sleeper(se)) se->vruntime = vruntime; else se->vruntime = max_vruntime(se->vruntime, vruntime); @@ -4770,6 +4803,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); + /* Entity has migrated, no longer consider this task hot */ + if (flags & ENQUEUE_MIGRATED) + se->exec_start = 0; check_schedstat_required(); update_stats_enqueue_fair(cfs_rq, se, flags); @@ -7657,9 +7693,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) /* Tell new CPU we are migrated */ se->avg.last_update_time = 0; - /* We have migrated, no longer consider this task hot */ - se->exec_start = 0; - update_scan_period(p, new_cpu); } -- cgit From 530bfad1d53d103f98cec66a3e491a36d397884d Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Thu, 16 Mar 2023 16:18:06 +0800 Subject: sched/core: Avoid selecting the task that is throttled to run when core-sched enable When {rt, cfs}_rq or dl task is throttled, since cookied tasks are not dequeued from the core tree, So sched_core_find() and sched_core_next() may return throttled task, which may cause throttled task to run on the CPU. So we add checks in sched_core_find() and sched_core_next() to make sure that the return is a runnable task that is not throttled. Co-developed-by: Cruz Zhao Signed-off-by: Cruz Zhao Signed-off-by: Hao Jia Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230316081806.69544-1-jiahao.os@bytedance.com --- kernel/sched/core.c | 60 +++++++++++++++++++++++++++++++++---------------- kernel/sched/deadline.c | 10 +++++++++ kernel/sched/fair.c | 16 +++++++++++++ kernel/sched/rt.c | 19 ++++++++++++++++ kernel/sched/sched.h | 4 ++++ 5 files changed, 90 insertions(+), 19 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 488655f2319f..9140a33bcc97 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -261,36 +261,51 @@ void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) resched_curr(rq); } -/* - * Find left-most (aka, highest priority) task matching @cookie. - */ -static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) +static int sched_task_is_throttled(struct task_struct *p, int cpu) { - struct rb_node *node; - - node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); - /* - * The idle task always matches any cookie! - */ - if (!node) - return idle_sched_class.pick_task(rq); + if (p->sched_class->task_is_throttled) + return p->sched_class->task_is_throttled(p, cpu); - return __node_2_sc(node); + return 0; } static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie) { struct rb_node *node = &p->core_node; + int cpu = task_cpu(p); + + do { + node = rb_next(node); + if (!node) + return NULL; + + p = __node_2_sc(node); + if (p->core_cookie != cookie) + return NULL; + + } while (sched_task_is_throttled(p, cpu)); + + return p; +} + +/* + * Find left-most (aka, highest priority) and unthrottled task matching @cookie. + * If no suitable task is found, NULL will be returned. 
+ */ +static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) +{ + struct task_struct *p; + struct rb_node *node; - node = rb_next(node); + node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); if (!node) return NULL; - p = container_of(node, struct task_struct, core_node); - if (p->core_cookie != cookie) - return NULL; + p = __node_2_sc(node); + if (!sched_task_is_throttled(p, rq->cpu)) + return p; - return p; + return sched_core_next(p, cookie); } /* @@ -6236,7 +6251,7 @@ static bool try_steal_cookie(int this, int that) goto unlock; p = sched_core_find(src, cookie); - if (p == src->idle) + if (!p) goto unlock; do { @@ -6248,6 +6263,13 @@ static bool try_steal_cookie(int this, int that) if (p->core_occupation > dst->idle->core_occupation) goto next; + /* + * sched_core_find() and sched_core_next() will ensure that task @p + * is not throttled now, we also need to check whether the runqueue + * of the destination CPU is being throttled. + */ + if (sched_task_is_throttled(p, this)) + goto next; deactivate_task(src, p, 0); set_task_cpu(p, this); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 71b24371a6f7..4cc7e1ca066d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2704,6 +2704,13 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, #endif } +#ifdef CONFIG_SCHED_CORE +static int task_is_throttled_dl(struct task_struct *p, int cpu) +{ + return p->dl.dl_throttled; +} +#endif + DEFINE_SCHED_CLASS(dl) = { .enqueue_task = enqueue_task_dl, @@ -2736,6 +2743,9 @@ DEFINE_SCHED_CLASS(dl) = { .switched_to = switched_to_dl, .update_curr = update_curr_dl, +#ifdef CONFIG_SCHED_CORE + .task_is_throttled = task_is_throttled_dl, +#endif }; /* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a1b1f855b96..b572367249f0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11933,6 +11933,18 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, return delta > 0; } + +static int task_is_throttled_fair(struct task_struct *p, int cpu) +{ + struct cfs_rq *cfs_rq; + +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq = task_group(p)->cfs_rq[cpu]; +#else + cfs_rq = &cpu_rq(cpu)->cfs; +#endif + return throttled_hierarchy(cfs_rq); +} #else static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} #endif @@ -12559,6 +12571,10 @@ DEFINE_SCHED_CLASS(fair) = { .task_change_group = task_change_group_fair, #endif +#ifdef CONFIG_SCHED_CORE + .task_is_throttled = task_is_throttled_fair, +#endif + #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0a11f44adee5..9d67dfbf1000 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2677,6 +2677,21 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } +#ifdef CONFIG_SCHED_CORE +static int task_is_throttled_rt(struct task_struct *p, int cpu) +{ + struct rt_rq *rt_rq; + +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq = task_group(p)->rt_rq[cpu]; +#else + rt_rq = &cpu_rq(cpu)->rt; +#endif + + return rt_rq_throttled(rt_rq); +} +#endif + DEFINE_SCHED_CLASS(rt) = { .enqueue_task = enqueue_task_rt, @@ -2710,6 +2725,10 @@ DEFINE_SCHED_CLASS(rt) = { .update_curr = update_curr_rt, +#ifdef CONFIG_SCHED_CORE + .task_is_throttled = task_is_throttled_rt, +#endif + #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 
index 3e8df6d31c1e..060616944d7a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2224,6 +2224,10 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p); #endif + +#ifdef CONFIG_SCHED_CORE + int (*task_is_throttled)(struct task_struct *p, int cpu); +#endif }; static inline void put_prev_task(struct rq *rq, struct task_struct *prev) -- cgit From eff6c8ce8d4d7faef75f66614dd20bb50595d261 Mon Sep 17 00:00:00 2001 From: wuchi Date: Tue, 21 Mar 2023 14:44:59 +0800 Subject: sched/core: Reduce cost of sched_move_task when config autogroup Some sched_move_task calls are useless because that task_struct->sched_task_group maybe not changed (equals task_group of cpu_cgroup) when system enable autogroup. So do some checks in sched_move_task. sched_move_task eg: task A belongs to cpu_cgroup0 and autogroup0, it will always belong to cpu_cgroup0 when do_exit. So there is no need to do {de|en}queue. The call graph is as follow. do_exit sched_autogroup_exit_task sched_move_task dequeue_task sched_change_group A.sched_task_group = sched_get_task_group (=cpu_cgroup0) enqueue_task Performance results: =========================== 1. env cpu: bogomips=4600.00 kernel: 6.3.0-rc3 cpu_cgroup: 6:cpu,cpuacct:/user.slice 2. cmds do_exit script: for i in {0..10000}; do sleep 0 & done wait Run the above script, then use the following bpftrace cmd to get the cost of sched_move_task: bpftrace -e 'k:sched_move_task { @ts[tid] = nsecs; } kr:sched_move_task /@ts[tid]/ { @ns += nsecs - @ts[tid]; delete(@ts[tid]); }' 3. cost time(ns): without patch: 43528033 with patch: 18541416 diff:-24986617 -57.4% As the result show, the patch will save 57.4% in the scenario. Signed-off-by: wuchi Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230321064459.39421-1-wuchi.zero@gmail.com --- kernel/sched/core.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9140a33bcc97..5ddd9610be56 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10351,7 +10351,7 @@ void sched_release_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -static void sched_change_group(struct task_struct *tsk) +static struct task_group *sched_get_task_group(struct task_struct *tsk) { struct task_group *tg; @@ -10363,7 +10363,13 @@ static void sched_change_group(struct task_struct *tsk) tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); - tsk->sched_task_group = tg; + + return tg; +} + +static void sched_change_group(struct task_struct *tsk, struct task_group *group) +{ + tsk->sched_task_group = group; #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) @@ -10384,10 +10390,19 @@ void sched_move_task(struct task_struct *tsk) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct task_group *group; struct rq_flags rf; struct rq *rq; rq = task_rq_lock(tsk, &rf); + /* + * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous + * group changes. 
+ */ + group = sched_get_task_group(tsk); + if (group == tsk->sched_task_group) + goto unlock; + update_rq_clock(rq); running = task_current(rq, tsk); @@ -10398,7 +10413,7 @@ void sched_move_task(struct task_struct *tsk) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk); + sched_change_group(tsk, group); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -10412,6 +10427,7 @@ void sched_move_task(struct task_struct *tsk) resched_curr(rq); } +unlock: task_rq_unlock(rq, tsk, &rf); } -- cgit From e3ff7c609f39671d1aaff4fb4a8594e14f3e03f8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 24 Feb 2023 08:50:00 -0800 Subject: livepatch,sched: Add livepatch task switching to cond_resched() There have been reports [1][2] of live patches failing to complete within a reasonable amount of time due to CPU-bound kthreads. Fix it by patching tasks in cond_resched(). There are four different flavors of cond_resched(), depending on the kernel configuration. Hook into all of them. A more elegant solution might be to use a preempt notifier. However, non-ORC unwinders can't unwind a preempted task reliably. [1] https://lore.kernel.org/lkml/20220507174628.2086373-1-song@kernel.org/ [2] https://lkml.kernel.org/lkml/20230120-vhost-klp-switching-v1-0-7c2b65519c43@kernel.org Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Petr Mladek Tested-by: Seth Forshee (DigitalOcean) Link: https://lore.kernel.org/r/4ae981466b7814ec221014fc2554b2f86f3fb70b.1677257135.git.jpoimboe@kernel.org --- include/linux/livepatch.h | 1 + include/linux/livepatch_sched.h | 29 +++++++++++ include/linux/sched.h | 20 ++++++-- kernel/livepatch/core.c | 1 + kernel/livepatch/transition.c | 107 ++++++++++++++++++++++++++++++++++------ kernel/sched/core.c | 64 +++++++++++++++++++++--- 6 files changed, 194 insertions(+), 28 deletions(-) create mode 100644 include/linux/livepatch_sched.h (limited to 'kernel/sched/core.c') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 293e29960c6e..9b9b38e89563 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -13,6 +13,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_LIVEPATCH) diff --git a/include/linux/livepatch_sched.h b/include/linux/livepatch_sched.h new file mode 100644 index 000000000000..013794fb5da0 --- /dev/null +++ b/include/linux/livepatch_sched.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_LIVEPATCH_SCHED_H_ +#define _LINUX_LIVEPATCH_SCHED_H_ + +#include +#include + +#ifdef CONFIG_LIVEPATCH + +void __klp_sched_try_switch(void); + +#if !defined(CONFIG_PREEMPT_DYNAMIC) || !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) + +DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key); + +static __always_inline void klp_sched_try_switch(void) +{ + if (static_branch_unlikely(&klp_sched_try_switch_key)) + __klp_sched_try_switch(); +} + +#endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ + +#else /* !CONFIG_LIVEPATCH */ +static inline void klp_sched_try_switch(void) {} +static inline void __klp_sched_try_switch(void) {} +#endif /* CONFIG_LIVEPATCH */ + +#endif /* _LINUX_LIVEPATCH_SCHED_H_ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 63d242164b1a..6d654eb4cabd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -36,6 +36,7 @@ #include #include #include +#include #include /* task_struct member predeclarations (sorted alphabetically): */ @@ -2070,6 +2071,9 @@ extern int __cond_resched(void); #if 
defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +void sched_dynamic_klp_enable(void); +void sched_dynamic_klp_disable(void); + DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) @@ -2078,6 +2082,7 @@ static __always_inline int _cond_resched(void) } #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) + extern int dynamic_cond_resched(void); static __always_inline int _cond_resched(void) @@ -2085,20 +2090,25 @@ static __always_inline int _cond_resched(void) return dynamic_cond_resched(); } -#else +#else /* !CONFIG_PREEMPTION */ static inline int _cond_resched(void) { + klp_sched_try_switch(); return __cond_resched(); } -#endif /* CONFIG_PREEMPT_DYNAMIC */ +#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ -#else +#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ -static inline int _cond_resched(void) { return 0; } +static inline int _cond_resched(void) +{ + klp_sched_try_switch(); + return 0; +} -#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */ +#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ __might_resched(__FILE__, __LINE__, 0); \ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 4bd2d5e10f20..eea7c8ec6e05 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -33,6 +33,7 @@ * * - klp_ftrace_handler() * - klp_update_patch_state() + * - __klp_sched_try_switch() */ DEFINE_MUTEX(klp_mutex); diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 824e2e3f07dd..e9fd83a02228 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -9,6 +9,7 @@ #include #include +#include #include "core.h" #include "patch.h" #include "transition.h" @@ -26,6 +27,25 @@ static int klp_target_state = KLP_UNDEFINED; static unsigned int klp_signals_cnt; +/* + * When a livepatch is in progress, enable klp stack checking in + * cond_resched(). This helps CPU-bound kthreads get patched. + */ +#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) + +#define klp_cond_resched_enable() sched_dynamic_klp_enable() +#define klp_cond_resched_disable() sched_dynamic_klp_disable() + +#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ + +DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key); +EXPORT_SYMBOL(klp_sched_try_switch_key); + +#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key) +#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key) + +#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ + /* * This work can be performed periodically to finish patching or unpatching any * "straggler" tasks which failed to transition in the first attempt. @@ -174,8 +194,8 @@ void klp_update_patch_state(struct task_struct *task) * barrier (smp_rmb) for two cases: * * 1) Enforce the order of the TIF_PATCH_PENDING read and the - * klp_target_state read. The corresponding write barrier is in - * klp_init_transition(). + * klp_target_state read. The corresponding write barriers are in + * klp_init_transition() and klp_reverse_transition(). 
* * 2) Enforce the order of the TIF_PATCH_PENDING read and a future read * of func->transition, if klp_ftrace_handler() is called later on @@ -343,6 +363,44 @@ static bool klp_try_switch_task(struct task_struct *task) return !ret; } +void __klp_sched_try_switch(void) +{ + if (likely(!klp_patch_pending(current))) + return; + + /* + * This function is called from cond_resched() which is called in many + * places throughout the kernel. Using the klp_mutex here might + * deadlock. + * + * Instead, disable preemption to prevent racing with other callers of + * klp_try_switch_task(). Thanks to task_call_func() they won't be + * able to switch this task while it's running. + */ + preempt_disable(); + + /* + * Make sure current didn't get patched between the above check and + * preempt_disable(). + */ + if (unlikely(!klp_patch_pending(current))) + goto out; + + /* + * Enforce the order of the TIF_PATCH_PENDING read above and the + * klp_target_state read in klp_try_switch_task(). The corresponding + * write barriers are in klp_init_transition() and + * klp_reverse_transition(). + */ + smp_rmb(); + + klp_try_switch_task(current); + +out: + preempt_enable(); +} +EXPORT_SYMBOL(__klp_sched_try_switch); + /* * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. * Kthreads with TIF_PATCH_PENDING set are woken up. @@ -449,7 +507,8 @@ void klp_try_complete_transition(void) return; } - /* we're done, now cleanup the data structures */ + /* Done! Now cleanup the data structures. */ + klp_cond_resched_disable(); patch = klp_transition_patch; klp_complete_transition(); @@ -501,6 +560,8 @@ void klp_start_transition(void) set_tsk_thread_flag(task, TIF_PATCH_PENDING); } + klp_cond_resched_enable(); + klp_signals_cnt = 0; } @@ -556,8 +617,9 @@ void klp_init_transition(struct klp_patch *patch, int state) * see a func in transition with a task->patch_state of KLP_UNDEFINED. * * Also enforce the order of the klp_target_state write and future - * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't - * set a task->patch_state to KLP_UNDEFINED. + * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and + * __klp_sched_try_switch() don't set a task->patch_state to + * KLP_UNDEFINED. */ smp_wmb(); @@ -593,14 +655,10 @@ void klp_reverse_transition(void) klp_target_state == KLP_PATCHED ? "patching to unpatching" : "unpatching to patching"); - klp_transition_patch->enabled = !klp_transition_patch->enabled; - - klp_target_state = !klp_target_state; - /* * Clear all TIF_PATCH_PENDING flags to prevent races caused by - * klp_update_patch_state() running in parallel with - * klp_start_transition(). + * klp_update_patch_state() or __klp_sched_try_switch() running in + * parallel with the reverse transition. */ read_lock(&tasklist_lock); for_each_process_thread(g, task) @@ -610,9 +668,28 @@ void klp_reverse_transition(void) for_each_possible_cpu(cpu) clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING); - /* Let any remaining calls to klp_update_patch_state() complete */ + /* + * Make sure all existing invocations of klp_update_patch_state() and + * __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before + * starting the reverse transition. + */ klp_synchronize_transition(); + /* + * All patching has stopped, now re-initialize the global variables to + * prepare for the reverse transition. 
+ */ + klp_transition_patch->enabled = !klp_transition_patch->enabled; + klp_target_state = !klp_target_state; + + /* + * Enforce the order of the klp_target_state write and the + * TIF_PATCH_PENDING writes in klp_start_transition() to ensure + * klp_update_patch_state() and __klp_sched_try_switch() don't set + * task->patch_state to the wrong value. + */ + smp_wmb(); + klp_start_transition(); } @@ -626,9 +703,9 @@ void klp_copy_process(struct task_struct *child) * the task flag up to date with the parent here. * * The operation is serialized against all klp_*_transition() - * operations by the tasklist_lock. The only exception is - * klp_update_patch_state(current), but we cannot race with - * that because we are current. + * operations by the tasklist_lock. The only exceptions are + * klp_update_patch_state(current) and __klp_sched_try_switch(), but we + * cannot race with them because we are current. */ if (test_tsk_thread_flag(current, TIF_PATCH_PENDING)) set_tsk_thread_flag(child, TIF_PATCH_PENDING); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ddd9610be56..b9616f153946 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8525,6 +8525,7 @@ EXPORT_STATIC_CALL_TRAMP(might_resched); static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); int __sched dynamic_cond_resched(void) { + klp_sched_try_switch(); if (!static_branch_unlikely(&sk_dynamic_cond_resched)) return 0; return __cond_resched(); @@ -8673,13 +8674,17 @@ int sched_dynamic_mode(const char *str) #error "Unsupported PREEMPT_DYNAMIC mechanism" #endif -void sched_dynamic_update(int mode) +DEFINE_MUTEX(sched_dynamic_mutex); +static bool klp_override; + +static void __sched_dynamic_update(int mode) { /* * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in * the ZERO state, which is invalid. 
*/ - preempt_dynamic_enable(cond_resched); + if (!klp_override) + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -8687,36 +8692,79 @@ void sched_dynamic_update(int mode) switch (mode) { case preempt_dynamic_none: - preempt_dynamic_enable(cond_resched); + if (!klp_override) + preempt_dynamic_enable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); - pr_info("Dynamic Preempt: none\n"); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: none\n"); break; case preempt_dynamic_voluntary: - preempt_dynamic_enable(cond_resched); + if (!klp_override) + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); - pr_info("Dynamic Preempt: voluntary\n"); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: voluntary\n"); break; case preempt_dynamic_full: - preempt_dynamic_disable(cond_resched); + if (!klp_override) + preempt_dynamic_disable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); - pr_info("Dynamic Preempt: full\n"); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: full\n"); break; } preempt_dynamic_mode = mode; } +void sched_dynamic_update(int mode) +{ + mutex_lock(&sched_dynamic_mutex); + __sched_dynamic_update(mode); + mutex_unlock(&sched_dynamic_mutex); +} + +#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL + +static int klp_cond_resched(void) +{ + __klp_sched_try_switch(); + return __cond_resched(); +} + +void sched_dynamic_klp_enable(void) +{ + mutex_lock(&sched_dynamic_mutex); + + klp_override = true; + static_call_update(cond_resched, klp_cond_resched); + + mutex_unlock(&sched_dynamic_mutex); +} + +void sched_dynamic_klp_disable(void) +{ + mutex_lock(&sched_dynamic_mutex); + + klp_override = false; + __sched_dynamic_update(preempt_dynamic_mode); + + mutex_unlock(&sched_dynamic_mutex); +} + +#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ + static int __init setup_preempt_mode(char *str) { int mode = sched_dynamic_mode(str); -- cgit From cc9cb0a71725aa8dd8d8f534a9b562bbf7981f75 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 7 Mar 2023 14:35:53 +0000 Subject: sched, smp: Trace IPIs sent via send_call_function_single_ipi() send_call_function_single_ipi() is the thing that sends IPIs at the bottom of smp_call_function*() via either generic_exec_single() or smp_call_function_many_cond(). Give it an IPI-related tracepoint. Note that this ends up tracing any IPI sent via __smp_call_single_queue(), which covers __ttwu_queue_wakelist() and irq_work_queue_on() "for free". 
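[ Editor's note: since the tracepoint is exported below via EXPORT_TRACEPOINT_SYMBOL_GPL(), an external module can attach a probe to it. The sketch that follows is illustrative only and not part of the patch; register_trace_ipi_send_cpumask() and the probe prototype follow the usual TRACE_EVENT() conventions (a leading private-data pointer, then the TP_PROTO arguments). ]

	#include <linux/module.h>
	#include <linux/cpumask.h>
	#include <trace/events/ipi.h>

	/* Probe: log each IPI with its destination mask, call site and callback. */
	static void probe_ipi_send_cpumask(void *data, const struct cpumask *mask,
					   unsigned long callsite, void *callback)
	{
		pr_info("IPI to %*pbl from %pS (callback %pS)\n",
			cpumask_pr_args(mask), (void *)callsite, callback);
	}

	static int __init ipi_probe_init(void)
	{
		return register_trace_ipi_send_cpumask(probe_ipi_send_cpumask, NULL);
	}

	static void __exit ipi_probe_exit(void)
	{
		unregister_trace_ipi_send_cpumask(probe_ipi_send_cpumask, NULL);
		tracepoint_synchronize_unregister();
	}

	module_init(ipi_probe_init);
	module_exit(ipi_probe_exit);
	MODULE_LICENSE("GPL");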
Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (Google) Acked-by: Ingo Molnar Link: https://lore.kernel.org/r/20230307143558.294354-3-vschneid@redhat.com --- arch/arm/kernel/smp.c | 1 - arch/arm64/kernel/smp.c | 1 - kernel/sched/core.c | 9 +++++++-- kernel/smp.c | 2 ++ 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 0b8c25763adc..5edf09237b2f 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -48,7 +48,6 @@ #include #include -#define CREATE_TRACE_POINTS #include /* diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 4e8327264255..438c16fc4463 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -51,7 +51,6 @@ #include #include -#define CREATE_TRACE_POINTS #include DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 488655f2319f..c26a2cd99ec7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -80,6 +80,7 @@ #define CREATE_TRACE_POINTS #include #include +#include #undef CREATE_TRACE_POINTS #include "sched.h" @@ -95,6 +96,8 @@ #include "../../io_uring/io-wq.h" #include "../smpboot.h" +EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); + /* * Export tracepoints that act as a bare tracehook (ie: have no trace event * associated with them) to allow external modules to probe them. @@ -3830,10 +3833,12 @@ void send_call_function_single_ipi(int cpu) { struct rq *rq = cpu_rq(cpu); - if (!set_nr_if_polling(rq->idle)) + if (!set_nr_if_polling(rq->idle)) { + trace_ipi_send_cpumask(cpumask_of(cpu), _RET_IP_, NULL); arch_send_call_function_single_ipi(cpu); - else + } else { trace_sched_wake_idle_without_ipi(cpu); + } } /* diff --git a/kernel/smp.c b/kernel/smp.c index 298ba7570621..770e879a7274 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -26,6 +26,8 @@ #include #include +#include + #include "smpboot.h" #include "sched/smp.h" -- cgit From 68f4ff04dbada18dad79659c266a8e5e29e458cd Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 7 Mar 2023 14:35:58 +0000 Subject: sched, smp: Trace smp callback causing an IPI Context ======= The newly-introduced ipi_send_cpumask tracepoint has a "callback" parameter which so far has only been fed with NULL. While CSD_TYPE_SYNC/ASYNC and CSD_TYPE_IRQ_WORK share a similar backing struct layout (meaning their callback func can be accessed without caring about the actual CSD type), CSD_TYPE_TTWU doesn't even have a function attached to its struct. This means we need to check the type of a CSD before eventually dereferencing its associated callback. This isn't as trivial as it sounds: the CSD type is stored in __call_single_node.u_flags, which get cleared right before the callback is executed via csd_unlock(). This implies checking the CSD type before it is enqueued on the call_single_queue, as the target CPU's queue can be flushed before we get to sending an IPI. Furthermore, send_call_function_single_ipi() only has a CPU parameter, and would need to have an additional argument to trickle down the invoked function. This is somewhat silly, as the extra argument will always be pushed down to the function even when nothing is being traced, which is unnecessary overhead. Changes ======= send_call_function_single_ipi() is only used by smp.c, and is defined in sched/core.c as it contains scheduler-specific ops (set_nr_if_polling() of a CPU's idle task). 
Split it into two parts: the scheduler bits remain in sched/core.c, and the actual IPI emission is moved into smp.c. This lets us define an __always_inline helper function that can take the related callback as parameter without creating useless register pressure in the non-traced path which only gains a (disabled) static branch. Do the same thing for the multi IPI case. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230307143558.294354-8-vschneid@redhat.com --- kernel/sched/core.c | 18 +++++++++++------- kernel/sched/smp.h | 2 +- kernel/smp.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 53 insertions(+), 16 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c26a2cd99ec7..b0a48cfc0a22 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3829,16 +3829,20 @@ void sched_ttwu_pending(void *arg) rq_unlock_irqrestore(rq, &rf); } -void send_call_function_single_ipi(int cpu) +/* + * Prepare the scene for sending an IPI for a remote smp_call + * + * Returns true if the caller can proceed with sending the IPI. + * Returns false otherwise. + */ +bool call_function_single_prep_ipi(int cpu) { - struct rq *rq = cpu_rq(cpu); - - if (!set_nr_if_polling(rq->idle)) { - trace_ipi_send_cpumask(cpumask_of(cpu), _RET_IP_, NULL); - arch_send_call_function_single_ipi(cpu); - } else { + if (set_nr_if_polling(cpu_rq(cpu)->idle)) { trace_sched_wake_idle_without_ipi(cpu); + return false; } + + return true; } /* diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 2eb23dd0f285..21ac44428bb0 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -6,7 +6,7 @@ extern void sched_ttwu_pending(void *arg); -extern void send_call_function_single_ipi(int cpu); +extern bool call_function_single_prep_ipi(int cpu); #ifdef CONFIG_SMP extern void flush_smp_call_function_queue(void); diff --git a/kernel/smp.c b/kernel/smp.c index 6bbfabbe62fc..37e9613a0889 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -104,9 +104,18 @@ void __init call_function_init(void) } static __always_inline void -send_call_function_ipi_mask(struct cpumask *mask) +send_call_function_single_ipi(int cpu, smp_call_func_t func) { - trace_ipi_send_cpumask(mask, _RET_IP_, NULL); + if (call_function_single_prep_ipi(cpu)) { + trace_ipi_send_cpumask(cpumask_of(cpu), _RET_IP_, func); + arch_send_call_function_single_ipi(cpu); + } +} + +static __always_inline void +send_call_function_ipi_mask(struct cpumask *mask, smp_call_func_t func) +{ + trace_ipi_send_cpumask(mask, _RET_IP_, func); arch_send_call_function_ipi_mask(mask); } @@ -307,9 +316,8 @@ static __always_inline void csd_unlock(struct __call_single_data *csd) smp_store_release(&csd->node.u_flags, 0); } -static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); - -void __smp_call_single_queue(int cpu, struct llist_node *node) +static __always_inline void +raw_smp_call_single_queue(int cpu, struct llist_node *node, smp_call_func_t func) { /* * The list addition should be visible to the target CPU when it pops @@ -324,7 +332,32 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) * equipped to do the right thing... 
*/ if (llist_add(node, &per_cpu(call_single_queue, cpu))) - send_call_function_single_ipi(cpu); + send_call_function_single_ipi(cpu, func); +} + +static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); + +void __smp_call_single_queue(int cpu, struct llist_node *node) +{ + /* + * We have to check the type of the CSD before queueing it, because + * once queued it can have its flags cleared by + * flush_smp_call_function_queue() + * even if we haven't sent the smp_call IPI yet (e.g. the stopper + * executes migration_cpu_stop() on the remote CPU). + */ + if (trace_ipi_send_cpumask_enabled()) { + call_single_data_t *csd; + smp_call_func_t func; + + csd = container_of(node, call_single_data_t, node.llist); + func = CSD_TYPE(csd) == CSD_TYPE_TTWU ? + sched_ttwu_pending : csd->func; + + raw_smp_call_single_queue(cpu, node, func); + } else { + raw_smp_call_single_queue(cpu, node, NULL); + } } /* @@ -768,9 +801,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask, * provided mask. */ if (nr_cpus == 1) - send_call_function_single_ipi(last_cpu); + send_call_function_single_ipi(last_cpu, func); else if (likely(nr_cpus > 1)) - send_call_function_ipi_mask(cfd->cpumask_ipi); + send_call_function_ipi_mask(cfd->cpumask_ipi, func); } if (run_local && (!cond_func || cond_func(this_cpu, info))) { -- cgit From 68e2d17c9eb311ab59aeb6d0c38aad8985fa2596 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Mar 2023 11:28:36 +0100 Subject: trace: Add trace_ipi_send_cpu() Because copying cpumasks around when targeting a single CPU is a bit daft... Tested-and-reviewed-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230322103004.GA571242%40hirez.programming.kicks-ass.net --- include/linux/smp.h | 6 +++--- include/trace/events/ipi.h | 22 ++++++++++++++++++++++ kernel/irq_work.c | 6 ++---- kernel/sched/core.c | 1 + kernel/smp.c | 4 ++-- 5 files changed, 30 insertions(+), 9 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/include/linux/smp.h b/include/linux/smp.h index c036a2228d8d..ed8f344ba627 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -130,9 +130,9 @@ extern void arch_smp_send_reschedule(int cpu); * scheduler_ipi() is inline so can't be passed as callback reason, but the * callsite IP should be sufficient for root-causing IPIs sent from here. 
*/ -#define smp_send_reschedule(cpu) ({ \ - trace_ipi_send_cpumask(cpumask_of(cpu), _RET_IP_, NULL); \ - arch_smp_send_reschedule(cpu); \ +#define smp_send_reschedule(cpu) ({ \ + trace_ipi_send_cpu(cpu, _RET_IP_, NULL); \ + arch_smp_send_reschedule(cpu); \ }) /* diff --git a/include/trace/events/ipi.h b/include/trace/events/ipi.h index b1125dc27682..3de9bfc982ce 100644 --- a/include/trace/events/ipi.h +++ b/include/trace/events/ipi.h @@ -35,6 +35,28 @@ TRACE_EVENT(ipi_raise, TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason) ); +TRACE_EVENT(ipi_send_cpu, + + TP_PROTO(const unsigned int cpu, unsigned long callsite, void *callback), + + TP_ARGS(cpu, callsite, callback), + + TP_STRUCT__entry( + __field(unsigned int, cpu) + __field(void *, callsite) + __field(void *, callback) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->callsite = (void *)callsite; + __entry->callback = callback; + ), + + TP_printk("cpu=%u callsite=%pS callback=%pS", + __entry->cpu, __entry->callsite, __entry->callback) +); + TRACE_EVENT(ipi_send_cpumask, TP_PROTO(const struct cpumask *cpumask, unsigned long callsite, void *callback), diff --git a/kernel/irq_work.c b/kernel/irq_work.c index c33e88e32a67..2f4fb336dda1 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -78,10 +78,8 @@ void __weak arch_irq_work_raise(void) static __always_inline void irq_work_raise(struct irq_work *work) { - if (trace_ipi_send_cpumask_enabled() && arch_irq_work_has_interrupt()) - trace_ipi_send_cpumask(cpumask_of(smp_processor_id()), - _RET_IP_, - work->func); + if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt()) + trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func); arch_irq_work_raise(); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b0a48cfc0a22..ad40755ddc11 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -96,6 +96,7 @@ #include "../../io_uring/io-wq.h" #include "../smpboot.h" +EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); /* diff --git a/kernel/smp.c b/kernel/smp.c index 37e9613a0889..43f0796ecdb2 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -107,7 +107,7 @@ static __always_inline void send_call_function_single_ipi(int cpu, smp_call_func_t func) { if (call_function_single_prep_ipi(cpu)) { - trace_ipi_send_cpumask(cpumask_of(cpu), _RET_IP_, func); + trace_ipi_send_cpu(cpu, _RET_IP_, func); arch_send_call_function_single_ipi(cpu); } } @@ -346,7 +346,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) * even if we haven't sent the smp_call IPI yet (e.g. the stopper * executes migration_cpu_stop() on the remote CPU). */ - if (trace_ipi_send_cpumask_enabled()) { + if (trace_ipi_send_cpu_enabled()) { call_single_data_t *csd; smp_call_func_t func; -- cgit From aa464ba9a1e444d5ef95bb63ee3b2ef26fc96ed7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 3 Feb 2023 17:18:34 +1000 Subject: lazy tlb: introduce lazy tlb mm refcount helper functions Add explicit _lazy_tlb annotated functions for lazy tlb mm refcounting. This makes the lazy tlb mm references more obvious, and allows the refcounting scheme to be modified in later changes. There is no functional change with this patch. 
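[ Editor's note: the point of the wrappers is that a later change can alter how lazy-tlb references are accounted without touching any call site; in this patch they simply forward to mmgrab()/mmdrop(). A minimal sketch of what such a follow-up could look like is shown below; the config symbol is hypothetical here and the snippet is a fragment of the same helpers, not part of this patch. ]

	/*
	 * Hypothetical follow-up (illustration only): skip the refcount when a
	 * different scheme guarantees the lazy mm stays valid. Call sites such
	 * as context_switch() and finish_task_switch() need no change.
	 */
	static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
	{
		if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))	/* hypothetical knob */
			mmgrab(mm);
	}

	static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
	{
		if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
			mmdrop(mm);
	}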
Link: https://lkml.kernel.org/r/20230203071837.1136453-3-npiggin@gmail.com Signed-off-by: Nicholas Piggin Acked-by: Linus Torvalds Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Michael Ellerman Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/mach-rpc/ecard.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/mm/book3s64/radix_tlb.c | 4 ++-- fs/exec.c | 2 +- include/linux/sched/mm.h | 16 ++++++++++++++++ kernel/cpu.c | 2 +- kernel/exit.c | 2 +- kernel/kthread.c | 12 ++++++++++-- kernel/sched/core.c | 15 ++++++++------- 9 files changed, 41 insertions(+), 16 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c index 53813f9464a2..c30df1097c52 100644 --- a/arch/arm/mach-rpc/ecard.c +++ b/arch/arm/mach-rpc/ecard.c @@ -253,7 +253,7 @@ static int ecard_init_mm(void) current->mm = mm; current->active_mm = mm; activate_mm(active_mm, mm); - mmdrop(active_mm); + mmdrop_lazy_tlb(active_mm); ecard_init_pgtables(mm); return 0; } diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 6b90f10a6c81..7db6b3faea65 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1611,7 +1611,7 @@ void start_secondary(void *unused) if (IS_ENABLED(CONFIG_PPC32)) setup_kup(); - mmgrab(&init_mm); + mmgrab_lazy_tlb(&init_mm); current->active_mm = &init_mm; smp_store_cpu_info(cpu); diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index e50bc5fc7ddf..ce804b7bf84e 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -797,10 +797,10 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush) if (current->active_mm == mm) { WARN_ON_ONCE(current->mm != NULL); /* Is a kernel thread and is using mm as the lazy tlb */ - mmgrab(&init_mm); + mmgrab_lazy_tlb(&init_mm); current->active_mm = &init_mm; switch_mm_irqs_off(mm, &init_mm, current); - mmdrop(mm); + mmdrop_lazy_tlb(mm); } /* diff --git a/fs/exec.c b/fs/exec.c index 7c44d0c65b1b..87cf3a2f0e9a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1034,7 +1034,7 @@ static int exec_mmap(struct mm_struct *mm) mmput(old_mm); return 0; } - mmdrop(active_mm); + mmdrop_lazy_tlb(active_mm); return 0; } diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2a243616f222..5376caf6fcf3 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -79,6 +79,22 @@ static inline void mmdrop_sched(struct mm_struct *mm) } #endif +/* Helpers for lazy TLB mm refcounting */ +static inline void mmgrab_lazy_tlb(struct mm_struct *mm) +{ + mmgrab(mm); +} + +static inline void mmdrop_lazy_tlb(struct mm_struct *mm) +{ + mmdrop(mm); +} + +static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm) +{ + mmdrop_sched(mm); +} + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. 
diff --git a/kernel/cpu.c b/kernel/cpu.c index 6c0a92ca6bb5..189895288d9d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -623,7 +623,7 @@ static int finish_cpu(unsigned int cpu) */ if (mm != &init_mm) idle->active_mm = &init_mm; - mmdrop(mm); + mmdrop_lazy_tlb(mm); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index f2afdb0add7c..86902cb5ab78 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -537,7 +537,7 @@ static void exit_mm(void) return; sync_mm_rss(mm); mmap_read_lock(mm); - mmgrab(mm); + mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); diff --git a/kernel/kthread.c b/kernel/kthread.c index 1f1b60f1a746..470708c205e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1415,6 +1415,11 @@ void kthread_use_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); + /* + * It is possible for mm to be the same as tsk->active_mm, but + * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), + * because these references are not equivalent. + */ mmgrab(mm); task_lock(tsk); @@ -1438,9 +1443,9 @@ void kthread_use_mm(struct mm_struct *mm) * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by - * mmdrop(). + * mmdrop_lazy_tlb(). */ - mmdrop(active_mm); + mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); @@ -1468,10 +1473,13 @@ void kthread_unuse_mm(struct mm_struct *mm) local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); + mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); + + mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d18c3969f90..143e46bd2a68 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5203,13 +5203,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) * rq->curr, before returning to userspace, so provide them here: * * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly - * provided by mmdrop(), + * provided by mmdrop_lazy_tlb(), * - a sync_core for SYNC_CORE. */ if (mm) { membarrier_mm_sync_core_before_usermode(mm); - mmdrop_sched(mm); + mmdrop_lazy_tlb_sched(mm); } + if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -5266,9 +5267,9 @@ context_switch(struct rq *rq, struct task_struct *prev, /* * kernel -> kernel lazy + transfer active - * user -> kernel lazy + mmgrab() active + * user -> kernel lazy + mmgrab_lazy_tlb() active * - * kernel -> user switch + mmdrop() active + * kernel -> user switch + mmdrop_lazy_tlb() active * user -> user switch */ if (!next->mm) { // to kernel @@ -5276,7 +5277,7 @@ context_switch(struct rq *rq, struct task_struct *prev, next->active_mm = prev->active_mm; if (prev->mm) // from user - mmgrab(prev->active_mm); + mmgrab_lazy_tlb(prev->active_mm); else prev->active_mm = NULL; } else { // to user @@ -5293,7 +5294,7 @@ context_switch(struct rq *rq, struct task_struct *prev, lru_gen_use_mm(next->mm); if (!prev->mm) { // from kernel - /* will mmdrop() in finish_task_switch(). */ + /* will mmdrop_lazy_tlb() in finish_task_switch(). 
*/ rq->prev_mm = prev->active_mm; prev->active_mm = NULL; } @@ -9935,7 +9936,7 @@ void __init sched_init(void) /* * The boot idle thread does lazy MMU switching as well: */ - mmgrab(&init_mm); + mmgrab_lazy_tlb(&init_mm); enter_lazy_tlb(&init_mm, current); /* -- cgit From 9b8e17813aeccc29c2f9f2e6e68997a6eac2d26d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 11 Apr 2023 22:26:41 -0700 Subject: sched/core: Make sched_dynamic_mutex static The sched_dynamic_mutex is only used within the file. Make it static. Fixes: e3ff7c609f39 ("livepatch,sched: Add livepatch task switching to cond_resched()") Reported-by: kernel test robot Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/oe-kbuild-all/202304062335.tNuUjgsl-lkp@intel.com/ --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b9616f153946..d861db8aa7ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8674,7 +8674,7 @@ int sched_dynamic_mode(const char *str) #error "Unsupported PREEMPT_DYNAMIC mechanism" #endif -DEFINE_MUTEX(sched_dynamic_mutex); +static DEFINE_MUTEX(sched_dynamic_mutex); static bool klp_override; static void __sched_dynamic_update(int mode) -- cgit From a3b2aeac9d154e5e15ddbf19de934c0c606b6acd Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sat, 8 Apr 2023 17:28:35 +0800 Subject: delayacct: track delays from IRQ/SOFTIRQ Delay accounting does not track the delay of IRQ/SOFTIRQ. While IRQ/SOFTIRQ could have obvious impact on some workloads productivity, such as when workloads are running on system which is busy handling network IRQ/SOFTIRQ. Get the delay of IRQ/SOFTIRQ could help users to reduce such delay. Such as setting interrupt affinity or task affinity, using kernel thread for NAPI etc. This is inspired by "sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure"[1]. Also fix some code indent problems of older code. 
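[ Editor's note: a userspace consumer of the new taskstats fields computes the average the same way getdelays.c does for the existing counters; delay totals are reported in nanoseconds. The helper below is an illustration only, using the field names added to struct taskstats in this patch. ]

	#include <linux/taskstats.h>

	/* Average IRQ/SOFTIRQ delay per occurrence, in milliseconds. */
	static double irq_delay_avg_ms(const struct taskstats *t)
	{
		if (!t->irq_count)
			return 0.0;
		return (double)t->irq_delay_total / 1000000.0 / t->irq_count;
	}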
And update tools/accounting/getdelays.c: / # ./getdelays -p 156 -di print delayacct stats ON printing IO accounting PID 156 CPU count real total virtual total delay total delay average 15 15836008 16218149 275700790 18.380ms IO count delay total delay average 0 0 0.000ms SWAP count delay total delay average 0 0 0.000ms RECLAIM count delay total delay average 0 0 0.000ms THRASHING count delay total delay average 0 0 0.000ms COMPACT count delay total delay average 0 0 0.000ms WPCOPY count delay total delay average 36 7586118 0.211ms IRQ count delay total delay average 42 929161 0.022ms [1] commit 52b1364ba0b1("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure") Link: https://lkml.kernel.org/r/202304081728353557233@zte.com.cn Signed-off-by: Yang Yang Cc: Jiang Xuexin Cc: wangyong Cc: junhua huang Cc: Balbir Singh Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Juri Lelli Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- Documentation/accounting/delay-accounting.rst | 7 +++++-- include/linux/delayacct.h | 15 ++++++++++++++ include/uapi/linux/taskstats.h | 6 +++++- kernel/delayacct.c | 14 +++++++++++++ kernel/sched/core.c | 1 + tools/accounting/getdelays.c | 30 ++++++++++++++++----------- 6 files changed, 58 insertions(+), 15 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index 79f537c9f160..f61c01fc376e 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -16,6 +16,7 @@ d) memory reclaim e) thrashing f) direct compact g) write-protect copy +h) IRQ/SOFTIRQ and makes these statistics available to userspace through the taskstats interface. @@ -49,7 +50,7 @@ this structure. See for a description of the fields pertaining to delay accounting. It will generally be in the form of counters returning the cumulative delay seen for cpu, sync block I/O, swapin, memory reclaim, thrash page -cache, direct compact, write-protect copy etc. +cache, direct compact, write-protect copy, IRQ/SOFTIRQ etc. 
Taking the difference of two successive readings of a given counter (say cpu_delay_total) for a task will give the delay @@ -118,7 +119,9 @@ Get sum of delays, since system boot, for all pids with tgid 5:: 0 0 0.000ms COMPACT count delay total delay average 0 0 0.000ms - WPCOPY count delay total delay average + WPCOPY count delay total delay average + 0 0 0.000ms + IRQ count delay total delay average 0 0 0.000ms Get IO accounting for pid 1, it works only with -p:: diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 0da97dba9ef8..6639f48dac36 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -48,10 +48,13 @@ struct task_delay_info { u64 wpcopy_start; u64 wpcopy_delay; /* wait for write-protect copy */ + u64 irq_delay; /* wait for IRQ/SOFTIRQ */ + u32 freepages_count; /* total count of memory reclaim */ u32 thrashing_count; /* total count of thrash waits */ u32 compact_count; /* total count of memory compact */ u32 wpcopy_count; /* total count of write-protect copy */ + u32 irq_count; /* total count of IRQ/SOFTIRQ */ }; #endif @@ -81,6 +84,7 @@ extern void __delayacct_compact_start(void); extern void __delayacct_compact_end(void); extern void __delayacct_wpcopy_start(void); extern void __delayacct_wpcopy_end(void); +extern void __delayacct_irq(struct task_struct *task, u32 delta); static inline void delayacct_tsk_init(struct task_struct *tsk) { @@ -215,6 +219,15 @@ static inline void delayacct_wpcopy_end(void) __delayacct_wpcopy_end(); } +static inline void delayacct_irq(struct task_struct *task, u32 delta) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (task->delays) + __delayacct_irq(task, delta); +} + #else static inline void delayacct_init(void) {} @@ -253,6 +266,8 @@ static inline void delayacct_wpcopy_start(void) {} static inline void delayacct_wpcopy_end(void) {} +static inline void delayacct_irq(struct task_struct *task, u32 delta) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index a7f5b11a8f1b..b50b2eb257a0 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 13 +#define TASKSTATS_VERSION 14 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -198,6 +198,10 @@ struct taskstats { /* v13: Delay waiting for write-protect copy */ __u64 wpcopy_count; __u64 wpcopy_delay_total; + + /* v14: Delay waiting for IRQ/SOFTIRQ */ + __u64 irq_count; + __u64 irq_delay_total; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index e39cb696cfbd..6f0c358e73d8 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -179,12 +179,15 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; + tmp = d->irq_delay_total + tsk->delays->irq_delay; + d->irq_delay_total = (tmp < d->irq_delay_total) ? 
0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; d->compact_count += tsk->delays->compact_count; d->wpcopy_count += tsk->delays->wpcopy_count; + d->irq_count += tsk->delays->irq_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -274,3 +277,14 @@ void __delayacct_wpcopy_end(void) &current->delays->wpcopy_delay, &current->delays->wpcopy_count); } + +void __delayacct_irq(struct task_struct *task, u32 delta) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&task->delays->lock, flags); + task->delays->irq_delay += delta; + task->delays->irq_count++; + raw_spin_unlock_irqrestore(&task->delays->lock, flags); +} + diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d18c3969f90..5473e831daf3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -704,6 +704,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; psi_account_irqtime(rq->curr, irq_delta); + delayacct_irq(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((&paravirt_steal_rq_enabled))) { diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 23a15d8f2bf4..1334214546d7 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -198,17 +198,19 @@ static void print_delayacct(struct taskstats *t) printf("\n\nCPU %15s%15s%15s%15s%15s\n" " %15llu%15llu%15llu%15llu%15.3fms\n" "IO %15s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "SWAP %15s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "RECLAIM %12s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "THRASHING%12s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "COMPACT %12s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "WPCOPY %12s%15s%15s\n" - " %15llu%15llu%15.3fms\n", + " %15llu%15llu%15.3fms\n" + "IRQ %15s%15s%15s\n" + " %15llu%15llu%15.3fms\n", "count", "real total", "virtual total", "delay total", "delay average", (unsigned long long)t->cpu_count, @@ -219,27 +221,31 @@ static void print_delayacct(struct taskstats *t) "count", "delay total", "delay average", (unsigned long long)t->blkio_count, (unsigned long long)t->blkio_delay_total, - average_ms((double)t->blkio_delay_total, t->blkio_count), + average_ms((double)t->blkio_delay_total, t->blkio_count), "count", "delay total", "delay average", (unsigned long long)t->swapin_count, (unsigned long long)t->swapin_delay_total, - average_ms((double)t->swapin_delay_total, t->swapin_count), + average_ms((double)t->swapin_delay_total, t->swapin_count), "count", "delay total", "delay average", (unsigned long long)t->freepages_count, (unsigned long long)t->freepages_delay_total, - average_ms((double)t->freepages_delay_total, t->freepages_count), + average_ms((double)t->freepages_delay_total, t->freepages_count), "count", "delay total", "delay average", (unsigned long long)t->thrashing_count, (unsigned long long)t->thrashing_delay_total, - average_ms((double)t->thrashing_delay_total, t->thrashing_count), + average_ms((double)t->thrashing_delay_total, t->thrashing_count), "count", "delay total", "delay average", (unsigned long long)t->compact_count, (unsigned long long)t->compact_delay_total, - average_ms((double)t->compact_delay_total, t->compact_count), + average_ms((double)t->compact_delay_total, t->compact_count), "count", "delay
total", "delay average", (unsigned long long)t->wpcopy_count, (unsigned long long)t->wpcopy_delay_total, - average_ms((double)t->wpcopy_delay_total, t->wpcopy_count)); + average_ms((double)t->wpcopy_delay_total, t->wpcopy_count), + "count", "delay total", "delay average", + (unsigned long long)t->irq_count, + (unsigned long long)t->irq_delay_total, + average_ms((double)t->irq_delay_total, t->irq_count)); } static void task_context_switch_counts(struct taskstats *t) -- cgit From 223baf9d17f25e2608dbdff7232c095c1e612268 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 20 Apr 2023 10:55:48 -0400 Subject: sched: Fix performance regression introduced by mm_cid Introduce per-mm/cpu current concurrency id (mm_cid) to fix a PostgreSQL sysbench regression reported by Aaron Lu. Keep track of the currently allocated mm_cid for each mm/cpu rather than freeing them immediately on context switch. This eliminates most atomic operations when context switching back and forth between threads belonging to different memory spaces in multi-threaded scenarios (many processes, each with many threads). The per-mm/per-cpu mm_cid values are serialized by their respective runqueue locks. Thread migration is handled by introducing invocation to sched_mm_cid_migrate_to() (with destination runqueue lock held) in activate_task() for migrating tasks. If the destination cpu's mm_cid is unset, and if the source runqueue is not actively using its mm_cid, then the source cpu's mm_cid is moved to the destination cpu on migration. Introduce a task-work executed periodically, similarly to NUMA work, which delays reclaim of cid values when they are unused for a period of time. Keep track of the allocation time for each per-cpu cid, and let the task work clear them when they are observed to be older than SCHED_MM_CID_PERIOD_NS and unused. This task work also clears all mm_cids which are greater or equal to the Hamming weight of the mm cidmask to keep concurrency ids compact. Because we want to ensure the mm_cid converges towards the smaller values as migrations happen, the prior optimization that was done when context switching between threads belonging to the same mm is removed, because it could delay the lazy release of the destination runqueue mm_cid after it has been replaced by a migration. Removing this prior optimization is not an issue performance-wise because the introduced per-mm/per-cpu mm_cid tracking also covers this more specific case. 
Fixes: af7f588d8f73 ("sched: Introduce per-memory-map concurrency ID") Reported-by: Aaron Lu Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Tested-by: Aaron Lu Link: https://lore.kernel.org/lkml/20230327080502.GA570847@ziqianlu-desk2/ --- include/linux/mm_types.h | 82 +++++++- include/linux/sched.h | 3 + include/linux/sched/mm.h | 5 + kernel/fork.c | 9 +- kernel/sched/core.c | 523 +++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 239 +++++++++++++++++++--- 6 files changed, 804 insertions(+), 57 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a57e6ae78e65..5eab61156f0e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -550,6 +550,13 @@ struct vm_area_struct { struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; +#ifdef CONFIG_SCHED_MM_CID +struct mm_cid { + u64 time; + int cid; +}; +#endif + struct kioctx_table; struct mm_struct { struct { @@ -600,15 +607,19 @@ struct mm_struct { atomic_t mm_count; #ifdef CONFIG_SCHED_MM_CID /** - * @cid_lock: Protect cid bitmap updates vs lookups. + * @pcpu_cid: Per-cpu current cid. * - * Prevent situations where updates to the cid bitmap happen - * concurrently with lookups. Those can lead to situations - * where a lookup cannot find a free bit simply because it was - * unlucky enough to load, non-atomically, bitmap words as they - * were being concurrently updated by the updaters. + * Keep track of the currently allocated mm_cid for each cpu. + * The per-cpu mm_cid values are serialized by their respective + * runqueue locks. */ - raw_spinlock_t cid_lock; + struct mm_cid __percpu *pcpu_cid; + /* + * @mm_cid_next_scan: Next mm_cid scan (in jiffies). + * + * When the next mm_cid scan is due (in jiffies). + */ + unsigned long mm_cid_next_scan; #endif #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ @@ -873,6 +884,37 @@ static inline void vma_iter_init(struct vma_iterator *vmi, } #ifdef CONFIG_SCHED_MM_CID + +enum mm_cid_state { + MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */ + MM_CID_LAZY_PUT = (1U << 31), +}; + +static inline bool mm_cid_is_unset(int cid) +{ + return cid == MM_CID_UNSET; +} + +static inline bool mm_cid_is_lazy_put(int cid) +{ + return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); +} + +static inline bool mm_cid_is_valid(int cid) +{ + return !(cid & MM_CID_LAZY_PUT); +} + +static inline int mm_cid_set_lazy_put(int cid) +{ + return cid | MM_CID_LAZY_PUT; +} + +static inline int mm_cid_clear_lazy_put(int cid) +{ + return cid & ~MM_CID_LAZY_PUT; +} + /* Accessor for struct mm_struct's cidmask. 
*/ static inline cpumask_t *mm_cidmask(struct mm_struct *mm) { @@ -886,16 +928,40 @@ static inline cpumask_t *mm_cidmask(struct mm_struct *mm) static inline void mm_init_cid(struct mm_struct *mm) { - raw_spin_lock_init(&mm->cid_lock); + int i; + + for_each_possible_cpu(i) { + struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); + + pcpu_cid->cid = MM_CID_UNSET; + pcpu_cid->time = 0; + } cpumask_clear(mm_cidmask(mm)); } +static inline int mm_alloc_cid(struct mm_struct *mm) +{ + mm->pcpu_cid = alloc_percpu(struct mm_cid); + if (!mm->pcpu_cid) + return -ENOMEM; + mm_init_cid(mm); + return 0; +} + +static inline void mm_destroy_cid(struct mm_struct *mm) +{ + free_percpu(mm->pcpu_cid); + mm->pcpu_cid = NULL; +} + static inline unsigned int mm_cid_size(void) { return cpumask_size(); } #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm) { } +static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; } +static inline void mm_destroy_cid(struct mm_struct *mm) { } static inline unsigned int mm_cid_size(void) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d654eb4cabd..675298d6eb36 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1314,7 +1314,10 @@ struct task_struct { #ifdef CONFIG_SCHED_MM_CID int mm_cid; /* Current cid in mm */ + int last_mm_cid; /* Most recent cid in mm */ + int migrate_from_cpu; int mm_cid_active; /* Whether cid bitmap is active */ + struct callback_head cid_work; #endif struct tlbflush_unmap_batch tlb_ubc; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2a243616f222..f20fc0600fcc 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -37,6 +37,11 @@ static inline void mmgrab(struct mm_struct *mm) atomic_inc(&mm->mm_count); } +static inline void smp_mb__after_mmgrab(void) +{ + smp_mb__after_atomic(); +} + extern void __mmdrop(struct mm_struct *mm); static inline void mmdrop(struct mm_struct *mm) diff --git a/kernel/fork.c b/kernel/fork.c index 0c92f224c68c..ad2ee22272a3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -793,6 +793,7 @@ void __mmdrop(struct mm_struct *mm) check_mm(mm); put_user_ns(mm->user_ns); mm_pasid_drop(mm); + mm_destroy_cid(mm); for (i = 0; i < NR_MM_COUNTERS; i++) percpu_counter_destroy(&mm->rss_stat[i]); @@ -1057,7 +1058,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_SCHED_MM_CID tsk->mm_cid = -1; + tsk->last_mm_cid = -1; tsk->mm_cid_active = 0; + tsk->migrate_from_cpu = -1; #endif return tsk; @@ -1162,18 +1165,22 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (init_new_context(p, mm)) goto fail_nocontext; + if (mm_alloc_cid(mm)) + goto fail_cid; + for (i = 0; i < NR_MM_COUNTERS; i++) if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) goto fail_pcpu; mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); - mm_init_cid(mm); return mm; fail_pcpu: while (i > 0) percpu_counter_destroy(&mm->rss_stat[--i]); + mm_destroy_cid(mm); +fail_cid: fail_nocontext: mm_free_pgd(mm); fail_nopgd: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5a97ceb37c13..898fa3bc2765 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2101,6 +2101,8 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_on_rq_migrating(p)) flags |= ENQUEUE_MIGRATED; + if (flags & ENQUEUE_MIGRATED) + sched_mm_cid_migrate_to(rq, p); enqueue_task(rq, p, flags); @@ -3210,6 +3212,7 @@ void set_task_cpu(struct task_struct *p, 
unsigned int new_cpu) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; rseq_migrate(p); + sched_mm_cid_migrate_from(p); perf_event_task_migrate(p); } @@ -4483,6 +4486,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; #endif + init_sched_mm_cid(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -5129,7 +5133,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); rseq_preempt(prev); - switch_mm_cid(prev, next); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); @@ -5285,6 +5288,9 @@ context_switch(struct rq *rq, struct task_struct *prev, * * kernel -> user switch + mmdrop() active * user -> user switch + * + * switch_mm_cid() needs to be updated if the barriers provided + * by context_switch() are modified. */ if (!next->mm) { // to kernel enter_lazy_tlb(prev->active_mm, next); @@ -5314,6 +5320,9 @@ context_switch(struct rq *rq, struct task_struct *prev, } } + /* switch_mm_cid() requires the memory barriers above. */ + switch_mm_cid(rq, prev, next); + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); prepare_lock_switch(rq, next, rf); @@ -5602,6 +5611,7 @@ void scheduler_tick(void) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); sched_core_tick(rq); + task_tick_mm_cid(rq, curr); rq_unlock(rq, &rf); @@ -11469,45 +11479,524 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) } #ifdef CONFIG_SCHED_MM_CID -void sched_mm_cid_exit_signals(struct task_struct *t) + +/** + * @cid_lock: Guarantee forward-progress of cid allocation. + * + * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock + * is only used when contention is detected by the lock-free allocation so + * forward progress can be guaranteed. + */ +DEFINE_RAW_SPINLOCK(cid_lock); + +/** + * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. + * + * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is + * detected, it is set to 1 to ensure that all newly coming allocations are + * serialized by @cid_lock until the allocation which detected contention + * completes and sets @use_cid_lock back to 0. This guarantees forward progress + * of a cid allocation. + */ +int use_cid_lock; + +/* + * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid + * concurrently with respect to the execution of the source runqueue context + * switch. + * + * There is one basic properties we want to guarantee here: + * + * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively + * used by a task. That would lead to concurrent allocation of the cid and + * userspace corruption. + * + * Provide this guarantee by introducing a Dekker memory ordering to guarantee + * that a pair of loads observe at least one of a pair of stores, which can be + * shown as: + * + * X = Y = 0 + * + * w[X]=1 w[Y]=1 + * MB MB + * r[Y]=y r[X]=x + * + * Which guarantees that x==0 && y==0 is impossible. But rather than using + * values 0 and 1, this algorithm cares about specific state transitions of the + * runqueue current task (as updated by the scheduler context switch), and the + * per-mm/cpu cid value. + * + * Let's introduce task (Y) which has task->mm == mm and task (N) which has + * task->mm != mm for the rest of the discussion. 
There are two scheduler state + * transitions on context switch we care about: + * + * (TSA) Store to rq->curr with transition from (N) to (Y) + * + * (TSB) Store to rq->curr with transition from (Y) to (N) + * + * On the remote-clear side, there is one transition we care about: + * + * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag + * + * There is also a transition to UNSET state which can be performed from all + * sides (scheduler, remote-clear). It is always performed with a cmpxchg which + * guarantees that only a single thread will succeed: + * + * (TMB) cmpxchg to *pcpu_cid to mark UNSET + * + * Just to be clear, what we do _not_ want to happen is a transition to UNSET + * when a thread is actively using the cid (property (1)). + * + * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions. + * + * Scenario A) (TSA)+(TMA) (from next task perspective) + * + * CPU0 CPU1 + * + * Context switch CS-1 Remote-clear + * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA) + * (implied barrier after cmpxchg) + * - switch_mm_cid() + * - memory barrier (see switch_mm_cid() + * comment explaining how this barrier + * is combined with other scheduler + * barriers) + * - mm_cid_get (next) + * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) + * + * This Dekker ensures that either task (Y) is observed by the + * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are + * observed. + * + * If task (Y) store is observed by rcu_dereference(), it means that there is + * still an active task on the cpu. Remote-clear will therefore not transition + * to UNSET, which fulfills property (1). + * + * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), + * it will move its state to UNSET, which clears the percpu cid perhaps + * uselessly (which is not an issue for correctness). Because task (Y) is not + * observed, CPU1 can move ahead to set the state to UNSET. Because moving + * state to UNSET is done with a cmpxchg expecting that the old state has the + * LAZY flag set, only one thread will successfully UNSET. + * + * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 + * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and + * CPU1 will observe task (Y) and do nothing more, which is fine. + * + * What we are effectively preventing with this Dekker is a scenario where + * neither LAZY flag nor store (Y) are observed, which would fail property (1) + * because this would UNSET a cid which is actively used. + */ + +void sched_mm_cid_migrate_from(struct task_struct *t) +{ + t->migrate_from_cpu = task_cpu(t); +} + +static +int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, + struct task_struct *t, + struct mm_cid *src_pcpu_cid) { struct mm_struct *mm = t->mm; - unsigned long flags; + struct task_struct *src_task; + int src_cid, last_mm_cid; if (!mm) + return -1; + + last_mm_cid = t->last_mm_cid; + /* + * If the migrated task has no last cid, or if the current + * task on src rq uses the cid, it means the source cid does not need + * to be moved to the destination cpu. + */ + if (last_mm_cid == -1) + return -1; + src_cid = READ_ONCE(src_pcpu_cid->cid); + if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) + return -1; + + /* + * If we observe an active task using the mm on this rq, it means we + * are not the last task to be migrated from this cpu for this mm, so + * there is no need to move src_cid to the destination cpu. 
+ */ + rcu_read_lock(); + src_task = rcu_dereference(src_rq->curr); + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { + rcu_read_unlock(); + t->last_mm_cid = -1; + return -1; + } + rcu_read_unlock(); + + return src_cid; +} + +static +int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, + struct task_struct *t, + struct mm_cid *src_pcpu_cid, + int src_cid) +{ + struct task_struct *src_task; + struct mm_struct *mm = t->mm; + int lazy_cid; + + if (src_cid == -1) + return -1; + + /* + * Attempt to clear the source cpu cid to move it to the destination + * cpu. + */ + lazy_cid = mm_cid_set_lazy_put(src_cid); + if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) + return -1; + + /* + * The implicit barrier after cmpxchg per-mm/cpu cid before loading + * rq->curr->mm matches the scheduler barrier in context_switch() + * between store to rq->curr and load of prev and next task's + * per-mm/cpu cid. + * + * The implicit barrier after cmpxchg per-mm/cpu cid before loading + * rq->curr->mm_cid_active matches the barrier in + * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and + * sched_mm_cid_after_execve() between store to t->mm_cid_active and + * load of per-mm/cpu cid. + */ + + /* + * If we observe an active task using the mm on this rq after setting + * the lazy-put flag, this task will be responsible for transitioning + * from lazy-put flag set to MM_CID_UNSET. + */ + rcu_read_lock(); + src_task = rcu_dereference(src_rq->curr); + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { + rcu_read_unlock(); + /* + * We observed an active task for this mm, there is therefore + * no point in moving this cid to the destination cpu. + */ + t->last_mm_cid = -1; + return -1; + } + rcu_read_unlock(); + + /* + * The src_cid is unused, so it can be unset. + */ + if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) + return -1; + return src_cid; +} + +/* + * Migration to dst cpu. Called with dst_rq lock held. + * Interrupts are disabled, which keeps the window of cid ownership without the + * source rq lock held small. + */ +void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) +{ + struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; + struct mm_struct *mm = t->mm; + int src_cid, dst_cid, src_cpu; + struct rq *src_rq; + + lockdep_assert_rq_held(dst_rq); + + if (!mm) + return; + src_cpu = t->migrate_from_cpu; + if (src_cpu == -1) { + t->last_mm_cid = -1; + return; + } + /* + * Move the src cid if the dst cid is unset. This keeps id + * allocation closest to 0 in cases where few threads migrate around + * many cpus. + * + * If destination cid is already set, we may have to just clear + * the src cid to ensure compactness in frequent migrations + * scenarios. + * + * It is not useful to clear the src cid when the number of threads is + * greater or equal to the number of allowed cpus, because user-space + * can expect that the number of allowed cids can reach the number of + * allowed cpus. 
+ */ + dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); + dst_cid = READ_ONCE(dst_pcpu_cid->cid); + if (!mm_cid_is_unset(dst_cid) && + atomic_read(&mm->mm_users) >= t->nr_cpus_allowed) + return; + src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); + src_rq = cpu_rq(src_cpu); + src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); + if (src_cid == -1) + return; + src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, + src_cid); + if (src_cid == -1) + return; + if (!mm_cid_is_unset(dst_cid)) { + __mm_cid_put(mm, src_cid); + return; + } + /* Move src_cid to dst cpu. */ + mm_cid_snapshot_time(dst_rq, mm); + WRITE_ONCE(dst_pcpu_cid->cid, src_cid); +} + +static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, + int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *t; + unsigned long flags; + int cid, lazy_cid; + + cid = READ_ONCE(pcpu_cid->cid); + if (!mm_cid_is_valid(cid)) return; + + /* + * Clear the cpu cid if it is set to keep cid allocation compact. If + * there happens to be other tasks left on the source cpu using this + * mm, the next task using this mm will reallocate its cid on context + * switch. + */ + lazy_cid = mm_cid_set_lazy_put(cid); + if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) + return; + + /* + * The implicit barrier after cmpxchg per-mm/cpu cid before loading + * rq->curr->mm matches the scheduler barrier in context_switch() + * between store to rq->curr and load of prev and next task's + * per-mm/cpu cid. + * + * The implicit barrier after cmpxchg per-mm/cpu cid before loading + * rq->curr->mm_cid_active matches the barrier in + * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and + * sched_mm_cid_after_execve() between store to t->mm_cid_active and + * load of per-mm/cpu cid. + */ + + /* + * If we observe an active task using the mm on this rq after setting + * the lazy-put flag, that task will be responsible for transitioning + * from lazy-put flag set to MM_CID_UNSET. + */ + rcu_read_lock(); + t = rcu_dereference(rq->curr); + if (READ_ONCE(t->mm_cid_active) && t->mm == mm) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + /* + * The cid is unused, so it can be unset. + * Disable interrupts to keep the window of cid ownership without rq + * lock small. + */ local_irq_save(flags); - mm_cid_put(mm, t->mm_cid); - t->mm_cid = -1; - t->mm_cid_active = 0; + if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) + __mm_cid_put(mm, cid); local_irq_restore(flags); } +static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct mm_cid *pcpu_cid; + struct task_struct *curr; + u64 rq_clock; + + /* + * rq->clock load is racy on 32-bit but one spurious clear once in a + * while is irrelevant. + */ + rq_clock = READ_ONCE(rq->clock); + pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); + + /* + * In order to take care of infrequently scheduled tasks, bump the time + * snapshot associated with this cid if an active task using the mm is + * observed on this rq. 
+ */ + rcu_read_lock(); + curr = rcu_dereference(rq->curr); + if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { + WRITE_ONCE(pcpu_cid->time, rq_clock); + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) + return; + sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); +} + +static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, + int weight) +{ + struct mm_cid *pcpu_cid; + int cid; + + pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); + cid = READ_ONCE(pcpu_cid->cid); + if (!mm_cid_is_valid(cid) || cid < weight) + return; + sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); +} + +static void task_mm_cid_work(struct callback_head *work) +{ + unsigned long now = jiffies, old_scan, next_scan; + struct task_struct *t = current; + struct cpumask *cidmask; + struct mm_struct *mm; + int weight, cpu; + + SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work)); + + work->next = work; /* Prevent double-add */ + if (t->flags & PF_EXITING) + return; + mm = t->mm; + if (!mm) + return; + old_scan = READ_ONCE(mm->mm_cid_next_scan); + next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); + if (!old_scan) { + unsigned long res; + + res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); + if (res != old_scan) + old_scan = res; + else + old_scan = next_scan; + } + if (time_before(now, old_scan)) + return; + if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) + return; + cidmask = mm_cidmask(mm); + /* Clear cids that were not recently used. */ + for_each_possible_cpu(cpu) + sched_mm_cid_remote_clear_old(mm, cpu); + weight = cpumask_weight(cidmask); + /* + * Clear cids that are greater or equal to the cidmask weight to + * recompact it. + */ + for_each_possible_cpu(cpu) + sched_mm_cid_remote_clear_weight(mm, cpu, weight); +} + +void init_sched_mm_cid(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + int mm_users = 0; + + if (mm) { + mm_users = atomic_read(&mm->mm_users); + if (mm_users == 1) + mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); + } + t->cid_work.next = &t->cid_work; /* Protect against double add */ + init_task_work(&t->cid_work, task_mm_cid_work); +} + +void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) +{ + struct callback_head *work = &curr->cid_work; + unsigned long now = jiffies; + + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || + work->next != work) + return; + if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) + return; + task_work_add(curr, work, TWA_RESUME); +} + +void sched_mm_cid_exit_signals(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + struct rq_flags rf; + struct rq *rq; + + if (!mm) + return; + + preempt_disable(); + rq = this_rq(); + rq_lock_irqsave(rq, &rf); + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 0); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). 
+ */ + smp_mb(); + mm_cid_put(mm); + t->last_mm_cid = t->mm_cid = -1; + rq_unlock_irqrestore(rq, &rf); +} + void sched_mm_cid_before_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - unsigned long flags; + struct rq_flags rf; + struct rq *rq; if (!mm) return; - local_irq_save(flags); - mm_cid_put(mm, t->mm_cid); - t->mm_cid = -1; - t->mm_cid_active = 0; - local_irq_restore(flags); + + preempt_disable(); + rq = this_rq(); + rq_lock_irqsave(rq, &rf); + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 0); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). + */ + smp_mb(); + mm_cid_put(mm); + t->last_mm_cid = t->mm_cid = -1; + rq_unlock_irqrestore(rq, &rf); } void sched_mm_cid_after_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - unsigned long flags; + struct rq_flags rf; + struct rq *rq; if (!mm) return; - local_irq_save(flags); - t->mm_cid = mm_cid_get(mm); - t->mm_cid_active = 1; - local_irq_restore(flags); + + preempt_disable(); + rq = this_rq(); + rq_lock_irqsave(rq, &rf); + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 1); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). + */ + smp_mb(); + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); + rq_unlock_irqrestore(rq, &rf); rseq_set_notify_resume(t); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 060616944d7a..ec7b3e0a2b20 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3253,61 +3253,238 @@ static inline void update_current_exec_runtime(struct task_struct *curr, } #ifdef CONFIG_SCHED_MM_CID -static inline int __mm_cid_get(struct mm_struct *mm) + +#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ +#define MM_CID_SCAN_DELAY 100 /* 100ms */ + +extern raw_spinlock_t cid_lock; +extern int use_cid_lock; + +extern void sched_mm_cid_migrate_from(struct task_struct *t); +extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); +extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); +extern void init_sched_mm_cid(struct task_struct *t); + +static inline void __mm_cid_put(struct mm_struct *mm, int cid) +{ + if (cid < 0) + return; + cpumask_clear_cpu(cid, mm_cidmask(mm)); +} + +/* + * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to + * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to + * be held to transition to other states. + * + * State transitions synchronized with cmpxchg or try_cmpxchg need to be + * consistent across cpus, which prevents use of this_cpu_cmpxchg. + */ +static inline void mm_cid_put_lazy(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid; + + lockdep_assert_irqs_disabled(); + cid = __this_cpu_read(pcpu_cid->cid); + if (!mm_cid_is_lazy_put(cid) || + !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) + return; + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); +} + +static inline int mm_cid_pcpu_unset(struct mm_struct *mm) +{ + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid, res; + + lockdep_assert_irqs_disabled(); + cid = __this_cpu_read(pcpu_cid->cid); + for (;;) { + if (mm_cid_is_unset(cid)) + return MM_CID_UNSET; + /* + * Attempt transition from valid or lazy-put to unset. 
+ */ + res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); + if (res == cid) + break; + cid = res; + } + return cid; +} + +static inline void mm_cid_put(struct mm_struct *mm) +{ + int cid; + + lockdep_assert_irqs_disabled(); + cid = mm_cid_pcpu_unset(mm); + if (cid == MM_CID_UNSET) + return; + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); +} + +static inline int __mm_cid_try_get(struct mm_struct *mm) { struct cpumask *cpumask; int cid; cpumask = mm_cidmask(mm); - cid = cpumask_first_zero(cpumask); - if (cid >= nr_cpu_ids) + /* + * Retry finding first zero bit if the mask is temporarily + * filled. This only happens during concurrent remote-clear + * which owns a cid without holding a rq lock. + */ + for (;;) { + cid = cpumask_first_zero(cpumask); + if (cid < nr_cpu_ids) + break; + cpu_relax(); + } + if (cpumask_test_and_set_cpu(cid, cpumask)) return -1; - __cpumask_set_cpu(cid, cpumask); return cid; } -static inline void mm_cid_put(struct mm_struct *mm, int cid) +/* + * Save a snapshot of the current runqueue time of this cpu + * with the per-cpu cid value, allowing to estimate how recently it was used. + */ +static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) { - lockdep_assert_irqs_disabled(); - if (cid < 0) - return; - raw_spin_lock(&mm->cid_lock); - __cpumask_clear_cpu(cid, mm_cidmask(mm)); - raw_spin_unlock(&mm->cid_lock); + struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); + + lockdep_assert_rq_held(rq); + WRITE_ONCE(pcpu_cid->time, rq->clock); } -static inline int mm_cid_get(struct mm_struct *mm) +static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) { - int ret; + int cid; - lockdep_assert_irqs_disabled(); - raw_spin_lock(&mm->cid_lock); - ret = __mm_cid_get(mm); - raw_spin_unlock(&mm->cid_lock); - return ret; + /* + * All allocations (even those using the cid_lock) are lock-free. If + * use_cid_lock is set, hold the cid_lock to perform cid allocation to + * guarantee forward progress. + */ + if (!READ_ONCE(use_cid_lock)) { + cid = __mm_cid_try_get(mm); + if (cid >= 0) + goto end; + raw_spin_lock(&cid_lock); + } else { + raw_spin_lock(&cid_lock); + cid = __mm_cid_try_get(mm); + if (cid >= 0) + goto unlock; + } + + /* + * cid concurrently allocated. Retry while forcing following + * allocations to use the cid_lock to ensure forward progress. + */ + WRITE_ONCE(use_cid_lock, 1); + /* + * Set use_cid_lock before allocation. Only care about program order + * because this is only required for forward progress. + */ + barrier(); + /* + * Retry until it succeeds. It is guaranteed to eventually succeed once + * all newcoming allocations observe the use_cid_lock flag set. + */ + do { + cid = __mm_cid_try_get(mm); + cpu_relax(); + } while (cid < 0); + /* + * Allocate before clearing use_cid_lock. Only care about + * program order because this is for forward progress. 
+ */ + barrier(); + WRITE_ONCE(use_cid_lock, 0); +unlock: + raw_spin_unlock(&cid_lock); +end: + mm_cid_snapshot_time(rq, mm); + return cid; } -static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) +static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) { + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + struct cpumask *cpumask; + int cid; + + lockdep_assert_rq_held(rq); + cpumask = mm_cidmask(mm); + cid = __this_cpu_read(pcpu_cid->cid); + if (mm_cid_is_valid(cid)) { + mm_cid_snapshot_time(rq, mm); + return cid; + } + if (mm_cid_is_lazy_put(cid)) { + if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + } + cid = __mm_cid_get(rq, mm); + __this_cpu_write(pcpu_cid->cid, cid); + return cid; +} + +static inline void switch_mm_cid(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) +{ + /* + * Provide a memory barrier between rq->curr store and load of + * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. + * + * Should be adapted if context_switch() is modified. + */ + if (!next->mm) { // to kernel + /* + * user -> kernel transition does not guarantee a barrier, but + * we can use the fact that it performs an atomic operation in + * mmgrab(). + */ + if (prev->mm) // from user + smp_mb__after_mmgrab(); + /* + * kernel -> kernel transition does not change rq->curr->mm + * state. It stays NULL. + */ + } else { // to user + /* + * kernel -> user transition does not provide a barrier + * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. + * Provide it here. + */ + if (!prev->mm) // from kernel + smp_mb(); + /* + * user -> user transition guarantees a memory barrier through + * switch_mm() when current->mm changes. If current->mm is + * unchanged, no barrier is needed. + */ + } if (prev->mm_cid_active) { - if (next->mm_cid_active && next->mm == prev->mm) { - /* - * Context switch between threads in same mm, hand over - * the mm_cid from prev to next. - */ - next->mm_cid = prev->mm_cid; - prev->mm_cid = -1; - return; - } - mm_cid_put(prev->mm, prev->mm_cid); + mm_cid_snapshot_time(rq, prev->mm); + mm_cid_put_lazy(prev); prev->mm_cid = -1; } if (next->mm_cid_active) - next->mm_cid = mm_cid_get(next->mm); + next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm); } #else -static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } +static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } +static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } +static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } +static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } +static inline void init_sched_mm_cid(struct task_struct *t) { } #endif #endif /* _KERNEL_SCHED_SCHED_H */ -- cgit From 0019a2d4b7e37a983d133d42b707b8a3018ae6f4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 27 Apr 2023 20:11:11 -0700 Subject: sched: fix cid_lock kernel-doc warnings Fix kernel-doc warnings for cid_lock and use_cid_lock. These comments are not in kernel-doc format. kernel/sched/core.c:11496: warning: Cannot understand * @cid_lock: Guarantee forward-progress of cid allocation. on line 11496 - I thought it was a doc line kernel/sched/core.c:11505: warning: Cannot understand * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. 
on line 11505 - I thought it was a doc line Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid") Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230428031111.322-1-rdunlap@infradead.org --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 944c3ae39861..a68d1276bab0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11492,7 +11492,7 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) #ifdef CONFIG_SCHED_MM_CID -/** +/* * @cid_lock: Guarantee forward-progress of cid allocation. * * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock @@ -11501,7 +11501,7 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) */ DEFINE_RAW_SPINLOCK(cid_lock); -/** +/* * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. * * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is -- cgit
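For context on the warning being fixed: scripts/kernel-doc treats every comment that opens with "/**" as kernel-doc and expects a "name - short description" header line (plus "@arg:" lines for parameters), so free-form comments should open with a plain "/*", which is what this patch switches to. A rough sketch of the two forms, using a made-up function name:

/* Free-form comment: ignored by scripts/kernel-doc, fine for notes. */

/**
 * example_cid_alloc() - Allocate a concurrency id from @mm.
 * @mm: memory map whose cidmask the id is allocated from.
 *
 * Only comments laid out like this should start with the double-star
 * opener; anything else triggers the "Cannot understand" warnings
 * quoted above.
 */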