diff options
Diffstat (limited to 'kernel')
89 files changed, 5403 insertions, 2774 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index e7a62ebbf4d1..1edaa4846a47 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1612,7 +1612,7 @@ static void audit_log_multicast(int group, const char *op, int err) cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", - task_pid_nr(current), + task_tgid_nr(current), from_kuid(&init_user_ns, cred->uid), from_kuid(&init_user_ns, audit_get_loginuid(current)), tty ? tty_name(tty) : "(none)", @@ -1706,7 +1706,7 @@ static int __init audit_init(void) audit_cmd_mutex.owner = NULL; pr_info("initializing netlink subsys (%s)\n", - audit_default ? "enabled" : "disabled"); + str_enabled_disabled(audit_default)); register_pernet_subsys(&audit_net_ops); audit_initialized = AUDIT_INITIALIZED; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d6ef4f4f9cba..470041c49a44 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1344,7 +1344,7 @@ int audit_filter(int msgtype, unsigned int listtype) switch (f->type) { case AUDIT_PID: - pid = task_pid_nr(current); + pid = task_tgid_nr(current); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_UID: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6f0d6fb6523f..cd57053b4a69 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2933,7 +2933,7 @@ void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, audit_log_format(ab, "table=%s family=%u entries=%u op=%s", name, af, nentries, audit_nfcfgs[op].s); - audit_log_format(ab, " pid=%u", task_pid_nr(current)); + audit_log_format(ab, " pid=%u", task_tgid_nr(current)); audit_log_task_context(ab); /* subj= */ audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 520f49f422fe..ba91be08763a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -823,9 +823,11 @@ static bool btf_name_valid_section(const struct btf *btf, u32 offset) const char *src = btf_str_by_offset(btf, offset); const char *src_limit; + if (!*src) + return false; + /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; - src++; while (*src && src < src_limit) { if (!isprint(*src)) return false; @@ -6283,7 +6285,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, errout: btf_verifier_env_free(env); - if (base_btf != vmlinux_btf) + if (!IS_ERR(base_btf) && base_btf != vmlinux_btf) btf_free(base_btf); if (btf) { kvfree(btf->data); @@ -6523,6 +6525,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (prog_args_trusted(prog)) info->reg_type |= PTR_TRUSTED; + if (btf_param_match_suffix(btf, &args[arg], "__nullable")) + info->reg_type |= PTR_MAYBE_NULL; + if (tgt_prog) { enum bpf_prog_type tgt_type; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index fbdf5a1aabfe..a2f46785ac3b 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -354,12 +354,14 @@ static int cpu_map_kthread_run(void *data) list_add_tail(&skb->list, &list); } - netif_receive_skb_list(&list); - /* Feedback loop via tracepoint */ + /* Feedback loop via tracepoint. + * NB: keep before recv to allow measuring enqueue/dequeue latency. + */ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops, sched, &stats); + netif_receive_skb_list(&list); local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d8520095ca03..39d5710c68ad 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -28,6 +28,8 @@ #include <linux/cpumask.h> #include <linux/bpf_mem_alloc.h> #include <net/xdp.h> +#include <linux/trace_events.h> +#include <linux/kallsyms.h> #include "disasm.h" @@ -21154,11 +21156,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, { bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING; + char trace_symbol[KSYM_SYMBOL_LEN]; const char prefix[] = "btf_trace_"; + struct bpf_raw_event_map *btp; int ret = 0, subprog = -1, i; const struct btf_type *t; bool conservative = true; - const char *tname; + const char *tname, *fname; struct btf *btf; long addr = 0; struct module *mod = NULL; @@ -21289,10 +21293,34 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, return -EINVAL; } tname += sizeof(prefix) - 1; - t = btf_type_by_id(btf, t->type); - if (!btf_type_is_ptr(t)) - /* should never happen in valid vmlinux build */ + + /* The func_proto of "btf_trace_##tname" is generated from typedef without argument + * names. Thus using bpf_raw_event_map to get argument names. + */ + btp = bpf_get_raw_tracepoint(tname); + if (!btp) return -EINVAL; + fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL, + trace_symbol); + bpf_put_raw_tracepoint(btp); + + if (fname) + ret = btf_find_by_name_kind(btf, fname, BTF_KIND_FUNC); + + if (!fname || ret < 0) { + bpf_log(log, "Cannot find btf of tracepoint template, fall back to %s%s.\n", + prefix, tname); + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_ptr(t)) + /* should never happen in valid vmlinux build */ + return -EINVAL; + } else { + t = btf_type_by_id(btf, ret); + if (!btf_type_is_func(t)) + /* should never happen in valid vmlinux build */ + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) /* should never happen in valid vmlinux build */ diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 12f8457ad1f9..a5c9359d516f 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o obj-$(CONFIG_CGROUP_MISC) += misc.o obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index b9dbf6bf2779..e28d5f0d20ed 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -46,6 +46,12 @@ bool cgroup1_ssid_disabled(int ssid) return cgroup_no_v1_mask & (1 << ssid); } +static bool cgroup1_subsys_absent(struct cgroup_subsys *ss) +{ + /* Check also dfl_cftypes for file-less controllers, i.e. perf_event */ + return ss->legacy_cftypes == NULL && ss->dfl_cftypes; +} + /** * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task @@ -675,11 +681,14 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) * cgroup_mutex contention. */ - for_each_subsys(ss, i) + for_each_subsys(ss, i) { + if (cgroup1_subsys_absent(ss)) + continue; seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); + } return 0; } @@ -932,7 +941,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) if (ret != -ENOPARAM) return ret; for_each_subsys(ss, i) { - if (strcmp(param->key, ss->legacy_name)) + if (strcmp(param->key, ss->legacy_name) || + cgroup1_subsys_absent(ss)) continue; if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) return invalfc(fc, "Disabled controller '%s'", @@ -1024,7 +1034,8 @@ static int check_cgroupfs_options(struct fs_context *fc) mask = ~((u16)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) && + !cgroup1_subsys_absent(ss)) enabled |= 1 << i; ctx->subsys_mask &= enabled; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c8e4b62b436a..90e50d6d3cf3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2331,7 +2331,7 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -#ifdef CONFIG_CPUSETS +#ifdef CONFIG_CPUSETS_V1 static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, @@ -3669,12 +3669,40 @@ static int cgroup_events_show(struct seq_file *seq, void *v) static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; + struct cgroup_subsys_state *css; + int dying_cnt[CGROUP_SUBSYS_COUNT]; + int ssid; seq_printf(seq, "nr_descendants %d\n", cgroup->nr_descendants); + + /* + * Show the number of live and dying csses associated with each of + * non-inhibited cgroup subsystems that is bound to cgroup v2. + * + * Without proper lock protection, racing is possible. So the + * numbers may not be consistent when that happens. + */ + rcu_read_lock(); + for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { + dying_cnt[ssid] = -1; + if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) || + (cgroup_subsys[ssid]->root != &cgrp_dfl_root)) + continue; + css = rcu_dereference_raw(cgroup->subsys[ssid]); + dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid]; + seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name, + css ? (css->nr_descendants + 1) : 0); + } + seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); - + for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { + if (dying_cnt[ssid] >= 0) + seq_printf(seq, "nr_dying_subsys_%s %d\n", + cgroup_subsys[ssid]->name, dying_cnt[ssid]); + } + rcu_read_unlock(); return 0; } @@ -4096,7 +4124,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace * except for the files explicitly marked delegatable - - * cgroup.procs and cgroup.subtree_control. + * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control. */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && @@ -5424,6 +5452,8 @@ static void css_release_work_fn(struct work_struct *work) list_del_rcu(&css->sibling); if (ss) { + struct cgroup *parent_cgrp; + /* css release path */ if (!list_empty(&css->rstat_css_node)) { cgroup_rstat_flush(cgrp); @@ -5433,6 +5463,21 @@ static void css_release_work_fn(struct work_struct *work) cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); + + cgrp->nr_dying_subsys[ss->id]--; + /* + * When a css is released and ready to be freed, its + * nr_descendants must be zero. However, the corresponding + * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem + * is activated and deactivated multiple times with one or + * more of its previous activation leaving behind dying csses. + */ + WARN_ON_ONCE(css->nr_descendants); + parent_cgrp = cgroup_parent(cgrp); + while (parent_cgrp) { + parent_cgrp->nr_dying_subsys[ss->id]--; + parent_cgrp = cgroup_parent(parent_cgrp); + } } else { struct cgroup *tcgrp; @@ -5517,8 +5562,11 @@ static int online_css(struct cgroup_subsys_state *css) rcu_assign_pointer(css->cgroup->subsys[ss->id], css); atomic_inc(&css->online_cnt); - if (css->parent) + if (css->parent) { atomic_inc(&css->parent->online_cnt); + while ((css = css->parent)) + css->nr_descendants++; + } } return ret; } @@ -5540,6 +5588,16 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); + + css->cgroup->nr_dying_subsys[ss->id]++; + /* + * Parent css and cgroup cannot be freed until after the freeing + * of child css, see css_free_rwork_fn(). + */ + while ((css = css->parent)) { + css->nr_descendants--; + css->cgroup->nr_dying_subsys[ss->id]++; + } } /** @@ -6178,7 +6236,7 @@ int __init cgroup_init(void) WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); -#ifdef CONFIG_CPUSETS +#ifdef CONFIG_CPUSETS_V1 WARN_ON(register_filesystem(&cpuset_fs_type)); #endif diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h new file mode 100644 index 000000000000..976a8bc3ff60 --- /dev/null +++ b/kernel/cgroup/cpuset-internal.h @@ -0,0 +1,305 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __CPUSET_INTERNAL_H +#define __CPUSET_INTERNAL_H + +#include <linux/cgroup.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/cpuset.h> +#include <linux/spinlock.h> +#include <linux/union_find.h> + +/* See "Frequency meter" comments, below. */ + +struct fmeter { + int cnt; /* unprocessed events count */ + int val; /* most recent output value */ + time64_t time; /* clock (secs) when val computed */ + spinlock_t lock; /* guards read or write of above */ +}; + +/* + * Invalid partition error code + */ +enum prs_errcode { + PERR_NONE = 0, + PERR_INVCPUS, + PERR_INVPARENT, + PERR_NOTPART, + PERR_NOTEXCL, + PERR_NOCPUS, + PERR_HOTPLUG, + PERR_CPUSEMPTY, + PERR_HKEEPING, + PERR_ACCESS, +}; + +/* bits in struct cpuset flags field */ +typedef enum { + CS_ONLINE, + CS_CPU_EXCLUSIVE, + CS_MEM_EXCLUSIVE, + CS_MEM_HARDWALL, + CS_MEMORY_MIGRATE, + CS_SCHED_LOAD_BALANCE, + CS_SPREAD_PAGE, + CS_SPREAD_SLAB, +} cpuset_flagbits_t; + +/* The various types of files and directories in a cpuset file system */ + +typedef enum { + FILE_MEMORY_MIGRATE, + FILE_CPULIST, + FILE_MEMLIST, + FILE_EFFECTIVE_CPULIST, + FILE_EFFECTIVE_MEMLIST, + FILE_SUBPARTS_CPULIST, + FILE_EXCLUSIVE_CPULIST, + FILE_EFFECTIVE_XCPULIST, + FILE_ISOLATED_CPULIST, + FILE_CPU_EXCLUSIVE, + FILE_MEM_EXCLUSIVE, + FILE_MEM_HARDWALL, + FILE_SCHED_LOAD_BALANCE, + FILE_PARTITION_ROOT, + FILE_SCHED_RELAX_DOMAIN_LEVEL, + FILE_MEMORY_PRESSURE_ENABLED, + FILE_MEMORY_PRESSURE, + FILE_SPREAD_PAGE, + FILE_SPREAD_SLAB, +} cpuset_filetype_t; + +struct cpuset { + struct cgroup_subsys_state css; + + unsigned long flags; /* "unsigned long" so bitops work */ + + /* + * On default hierarchy: + * + * The user-configured masks can only be changed by writing to + * cpuset.cpus and cpuset.mems, and won't be limited by the + * parent masks. + * + * The effective masks is the real masks that apply to the tasks + * in the cpuset. They may be changed if the configured masks are + * changed or hotplug happens. + * + * effective_mask == configured_mask & parent's effective_mask, + * and if it ends up empty, it will inherit the parent's mask. + * + * + * On legacy hierarchy: + * + * The user-configured masks are always the same with effective masks. + */ + + /* user-configured CPUs and Memory Nodes allow to tasks */ + cpumask_var_t cpus_allowed; + nodemask_t mems_allowed; + + /* effective CPUs and Memory Nodes allow to tasks */ + cpumask_var_t effective_cpus; + nodemask_t effective_mems; + + /* + * Exclusive CPUs dedicated to current cgroup (default hierarchy only) + * + * The effective_cpus of a valid partition root comes solely from its + * effective_xcpus and some of the effective_xcpus may be distributed + * to sub-partitions below & hence excluded from its effective_cpus. + * For a valid partition root, its effective_cpus have no relationship + * with cpus_allowed unless its exclusive_cpus isn't set. + * + * This value will only be set if either exclusive_cpus is set or + * when this cpuset becomes a local partition root. + */ + cpumask_var_t effective_xcpus; + + /* + * Exclusive CPUs as requested by the user (default hierarchy only) + * + * Its value is independent of cpus_allowed and designates the set of + * CPUs that can be granted to the current cpuset or its children when + * it becomes a valid partition root. The effective set of exclusive + * CPUs granted (effective_xcpus) depends on whether those exclusive + * CPUs are passed down by its ancestors and not yet taken up by + * another sibling partition root along the way. + * + * If its value isn't set, it defaults to cpus_allowed. + */ + cpumask_var_t exclusive_cpus; + + /* + * This is old Memory Nodes tasks took on. + * + * - top_cpuset.old_mems_allowed is initialized to mems_allowed. + * - A new cpuset's old_mems_allowed is initialized when some + * task is moved into it. + * - old_mems_allowed is used in cpuset_migrate_mm() when we change + * cpuset.mems_allowed and have tasks' nodemask updated, and + * then old_mems_allowed is updated to mems_allowed. + */ + nodemask_t old_mems_allowed; + + struct fmeter fmeter; /* memory_pressure filter */ + + /* + * Tasks are being attached to this cpuset. Used to prevent + * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). + */ + int attach_in_progress; + + /* for custom sched domain */ + int relax_domain_level; + + /* number of valid local child partitions */ + int nr_subparts; + + /* partition root state */ + int partition_root_state; + + /* + * number of SCHED_DEADLINE tasks attached to this cpuset, so that we + * know when to rebuild associated root domain bandwidth information. + */ + int nr_deadline_tasks; + int nr_migrate_dl_tasks; + u64 sum_migrate_dl_bw; + + /* Invalid partition error code, not lock protected */ + enum prs_errcode prs_err; + + /* Handle for cpuset.cpus.partition */ + struct cgroup_file partition_file; + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; + + /* Used to merge intersecting subsets for generate_sched_domains */ + struct uf_node node; +}; + +static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cpuset, css) : NULL; +} + +/* Retrieve the cpuset for a task */ +static inline struct cpuset *task_cs(struct task_struct *task) +{ + return css_cs(task_css(task, cpuset_cgrp_id)); +} + +static inline struct cpuset *parent_cs(struct cpuset *cs) +{ + return css_cs(cs->css.parent); +} + +/* convenient tests for these bits */ +static inline bool is_cpuset_online(struct cpuset *cs) +{ + return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); +} + +static inline int is_cpu_exclusive(const struct cpuset *cs) +{ + return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); +} + +static inline int is_mem_exclusive(const struct cpuset *cs) +{ + return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); +} + +static inline int is_mem_hardwall(const struct cpuset *cs) +{ + return test_bit(CS_MEM_HARDWALL, &cs->flags); +} + +static inline int is_sched_load_balance(const struct cpuset *cs) +{ + return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); +} + +static inline int is_memory_migrate(const struct cpuset *cs) +{ + return test_bit(CS_MEMORY_MIGRATE, &cs->flags); +} + +static inline int is_spread_page(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_PAGE, &cs->flags); +} + +static inline int is_spread_slab(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_SLAB, &cs->flags); +} + +/** + * cpuset_for_each_child - traverse online children of a cpuset + * @child_cs: loop cursor pointing to the current child + * @pos_css: used for iteration + * @parent_cs: target cpuset to walk children of + * + * Walk @child_cs through the online children of @parent_cs. Must be used + * with RCU read locked. + */ +#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ + css_for_each_child((pos_css), &(parent_cs)->css) \ + if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) + +/** + * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants + * @des_cs: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @root_cs: target cpuset to walk ancestor of + * + * Walk @des_cs through the online descendants of @root_cs. Must be used + * with RCU read locked. The caller may modify @pos_css by calling + * css_rightmost_descendant() to skip subtree. @root_cs is included in the + * iteration and the first node to be visited. + */ +#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ + css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ + if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) + +void rebuild_sched_domains_locked(void); +void cpuset_callback_lock_irq(void); +void cpuset_callback_unlock_irq(void); +void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus); +void cpuset_update_tasks_nodemask(struct cpuset *cs); +int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on); +ssize_t cpuset_write_resmask(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int cpuset_common_seq_show(struct seq_file *sf, void *v); + +/* + * cpuset-v1.c + */ +#ifdef CONFIG_CPUSETS_V1 +extern struct cftype cpuset1_files[]; +void fmeter_init(struct fmeter *fmp); +void cpuset1_update_task_spread_flags(struct cpuset *cs, + struct task_struct *tsk); +void cpuset1_update_tasks_flags(struct cpuset *cs); +void cpuset1_hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated); +int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); +#else +static inline void fmeter_init(struct fmeter *fmp) {} +static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, + struct task_struct *tsk) {} +static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {} +static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) {} +static inline int cpuset1_validate_change(struct cpuset *cur, + struct cpuset *trial) { return 0; } +#endif /* CONFIG_CPUSETS_V1 */ + +#endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c new file mode 100644 index 000000000000..25c1d7b77e2f --- /dev/null +++ b/kernel/cgroup/cpuset-v1.c @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "cpuset-internal.h" + +/* + * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously + */ +struct cpuset_remove_tasks_struct { + struct work_struct work; + struct cpuset *cs; +}; + +/* + * Frequency meter - How fast is some event occurring? + * + * These routines manage a digitally filtered, constant time based, + * event frequency meter. There are four routines: + * fmeter_init() - initialize a frequency meter. + * fmeter_markevent() - called each time the event happens. + * fmeter_getrate() - returns the recent rate of such events. + * fmeter_update() - internal routine used to update fmeter. + * + * A common data structure is passed to each of these routines, + * which is used to keep track of the state required to manage the + * frequency meter and its digital filter. + * + * The filter works on the number of events marked per unit time. + * The filter is single-pole low-pass recursive (IIR). The time unit + * is 1 second. Arithmetic is done using 32-bit integers scaled to + * simulate 3 decimal digits of precision (multiplied by 1000). + * + * With an FM_COEF of 933, and a time base of 1 second, the filter + * has a half-life of 10 seconds, meaning that if the events quit + * happening, then the rate returned from the fmeter_getrate() + * will be cut in half each 10 seconds, until it converges to zero. + * + * It is not worth doing a real infinitely recursive filter. If more + * than FM_MAXTICKS ticks have elapsed since the last filter event, + * just compute FM_MAXTICKS ticks worth, by which point the level + * will be stable. + * + * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid + * arithmetic overflow in the fmeter_update() routine. + * + * Given the simple 32 bit integer arithmetic used, this meter works + * best for reporting rates between one per millisecond (msec) and + * one per 32 (approx) seconds. At constant rates faster than one + * per msec it maxes out at values just under 1,000,000. At constant + * rates between one per msec, and one per second it will stabilize + * to a value N*1000, where N is the rate of events per second. + * At constant rates between one per second and one per 32 seconds, + * it will be choppy, moving up on the seconds that have an event, + * and then decaying until the next event. At rates slower than + * about one in 32 seconds, it decays all the way back to zero between + * each event. + */ + +#define FM_COEF 933 /* coefficient for half-life of 10 secs */ +#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ +#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ +#define FM_SCALE 1000 /* faux fixed point scale */ + +/* Initialize a frequency meter */ +void fmeter_init(struct fmeter *fmp) +{ + fmp->cnt = 0; + fmp->val = 0; + fmp->time = 0; + spin_lock_init(&fmp->lock); +} + +/* Internal meter update - process cnt events and update value */ +static void fmeter_update(struct fmeter *fmp) +{ + time64_t now; + u32 ticks; + + now = ktime_get_seconds(); + ticks = now - fmp->time; + + if (ticks == 0) + return; + + ticks = min(FM_MAXTICKS, ticks); + while (ticks-- > 0) + fmp->val = (FM_COEF * fmp->val) / FM_SCALE; + fmp->time = now; + + fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; + fmp->cnt = 0; +} + +/* Process any previous ticks, then bump cnt by one (times scale). */ +static void fmeter_markevent(struct fmeter *fmp) +{ + spin_lock(&fmp->lock); + fmeter_update(fmp); + fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); + spin_unlock(&fmp->lock); +} + +/* Process any previous ticks, then return current value. */ +static int fmeter_getrate(struct fmeter *fmp) +{ + int val; + + spin_lock(&fmp->lock); + fmeter_update(fmp); + val = fmp->val; + spin_unlock(&fmp->lock); + return val; +} + +/* + * Collection of memory_pressure is suppressed unless + * this flag is enabled by writing "1" to the special + * cpuset file 'memory_pressure_enabled' in the root cpuset. + */ + +int cpuset_memory_pressure_enabled __read_mostly; + +/* + * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. + * + * Keep a running average of the rate of synchronous (direct) + * page reclaim efforts initiated by tasks in each cpuset. + * + * This represents the rate at which some task in the cpuset + * ran low on memory on all nodes it was allowed to use, and + * had to enter the kernels page reclaim code in an effort to + * create more free memory by tossing clean pages or swapping + * or writing dirty pages. + * + * Display to user space in the per-cpuset read-only file + * "memory_pressure". Value displayed is an integer + * representing the recent rate of entry into the synchronous + * (direct) page reclaim by any task attached to the cpuset. + */ + +void __cpuset_memory_pressure_bump(void) +{ + rcu_read_lock(); + fmeter_markevent(&task_cs(current)->fmeter); + rcu_read_unlock(); +} + +static int update_relax_domain_level(struct cpuset *cs, s64 val) +{ +#ifdef CONFIG_SMP + if (val < -1 || val > sched_domain_level_max + 1) + return -EINVAL; +#endif + + if (val != cs->relax_domain_level) { + cs->relax_domain_level = val; + if (!cpumask_empty(cs->cpus_allowed) && + is_sched_load_balance(cs)) + rebuild_sched_domains_locked(); + } + + return 0; +} + +static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + int retval = -ENODEV; + + cpus_read_lock(); + cpuset_lock(); + if (!is_cpuset_online(cs)) + goto out_unlock; + + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + retval = update_relax_domain_level(cs, val); + break; + default: + retval = -EINVAL; + break; + } +out_unlock: + cpuset_unlock(); + cpus_read_unlock(); + return retval; +} + +static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + return cs->relax_domain_level; + default: + BUG(); + } + + /* Unreachable but makes gcc happy */ + return 0; +} + +/* + * update task's spread flag if cpuset's page/slab spread flag is set + * + * Call with callback_lock or cpuset_mutex held. The check can be skipped + * if on default hierarchy. + */ +void cpuset1_update_task_spread_flags(struct cpuset *cs, + struct task_struct *tsk) +{ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + return; + + if (is_spread_page(cs)) + task_set_spread_page(tsk); + else + task_clear_spread_page(tsk); + + if (is_spread_slab(cs)) + task_set_spread_slab(tsk); + else + task_clear_spread_slab(tsk); +} + +/** + * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset. + * @cs: the cpuset in which each task's spread flags needs to be changed + * + * Iterate through each task of @cs updating its spread flags. As this + * function is called with cpuset_mutex held, cpuset membership stays + * stable. + */ +void cpuset1_update_tasks_flags(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + while ((task = css_task_iter_next(&it))) + cpuset1_update_task_spread_flags(cs, task); + css_task_iter_end(&it); +} + +/* + * If CPU and/or memory hotplug handlers, below, unplug any CPUs + * or memory nodes, we need to walk over the cpuset hierarchy, + * removing that CPU or node from all cpusets. If this removes the + * last CPU or node from a cpuset, then move the tasks in the empty + * cpuset to its next-highest non-empty parent. + */ +static void remove_tasks_in_empty_cpuset(struct cpuset *cs) +{ + struct cpuset *parent; + + /* + * Find its next-highest non-empty parent, (top cpuset + * has online cpus, so can't be empty). + */ + parent = parent_cs(cs); + while (cpumask_empty(parent->cpus_allowed) || + nodes_empty(parent->mems_allowed)) + parent = parent_cs(parent); + + if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { + pr_err("cpuset: failed to transfer tasks out of empty cpuset "); + pr_cont_cgroup_name(cs->css.cgroup); + pr_cont("\n"); + } +} + +static void cpuset_migrate_tasks_workfn(struct work_struct *work) +{ + struct cpuset_remove_tasks_struct *s; + + s = container_of(work, struct cpuset_remove_tasks_struct, work); + remove_tasks_in_empty_cpuset(s->cs); + css_put(&s->cs->css); + kfree(s); +} + +void cpuset1_hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) +{ + bool is_empty; + + cpuset_callback_lock_irq(); + cpumask_copy(cs->cpus_allowed, new_cpus); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->mems_allowed = *new_mems; + cs->effective_mems = *new_mems; + cpuset_callback_unlock_irq(); + + /* + * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty, + * as the tasks will be migrated to an ancestor. + */ + if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) + cpuset_update_tasks_cpumask(cs, new_cpus); + if (mems_updated && !nodes_empty(cs->mems_allowed)) + cpuset_update_tasks_nodemask(cs); + + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + + /* + * Move tasks to the nearest ancestor with execution resources, + * This is full cgroup operation which will also call back into + * cpuset. Execute it asynchronously using workqueue. + */ + if (is_empty && cs->css.cgroup->nr_populated_csets && + css_tryget_online(&cs->css)) { + struct cpuset_remove_tasks_struct *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (WARN_ON_ONCE(!s)) { + css_put(&cs->css); + return; + } + + s->cs = cs; + INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); + schedule_work(&s->work); + } +} + +/* + * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? + * + * One cpuset is a subset of another if all its allowed CPUs and + * Memory Nodes are a subset of the other, and its exclusive flags + * are only set if the other's are set. Call holding cpuset_mutex. + */ + +static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) +{ + return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + nodes_subset(p->mems_allowed, q->mems_allowed) && + is_cpu_exclusive(p) <= is_cpu_exclusive(q) && + is_mem_exclusive(p) <= is_mem_exclusive(q); +} + +/* + * cpuset1_validate_change() - Validate conditions specific to legacy (v1) + * behavior. + */ +int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) +{ + struct cgroup_subsys_state *css; + struct cpuset *c, *par; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* Each of our child cpusets must be a subset of us */ + ret = -EBUSY; + cpuset_for_each_child(c, css, cur) + if (!is_cpuset_subset(c, trial)) + goto out; + + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ + ret = -EACCES; + par = parent_cs(cur); + if (par && !is_cpuset_subset(trial, par)) + goto out; + + ret = 0; +out: + return ret; +} + +static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + + switch (type) { + case FILE_CPU_EXCLUSIVE: + return is_cpu_exclusive(cs); + case FILE_MEM_EXCLUSIVE: + return is_mem_exclusive(cs); + case FILE_MEM_HARDWALL: + return is_mem_hardwall(cs); + case FILE_SCHED_LOAD_BALANCE: + return is_sched_load_balance(cs); + case FILE_MEMORY_MIGRATE: + return is_memory_migrate(cs); + case FILE_MEMORY_PRESSURE_ENABLED: + return cpuset_memory_pressure_enabled; + case FILE_MEMORY_PRESSURE: + return fmeter_getrate(&cs->fmeter); + case FILE_SPREAD_PAGE: + return is_spread_page(cs); + case FILE_SPREAD_SLAB: + return is_spread_slab(cs); + default: + BUG(); + } + + /* Unreachable but makes gcc happy */ + return 0; +} + +static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + int retval = 0; + + cpus_read_lock(); + cpuset_lock(); + if (!is_cpuset_online(cs)) { + retval = -ENODEV; + goto out_unlock; + } + + switch (type) { + case FILE_CPU_EXCLUSIVE: + retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); + break; + case FILE_MEM_EXCLUSIVE: + retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); + break; + case FILE_MEM_HARDWALL: + retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); + break; + case FILE_SCHED_LOAD_BALANCE: + retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); + break; + case FILE_MEMORY_MIGRATE: + retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); + break; + case FILE_MEMORY_PRESSURE_ENABLED: + cpuset_memory_pressure_enabled = !!val; + break; + case FILE_SPREAD_PAGE: + retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); + break; + case FILE_SPREAD_SLAB: + retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); + break; + default: + retval = -EINVAL; + break; + } +out_unlock: + cpuset_unlock(); + cpus_read_unlock(); + return retval; +} + +/* + * for the common functions, 'private' gives the type of file + */ + +struct cftype cpuset1_files[] = { + { + .name = "cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_CPULIST, + }, + + { + .name = "mems", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * MAX_NUMNODES), + .private = FILE_MEMLIST, + }, + + { + .name = "effective_cpus", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_CPULIST, + }, + + { + .name = "effective_mems", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_MEMLIST, + }, + + { + .name = "cpu_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_CPU_EXCLUSIVE, + }, + + { + .name = "mem_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_EXCLUSIVE, + }, + + { + .name = "mem_hardwall", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_HARDWALL, + }, + + { + .name = "sched_load_balance", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SCHED_LOAD_BALANCE, + }, + + { + .name = "sched_relax_domain_level", + .read_s64 = cpuset_read_s64, + .write_s64 = cpuset_write_s64, + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, + }, + + { + .name = "memory_migrate", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_MIGRATE, + }, + + { + .name = "memory_pressure", + .read_u64 = cpuset_read_u64, + .private = FILE_MEMORY_PRESSURE, + }, + + { + .name = "memory_spread_page", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_PAGE, + }, + + { + /* obsolete, may be removed in the future */ + .name = "memory_spread_slab", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_SLAB, + }, + + { + .name = "memory_pressure_enabled", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_PRESSURE_ENABLED, + }, + + { } /* terminate */ +}; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4bd9e50bcc8e..a4dd285cdf39 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -22,11 +22,8 @@ * distribution for more details. */ #include "cgroup-internal.h" +#include "cpuset-internal.h" -#include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/cpuset.h> -#include <linux/delay.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel.h> @@ -40,10 +37,8 @@ #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/security.h> -#include <linux/spinlock.h> #include <linux/oom.h> #include <linux/sched/isolation.h> -#include <linux/cgroup.h> #include <linux/wait.h> #include <linux/workqueue.h> @@ -57,30 +52,6 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); */ DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); -/* See "Frequency meter" comments, below. */ - -struct fmeter { - int cnt; /* unprocessed events count */ - int val; /* most recent output value */ - time64_t time; /* clock (secs) when val computed */ - spinlock_t lock; /* guards read or write of above */ -}; - -/* - * Invalid partition error code - */ -enum prs_errcode { - PERR_NONE = 0, - PERR_INVCPUS, - PERR_INVPARENT, - PERR_NOTPART, - PERR_NOTEXCL, - PERR_NOCPUS, - PERR_HOTPLUG, - PERR_CPUSEMPTY, - PERR_HKEEPING, -}; - static const char * const perr_strings[] = { [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", [PERR_INVPARENT] = "Parent is an invalid partition root", @@ -90,133 +61,7 @@ static const char * const perr_strings[] = { [PERR_HOTPLUG] = "No cpu available due to hotplug", [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", -}; - -struct cpuset { - struct cgroup_subsys_state css; - - unsigned long flags; /* "unsigned long" so bitops work */ - - /* - * On default hierarchy: - * - * The user-configured masks can only be changed by writing to - * cpuset.cpus and cpuset.mems, and won't be limited by the - * parent masks. - * - * The effective masks is the real masks that apply to the tasks - * in the cpuset. They may be changed if the configured masks are - * changed or hotplug happens. - * - * effective_mask == configured_mask & parent's effective_mask, - * and if it ends up empty, it will inherit the parent's mask. - * - * - * On legacy hierarchy: - * - * The user-configured masks are always the same with effective masks. - */ - - /* user-configured CPUs and Memory Nodes allow to tasks */ - cpumask_var_t cpus_allowed; - nodemask_t mems_allowed; - - /* effective CPUs and Memory Nodes allow to tasks */ - cpumask_var_t effective_cpus; - nodemask_t effective_mems; - - /* - * Exclusive CPUs dedicated to current cgroup (default hierarchy only) - * - * The effective_cpus of a valid partition root comes solely from its - * effective_xcpus and some of the effective_xcpus may be distributed - * to sub-partitions below & hence excluded from its effective_cpus. - * For a valid partition root, its effective_cpus have no relationship - * with cpus_allowed unless its exclusive_cpus isn't set. - * - * This value will only be set if either exclusive_cpus is set or - * when this cpuset becomes a local partition root. - */ - cpumask_var_t effective_xcpus; - - /* - * Exclusive CPUs as requested by the user (default hierarchy only) - * - * Its value is independent of cpus_allowed and designates the set of - * CPUs that can be granted to the current cpuset or its children when - * it becomes a valid partition root. The effective set of exclusive - * CPUs granted (effective_xcpus) depends on whether those exclusive - * CPUs are passed down by its ancestors and not yet taken up by - * another sibling partition root along the way. - * - * If its value isn't set, it defaults to cpus_allowed. - */ - cpumask_var_t exclusive_cpus; - - /* - * This is old Memory Nodes tasks took on. - * - * - top_cpuset.old_mems_allowed is initialized to mems_allowed. - * - A new cpuset's old_mems_allowed is initialized when some - * task is moved into it. - * - old_mems_allowed is used in cpuset_migrate_mm() when we change - * cpuset.mems_allowed and have tasks' nodemask updated, and - * then old_mems_allowed is updated to mems_allowed. - */ - nodemask_t old_mems_allowed; - - struct fmeter fmeter; /* memory_pressure filter */ - - /* - * Tasks are being attached to this cpuset. Used to prevent - * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). - */ - int attach_in_progress; - - /* partition number for rebuild_sched_domains() */ - int pn; - - /* for custom sched domain */ - int relax_domain_level; - - /* number of valid local child partitions */ - int nr_subparts; - - /* partition root state */ - int partition_root_state; - - /* - * Default hierarchy only: - * use_parent_ecpus - set if using parent's effective_cpus - * child_ecpus_count - # of children with use_parent_ecpus set - */ - int use_parent_ecpus; - int child_ecpus_count; - - /* - * number of SCHED_DEADLINE tasks attached to this cpuset, so that we - * know when to rebuild associated root domain bandwidth information. - */ - int nr_deadline_tasks; - int nr_migrate_dl_tasks; - u64 sum_migrate_dl_bw; - - /* Invalid partition error code, not lock protected */ - enum prs_errcode prs_err; - - /* Handle for cpuset.cpus.partition */ - struct cgroup_file partition_file; - - /* Remote partition silbling list anchored at remote_children */ - struct list_head remote_sibling; -}; - -/* - * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously - */ -struct cpuset_remove_tasks_struct { - struct work_struct work; - struct cpuset *cs; + [PERR_ACCESS] = "Enable partition not permitted", }; /* @@ -229,6 +74,12 @@ static cpumask_var_t subpartitions_cpus; */ static cpumask_var_t isolated_cpus; +/* + * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot + */ +static cpumask_var_t boot_hk_cpus; +static bool have_boot_isolcpus; + /* List of remote partition root children */ static struct list_head remote_children; @@ -279,22 +130,6 @@ struct tmpmasks { cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ }; -static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct cpuset, css) : NULL; -} - -/* Retrieve the cpuset for a task */ -static inline struct cpuset *task_cs(struct task_struct *task) -{ - return css_cs(task_css(task, cpuset_cgrp_id)); -} - -static inline struct cpuset *parent_cs(struct cpuset *cs) -{ - return css_cs(cs->css.parent); -} - void inc_dl_tasks_cs(struct task_struct *p) { struct cpuset *cs = task_cs(p); @@ -309,59 +144,6 @@ void dec_dl_tasks_cs(struct task_struct *p) cs->nr_deadline_tasks--; } -/* bits in struct cpuset flags field */ -typedef enum { - CS_ONLINE, - CS_CPU_EXCLUSIVE, - CS_MEM_EXCLUSIVE, - CS_MEM_HARDWALL, - CS_MEMORY_MIGRATE, - CS_SCHED_LOAD_BALANCE, - CS_SPREAD_PAGE, - CS_SPREAD_SLAB, -} cpuset_flagbits_t; - -/* convenient tests for these bits */ -static inline bool is_cpuset_online(struct cpuset *cs) -{ - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); -} - -static inline int is_cpu_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_hardwall(const struct cpuset *cs) -{ - return test_bit(CS_MEM_HARDWALL, &cs->flags); -} - -static inline int is_sched_load_balance(const struct cpuset *cs) -{ - return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); -} - -static inline int is_memory_migrate(const struct cpuset *cs) -{ - return test_bit(CS_MEMORY_MIGRATE, &cs->flags); -} - -static inline int is_spread_page(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_PAGE, &cs->flags); -} - -static inline int is_spread_slab(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_SLAB, &cs->flags); -} - static inline int is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; @@ -403,34 +185,6 @@ static struct cpuset top_cpuset = { .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; -/** - * cpuset_for_each_child - traverse online children of a cpuset - * @child_cs: loop cursor pointing to the current child - * @pos_css: used for iteration - * @parent_cs: target cpuset to walk children of - * - * Walk @child_cs through the online children of @parent_cs. Must be used - * with RCU read locked. - */ -#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ - css_for_each_child((pos_css), &(parent_cs)->css) \ - if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) - -/** - * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants - * @des_cs: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @root_cs: target cpuset to walk ancestor of - * - * Walk @des_cs through the online descendants of @root_cs. Must be used - * with RCU read locked. The caller may modify @pos_css by calling - * css_rightmost_descendant() to skip subtree. @root_cs is included in the - * iteration and the first node to be visited. - */ -#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ - css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ - if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) - /* * There are two global locks guarding cpuset structures - cpuset_mutex and * callback_lock. We also require taking task_lock() when dereferencing a @@ -484,6 +238,16 @@ void cpuset_unlock(void) static DEFINE_SPINLOCK(callback_lock); +void cpuset_callback_lock_irq(void) +{ + spin_lock_irq(&callback_lock); +} + +void cpuset_callback_unlock_irq(void) +{ + spin_unlock_irq(&callback_lock); +} + static struct workqueue_struct *cpuset_migrate_mm_wq; static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); @@ -500,6 +264,26 @@ static inline void check_insane_mems_config(nodemask_t *nodes) } /* + * decrease cs->attach_in_progress. + * wake_up cpuset_attach_wq if cs->attach_in_progress==0. + */ +static inline void dec_attach_in_progress_locked(struct cpuset *cs) +{ + lockdep_assert_held(&cpuset_mutex); + + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +} + +static inline void dec_attach_in_progress(struct cpuset *cs) +{ + mutex_lock(&cpuset_mutex); + dec_attach_in_progress_locked(cs); + mutex_unlock(&cpuset_mutex); +} + +/* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. @@ -596,45 +380,6 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } -/* - * update task's spread flag if cpuset's page/slab spread flag is set - * - * Call with callback_lock or cpuset_mutex held. The check can be skipped - * if on default hierarchy. - */ -static void cpuset_update_task_spread_flags(struct cpuset *cs, - struct task_struct *tsk) -{ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) - return; - - if (is_spread_page(cs)) - task_set_spread_page(tsk); - else - task_clear_spread_page(tsk); - - if (is_spread_slab(cs)) - task_set_spread_slab(tsk); - else - task_clear_spread_slab(tsk); -} - -/* - * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? - * - * One cpuset is a subset of another if all its allowed CPUs and - * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cpuset_mutex. - */ - -static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) -{ - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && - nodes_subset(p->mems_allowed, q->mems_allowed) && - is_cpu_exclusive(p) <= is_cpu_exclusive(q) && - is_mem_exclusive(p) <= is_mem_exclusive(q); -} - /** * alloc_cpumasks - allocate three cpumasks for cpuset * @cs: the cpuset that have cpumasks to be allocated. @@ -750,13 +495,6 @@ static inline bool xcpus_empty(struct cpuset *cs) cpumask_empty(cs->exclusive_cpus); } -static inline struct cpumask *fetch_xcpus(struct cpuset *cs) -{ - return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : - cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed - : cs->effective_xcpus; -} - /* * cpusets_are_exclusive() - check if two cpusets are exclusive * @@ -764,8 +502,8 @@ static inline struct cpumask *fetch_xcpus(struct cpuset *cs) */ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) { - struct cpumask *xcpus1 = fetch_xcpus(cs1); - struct cpumask *xcpus2 = fetch_xcpus(cs2); + struct cpumask *xcpus1 = user_xcpus(cs1); + struct cpumask *xcpus2 = user_xcpus(cs2); if (cpumask_intersects(xcpus1, xcpus2)) return false; @@ -773,35 +511,6 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) } /* - * validate_change_legacy() - Validate conditions specific to legacy (v1) - * behavior. - */ -static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) -{ - struct cgroup_subsys_state *css; - struct cpuset *c, *par; - int ret; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* Each of our child cpusets must be a subset of us */ - ret = -EBUSY; - cpuset_for_each_child(c, css, cur) - if (!is_cpuset_subset(c, trial)) - goto out; - - /* On legacy hierarchy, we must be a subset of our parent cpuset. */ - ret = -EACCES; - par = parent_cs(cur); - if (par && !is_cpuset_subset(trial, par)) - goto out; - - ret = 0; -out: - return ret; -} - -/* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. * @@ -830,7 +539,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) rcu_read_lock(); if (!is_in_v2_mode()) - ret = validate_change_legacy(cur, trial); + ret = cpuset1_validate_change(cur, trial); if (ret) goto out; @@ -996,18 +705,15 @@ static inline int nr_cpusets(void) * were changed (added or removed.) * * Finding the best partition (set of domains): - * The triple nested loops below over i, j, k scan over the - * load balanced cpusets (using the array of cpuset pointers in - * csa[]) looking for pairs of cpusets that have overlapping - * cpus_allowed, but which don't have the same 'pn' partition - * number and gives them in the same partition number. It keeps - * looping on the 'restart' label until it can no longer find - * any such pairs. + * The double nested loops below over i, j scan over the load + * balanced cpusets (using the array of cpuset pointers in csa[]) + * looking for pairs of cpusets that have overlapping cpus_allowed + * and merging them using a union-find algorithm. + * + * The union of the cpus_allowed masks from the set of all cpusets + * having the same root then form the one element of the partition + * (one sched domain) to be passed to partition_sched_domains(). * - * The union of the cpus_allowed masks from the set of - * all cpusets having the same 'pn' value then form the one - * element of the partition (one sched domain) to be passed to - * partition_sched_domains(). */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) @@ -1015,7 +721,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ - int i, j, k; /* indices for partition finding loops */ + int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ @@ -1023,6 +729,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); + int nslot_update; doms = NULL; dattr = NULL; @@ -1111,32 +818,28 @@ v2: goto single_root_domain; for (i = 0; i < csn; i++) - csa[i]->pn = i; - ndoms = csn; + uf_node_init(&csa[i]->node); -restart: - /* Find the best partition (set of sched domains) */ + /* Merge overlapping cpusets */ for (i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - int apn = a->pn; - - for (j = 0; j < csn; j++) { - struct cpuset *b = csa[j]; - int bpn = b->pn; - - if (apn != bpn && cpusets_overlap(a, b)) { - for (k = 0; k < csn; k++) { - struct cpuset *c = csa[k]; - - if (c->pn == bpn) - c->pn = apn; - } - ndoms--; /* one less element */ - goto restart; + for (j = i + 1; j < csn; j++) { + if (cpusets_overlap(csa[i], csa[j])) { + /* + * Cgroup v2 shouldn't pass down overlapping + * partition root cpusets. + */ + WARN_ON_ONCE(cgrpv2); + uf_union(&csa[i]->node, &csa[j]->node); } } } + /* Count the total number of domains */ + for (i = 0; i < csn; i++) { + if (uf_find(&csa[i]->node) == &csa[i]->node) + ndoms++; + } + /* * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. @@ -1167,44 +870,25 @@ restart: } for (nslot = 0, i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - struct cpumask *dp; - int apn = a->pn; - - if (apn < 0) { - /* Skip completed partitions */ - continue; - } - - dp = doms[nslot]; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; - } - - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; + nslot_update = 0; for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpumask_or(dp, dp, b->effective_cpus); + if (uf_find(&csa[j]->node) == &csa[i]->node) { + struct cpumask *dp = doms[nslot]; + + if (i == j) { + nslot_update = 1; + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + } + cpumask_or(dp, dp, csa[j]->effective_cpus); cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (dattr) - update_domain_attr_tree(dattr + nslot, b); - - /* Done with this partition */ - b->pn = -1; + update_domain_attr_tree(dattr + nslot, csa[j]); } } - nslot++; + if (nslot_update) + nslot++; } BUG_ON(nslot != ndoms); @@ -1296,7 +980,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * * Call with cpuset_mutex held. Takes cpus_read_lock(). */ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; @@ -1348,7 +1032,7 @@ static void rebuild_sched_domains_locked(void) partition_and_rebuild_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { } #endif /* CONFIG_SMP */ @@ -1368,7 +1052,7 @@ void rebuild_sched_domains(void) } /** - * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. + * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @new_cpus: the temp variable for the new effective_cpus mask * @@ -1378,7 +1062,7 @@ void rebuild_sched_domains(void) * is used instead of effective_cpus to make sure all offline CPUs are also * included as hotplug code won't update cpumasks for tasks in top_cpuset. */ -static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) +void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; @@ -1428,8 +1112,6 @@ enum partition_cmd { partcmd_invalidate, /* Make partition invalid */ }; -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, - int turning_on); static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct tmpmasks *tmp); @@ -1443,11 +1125,11 @@ static int update_partition_exclusive(struct cpuset *cs, int new_prs) bool exclusive = (new_prs > PRS_MEMBER); if (exclusive && !is_cpu_exclusive(cs)) { - if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) + if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1)) return PERR_NOTEXCL; } else if (!exclusive && is_cpu_exclusive(cs)) { /* Turning off CS_CPU_EXCLUSIVE will not return error */ - update_flag(CS_CPU_EXCLUSIVE, cs, 0); + cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0); } return 0; } @@ -1516,12 +1198,8 @@ static void reset_partition_data(struct cpuset *cs) if (is_cpu_exclusive(cs)) clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); } - if (!cpumask_and(cs->effective_cpus, - parent->effective_cpus, cs->cpus_allowed)) { - cs->use_parent_ecpus = true; - parent->child_ecpus_count++; + if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed)) cpumask_copy(cs->effective_cpus, parent->effective_cpus); - } } /* @@ -1662,7 +1340,7 @@ static inline bool is_local_partition(struct cpuset *cs) * @cs: the cpuset to update * @new_prs: new partition_root_state * @tmp: temparary masks - * Return: 1 if successful, 0 if error + * Return: 0 if successful, errcode if error * * Enable the current cpuset to become a remote partition root taking CPUs * directly from the top cpuset. cpuset_mutex must be held by the caller. @@ -1676,7 +1354,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, * The user must have sysadmin privilege. */ if (!capable(CAP_SYS_ADMIN)) - return 0; + return PERR_ACCESS; /* * The requested exclusive_cpus must not be allocated to other @@ -1690,26 +1368,20 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, if (cpumask_empty(tmp->new_cpus) || cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) - return 0; + return PERR_INVCPUS; spin_lock_irq(&callback_lock); isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); - if (cs->use_parent_ecpus) { - struct cpuset *parent = parent_cs(cs); - - cs->use_parent_ecpus = false; - parent->child_ecpus_count--; - } spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); - return 1; + return 0; } /* @@ -1743,7 +1415,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); } @@ -1795,7 +1467,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); return; @@ -1850,15 +1522,15 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, * @new_cpus: cpu mask * Return: true if there is conflict, false otherwise * - * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in - * an isolated partition. + * CPUs outside of boot_hk_cpus, if defined, can only be used in an + * isolated partition. */ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) { - const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN); - bool all_in_hk = cpumask_subset(new_cpus, hk_domain); + if (!have_boot_isolcpus) + return false; - if (!all_in_hk && (prstate != PRS_ISOLATED)) + if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) return true; return false; @@ -2167,7 +1839,7 @@ write_error: update_partition_exclusive(cs, new_prs); if (adding || deleting) { - update_tasks_cpumask(parent, tmp->addmask); + cpuset_update_tasks_cpumask(parent, tmp->addmask); update_sibling_cpumasks(parent, cs, tmp); } @@ -2325,17 +1997,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * it is a partition root that has explicitly distributed * out all its CPUs. */ - if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) { + if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) cpumask_copy(tmp->new_cpus, parent->effective_cpus); - if (!cp->use_parent_ecpus) { - cp->use_parent_ecpus = true; - parent->child_ecpus_count++; - } - } else if (cp->use_parent_ecpus) { - cp->use_parent_ecpus = false; - WARN_ON_ONCE(!parent->child_ecpus_count); - parent->child_ecpus_count--; - } if (remote) goto get_css; @@ -2359,7 +2022,7 @@ update_parent_effective: /* * update_parent_effective_cpumask() should have been called * for cs already in update_cpumask(). We should also call - * update_tasks_cpumask() again for tasks in the parent + * cpuset_update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's effective_cpus changes. */ if ((cp != cs) && old_prs) { @@ -2416,7 +2079,7 @@ get_css: WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - update_tasks_cpumask(cp, cp->effective_cpus); + cpuset_update_tasks_cpumask(cp, cp->effective_cpus); /* * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE @@ -2472,8 +2135,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * Check all its siblings and call update_cpumasks_hier() * if their effective_cpus will need to be changed. * - * With the addition of effective_xcpus which is a subset of - * cpus_allowed. It is possible a change in parent's effective_cpus + * It is possible a change in parent's effective_cpus * due to a change in a child partition's effective_xcpus will impact * its siblings even if they do not inherit parent's effective_cpus * directly. @@ -2487,8 +2149,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, cpuset_for_each_child(sibling, pos_css, parent) { if (sibling == cs) continue; - if (!sibling->use_parent_ecpus && - !is_partition_valid(sibling)) { + if (!is_partition_valid(sibling)) { compute_effective_cpumask(tmp->new_cpus, sibling, parent); if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) @@ -2598,7 +2259,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, invalidate = true; rcu_read_lock(); cpuset_for_each_child(cp, css, parent) { - struct cpumask *xcpus = fetch_xcpus(trialcs); + struct cpumask *xcpus = user_xcpus(trialcs); if (is_partition_valid(cp) && cpumask_intersects(xcpus, cp->effective_xcpus)) { @@ -2845,14 +2506,14 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, static void *cpuset_being_rebound; /** - * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. + * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. */ -static void update_tasks_nodemask(struct cpuset *cs) +void cpuset_update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct css_task_iter it; @@ -2950,7 +2611,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); - update_tasks_nodemask(cp); + cpuset_update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); @@ -3036,44 +2697,8 @@ bool current_cpuset_is_being_rebound(void) return ret; } -static int update_relax_domain_level(struct cpuset *cs, s64 val) -{ -#ifdef CONFIG_SMP - if (val < -1 || val > sched_domain_level_max + 1) - return -EINVAL; -#endif - - if (val != cs->relax_domain_level) { - cs->relax_domain_level = val; - if (!cpumask_empty(cs->cpus_allowed) && - is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); - } - - return 0; -} - -/** - * update_tasks_flags - update the spread flags of tasks in the cpuset. - * @cs: the cpuset in which each task's spread flags needs to be changed - * - * Iterate through each task of @cs updating its spread flags. As this - * function is called with cpuset_mutex held, cpuset membership stays - * stable. - */ -static void update_tasks_flags(struct cpuset *cs) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&cs->css, 0, &it); - while ((task = css_task_iter_next(&it))) - cpuset_update_task_spread_flags(cs, task); - css_task_iter_end(&it); -} - /* - * update_flag - read a 0 or a 1 in a file and update associated flag + * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (see cpuset_flagbits_t) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared @@ -3081,7 +2706,7 @@ static void update_tasks_flags(struct cpuset *cs) * Call with cpuset_mutex held. */ -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, +int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { struct cpuset *trialcs; @@ -3117,7 +2742,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, rebuild_sched_domains_locked(); if (spread_flag_changed) - update_tasks_flags(cs); + cpuset1_update_tasks_flags(cs); out: free_cpuset(trialcs); return err; @@ -3166,9 +2791,6 @@ static int update_prstate(struct cpuset *cs, int new_prs) goto out; if (!old_prs) { - enum partition_cmd cmd = (new_prs == PRS_ROOT) - ? partcmd_enable : partcmd_enablei; - /* * cpus_allowed and exclusive_cpus cannot be both empty. */ @@ -3177,13 +2799,18 @@ static int update_prstate(struct cpuset *cs, int new_prs) goto out; } - err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); /* - * If an attempt to become local partition root fails, - * try to become a remote partition root instead. + * If parent is valid partition, enable local partiion. + * Otherwise, enable a remote partition. */ - if (err && remote_partition_enable(cs, new_prs, &tmpmask)) - err = 0; + if (is_partition_valid(parent)) { + enum partition_cmd cmd = (new_prs == PRS_ROOT) + ? partcmd_enable : partcmd_enablei; + + err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); + } else { + err = remote_partition_enable(cs, new_prs, &tmpmask); + } } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. @@ -3236,107 +2863,6 @@ out: return 0; } -/* - * Frequency meter - How fast is some event occurring? - * - * These routines manage a digitally filtered, constant time based, - * event frequency meter. There are four routines: - * fmeter_init() - initialize a frequency meter. - * fmeter_markevent() - called each time the event happens. - * fmeter_getrate() - returns the recent rate of such events. - * fmeter_update() - internal routine used to update fmeter. - * - * A common data structure is passed to each of these routines, - * which is used to keep track of the state required to manage the - * frequency meter and its digital filter. - * - * The filter works on the number of events marked per unit time. - * The filter is single-pole low-pass recursive (IIR). The time unit - * is 1 second. Arithmetic is done using 32-bit integers scaled to - * simulate 3 decimal digits of precision (multiplied by 1000). - * - * With an FM_COEF of 933, and a time base of 1 second, the filter - * has a half-life of 10 seconds, meaning that if the events quit - * happening, then the rate returned from the fmeter_getrate() - * will be cut in half each 10 seconds, until it converges to zero. - * - * It is not worth doing a real infinitely recursive filter. If more - * than FM_MAXTICKS ticks have elapsed since the last filter event, - * just compute FM_MAXTICKS ticks worth, by which point the level - * will be stable. - * - * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid - * arithmetic overflow in the fmeter_update() routine. - * - * Given the simple 32 bit integer arithmetic used, this meter works - * best for reporting rates between one per millisecond (msec) and - * one per 32 (approx) seconds. At constant rates faster than one - * per msec it maxes out at values just under 1,000,000. At constant - * rates between one per msec, and one per second it will stabilize - * to a value N*1000, where N is the rate of events per second. - * At constant rates between one per second and one per 32 seconds, - * it will be choppy, moving up on the seconds that have an event, - * and then decaying until the next event. At rates slower than - * about one in 32 seconds, it decays all the way back to zero between - * each event. - */ - -#define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ -#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ -#define FM_SCALE 1000 /* faux fixed point scale */ - -/* Initialize a frequency meter */ -static void fmeter_init(struct fmeter *fmp) -{ - fmp->cnt = 0; - fmp->val = 0; - fmp->time = 0; - spin_lock_init(&fmp->lock); -} - -/* Internal meter update - process cnt events and update value */ -static void fmeter_update(struct fmeter *fmp) -{ - time64_t now; - u32 ticks; - - now = ktime_get_seconds(); - ticks = now - fmp->time; - - if (ticks == 0) - return; - - ticks = min(FM_MAXTICKS, ticks); - while (ticks-- > 0) - fmp->val = (FM_COEF * fmp->val) / FM_SCALE; - fmp->time = now; - - fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; - fmp->cnt = 0; -} - -/* Process any previous ticks, then bump cnt by one (times scale). */ -static void fmeter_markevent(struct fmeter *fmp) -{ - spin_lock(&fmp->lock); - fmeter_update(fmp); - fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); - spin_unlock(&fmp->lock); -} - -/* Process any previous ticks, then return current value. */ -static int fmeter_getrate(struct fmeter *fmp) -{ - int val; - - spin_lock(&fmp->lock); - fmeter_update(fmp); - val = fmp->val; - spin_unlock(&fmp->lock); - return val; -} - static struct cpuset *cpuset_attach_old_cs; /* @@ -3445,9 +2971,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) cs = css_cs(css); mutex_lock(&cpuset_mutex); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); + dec_attach_in_progress_locked(cs); if (cs->nr_migrate_dl_tasks) { int cpu = cpumask_any(cs->effective_cpus); @@ -3483,7 +3007,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flags(cs, task); + cpuset1_update_task_spread_flags(cs, task); } static void cpuset_attach(struct cgroup_taskset *tset) @@ -3562,116 +3086,15 @@ out: reset_migrate_dl_data(cs); } - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - - mutex_unlock(&cpuset_mutex); -} - -/* The various types of files and directories in a cpuset file system */ - -typedef enum { - FILE_MEMORY_MIGRATE, - FILE_CPULIST, - FILE_MEMLIST, - FILE_EFFECTIVE_CPULIST, - FILE_EFFECTIVE_MEMLIST, - FILE_SUBPARTS_CPULIST, - FILE_EXCLUSIVE_CPULIST, - FILE_EFFECTIVE_XCPULIST, - FILE_ISOLATED_CPULIST, - FILE_CPU_EXCLUSIVE, - FILE_MEM_EXCLUSIVE, - FILE_MEM_HARDWALL, - FILE_SCHED_LOAD_BALANCE, - FILE_PARTITION_ROOT, - FILE_SCHED_RELAX_DOMAIN_LEVEL, - FILE_MEMORY_PRESSURE_ENABLED, - FILE_MEMORY_PRESSURE, - FILE_SPREAD_PAGE, - FILE_SPREAD_SLAB, -} cpuset_filetype_t; - -static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = 0; + dec_attach_in_progress_locked(cs); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) { - retval = -ENODEV; - goto out_unlock; - } - - switch (type) { - case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); - break; - case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); - break; - case FILE_MEM_HARDWALL: - retval = update_flag(CS_MEM_HARDWALL, cs, val); - break; - case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); - break; - case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, val); - break; - case FILE_MEMORY_PRESSURE_ENABLED: - cpuset_memory_pressure_enabled = !!val; - break; - case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, val); - break; - case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - return retval; -} - -static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, - s64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = -ENODEV; - - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) - goto out_unlock; - - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - retval = update_relax_domain_level(cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - return retval; } /* * Common handling for a write to a "cpus" or "mems" file. */ -static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, +ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); @@ -3746,7 +3169,7 @@ out_unlock: * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. */ -static int cpuset_common_seq_show(struct seq_file *sf, void *v) +int cpuset_common_seq_show(struct seq_file *sf, void *v) { struct cpuset *cs = css_cs(seq_css(sf)); cpuset_filetype_t type = seq_cft(sf)->private; @@ -3787,52 +3210,6 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) return ret; } -static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_CPU_EXCLUSIVE: - return is_cpu_exclusive(cs); - case FILE_MEM_EXCLUSIVE: - return is_mem_exclusive(cs); - case FILE_MEM_HARDWALL: - return is_mem_hardwall(cs); - case FILE_SCHED_LOAD_BALANCE: - return is_sched_load_balance(cs); - case FILE_MEMORY_MIGRATE: - return is_memory_migrate(cs); - case FILE_MEMORY_PRESSURE_ENABLED: - return cpuset_memory_pressure_enabled; - case FILE_MEMORY_PRESSURE: - return fmeter_getrate(&cs->fmeter); - case FILE_SPREAD_PAGE: - return is_spread_page(cs); - case FILE_SPREAD_SLAB: - return is_spread_slab(cs); - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - -static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - return cs->relax_domain_level; - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - static int sched_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); @@ -3897,113 +3274,6 @@ out_unlock: } /* - * for the common functions, 'private' gives the type of file - */ - -static struct cftype legacy_files[] = { - { - .name = "cpus", - .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * NR_CPUS), - .private = FILE_CPULIST, - }, - - { - .name = "mems", - .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * MAX_NUMNODES), - .private = FILE_MEMLIST, - }, - - { - .name = "effective_cpus", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_CPULIST, - }, - - { - .name = "effective_mems", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_MEMLIST, - }, - - { - .name = "cpu_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_CPU_EXCLUSIVE, - }, - - { - .name = "mem_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_EXCLUSIVE, - }, - - { - .name = "mem_hardwall", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_HARDWALL, - }, - - { - .name = "sched_load_balance", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SCHED_LOAD_BALANCE, - }, - - { - .name = "sched_relax_domain_level", - .read_s64 = cpuset_read_s64, - .write_s64 = cpuset_write_s64, - .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, - }, - - { - .name = "memory_migrate", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_MIGRATE, - }, - - { - .name = "memory_pressure", - .read_u64 = cpuset_read_u64, - .private = FILE_MEMORY_PRESSURE, - }, - - { - .name = "memory_spread_page", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_PAGE, - }, - - { - /* obsolete, may be removed in the future */ - .name = "memory_spread_slab", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_SLAB, - }, - - { - .name = "memory_pressure_enabled", - .flags = CFTYPE_ONLY_ON_ROOT, - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE_ENABLED, - }, - - { } /* terminate */ -}; - -/* * This is currently a minimal set for the default hierarchy. It can be * expanded later on by migrating more features and control files from v1. */ @@ -4150,8 +3420,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; - cs->use_parent_ecpus = true; - parent->child_ecpus_count++; } spin_unlock_irq(&callback_lock); @@ -4215,14 +3483,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - - if (cs->use_parent_ecpus) { - struct cpuset *parent = parent_cs(cs); - - cs->use_parent_ecpus = false; - parent->child_ecpus_count--; - } + cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); @@ -4312,11 +3573,7 @@ static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) if (same_cs) return; - mutex_lock(&cpuset_mutex); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - mutex_unlock(&cpuset_mutex); + dec_attach_in_progress(cs); } /* @@ -4348,10 +3605,7 @@ static void cpuset_fork(struct task_struct *task) guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cpuset_attach_task(cs, task); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - + dec_attach_in_progress_locked(cs); mutex_unlock(&cpuset_mutex); } @@ -4368,7 +3622,9 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_fork = cpuset_can_fork, .cancel_fork = cpuset_cancel_fork, .fork = cpuset_fork, - .legacy_cftypes = legacy_files, +#ifdef CONFIG_CPUSETS_V1 + .legacy_cftypes = cpuset1_files, +#endif .dfl_cftypes = dfl_files, .early_init = true, .threaded = true, @@ -4401,91 +3657,14 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); - return 0; -} - -/* - * If CPU and/or memory hotplug handlers, below, unplug any CPUs - * or memory nodes, we need to walk over the cpuset hierarchy, - * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then move the tasks in the empty - * cpuset to its next-highest non-empty parent. - */ -static void remove_tasks_in_empty_cpuset(struct cpuset *cs) -{ - struct cpuset *parent; - - /* - * Find its next-highest non-empty parent, (top cpuset - * has online cpus, so can't be empty). - */ - parent = parent_cs(cs); - while (cpumask_empty(parent->cpus_allowed) || - nodes_empty(parent->mems_allowed)) - parent = parent_cs(parent); - - if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - pr_err("cpuset: failed to transfer tasks out of empty cpuset "); - pr_cont_cgroup_name(cs->css.cgroup); - pr_cont("\n"); + have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN); + if (have_boot_isolcpus) { + BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL)); + cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus); } -} - -static void cpuset_migrate_tasks_workfn(struct work_struct *work) -{ - struct cpuset_remove_tasks_struct *s; - - s = container_of(work, struct cpuset_remove_tasks_struct, work); - remove_tasks_in_empty_cpuset(s->cs); - css_put(&s->cs->css); - kfree(s); -} - -static void -hotplug_update_tasks_legacy(struct cpuset *cs, - struct cpumask *new_cpus, nodemask_t *new_mems, - bool cpus_updated, bool mems_updated) -{ - bool is_empty; - - spin_lock_irq(&callback_lock); - cpumask_copy(cs->cpus_allowed, new_cpus); - cpumask_copy(cs->effective_cpus, new_cpus); - cs->mems_allowed = *new_mems; - cs->effective_mems = *new_mems; - spin_unlock_irq(&callback_lock); - - /* - * Don't call update_tasks_cpumask() if the cpuset becomes empty, - * as the tasks will be migrated to an ancestor. - */ - if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) - update_tasks_cpumask(cs, new_cpus); - if (mems_updated && !nodes_empty(cs->mems_allowed)) - update_tasks_nodemask(cs); - - is_empty = cpumask_empty(cs->cpus_allowed) || - nodes_empty(cs->mems_allowed); - - /* - * Move tasks to the nearest ancestor with execution resources, - * This is full cgroup operation which will also call back into - * cpuset. Execute it asynchronously using workqueue. - */ - if (is_empty && cs->css.cgroup->nr_populated_csets && - css_tryget_online(&cs->css)) { - struct cpuset_remove_tasks_struct *s; - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (WARN_ON_ONCE(!s)) { - css_put(&cs->css); - return; - } - - s->cs = cs; - INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); - schedule_work(&s->work); - } + return 0; } static void @@ -4505,9 +3684,9 @@ hotplug_update_tasks(struct cpuset *cs, spin_unlock_irq(&callback_lock); if (cpus_updated) - update_tasks_cpumask(cs, new_cpus); + cpuset_update_tasks_cpumask(cs, new_cpus); if (mems_updated) - update_tasks_nodemask(cs); + cpuset_update_tasks_nodemask(cs); } void cpuset_force_rebuild(void) @@ -4608,7 +3787,7 @@ update_tasks: hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else - hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, + cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); unlock: @@ -4693,7 +3872,7 @@ static void cpuset_handle_hotplug(void) top_cpuset.mems_allowed = new_mems; top_cpuset.effective_mems = new_mems; spin_unlock_irq(&callback_lock); - update_tasks_nodemask(&top_cpuset); + cpuset_update_tasks_nodemask(&top_cpuset); } mutex_unlock(&cpuset_mutex); @@ -5033,19 +4212,6 @@ int cpuset_mem_spread_node(void) } /** - * cpuset_slab_spread_node() - On which node to begin search for a slab page - */ -int cpuset_slab_spread_node(void) -{ - if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) - current->cpuset_slab_spread_rotor = - node_random(¤t->mems_allowed); - - return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); -} -EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); - -/** * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? * @tsk1: pointer to task_struct of some task. * @tsk2: pointer to task_struct of some other task. @@ -5083,39 +4249,6 @@ void cpuset_print_current_mems_allowed(void) rcu_read_unlock(); } -/* - * Collection of memory_pressure is suppressed unless - * this flag is enabled by writing "1" to the special - * cpuset file 'memory_pressure_enabled' in the root cpuset. - */ - -int cpuset_memory_pressure_enabled __read_mostly; - -/* - * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. - * - * Keep a running average of the rate of synchronous (direct) - * page reclaim efforts initiated by tasks in each cpuset. - * - * This represents the rate at which some task in the cpuset - * ran low on memory on all nodes it was allowed to use, and - * had to enter the kernels page reclaim code in an effort to - * create more free memory by tossing clean pages or swapping - * or writing dirty pages. - * - * Display to user space in the per-cpuset read-only file - * "memory_pressure". Value displayed is an integer - * representing the recent rate of entry into the synchronous - * (direct) page reclaim by any task attached to the cpuset. - */ - -void __cpuset_memory_pressure_bump(void) -{ - rcu_read_lock(); - fmeter_markevent(&task_cs(current)->fmeter); - rcu_read_unlock(); -} - #ifdef CONFIG_PROC_PID_CPUSET /* * proc_cpuset_show() diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index f5cb0ec45b9d..8f61114c36dd 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -244,7 +244,6 @@ static void pids_event(struct pids_cgroup *pids_forking, struct pids_cgroup *pids_over_limit) { struct pids_cgroup *p = pids_forking; - bool limit = false; /* Only log the first time limit is hit. */ if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) { @@ -252,20 +251,17 @@ static void pids_event(struct pids_cgroup *pids_forking, pr_cont_cgroup_path(p->css.cgroup); pr_cont("\n"); } - cgroup_file_notify(&p->events_local_file); if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) || - cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) { + cgroup_file_notify(&p->events_local_file); return; + } - for (; parent_pids(p); p = parent_pids(p)) { - if (p == pids_over_limit) { - limit = true; - atomic64_inc(&p->events_local[PIDCG_MAX]); - cgroup_file_notify(&p->events_local_file); - } - if (limit) - atomic64_inc(&p->events[PIDCG_MAX]); + atomic64_inc(&pids_over_limit->events_local[PIDCG_MAX]); + cgroup_file_notify(&pids_over_limit->events_local_file); + for (p = pids_over_limit; parent_pids(p); p = parent_pids(p)) { + atomic64_inc(&p->events[PIDCG_MAX]); cgroup_file_notify(&p->events_file); } } @@ -276,15 +272,10 @@ static void pids_event(struct pids_cgroup *pids_forking, */ static int pids_can_fork(struct task_struct *task, struct css_set *cset) { - struct cgroup_subsys_state *css; struct pids_cgroup *pids, *pids_over_limit; int err; - if (cset) - css = cset->subsys[pids_cgrp_id]; - else - css = task_css_check(current, pids_cgrp_id, true); - pids = css_pids(css); + pids = css_pids(cset->subsys[pids_cgrp_id]); err = pids_try_charge(pids, 1, &pids_over_limit); if (err) pids_event(pids, pids_over_limit); @@ -294,14 +285,9 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset) static void pids_cancel_fork(struct task_struct *task, struct css_set *cset) { - struct cgroup_subsys_state *css; struct pids_cgroup *pids; - if (cset) - css = cset->subsys[pids_cgrp_id]; - else - css = task_css_check(current, pids_cgrp_id, true); - pids = css_pids(css); + pids = css_pids(cset->subsys[pids_cgrp_id]); pids_uncharge(pids, 1); } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 24b1e1143260..938c48952d26 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -28,34 +28,34 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #ifdef CONFIG_CONTEXT_TRACKING_IDLE - .dynticks_nesting = 1, - .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, + .nesting = 1, + .nmi_nesting = CT_NESTING_IRQ_NONIDLE, #endif - .state = ATOMIC_INIT(RCU_DYNTICKS_IDX), + .state = ATOMIC_INIT(CT_RCU_WATCHING), }; EXPORT_SYMBOL_GPL(context_tracking); #ifdef CONFIG_CONTEXT_TRACKING_IDLE #define TPS(x) tracepoint_string(x) -/* Record the current task on dyntick-idle entry. */ -static __always_inline void rcu_dynticks_task_enter(void) +/* Record the current task on exiting RCU-tasks (dyntick-idle entry). */ +static __always_inline void rcu_task_exit(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } -/* Record no current task on dyntick-idle exit. */ -static __always_inline void rcu_dynticks_task_exit(void) +/* Record no current task on entering RCU-tasks (dyntick-idle exit). */ +static __always_inline void rcu_task_enter(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } -/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ -static __always_inline void rcu_dynticks_task_trace_enter(void) +/* Turn on heavyweight RCU tasks trace readers on kernel exit. */ +static __always_inline void rcu_task_trace_heavyweight_enter(void) { #ifdef CONFIG_TASKS_TRACE_RCU if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) @@ -63,8 +63,8 @@ static __always_inline void rcu_dynticks_task_trace_enter(void) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ -static __always_inline void rcu_dynticks_task_trace_exit(void) +/* Turn off heavyweight RCU tasks trace readers on kernel entry. */ +static __always_inline void rcu_task_trace_heavyweight_exit(void) { #ifdef CONFIG_TASKS_TRACE_RCU if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) @@ -87,10 +87,10 @@ static noinstr void ct_kernel_exit_state(int offset) * critical sections, and we also must force ordering with the * next idle sojourn. */ - rcu_dynticks_task_trace_enter(); // Before ->dynticks update! + rcu_task_trace_heavyweight_enter(); // Before CT state update! seq = ct_state_inc(offset); // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX)); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING)); } /* @@ -109,15 +109,15 @@ static noinstr void ct_kernel_enter_state(int offset) */ seq = ct_state_inc(offset); // RCU is now watching. Better not be in an extended quiescent state! - rcu_dynticks_task_trace_exit(); // After ->dynticks update! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX)); + rcu_task_trace_heavyweight_exit(); // After CT state update! + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & CT_RCU_WATCHING)); } /* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * - * We crowbar the ->dynticks_nmi_nesting field to zero to allow for + * We crowbar the ->nmi_nesting field to zero to allow for * the possibility of usermode upcalls having messed up our count * of interrupt nesting level during the prior busy period. */ @@ -125,19 +125,19 @@ static void noinstr ct_kernel_exit(bool user, int offset) { struct context_tracking *ct = this_cpu_ptr(&context_tracking); - WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE); - WRITE_ONCE(ct->dynticks_nmi_nesting, 0); + WARN_ON_ONCE(ct_nmi_nesting() != CT_NESTING_IRQ_NONIDLE); + WRITE_ONCE(ct->nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - ct_dynticks_nesting() == 0); - if (ct_dynticks_nesting() != 1) { + ct_nesting() == 0); + if (ct_nesting() != 1) { // RCU will still be watching, so just do accounting and leave. - ct->dynticks_nesting--; + ct->nesting--; return; } instrumentation_begin(); lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks()); + trace_rcu_watching(TPS("End"), ct_nesting(), 0, ct_rcu_watching()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rcu_preempt_deferred_qs(current); @@ -145,18 +145,18 @@ static void noinstr ct_kernel_exit(bool user, int offset) instrument_atomic_write(&ct->state, sizeof(ct->state)); instrumentation_end(); - WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + WRITE_ONCE(ct->nesting, 0); /* Avoid irq-access tearing. */ // RCU is watching here ... ct_kernel_exit_state(offset); // ... but is no longer watching here. - rcu_dynticks_task_enter(); + rcu_task_exit(); } /* * Exit an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * - * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to + * We crowbar the ->nmi_nesting field to CT_NESTING_IRQ_NONIDLE to * allow for the possibility of usermode upcalls messing up our count of * interrupt nesting level during the busy period that is just now starting. */ @@ -166,14 +166,14 @@ static void noinstr ct_kernel_enter(bool user, int offset) long oldval; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); - oldval = ct_dynticks_nesting(); + oldval = ct_nesting(); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { // RCU was already watching, so just do accounting and leave. - ct->dynticks_nesting++; + ct->nesting++; return; } - rcu_dynticks_task_exit(); + rcu_task_enter(); // RCU is not watching here ... ct_kernel_enter_state(offset); // ... but is watching here. @@ -182,11 +182,11 @@ static void noinstr ct_kernel_enter(bool user, int offset) // instrumentation for the noinstr ct_kernel_enter_state() instrument_atomic_write(&ct->state, sizeof(ct->state)); - trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks()); + trace_rcu_watching(TPS("Start"), ct_nesting(), 1, ct_rcu_watching()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - WRITE_ONCE(ct->dynticks_nesting, 1); - WARN_ON_ONCE(ct_dynticks_nmi_nesting()); - WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + WRITE_ONCE(ct->nesting, 1); + WARN_ON_ONCE(ct_nmi_nesting()); + WRITE_ONCE(ct->nmi_nesting, CT_NESTING_IRQ_NONIDLE); instrumentation_end(); } @@ -194,7 +194,7 @@ static void noinstr ct_kernel_enter(bool user, int offset) * ct_nmi_exit - inform RCU of exit from NMI context * * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting + * RCU-idle period, update ct->state and ct->nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. * @@ -207,47 +207,47 @@ void noinstr ct_nmi_exit(void) instrumentation_begin(); /* - * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. + * Check for ->nmi_nesting underflow and bad CT state. * (We are exiting an NMI handler, so RCU better be paying attention * to us!) */ - WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0); - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); + WARN_ON_ONCE(ct_nmi_nesting() <= 0); + WARN_ON_ONCE(!rcu_is_watching_curr_cpu()); /* * If the nesting level is not 1, the CPU wasn't RCU-idle, so * leave it in non-RCU-idle state. */ - if (ct_dynticks_nmi_nesting() != 1) { - trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2, - ct_dynticks()); - WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */ - ct_dynticks_nmi_nesting() - 2); + if (ct_nmi_nesting() != 1) { + trace_rcu_watching(TPS("--="), ct_nmi_nesting(), ct_nmi_nesting() - 2, + ct_rcu_watching()); + WRITE_ONCE(ct->nmi_nesting, /* No store tearing. */ + ct_nmi_nesting() - 2); instrumentation_end(); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks()); - WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + trace_rcu_watching(TPS("Endirq"), ct_nmi_nesting(), 0, ct_rcu_watching()); + WRITE_ONCE(ct->nmi_nesting, 0); /* Avoid store tearing. */ // instrumentation for the noinstr ct_kernel_exit_state() instrument_atomic_write(&ct->state, sizeof(ct->state)); instrumentation_end(); // RCU is watching here ... - ct_kernel_exit_state(RCU_DYNTICKS_IDX); + ct_kernel_exit_state(CT_RCU_WATCHING); // ... but is no longer watching here. if (!in_nmi()) - rcu_dynticks_task_enter(); + rcu_task_exit(); } /** * ct_nmi_enter - inform RCU of entry to NMI context * * If the CPU was idle from RCU's viewpoint, update ct->state and - * ct->dynticks_nmi_nesting to let the RCU grace-period handling know + * ct->nmi_nesting to let the RCU grace-period handling know * that the CPU is active. This implementation permits nested NMIs, as * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) @@ -261,27 +261,27 @@ void noinstr ct_nmi_enter(void) struct context_tracking *ct = this_cpu_ptr(&context_tracking); /* Complain about underflow. */ - WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0); + WARN_ON_ONCE(ct_nmi_nesting() < 0); /* - * If idle from RCU viewpoint, atomically increment ->dynticks - * to mark non-idle and increment ->dynticks_nmi_nesting by one. - * Otherwise, increment ->dynticks_nmi_nesting by two. This means - * if ->dynticks_nmi_nesting is equal to one, we are guaranteed + * If idle from RCU viewpoint, atomically increment CT state + * to mark non-idle and increment ->nmi_nesting by one. + * Otherwise, increment ->nmi_nesting by two. This means + * if ->nmi_nesting is equal to one, we are guaranteed * to be in the outermost NMI handler that interrupted an RCU-idle * period (observation due to Andy Lutomirski). */ - if (rcu_dynticks_curr_cpu_in_eqs()) { + if (!rcu_is_watching_curr_cpu()) { if (!in_nmi()) - rcu_dynticks_task_exit(); + rcu_task_enter(); // RCU is not watching here ... - ct_kernel_enter_state(RCU_DYNTICKS_IDX); + ct_kernel_enter_state(CT_RCU_WATCHING); // ... but is watching here. instrumentation_begin(); - // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() + // instrumentation for the noinstr rcu_is_watching_curr_cpu() instrument_atomic_read(&ct->state, sizeof(ct->state)); // instrumentation for the noinstr ct_kernel_enter_state() instrument_atomic_write(&ct->state, sizeof(ct->state)); @@ -294,12 +294,12 @@ void noinstr ct_nmi_enter(void) instrumentation_begin(); } - trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), - ct_dynticks_nmi_nesting(), - ct_dynticks_nmi_nesting() + incby, ct_dynticks()); + trace_rcu_watching(incby == 1 ? TPS("Startirq") : TPS("++="), + ct_nmi_nesting(), + ct_nmi_nesting() + incby, ct_rcu_watching()); instrumentation_end(); - WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */ - ct_dynticks_nmi_nesting() + incby); + WRITE_ONCE(ct->nmi_nesting, /* Prevent store tearing. */ + ct_nmi_nesting() + incby); barrier(); } @@ -317,7 +317,7 @@ void noinstr ct_nmi_enter(void) void noinstr ct_idle_enter(void) { WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); - ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE); + ct_kernel_exit(false, CT_RCU_WATCHING + CT_STATE_IDLE); } EXPORT_SYMBOL_GPL(ct_idle_enter); @@ -335,7 +335,7 @@ void noinstr ct_idle_exit(void) unsigned long flags; raw_local_irq_save(flags); - ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE); + ct_kernel_enter(false, CT_RCU_WATCHING - CT_STATE_IDLE); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(ct_idle_exit); @@ -485,7 +485,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * user_exit() or ct_irq_enter(). Let's remove RCU's dependency * on the tick. */ - if (state == CONTEXT_USER) { + if (state == CT_STATE_USER) { instrumentation_begin(); trace_user_enter(0); vtime_user_enter(current); @@ -504,7 +504,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * CPU doesn't need to maintain the tick for RCU maintenance purposes * when the CPU runs in userspace. */ - ct_kernel_exit(true, RCU_DYNTICKS_IDX + state); + ct_kernel_exit(true, CT_RCU_WATCHING + state); /* * Special case if we only track user <-> kernel transitions for tickless @@ -534,7 +534,7 @@ void noinstr __ct_user_enter(enum ctx_state state) /* * Tracking for vtime and RCU EQS. Make sure we don't race * with NMIs. OTOH we don't care about ordering here since - * RCU only requires RCU_DYNTICKS_IDX increments to be fully + * RCU only requires CT_RCU_WATCHING increments to be fully * ordered. */ raw_atomic_add(state, &ct->state); @@ -620,8 +620,8 @@ void noinstr __ct_user_exit(enum ctx_state state) * Exit RCU idle mode while entering the kernel because it can * run a RCU read side critical section anytime. */ - ct_kernel_enter(true, RCU_DYNTICKS_IDX - state); - if (state == CONTEXT_USER) { + ct_kernel_enter(true, CT_RCU_WATCHING - state); + if (state == CT_STATE_USER) { instrumentation_begin(); vtime_user_exit(current); trace_user_exit(0); @@ -634,17 +634,17 @@ void noinstr __ct_user_exit(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - raw_atomic_set(&ct->state, CONTEXT_KERNEL); + raw_atomic_set(&ct->state, CT_STATE_KERNEL); } else { if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - raw_atomic_set(&ct->state, CONTEXT_KERNEL); + raw_atomic_set(&ct->state, CT_STATE_KERNEL); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race * with NMIs. OTOH we don't care about ordering here since - * RCU only requires RCU_DYNTICKS_IDX increments to be fully + * RCU only requires CT_RCU_WATCHING increments to be fully * ordered. */ raw_atomic_sub(state, &ct->state); diff --git a/kernel/cpu.c b/kernel/cpu.c index b1fd2a3db91a..d293d52a3e00 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -330,7 +330,7 @@ static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state st /* Poll for one millisecond */ arch_cpuhp_sync_state_poll(); } else { - usleep_range_state(USEC_PER_MSEC, 2 * USEC_PER_MSEC, TASK_UNINTERRUPTIBLE); + usleep_range(USEC_PER_MSEC, 2 * USEC_PER_MSEC); } sync = atomic_read(st); } @@ -1808,6 +1808,7 @@ static int __init parallel_bringup_parse_param(char *arg) } early_param("cpuhp.parallel", parallel_bringup_parse_param); +#ifdef CONFIG_HOTPLUG_SMT static inline bool cpuhp_smt_aware(void) { return cpu_smt_max_threads > 1; @@ -1817,6 +1818,21 @@ static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) { return cpu_primary_thread_mask; } +#else +static inline bool cpuhp_smt_aware(void) +{ + return false; +} +static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) +{ + return cpu_none_mask; +} +#endif + +bool __weak arch_cpuhp_init_parallel_bringup(void) +{ + return true; +} /* * On architectures which have enabled parallel bringup this invokes all BP @@ -2689,9 +2705,7 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } -/** - * Check if the core a CPU belongs to is online - */ +/* Check if the core a CPU belongs to is online */ #if !defined(topology_is_core_online) static inline bool topology_is_core_online(unsigned int cpu) { diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 90843cc38588..5b6934e23c21 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -182,7 +182,7 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) unsigned long work = READ_ONCE(current_thread_info()->syscall_work); unsigned long nr = syscall_get_nr(current, regs); - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + CT_WARN_ON(ct_state() != CT_STATE_KERNEL); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) diff --git a/kernel/events/core.c b/kernel/events/core.c index c973e3c11e03..4f03eb908e7f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } +enum event_type_t { + EVENT_FLEXIBLE = 0x01, + EVENT_PINNED = 0x02, + EVENT_TIME = 0x04, + EVENT_FROZEN = 0x08, + /* see ctx_resched() for details */ + EVENT_CPU = 0x10, + EVENT_CGROUP = 0x20, + + /* compound helpers */ + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, + EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, +}; + +static inline void __perf_ctx_lock(struct perf_event_context *ctx) +{ + raw_spin_lock(&ctx->lock); + WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN); +} + static void perf_ctx_lock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - raw_spin_lock(&cpuctx->ctx.lock); + __perf_ctx_lock(&cpuctx->ctx); if (ctx) - raw_spin_lock(&ctx->lock); + __perf_ctx_lock(ctx); +} + +static inline void __perf_ctx_unlock(struct perf_event_context *ctx) +{ + /* + * If ctx_sched_in() didn't again set any ALL flags, clean up + * after ctx_sched_out() by clearing is_active. + */ + if (ctx->is_active & EVENT_FROZEN) { + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + else + ctx->is_active &= ~EVENT_FROZEN; + } + raw_spin_unlock(&ctx->lock); } static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); + __perf_ctx_unlock(ctx); + __perf_ctx_unlock(&cpuctx->ctx); } #define TASK_TOMBSTONE ((void *)-1L) @@ -264,6 +299,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da { struct perf_event_context *ctx = event->ctx; struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ + struct perf_cpu_context *cpuctx; struct event_function_struct efs = { .event = event, .func = func, @@ -291,22 +327,25 @@ again: if (!task_function_call(task, event_function, &efs)) return; - raw_spin_lock_irq(&ctx->lock); + local_irq_disable(); + cpuctx = this_cpu_ptr(&perf_cpu_context); + perf_ctx_lock(cpuctx, ctx); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out(). */ task = ctx->task; - if (task == TASK_TOMBSTONE) { - raw_spin_unlock_irq(&ctx->lock); - return; - } + if (task == TASK_TOMBSTONE) + goto unlock; if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); + perf_ctx_unlock(cpuctx, ctx); + local_irq_enable(); goto again; } func(event, NULL, ctx, data); - raw_spin_unlock_irq(&ctx->lock); +unlock: + perf_ctx_unlock(cpuctx, ctx); + local_irq_enable(); } /* @@ -369,16 +408,6 @@ unlock: (PERF_SAMPLE_BRANCH_KERNEL |\ PERF_SAMPLE_BRANCH_HV) -enum event_type_t { - EVENT_FLEXIBLE = 0x1, - EVENT_PINNED = 0x2, - EVENT_TIME = 0x4, - /* see ctx_resched() for details */ - EVENT_CPU = 0x8, - EVENT_CGROUP = 0x10, - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, -}; - /* * perf_sched_events : >0 events exist */ @@ -407,6 +436,11 @@ static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; +static cpumask_var_t perf_online_core_mask; +static cpumask_var_t perf_online_die_mask; +static cpumask_var_t perf_online_cluster_mask; +static cpumask_var_t perf_online_pkg_mask; +static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; /* @@ -685,30 +719,32 @@ do { \ ___p; \ }) +#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ + list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ + if (_cgroup && !_epc->nr_cgroups) \ + continue; \ + else if (_pmu && _epc->pmu != _pmu) \ + continue; \ + else + static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; + for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_disable(pmu_ctx->pmu); - } } static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; + for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_enable(pmu_ctx->pmu); - } } -static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); -static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type); +static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); +static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); #ifdef CONFIG_CGROUP_PERF @@ -865,7 +901,7 @@ static void perf_cgroup_switch(struct task_struct *task) perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_ctx_disable(&cpuctx->ctx, true); - ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* * must not be done before ctxswout due * to update_cgrp_time_from_cpuctx() in @@ -877,7 +913,7 @@ static void perf_cgroup_switch(struct task_struct *task) * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ - ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); + ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -1255,8 +1291,9 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock - * perf_event::mmap_mutex * mmap_lock + * perf_event::mmap_mutex + * perf_buffer::aux_mutex * perf_addr_filters_head::lock * * cpu_hotplug_lock @@ -1768,6 +1805,14 @@ perf_event_groups_next(struct perf_event *event, struct pmu *pmu) typeof(*event), group_node)) /* + * Does the event attribute request inherit with PERF_SAMPLE_READ + */ +static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr) +{ + return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); +} + +/* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ @@ -1797,6 +1842,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; + if (has_inherit_and_sample_read(&event->attr)) + local_inc(&ctx->nr_no_switch_fast); if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); @@ -2021,6 +2068,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; + if (has_inherit_and_sample_read(&event->attr)) + local_dec(&ctx->nr_no_switch_fast); list_del_rcu(&event->event_entry); @@ -2316,6 +2365,45 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) event_sched_out(event, ctx); } +static inline void +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) +{ + if (ctx->is_active & EVENT_TIME) { + if (ctx->is_active & EVENT_FROZEN) + return; + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx, final); + } +} + +static inline void +ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) +{ + __ctx_time_update(cpuctx, ctx, false); +} + +/* + * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock(). + */ +static inline void +ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) +{ + ctx_time_update(cpuctx, ctx); + if (ctx->is_active & EVENT_TIME) + ctx->is_active |= EVENT_FROZEN; +} + +static inline void +ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) +{ + if (ctx->is_active & EVENT_TIME) { + if (ctx->is_active & EVENT_FROZEN) + return; + update_context_time(ctx); + update_cgrp_time_from_event(event); + } +} + #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL #define DETACH_DEAD 0x04UL @@ -2335,10 +2423,7 @@ __perf_remove_from_context(struct perf_event *event, struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; unsigned long flags = (unsigned long)info; - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, false); - } + ctx_time_update(cpuctx, ctx); /* * Ensure event_sched_out() switches to OFF, at the very least @@ -2423,12 +2508,8 @@ static void __perf_event_disable(struct perf_event *event, if (event->state < PERF_EVENT_STATE_INACTIVE) return; - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } - perf_pmu_disable(event->pmu_ctx->pmu); + ctx_time_update_event(ctx, event); if (event == event->group_leader) group_sched_out(event, ctx); @@ -2644,7 +2725,8 @@ static void add_event_to_ctx(struct perf_event *event, } static void task_ctx_sched_out(struct perf_event_context *ctx, - enum event_type_t event_type) + struct pmu *pmu, + enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); @@ -2654,18 +2736,19 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, event_type); + ctx_sched_out(ctx, pmu, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) + struct perf_event_context *ctx, + struct pmu *pmu) { - ctx_sched_in(&cpuctx->ctx, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, EVENT_PINNED); - ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); } /* @@ -2683,16 +2766,12 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ -/* - * XXX: ctx_resched() reschedule entire perf_event_context while adding new - * event to the context or enabling existing event in the context. We can - * probably optimize it by rescheduling only affected pmu_ctx. - */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, - enum event_type_t event_type) + struct pmu *pmu, enum event_type_t event_type) { bool cpu_event = !!(event_type & EVENT_CPU); + struct perf_event_pmu_context *epc; /* * If pinned groups are involved, flexible groups also need to be @@ -2703,10 +2782,14 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, event_type &= EVENT_ALL; - perf_ctx_disable(&cpuctx->ctx, false); + for_each_epc(epc, &cpuctx->ctx, pmu, false) + perf_pmu_disable(epc->pmu); + if (task_ctx) { - perf_ctx_disable(task_ctx, false); - task_ctx_sched_out(task_ctx, event_type); + for_each_epc(epc, task_ctx, pmu, false) + perf_pmu_disable(epc->pmu); + + task_ctx_sched_out(task_ctx, pmu, event_type); } /* @@ -2717,15 +2800,19 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, * - otherwise, do nothing more. */ if (cpu_event) - ctx_sched_out(&cpuctx->ctx, event_type); + ctx_sched_out(&cpuctx->ctx, pmu, event_type); else if (event_type & EVENT_PINNED) - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); + ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx); + perf_event_sched_in(cpuctx, task_ctx, pmu); - perf_ctx_enable(&cpuctx->ctx, false); - if (task_ctx) - perf_ctx_enable(task_ctx, false); + for_each_epc(epc, &cpuctx->ctx, pmu, false) + perf_pmu_enable(epc->pmu); + + if (task_ctx) { + for_each_epc(epc, task_ctx, pmu, false) + perf_pmu_enable(epc->pmu); + } } void perf_pmu_resched(struct pmu *pmu) @@ -2734,7 +2821,7 @@ void perf_pmu_resched(struct pmu *pmu) struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); - ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); + ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU); perf_ctx_unlock(cpuctx, task_ctx); } @@ -2790,9 +2877,10 @@ static int __perf_install_in_context(void *info) #endif if (reprogram) { - ctx_sched_out(ctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); add_event_to_ctx(event, ctx); - ctx_resched(cpuctx, task_ctx, get_event_type(event)); + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, + get_event_type(event)); } else { add_event_to_ctx(event, ctx); } @@ -2935,8 +3023,7 @@ static void __perf_event_enable(struct perf_event *event, event->state <= PERF_EVENT_STATE_ERROR) return; - if (ctx->is_active) - ctx_sched_out(ctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); @@ -2944,25 +3031,21 @@ static void __perf_event_enable(struct perf_event *event, if (!ctx->is_active) return; - if (!event_filter_match(event)) { - ctx_sched_in(ctx, EVENT_TIME); + if (!event_filter_match(event)) return; - } /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, EVENT_TIME); + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) return; - } task_ctx = cpuctx->task_ctx; if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); - ctx_resched(cpuctx, task_ctx, get_event_type(event)); + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } /* @@ -3230,7 +3313,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, struct perf_event *event, *tmp; struct pmu *pmu = pmu_ctx->pmu; - if (ctx->task && !ctx->is_active) { + if (ctx->task && !(ctx->is_active & EVENT_ALL)) { struct perf_cpu_pmu_context *cpc; cpc = this_cpu_ptr(pmu->cpu_pmu_context); @@ -3238,7 +3321,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, cpc->task_epc = NULL; } - if (!event_type) + if (!(event_type & EVENT_ALL)) return; perf_pmu_disable(pmu); @@ -3264,8 +3347,17 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, perf_pmu_enable(pmu); } +/* + * Be very careful with the @pmu argument since this will change ctx state. + * The @pmu argument works for ctx_resched(), because that is symmetric in + * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant. + * + * However, if you were to be asymmetrical, you could end up with messed up + * state, eg. ctx->is_active cleared even though most EPCs would still actually + * be active. + */ static void -ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) +ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; @@ -3296,34 +3388,36 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) * * would only update time for the pinned events. */ - if (is_active & EVENT_TIME) { - /* update (and stop) ctx time */ - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); + + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + ctx->is_active &= ~event_type; + + if (!(ctx->is_active & EVENT_ALL)) { /* - * CPU-release for the below ->is_active store, - * see __load_acquire() in perf_event_time_now() + * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now() + * does not observe a hole. perf_ctx_unlock() will clean up. */ - barrier(); + if (ctx->is_active & EVENT_FROZEN) + ctx->is_active &= EVENT_TIME_FROZEN; + else + ctx->is_active = 0; } - ctx->is_active &= ~event_type; - if (!(ctx->is_active & EVENT_ALL)) - ctx->is_active = 0; - if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); - if (!ctx->is_active) + if (!(ctx->is_active & EVENT_ALL)) cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; + for_each_epc(pmu_ctx, ctx, pmu, cgroup) __pmu_ctx_sched_out(pmu_ctx, is_active); - } } /* @@ -3516,12 +3610,17 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) perf_ctx_disable(ctx, false); - /* PMIs are disabled; ctx->nr_pending is stable. */ - if (local_read(&ctx->nr_pending) || - local_read(&next_ctx->nr_pending)) { + /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ + if (local_read(&ctx->nr_no_switch_fast) || + local_read(&next_ctx->nr_no_switch_fast)) { /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. + * + * Likewise, when a context contains inherit + + * SAMPLE_READ events they should be switched + * out using the slow path so that they are + * treated as if they were distinct contexts. */ raw_spin_unlock(&next_ctx->lock); rcu_read_unlock(); @@ -3562,7 +3661,7 @@ unlock: inside_switch: perf_ctx_sched_task_cb(ctx, false); - task_ctx_sched_out(ctx, EVENT_ALL); + task_ctx_sched_out(ctx, NULL, EVENT_ALL); perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); @@ -3860,29 +3959,22 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx, merge_sched_in, &can_add_hw); } -static void ctx_groups_sched_in(struct perf_event_context *ctx, - struct perf_event_groups *groups, - bool cgroup) +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) { - struct perf_event_pmu_context *pmu_ctx; - - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; - pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu); - } -} + struct perf_event_context *ctx = pmu_ctx->ctx; -static void __pmu_ctx_sched_in(struct perf_event_context *ctx, - struct pmu *pmu) -{ - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu); + if (event_type & EVENT_PINNED) + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); + if (event_type & EVENT_FLEXIBLE) + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); } static void -ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) +ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; bool cgroup = event_type & EVENT_CGROUP; @@ -3906,7 +3998,7 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { - if (!is_active) + if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; else WARN_ON_ONCE(cpuctx->task_ctx != ctx); @@ -3918,12 +4010,16 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (is_active & EVENT_PINNED) - ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup); + if (is_active & EVENT_PINNED) { + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); + } /* Then walk through the lower prio flexible groups */ - if (is_active & EVENT_FLEXIBLE) - ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup); + if (is_active & EVENT_FLEXIBLE) { + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); + } } static void perf_event_context_sched_in(struct task_struct *task) @@ -3966,10 +4062,10 @@ static void perf_event_context_sched_in(struct task_struct *task) */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { perf_ctx_disable(&cpuctx->ctx, false); - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } - perf_event_sched_in(cpuctx, ctx); + perf_event_sched_in(cpuctx, ctx, NULL); perf_ctx_sched_task_cb(cpuctx->task_ctx, true); @@ -4092,7 +4188,11 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ + if (delta >= 0) + delta += 7; + else + delta -= 7; + delta /= 8; /* low pass filter */ sample_period = hwc->sample_period + delta; @@ -4310,14 +4410,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) update_context_time(&cpuctx->ctx); __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx, cpu_event); - __pmu_ctx_sched_in(&cpuctx->ctx, pmu); + __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE); } if (task_event) rotate_ctx(task_epc->ctx, task_event); if (task_event || (task_epc && cpu_event)) - __pmu_ctx_sched_in(task_epc->ctx, pmu); + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -4383,7 +4483,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); - ctx_sched_out(ctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); @@ -4395,9 +4495,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) */ if (enabled) { clone_ctx = unclone_ctx(ctx); - ctx_resched(cpuctx, ctx, event_type); - } else { - ctx_sched_in(ctx, EVENT_TIME); + ctx_resched(cpuctx, ctx, NULL, event_type); } perf_ctx_unlock(cpuctx, ctx); @@ -4458,16 +4556,24 @@ struct perf_read_data { int ret; }; +static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu); + static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { + int local_cpu = smp_processor_id(); u16 local_pkg, event_pkg; if ((unsigned)event_cpu >= nr_cpu_ids) return event_cpu; - if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { - int local_cpu = smp_processor_id(); + if (event->group_caps & PERF_EV_CAP_READ_SCOPE) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu); + + if (cpumask && cpumask_test_cpu(local_cpu, cpumask)) + return local_cpu; + } + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { event_pkg = topology_physical_package_id(event_cpu); local_pkg = topology_physical_package_id(local_cpu); @@ -4500,10 +4606,7 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } + ctx_time_update_event(ctx, event); perf_event_update_time(event); if (data->group) @@ -4538,8 +4641,11 @@ unlock: raw_spin_unlock(&ctx->lock); } -static inline u64 perf_event_count(struct perf_event *event) +static inline u64 perf_event_count(struct perf_event *event, bool self) { + if (self) + return local64_read(&event->count); + return local64_read(&event->count) + atomic64_read(&event->child_count); } @@ -4700,10 +4806,7 @@ again: * May read while context is not active (e.g., thread is * blocked), in that case we cannot update context time */ - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } + ctx_time_update_event(ctx, event); perf_event_update_time(event); if (group) @@ -5204,7 +5307,7 @@ static void perf_pending_task_sync(struct perf_event *event) */ if (task_work_cancel(current, head)) { event->pending_work = 0; - local_dec(&event->ctx->nr_pending); + local_dec(&event->ctx->nr_no_switch_fast); return; } @@ -5498,7 +5601,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 * mutex_lock(&event->child_mutex); (void)perf_event_read(event, false); - total += perf_event_count(event); + total += perf_event_count(event, false); *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -5507,7 +5610,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 * list_for_each_entry(child, &event->child_list, child_list) { (void)perf_event_read(child, false); - total += perf_event_count(child); + total += perf_event_count(child, false); *enabled += child->total_time_enabled; *running += child->total_time_running; } @@ -5589,14 +5692,14 @@ static int __perf_read_group_add(struct perf_event *leader, /* * Write {count,id} tuples for every sibling. */ - values[n++] += perf_event_count(leader); + values[n++] += perf_event_count(leader, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { - values[n++] += perf_event_count(sub); + values[n++] += perf_event_count(sub, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) @@ -6176,7 +6279,7 @@ void perf_event_update_userpage(struct perf_event *event) ++userpg->lock; barrier(); userpg->index = perf_event_index(event); - userpg->offset = perf_event_count(event); + userpg->offset = perf_event_count(event, false); if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); @@ -6373,12 +6476,11 @@ static void perf_mmap_close(struct vm_area_struct *vma) event->pmu->event_unmapped(event, vma->vm_mm); /* - * rb->aux_mmap_count will always drop before rb->mmap_count and - * event->mmap_count, so it is ok to use event->mmap_mutex to - * serialize with perf_mmap here. + * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex + * to avoid complications. */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && - atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU @@ -6395,7 +6497,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) rb_free_aux(rb); WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); - mutex_unlock(&event->mmap_mutex); + mutex_unlock(&rb->aux_mutex); } if (atomic_dec_and_test(&rb->mmap_count)) @@ -6483,6 +6585,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) struct perf_event *event = file->private_data; unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); + struct mutex *aux_mutex = NULL; struct perf_buffer *rb = NULL; unsigned long locked, lock_limit; unsigned long vma_size; @@ -6531,6 +6634,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!rb) goto aux_unlock; + aux_mutex = &rb->aux_mutex; + mutex_lock(aux_mutex); + aux_offset = READ_ONCE(rb->user_page->aux_offset); aux_size = READ_ONCE(rb->user_page->aux_size); @@ -6681,6 +6787,8 @@ unlock: atomic_dec(&rb->mmap_count); } aux_unlock: + if (aux_mutex) + mutex_unlock(aux_mutex); mutex_unlock(&event->mmap_mutex); /* @@ -6868,7 +6976,7 @@ static void perf_pending_task(struct callback_head *head) if (event->pending_work) { event->pending_work = 0; perf_sigtrap(event); - local_dec(&event->ctx->nr_pending); + local_dec(&event->ctx->nr_no_switch_fast); rcuwait_wake_up(&event->pending_work_wait); } rcu_read_unlock(); @@ -7250,7 +7358,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 values[5]; int n = 0; - values[n++] = perf_event_count(event); + values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr)); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); @@ -7268,14 +7376,15 @@ static void perf_output_read_one(struct perf_output_handle *handle, } static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event, - u64 enabled, u64 running) + struct perf_event *event, + u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; unsigned long flags; u64 values[6]; int n = 0; + bool self = has_inherit_and_sample_read(&event->attr); /* * Disabling interrupts avoids all counter scheduling @@ -7295,7 +7404,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); - values[n++] = perf_event_count(leader); + values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) @@ -7310,7 +7419,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, (sub->state == PERF_EVENT_STATE_ACTIVE)) sub->pmu->read(sub); - values[n++] = perf_event_count(sub); + values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) @@ -7331,6 +7440,10 @@ static void perf_output_read_group(struct perf_output_handle *handle, * The problem is that its both hard and excessively expensive to iterate the * child list, not to mention that its impossible to IPI the children running * on another CPU, from interrupt/NMI context. + * + * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread + * counts rather than attempting to accumulate some value across all children on + * all cores. */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) @@ -9741,7 +9854,7 @@ static int __perf_event_overflow(struct perf_event *event, if (!event->pending_work && !task_work_add(current, &event->pending_task, notify_mode)) { event->pending_work = pending_id; - local_inc(&event->ctx->nr_pending); + local_inc(&event->ctx->nr_no_switch_fast); event->pending_addr = 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) @@ -11478,10 +11591,60 @@ perf_event_mux_interval_ms_store(struct device *dev, } static DEVICE_ATTR_RW(perf_event_mux_interval_ms); +static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return topology_sibling_cpumask(cpu); + case PERF_PMU_SCOPE_DIE: + return topology_die_cpumask(cpu); + case PERF_PMU_SCOPE_CLUSTER: + return topology_cluster_cpumask(cpu); + case PERF_PMU_SCOPE_PKG: + return topology_core_cpumask(cpu); + case PERF_PMU_SCOPE_SYS_WIDE: + return cpu_online_mask; + } + + return NULL; +} + +static inline struct cpumask *perf_scope_cpumask(unsigned int scope) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return perf_online_core_mask; + case PERF_PMU_SCOPE_DIE: + return perf_online_die_mask; + case PERF_PMU_SCOPE_CLUSTER: + return perf_online_cluster_mask; + case PERF_PMU_SCOPE_PKG: + return perf_online_pkg_mask; + case PERF_PMU_SCOPE_SYS_WIDE: + return perf_online_sys_mask; + } + + return NULL; +} + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct cpumask *mask = perf_scope_cpumask(pmu->scope); + + if (mask) + return cpumap_print_to_pagebuf(true, buf, mask); + return 0; +} + +static DEVICE_ATTR_RO(cpumask); + static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, &dev_attr_nr_addr_filters.attr, + &dev_attr_cpumask.attr, NULL, }; @@ -11493,6 +11656,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int if (n == 2 && !pmu->nr_addr_filters) return 0; + /* cpumask */ + if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) + return 0; + return a->mode; } @@ -11577,6 +11744,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) goto free_pdc; } + if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { + ret = -EINVAL; + goto free_pdc; + } + pmu->name = name; if (type >= 0) @@ -11731,6 +11903,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) event_has_any_exclude_flag(event)) ret = -EINVAL; + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); + struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); + int cpu; + + if (pmu_cpumask && cpumask) { + cpu = cpumask_any_and(pmu_cpumask, cpumask); + if (cpu >= nr_cpu_ids) + ret = -ENODEV; + else + event->event_caps |= PERF_EV_CAP_READ_SCOPE; + } else { + ret = -ENODEV; + } + } + if (ret && event->destroy) event->destroy(event); } @@ -12058,10 +12246,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, local64_set(&hwc->period_left, hwc->sample_period); /* - * We currently do not support PERF_SAMPLE_READ on inherited events. + * We do not support PERF_SAMPLE_READ on inherited events unless + * PERF_SAMPLE_TID is also selected, which allows inherited events to + * collect per-thread samples. * See perf_output_read(). */ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) + if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) goto err_ns; if (!has_branch_stack(event)) @@ -13085,7 +13275,7 @@ static void sync_child_event(struct perf_event *child_event) perf_event_read_event(child_event, task); } - child_val = perf_event_count(child_event); + child_val = perf_event_count(child_event, false); /* * Add back the child's count to the parent's count: @@ -13176,7 +13366,7 @@ static void perf_event_exit_task_context(struct task_struct *child) * in. */ raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(child_ctx, EVENT_ALL); + task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation @@ -13352,6 +13542,15 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) return &event->attr; } +int perf_allow_kernel(struct perf_event_attr *attr) +{ + if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) + return -EACCES; + + return security_perf_event_open(attr, PERF_SECURITY_KERNEL); +} +EXPORT_SYMBOL_GPL(perf_allow_kernel); + /* * Inherit an event from parent task to child task. * @@ -13682,6 +13881,12 @@ static void __init perf_event_init_all_cpus(void) int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); + for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); @@ -13725,12 +13930,46 @@ static void __perf_event_exit_context(void *__info) struct perf_event *event; raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, EVENT_TIME); + ctx_sched_out(ctx, NULL, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); } +static void perf_event_clear_cpumask(unsigned int cpu) +{ + int target[PERF_PMU_MAX_SCOPE]; + unsigned int scope; + struct pmu *pmu; + + cpumask_clear_cpu(cpu, perf_online_mask); + + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + struct cpumask *pmu_cpumask = perf_scope_cpumask(scope); + + target[scope] = -1; + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + + if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) + continue; + target[scope] = cpumask_any_but(cpumask, cpu); + if (target[scope] < nr_cpu_ids) + cpumask_set_cpu(target[scope], pmu_cpumask); + } + + /* migrate */ + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { + if (pmu->scope == PERF_PMU_SCOPE_NONE || + WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) + continue; + + if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) + perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); + } +} + static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; @@ -13738,6 +13977,11 @@ static void perf_event_exit_cpu_context(int cpu) // XXX simplify cpuctx->online mutex_lock(&pmus_lock); + /* + * Clear the cpumasks, and migrate to other CPUs if possible. + * Must be invoked before the __perf_event_exit_context. + */ + perf_event_clear_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; @@ -13745,7 +13989,6 @@ static void perf_event_exit_cpu_context(int cpu) smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); - cpumask_clear_cpu(cpu, perf_online_mask); mutex_unlock(&pmus_lock); } #else @@ -13754,6 +13997,42 @@ static void perf_event_exit_cpu_context(int cpu) { } #endif +static void perf_event_setup_cpumask(unsigned int cpu) +{ + struct cpumask *pmu_cpumask; + unsigned int scope; + + cpumask_set_cpu(cpu, perf_online_mask); + + /* + * Early boot stage, the cpumask hasn't been set yet. + * The perf_online_<domain>_masks includes the first CPU of each domain. + * Always uncondifionally set the boot CPU for the perf_online_<domain>_masks. + */ + if (!topology_sibling_cpumask(cpu)) { + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + pmu_cpumask = perf_scope_cpumask(scope); + if (WARN_ON_ONCE(!pmu_cpumask)) + continue; + cpumask_set_cpu(cpu, pmu_cpumask); + } + return; + } + + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + + pmu_cpumask = perf_scope_cpumask(scope); + + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + + if (!cpumask_empty(cpumask) && + cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) + cpumask_set_cpu(cpu, pmu_cpumask); + } +} + int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; @@ -13762,7 +14041,7 @@ int perf_event_init_cpu(unsigned int cpu) perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); - cpumask_set_cpu(cpu, perf_online_mask); + perf_event_setup_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 451514442a1b..e072d995d670 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -40,6 +40,7 @@ struct perf_buffer { struct user_struct *mmap_user; /* AUX area */ + struct mutex aux_mutex; long aux_head; unsigned int aux_nest; long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 8cadf97bc290..4f46f688d0d4 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -337,6 +337,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags) */ if (!rb->nr_pages) rb->paused = 1; + + mutex_init(&rb->aux_mutex); } void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 73cc47708679..4b7e590dc428 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -40,6 +40,9 @@ static struct rb_root uprobes_tree = RB_ROOT; #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ +static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); + +DEFINE_STATIC_SRCU(uprobes_srcu); #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ @@ -57,8 +60,9 @@ struct uprobe { struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct list_head pending_list; - struct uprobe_consumer *consumers; + struct list_head consumers; struct inode *inode; /* Also hold a ref to inode */ + struct rcu_head rcu; loff_t offset; loff_t ref_ctr_offset; unsigned long flags; @@ -109,6 +113,11 @@ struct xol_area { unsigned long vaddr; /* Page(s) of instruction slots */ }; +static void uprobe_warn(struct task_struct *t, const char *msg) +{ + pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); +} + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have @@ -453,7 +462,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_lock held for write. + * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, @@ -587,25 +596,63 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v *(uprobe_opcode_t *)&auprobe->insn); } +/* uprobe should have guaranteed positive refcount */ static struct uprobe *get_uprobe(struct uprobe *uprobe) { refcount_inc(&uprobe->ref); return uprobe; } +/* + * uprobe should have guaranteed lifetime, which can be either of: + * - caller already has refcount taken (and wants an extra one); + * - uprobe is RCU protected and won't be freed until after grace period; + * - we are holding uprobes_treelock (for read or write, doesn't matter). + */ +static struct uprobe *try_get_uprobe(struct uprobe *uprobe) +{ + if (refcount_inc_not_zero(&uprobe->ref)) + return uprobe; + return NULL; +} + +static inline bool uprobe_is_active(struct uprobe *uprobe) +{ + return !RB_EMPTY_NODE(&uprobe->rb_node); +} + +static void uprobe_free_rcu(struct rcu_head *rcu) +{ + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); + + kfree(uprobe); +} + static void put_uprobe(struct uprobe *uprobe) { - if (refcount_dec_and_test(&uprobe->ref)) { - /* - * If application munmap(exec_vma) before uprobe_unregister() - * gets called, we don't get a chance to remove uprobe from - * delayed_uprobe_list from remove_breakpoint(). Do it here. - */ - mutex_lock(&delayed_uprobe_lock); - delayed_uprobe_remove(uprobe, NULL); - mutex_unlock(&delayed_uprobe_lock); - kfree(uprobe); + if (!refcount_dec_and_test(&uprobe->ref)) + return; + + write_lock(&uprobes_treelock); + + if (uprobe_is_active(uprobe)) { + write_seqcount_begin(&uprobes_seqcount); + rb_erase(&uprobe->rb_node, &uprobes_tree); + write_seqcount_end(&uprobes_seqcount); } + + write_unlock(&uprobes_treelock); + + /* + * If application munmap(exec_vma) before uprobe_unregister() + * gets called, we don't get a chance to remove uprobe from + * delayed_uprobe_list from remove_breakpoint(). Do it here. + */ + mutex_lock(&delayed_uprobe_lock); + delayed_uprobe_remove(uprobe, NULL); + mutex_unlock(&delayed_uprobe_lock); + + call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu); } static __always_inline @@ -647,62 +694,86 @@ static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); } -static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) +/* + * Assumes being inside RCU protected region. + * No refcount is taken on returned uprobe. + */ +static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) { struct __uprobe_key key = { .inode = inode, .offset = offset, }; - struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key); + struct rb_node *node; + unsigned int seq; - if (node) - return get_uprobe(__node_2_uprobe(node)); + lockdep_assert(srcu_read_lock_held(&uprobes_srcu)); + + do { + seq = read_seqcount_begin(&uprobes_seqcount); + node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); + /* + * Lockless RB-tree lookups can result only in false negatives. + * If the element is found, it is correct and can be returned + * under RCU protection. If we find nothing, we need to + * validate that seqcount didn't change. If it did, we have to + * try again as we might have missed the element (false + * negative). If seqcount is unchanged, search truly failed. + */ + if (node) + return __node_2_uprobe(node); + } while (read_seqcount_retry(&uprobes_seqcount, seq)); return NULL; } /* - * Find a uprobe corresponding to a given inode:offset - * Acquires uprobes_treelock + * Attempt to insert a new uprobe into uprobes_tree. + * + * If uprobe already exists (for given inode+offset), we just increment + * refcount of previously existing uprobe. + * + * If not, a provided new instance of uprobe is inserted into the tree (with + * assumed initial refcount == 1). + * + * In any case, we return a uprobe instance that ends up being in uprobes_tree. + * Caller has to clean up new uprobe instance, if it ended up not being + * inserted into the tree. + * + * We assume that uprobes_treelock is held for writing. */ -static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) -{ - struct uprobe *uprobe; - - read_lock(&uprobes_treelock); - uprobe = __find_uprobe(inode, offset); - read_unlock(&uprobes_treelock); - - return uprobe; -} - static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { struct rb_node *node; +again: + node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); + if (node) { + struct uprobe *u = __node_2_uprobe(node); + + if (!try_get_uprobe(u)) { + rb_erase(node, &uprobes_tree); + RB_CLEAR_NODE(&u->rb_node); + goto again; + } - node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); - if (node) - return get_uprobe(__node_2_uprobe(node)); + return u; + } - /* get access + creation ref */ - refcount_set(&uprobe->ref, 2); - return NULL; + return uprobe; } /* - * Acquire uprobes_treelock. - * Matching uprobe already exists in rbtree; - * increment (access refcount) and return the matching uprobe. - * - * No matching uprobe; insert the uprobe in rb_tree; - * get a double refcount (access + creation) and return NULL. + * Acquire uprobes_treelock and insert uprobe into uprobes_tree + * (or reuse existing one, see __insert_uprobe() comments above). */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; write_lock(&uprobes_treelock); + write_seqcount_begin(&uprobes_seqcount); u = __insert_uprobe(uprobe); + write_seqcount_end(&uprobes_seqcount); write_unlock(&uprobes_treelock); return u; @@ -725,18 +796,21 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); if (!uprobe) - return NULL; + return ERR_PTR(-ENOMEM); uprobe->inode = inode; uprobe->offset = offset; uprobe->ref_ctr_offset = ref_ctr_offset; + INIT_LIST_HEAD(&uprobe->consumers); init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); + RB_CLEAR_NODE(&uprobe->rb_node); + refcount_set(&uprobe->ref, 1); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ - if (cur_uprobe) { + if (cur_uprobe != uprobe) { if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { ref_ctr_mismatch_warn(cur_uprobe, uprobe); put_uprobe(cur_uprobe); @@ -753,32 +827,19 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); - uc->next = uprobe->consumers; - uprobe->consumers = uc; + list_add_rcu(&uc->cons_node, &uprobe->consumers); up_write(&uprobe->consumer_rwsem); } /* * For uprobe @uprobe, delete the consumer @uc. - * Return true if the @uc is deleted successfully - * or return false. + * Should never be called with consumer that's not part of @uprobe->consumers. */ -static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) { - struct uprobe_consumer **con; - bool ret = false; - down_write(&uprobe->consumer_rwsem); - for (con = &uprobe->consumers; *con; con = &(*con)->next) { - if (*con == uc) { - *con = uc->next; - ret = true; - break; - } - } + list_del_rcu(&uc->cons_node); up_write(&uprobe->consumer_rwsem); - - return ret; } static int __copy_insn(struct address_space *mapping, struct file *filp, @@ -863,21 +924,20 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } -static inline bool consumer_filter(struct uprobe_consumer *uc, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { - return !uc->filter || uc->filter(uc, ctx, mm); + return !uc->filter || uc->filter(uc, mm); } -static bool filter_chain(struct uprobe *uprobe, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { - ret = consumer_filter(uc, ctx, mm); + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { + ret = consumer_filter(uc, mm); if (ret) break; } @@ -921,27 +981,6 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad return set_orig_insn(&uprobe->arch, mm, vaddr); } -static inline bool uprobe_is_active(struct uprobe *uprobe) -{ - return !RB_EMPTY_NODE(&uprobe->rb_node); -} -/* - * There could be threads that have already hit the breakpoint. They - * will recheck the current insn and restart if find_uprobe() fails. - * See find_active_uprobe(). - */ -static void delete_uprobe(struct uprobe *uprobe) -{ - if (WARN_ON(!uprobe_is_active(uprobe))) - return; - - write_lock(&uprobes_treelock); - rb_erase(&uprobe->rb_node, &uprobes_tree); - write_unlock(&uprobes_treelock); - RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ - put_uprobe(uprobe); -} - struct map_info { struct map_info *next; struct mm_struct *mm; @@ -1046,7 +1085,13 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (err && is_register) goto free; - + /* + * We take mmap_lock for writing to avoid the race with + * find_active_uprobe_rcu() which takes mmap_lock for reading. + * Thus this install_breakpoint() can not make + * is_trap_at_addr() true right after find_uprobe_rcu() + * returns NULL in find_active_uprobe_rcu(). + */ mmap_write_lock(mm); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || @@ -1059,12 +1104,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (is_register) { /* consult only the "caller", new consumer. */ - if (consumer_filter(new, - UPROBE_FILTER_REGISTER, mm)) + if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, mm, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { - if (!filter_chain(uprobe, - UPROBE_FILTER_UNREGISTER, mm)) + if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, mm, info->vaddr); } @@ -1079,152 +1122,140 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) return err; } -static void -__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +/** + * uprobe_unregister_nosync - unregister an already registered probe. + * @uprobe: uprobe to remove + * @uc: identify which probe if multiple probes are colocated. + */ +void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; - if (WARN_ON(!consumer_del(uprobe, uc))) - return; - + down_write(&uprobe->register_rwsem); + consumer_del(uprobe, uc); err = register_for_each_vma(uprobe, NULL); - /* TODO : cant unregister? schedule a worker thread */ - if (!uprobe->consumers && !err) - delete_uprobe(uprobe); -} - -/* - * uprobe_unregister - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. - * @uc: identify which probe if multiple probes are colocated. - */ -void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) -{ - struct uprobe *uprobe; + up_write(&uprobe->register_rwsem); - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) + /* TODO : cant unregister? schedule a worker thread */ + if (unlikely(err)) { + uprobe_warn(current, "unregister, leaking uprobe"); return; + } - down_write(&uprobe->register_rwsem); - __uprobe_unregister(uprobe, uc); - up_write(&uprobe->register_rwsem); put_uprobe(uprobe); } -EXPORT_SYMBOL_GPL(uprobe_unregister); +EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); -/* - * __uprobe_register - register a probe +void uprobe_unregister_sync(void) +{ + /* + * Now that handler_chain() and handle_uretprobe_chain() iterate over + * uprobe->consumers list under RCU protection without holding + * uprobe->register_rwsem, we need to wait for RCU grace period to + * make sure that we can't call into just unregistered + * uprobe_consumer's callbacks anymore. If we don't do that, fast and + * unlucky enough caller can free consumer's memory and cause + * handler_chain() or handle_uretprobe_chain() to do an use-after-free. + */ + synchronize_srcu(&uprobes_srcu); +} +EXPORT_SYMBOL_GPL(uprobe_unregister_sync); + +/** + * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. + * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. * - * Apart from the access refcount, __uprobe_register() takes a creation + * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. Caller of __uprobe_register() is required to keep @inode + * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * - * Return errno if it cannot successully install probes - * else return 0 (success) + * Return: pointer to the new uprobe on success or an ERR_PTR on failure. */ -static int __uprobe_register(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) +struct uprobe *uprobe_register(struct inode *inode, + loff_t offset, loff_t ref_ctr_offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; /* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) - return -EINVAL; + return ERR_PTR(-EINVAL); /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio && !shmem_mapping(inode->i_mapping)) - return -EIO; + return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) - return -EINVAL; + return ERR_PTR(-EINVAL); /* * This ensures that copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) - return -EINVAL; + return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) - return -EINVAL; + return ERR_PTR(-EINVAL); - retry: uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); - if (!uprobe) - return -ENOMEM; if (IS_ERR(uprobe)) - return PTR_ERR(uprobe); + return uprobe; - /* - * We can race with uprobe_unregister()->delete_uprobe(). - * Check uprobe_is_active() and retry if it is false. - */ down_write(&uprobe->register_rwsem); - ret = -EAGAIN; - if (likely(uprobe_is_active(uprobe))) { - consumer_add(uprobe, uc); - ret = register_for_each_vma(uprobe, uc); - if (ret) - __uprobe_unregister(uprobe, uc); - } + consumer_add(uprobe, uc); + ret = register_for_each_vma(uprobe, uc); up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); - if (unlikely(ret == -EAGAIN)) - goto retry; - return ret; -} + if (ret) { + uprobe_unregister_nosync(uprobe, uc); + /* + * Registration might have partially succeeded, so we can have + * this consumer being called right at this time. We need to + * sync here. It's ok, it's unlikely slow path. + */ + uprobe_unregister_sync(); + return ERR_PTR(ret); + } -int uprobe_register(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc) -{ - return __uprobe_register(inode, offset, 0, uc); + return uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); -int uprobe_register_refctr(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) -{ - return __uprobe_register(inode, offset, ref_ctr_offset, uc); -} -EXPORT_SYMBOL_GPL(uprobe_register_refctr); - -/* - * uprobe_apply - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. +/** + * uprobe_apply - add or remove the breakpoints according to @uc->filter + * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints + * Return: 0 on success or negative error code. */ -int uprobe_apply(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc, bool add) +int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { - struct uprobe *uprobe; struct uprobe_consumer *con; - int ret = -ENOENT; - - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) - return ret; + int ret = -ENOENT, srcu_idx; down_write(&uprobe->register_rwsem); - for (con = uprobe->consumers; con && con != uc ; con = con->next) - ; - if (con) - ret = register_for_each_vma(uprobe, add ? uc : NULL); + + srcu_idx = srcu_read_lock(&uprobes_srcu); + list_for_each_entry_srcu(con, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { + if (con == uc) { + ret = register_for_each_vma(uprobe, add ? uc : NULL); + break; + } + } + srcu_read_unlock(&uprobes_srcu, srcu_idx); + up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); return ret; } @@ -1305,15 +1336,17 @@ static void build_probe_list(struct inode *inode, u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset < min) break; - list_add(&u->pending_list, head); - get_uprobe(u); + /* if uprobe went away, it's safe to ignore it */ + if (try_get_uprobe(u)) + list_add(&u->pending_list, head); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; - list_add(&u->pending_list, head); - get_uprobe(u); + /* if uprobe went away, it's safe to ignore it */ + if (try_get_uprobe(u)) + list_add(&u->pending_list, head); } } read_unlock(&uprobes_treelock); @@ -1384,7 +1417,7 @@ int uprobe_mmap(struct vm_area_struct *vma) */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && - filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { + filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } @@ -1489,7 +1522,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) struct xol_area *area; void *insns; - area = kmalloc(sizeof(*area), GFP_KERNEL); + area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; @@ -1499,7 +1532,6 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) goto free_area; area->xol_mapping.name = "[uprobes]"; - area->xol_mapping.fault = NULL; area->xol_mapping.pages = area->pages; area->pages[0] = alloc_page(GFP_HIGHUSER); if (!area->pages[0]) @@ -1771,6 +1803,12 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) return -ENOMEM; *n = *o; + /* + * uprobe's refcnt has to be positive at this point, kept by + * utask->return_instances items; return_instances can't be + * removed right now, as task is blocked due to duping; so + * get_uprobe() is safe to use here. + */ get_uprobe(n->uprobe); n->next = NULL; @@ -1782,12 +1820,6 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) return 0; } -static void uprobe_warn(struct task_struct *t, const char *msg) -{ - pr_warn("uprobe: %s:%d failed to %s\n", - current->comm, current->pid, msg); -} - static void dup_xol_work(struct callback_head *work) { if (current->flags & PF_EXITING) @@ -1884,9 +1916,13 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) return; } + /* we need to bump refcount to store uprobe in utask */ + if (!try_get_uprobe(uprobe)) + return; + ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); if (!ri) - return; + goto fail; trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); @@ -1913,8 +1949,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - - ri->uprobe = get_uprobe(uprobe); + ri->uprobe = uprobe; ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; @@ -1925,8 +1960,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) utask->return_instances = ri; return; - fail: +fail: kfree(ri); + put_uprobe(uprobe); } /* Prepare to single-step probed instruction out of line. */ @@ -1941,9 +1977,14 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) if (!utask) return -ENOMEM; + if (!try_get_uprobe(uprobe)) + return -EINVAL; + xol_vaddr = xol_get_insn_slot(uprobe); - if (!xol_vaddr) - return -ENOMEM; + if (!xol_vaddr) { + err = -ENOMEM; + goto err_out; + } utask->xol_vaddr = xol_vaddr; utask->vaddr = bp_vaddr; @@ -1951,12 +1992,15 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { xol_free_insn_slot(current); - return err; + goto err_out; } utask->active_uprobe = uprobe; utask->state = UTASK_SSTEP; return 0; +err_out: + put_uprobe(uprobe); + return err; } /* @@ -2029,13 +2073,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (likely(result == 0)) goto out; - /* - * The NULL 'tsk' here ensures that any faults that occur here - * will not be accounted to the task. 'mm' *is* current->mm, - * but we treat this as a 'remote' access since it is - * essentially a kernel access to the memory. - */ - result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL); + result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); if (result < 0) return result; @@ -2046,7 +2084,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) return is_trap_insn(&opcode); } -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) +/* assumes being inside RCU protected region */ +static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; @@ -2059,7 +2098,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); - uprobe = find_uprobe(inode, offset); + uprobe = find_uprobe_rcu(inode, offset); } if (!uprobe) @@ -2080,9 +2119,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) struct uprobe_consumer *uc; int remove = UPROBE_HANDLER_REMOVE; bool need_prep = false; /* prepare return uprobe, when needed */ + bool has_consumers = false; + + current->utask->auprobe = &uprobe->arch; - down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { int rc = 0; if (uc->handler) { @@ -2095,16 +2137,24 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) need_prep = true; remove &= rc; + has_consumers = true; } + current->utask->auprobe = NULL; if (need_prep && !remove) prepare_uretprobe(uprobe, regs); /* put bp at return */ - if (remove && uprobe->consumers) { - WARN_ON(!uprobe_is_active(uprobe)); - unapply_uprobe(uprobe, current->mm); + if (remove && has_consumers) { + down_read(&uprobe->register_rwsem); + + /* re-check that removal is still required, this time under lock */ + if (!filter_chain(uprobe, current->mm)) { + WARN_ON(!uprobe_is_active(uprobe)); + unapply_uprobe(uprobe, current->mm); + } + + up_read(&uprobe->register_rwsem); } - up_read(&uprobe->register_rwsem); } static void @@ -2112,13 +2162,15 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) { struct uprobe *uprobe = ri->uprobe; struct uprobe_consumer *uc; + int srcu_idx; - down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { + srcu_idx = srcu_read_lock(&uprobes_srcu); + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { if (uc->ret_handler) uc->ret_handler(uc, ri->func, regs); } - up_read(&uprobe->register_rwsem); + srcu_read_unlock(&uprobes_srcu, srcu_idx); } static struct return_instance *find_next_ret_chain(struct return_instance *ri) @@ -2203,13 +2255,15 @@ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; - int is_swbp; + int is_swbp, srcu_idx; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == uprobe_get_trampoline_vaddr()) return uprobe_handle_trampoline(regs); - uprobe = find_active_uprobe(bp_vaddr, &is_swbp); + srcu_idx = srcu_read_lock(&uprobes_srcu); + + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ @@ -2225,7 +2279,7 @@ static void handle_swbp(struct pt_regs *regs) */ instruction_pointer_set(regs, bp_vaddr); } - return; + goto out; } /* change it in advance for ->handler() and restart */ @@ -2260,12 +2314,12 @@ static void handle_swbp(struct pt_regs *regs) if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; - if (!pre_ssout(uprobe, regs, bp_vaddr)) - return; + if (pre_ssout(uprobe, regs, bp_vaddr)) + goto out; - /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ out: - put_uprobe(uprobe); + /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ + srcu_read_unlock(&uprobes_srcu, srcu_idx); } /* diff --git a/kernel/exit.c b/kernel/exit.c index 7430852a8571..0d62a53605df 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -428,7 +428,7 @@ static void coredump_task_exit(struct task_struct *tsk) complete(&core_state->startup); for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); + set_current_state(TASK_IDLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; schedule(); diff --git a/kernel/fork.c b/kernel/fork.c index cc760491f201..d4b2d543f48c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1861,7 +1861,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) prev_cputime_init(&sig->prev_cputime); #ifdef CONFIG_POSIX_TIMERS - INIT_LIST_HEAD(&sig->posix_timers); + INIT_HLIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; #endif @@ -2311,7 +2311,6 @@ __latent_entropy struct task_struct *copy_process( #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; - p->cpuset_slab_spread_rotor = NUMA_NO_NODE; seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); #endif #ifdef CONFIG_TRACE_IRQFLAGS diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index dc94e0bf2c94..271e9139de77 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -198,7 +198,7 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, irqd_clr_managed_shutdown(d); - if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) { + if (!cpumask_intersects(aff, cpu_online_mask)) { /* * Catch code which fiddles with enable_irq() on a managed * and potentially shutdown IRQ. Chained interrupt diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index eb8628390156..15a7654eff68 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -37,7 +37,7 @@ static inline bool irq_needs_fixup(struct irq_data *d) * has been removed from the online mask already. */ if (cpumask_any_but(m, cpu) < nr_cpu_ids && - cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) { + !cpumask_intersects(m, cpu_online_mask)) { /* * If this happens then there was a missed IRQ fixup at some * point. Warn about it and enforce fixup. @@ -110,7 +110,7 @@ static bool migrate_one_irq(struct irq_desc *desc) if (maskchip && chip->irq_mask) chip->irq_mask(d); - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + if (!cpumask_intersects(affinity, cpu_online_mask)) { /* * If the interrupt is managed, then shut it down and leave * the affinity untouched. diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 3d4036db15ac..1a3d483548e2 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -13,7 +13,6 @@ struct irq_sim_work_ctx { struct irq_work work; - int irq_base; unsigned int irq_count; unsigned long *pending; struct irq_domain *domain; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cea8f6874b1f..e0bff21f30e0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -128,72 +128,98 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); -static int irq_domain_set_name(struct irq_domain *domain, - const struct fwnode_handle *fwnode, - enum irq_domain_bus_token bus_token) +static int alloc_name(struct irq_domain *domain, char *base, enum irq_domain_bus_token bus_token) +{ + if (bus_token == DOMAIN_BUS_ANY) + domain->name = kasprintf(GFP_KERNEL, "%s", base); + else + domain->name = kasprintf(GFP_KERNEL, "%s-%d", base, bus_token); + if (!domain->name) + return -ENOMEM; + + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + return 0; +} + +static int alloc_fwnode_name(struct irq_domain *domain, const struct fwnode_handle *fwnode, + enum irq_domain_bus_token bus_token, const char *suffix) +{ + const char *sep = suffix ? "-" : ""; + const char *suf = suffix ? : ""; + char *name; + + if (bus_token == DOMAIN_BUS_ANY) + name = kasprintf(GFP_KERNEL, "%pfw%s%s", fwnode, sep, suf); + else + name = kasprintf(GFP_KERNEL, "%pfw%s%s-%d", fwnode, sep, suf, bus_token); + if (!name) + return -ENOMEM; + + /* + * fwnode paths contain '/', which debugfs is legitimately unhappy + * about. Replace them with ':', which does the trick and is not as + * offensive as '\'... + */ + domain->name = strreplace(name, '/', ':'); + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + return 0; +} + +static int alloc_unknown_name(struct irq_domain *domain, enum irq_domain_bus_token bus_token) { static atomic_t unknown_domains; - struct irqchip_fwid *fwid; + int id = atomic_inc_return(&unknown_domains); + + if (bus_token == DOMAIN_BUS_ANY) + domain->name = kasprintf(GFP_KERNEL, "unknown-%d", id); + else + domain->name = kasprintf(GFP_KERNEL, "unknown-%d-%d", id, bus_token); + if (!domain->name) + return -ENOMEM; + + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + return 0; +} + +static int irq_domain_set_name(struct irq_domain *domain, const struct irq_domain_info *info) +{ + enum irq_domain_bus_token bus_token = info->bus_token; + const struct fwnode_handle *fwnode = info->fwnode; if (is_fwnode_irqchip(fwnode)) { - fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + + /* + * The name_suffix is only intended to be used to avoid a name + * collision when multiple domains are created for a single + * device and the name is picked using a real device node. + * (Typical use-case is regmap-IRQ controllers for devices + * providing more than one physical IRQ.) There should be no + * need to use name_suffix with irqchip-fwnode. + */ + if (info->name_suffix) + return -EINVAL; switch (fwid->type) { case IRQCHIP_FWNODE_NAMED: case IRQCHIP_FWNODE_NAMED_ID: - domain->name = bus_token ? - kasprintf(GFP_KERNEL, "%s-%d", - fwid->name, bus_token) : - kstrdup(fwid->name, GFP_KERNEL); - if (!domain->name) - return -ENOMEM; - domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; - break; + return alloc_name(domain, fwid->name, bus_token); default: domain->name = fwid->name; - if (bus_token) { - domain->name = kasprintf(GFP_KERNEL, "%s-%d", - fwid->name, bus_token); - if (!domain->name) - return -ENOMEM; - domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; - } - break; + if (bus_token != DOMAIN_BUS_ANY) + return alloc_name(domain, fwid->name, bus_token); } - } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || - is_software_node(fwnode)) { - char *name; - - /* - * fwnode paths contain '/', which debugfs is legitimately - * unhappy about. Replace them with ':', which does - * the trick and is not as offensive as '\'... - */ - name = bus_token ? - kasprintf(GFP_KERNEL, "%pfw-%d", fwnode, bus_token) : - kasprintf(GFP_KERNEL, "%pfw", fwnode); - if (!name) - return -ENOMEM; - domain->name = strreplace(name, '/', ':'); - domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || is_software_node(fwnode)) { + return alloc_fwnode_name(domain, fwnode, bus_token, info->name_suffix); } - if (!domain->name) { - if (fwnode) - pr_err("Invalid fwnode type for irqdomain\n"); - domain->name = bus_token ? - kasprintf(GFP_KERNEL, "unknown-%d-%d", - atomic_inc_return(&unknown_domains), - bus_token) : - kasprintf(GFP_KERNEL, "unknown-%d", - atomic_inc_return(&unknown_domains)); - if (!domain->name) - return -ENOMEM; - domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; - } + if (domain->name) + return 0; - return 0; + if (fwnode) + pr_err("Invalid fwnode type for irqdomain\n"); + return alloc_unknown_name(domain, bus_token); } static struct irq_domain *__irq_domain_create(const struct irq_domain_info *info) @@ -211,7 +237,7 @@ static struct irq_domain *__irq_domain_create(const struct irq_domain_info *info if (!domain) return ERR_PTR(-ENOMEM); - err = irq_domain_set_name(domain, info->fwnode, info->bus_token); + err = irq_domain_set_name(domain, info); if (err) { kfree(domain); return ERR_PTR(err); @@ -267,13 +293,20 @@ static void irq_domain_free(struct irq_domain *domain) kfree(domain); } -/** - * irq_domain_instantiate() - Instantiate a new irq domain data structure - * @info: Domain information pointer pointing to the information for this domain - * - * Return: A pointer to the instantiated irq domain or an ERR_PTR value. - */ -struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info) +static void irq_domain_instantiate_descs(const struct irq_domain_info *info) +{ + if (!IS_ENABLED(CONFIG_SPARSE_IRQ)) + return; + + if (irq_alloc_descs(info->virq_base, info->virq_base, info->size, + of_node_to_nid(to_of_node(info->fwnode))) < 0) { + pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", + info->virq_base); + } +} + +static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info *info, + bool cond_alloc_descs, bool force_associate) { struct irq_domain *domain; int err; @@ -306,6 +339,19 @@ struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info) __irq_domain_publish(domain); + if (cond_alloc_descs && info->virq_base > 0) + irq_domain_instantiate_descs(info); + + /* + * Legacy interrupt domains have a fixed Linux interrupt number + * associated. Other interrupt domains can request association by + * providing a Linux interrupt number > 0. + */ + if (force_associate || info->virq_base > 0) { + irq_domain_associate_many(domain, info->virq_base, info->hwirq_base, + info->size - info->hwirq_base); + } + return domain; err_domain_gc_remove: @@ -315,6 +361,17 @@ err_domain_free: irq_domain_free(domain); return ERR_PTR(err); } + +/** + * irq_domain_instantiate() - Instantiate a new irq domain data structure + * @info: Domain information pointer pointing to the information for this domain + * + * Return: A pointer to the instantiated irq domain or an ERR_PTR value. + */ +struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info) +{ + return __irq_domain_instantiate(info, false, false); +} EXPORT_SYMBOL_GPL(irq_domain_instantiate); /** @@ -413,28 +470,13 @@ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, .fwnode = fwnode, .size = size, .hwirq_max = size, + .virq_base = first_irq, .ops = ops, .host_data = host_data, }; - struct irq_domain *domain; - - domain = irq_domain_instantiate(&info); - if (IS_ERR(domain)) - return NULL; + struct irq_domain *domain = __irq_domain_instantiate(&info, true, false); - if (first_irq > 0) { - if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { - /* attempt to allocated irq_descs */ - int rc = irq_alloc_descs(first_irq, first_irq, size, - of_node_to_nid(to_of_node(fwnode))); - if (rc < 0) - pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", - first_irq); - } - irq_domain_associate_many(domain, first_irq, 0, size); - } - - return domain; + return IS_ERR(domain) ? NULL : domain; } EXPORT_SYMBOL_GPL(irq_domain_create_simple); @@ -476,18 +518,14 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, .fwnode = fwnode, .size = first_hwirq + size, .hwirq_max = first_hwirq + size, + .hwirq_base = first_hwirq, + .virq_base = first_irq, .ops = ops, .host_data = host_data, }; - struct irq_domain *domain; + struct irq_domain *domain = __irq_domain_instantiate(&info, false, true); - domain = irq_domain_instantiate(&info); - if (IS_ERR(domain)) - return NULL; - - irq_domain_associate_many(domain, first_irq, first_hwirq, size); - - return domain; + return IS_ERR(domain) ? NULL : domain; } EXPORT_SYMBOL_GPL(irq_domain_create_legacy); @@ -1365,7 +1403,7 @@ static int irq_domain_trim_hierarchy(unsigned int virq) tail = NULL; /* The first entry must have a valid irqchip */ - if (!irq_data->chip || IS_ERR(irq_data->chip)) + if (IS_ERR_OR_NULL(irq_data->chip)) return -EINVAL; /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index dd53298ef1a5..f0803d6bd296 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -218,21 +218,20 @@ static void irq_validate_effective_affinity(struct irq_data *data) static inline void irq_validate_effective_affinity(struct irq_data *data) { } #endif +static DEFINE_PER_CPU(struct cpumask, __tmp_mask); + int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { + struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask); struct irq_desc *desc = irq_data_to_desc(data); struct irq_chip *chip = irq_data_get_irq_chip(data); const struct cpumask *prog_mask; int ret; - static DEFINE_RAW_SPINLOCK(tmp_mask_lock); - static struct cpumask tmp_mask; - if (!chip || !chip->irq_set_affinity) return -EINVAL; - raw_spin_lock(&tmp_mask_lock); /* * If this is a managed interrupt and housekeeping is enabled on * it check whether the requested affinity mask intersects with @@ -258,11 +257,11 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); - cpumask_and(&tmp_mask, mask, hk_mask); - if (!cpumask_intersects(&tmp_mask, cpu_online_mask)) + cpumask_and(tmp_mask, mask, hk_mask); + if (!cpumask_intersects(tmp_mask, cpu_online_mask)) prog_mask = mask; else - prog_mask = &tmp_mask; + prog_mask = tmp_mask; } else { prog_mask = mask; } @@ -272,16 +271,14 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, * unless we are being asked to force the affinity (in which * case we do as we are told). */ - cpumask_and(&tmp_mask, prog_mask, cpu_online_mask); - if (!force && !cpumask_empty(&tmp_mask)) - ret = chip->irq_set_affinity(data, &tmp_mask, force); + cpumask_and(tmp_mask, prog_mask, cpu_online_mask); + if (!force && !cpumask_empty(tmp_mask)) + ret = chip->irq_set_affinity(data, tmp_mask, force); else if (force) ret = chip->irq_set_affinity(data, mask, force); else ret = -EINVAL; - raw_spin_unlock(&tmp_mask_lock); - switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 61ca924ef4b4..eb150afd671f 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -26,7 +26,7 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) * The outgoing CPU might be the last online target in a pending * interrupt move. If that's the case clear the pending move bit. */ - if (cpumask_any_and(desc->pending_mask, cpu_online_mask) >= nr_cpu_ids) { + if (!cpumask_intersects(desc->pending_mask, cpu_online_mask)) { irqd_clr_move_pending(data); return false; } @@ -74,7 +74,7 @@ void irq_move_masked_irq(struct irq_data *idata) * For correct operation this depends on the caller * masking the irqs. */ - if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) { + if (cpumask_intersects(desc->pending_mask, cpu_online_mask)) { int ret; ret = irq_do_set_affinity(data, desc->pending_mask, false); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 5fa0547ece0c..1c7e5159064c 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -82,7 +82,7 @@ static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec, desc->dev = dev; desc->nvec_used = nvec; if (affinity) { - desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL); + desc->affinity = kmemdup_array(affinity, nvec, sizeof(*desc->affinity), GFP_KERNEL); if (!desc->affinity) { kfree(desc); return NULL; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 8cccdf40725a..9081ada81c3d 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -52,10 +52,8 @@ static int show_irq_affinity(int type, struct seq_file *m) case AFFINITY: case AFFINITY_LIST: mask = desc->irq_common_data.affinity; -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (irqd_is_setaffinity_pending(&desc->irq_data)) - mask = desc->pending_mask; -#endif + if (irq_move_pending(&desc->irq_data)) + mask = irq_desc_get_pending_mask(desc); break; case EFFECTIVE: case EFFECTIVE_LIST: @@ -142,7 +140,7 @@ static ssize_t write_irq_affinity(int type, struct file *file, int err; if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) - return -EIO; + return -EPERM; if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; @@ -362,8 +360,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) goto out_unlock; #ifdef CONFIG_SMP + umode_t umode = S_IRUGO; + + if (irq_can_set_affinity_usr(desc->irq_data.irq)) + umode |= S_IWUSR; + /* create /proc/irq/<irq>/smp_affinity */ - proc_create_data("smp_affinity", 0644, desc->dir, + proc_create_data("smp_affinity", umode, desc->dir, &irq_affinity_proc_ops, irqp); /* create /proc/irq/<irq>/affinity_hint */ @@ -371,7 +374,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) irq_affinity_hint_proc_show, irqp); /* create /proc/irq/<irq>/smp_affinity_list */ - proc_create_data("smp_affinity_list", 0644, desc->dir, + proc_create_data("smp_affinity_list", umode, desc->dir, &irq_affinity_list_proc_ops, irqp); proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, diff --git a/kernel/kcov.c b/kernel/kcov.c index 274b6b7c718d..28a6be6e64fd 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/hashtable.h> #include <linux/init.h> +#include <linux/jiffies.h> #include <linux/kmsan-checks.h> #include <linux/mm.h> #include <linux/preempt.h> @@ -1067,6 +1068,32 @@ u64 kcov_common_handle(void) } EXPORT_SYMBOL(kcov_common_handle); +#ifdef CONFIG_KCOV_SELFTEST +static void __init selftest(void) +{ + unsigned long start; + + pr_err("running self test\n"); + /* + * Test that interrupts don't produce spurious coverage. + * The coverage callback filters out interrupt code, but only + * after the handler updates preempt count. Some code periodically + * leaks out of that section and leads to spurious coverage. + * It's hard to call the actual interrupt handler directly, + * so we just loop here for a bit waiting for a timer interrupt. + * We set kcov_mode to enable tracing, but don't setup the area, + * so any attempt to trace will crash. Note: we must not call any + * potentially traced functions in this region. + */ + start = jiffies; + current->kcov_mode = KCOV_MODE_TRACE_PC; + while ((jiffies - start) * MSEC_PER_SEC / HZ < 300) + ; + current->kcov_mode = 0; + pr_err("done running self test\n"); +} +#endif + static int __init kcov_init(void) { int cpu; @@ -1086,6 +1113,10 @@ static int __init kcov_init(void) */ debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops); +#ifdef CONFIG_KCOV_SELFTEST + selftest(); +#endif + return 0; } diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 1d1d1b0e4248..53b21ae30e00 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -225,7 +225,7 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o { char kbuf[KSYM_NAME_LEN]; char *arg; - int read_len = count < (sizeof(kbuf) - 1) ? count : (sizeof(kbuf) - 1); + const size_t read_len = min(count, sizeof(kbuf) - 1); if (copy_from_user(kbuf, buf, read_len)) return -EFAULT; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 3d64290d24c9..3eedb8c226ad 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -752,7 +752,7 @@ static int kexec_calculate_store_digests(struct kimage *image) #ifdef CONFIG_CRASH_HOTPLUG /* Exclude elfcorehdr segment to allow future changes via hotplug */ - if (j == image->elfcorehdr_index) + if (i == image->elfcorehdr_index) continue; #endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 0349f957e672..7963deac33c3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -56,6 +56,7 @@ #include <linux/kprobes.h> #include <linux/lockdep.h> #include <linux/context_tracking.h> +#include <linux/console.h> #include <asm/sections.h> @@ -573,8 +574,10 @@ static struct lock_trace *save_trace(void) if (!debug_locks_off_graph_unlock()) return NULL; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return NULL; } @@ -887,11 +890,13 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { instrumentation_begin(); debug_locks_off(); + nbcon_cpu_emergency_enter(); printk(KERN_ERR "BUG: looking up invalid subclass: %u\n", subclass); printk(KERN_ERR "turning off the locking correctness validator.\n"); dump_stack(); + nbcon_cpu_emergency_exit(); instrumentation_end(); return NULL; } @@ -968,11 +973,13 @@ static bool assign_lock_key(struct lockdep_map *lock) else { /* Debug-check: all keys must be persistent! */ debug_locks_off(); + nbcon_cpu_emergency_enter(); pr_err("INFO: trying to register non-static key.\n"); pr_err("The code is fine but needs lockdep annotation, or maybe\n"); pr_err("you didn't initialize this object before use?\n"); pr_err("turning off the locking correctness validator.\n"); dump_stack(); + nbcon_cpu_emergency_exit(); return false; } @@ -1316,8 +1323,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) return NULL; } + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return NULL; } nr_lock_classes++; @@ -1349,11 +1358,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (verbose(class)) { graph_unlock(); + nbcon_cpu_emergency_enter(); printk("\nnew class %px: %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); dump_stack(); + nbcon_cpu_emergency_exit(); if (!graph_lock()) { return NULL; @@ -1392,8 +1403,10 @@ static struct lock_list *alloc_list_entry(void) if (!debug_locks_off_graph_unlock()) return NULL; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return NULL; } nr_list_entries++; @@ -2039,6 +2052,8 @@ static noinline void print_circular_bug(struct lock_list *this, depth = get_lock_depth(target); + nbcon_cpu_emergency_enter(); + print_circular_bug_header(target, depth, check_src, check_tgt); parent = get_lock_parent(target); @@ -2057,6 +2072,8 @@ static noinline void print_circular_bug(struct lock_list *this, printk("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static noinline void print_bfs_bug(int ret) @@ -2569,6 +2586,8 @@ print_bad_irq_dependency(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=====================================================\n"); pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", @@ -2618,11 +2637,13 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn(" and %s-irq-unsafe lock:\n", irqclass); next_root->trace = save_trace(); if (!next_root->trace) - return; + goto out; print_shortest_lock_dependencies(forwards_entry, next_root); pr_warn("\nstack backtrace:\n"); dump_stack(); +out: + nbcon_cpu_emergency_exit(); } static const char *state_names[] = { @@ -2987,6 +3008,8 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("============================================\n"); pr_warn("WARNING: possible recursive locking detected\n"); @@ -3009,6 +3032,8 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } /* @@ -3606,6 +3631,8 @@ static void print_collision(struct task_struct *curr, struct held_lock *hlock_next, struct lock_chain *chain) { + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("============================\n"); pr_warn("WARNING: chain_key collision\n"); @@ -3622,6 +3649,8 @@ static void print_collision(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } #endif @@ -3712,8 +3741,10 @@ static inline int add_chain_cache(struct task_struct *curr, if (!debug_locks_off_graph_unlock()) return 0; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; } chain->chain_key = chain_key; @@ -3730,8 +3761,10 @@ static inline int add_chain_cache(struct task_struct *curr, if (!debug_locks_off_graph_unlock()) return 0; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; } @@ -3970,6 +4003,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, if (!debug_locks_off() || debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("================================\n"); pr_warn("WARNING: inconsistent lock state\n"); @@ -3998,6 +4033,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } /* @@ -4032,6 +4069,8 @@ print_irq_inversion_bug(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("========================================================\n"); pr_warn("WARNING: possible irq lock inversion dependency detected\n"); @@ -4072,11 +4111,13 @@ print_irq_inversion_bug(struct task_struct *curr, pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); root->trace = save_trace(); if (!root->trace) - return; + goto out; print_shortest_lock_dependencies(other, root); pr_warn("\nstack backtrace:\n"); dump_stack(); +out: + nbcon_cpu_emergency_exit(); } /* @@ -4153,6 +4194,8 @@ void print_irqtrace_events(struct task_struct *curr) { const struct irqtrace_events *trace = &curr->irqtrace; + nbcon_cpu_emergency_enter(); + printk("irq event stamp: %u\n", trace->irq_events); printk("hardirqs last enabled at (%u): [<%px>] %pS\n", trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, @@ -4166,6 +4209,8 @@ void print_irqtrace_events(struct task_struct *curr) printk("softirqs last disabled at (%u): [<%px>] %pS\n", trace->softirq_disable_event, (void *)trace->softirq_disable_ip, (void *)trace->softirq_disable_ip); + + nbcon_cpu_emergency_exit(); } static int HARDIRQ_verbose(struct lock_class *class) @@ -4686,10 +4731,12 @@ unlock: * We must printk outside of the graph_lock: */ if (ret == 2) { + nbcon_cpu_emergency_enter(); printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); print_lock(this); print_irqtrace_events(curr); dump_stack(); + nbcon_cpu_emergency_exit(); } return ret; @@ -4730,6 +4777,8 @@ print_lock_invalid_wait_context(struct task_struct *curr, if (debug_locks_silent) return 0; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=============================\n"); pr_warn("[ BUG: Invalid wait context ]\n"); @@ -4749,6 +4798,8 @@ print_lock_invalid_wait_context(struct task_struct *curr, pr_warn("stack backtrace:\n"); dump_stack(); + nbcon_cpu_emergency_exit(); + return 0; } @@ -4956,6 +5007,8 @@ print_lock_nested_lock_not_held(struct task_struct *curr, if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("==================================\n"); pr_warn("WARNING: Nested lock was not taken\n"); @@ -4976,6 +5029,8 @@ print_lock_nested_lock_not_held(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static int __lock_is_held(const struct lockdep_map *lock, int read); @@ -5024,11 +5079,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, debug_class_ops_inc(class); if (very_verbose(class)) { + nbcon_cpu_emergency_enter(); printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); dump_stack(); + nbcon_cpu_emergency_exit(); } /* @@ -5155,6 +5212,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); printk(KERN_DEBUG "depth: %i max: %lu!\n", curr->lockdep_depth, MAX_LOCK_DEPTH); @@ -5162,6 +5220,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, lockdep_print_held_locks(current); debug_show_all_locks(); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; } @@ -5181,6 +5240,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=====================================\n"); pr_warn("WARNING: bad unlock balance detected!\n"); @@ -5197,6 +5258,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static noinstr int match_held_lock(const struct held_lock *hlock, @@ -5901,6 +5964,8 @@ static void print_lock_contention_bug(struct task_struct *curr, if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=================================\n"); pr_warn("WARNING: bad contention detected!\n"); @@ -5917,6 +5982,8 @@ static void print_lock_contention_bug(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static void @@ -6536,6 +6603,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=========================\n"); pr_warn("WARNING: held lock freed!\n"); @@ -6548,6 +6617,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static inline int not_in_range(const void* mem_from, unsigned long mem_len, @@ -6594,6 +6665,8 @@ static void print_held_locks_bug(void) if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("====================================\n"); pr_warn("WARNING: %s/%d still has locks held!\n", @@ -6603,6 +6676,8 @@ static void print_held_locks_bug(void) lockdep_print_held_locks(current); pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } void debug_check_no_locks_held(void) @@ -6660,6 +6735,7 @@ asmlinkage __visible void lockdep_sys_exit(void) if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("================================================\n"); pr_warn("WARNING: lock held when returning to user space!\n"); @@ -6668,6 +6744,7 @@ asmlinkage __visible void lockdep_sys_exit(void) pr_warn("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); + nbcon_cpu_emergency_exit(); } /* @@ -6684,6 +6761,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) bool rcu = warn_rcu_enter(); /* Note: the following can be executed concurrently, so be careful. */ + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("=============================\n"); pr_warn("WARNING: suspicious RCU usage\n"); @@ -6722,6 +6800,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) lockdep_print_held_locks(curr); pr_warn("\nstack backtrace:\n"); dump_stack(); + nbcon_cpu_emergency_exit(); warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 88d08eeb8bc0..fba1229f1de6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1644,6 +1644,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, } static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, + struct rt_mutex_base *lock, struct rt_mutex_waiter *w) { /* @@ -1656,10 +1657,10 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, if (build_ww_mutex() && w->ww_ctx) return; - /* - * Yell loudly and stop the task right here. - */ + raw_spin_unlock_irq(&lock->wait_lock); + WARN(1, "rtmutex deadlock detected\n"); + while (1) { set_current_state(TASK_INTERRUPTIBLE); rt_mutex_schedule(); @@ -1713,7 +1714,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, } else { __set_current_state(TASK_RUNNING); remove_waiter(lock, waiter); - rt_mutex_handle_deadlock(ret, chwalk, waiter); + rt_mutex_handle_deadlock(ret, chwalk, lock, waiter); } /* diff --git a/kernel/module/Makefile b/kernel/module/Makefile index a10b2b9a6fdf..50ffcc413b54 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -5,7 +5,7 @@ # These are called from save_stack_trace() on slub debug path, # and produce insane amounts of uninteresting coverage. -KCOV_INSTRUMENT_module.o := n +KCOV_INSTRUMENT_main.o := n obj-y += main.o obj-y += strict_rwx.o diff --git a/kernel/padata.c b/kernel/padata.c index 0fa6c2895460..d899f34558af 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -404,7 +404,8 @@ void padata_do_serial(struct padata_priv *padata) /* Sort in ascending order of sequence number. */ list_for_each_prev(pos, &reorder->list) { cur = list_entry(pos, struct padata_priv, list); - if (cur->seq_nr < padata->seq_nr) + /* Compare by difference to consider integer wrap around */ + if ((signed int)(cur->seq_nr - padata->seq_nr) < 0) break; } list_add(&padata->list, pos); @@ -512,9 +513,12 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) * thread function. Load balance large jobs between threads by * increasing the number of chunks, guarantee at least the minimum * chunk size from the caller, and honor the caller's alignment. + * Ensure chunk_size is at least 1 to prevent divide-by-0 + * panic in padata_mt_helper(). */ ps.chunk_size = job->size / (ps.nworks * load_balance_factor); ps.chunk_size = max(ps.chunk_size, job->min_chunk); + ps.chunk_size = max(ps.chunk_size, 1ul); ps.chunk_size = roundup(ps.chunk_size, job->align); /* diff --git a/kernel/panic.c b/kernel/panic.c index 2a0449144f82..753d12f4dc8f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -374,6 +374,8 @@ void panic(const char *fmt, ...) panic_other_cpus_shutdown(_crash_kexec_post_notifiers); + printk_legacy_allow_panic_sync(); + /* * Run any panic handlers, including those that might need to * add information to the kmsg dump output. @@ -463,6 +465,7 @@ void panic(const char *fmt, ...) * Explicitly flush the kernel log buffer one last time. */ console_flush_on_panic(CONSOLE_FLUSH_PENDING); + nbcon_atomic_flush_unsafe(); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { @@ -682,6 +685,7 @@ bool oops_may_print(void) */ void oops_enter(void) { + nbcon_cpu_emergency_enter(); tracing_off(); /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); @@ -704,6 +708,7 @@ void oops_exit(void) { do_oops_enter_exit(); print_oops_end_marker(); + nbcon_cpu_emergency_exit(); kmsg_dump(KMSG_DUMP_OOPS); } @@ -715,6 +720,8 @@ struct warn_args { void __warn(const char *file, int line, void *caller, unsigned taint, struct pt_regs *regs, struct warn_args *args) { + nbcon_cpu_emergency_enter(); + disable_trace_on_warning(); if (file) @@ -750,6 +757,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); + + nbcon_cpu_emergency_exit(); } #ifdef CONFIG_BUG diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 0a213f69a9e4..e35829d36039 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1123,11 +1123,11 @@ static const char * const hibernation_modes[] = { static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { + ssize_t count = 0; int i; - char *start = buf; if (!hibernation_available()) - return sprintf(buf, "[disabled]\n"); + return sysfs_emit(buf, "[disabled]\n"); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (!hibernation_modes[i]) @@ -1147,12 +1147,16 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, continue; } if (i == hibernation_mode) - buf += sprintf(buf, "[%s] ", hibernation_modes[i]); + count += sysfs_emit_at(buf, count, "[%s] ", hibernation_modes[i]); else - buf += sprintf(buf, "%s ", hibernation_modes[i]); + count += sysfs_emit_at(buf, count, "%s ", hibernation_modes[i]); } - buf += sprintf(buf, "\n"); - return buf-start; + + /* Convert the last space to a newline if needed. */ + if (count > 0) + buf[count - 1] = '\n'; + + return count; } static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1210,8 +1214,8 @@ power_attr(disk); static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device), - MINOR(swsusp_resume_device)); + return sysfs_emit(buf, "%d:%d\n", MAJOR(swsusp_resume_device), + MINOR(swsusp_resume_device)); } static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1270,7 +1274,7 @@ power_attr(resume); static ssize_t resume_offset_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)swsusp_resume_block); + return sysfs_emit(buf, "%llu\n", (unsigned long long)swsusp_resume_block); } static ssize_t resume_offset_store(struct kobject *kobj, @@ -1293,7 +1297,7 @@ power_attr(resume_offset); static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", image_size); + return sysfs_emit(buf, "%lu\n", image_size); } static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1314,7 +1318,7 @@ power_attr(image_size); static ssize_t reserved_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", reserved_size); + return sysfs_emit(buf, "%lu\n", reserved_size); } static ssize_t reserved_size_store(struct kobject *kobj, diff --git a/kernel/power/main.c b/kernel/power/main.c index a9e0693aaf69..6254814d4817 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -115,7 +115,7 @@ int pm_async_enabled = 1; static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_async_enabled); + return sysfs_emit(buf, "%d\n", pm_async_enabled); } static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -139,7 +139,7 @@ power_attr(pm_async); static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - char *s = buf; + ssize_t count = 0; suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) { @@ -149,17 +149,17 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, const char *label = mem_sleep_states[i]; if (mem_sleep_current == i) - s += sprintf(s, "[%s] ", label); + count += sysfs_emit_at(buf, count, "[%s] ", label); else - s += sprintf(s, "%s ", label); + count += sysfs_emit_at(buf, count, "%s ", label); } } /* Convert the last space to a newline if needed. */ - if (s != buf) - *(s-1) = '\n'; + if (count > 0) + buf[count - 1] = '\n'; - return (s - buf); + return count; } static suspend_state_t decode_suspend_state(const char *buf, size_t n) @@ -220,7 +220,7 @@ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); static ssize_t sync_on_suspend_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", sync_on_suspend_enabled); + return sysfs_emit(buf, "%d\n", sync_on_suspend_enabled); } static ssize_t sync_on_suspend_store(struct kobject *kobj, @@ -257,22 +257,22 @@ static const char * const pm_tests[__TEST_AFTER_LAST] = { static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - char *s = buf; + ssize_t count = 0; int level; for (level = TEST_FIRST; level <= TEST_MAX; level++) if (pm_tests[level]) { if (level == pm_test_level) - s += sprintf(s, "[%s] ", pm_tests[level]); + count += sysfs_emit_at(buf, count, "[%s] ", pm_tests[level]); else - s += sprintf(s, "%s ", pm_tests[level]); + count += sysfs_emit_at(buf, count, "%s ", pm_tests[level]); } - if (s != buf) - /* convert the last space to a newline */ - *(s-1) = '\n'; + /* Convert the last space to a newline if needed. */ + if (count > 0) + buf[count - 1] = '\n'; - return (s - buf); + return count; } static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -390,7 +390,7 @@ static const char * const suspend_step_names[] = { static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ - return sprintf(buf, format_str, suspend_stats._name); \ + return sysfs_emit(buf, format_str, suspend_stats._name);\ } \ static struct kobj_attribute _name = __ATTR_RO(_name) @@ -404,7 +404,7 @@ suspend_attr(max_hw_sleep, "%llu\n"); static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ - return sprintf(buf, "%u\n", \ + return sysfs_emit(buf, "%u\n", \ suspend_stats.step_failures[step-1]); \ } \ static struct kobj_attribute _name = __ATTR_RO(_name) @@ -428,7 +428,7 @@ static ssize_t last_failed_dev_show(struct kobject *kobj, index %= REC_FAILED_NUM; last_failed_dev = suspend_stats.failed_devs[index]; - return sprintf(buf, "%s\n", last_failed_dev); + return sysfs_emit(buf, "%s\n", last_failed_dev); } static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev); @@ -442,7 +442,7 @@ static ssize_t last_failed_errno_show(struct kobject *kobj, index %= REC_FAILED_NUM; last_failed_errno = suspend_stats.errno[index]; - return sprintf(buf, "%d\n", last_failed_errno); + return sysfs_emit(buf, "%d\n", last_failed_errno); } static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno); @@ -456,7 +456,7 @@ static ssize_t last_failed_step_show(struct kobject *kobj, index %= REC_FAILED_NUM; step = suspend_stats.failed_steps[index]; - return sprintf(buf, "%s\n", suspend_step_names[step]); + return sysfs_emit(buf, "%s\n", suspend_step_names[step]); } static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step); @@ -571,7 +571,7 @@ bool pm_print_times_enabled; static ssize_t pm_print_times_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_print_times_enabled); + return sysfs_emit(buf, "%d\n", pm_print_times_enabled); } static ssize_t pm_print_times_store(struct kobject *kobj, @@ -604,7 +604,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, if (!pm_wakeup_irq()) return -ENODATA; - return sprintf(buf, "%u\n", pm_wakeup_irq()); + return sysfs_emit(buf, "%u\n", pm_wakeup_irq()); } power_attr_ro(pm_wakeup_irq); @@ -620,7 +620,7 @@ EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); static ssize_t pm_debug_messages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_debug_messages_on); + return sysfs_emit(buf, "%d\n", pm_debug_messages_on); } static ssize_t pm_debug_messages_store(struct kobject *kobj, @@ -668,21 +668,23 @@ struct kobject *power_kobj; static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - char *s = buf; + ssize_t count = 0; #ifdef CONFIG_SUSPEND suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (pm_states[i]) - s += sprintf(s,"%s ", pm_states[i]); + count += sysfs_emit_at(buf, count, "%s ", pm_states[i]); #endif if (hibernation_available()) - s += sprintf(s, "disk "); - if (s != buf) - /* convert the last space to a newline */ - *(s-1) = '\n'; - return (s - buf); + count += sysfs_emit_at(buf, count, "disk "); + + /* Convert the last space to a newline if needed. */ + if (count > 0) + buf[count - 1] = '\n'; + + return count; } static suspend_state_t decode_state(const char *buf, size_t n) @@ -782,7 +784,7 @@ static ssize_t wakeup_count_show(struct kobject *kobj, unsigned int val; return pm_get_wakeup_count(&val, true) ? - sprintf(buf, "%u\n", val) : -EINTR; + sysfs_emit(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, @@ -824,17 +826,17 @@ static ssize_t autosleep_show(struct kobject *kobj, suspend_state_t state = pm_autosleep_state(); if (state == PM_SUSPEND_ON) - return sprintf(buf, "off\n"); + return sysfs_emit(buf, "off\n"); #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) - return sprintf(buf, "%s\n", pm_states[state] ? + return sysfs_emit(buf, "%s\n", pm_states[state] ? pm_states[state] : "error"); #endif #ifdef CONFIG_HIBERNATION - return sprintf(buf, "disk\n"); + return sysfs_emit(buf, "disk\n"); #else - return sprintf(buf, "error"); + return sysfs_emit(buf, "error\n"); #endif } @@ -903,7 +905,7 @@ int pm_trace_enabled; static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_trace_enabled); + return sysfs_emit(buf, "%d\n", pm_trace_enabled); } static ssize_t @@ -940,7 +942,7 @@ power_attr_ro(pm_trace_dev_match); static ssize_t pm_freeze_timeout_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", freeze_timeout_msecs); + return sysfs_emit(buf, "%u\n", freeze_timeout_msecs); } static ssize_t pm_freeze_timeout_store(struct kobject *kobj, diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 405eddbda4fc..30894d8f0a78 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1365,11 +1365,6 @@ static unsigned int count_highmem_pages(void) } return n; } -#else -static inline void *saveable_highmem_page(struct zone *z, unsigned long p) -{ - return NULL; -} #endif /* CONFIG_HIGHMEM */ /** diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 19dcc5832651..3fcb48502adb 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -2,11 +2,12 @@ /* * internal.h - printk internal definitions */ -#include <linux/percpu.h> #include <linux/console.h> -#include "printk_ringbuffer.h" +#include <linux/percpu.h> +#include <linux/types.h> #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) +struct ctl_table; void __init printk_sysctl_init(void); int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); @@ -20,6 +21,19 @@ int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write, (con->flags & CON_BOOT) ? "boot" : "", \ con->name, con->index, ##__VA_ARGS__) +/* + * Identify if legacy printing is forced in a dedicated kthread. If + * true, all printing via console lock occurs within a dedicated + * legacy printer thread. The only exception is on panic, after the + * nbcon consoles have had their chance to print the panic messages + * first. + */ +#ifdef CONFIG_PREEMPT_RT +# define force_legacy_kthread() (true) +#else +# define force_legacy_kthread() (false) +#endif + #ifdef CONFIG_PRINTK #ifdef CONFIG_PRINTK_CALLER @@ -43,7 +57,11 @@ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; +struct printk_ringbuffer; +struct dev_printk_info; + extern struct printk_ringbuffer *prb; +extern bool printk_kthreads_running; __printf(4, 0) int vprintk_store(int facility, int level, @@ -53,6 +71,9 @@ int vprintk_store(int facility, int level, __printf(1, 0) int vprintk_default(const char *fmt, va_list args); __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); +void __printk_safe_enter(void); +void __printk_safe_exit(void); + bool printk_percpu_data_ready(void); #define printk_safe_enter_irqsave(flags) \ @@ -68,15 +89,85 @@ bool printk_percpu_data_ready(void); } while (0) void defer_console_output(void); +bool is_printk_legacy_deferred(void); u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags); +void console_lock_spinning_enable(void); +int console_lock_spinning_disable_and_check(int cookie); u64 nbcon_seq_read(struct console *con); void nbcon_seq_force(struct console *con, u64 seq); bool nbcon_alloc(struct console *con); -void nbcon_init(struct console *con); void nbcon_free(struct console *con); +enum nbcon_prio nbcon_get_default_prio(void); +void nbcon_atomic_flush_pending(void); +bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, + int cookie, bool use_atomic); +bool nbcon_kthread_create(struct console *con); +void nbcon_kthread_stop(struct console *con); +void nbcon_kthreads_wake(void); + +/* + * Check if the given console is currently capable and allowed to print + * records. Note that this function does not consider the current context, + * which can also play a role in deciding if @con can be used to print + * records. + */ +static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) +{ + if (!(flags & CON_ENABLED)) + return false; + + if ((flags & CON_SUSPENDED)) + return false; + + if (flags & CON_NBCON) { + /* The write_atomic() callback is optional. */ + if (use_atomic && !con->write_atomic) + return false; + + /* + * For the !use_atomic case, @printk_kthreads_running is not + * checked because the write_thread() callback is also used + * via the legacy loop when the printer threads are not + * available. + */ + } else { + if (!con->write) + return false; + } + + /* + * Console drivers may assume that per-cpu resources have been + * allocated. So unless they're explicitly marked as being able to + * cope (CON_ANYTIME) don't call them until this CPU is officially up. + */ + if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) + return false; + + return true; +} + +/** + * nbcon_kthread_wake - Wake up a console printing thread + * @con: Console to operate on + */ +static inline void nbcon_kthread_wake(struct console *con) +{ + /* + * Guarantee any new records can be seen by tasks preparing to wait + * before this context checks if the rcuwait is empty. + * + * The full memory barrier in rcuwait_wake_up() pairs with the full + * memory barrier within set_current_state() of + * ___rcuwait_wait_event(), which is called after prepare_to_rcuwait() + * adds the waiter but before it has checked the wait condition. + * + * This pairs with nbcon_kthread_func:A. + */ + rcuwait_wake_up(&con->rcuwait); /* LMM(nbcon_kthread_wake:A) */ +} #else @@ -84,6 +175,8 @@ void nbcon_free(struct console *con); #define PRINTK_MESSAGE_MAX 0 #define PRINTKRB_RECORD_MAX 0 +#define printk_kthreads_running (false) + /* * In !PRINTK builds we still export console_sem * semaphore and some of console functions (console_unlock()/etc.), so @@ -93,14 +186,119 @@ void nbcon_free(struct console *con); #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) static inline bool printk_percpu_data_ready(void) { return false; } +static inline void defer_console_output(void) { } +static inline bool is_printk_legacy_deferred(void) { return false; } static inline u64 nbcon_seq_read(struct console *con) { return 0; } static inline void nbcon_seq_force(struct console *con, u64 seq) { } static inline bool nbcon_alloc(struct console *con) { return false; } -static inline void nbcon_init(struct console *con) { } static inline void nbcon_free(struct console *con) { } +static inline enum nbcon_prio nbcon_get_default_prio(void) { return NBCON_PRIO_NONE; } +static inline void nbcon_atomic_flush_pending(void) { } +static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, + int cookie, bool use_atomic) { return false; } +static inline void nbcon_kthread_wake(struct console *con) { } +static inline void nbcon_kthreads_wake(void) { } + +static inline bool console_is_usable(struct console *con, short flags, + bool use_atomic) { return false; } #endif /* CONFIG_PRINTK */ +extern bool have_boot_console; +extern bool have_nbcon_console; +extern bool have_legacy_console; +extern bool legacy_allow_panic_sync; + +/** + * struct console_flush_type - Define available console flush methods + * @nbcon_atomic: Flush directly using nbcon_atomic() callback + * @nbcon_offload: Offload flush to printer thread + * @legacy_direct: Call the legacy loop in this context + * @legacy_offload: Offload the legacy loop into IRQ or legacy thread + * + * Note that the legacy loop also flushes the nbcon consoles. + */ +struct console_flush_type { + bool nbcon_atomic; + bool nbcon_offload; + bool legacy_direct; + bool legacy_offload; +}; + +/* + * Identify which console flushing methods should be used in the context of + * the caller. + */ +static inline void printk_get_console_flush_type(struct console_flush_type *ft) +{ + memset(ft, 0, sizeof(*ft)); + + switch (nbcon_get_default_prio()) { + case NBCON_PRIO_NORMAL: + if (have_nbcon_console && !have_boot_console) { + if (printk_kthreads_running) + ft->nbcon_offload = true; + else + ft->nbcon_atomic = true; + } + + /* Legacy consoles are flushed directly when possible. */ + if (have_legacy_console || have_boot_console) { + if (!is_printk_legacy_deferred()) + ft->legacy_direct = true; + else + ft->legacy_offload = true; + } + break; + + case NBCON_PRIO_EMERGENCY: + if (have_nbcon_console && !have_boot_console) + ft->nbcon_atomic = true; + + /* Legacy consoles are flushed directly when possible. */ + if (have_legacy_console || have_boot_console) { + if (!is_printk_legacy_deferred()) + ft->legacy_direct = true; + else + ft->legacy_offload = true; + } + break; + + case NBCON_PRIO_PANIC: + /* + * In panic, the nbcon consoles will directly print. But + * only allowed if there are no boot consoles. + */ + if (have_nbcon_console && !have_boot_console) + ft->nbcon_atomic = true; + + if (have_legacy_console || have_boot_console) { + /* + * This is the same decision as NBCON_PRIO_NORMAL + * except that offloading never occurs in panic. + * + * Note that console_flush_on_panic() will flush + * legacy consoles anyway, even if unsafe. + */ + if (!is_printk_legacy_deferred()) + ft->legacy_direct = true; + + /* + * In panic, if nbcon atomic printing occurs, + * the legacy consoles must remain silent until + * explicitly allowed. + */ + if (ft->nbcon_atomic && !legacy_allow_panic_sync) + ft->legacy_direct = false; + } + break; + + default: + WARN_ON_ONCE(1); + break; + } +} + extern struct printk_buffers printk_shared_pbufs; /** @@ -135,4 +333,5 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, #ifdef CONFIG_PRINTK void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped); +void console_prepend_replay(struct printk_message *pmsg); #endif diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index c8093bcc01fe..fd12efcc4aed 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -2,11 +2,25 @@ // Copyright (C) 2022 Linutronix GmbH, John Ogness // Copyright (C) 2022 Intel, Thomas Gleixner -#include <linux/kernel.h> +#include <linux/atomic.h> +#include <linux/bug.h> #include <linux/console.h> #include <linux/delay.h> +#include <linux/errno.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/irqflags.h> +#include <linux/kthread.h> +#include <linux/minmax.h> +#include <linux/percpu.h> +#include <linux/preempt.h> #include <linux/slab.h> +#include <linux/smp.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> #include "internal.h" +#include "printk_ringbuffer.h" /* * Printk console printing implementation for consoles which does not depend * on the legacy style console_lock mechanism. @@ -172,9 +186,6 @@ void nbcon_seq_force(struct console *con, u64 seq) u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq)); - - /* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */ - con->seq = 0; } /** @@ -231,6 +242,13 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, struct nbcon_state new; do { + /* + * Panic does not imply that the console is owned. However, it + * is critical that non-panic CPUs during panic are unable to + * acquire ownership in order to satisfy the assumptions of + * nbcon_waiter_matches(). In particular, the assumption that + * lower priorities are ignored during panic. + */ if (other_cpu_in_panic()) return -EPERM; @@ -262,18 +280,29 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) /* * The request context is well defined by the @req_prio because: * - * - Only a context with a higher priority can take over the request. + * - Only a context with a priority higher than the owner can become + * a waiter. + * - Only a context with a priority higher than the waiter can + * directly take over the request. * - There are only three priorities. * - Only one CPU is allowed to request PANIC priority. * - Lower priorities are ignored during panic() until reboot. * * As a result, the following scenario is *not* possible: * - * 1. Another context with a higher priority directly takes ownership. - * 2. The higher priority context releases the ownership. - * 3. A lower priority context takes the ownership. - * 4. Another context with the same priority as this context + * 1. This context is currently a waiter. + * 2. Another context with a higher priority than this context + * directly takes ownership. + * 3. The higher priority context releases the ownership. + * 4. Another lower priority context takes the ownership. + * 5. Another context with the same priority as this context * creates a request and starts waiting. + * + * Event #1 implies this context is EMERGENCY. + * Event #2 implies the new context is PANIC. + * Event #3 occurs when panic() has flushed the console. + * Events #4 and #5 are not possible due to the other_cpu_in_panic() + * check in nbcon_context_try_acquire_direct(). */ return (cur->req_prio == expected_prio); @@ -531,6 +560,7 @@ static struct printk_buffers panic_nbcon_pbufs; * nbcon_context_try_acquire - Try to acquire nbcon console * @ctxt: The context of the caller * + * Context: Under @ctxt->con->device_lock() or local_irq_save(). * Return: True if the console was acquired. False otherwise. * * If the caller allowed an unsafe hostile takeover, on success the @@ -538,7 +568,6 @@ static struct printk_buffers panic_nbcon_pbufs; * in an unsafe state. Otherwise, on success the caller may assume * the console is not in an unsafe state. */ -__maybe_unused static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) { unsigned int cpu = smp_processor_id(); @@ -581,11 +610,29 @@ static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu, int expected_prio) { /* - * Since consoles can only be acquired by higher priorities, - * owning contexts are uniquely identified by @prio. However, - * since contexts can unexpectedly lose ownership, it is - * possible that later another owner appears with the same - * priority. For this reason @cpu is also needed. + * A similar function, nbcon_waiter_matches(), only deals with + * EMERGENCY and PANIC priorities. However, this function must also + * deal with the NORMAL priority, which requires additional checks + * and constraints. + * + * For the case where preemption and interrupts are disabled, it is + * enough to also verify that the owning CPU has not changed. + * + * For the case where preemption or interrupts are enabled, an + * external synchronization method *must* be used. In particular, + * the driver-specific locking mechanism used in device_lock() + * (including disabling migration) should be used. It prevents + * scenarios such as: + * + * 1. [Task A] owns a context with NBCON_PRIO_NORMAL on [CPU X] and + * is scheduled out. + * 2. Another context takes over the lock with NBCON_PRIO_EMERGENCY + * and releases it. + * 3. [Task B] acquires a context with NBCON_PRIO_NORMAL on [CPU X] + * and is scheduled out. + * 4. [Task A] gets running on [CPU X] and sees that the console is + * still owned by a task on [CPU X] with NBON_PRIO_NORMAL. Thus + * [Task A] thinks it is the owner when it is not. */ if (cur->prio != expected_prio) @@ -784,6 +831,19 @@ out: return nbcon_context_can_proceed(ctxt, &cur); } +static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, + char *buf, unsigned int len) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + struct console *con = ctxt->console; + struct nbcon_state cur; + + wctxt->outbuf = buf; + wctxt->len = len; + nbcon_state_read(con, &cur); + wctxt->unsafe_takeover = cur.unsafe_takeover; +} + /** * nbcon_enter_unsafe - Enter an unsafe region in the driver * @wctxt: The write context that was handed to the write function @@ -799,8 +859,12 @@ out: bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + bool is_owner; - return nbcon_context_enter_unsafe(ctxt); + is_owner = nbcon_context_enter_unsafe(ctxt); + if (!is_owner) + nbcon_write_context_set_buf(wctxt, NULL, 0); + return is_owner; } EXPORT_SYMBOL_GPL(nbcon_enter_unsafe); @@ -819,14 +883,47 @@ EXPORT_SYMBOL_GPL(nbcon_enter_unsafe); bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + bool ret; - return nbcon_context_exit_unsafe(ctxt); + ret = nbcon_context_exit_unsafe(ctxt); + if (!ret) + nbcon_write_context_set_buf(wctxt, NULL, 0); + return ret; } EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); /** + * nbcon_reacquire_nobuf - Reacquire a console after losing ownership + * while printing + * @wctxt: The write context that was handed to the write callback + * + * Since ownership can be lost at any time due to handover or takeover, a + * printing context _must_ be prepared to back out immediately and + * carefully. However, there are scenarios where the printing context must + * reacquire ownership in order to finalize or revert hardware changes. + * + * This function allows a printing context to reacquire ownership using the + * same priority as its previous ownership. + * + * Note that after a successful reacquire the printing context will have no + * output buffer because that has been lost. This function cannot be used to + * resume printing. + */ +void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + while (!nbcon_context_try_acquire(ctxt)) + cpu_relax(); + + nbcon_write_context_set_buf(wctxt, NULL, 0); +} +EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf); + +/** * nbcon_emit_next_record - Emit a record in the acquired context * @wctxt: The write context that will be handed to the write function + * @use_atomic: True if the write_atomic() callback is to be used * * Return: True if this context still owns the console. False if * ownership was handed over or taken. @@ -840,8 +937,7 @@ EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); * When true is returned, @wctxt->ctxt.backlog indicates whether there are * still records pending in the ringbuffer, */ -__maybe_unused -static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) +static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; @@ -852,7 +948,22 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) unsigned long con_dropped; struct nbcon_state cur; unsigned long dropped; - bool done; + unsigned long ulseq; + + /* + * This function should never be called for consoles that have not + * implemented the necessary callback for writing: i.e. legacy + * consoles and, when atomic, nbcon consoles with no write_atomic(). + * Handle it as if ownership was lost and try to continue. + * + * Note that for nbcon consoles the write_thread() callback is + * mandatory and was already checked in nbcon_alloc(). + */ + if (WARN_ON_ONCE((use_atomic && !con->write_atomic) || + !(console_srcu_read_flags(con) & CON_NBCON))) { + nbcon_context_release(ctxt); + return false; + } /* * The printk buffers are filled within an unsafe section. This @@ -878,6 +989,29 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) if (dropped && !is_extended) console_prepend_dropped(&pmsg, dropped); + /* + * If the previous owner was assigned the same record, this context + * has taken over ownership and is replaying the record. Prepend a + * message to let the user know the record is replayed. + */ + ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq)); + if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) { + console_prepend_replay(&pmsg); + } else { + /* + * Ensure this context is still the owner before trying to + * update @nbcon_prev_seq. Otherwise the value in @ulseq may + * not be from the previous owner and instead be some later + * value from the context that took over ownership. + */ + nbcon_state_read(con, &cur); + if (!nbcon_context_can_proceed(ctxt, &cur)) + return false; + + atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq, + __u64seq_to_ulseq(pmsg.seq)); + } + if (!nbcon_context_exit_unsafe(ctxt)) return false; @@ -886,22 +1020,27 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) goto update_con; /* Initialize the write context for driver callbacks. */ - wctxt->outbuf = &pmsg.pbufs->outbuf[0]; - wctxt->len = pmsg.outbuf_len; - nbcon_state_read(con, &cur); - wctxt->unsafe_takeover = cur.unsafe_takeover; + nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len); - if (con->write_atomic) { - done = con->write_atomic(con, wctxt); - } else { + if (use_atomic) + con->write_atomic(con, wctxt); + else + con->write_thread(con, wctxt); + + if (!wctxt->outbuf) { + /* + * Ownership was lost and reacquired by the driver. Handle it + * as if ownership was lost. + */ nbcon_context_release(ctxt); - WARN_ON_ONCE(1); - done = false; + return false; } - /* If not done, the emit was aborted. */ - if (!done) - return false; + /* + * Ownership may have been lost but _not_ reacquired by the driver. + * This case is detected and handled when entering unsafe to update + * dropped/seq values. + */ /* * Since any dropped message was successfully output, reset the @@ -928,54 +1067,650 @@ update_con: return nbcon_context_exit_unsafe(ctxt); } +/* + * nbcon_emit_one - Print one record for an nbcon console using the + * specified callback + * @wctxt: An initialized write context struct to use for this context + * @use_atomic: True if the write_atomic() callback is to be used + * + * Return: True, when a record has been printed and there are still + * pending records. The caller might want to continue flushing. + * + * False, when there is no pending record, or when the console + * context cannot be acquired, or the ownership has been lost. + * The caller should give up. Either the job is done, cannot be + * done, or will be handled by the owning context. + * + * This is an internal helper to handle the locking of the console before + * calling nbcon_emit_next_record(). + */ +static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + struct console *con = ctxt->console; + unsigned long flags; + bool ret = false; + + if (!use_atomic) { + con->device_lock(con, &flags); + + /* + * Ensure this stays on the CPU to make handover and + * takeover possible. + */ + cant_migrate(); + } + + if (!nbcon_context_try_acquire(ctxt)) + goto out; + + /* + * nbcon_emit_next_record() returns false when the console was + * handed over or taken over. In both cases the context is no + * longer valid. + * + * The higher priority printing context takes over responsibility + * to print the pending records. + */ + if (!nbcon_emit_next_record(wctxt, use_atomic)) + goto out; + + nbcon_context_release(ctxt); + + ret = ctxt->backlog; +out: + if (!use_atomic) + con->device_unlock(con, flags); + return ret; +} + /** - * nbcon_alloc - Allocate buffers needed by the nbcon console - * @con: Console to allocate buffers for + * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup + * @con: Console to operate on + * @ctxt: The nbcon context from nbcon_context_try_acquire() * - * Return: True on success. False otherwise and the console cannot - * be used. + * Return: True if the thread should shutdown or if the console is + * allowed to print and a record is available. False otherwise. * - * This is not part of nbcon_init() because buffer allocation must - * be performed earlier in the console registration process. + * After the thread wakes up, it must first check if it should shutdown before + * attempting any printing. */ -bool nbcon_alloc(struct console *con) +static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt) { - if (con->flags & CON_BOOT) { + bool ret = false; + short flags; + int cookie; + + if (kthread_should_stop()) + return true; + + cookie = console_srcu_read_lock(); + + flags = console_srcu_read_flags(con); + if (console_is_usable(con, flags, false)) { + /* Bring the sequence in @ctxt up to date */ + ctxt->seq = nbcon_seq_read(con); + + ret = prb_read_valid(prb, ctxt->seq, NULL); + } + + console_srcu_read_unlock(cookie); + return ret; +} + +/** + * nbcon_kthread_func - The printer thread function + * @__console: Console to operate on + * + * Return: 0 + */ +static int nbcon_kthread_func(void *__console) +{ + struct console *con = __console; + struct nbcon_write_context wctxt = { + .ctxt.console = con, + .ctxt.prio = NBCON_PRIO_NORMAL, + }; + struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); + short con_flags; + bool backlog; + int cookie; + +wait_for_event: + /* + * Guarantee this task is visible on the rcuwait before + * checking the wake condition. + * + * The full memory barrier within set_current_state() of + * ___rcuwait_wait_event() pairs with the full memory + * barrier within rcuwait_has_sleeper(). + * + * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A. + */ + rcuwait_wait_event(&con->rcuwait, + nbcon_kthread_should_wakeup(con, ctxt), + TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */ + + do { + if (kthread_should_stop()) + return 0; + + backlog = false; + /* - * Boot console printing is synchronized with legacy console - * printing, so boot consoles can share the same global printk - * buffers. + * Keep the srcu read lock around the entire operation so that + * synchronize_srcu() can guarantee that the kthread stopped + * or suspended printing. */ - con->pbufs = &printk_shared_pbufs; + cookie = console_srcu_read_lock(); + + con_flags = console_srcu_read_flags(con); + + if (console_is_usable(con, con_flags, false)) + backlog = nbcon_emit_one(&wctxt, false); + + console_srcu_read_unlock(cookie); + + cond_resched(); + + } while (backlog); + + goto wait_for_event; +} + +/** + * nbcon_irq_work - irq work to wake console printer thread + * @irq_work: The irq work to operate on + */ +static void nbcon_irq_work(struct irq_work *irq_work) +{ + struct console *con = container_of(irq_work, struct console, irq_work); + + nbcon_kthread_wake(con); +} + +static inline bool rcuwait_has_sleeper(struct rcuwait *w) +{ + /* + * Guarantee any new records can be seen by tasks preparing to wait + * before this context checks if the rcuwait is empty. + * + * This full memory barrier pairs with the full memory barrier within + * set_current_state() of ___rcuwait_wait_event(), which is called + * after prepare_to_rcuwait() adds the waiter but before it has + * checked the wait condition. + * + * This pairs with nbcon_kthread_func:A. + */ + smp_mb(); /* LMM(rcuwait_has_sleeper:A) */ + return rcuwait_active(w); +} + +/** + * nbcon_kthreads_wake - Wake up printing threads using irq_work + */ +void nbcon_kthreads_wake(void) +{ + struct console *con; + int cookie; + + if (!printk_kthreads_running) + return; + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + if (!(console_srcu_read_flags(con) & CON_NBCON)) + continue; + + /* + * Only schedule irq_work if the printing thread is + * actively waiting. If not waiting, the thread will + * notice by itself that it has work to do. + */ + if (rcuwait_has_sleeper(&con->rcuwait)) + irq_work_queue(&con->irq_work); + } + console_srcu_read_unlock(cookie); +} + +/* + * nbcon_kthread_stop - Stop a console printer thread + * @con: Console to operate on + */ +void nbcon_kthread_stop(struct console *con) +{ + lockdep_assert_console_list_lock_held(); + + if (!con->kthread) + return; + + kthread_stop(con->kthread); + con->kthread = NULL; +} + +/** + * nbcon_kthread_create - Create a console printer thread + * @con: Console to operate on + * + * Return: True if the kthread was started or already exists. + * Otherwise false and @con must not be registered. + * + * This function is called when it will be expected that nbcon consoles are + * flushed using the kthread. The messages printed with NBCON_PRIO_NORMAL + * will be no longer flushed by the legacy loop. This is why failure must + * be fatal for console registration. + * + * If @con was already registered and this function fails, @con must be + * unregistered before the global state variable @printk_kthreads_running + * can be set. + */ +bool nbcon_kthread_create(struct console *con) +{ + struct task_struct *kt; + + lockdep_assert_console_list_lock_held(); + + if (con->kthread) + return true; + + kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index); + if (WARN_ON(IS_ERR(kt))) { + con_printk(KERN_ERR, con, "failed to start printing thread\n"); + return false; + } + + con->kthread = kt; + + /* + * It is important that console printing threads are scheduled + * shortly after a printk call and with generous runtime budgets. + */ + sched_set_normal(con->kthread, -20); + + return true; +} + +/* Track the nbcon emergency nesting per CPU. */ +static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting); +static unsigned int early_nbcon_pcpu_emergency_nesting __initdata; + +/** + * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer + * + * Context: For reading, any context. For writing, any context which could + * not be migrated to another CPU. + * Return: Either a pointer to the per CPU emergency nesting counter of + * the current CPU or to the init data during early boot. + * + * The function is safe for reading per-CPU variables in any context because + * preemption is disabled if the current CPU is in the emergency state. See + * also nbcon_cpu_emergency_enter(). + */ +static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void) +{ + /* + * The value of __printk_percpu_data_ready gets set in normal + * context and before SMP initialization. As a result it could + * never change while inside an nbcon emergency section. + */ + if (!printk_percpu_data_ready()) + return &early_nbcon_pcpu_emergency_nesting; + + return raw_cpu_ptr(&nbcon_pcpu_emergency_nesting); +} + +/** + * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon + * printing on the current CPU + * + * Context: Any context. + * Return: The nbcon_prio to use for acquiring an nbcon console in this + * context for printing. + * + * The function is safe for reading per-CPU data in any context because + * preemption is disabled if the current CPU is in the emergency or panic + * state. + */ +enum nbcon_prio nbcon_get_default_prio(void) +{ + unsigned int *cpu_emergency_nesting; + + if (this_cpu_in_panic()) + return NBCON_PRIO_PANIC; + + cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); + if (*cpu_emergency_nesting) + return NBCON_PRIO_EMERGENCY; + + return NBCON_PRIO_NORMAL; +} + +/** + * nbcon_legacy_emit_next_record - Print one record for an nbcon console + * in legacy contexts + * @con: The console to print on + * @handover: Will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding + * both the console_lock and the SRCU read lock. Otherwise it + * is set to false. + * @cookie: The cookie from the SRCU read lock. + * @use_atomic: Set true when called in an atomic or unknown context. + * It affects which nbcon callback will be used: write_atomic() + * or write_thread(). + * + * When false, the write_thread() callback is used and would be + * called in a preemtible context unless disabled by the + * device_lock. The legacy handover is not allowed in this mode. + * + * Context: Any context except NMI. + * Return: True, when a record has been printed and there are still + * pending records. The caller might want to continue flushing. + * + * False, when there is no pending record, or when the console + * context cannot be acquired, or the ownership has been lost. + * The caller should give up. Either the job is done, cannot be + * done, or will be handled by the owning context. + * + * This function is meant to be called by console_flush_all() to print records + * on nbcon consoles from legacy context (printing via console unlocking). + * Essentially it is the nbcon version of console_emit_next_record(). + */ +bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, + int cookie, bool use_atomic) +{ + struct nbcon_write_context wctxt = { }; + struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); + unsigned long flags; + bool progress; + + ctxt->console = con; + ctxt->prio = nbcon_get_default_prio(); + + if (use_atomic) { + /* + * In an atomic or unknown context, use the same procedure as + * in console_emit_next_record(). It allows to handover. + */ + printk_safe_enter_irqsave(flags); + console_lock_spinning_enable(); + stop_critical_timings(); + } + + progress = nbcon_emit_one(&wctxt, use_atomic); + + if (use_atomic) { + start_critical_timings(); + *handover = console_lock_spinning_disable_and_check(cookie); + printk_safe_exit_irqrestore(flags); } else { - con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL); - if (!con->pbufs) { - con_printk(KERN_ERR, con, "failed to allocate printing buffer\n"); - return false; + /* Non-atomic does not perform legacy spinning handovers. */ + *handover = false; + } + + return progress; +} + +/** + * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its + * write_atomic() callback + * @con: The nbcon console to flush + * @stop_seq: Flush up until this record + * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers + * + * Return: 0 if @con was flushed up to @stop_seq Otherwise, error code on + * failure. + * + * Errors: + * + * -EPERM: Unable to acquire console ownership. + * + * -EAGAIN: Another context took over ownership while printing. + * + * -ENOENT: A record before @stop_seq is not available. + * + * If flushing up to @stop_seq was not successful, it only makes sense for the + * caller to try again when -EAGAIN was returned. When -EPERM is returned, + * this context is not allowed to acquire the console. When -ENOENT is + * returned, it cannot be expected that the unfinalized record will become + * available. + */ +static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, + bool allow_unsafe_takeover) +{ + struct nbcon_write_context wctxt = { }; + struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); + int err = 0; + + ctxt->console = con; + ctxt->spinwait_max_us = 2000; + ctxt->prio = nbcon_get_default_prio(); + ctxt->allow_unsafe_takeover = allow_unsafe_takeover; + + if (!nbcon_context_try_acquire(ctxt)) + return -EPERM; + + while (nbcon_seq_read(con) < stop_seq) { + /* + * nbcon_emit_next_record() returns false when the console was + * handed over or taken over. In both cases the context is no + * longer valid. + */ + if (!nbcon_emit_next_record(&wctxt, true)) + return -EAGAIN; + + if (!ctxt->backlog) { + /* Are there reserved but not yet finalized records? */ + if (nbcon_seq_read(con) < stop_seq) + err = -ENOENT; + break; } } - return true; + nbcon_context_release(ctxt); + return err; +} + +/** + * nbcon_atomic_flush_pending_con - Flush specified nbcon console using its + * write_atomic() callback + * @con: The nbcon console to flush + * @stop_seq: Flush up until this record + * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers + * + * This will stop flushing before @stop_seq if another context has ownership. + * That context is then responsible for the flushing. Likewise, if new records + * are added while this context was flushing and there is no other context + * to handle the printing, this context must also flush those records. + */ +static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, + bool allow_unsafe_takeover) +{ + struct console_flush_type ft; + unsigned long flags; + int err; + +again: + /* + * Atomic flushing does not use console driver synchronization (i.e. + * it does not hold the port lock for uart consoles). Therefore IRQs + * must be disabled to avoid being interrupted and then calling into + * a driver that will deadlock trying to acquire console ownership. + */ + local_irq_save(flags); + + err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + + local_irq_restore(flags); + + /* + * If there was a new owner (-EPERM, -EAGAIN), that context is + * responsible for completing. + * + * Do not wait for records not yet finalized (-ENOENT) to avoid a + * possible deadlock. They will either get flushed by the writer or + * eventually skipped on panic CPU. + */ + if (err) + return; + + /* + * If flushing was successful but more records are available, this + * context must flush those remaining records if the printer thread + * is not available do it. + */ + printk_get_console_flush_type(&ft); + if (!ft.nbcon_offload && + prb_read_valid(prb, nbcon_seq_read(con), NULL)) { + stop_seq = prb_next_reserve_seq(prb); + goto again; + } +} + +/** + * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their + * write_atomic() callback + * @stop_seq: Flush up until this record + * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers + */ +static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover) +{ + struct console *con; + int cookie; + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); + + if (!(flags & CON_NBCON)) + continue; + + if (!console_is_usable(con, flags, true)) + continue; + + if (nbcon_seq_read(con) >= stop_seq) + continue; + + nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + } + console_srcu_read_unlock(cookie); +} + +/** + * nbcon_atomic_flush_pending - Flush all nbcon consoles using their + * write_atomic() callback + * + * Flush the backlog up through the currently newest record. Any new + * records added while flushing will not be flushed if there is another + * context available to handle the flushing. This is to avoid one CPU + * printing unbounded because other CPUs continue to add records. + */ +void nbcon_atomic_flush_pending(void) +{ + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false); +} + +/** + * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their + * write_atomic() callback and allowing unsafe hostile takeovers + * + * Flush the backlog up through the currently newest record. Unsafe hostile + * takeovers will be performed, if necessary. + */ +void nbcon_atomic_flush_unsafe(void) +{ + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true); +} + +/** + * nbcon_cpu_emergency_enter - Enter an emergency section where printk() + * messages for that CPU are flushed directly + * + * Context: Any context. Disables preemption. + * + * When within an emergency section, printk() calls will attempt to flush any + * pending messages in the ringbuffer. + */ +void nbcon_cpu_emergency_enter(void) +{ + unsigned int *cpu_emergency_nesting; + + preempt_disable(); + + cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); + (*cpu_emergency_nesting)++; +} + +/** + * nbcon_cpu_emergency_exit - Exit an emergency section + * + * Context: Within an emergency section. Enables preemption. + */ +void nbcon_cpu_emergency_exit(void) +{ + unsigned int *cpu_emergency_nesting; + + cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); + + if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0)) + (*cpu_emergency_nesting)--; + + preempt_enable(); } /** - * nbcon_init - Initialize the nbcon console specific data + * nbcon_alloc - Allocate and init the nbcon console specific data * @con: Console to initialize * - * nbcon_alloc() *must* be called and succeed before this function - * is called. + * Return: True if the console was fully allocated and initialized. + * Otherwise @con must not be registered. * - * This function expects that the legacy @con->seq has been set. + * When allocation and init was successful, the console must be properly + * freed using nbcon_free() once it is no longer needed. */ -void nbcon_init(struct console *con) +bool nbcon_alloc(struct console *con) { struct nbcon_state state = { }; - /* nbcon_alloc() must have been called and successful! */ - BUG_ON(!con->pbufs); + /* The write_thread() callback is mandatory. */ + if (WARN_ON(!con->write_thread)) + return false; - nbcon_seq_force(con, con->seq); + rcuwait_init(&con->rcuwait); + init_irq_work(&con->irq_work, nbcon_irq_work); + atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL); nbcon_state_set(con, &state); + + /* + * Initialize @nbcon_seq to the highest possible sequence number so + * that practically speaking it will have nothing to print until a + * desired initial sequence number has been set via nbcon_seq_force(). + */ + atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), ULSEQ_MAX(prb)); + + if (con->flags & CON_BOOT) { + /* + * Boot console printing is synchronized with legacy console + * printing, so boot consoles can share the same global printk + * buffers. + */ + con->pbufs = &printk_shared_pbufs; + } else { + con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL); + if (!con->pbufs) { + con_printk(KERN_ERR, con, "failed to allocate printing buffer\n"); + return false; + } + + if (printk_kthreads_running) { + if (!nbcon_kthread_create(con)) { + kfree(con->pbufs); + con->pbufs = NULL; + return false; + } + } + } + + return true; } /** @@ -986,6 +1721,9 @@ void nbcon_free(struct console *con) { struct nbcon_state state = { }; + if (printk_kthreads_running) + nbcon_kthread_stop(con); + nbcon_state_set(con, &state); /* Boot consoles share global printk buffers. */ @@ -994,3 +1732,85 @@ void nbcon_free(struct console *con) con->pbufs = NULL; } + +/** + * nbcon_device_try_acquire - Try to acquire nbcon console and enter unsafe + * section + * @con: The nbcon console to acquire + * + * Context: Under the locking mechanism implemented in + * @con->device_lock() including disabling migration. + * Return: True if the console was acquired. False otherwise. + * + * Console drivers will usually use their own internal synchronization + * mechasism to synchronize between console printing and non-printing + * activities (such as setting baud rates). However, nbcon console drivers + * supporting atomic consoles may also want to mark unsafe sections when + * performing non-printing activities in order to synchronize against their + * atomic_write() callback. + * + * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL + * and marks it unsafe for handover/takeover. + */ +bool nbcon_device_try_acquire(struct console *con) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt); + + cant_migrate(); + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->console = con; + ctxt->prio = NBCON_PRIO_NORMAL; + + if (!nbcon_context_try_acquire(ctxt)) + return false; + + if (!nbcon_context_enter_unsafe(ctxt)) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(nbcon_device_try_acquire); + +/** + * nbcon_device_release - Exit unsafe section and release the nbcon console + * @con: The nbcon console acquired in nbcon_device_try_acquire() + */ +void nbcon_device_release(struct console *con) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt); + struct console_flush_type ft; + int cookie; + + if (!nbcon_context_exit_unsafe(ctxt)) + return; + + nbcon_context_release(ctxt); + + /* + * This context must flush any new records added while the console + * was locked if the printer thread is not available to do it. The + * console_srcu_read_lock must be taken to ensure the console is + * usable throughout flushing. + */ + cookie = console_srcu_read_lock(); + printk_get_console_flush_type(&ft); + if (console_is_usable(con, console_srcu_read_flags(con), true) && + !ft.nbcon_offload && + prb_read_valid(prb, nbcon_seq_read(con), NULL)) { + /* + * If nbcon_atomic flushing is not available, fallback to + * using the legacy loop. + */ + if (ft.nbcon_atomic) { + __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false); + } else if (ft.legacy_direct) { + if (console_trylock()) + console_unlock(); + } else if (ft.legacy_offload) { + printk_trigger_flush(); + } + } + console_srcu_read_unlock(cookie); +} +EXPORT_SYMBOL_GPL(nbcon_device_release); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c22b07049c38..71e4fe6f9b85 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -34,6 +34,7 @@ #include <linux/security.h> #include <linux/memblock.h> #include <linux/syscalls.h> +#include <linux/syscore_ops.h> #include <linux/vmcore_info.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> @@ -282,6 +283,7 @@ EXPORT_SYMBOL(console_list_unlock); * Return: A cookie to pass to console_srcu_read_unlock(). */ int console_srcu_read_lock(void) + __acquires(&console_srcu) { return srcu_read_lock_nmisafe(&console_srcu); } @@ -295,6 +297,7 @@ EXPORT_SYMBOL(console_srcu_read_lock); * Counterpart to console_srcu_read_lock() */ void console_srcu_read_unlock(int cookie) + __releases(&console_srcu) { srcu_read_unlock_nmisafe(&console_srcu, cookie); } @@ -461,14 +464,43 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); +/* + * Specifies if a legacy console is registered. If legacy consoles are + * present, it is necessary to perform the console lock/unlock dance + * whenever console flushing should occur. + */ +bool have_legacy_console; + +/* + * Specifies if an nbcon console is registered. If nbcon consoles are present, + * synchronous printing of legacy consoles will not occur during panic until + * the backtrace has been stored to the ringbuffer. + */ +bool have_nbcon_console; + +/* + * Specifies if a boot console is registered. If boot consoles are present, + * nbcon consoles cannot print simultaneously and must be synchronized by + * the console lock. This is because boot consoles and nbcon consoles may + * have mapped the same hardware. + */ +bool have_boot_console; + +/* See printk_legacy_allow_panic_sync() for details. */ +bool legacy_allow_panic_sync; + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); +static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; +/* True when _all_ printer threads are available for printing. */ +bool printk_kthreads_running; + struct latched_seq { seqcount_latch_t latch; u64 val[2]; @@ -1850,7 +1882,7 @@ static bool console_waiter; * there may be a waiter spinning (like a spinlock). Also it must be * ready to hand over the lock at the end of the section. */ -static void console_lock_spinning_enable(void) +void console_lock_spinning_enable(void) { /* * Do not use spinning in panic(). The panic CPU wants to keep the lock. @@ -1889,7 +1921,7 @@ lockdep: * * Return: 1 if the lock rights were passed, 0 otherwise. */ -static int console_lock_spinning_disable_and_check(int cookie) +int console_lock_spinning_disable_and_check(int cookie) { int waiter; @@ -2300,12 +2332,30 @@ out: return ret; } +/* + * This acts as a one-way switch to allow legacy consoles to print from + * the printk() caller context on a panic CPU. It also attempts to flush + * the legacy consoles in this context. + */ +void printk_legacy_allow_panic_sync(void) +{ + struct console_flush_type ft; + + legacy_allow_panic_sync = true; + + printk_get_console_flush_type(&ft); + if (ft.legacy_direct) { + if (console_trylock()) + console_unlock(); + } +} + asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { + struct console_flush_type ft; int printed_len; - bool in_sched = false; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) @@ -2319,17 +2369,26 @@ asmlinkage int vprintk_emit(int facility, int level, if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace) return 0; + printk_get_console_flush_type(&ft); + + /* If called from the scheduler, we can not call up(). */ if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; - in_sched = true; + ft.legacy_offload |= ft.legacy_direct; + ft.legacy_direct = false; } printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); - /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { + if (ft.nbcon_atomic) + nbcon_atomic_flush_pending(); + + if (ft.nbcon_offload) + nbcon_kthreads_wake(); + + if (ft.legacy_direct) { /* * The caller may be holding system-critical or * timing-sensitive locks. Disable preemption during @@ -2349,7 +2408,7 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } - if (in_sched) + if (ft.legacy_offload) defer_console_output(); else wake_up_klogd(); @@ -2620,6 +2679,7 @@ int match_devname_and_update_preferred_console(const char *devname, return -ENOENT; } +EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console); bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); @@ -2677,6 +2737,7 @@ void suspend_console(void) void resume_console(void) { + struct console_flush_type ft; struct console *con; if (!console_suspend_enabled) @@ -2694,6 +2755,12 @@ void resume_console(void) */ synchronize_srcu(&console_srcu); + printk_get_console_flush_type(&ft); + if (ft.nbcon_offload) + nbcon_kthreads_wake(); + if (ft.legacy_offload) + defer_console_output(); + pr_flush(1000, true); } @@ -2708,10 +2775,16 @@ void resume_console(void) */ static int console_cpu_notify(unsigned int cpu) { + struct console_flush_type ft; + if (!cpuhp_tasks_frozen) { - /* If trylock fails, someone else is doing the printing */ - if (console_trylock()) - console_unlock(); + printk_get_console_flush_type(&ft); + if (ft.nbcon_atomic) + nbcon_atomic_flush_pending(); + if (ft.legacy_direct) { + if (console_trylock()) + console_unlock(); + } } return 0; } @@ -2765,36 +2838,6 @@ int is_console_locked(void) } EXPORT_SYMBOL(is_console_locked); -/* - * Check if the given console is currently capable and allowed to print - * records. - * - * Requires the console_srcu_read_lock. - */ -static inline bool console_is_usable(struct console *con) -{ - short flags = console_srcu_read_flags(con); - - if (!(flags & CON_ENABLED)) - return false; - - if ((flags & CON_SUSPENDED)) - return false; - - if (!con->write) - return false; - - /* - * Console drivers may assume that per-cpu resources have been - * allocated. So unless they're explicitly marked as being able to - * cope (CON_ANYTIME) don't call them until this CPU is officially up. - */ - if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) - return false; - - return true; -} - static void __console_unlock(void) { console_locked = 0; @@ -2804,30 +2847,31 @@ static void __console_unlock(void) #ifdef CONFIG_PRINTK /* - * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This - * is achieved by shifting the existing message over and inserting the dropped - * message. + * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting + * the existing message over and inserting the scratchbuf message. * - * @pmsg is the printk message to prepend. - * - * @dropped is the dropped count to report in the dropped message. + * @pmsg is the original printk message. + * @fmt is the printf format of the message which will prepend the existing one. * - * If the message text in @pmsg->pbufs->outbuf does not have enough space for - * the dropped message, the message text will be sufficiently truncated. + * If there is not enough space in @pmsg->pbufs->outbuf, the existing + * message text will be sufficiently truncated. * * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. */ -void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) +__printf(2, 3) +static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; + va_list args; size_t len; - len = scnprintf(scratchbuf, scratchbuf_sz, - "** %lu printk messages dropped **\n", dropped); + va_start(args, fmt); + len = vscnprintf(scratchbuf, scratchbuf_sz, fmt, args); + va_end(args); /* * Make sure outbuf is sufficiently large before prepending. @@ -2850,6 +2894,30 @@ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) } /* + * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". + * @pmsg->outbuf_len is updated appropriately. + * + * @pmsg is the printk message to prepend. + * + * @dropped is the dropped count to report in the dropped message. + */ +void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) +{ + console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped); +} + +/* + * Prepend the message in @pmsg->pbufs->outbuf with a "replay message". + * @pmsg->outbuf_len is updated appropriately. + * + * @pmsg is the printk message to prepend. + */ +void console_prepend_replay(struct printk_message *pmsg) +{ + console_prepend_message(pmsg, "** replaying previous printk message **\n"); +} + +/* * Read and format the specified record (or a later record if the specified * record is not available). * @@ -2915,6 +2983,34 @@ out: } /* + * Legacy console printing from printk() caller context does not respect + * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a + * false positive. For PREEMPT_RT the false positive condition does not + * occur. + * + * This map is used to temporarily establish LD_WAIT_SLEEP context for the + * console write() callback when legacy printing to avoid false positive + * lockdep complaints, thus allowing lockdep to continue to function for + * real issues. + */ +#ifdef CONFIG_PREEMPT_RT +static inline void printk_legacy_allow_spinlock_enter(void) { } +static inline void printk_legacy_allow_spinlock_exit(void) { } +#else +static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); + +static inline void printk_legacy_allow_spinlock_enter(void) +{ + lock_map_acquire_try(&printk_legacy_map); +} + +static inline void printk_legacy_allow_spinlock_exit(void) +{ + lock_map_release(&printk_legacy_map); +} +#endif /* CONFIG_PREEMPT_RT */ + +/* * Used as the printk buffers for non-panic, serialized console printing. * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. * Its usage requires the console_lock held. @@ -2963,31 +3059,46 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co con->dropped = 0; } - /* - * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. - * - * Interrupts are disabled because the hand over to a waiter - * must not be interrupted until the hand over is completed - * (@console_waiter is cleared). - */ - printk_safe_enter_irqsave(flags); - console_lock_spinning_enable(); + /* Write everything out to the hardware. */ - /* Do not trace print latency. */ - stop_critical_timings(); + if (force_legacy_kthread() && !panic_in_progress()) { + /* + * With forced threading this function is in a task context + * (either legacy kthread or get_init_console_seq()). There + * is no need for concern about printk reentrance, handovers, + * or lockdep complaints. + */ - /* Write everything out to the hardware. */ - con->write(con, outbuf, pmsg.outbuf_len); + con->write(con, outbuf, pmsg.outbuf_len); + con->seq = pmsg.seq + 1; + } else { + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + * + * Interrupts are disabled because the hand over to a waiter + * must not be interrupted until the hand over is completed + * (@console_waiter is cleared). + */ + printk_safe_enter_irqsave(flags); + console_lock_spinning_enable(); - start_critical_timings(); + /* Do not trace print latency. */ + stop_critical_timings(); - con->seq = pmsg.seq + 1; + printk_legacy_allow_spinlock_enter(); + con->write(con, outbuf, pmsg.outbuf_len); + printk_legacy_allow_spinlock_exit(); - *handover = console_lock_spinning_disable_and_check(cookie); - printk_safe_exit_irqrestore(flags); + start_critical_timings(); + + con->seq = pmsg.seq + 1; + + *handover = console_lock_spinning_disable_and_check(cookie); + printk_safe_exit_irqrestore(flags); + } skip: return true; } @@ -3000,6 +3111,8 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co return false; } +static inline void printk_kthreads_check_locked(void) { } + #endif /* CONFIG_PRINTK */ /* @@ -3027,6 +3140,7 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co */ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) { + struct console_flush_type ft; bool any_usable = false; struct console *con; bool any_progress; @@ -3038,15 +3152,34 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove do { any_progress = false; + printk_get_console_flush_type(&ft); + cookie = console_srcu_read_lock(); for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); + u64 printk_seq; bool progress; - if (!console_is_usable(con)) + /* + * console_flush_all() is only responsible for nbcon + * consoles when the nbcon consoles cannot print via + * their atomic or threaded flushing. + */ + if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) + continue; + + if (!console_is_usable(con, flags, !do_cond_resched)) continue; any_usable = true; - progress = console_emit_next_record(con, handover, cookie); + if (flags & CON_NBCON) { + progress = nbcon_legacy_emit_next_record(con, handover, cookie, + !do_cond_resched); + printk_seq = nbcon_seq_read(con); + } else { + progress = console_emit_next_record(con, handover, cookie); + printk_seq = con->seq; + } /* * If a handover has occurred, the SRCU read lock @@ -3056,8 +3189,8 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove return false; /* Track the next of the highest seq flushed. */ - if (con->seq > *next_seq) - *next_seq = con->seq; + if (printk_seq > *next_seq) + *next_seq = printk_seq; if (!progress) continue; @@ -3080,19 +3213,7 @@ abandon: return false; } -/** - * console_unlock - unblock the console subsystem from printing - * - * Releases the console_lock which the caller holds to block printing of - * the console subsystem. - * - * While the console_lock was held, console output may have been buffered - * by printk(). If this is the case, console_unlock(); emits - * the output prior to releasing the lock. - * - * console_unlock(); may be called from any context. - */ -void console_unlock(void) +static void __console_flush_and_unlock(void) { bool do_cond_resched; bool handover; @@ -3136,6 +3257,29 @@ void console_unlock(void) */ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } + +/** + * console_unlock - unblock the legacy console subsystem from printing + * + * Releases the console_lock which the caller holds to block printing of + * the legacy console subsystem. + * + * While the console_lock was held, console output may have been buffered + * by printk(). If this is the case, console_unlock() emits the output on + * legacy consoles prior to releasing the lock. + * + * console_unlock(); may be called from any context. + */ +void console_unlock(void) +{ + struct console_flush_type ft; + + printk_get_console_flush_type(&ft); + if (ft.legacy_direct) + __console_flush_and_unlock(); + else + __console_unlock(); +} EXPORT_SYMBOL(console_unlock); /** @@ -3258,6 +3402,7 @@ static void __console_rewind_all(void) */ void console_flush_on_panic(enum con_flush_mode mode) { + struct console_flush_type ft; bool handover; u64 next_seq; @@ -3281,7 +3426,13 @@ void console_flush_on_panic(enum con_flush_mode mode) if (mode == CONSOLE_REPLAY_ALL) __console_rewind_all(); - console_flush_all(false, &next_seq, &handover); + printk_get_console_flush_type(&ft); + if (ft.nbcon_atomic) + nbcon_atomic_flush_pending(); + + /* Flush legacy consoles once allowed, even when dangerous. */ + if (legacy_allow_panic_sync) + console_flush_all(false, &next_seq, &handover); } /* @@ -3338,13 +3489,236 @@ EXPORT_SYMBOL(console_stop); void console_start(struct console *console) { + struct console_flush_type ft; + bool is_nbcon; + console_list_lock(); console_srcu_write_flags(console, console->flags | CON_ENABLED); + is_nbcon = console->flags & CON_NBCON; console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. The related + * printing context must be able to see it is enabled so that + * it is guaranteed to wake up and resume printing. + */ + synchronize_srcu(&console_srcu); + + printk_get_console_flush_type(&ft); + if (is_nbcon && ft.nbcon_offload) + nbcon_kthread_wake(console); + else if (ft.legacy_offload) + defer_console_output(); + __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); +#ifdef CONFIG_PRINTK +static int unregister_console_locked(struct console *console); + +/* True when system boot is far enough to create printer threads. */ +static bool printk_kthreads_ready __ro_after_init; + +static struct task_struct *printk_legacy_kthread; + +static bool legacy_kthread_should_wakeup(void) +{ + struct console_flush_type ft; + struct console *con; + bool ret = false; + int cookie; + + if (kthread_should_stop()) + return true; + + printk_get_console_flush_type(&ft); + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); + u64 printk_seq; + + /* + * The legacy printer thread is only responsible for nbcon + * consoles when the nbcon consoles cannot print via their + * atomic or threaded flushing. + */ + if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) + continue; + + if (!console_is_usable(con, flags, false)) + continue; + + if (flags & CON_NBCON) { + printk_seq = nbcon_seq_read(con); + } else { + /* + * It is safe to read @seq because only this + * thread context updates @seq. + */ + printk_seq = con->seq; + } + + if (prb_read_valid(prb, printk_seq, NULL)) { + ret = true; + break; + } + } + console_srcu_read_unlock(cookie); + + return ret; +} + +static int legacy_kthread_func(void *unused) +{ + for (;;) { + wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); + + if (kthread_should_stop()) + break; + + console_lock(); + __console_flush_and_unlock(); + } + + return 0; +} + +static bool legacy_kthread_create(void) +{ + struct task_struct *kt; + + lockdep_assert_console_list_lock_held(); + + kt = kthread_run(legacy_kthread_func, NULL, "pr/legacy"); + if (WARN_ON(IS_ERR(kt))) { + pr_err("failed to start legacy printing thread\n"); + return false; + } + + printk_legacy_kthread = kt; + + /* + * It is important that console printing threads are scheduled + * shortly after a printk call and with generous runtime budgets. + */ + sched_set_normal(printk_legacy_kthread, -20); + + return true; +} + +/** + * printk_kthreads_shutdown - shutdown all threaded printers + * + * On system shutdown all threaded printers are stopped. This allows printk + * to transition back to atomic printing, thus providing a robust mechanism + * for the final shutdown/reboot messages to be output. + */ +static void printk_kthreads_shutdown(void) +{ + struct console *con; + + console_list_lock(); + if (printk_kthreads_running) { + printk_kthreads_running = false; + + for_each_console(con) { + if (con->flags & CON_NBCON) + nbcon_kthread_stop(con); + } + + /* + * The threads may have been stopped while printing a + * backlog. Flush any records left over. + */ + nbcon_atomic_flush_pending(); + } + console_list_unlock(); +} + +static struct syscore_ops printk_syscore_ops = { + .shutdown = printk_kthreads_shutdown, +}; + +/* + * If appropriate, start nbcon kthreads and set @printk_kthreads_running. + * If any kthreads fail to start, those consoles are unregistered. + * + * Must be called under console_list_lock(). + */ +static void printk_kthreads_check_locked(void) +{ + struct hlist_node *tmp; + struct console *con; + + lockdep_assert_console_list_lock_held(); + + if (!printk_kthreads_ready) + return; + + if (have_legacy_console || have_boot_console) { + if (!printk_legacy_kthread && + force_legacy_kthread() && + !legacy_kthread_create()) { + /* + * All legacy consoles must be unregistered. If there + * are any nbcon consoles, they will set up their own + * kthread. + */ + hlist_for_each_entry_safe(con, tmp, &console_list, node) { + if (con->flags & CON_NBCON) + continue; + + unregister_console_locked(con); + } + } + } else if (printk_legacy_kthread) { + kthread_stop(printk_legacy_kthread); + printk_legacy_kthread = NULL; + } + + /* + * Printer threads cannot be started as long as any boot console is + * registered because there is no way to synchronize the hardware + * registers between boot console code and regular console code. + * It can only be known that there will be no new boot consoles when + * an nbcon console is registered. + */ + if (have_boot_console || !have_nbcon_console) { + /* Clear flag in case all nbcon consoles unregistered. */ + printk_kthreads_running = false; + return; + } + + if (printk_kthreads_running) + return; + + hlist_for_each_entry_safe(con, tmp, &console_list, node) { + if (!(con->flags & CON_NBCON)) + continue; + + if (!nbcon_kthread_create(con)) + unregister_console_locked(con); + } + + printk_kthreads_running = true; +} + +static int __init printk_set_kthreads_ready(void) +{ + register_syscore_ops(&printk_syscore_ops); + + console_list_lock(); + printk_kthreads_ready = true; + printk_kthreads_check_locked(); + console_list_unlock(); + + return 0; +} +early_initcall(printk_set_kthreads_ready); +#endif /* CONFIG_PRINTK */ + static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) @@ -3446,19 +3820,21 @@ static void try_enable_default_console(struct console *newcon) newcon->flags |= CON_CONSDEV; } -static void console_init_seq(struct console *newcon, bool bootcon_registered) +/* Return the starting sequence number for a newly registered console. */ +static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered) { struct console *con; bool handover; + u64 init_seq; if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); - newcon->seq = syslog_seq; + init_seq = syslog_seq; mutex_unlock(&syslog_lock); } else { /* Begin with next message added to ringbuffer. */ - newcon->seq = prb_next_seq(prb); + init_seq = prb_next_seq(prb); /* * If any enabled boot consoles are due to be unregistered @@ -3479,7 +3855,7 @@ static void console_init_seq(struct console *newcon, bool bootcon_registered) * Flush all consoles and set the console to start at * the next unprinted sequence number. */ - if (!console_flush_all(true, &newcon->seq, &handover)) { + if (!console_flush_all(true, &init_seq, &handover)) { /* * Flushing failed. Just choose the lowest * sequence of the enabled boot consoles. @@ -3492,19 +3868,30 @@ static void console_init_seq(struct console *newcon, bool bootcon_registered) if (handover) console_lock(); - newcon->seq = prb_next_seq(prb); + init_seq = prb_next_seq(prb); for_each_console(con) { - if ((con->flags & CON_BOOT) && - (con->flags & CON_ENABLED) && - con->seq < newcon->seq) { - newcon->seq = con->seq; + u64 seq; + + if (!(con->flags & CON_BOOT) || + !(con->flags & CON_ENABLED)) { + continue; } + + if (con->flags & CON_NBCON) + seq = nbcon_seq_read(con); + else + seq = con->seq; + + if (seq < init_seq) + init_seq = seq; } } console_unlock(); } } + + return init_seq; } #define console_first() \ @@ -3533,9 +3920,12 @@ static int unregister_console_locked(struct console *console); */ void register_console(struct console *newcon) { - struct console *con; + bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic; bool bootcon_registered = false; bool realcon_registered = false; + struct console *con; + unsigned long flags; + u64 init_seq; int err; console_list_lock(); @@ -3613,10 +4003,31 @@ void register_console(struct console *newcon) } newcon->dropped = 0; - console_init_seq(newcon, bootcon_registered); + init_seq = get_init_console_seq(newcon, bootcon_registered); + + if (newcon->flags & CON_NBCON) { + have_nbcon_console = true; + nbcon_seq_force(newcon, init_seq); + } else { + have_legacy_console = true; + newcon->seq = init_seq; + } + + if (newcon->flags & CON_BOOT) + have_boot_console = true; - if (newcon->flags & CON_NBCON) - nbcon_init(newcon); + /* + * If another context is actively using the hardware of this new + * console, it will not be aware of the nbcon synchronization. This + * is a risk that two contexts could access the hardware + * simultaneously if this new console is used for atomic printing + * and the other context is still using the hardware. + * + * Use the driver synchronization to ensure that the hardware is not + * in use while this new console transitions to being registered. + */ + if (use_device_lock) + newcon->device_lock(newcon, &flags); /* * Put this console in the list - keep the @@ -3642,6 +4053,10 @@ void register_console(struct console *newcon) * register_console() completes. */ + /* This new console is now registered. */ + if (use_device_lock) + newcon->device_unlock(newcon, flags); + console_sysfs_notify(); /* @@ -3662,6 +4077,9 @@ void register_console(struct console *newcon) unregister_console_locked(con); } } + + /* Changed console list, may require printer threads to start/stop. */ + printk_kthreads_check_locked(); unlock: console_list_unlock(); } @@ -3670,6 +4088,12 @@ EXPORT_SYMBOL(register_console); /* Must be called under console_list_lock(). */ static int unregister_console_locked(struct console *console) { + bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic; + bool found_legacy_con = false; + bool found_nbcon_con = false; + bool found_boot_con = false; + unsigned long flags; + struct console *c; int res; lockdep_assert_console_list_lock_held(); @@ -3682,14 +4106,29 @@ static int unregister_console_locked(struct console *console) if (res > 0) return 0; + if (!console_is_registered_locked(console)) + res = -ENODEV; + else if (console_is_usable(console, console->flags, true)) + __pr_flush(console, 1000, true); + /* Disable it unconditionally */ console_srcu_write_flags(console, console->flags & ~CON_ENABLED); - if (!console_is_registered_locked(console)) - return -ENODEV; + if (res < 0) + return res; + + /* + * Use the driver synchronization to ensure that the hardware is not + * in use while this console transitions to being unregistered. + */ + if (use_device_lock) + console->device_lock(console, &flags); hlist_del_init_rcu(&console->node); + if (use_device_lock) + console->device_unlock(console, flags); + /* * <HISTORICAL> * If this isn't the last console and it has CON_CONSDEV set, we @@ -3717,6 +4156,29 @@ static int unregister_console_locked(struct console *console) if (console->exit) res = console->exit(console); + /* + * With this console gone, the global flags tracking registered + * console types may have changed. Update them. + */ + for_each_console(c) { + if (c->flags & CON_BOOT) + found_boot_con = true; + + if (c->flags & CON_NBCON) + found_nbcon_con = true; + else + found_legacy_con = true; + } + if (!found_boot_con) + have_boot_console = found_boot_con; + if (!found_legacy_con) + have_legacy_console = found_legacy_con; + if (!found_nbcon_con) + have_nbcon_console = found_nbcon_con; + + /* Changed console list, may require printer threads to start/stop. */ + printk_kthreads_check_locked(); + return res; } @@ -3863,6 +4325,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre { unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms); unsigned long remaining_jiffies = timeout_jiffies; + struct console_flush_type ft; struct console *c; u64 last_diff = 0; u64 printk_seq; @@ -3871,13 +4334,22 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre u64 diff; u64 seq; + /* Sorry, pr_flush() will not work this early. */ + if (system_state < SYSTEM_SCHEDULING) + return false; + might_sleep(); seq = prb_next_reserve_seq(prb); /* Flush the consoles so that records up to @seq are printed. */ - console_lock(); - console_unlock(); + printk_get_console_flush_type(&ft); + if (ft.nbcon_atomic) + nbcon_atomic_flush_pending(); + if (ft.legacy_direct) { + console_lock(); + console_unlock(); + } for (;;) { unsigned long begin_jiffies; @@ -3890,6 +4362,12 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * console->seq. Releasing console_lock flushes more * records in case @seq is still not printed on all * usable consoles. + * + * Holding the console_lock is not necessary if there + * are no legacy or boot consoles. However, such a + * console could register at any time. Always hold the + * console_lock as a precaution rather than + * synchronizing against register_console(). */ console_lock(); @@ -3905,8 +4383,10 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * that they make forward progress, so only increment * @diff for usable consoles. */ - if (!console_is_usable(c)) + if (!console_is_usable(c, flags, true) && + !console_is_usable(c, flags, false)) { continue; + } if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(c); @@ -3974,9 +4454,13 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { - /* If trylock fails, someone else is doing the printing */ - if (console_trylock()) - console_unlock(); + if (force_legacy_kthread()) { + if (printk_legacy_kthread) + wake_up_interruptible(&legacy_wait); + } else { + if (console_trylock()) + console_unlock(); + } } if (pending & PRINTK_PENDING_WAKEUP) @@ -4382,8 +4866,17 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); */ void console_try_replay_all(void) { + struct console_flush_type ft; + + printk_get_console_flush_type(&ft); if (console_trylock()) { __console_rewind_all(); + if (ft.nbcon_atomic) + nbcon_atomic_flush_pending(); + if (ft.nbcon_offload) + nbcon_kthreads_wake(); + if (ft.legacy_offload) + defer_console_output(); /* Consoles are flushed as part of console_unlock(). */ console_unlock(); } diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 52626d0f1fa3..4ef81349d9fb 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -4,7 +4,10 @@ #define _KERNEL_PRINTK_RINGBUFFER_H #include <linux/atomic.h> +#include <linux/bits.h> #include <linux/dev_printk.h> +#include <linux/stddef.h> +#include <linux/types.h> /* * Meta information about each stored message. @@ -120,7 +123,7 @@ enum desc_state { #define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) #define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) -#define DESC_SV_BITS (sizeof(unsigned long) * 8) +#define DESC_SV_BITS BITS_PER_LONG #define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2) #define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT) #define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT)) @@ -401,10 +404,12 @@ u64 prb_next_reserve_seq(struct printk_ringbuffer *rb); #define __u64seq_to_ulseq(u64seq) (u64seq) #define __ulseq_to_u64seq(rb, ulseq) (ulseq) +#define ULSEQ_MAX(rb) (-1) #else /* CONFIG_64BIT */ #define __u64seq_to_ulseq(u64seq) ((u32)u64seq) +#define ULSEQ_MAX(rb) __u64seq_to_ulseq(prb_first_seq(rb) + 0x80000000UL) static inline u64 __ulseq_to_u64seq(struct printk_ringbuffer *rb, u32 ulseq) { diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 6d10927a07d8..2b35a9d3919d 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -26,6 +26,29 @@ void __printk_safe_exit(void) this_cpu_dec(printk_context); } +void __printk_deferred_enter(void) +{ + cant_migrate(); + __printk_safe_enter(); +} + +void __printk_deferred_exit(void) +{ + cant_migrate(); + __printk_safe_exit(); +} + +bool is_printk_legacy_deferred(void) +{ + /* + * The per-CPU variable @printk_context can be read safely in any + * context. CPU migration is always disabled when set. + */ + return (force_legacy_kthread() || + this_cpu_read(printk_context) || + in_nmi()); +} + asmlinkage int vprintk(const char *fmt, va_list args) { #ifdef CONFIG_KGDB_KDB @@ -38,7 +61,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * Use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. */ - if (this_cpu_read(printk_context) || in_nmi()) + if (is_printk_legacy_deferred()) return vprintk_deferred(fmt, args); /* No obstacles. */ diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 38238e595a61..feb3ac1dc5d5 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -54,9 +54,6 @@ * grace-period sequence number. */ -#define RCU_SEQ_CTR_SHIFT 2 -#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) - /* Low-order bit definition for polled grace-period APIs. */ #define RCU_GET_STATE_COMPLETED 0x1 @@ -255,6 +252,11 @@ static inline void debug_rcu_head_callback(struct rcu_head *rhp) kmem_dump_obj(rhp); } +static inline bool rcu_barrier_cb_is_done(struct rcu_head *rhp) +{ + return rhp->next == rhp; +} + extern int rcu_cpu_stall_suppress_at_boot; static inline bool rcu_stall_is_suppressed_at_boot(void) @@ -606,7 +608,7 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, #endif #ifdef CONFIG_TINY_RCU -static inline bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { return false; } +static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long @@ -619,7 +621,7 @@ static inline void rcu_fwd_progress_check(unsigned long j) { } static inline void rcu_gp_slow_register(atomic_t *rgssp) { } static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } #else /* #ifdef CONFIG_TINY_RCU */ -bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); +bool rcu_watching_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 1693ea22ef1b..298a2c573f02 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -261,17 +261,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) } /* - * Mark the specified rcu_segcblist structure as offloaded (or not) - */ -void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) -{ - if (offload) - rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED); - else - rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED); -} - -/* * Does the specified rcu_segcblist structure contain callbacks that * are ready to be invoked? */ diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 4fe877f5f654..259904075636 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -89,16 +89,7 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING)) - return true; - - return false; -} - -static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp) -{ - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE)) + rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED)) return true; return false; diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index b53a9e8f5904..6d37596deb1f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -39,6 +39,7 @@ #include <linux/torture.h> #include <linux/vmalloc.h> #include <linux/rcupdate_trace.h> +#include <linux/sched/debug.h> #include "rcu.h" @@ -104,6 +105,20 @@ static char *scale_type = "rcu"; module_param(scale_type, charp, 0444); MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)"); +// Structure definitions for custom fixed-per-task allocator. +struct writer_mblock { + struct rcu_head wmb_rh; + struct llist_node wmb_node; + struct writer_freelist *wmb_wfl; +}; + +struct writer_freelist { + struct llist_head ws_lhg; + atomic_t ws_inflight; + struct llist_head ____cacheline_internodealigned_in_smp ws_lhp; + struct writer_mblock *ws_mblocks; +}; + static int nrealreaders; static int nrealwriters; static struct task_struct **writer_tasks; @@ -111,6 +126,8 @@ static struct task_struct **reader_tasks; static struct task_struct *shutdown_task; static u64 **writer_durations; +static bool *writer_done; +static struct writer_freelist *writer_freelists; static int *writer_n_durations; static atomic_t n_rcu_scale_reader_started; static atomic_t n_rcu_scale_writer_started; @@ -120,7 +137,6 @@ static u64 t_rcu_scale_writer_started; static u64 t_rcu_scale_writer_finished; static unsigned long b_rcu_gp_test_started; static unsigned long b_rcu_gp_test_finished; -static DEFINE_PER_CPU(atomic_t, n_async_inflight); #define MAX_MEAS 10000 #define MIN_MEAS 100 @@ -143,6 +159,7 @@ struct rcu_scale_ops { void (*sync)(void); void (*exp_sync)(void); struct task_struct *(*rso_gp_kthread)(void); + void (*stats)(void); const char *name; }; @@ -224,6 +241,11 @@ static void srcu_scale_synchronize(void) synchronize_srcu(srcu_ctlp); } +static void srcu_scale_stats(void) +{ + srcu_torture_stats_print(srcu_ctlp, scale_type, SCALE_FLAG); +} + static void srcu_scale_synchronize_expedited(void) { synchronize_srcu_expedited(srcu_ctlp); @@ -241,6 +263,7 @@ static struct rcu_scale_ops srcu_ops = { .gp_barrier = srcu_rcu_barrier, .sync = srcu_scale_synchronize, .exp_sync = srcu_scale_synchronize_expedited, + .stats = srcu_scale_stats, .name = "srcu" }; @@ -270,6 +293,7 @@ static struct rcu_scale_ops srcud_ops = { .gp_barrier = srcu_rcu_barrier, .sync = srcu_scale_synchronize, .exp_sync = srcu_scale_synchronize_expedited, + .stats = srcu_scale_stats, .name = "srcud" }; @@ -288,6 +312,11 @@ static void tasks_scale_read_unlock(int idx) { } +static void rcu_tasks_scale_stats(void) +{ + rcu_tasks_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -300,6 +329,7 @@ static struct rcu_scale_ops tasks_ops = { .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, .rso_gp_kthread = get_rcu_tasks_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_scale_stats, .name = "tasks" }; @@ -326,6 +356,11 @@ static void tasks_rude_scale_read_unlock(int idx) { } +static void rcu_tasks_rude_scale_stats(void) +{ + rcu_tasks_rude_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_rude_ops = { .ptype = RCU_TASKS_RUDE_FLAVOR, .init = rcu_sync_scale_init, @@ -333,11 +368,10 @@ static struct rcu_scale_ops tasks_rude_ops = { .readunlock = tasks_rude_scale_read_unlock, .get_gp_seq = rcu_no_completed, .gp_diff = rcu_seq_diff, - .async = call_rcu_tasks_rude, - .gp_barrier = rcu_barrier_tasks_rude, .sync = synchronize_rcu_tasks_rude, .exp_sync = synchronize_rcu_tasks_rude, .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_rude_scale_stats, .name = "tasks-rude" }; @@ -366,6 +400,11 @@ static void tasks_trace_scale_read_unlock(int idx) rcu_read_unlock_trace(); } +static void rcu_tasks_trace_scale_stats(void) +{ + rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_tracing_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -378,6 +417,7 @@ static struct rcu_scale_ops tasks_tracing_ops = { .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats, .name = "tasks-tracing" }; @@ -438,12 +478,52 @@ rcu_scale_reader(void *arg) } /* + * Allocate a writer_mblock structure for the specified rcu_scale_writer + * task. + */ +static struct writer_mblock *rcu_scale_alloc(long me) +{ + struct llist_node *llnp; + struct writer_freelist *wflp; + struct writer_mblock *wmbp; + + if (WARN_ON_ONCE(!writer_freelists)) + return NULL; + wflp = &writer_freelists[me]; + if (llist_empty(&wflp->ws_lhp)) { + // ->ws_lhp is private to its rcu_scale_writer task. + wmbp = container_of(llist_del_all(&wflp->ws_lhg), struct writer_mblock, wmb_node); + wflp->ws_lhp.first = &wmbp->wmb_node; + } + llnp = llist_del_first(&wflp->ws_lhp); + if (!llnp) + return NULL; + return container_of(llnp, struct writer_mblock, wmb_node); +} + +/* + * Free a writer_mblock structure to its rcu_scale_writer task. + */ +static void rcu_scale_free(struct writer_mblock *wmbp) +{ + struct writer_freelist *wflp; + + if (!wmbp) + return; + wflp = wmbp->wmb_wfl; + llist_add(&wmbp->wmb_node, &wflp->ws_lhg); +} + +/* * Callback function for asynchronous grace periods from rcu_scale_writer(). */ static void rcu_scale_async_cb(struct rcu_head *rhp) { - atomic_dec(this_cpu_ptr(&n_async_inflight)); - kfree(rhp); + struct writer_mblock *wmbp = container_of(rhp, struct writer_mblock, wmb_rh); + struct writer_freelist *wflp = wmbp->wmb_wfl; + + atomic_dec(&wflp->ws_inflight); + rcu_scale_free(wmbp); } /* @@ -456,12 +536,14 @@ rcu_scale_writer(void *arg) int i_max; unsigned long jdone; long me = (long)arg; - struct rcu_head *rhp = NULL; + bool selfreport = false; bool started = false, done = false, alldone = false; u64 t; DEFINE_TORTURE_RANDOM(tr); u64 *wdp; u64 *wdpp = writer_durations[me]; + struct writer_freelist *wflp = &writer_freelists[me]; + struct writer_mblock *wmbp = NULL; VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); WARN_ON(!wdpp); @@ -493,30 +575,34 @@ rcu_scale_writer(void *arg) jdone = jiffies + minruntime * HZ; do { + bool gp_succeeded = false; + if (writer_holdoff) udelay(writer_holdoff); if (writer_holdoff_jiffies) schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1); wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); - if (gp_async) { -retry: - if (!rhp) - rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); - if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { - atomic_inc(this_cpu_ptr(&n_async_inflight)); - cur_ops->async(rhp, rcu_scale_async_cb); - rhp = NULL; + if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) { + if (!wmbp) + wmbp = rcu_scale_alloc(me); + if (wmbp && atomic_read(&wflp->ws_inflight) < gp_async_max) { + atomic_inc(&wflp->ws_inflight); + cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb); + wmbp = NULL; + gp_succeeded = true; } else if (!kthread_should_stop()) { cur_ops->gp_barrier(); - goto retry; } else { - kfree(rhp); /* Because we are stopping. */ + rcu_scale_free(wmbp); /* Because we are stopping. */ + wmbp = NULL; } } else if (gp_exp) { cur_ops->exp_sync(); + gp_succeeded = true; } else { cur_ops->sync(); + gp_succeeded = true; } t = ktime_get_mono_fast_ns(); *wdp = t - *wdp; @@ -526,6 +612,7 @@ retry: started = true; if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) { done = true; + WRITE_ONCE(writer_done[me], true); sched_set_normal(current, 0); pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", scale_type, SCALE_FLAG, me, MIN_MEAS); @@ -551,11 +638,32 @@ retry: if (done && !alldone && atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters) alldone = true; - if (started && !alldone && i < MAX_MEAS - 1) + if (done && !alldone && time_after(jiffies, jdone + HZ * 60)) { + static atomic_t dumped; + int i; + + if (!atomic_xchg(&dumped, 1)) { + for (i = 0; i < nrealwriters; i++) { + if (writer_done[i]) + continue; + pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i); + sched_show_task(writer_tasks[i]); + } + if (cur_ops->stats) + cur_ops->stats(); + } + } + if (!selfreport && time_after(jiffies, jdone + HZ * (70 + me))) { + pr_info("%s: Writer %ld self-report: started %d done %d/%d->%d i %d jdone %lu.\n", + __func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone); + selfreport = true; + } + if (gp_succeeded && started && !alldone && i < MAX_MEAS - 1) i++; rcu_scale_wait_shutdown(); } while (!torture_must_stop()); - if (gp_async) { + if (gp_async && cur_ops->async) { + rcu_scale_free(wmbp); cur_ops->gp_barrier(); } writer_n_durations[me] = i_max + 1; @@ -713,6 +821,7 @@ kfree_scale_cleanup(void) torture_stop_kthread(kfree_scale_thread, kfree_reader_tasks[i]); kfree(kfree_reader_tasks); + kfree_reader_tasks = NULL; } torture_cleanup_end(); @@ -881,6 +990,7 @@ rcu_scale_cleanup(void) torture_stop_kthread(rcu_scale_reader, reader_tasks[i]); kfree(reader_tasks); + reader_tasks = NULL; } if (writer_tasks) { @@ -919,10 +1029,33 @@ rcu_scale_cleanup(void) schedule_timeout_uninterruptible(1); } kfree(writer_durations[i]); + if (writer_freelists) { + int ctr = 0; + struct llist_node *llnp; + struct writer_freelist *wflp = &writer_freelists[i]; + + if (wflp->ws_mblocks) { + llist_for_each(llnp, wflp->ws_lhg.first) + ctr++; + llist_for_each(llnp, wflp->ws_lhp.first) + ctr++; + WARN_ONCE(ctr != gp_async_max, + "%s: ctr = %d gp_async_max = %d\n", + __func__, ctr, gp_async_max); + kfree(wflp->ws_mblocks); + } + } } kfree(writer_tasks); + writer_tasks = NULL; kfree(writer_durations); + writer_durations = NULL; kfree(writer_n_durations); + writer_n_durations = NULL; + kfree(writer_done); + writer_done = NULL; + kfree(writer_freelists); + writer_freelists = NULL; } /* Do torture-type-specific cleanup operations. */ @@ -949,8 +1082,9 @@ rcu_scale_shutdown(void *arg) static int __init rcu_scale_init(void) { - long i; int firsterr = 0; + long i; + long j; static struct rcu_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS }; @@ -1017,14 +1151,22 @@ rcu_scale_init(void) } while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) schedule_timeout_uninterruptible(1); - writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), - GFP_KERNEL); - writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), - GFP_KERNEL); - writer_n_durations = - kcalloc(nrealwriters, sizeof(*writer_n_durations), - GFP_KERNEL); - if (!writer_tasks || !writer_durations || !writer_n_durations) { + writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]), GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL); + writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); + writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL); + if (gp_async) { + if (gp_async_max <= 0) { + pr_warn("%s: gp_async_max = %d must be greater than zero.\n", + __func__, gp_async_max); + WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)); + firsterr = -EINVAL; + goto unwind; + } + writer_freelists = kcalloc(nrealwriters, sizeof(writer_freelists[0]), GFP_KERNEL); + } + if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done || + (gp_async && !writer_freelists)) { SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; @@ -1037,6 +1179,24 @@ rcu_scale_init(void) firsterr = -ENOMEM; goto unwind; } + if (writer_freelists) { + struct writer_freelist *wflp = &writer_freelists[i]; + + init_llist_head(&wflp->ws_lhg); + init_llist_head(&wflp->ws_lhp); + wflp->ws_mblocks = kcalloc(gp_async_max, sizeof(wflp->ws_mblocks[0]), + GFP_KERNEL); + if (!wflp->ws_mblocks) { + firsterr = -ENOMEM; + goto unwind; + } + for (j = 0; j < gp_async_max; j++) { + struct writer_mblock *wmbp = &wflp->ws_mblocks[j]; + + wmbp->wmb_wfl = wflp; + llist_add(&wmbp->wmb_node, &wflp->ws_lhp); + } + } firsterr = torture_create_kthread(rcu_scale_writer, (void *)i, writer_tasks[i]); if (torture_init_error(firsterr)) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 08bf7c669dd3..bb75dbf5c800 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -115,6 +115,7 @@ torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); +torture_param(int, stall_cpu_repeat, 0, "Number of additional stalls after the first one."); torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s)."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -366,8 +367,6 @@ struct rcu_torture_ops { bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2); unsigned long (*get_gp_state)(void); void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp); - unsigned long (*get_gp_completed)(void); - void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp); unsigned long (*start_gp_poll)(void); void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp); bool (*poll_gp_state)(unsigned long oldstate); @@ -375,6 +374,8 @@ struct rcu_torture_ops { bool (*poll_need_2gp)(bool poll, bool poll_full); void (*cond_sync)(unsigned long oldstate); void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp); + int poll_active; + int poll_active_full; call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); @@ -553,8 +554,6 @@ static struct rcu_torture_ops rcu_ops = { .get_comp_state_full = get_completed_synchronize_rcu_full, .get_gp_state = get_state_synchronize_rcu, .get_gp_state_full = get_state_synchronize_rcu_full, - .get_gp_completed = get_completed_synchronize_rcu, - .get_gp_completed_full = get_completed_synchronize_rcu_full, .start_gp_poll = start_poll_synchronize_rcu, .start_gp_poll_full = start_poll_synchronize_rcu_full, .poll_gp_state = poll_state_synchronize_rcu, @@ -562,6 +561,8 @@ static struct rcu_torture_ops rcu_ops = { .poll_need_2gp = rcu_poll_need_2gp, .cond_sync = cond_synchronize_rcu, .cond_sync_full = cond_synchronize_rcu_full, + .poll_active = NUM_ACTIVE_RCU_POLL_OLDSTATE, + .poll_active_full = NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE, .get_gp_state_exp = get_state_synchronize_rcu, .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full, @@ -740,9 +741,12 @@ static struct rcu_torture_ops srcu_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .same_gp_state = same_state_synchronize_srcu, + .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, + .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -780,9 +784,12 @@ static struct rcu_torture_ops srcud_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .same_gp_state = same_state_synchronize_srcu, + .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, + .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -915,11 +922,6 @@ static struct rcu_torture_ops tasks_ops = { * Definitions for rude RCU-tasks torture testing. */ -static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb); -} - static struct rcu_torture_ops tasks_rude_ops = { .ttype = RCU_TASKS_RUDE_FLAVOR, .init = rcu_sync_torture_init, @@ -927,11 +929,8 @@ static struct rcu_torture_ops tasks_rude_ops = { .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock_trivial, .get_gp_seq = rcu_no_completed, - .deferred_free = rcu_tasks_rude_torture_deferred_free, .sync = synchronize_rcu_tasks_rude, .exp_sync = synchronize_rcu_tasks_rude, - .call = call_rcu_tasks_rude, - .cb_barrier = rcu_barrier_tasks_rude, .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, .get_gp_data = rcu_tasks_rude_get_gp_data, .cbflood_max = 50000, @@ -1318,6 +1317,7 @@ static void rcu_torture_write_types(void) } else if (gp_sync && !cur_ops->sync) { pr_alert("%s: gp_sync without primitives.\n", __func__); } + pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes); } /* @@ -1374,17 +1374,20 @@ rcu_torture_writer(void *arg) int i; int idx; int oldnice = task_nice(current); - struct rcu_gp_oldstate rgo[NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE]; + struct rcu_gp_oldstate *rgo = NULL; + int rgo_size = 0; struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); unsigned long stallsdone = jiffies; bool stutter_waited; - unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + unsigned long *ulo = NULL; + int ulo_size = 0; // If a new stall test is added, this must be adjusted. if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu) - stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ; + stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * + HZ * (stall_cpu_repeat + 1); VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) pr_alert("%s" TORTURE_FLAG @@ -1401,6 +1404,16 @@ rcu_torture_writer(void *arg) torture_kthread_stopping("rcu_torture_writer"); return 0; } + if (cur_ops->poll_active > 0) { + ulo = kzalloc(cur_ops->poll_active * sizeof(ulo[0]), GFP_KERNEL); + if (!WARN_ON(!ulo)) + ulo_size = cur_ops->poll_active; + } + if (cur_ops->poll_active_full > 0) { + rgo = kzalloc(cur_ops->poll_active_full * sizeof(rgo[0]), GFP_KERNEL); + if (!WARN_ON(!rgo)) + rgo_size = cur_ops->poll_active_full; + } do { rcu_torture_writer_state = RTWS_FIXED_DELAY; @@ -1437,8 +1450,8 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state_getname(), rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); - if (cur_ops->get_gp_completed) { - cookie = cur_ops->get_gp_completed(); + if (cur_ops->get_comp_state) { + cookie = cur_ops->get_comp_state(); WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); } cur_ops->readunlock(idx); @@ -1452,8 +1465,8 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state_getname(), rcu_torture_writer_state, cpumask_pr_args(cpu_online_mask)); - if (cur_ops->get_gp_completed_full) { - cur_ops->get_gp_completed_full(&cookie_full); + if (cur_ops->get_comp_state_full) { + cur_ops->get_comp_state_full(&cookie_full); WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full)); } cur_ops->readunlock(idx); @@ -1502,19 +1515,19 @@ rcu_torture_writer(void *arg) break; case RTWS_POLL_GET: rcu_torture_writer_state = RTWS_POLL_GET; - for (i = 0; i < ARRAY_SIZE(ulo); i++) + for (i = 0; i < ulo_size; i++) ulo[i] = cur_ops->get_comp_state(); gp_snap = cur_ops->start_gp_poll(); rcu_torture_writer_state = RTWS_POLL_WAIT; while (!cur_ops->poll_gp_state(gp_snap)) { gp_snap1 = cur_ops->get_gp_state(); - for (i = 0; i < ARRAY_SIZE(ulo); i++) + for (i = 0; i < ulo_size; i++) if (cur_ops->poll_gp_state(ulo[i]) || cur_ops->same_gp_state(ulo[i], gp_snap1)) { ulo[i] = gp_snap1; break; } - WARN_ON_ONCE(i >= ARRAY_SIZE(ulo)); + WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size); torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); } @@ -1522,20 +1535,20 @@ rcu_torture_writer(void *arg) break; case RTWS_POLL_GET_FULL: rcu_torture_writer_state = RTWS_POLL_GET_FULL; - for (i = 0; i < ARRAY_SIZE(rgo); i++) + for (i = 0; i < rgo_size; i++) cur_ops->get_comp_state_full(&rgo[i]); cur_ops->start_gp_poll_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { cur_ops->get_gp_state_full(&gp_snap1_full); - for (i = 0; i < ARRAY_SIZE(rgo); i++) + for (i = 0; i < rgo_size; i++) if (cur_ops->poll_gp_state_full(&rgo[i]) || cur_ops->same_gp_state_full(&rgo[i], &gp_snap1_full)) { rgo[i] = gp_snap1_full; break; } - WARN_ON_ONCE(i >= ARRAY_SIZE(rgo)); + WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size); torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); } @@ -1617,6 +1630,8 @@ rcu_torture_writer(void *arg) pr_alert("%s" TORTURE_FLAG " Dynamic grace-period expediting was disabled.\n", torture_type); + kfree(ulo); + kfree(rgo); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; @@ -2370,7 +2385,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " - "stall_cpu_block=%d " + "stall_cpu_block=%d stall_cpu_repeat=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " "read_exit_delay=%d read_exit_burst=%d " @@ -2382,7 +2397,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, - stall_cpu_block, + stall_cpu_block, stall_cpu_repeat, n_barrier_cbs, onoff_interval, onoff_holdoff, read_exit_delay, read_exit_burst, @@ -2460,19 +2475,11 @@ static struct notifier_block rcu_torture_stall_block = { * induces a CPU stall for the time specified by stall_cpu. If a new * stall test is added, stallsdone in rcu_torture_writer() must be adjusted. */ -static int rcu_torture_stall(void *args) +static void rcu_torture_stall_one(int rep, int irqsoff) { int idx; - int ret; unsigned long stop_at; - VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); - if (rcu_cpu_stall_notifiers) { - ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); - if (ret) - pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", - __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); - } if (stall_cpu_holdoff > 0) { VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); @@ -2492,12 +2499,12 @@ static int rcu_torture_stall(void *args) stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ idx = cur_ops->readlock(); - if (stall_cpu_irqsoff) + if (irqsoff) local_irq_disable(); else if (!stall_cpu_block) preempt_disable(); - pr_alert("%s start on CPU %d.\n", - __func__, raw_smp_processor_id()); + pr_alert("%s start stall episode %d on CPU %d.\n", + __func__, rep + 1, raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) && !kthread_should_stop()) if (stall_cpu_block) { @@ -2509,12 +2516,42 @@ static int rcu_torture_stall(void *args) } else if (stall_no_softlockup) { touch_softlockup_watchdog(); } - if (stall_cpu_irqsoff) + if (irqsoff) local_irq_enable(); else if (!stall_cpu_block) preempt_enable(); cur_ops->readunlock(idx); } +} + +/* + * CPU-stall kthread. Invokes rcu_torture_stall_one() once, and then as many + * additional times as specified by the stall_cpu_repeat module parameter. + * Note that stall_cpu_irqsoff is ignored on the second and subsequent + * stall. + */ +static int rcu_torture_stall(void *args) +{ + int i; + int repeat = stall_cpu_repeat; + int ret; + + VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); + if (repeat < 0) { + repeat = 0; + WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)); + } + if (rcu_cpu_stall_notifiers) { + ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", + __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); + } + for (i = 0; i <= repeat; i++) { + if (kthread_should_stop()) + break; + rcu_torture_stall_one(i, i == 0 ? stall_cpu_irqsoff : 0); + } pr_alert("%s end.\n", __func__); if (rcu_cpu_stall_notifiers && !ret) { ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block); @@ -2680,7 +2717,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp) rcu_torture_fwd_prog_cond_resched(freed); if (tick_nohz_full_enabled()) { local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); } } @@ -2830,7 +2867,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); if (tick_nohz_full_enabled()) { local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); } } diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index f4ea5b1ec068..0db9db73f57f 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -28,6 +28,7 @@ #include <linux/rcupdate_trace.h> #include <linux/reboot.h> #include <linux/sched.h> +#include <linux/seq_buf.h> #include <linux/spinlock.h> #include <linux/smp.h> #include <linux/stat.h> @@ -134,7 +135,7 @@ struct ref_scale_ops { const char *name; }; -static struct ref_scale_ops *cur_ops; +static const struct ref_scale_ops *cur_ops; static void un_delay(const int udl, const int ndl) { @@ -170,7 +171,7 @@ static bool rcu_sync_scale_init(void) return true; } -static struct ref_scale_ops rcu_ops = { +static const struct ref_scale_ops rcu_ops = { .init = rcu_sync_scale_init, .readsection = ref_rcu_read_section, .delaysection = ref_rcu_delay_section, @@ -204,7 +205,7 @@ static void srcu_ref_scale_delay_section(const int nloops, const int udl, const } } -static struct ref_scale_ops srcu_ops = { +static const struct ref_scale_ops srcu_ops = { .init = rcu_sync_scale_init, .readsection = srcu_ref_scale_read_section, .delaysection = srcu_ref_scale_delay_section, @@ -231,7 +232,7 @@ static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, c un_delay(udl, ndl); } -static struct ref_scale_ops rcu_tasks_ops = { +static const struct ref_scale_ops rcu_tasks_ops = { .init = rcu_sync_scale_init, .readsection = rcu_tasks_ref_scale_read_section, .delaysection = rcu_tasks_ref_scale_delay_section, @@ -270,7 +271,7 @@ static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, c } } -static struct ref_scale_ops rcu_trace_ops = { +static const struct ref_scale_ops rcu_trace_ops = { .init = rcu_sync_scale_init, .readsection = rcu_trace_ref_scale_read_section, .delaysection = rcu_trace_ref_scale_delay_section, @@ -309,7 +310,7 @@ static void ref_refcnt_delay_section(const int nloops, const int udl, const int } } -static struct ref_scale_ops refcnt_ops = { +static const struct ref_scale_ops refcnt_ops = { .init = rcu_sync_scale_init, .readsection = ref_refcnt_section, .delaysection = ref_refcnt_delay_section, @@ -346,7 +347,7 @@ static void ref_rwlock_delay_section(const int nloops, const int udl, const int } } -static struct ref_scale_ops rwlock_ops = { +static const struct ref_scale_ops rwlock_ops = { .init = ref_rwlock_init, .readsection = ref_rwlock_section, .delaysection = ref_rwlock_delay_section, @@ -383,7 +384,7 @@ static void ref_rwsem_delay_section(const int nloops, const int udl, const int n } } -static struct ref_scale_ops rwsem_ops = { +static const struct ref_scale_ops rwsem_ops = { .init = ref_rwsem_init, .readsection = ref_rwsem_section, .delaysection = ref_rwsem_delay_section, @@ -418,7 +419,7 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd preempt_enable(); } -static struct ref_scale_ops lock_ops = { +static const struct ref_scale_ops lock_ops = { .readsection = ref_lock_section, .delaysection = ref_lock_delay_section, .name = "lock" @@ -453,7 +454,7 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in preempt_enable(); } -static struct ref_scale_ops lock_irq_ops = { +static const struct ref_scale_ops lock_irq_ops = { .readsection = ref_lock_irq_section, .delaysection = ref_lock_irq_delay_section, .name = "lock-irq" @@ -489,7 +490,7 @@ static void ref_acqrel_delay_section(const int nloops, const int udl, const int preempt_enable(); } -static struct ref_scale_ops acqrel_ops = { +static const struct ref_scale_ops acqrel_ops = { .readsection = ref_acqrel_section, .delaysection = ref_acqrel_delay_section, .name = "acqrel" @@ -523,7 +524,7 @@ static void ref_clock_delay_section(const int nloops, const int udl, const int n stopopts = x; } -static struct ref_scale_ops clock_ops = { +static const struct ref_scale_ops clock_ops = { .readsection = ref_clock_section, .delaysection = ref_clock_delay_section, .name = "clock" @@ -555,7 +556,7 @@ static void ref_jiffies_delay_section(const int nloops, const int udl, const int stopopts = x; } -static struct ref_scale_ops jiffies_ops = { +static const struct ref_scale_ops jiffies_ops = { .readsection = ref_jiffies_section, .delaysection = ref_jiffies_delay_section, .name = "jiffies" @@ -705,9 +706,9 @@ static void refscale_typesafe_ctor(void *rtsp_in) preempt_enable(); } -static struct ref_scale_ops typesafe_ref_ops; -static struct ref_scale_ops typesafe_lock_ops; -static struct ref_scale_ops typesafe_seqlock_ops; +static const struct ref_scale_ops typesafe_ref_ops; +static const struct ref_scale_ops typesafe_lock_ops; +static const struct ref_scale_ops typesafe_seqlock_ops; // Initialize for a typesafe test. static bool typesafe_init(void) @@ -768,7 +769,7 @@ static void typesafe_cleanup(void) } // The typesafe_init() function distinguishes these structures by address. -static struct ref_scale_ops typesafe_ref_ops = { +static const struct ref_scale_ops typesafe_ref_ops = { .init = typesafe_init, .cleanup = typesafe_cleanup, .readsection = typesafe_read_section, @@ -776,7 +777,7 @@ static struct ref_scale_ops typesafe_ref_ops = { .name = "typesafe_ref" }; -static struct ref_scale_ops typesafe_lock_ops = { +static const struct ref_scale_ops typesafe_lock_ops = { .init = typesafe_init, .cleanup = typesafe_cleanup, .readsection = typesafe_read_section, @@ -784,7 +785,7 @@ static struct ref_scale_ops typesafe_lock_ops = { .name = "typesafe_lock" }; -static struct ref_scale_ops typesafe_seqlock_ops = { +static const struct ref_scale_ops typesafe_seqlock_ops = { .init = typesafe_init, .cleanup = typesafe_cleanup, .readsection = typesafe_read_section, @@ -891,32 +892,34 @@ static u64 process_durations(int n) { int i; struct reader_task *rt; - char buf1[64]; + struct seq_buf s; char *buf; u64 sum = 0; buf = kmalloc(800 + 64, GFP_KERNEL); if (!buf) return 0; - buf[0] = 0; - sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)", - exp_idx); + seq_buf_init(&s, buf, 800 + 64); + + seq_buf_printf(&s, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)", + exp_idx); for (i = 0; i < n && !torture_must_stop(); i++) { rt = &(reader_tasks[i]); - sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); if (i % 5 == 0) - strcat(buf, "\n"); - if (strlen(buf) >= 800) { - pr_alert("%s", buf); - buf[0] = 0; + seq_buf_putc(&s, '\n'); + + if (seq_buf_used(&s) >= 800) { + pr_alert("%s", seq_buf_str(&s)); + seq_buf_clear(&s); } - strcat(buf, buf1); + + seq_buf_printf(&s, "%d: %llu\t", i, rt->last_duration_ns); sum += rt->last_duration_ns; } - pr_alert("%s\n", buf); + pr_alert("%s\n", seq_buf_str(&s)); kfree(buf); return sum; @@ -1023,7 +1026,7 @@ end: } static void -ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) +ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, @@ -1078,7 +1081,7 @@ ref_scale_init(void) { long i; int firsterr = 0; - static struct ref_scale_ops *scale_ops[] = { + static const struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b24db425f16d..31706e3293bc 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -137,6 +137,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq; sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq; + sdp->srcu_barrier_head.next = &sdp->srcu_barrier_head; sdp->mynode = NULL; sdp->cpu = cpu; INIT_WORK(&sdp->work, srcu_invoke_callbacks); @@ -247,7 +248,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_sup->srcu_cb_mutex); mutex_init(&ssp->srcu_sup->srcu_gp_mutex); ssp->srcu_idx = 0; - ssp->srcu_sup->srcu_gp_seq = 0; + ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_barrier_seq = 0; mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); @@ -258,7 +259,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) if (!ssp->sda) goto err_free_sup; init_srcu_struct_data(ssp); - ssp->srcu_sup->srcu_gp_seq_needed_exp = 0; + ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) @@ -266,7 +267,8 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } ssp->srcu_sup->srcu_ssp = ssp; - smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */ + smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, + SRCU_GP_SEQ_INITIAL_VAL); /* Init done. */ return 0; err_free_sda: @@ -628,6 +630,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) if (time_after(j, gpstart)) jbase += j - gpstart; if (!jbase) { + ASSERT_EXCLUSIVE_WRITER(sup->srcu_n_exp_nodelay); WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1); if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) jbase = 1; @@ -1560,6 +1563,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp) struct srcu_data *sdp; struct srcu_struct *ssp; + rhp->next = rhp; // Mark the callback as having been invoked. sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); ssp = sdp->ssp; if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt)) @@ -1818,6 +1822,7 @@ static void process_srcu(struct work_struct *work) } else { j = jiffies; if (READ_ONCE(sup->reschedule_jiffies) == j) { + ASSERT_EXCLUSIVE_WRITER(sup->reschedule_count); WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1); if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay) curdelay = 1; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ba3440a45b6d..6333f4ccf024 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -34,6 +34,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @rtp_blkd_tasks: List of tasks blocked as readers. * @rtp_exit_list: List of tasks in the latter portion of do_exit(). * @cpu: CPU number corresponding to this entry. + * @index: Index of this CPU in rtpcp_array of the rcu_tasks structure. * @rtpp: Pointer to the rcu_tasks structure. */ struct rcu_tasks_percpu { @@ -49,6 +50,7 @@ struct rcu_tasks_percpu { struct list_head rtp_blkd_tasks; struct list_head rtp_exit_list; int cpu; + int index; struct rcu_tasks *rtpp; }; @@ -63,7 +65,7 @@ struct rcu_tasks_percpu { * @init_fract: Initial backoff sleep interval. * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. - * @tasks_gp_seq: Number of grace periods completed since boot. + * @tasks_gp_seq: Number of grace periods completed since boot in upper bits. * @n_ipis: Number of IPIs sent to encourage grace periods to end. * @n_ipis_fails: Number of IPI-send failures. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. @@ -76,6 +78,7 @@ struct rcu_tasks_percpu { * @call_func: This flavor's call_rcu()-equivalent function. * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE). * @rtpcpu: This flavor's rcu_tasks_percpu structure. + * @rtpcp_array: Array of pointers to rcu_tasks_percpu structure of CPUs in cpu_possible_mask. * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing. * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing. @@ -84,6 +87,7 @@ struct rcu_tasks_percpu { * @barrier_q_count: Number of queues being waited on. * @barrier_q_completion: Barrier wait/wakeup mechanism. * @barrier_q_seq: Sequence number for barrier operations. + * @barrier_q_start: Most recent barrier start in jiffies. * @name: This flavor's textual name. * @kname: This flavor's kthread name. */ @@ -110,6 +114,7 @@ struct rcu_tasks { call_rcu_func_t call_func; unsigned int wait_state; struct rcu_tasks_percpu __percpu *rtpcpu; + struct rcu_tasks_percpu **rtpcp_array; int percpu_enqueue_shift; int percpu_enqueue_lim; int percpu_dequeue_lim; @@ -118,6 +123,7 @@ struct rcu_tasks { atomic_t barrier_q_count; struct completion barrier_q_completion; unsigned long barrier_q_seq; + unsigned long barrier_q_start; char *name; char *kname; }; @@ -182,6 +188,8 @@ module_param(rcu_task_collapse_lim, int, 0444); static int rcu_task_lazy_lim __read_mostly = 32; module_param(rcu_task_lazy_lim, int, 0444); +static int rcu_task_cpu_ids; + /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 #define RTGS_WAIT_WAIT_CBS 1 @@ -245,6 +253,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp) int cpu; int lim; int shift; + int maxcpu; + int index = 0; if (rcu_task_enqueue_lim < 0) { rcu_task_enqueue_lim = 1; @@ -254,14 +264,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp) } lim = rcu_task_enqueue_lim; - if (lim > nr_cpu_ids) - lim = nr_cpu_ids; - shift = ilog2(nr_cpu_ids / lim); - if (((nr_cpu_ids - 1) >> shift) >= lim) - shift++; - WRITE_ONCE(rtp->percpu_enqueue_shift, shift); - WRITE_ONCE(rtp->percpu_dequeue_lim, lim); - smp_store_release(&rtp->percpu_enqueue_lim, lim); + rtp->rtpcp_array = kcalloc(num_possible_cpus(), sizeof(struct rcu_tasks_percpu *), GFP_KERNEL); + BUG_ON(!rtp->rtpcp_array); + for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); @@ -273,14 +278,30 @@ static void cblist_init_generic(struct rcu_tasks *rtp) INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); rtpcp->cpu = cpu; rtpcp->rtpp = rtp; + rtpcp->index = index; + rtp->rtpcp_array[index] = rtpcp; + index++; if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); if (!rtpcp->rtp_exit_list.next) INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + rtpcp->barrier_q_head.next = &rtpcp->barrier_q_head; + maxcpu = cpu; } - pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, - data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust); + rcu_task_cpu_ids = maxcpu + 1; + if (lim > rcu_task_cpu_ids) + lim = rcu_task_cpu_ids; + shift = ilog2(rcu_task_cpu_ids / lim); + if (((rcu_task_cpu_ids - 1) >> shift) >= lim) + shift++; + WRITE_ONCE(rtp->percpu_enqueue_shift, shift); + WRITE_ONCE(rtp->percpu_dequeue_lim, lim); + smp_store_release(&rtp->percpu_enqueue_lim, lim); + + pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d rcu_task_cpu_ids=%d.\n", + rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), + rcu_task_cb_adjust, rcu_task_cpu_ids); } // Compute wakeup time for lazy callback timer. @@ -339,6 +360,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rcu_read_lock(); ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift); chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask); + WARN_ON_ONCE(chosen_cpu >= rcu_task_cpu_ids); rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu); if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. @@ -348,7 +370,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rtpcp->rtp_n_lock_retries = 0; } if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim && - READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids) + READ_ONCE(rtp->percpu_enqueue_lim) != rcu_task_cpu_ids) needadjust = true; // Defer adjustment to avoid deadlock. } // Queuing callbacks before initialization not yet supported. @@ -368,10 +390,10 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); if (unlikely(needadjust)) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); - if (rtp->percpu_enqueue_lim != nr_cpu_ids) { + if (rtp->percpu_enqueue_lim != rcu_task_cpu_ids) { WRITE_ONCE(rtp->percpu_enqueue_shift, 0); - WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); - smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); + WRITE_ONCE(rtp->percpu_dequeue_lim, rcu_task_cpu_ids); + smp_store_release(&rtp->percpu_enqueue_lim, rcu_task_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); @@ -388,6 +410,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) struct rcu_tasks *rtp; struct rcu_tasks_percpu *rtpcp; + rhp->next = rhp; // Mark the callback as having been invoked. rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head); rtp = rtpcp->rtpp; if (atomic_dec_and_test(&rtp->barrier_q_count)) @@ -396,7 +419,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) // Wait for all in-flight callbacks for the specified RCU Tasks flavor. // Operates in a manner similar to rcu_barrier(). -static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) +static void __maybe_unused rcu_barrier_tasks_generic(struct rcu_tasks *rtp) { int cpu; unsigned long flags; @@ -409,6 +432,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) mutex_unlock(&rtp->barrier_q_mutex); return; } + rtp->barrier_q_start = jiffies; rcu_seq_start(&rtp->barrier_q_seq); init_completion(&rtp->barrier_q_completion); atomic_set(&rtp->barrier_q_count, 2); @@ -444,6 +468,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim); for (cpu = 0; cpu < dequeue_limit; cpu++) { + if (!cpu_possible(cpu)) + continue; struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ @@ -481,7 +507,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim > 1) { - WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(rcu_task_cpu_ids)); smp_store_release(&rtp->percpu_enqueue_lim, 1); rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); gpdone = false; @@ -496,7 +522,9 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); } if (rtp->percpu_dequeue_lim == 1) { - for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) { + for (cpu = rtp->percpu_dequeue_lim; cpu < rcu_task_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); @@ -511,30 +539,32 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) // Advance callbacks and invoke any that are ready. static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp) { - int cpu; - int cpunext; int cpuwq; unsigned long flags; int len; + int index; struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); struct rcu_tasks_percpu *rtpcp_next; - cpu = rtpcp->cpu; - cpunext = cpu * 2 + 1; - if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { - rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; - queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); - cpunext++; - if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { - rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; + index = rtpcp->index * 2 + 1; + if (index < num_possible_cpus()) { + rtpcp_next = rtp->rtpcp_array[index]; + if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); + index++; + if (index < num_possible_cpus()) { + rtpcp_next = rtp->rtpcp_array[index]; + if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); + } + } } } - if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu)) + if (rcu_segcblist_empty(&rtpcp->cblist)) return; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); @@ -687,9 +717,7 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -#endif /* #ifndef CONFIG_TINY_RCU */ -#ifndef CONFIG_TINY_RCU /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { @@ -723,6 +751,53 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) rtp->lazy_jiffies, s); } + +/* Dump out more rcutorture-relevant state common to all RCU-tasks flavors. */ +static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *tt, + char *tf, char *tst) +{ + cpumask_var_t cm; + int cpu; + bool gotcb = false; + unsigned long j = jiffies; + + pr_alert("%s%s Tasks%s RCU g%ld gp_start %lu gp_jiffies %lu gp_state %d (%s).\n", + tt, tf, tst, data_race(rtp->tasks_gp_seq), + j - data_race(rtp->gp_start), j - data_race(rtp->gp_jiffies), + data_race(rtp->gp_state), tasks_gp_state_getname(rtp)); + pr_alert("\tEnqueue shift %d limit %d Dequeue limit %d gpseq %lu.\n", + data_race(rtp->percpu_enqueue_shift), + data_race(rtp->percpu_enqueue_lim), + data_race(rtp->percpu_dequeue_lim), + data_race(rtp->percpu_dequeue_gpseq)); + (void)zalloc_cpumask_var(&cm, GFP_KERNEL); + pr_alert("\tCallback counts:"); + for_each_possible_cpu(cpu) { + long n; + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (cpumask_available(cm) && !rcu_barrier_cb_is_done(&rtpcp->barrier_q_head)) + cpumask_set_cpu(cpu, cm); + n = rcu_segcblist_n_cbs(&rtpcp->cblist); + if (!n) + continue; + pr_cont(" %d:%ld", cpu, n); + gotcb = true; + } + if (gotcb) + pr_cont(".\n"); + else + pr_cont(" (none).\n"); + pr_alert("\tBarrier seq %lu start %lu count %d holdout CPUs ", + data_race(rtp->barrier_q_seq), j - data_race(rtp->barrier_q_start), + atomic_read(&rtp->barrier_q_count)); + if (cpumask_available(cm) && !cpumask_empty(cm)) + pr_cont(" %*pbl.\n", cpumask_pr_args(cm)); + else + pr_cont("(none).\n"); + free_cpumask_var(cm); +} + #endif // #ifndef CONFIG_TINY_RCU static void exit_tasks_rcu_finish_trace(struct task_struct *t); @@ -1174,6 +1249,12 @@ void show_rcu_tasks_classic_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread); + +void rcu_tasks_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) struct task_struct *get_rcu_tasks_gp_kthread(void) @@ -1244,13 +1325,12 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } //////////////////////////////////////////////////////////////////////// // -// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of -// passing an empty function to schedule_on_each_cpu(). This approach -// provides an asynchronous call_rcu_tasks_rude() API and batching of -// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API. -// This invokes schedule_on_each_cpu() in order to send IPIs far and wide -// and induces otherwise unnecessary context switches on all online CPUs, -// whether idle or not. +// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's +// trick of passing an empty function to schedule_on_each_cpu(). +// This approach provides batching of concurrent calls to the synchronous +// synchronize_rcu_tasks_rude() API. This invokes schedule_on_each_cpu() +// in order to send IPIs far and wide and induces otherwise unnecessary +// context switches on all online CPUs, whether idle or not. // // Callback handling is provided by the rcu_tasks_kthread() function. // @@ -1268,11 +1348,11 @@ static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) schedule_on_each_cpu(rcu_tasks_be_rude); } -void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); +static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, "RCU Tasks Rude"); -/** +/* * call_rcu_tasks_rude() - Queue a callback rude task-based grace period * @rhp: structure to be used for queueing the RCU updates. * @func: actual callback function to be invoked after the grace period @@ -1289,12 +1369,14 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. + * + * This is no longer exported, and is instead reserved for use by + * synchronize_rcu_tasks_rude(). */ -void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) +static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) { call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude); } -EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); /** * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period @@ -1320,26 +1402,9 @@ void synchronize_rcu_tasks_rude(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); -/** - * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks_rude(void) -{ - rcu_barrier_tasks_generic(&rcu_tasks_rude); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); - -int rcu_tasks_rude_lazy_ms = -1; -module_param(rcu_tasks_rude_lazy_ms, int, 0444); - static int __init rcu_spawn_tasks_rude_kthread(void) { rcu_tasks_rude.gp_sleep = HZ / 10; - if (rcu_tasks_rude_lazy_ms >= 0) - rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms); rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } @@ -1350,6 +1415,12 @@ void show_rcu_tasks_rude_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); } EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); + +void rcu_tasks_rude_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks_rude, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_rude_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) struct task_struct *get_rcu_tasks_rude_gp_kthread(void) @@ -1613,7 +1684,7 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) // However, we cannot safely change its state. n_heavy_reader_attempts++; // Check for "running" idle tasks on offline CPUs. - if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + if (!rcu_watching_zero_in_eqs(cpu, &t->trc_reader_nesting)) return -EINVAL; // No quiescent state, do it the hard way. n_heavy_reader_updates++; nesting = 0; @@ -2027,6 +2098,12 @@ void show_rcu_tasks_trace_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); + +void rcu_tasks_trace_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) struct task_struct *get_rcu_tasks_trace_gp_kthread(void) @@ -2070,17 +2147,13 @@ static struct rcu_tasks_test_desc tests[] = { .notrun = IS_ENABLED(CONFIG_TASKS_RCU), }, { - .name = "call_rcu_tasks_rude()", - /* If not defined, the test is skipped. */ - .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU), - }, - { .name = "call_rcu_tasks_trace()", /* If not defined, the test is skipped. */ .notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU) } }; +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) static void test_rcu_tasks_callback(struct rcu_head *rhp) { struct rcu_tasks_test_desc *rttd = @@ -2090,6 +2163,7 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) rttd->notrun = false; } +#endif // #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) static void rcu_tasks_initiate_self_tests(void) { @@ -2102,16 +2176,14 @@ static void rcu_tasks_initiate_self_tests(void) #ifdef CONFIG_TASKS_RUDE_RCU pr_info("Running RCU Tasks Rude wait API self tests\n"); - tests[1].runstart = jiffies; synchronize_rcu_tasks_rude(); - call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_TRACE_RCU pr_info("Running RCU Tasks Trace wait API self tests\n"); - tests[2].runstart = jiffies; + tests[1].runstart = jiffies; synchronize_rcu_tasks_trace(); - call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); + call_rcu_tasks_trace(&tests[1].rh, test_rcu_tasks_callback); #endif } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4402d6f5f857..b3b3ce34df63 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -105,7 +105,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) } /* Invoke the RCU callbacks whose grace period has elapsed. */ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(void) { struct rcu_head *next, *list; unsigned long flags; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e641cc681901..a60616e69b66 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -79,9 +79,6 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *); static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .gpwrap = true, -#ifdef CONFIG_RCU_NOCB_CPU - .cblist.flags = SEGCBLIST_RCU_CORE, -#endif }; static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, @@ -97,6 +94,9 @@ static struct rcu_state rcu_state = { .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work, rcu_sr_normal_gp_cleanup_work), .srs_cleanups_pending = ATOMIC_INIT(0), +#ifdef CONFIG_RCU_NOCB_CPU + .nocb_mutex = __MUTEX_INITIALIZER(rcu_state.nocb_mutex), +#endif }; /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -283,37 +283,45 @@ void rcu_softirq_qs(void) } /* - * Reset the current CPU's ->dynticks counter to indicate that the + * Reset the current CPU's RCU_WATCHING counter to indicate that the * newly onlined CPU is no longer in an extended quiescent state. * This will either leave the counter unchanged, or increment it * to the next non-quiescent value. * * The non-atomic test/increment sequence works because the upper bits - * of the ->dynticks counter are manipulated only by the corresponding CPU, + * of the ->state variable are manipulated only by the corresponding CPU, * or when the corresponding CPU is offline. */ -static void rcu_dynticks_eqs_online(void) +static void rcu_watching_online(void) { - if (ct_dynticks() & RCU_DYNTICKS_IDX) + if (ct_rcu_watching() & CT_RCU_WATCHING) return; - ct_state_inc(RCU_DYNTICKS_IDX); + ct_state_inc(CT_RCU_WATCHING); } /* - * Return true if the snapshot returned from rcu_dynticks_snap() + * Return true if the snapshot returned from ct_rcu_watching() * indicates that RCU is in an extended quiescent state. */ -static bool rcu_dynticks_in_eqs(int snap) +static bool rcu_watching_snap_in_eqs(int snap) { - return !(snap & RCU_DYNTICKS_IDX); + return !(snap & CT_RCU_WATCHING); } -/* - * Return true if the CPU corresponding to the specified rcu_data - * structure has spent some time in an extended quiescent state since - * rcu_dynticks_snap() returned the specified snapshot. +/** + * rcu_watching_snap_stopped_since() - Has RCU stopped watching a given CPU + * since the specified @snap? + * + * @rdp: The rcu_data corresponding to the CPU for which to check EQS. + * @snap: rcu_watching snapshot taken when the CPU wasn't in an EQS. + * + * Returns true if the CPU corresponding to @rdp has spent some time in an + * extended quiescent state since @snap. Note that this doesn't check if it + * /still/ is in an EQS, just that it went through one since @snap. + * + * This is meant to be used in a loop waiting for a CPU to go through an EQS. */ -static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) +static bool rcu_watching_snap_stopped_since(struct rcu_data *rdp, int snap) { /* * The first failing snapshot is already ordered against the accesses @@ -323,26 +331,29 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) * performed by the remote CPU prior to entering idle and therefore can * rely solely on acquire semantics. */ - return snap != ct_dynticks_cpu_acquire(rdp->cpu); + if (WARN_ON_ONCE(rcu_watching_snap_in_eqs(snap))) + return true; + + return snap != ct_rcu_watching_cpu_acquire(rdp->cpu); } /* * Return true if the referenced integer is zero while the specified * CPU remains within a single extended quiescent state. */ -bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) +bool rcu_watching_zero_in_eqs(int cpu, int *vp) { int snap; // If not quiescent, force back to earlier extended quiescent state. - snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX; - smp_rmb(); // Order ->dynticks and *vp reads. + snap = ct_rcu_watching_cpu(cpu) & ~CT_RCU_WATCHING; + smp_rmb(); // Order CT state and *vp reads. if (READ_ONCE(*vp)) return false; // Non-zero, so report failure; - smp_rmb(); // Order *vp read and ->dynticks re-read. + smp_rmb(); // Order *vp read and CT state re-read. // If still in the same extended quiescent state, we are good! - return snap == ct_dynticks_cpu(cpu); + return snap == ct_rcu_watching_cpu(cpu); } /* @@ -356,17 +367,17 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) * * The caller must have disabled interrupts and must not be idle. */ -notrace void rcu_momentary_dyntick_idle(void) +notrace void rcu_momentary_eqs(void) { int seq; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - seq = ct_state_inc(2 * RCU_DYNTICKS_IDX); + seq = ct_state_inc(2 * CT_RCU_WATCHING); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX)); + WARN_ON_ONCE(!(seq & CT_RCU_WATCHING)); rcu_preempt_deferred_qs(current); } -EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); +EXPORT_SYMBOL_GPL(rcu_momentary_eqs); /** * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle @@ -388,13 +399,13 @@ static int rcu_is_cpu_rrupt_from_idle(void) lockdep_assert_irqs_disabled(); /* Check for counter underflows */ - RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0, - "RCU dynticks_nesting counter underflow!"); - RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0, - "RCU dynticks_nmi_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(ct_nesting() < 0, + "RCU nesting counter underflow!"); + RCU_LOCKDEP_WARN(ct_nmi_nesting() <= 0, + "RCU nmi_nesting counter underflow/zero!"); /* Are we at first interrupt nesting level? */ - nesting = ct_dynticks_nmi_nesting(); + nesting = ct_nmi_nesting(); if (nesting > 1) return false; @@ -404,7 +415,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) WARN_ON_ONCE(!nesting && !is_idle_task(current)); /* Does CPU appear to be idle from an RCU standpoint? */ - return ct_dynticks_nesting() == 0; + return ct_nesting() == 0; } #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) @@ -596,12 +607,12 @@ void rcu_irq_exit_check_preempt(void) { lockdep_assert_irqs_disabled(); - RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0, - "RCU dynticks_nesting counter underflow/zero!"); - RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() != - DYNTICK_IRQ_NONIDLE, - "Bad RCU dynticks_nmi_nesting counter\n"); - RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + RCU_LOCKDEP_WARN(ct_nesting() <= 0, + "RCU nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(ct_nmi_nesting() != + CT_NESTING_IRQ_NONIDLE, + "Bad RCU nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(), "RCU in extended quiescent state!"); } #endif /* #ifdef CONFIG_PROVE_RCU */ @@ -641,7 +652,7 @@ void __rcu_irq_enter_check_tick(void) if (in_nmi()) return; - RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(), "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); if (!tick_nohz_full_cpu(rdp->cpu) || @@ -723,7 +734,7 @@ notrace bool rcu_is_watching(void) bool ret; preempt_disable_notrace(); - ret = !rcu_dynticks_curr_cpu_in_eqs(); + ret = rcu_is_watching_curr_cpu(); preempt_enable_notrace(); return ret; } @@ -765,11 +776,11 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) } /* - * Snapshot the specified CPU's dynticks counter so that we can later + * Snapshot the specified CPU's RCU_WATCHING counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. */ -static int dyntick_save_progress_counter(struct rcu_data *rdp) +static int rcu_watching_snap_save(struct rcu_data *rdp) { /* * Full ordering between remote CPU's post idle accesses and updater's @@ -782,8 +793,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) * Ordering between remote CPU's pre idle accesses and post grace period * updater's accesses is enforced by the below acquire semantic. */ - rdp->dynticks_snap = ct_dynticks_cpu_acquire(rdp->cpu); - if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { + rdp->watching_snap = ct_rcu_watching_cpu_acquire(rdp->cpu); + if (rcu_watching_snap_in_eqs(rdp->watching_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); return 1; @@ -794,14 +805,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) /* * Returns positive if the specified CPU has passed through a quiescent state * by virtue of being in or having passed through an dynticks idle state since - * the last call to dyntick_save_progress_counter() for this same CPU, or by + * the last call to rcu_watching_snap_save() for this same CPU, or by * virtue of having been offline. * * Returns negative if the specified CPU needs a force resched. * * Returns zero otherwise. */ -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) +static int rcu_watching_snap_recheck(struct rcu_data *rdp) { unsigned long jtsq; int ret = 0; @@ -815,7 +826,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) { + if (rcu_watching_snap_stopped_since(rdp, rdp->watching_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rnp, rdp); return 1; @@ -1649,7 +1660,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) * the done tail list manipulations are protected here. */ done = smp_load_acquire(&rcu_state.srs_done_tail); - if (!done) + if (WARN_ON_ONCE(!done)) return; WARN_ON_ONCE(!rcu_sr_is_wait_head(done)); @@ -1984,10 +1995,10 @@ static void rcu_gp_fqs(bool first_time) if (first_time) { /* Collect dyntick-idle snapshots. */ - force_qs_rnp(dyntick_save_progress_counter); + force_qs_rnp(rcu_watching_snap_save); } else { /* Handle dyntick-idle and offline CPUs. */ - force_qs_rnp(rcu_implicit_dynticks_qs); + force_qs_rnp(rcu_watching_snap_recheck); } /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) { @@ -2383,7 +2394,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needacc = false; struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2420,23 +2430,11 @@ rcu_report_qs_rdp(struct rcu_data *rdp) * to return true. So complain, but don't awaken. */ WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp)); - } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { - /* - * ...but NOCB kthreads may miss or delay callbacks acceleration - * if in the middle of a (de-)offloading process. - */ - needacc = true; } rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ - - if (needacc) { - rcu_nocb_lock_irqsave(rdp, flags); - rcu_accelerate_cbs_unlocked(rnp, rdp); - rcu_nocb_unlock_irqrestore(rdp, flags); - } } } @@ -2791,24 +2789,6 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - /* - * On RT rcu_core() can be preempted when IRQs aren't disabled. - * Therefore this function can race with concurrent NOCB (de-)offloading - * on this CPU and the below condition must be considered volatile. - * However if we race with: - * - * _ Offloading: In the worst case we accelerate or process callbacks - * concurrently with NOCB kthreads. We are guaranteed to - * call rcu_nocb_lock() if that happens. - * - * _ Deoffloading: In the worst case we miss callbacks acceleration or - * processing. This is fine because the early stage - * of deoffloading invokes rcu_core() after setting - * SEGCBLIST_RCU_CORE. So we guarantee that we'll process - * what could have been dismissed without the need to wait - * for the next rcu_pending() check in the next jiffy. - */ - const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) return; @@ -2828,17 +2808,17 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) { - rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) { + local_irq_save(flags); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - rcu_nocb_unlock_irqrestore(rdp, flags); + local_irq_restore(flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) && + if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_ready_cbs(&rdp->cblist) && likely(READ_ONCE(rcu_scheduler_fully_active))) { rcu_do_batch(rdp); /* Re-invoke RCU core processing if there are callbacks remaining. */ @@ -2855,7 +2835,7 @@ static __latent_entropy void rcu_core(void) queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work); } -static void rcu_core_si(struct softirq_action *h) +static void rcu_core_si(void) { rcu_core(); } @@ -3227,7 +3207,7 @@ struct kvfree_rcu_bulk_data { struct list_head list; struct rcu_gp_oldstate gp_snap; unsigned long nr_records; - void *records[]; + void *records[] __counted_by(nr_records); }; /* @@ -3539,10 +3519,10 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) if (delayed_work_pending(&krcp->monitor_work)) { delay_left = krcp->monitor_work.timer.expires - jiffies; if (delay < delay_left) - mod_delayed_work(system_wq, &krcp->monitor_work, delay); + mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); return; } - queue_delayed_work(system_wq, &krcp->monitor_work, delay); + queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); } static void @@ -3584,18 +3564,15 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) } /* - * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + * Return: %true if a work is queued, %false otherwise. */ -static void kfree_rcu_monitor(struct work_struct *work) +static bool +kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) { - struct kfree_rcu_cpu *krcp = container_of(work, - struct kfree_rcu_cpu, monitor_work.work); unsigned long flags; + bool queued = false; int i, j; - // Drain ready for reclaim. - kvfree_rcu_drain_ready(krcp); - raw_spin_lock_irqsave(&krcp->lock, flags); // Attempt to start a new batch. @@ -3634,11 +3611,27 @@ static void kfree_rcu_monitor(struct work_struct *work) // be that the work is in the pending state when // channels have been detached following by each // other. - queue_rcu_work(system_wq, &krwp->rcu_work); + queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); } } raw_spin_unlock_irqrestore(&krcp->lock, flags); + return queued; +} + +/* + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + */ +static void kfree_rcu_monitor(struct work_struct *work) +{ + struct kfree_rcu_cpu *krcp = container_of(work, + struct kfree_rcu_cpu, monitor_work.work); + + // Drain ready for reclaim. + kvfree_rcu_drain_ready(krcp); + + // Queue a batch for a rest. + kvfree_rcu_queue_batch(krcp); // If there is nothing to detach, it means that our job is // successfully done here. In case of having at least one @@ -3704,7 +3697,7 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp) if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !atomic_xchg(&krcp->work_in_progress, 1)) { if (atomic_read(&krcp->backoff_page_cache_fill)) { - queue_delayed_work(system_wq, + queue_delayed_work(system_unbound_wq, &krcp->page_cache_work, msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); } else { @@ -3767,7 +3760,8 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, } // Finally insert and update the GP for this page. - bnode->records[bnode->nr_records++] = ptr; + bnode->nr_records++; + bnode->records[bnode->nr_records - 1] = ptr; get_state_synchronize_rcu_full(&bnode->gp_snap); atomic_inc(&(*krcp)->bulk_count[idx]); @@ -3859,6 +3853,86 @@ unlock_return: } EXPORT_SYMBOL_GPL(kvfree_call_rcu); +/** + * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. + * + * Note that a single argument of kvfree_rcu() call has a slow path that + * triggers synchronize_rcu() following by freeing a pointer. It is done + * before the return from the function. Therefore for any single-argument + * call that will result in a kfree() to a cache that is to be destroyed + * during module exit, it is developer's responsibility to ensure that all + * such calls have returned before the call to kmem_cache_destroy(). + */ +void kvfree_rcu_barrier(void) +{ + struct kfree_rcu_cpu_work *krwp; + struct kfree_rcu_cpu *krcp; + bool queued; + int i, cpu; + + /* + * Firstly we detach objects and queue them over an RCU-batch + * for all CPUs. Finally queued works are flushed for each CPU. + * + * Please note. If there are outstanding batches for a particular + * CPU, those have to be finished first following by queuing a new. + */ + for_each_possible_cpu(cpu) { + krcp = per_cpu_ptr(&krc, cpu); + + /* + * Check if this CPU has any objects which have been queued for a + * new GP completion. If not(means nothing to detach), we are done + * with it. If any batch is pending/running for this "krcp", below + * per-cpu flush_rcu_work() waits its completion(see last step). + */ + if (!need_offload_krc(krcp)) + continue; + + while (1) { + /* + * If we are not able to queue a new RCU work it means: + * - batches for this CPU are still in flight which should + * be flushed first and then repeat; + * - no objects to detach, because of concurrency. + */ + queued = kvfree_rcu_queue_batch(krcp); + + /* + * Bail out, if there is no need to offload this "krcp" + * anymore. As noted earlier it can run concurrently. + */ + if (queued || !need_offload_krc(krcp)) + break; + + /* There are ongoing batches. */ + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); + flush_rcu_work(&krwp->rcu_work); + } + } + } + + /* + * Now we guarantee that all objects are flushed. + */ + for_each_possible_cpu(cpu) { + krcp = per_cpu_ptr(&krc, cpu); + + /* + * A monitor work can drain ready to reclaim objects + * directly. Wait its completion if running or pending. + */ + cancel_delayed_work_sync(&krcp->monitor_work); + + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); + flush_rcu_work(&krwp->rcu_work); + } + } +} +EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); + static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -4403,6 +4477,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) { unsigned long __maybe_unused s = rcu_state.barrier_sequence; + rhp->next = rhp; // Mark the callback as having been invoked. if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { rcu_barrier_trace(TPS("LastCB"), -1, s); complete(&rcu_state.barrier_completion); @@ -4804,8 +4879,8 @@ rcu_boot_init_percpu_data(int cpu) /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); INIT_WORK(&rdp->strict_work, strict_work_handler); - WARN_ON_ONCE(ct->dynticks_nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu))); + WARN_ON_ONCE(ct->nesting != 1); + WARN_ON_ONCE(rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_state = RCU_GP_CLEANED; @@ -4898,7 +4973,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs); rdp->blimit = blimit; - ct->dynticks_nesting = 1; /* CPU not up, no tearing. */ + ct->nesting = 1; /* CPU not up, no tearing. */ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* @@ -5058,7 +5133,7 @@ void rcutree_report_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; arch_spin_lock(&rcu_state.ofl_lock); - rcu_dynticks_eqs_online(); + rcu_watching_online(); raw_spin_lock(&rcu_state.barrier_lock); raw_spin_lock_rcu_node(rnp); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); @@ -5424,6 +5499,8 @@ static void __init rcu_init_one(void) while (i > rnp->grphi) rnp++; per_cpu_ptr(&rcu_data, i)->mynode = rnp; + per_cpu_ptr(&rcu_data, i)->barrier_head.next = + &per_cpu_ptr(&rcu_data, i)->barrier_head; rcu_boot_init_percpu_data(i); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fcf2b4aa3441..a9a811d9d7a3 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -206,7 +206,7 @@ struct rcu_data { long blimit; /* Upper limit on a processed batch */ /* 3) dynticks interface. */ - int dynticks_snap; /* Per-GP tracking for dynticks. */ + int watching_snap; /* Per-GP tracking for dynticks. */ bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ @@ -215,7 +215,7 @@ struct rcu_data { /* 4) rcu_barrier(), OOM callbacks, and expediting. */ unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */ struct rcu_head barrier_head; - int exp_dynticks_snap; /* Double-check need for IPI. */ + int exp_watching_snap; /* Double-check need for IPI. */ /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -411,7 +411,6 @@ struct rcu_state { arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ - int nocb_is_setup; /* nocb is setup from boot */ /* synchronize_rcu() part. */ struct llist_head srs_next; /* request a GP users. */ @@ -420,6 +419,11 @@ struct rcu_state { struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX]; struct work_struct srs_cleanup_work; atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */ + +#ifdef CONFIG_RCU_NOCB_CPU + struct mutex nocb_mutex; /* Guards (de-)offloading */ + int nocb_is_setup; /* nocb is setup from boot */ +#endif }; /* Values for rcu_state structure's gp_flags field. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4acd29d16fdb..fb664d3a01c9 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -7,6 +7,7 @@ * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ +#include <linux/console.h> #include <linux/lockdep.h> static void rcu_exp_handler(void *unused); @@ -376,11 +377,11 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) * post grace period updater's accesses is enforced by the * below acquire semantic. */ - snap = ct_dynticks_cpu_acquire(cpu); - if (rcu_dynticks_in_eqs(snap)) + snap = ct_rcu_watching_cpu_acquire(cpu); + if (rcu_watching_snap_in_eqs(snap)) mask_ofl_test |= mask; else - rdp->exp_dynticks_snap = snap; + rdp->exp_watching_snap = snap; } } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; @@ -400,7 +401,7 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) unsigned long mask = rdp->grpmask; retry_ipi: - if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { + if (rcu_watching_snap_stopped_since(rdp, rdp->exp_watching_snap)) { mask_ofl_test |= mask; continue; } @@ -543,6 +544,67 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit) } /* + * Print out an expedited RCU CPU stall warning message. + */ +static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigned long j) +{ + int cpu; + unsigned long mask; + int ndetected; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(); + + if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { + pr_err("INFO: %s detected expedited stalls, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); + ndetected = 0; + rcu_for_each_leaf_node(rnp) { + ndetected += rcu_print_task_exp_stall(rnp); + for_each_leaf_node_possible_cpu(rnp, cpu) { + struct rcu_data *rdp; + + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(READ_ONCE(rnp->expmask) & mask)) + continue; + ndetected++; + rdp = per_cpu_ptr(&rcu_data, cpu); + pr_cont(" %d-%c%c%c%c", cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], + "D."[!!data_race(rdp->cpu_no_qs.b.exp)]); + } + } + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); + if (ndetected) { + pr_err("blocking rcu_node structures (internal RCU debug):"); + rcu_for_each_node_breadth_first(rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_exp_done_unlocked(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); + } + pr_cont("\n"); + } + rcu_for_each_leaf_node(rnp) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(READ_ONCE(rnp->expmask) & mask)) + continue; + dump_cpu_task(cpu); + } + rcu_exp_print_detail_task_stall_rnp(rnp); + } +} + +/* * Wait for the expedited grace period to elapse, issuing any needed * RCU CPU stall warnings along the way. */ @@ -553,10 +615,8 @@ static void synchronize_rcu_expedited_wait(void) unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; - int ndetected; struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(); unsigned long flags; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); @@ -590,59 +650,17 @@ static void synchronize_rcu_expedited_wait(void) return; if (rcu_stall_is_suppressed()) continue; + + nbcon_cpu_emergency_enter(); + j = jiffies; rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start)); trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); - pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rcu_state.name); - ndetected = 0; - rcu_for_each_leaf_node(rnp) { - ndetected += rcu_print_task_exp_stall(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) { - struct rcu_data *rdp; - - mask = leaf_node_cpu_bit(rnp, cpu); - if (!(READ_ONCE(rnp->expmask) & mask)) - continue; - ndetected++; - rdp = per_cpu_ptr(&rcu_data, cpu); - pr_cont(" %d-%c%c%c%c", cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], - "D."[!!data_race(rdp->cpu_no_qs.b.exp)]); - } - } - pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - j - jiffies_start, rcu_state.expedited_sequence, - data_race(rnp_root->expmask), - ".T"[!!data_race(rnp_root->exp_tasks)]); - if (ndetected) { - pr_err("blocking rcu_node structures (internal RCU debug):"); - rcu_for_each_node_breadth_first(rnp) { - if (rnp == rnp_root) - continue; /* printed unconditionally */ - if (sync_rcu_exp_done_unlocked(rnp)) - continue; - pr_cont(" l=%u:%d-%d:%#lx/%c", - rnp->level, rnp->grplo, rnp->grphi, - data_race(rnp->expmask), - ".T"[!!data_race(rnp->exp_tasks)]); - } - pr_cont("\n"); - } - rcu_for_each_leaf_node(rnp) { - for_each_leaf_node_possible_cpu(rnp, cpu) { - mask = leaf_node_cpu_bit(rnp, cpu); - if (!(READ_ONCE(rnp->expmask) & mask)) - continue; - preempt_disable(); // For smp_processor_id() in dump_cpu_task(). - dump_cpu_task(cpu); - preempt_enable(); - } - rcu_exp_print_detail_task_stall_rnp(rnp); - } + synchronize_rcu_expedited_stall(jiffies_start, j); jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; + + nbcon_cpu_emergency_exit(); + panic_on_rcu_stall(); } } diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 3ce30841119a..97b99cd06923 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -16,10 +16,6 @@ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return lockdep_is_held(&rdp->nocb_lock); -} static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) { @@ -220,7 +216,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); if (needwake) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); - wake_up_process(rdp_gp->nocb_gp_kthread); + swake_up_one_online(&rdp_gp->nocb_gp_wq); } return needwake; @@ -413,14 +409,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, return false; } - // In the process of (de-)offloading: no bypassing, but - // locking. - if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { - rcu_nocb_lock(rdp); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; /* Not offloaded, no bypassing. */ - } - // Don't use ->nocb_bypass during early boot. if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { rcu_nocb_lock(rdp); @@ -505,7 +493,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); } rcu_nocb_bypass_unlock(rdp); - smp_mb(); /* Order enqueue before wake. */ + // A wake up of the grace period kthread or timer adjustment // needs to be done only if: // 1. Bypass list was fully empty before (this is the first @@ -616,37 +604,33 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, } } -static int nocb_gp_toggle_rdp(struct rcu_data *rdp) +static void nocb_gp_toggle_rdp(struct rcu_data *rdp_gp, struct rcu_data *rdp) { struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int ret; - rcu_nocb_lock_irqsave(rdp, flags); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && - !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + /* + * Locking orders future de-offloaded callbacks enqueue against previous + * handling of this rdp. Ie: Make sure rcuog is done with this rdp before + * deoffloaded callbacks can be enqueued. + */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { /* * Offloading. Set our flag and notify the offload worker. * We will handle this rdp until it ever gets de-offloaded. */ - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - ret = 1; - } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp); + rcu_segcblist_set_flags(cblist, SEGCBLIST_OFFLOADED); + } else { /* * De-offloading. Clear our flag and notify the de-offload worker. * We will ignore this rdp until it ever gets re-offloaded. */ - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - ret = 0; - } else { - WARN_ON_ONCE(1); - ret = -1; + list_del(&rdp->nocb_entry_rdp); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_OFFLOADED); } - - rcu_nocb_unlock_irqrestore(rdp, flags); - - return ret; + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu) @@ -853,14 +837,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (rdp_toggling) { - int ret; - - ret = nocb_gp_toggle_rdp(rdp_toggling); - if (ret == 1) - list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp); - else if (ret == 0) - list_del(&rdp_toggling->nocb_entry_rdp); - + nocb_gp_toggle_rdp(my_rdp, rdp_toggling); swake_up_one(&rdp_toggling->nocb_state_wq); } @@ -917,7 +894,7 @@ static void nocb_cb_wait(struct rcu_data *rdp) WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); /* * Disable BH to provide the expected environment. Also, when @@ -1030,16 +1007,11 @@ void rcu_nocb_flush_deferred_wakeup(void) } EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); -static int rdp_offload_toggle(struct rcu_data *rdp, - bool offload, unsigned long flags) - __releases(rdp->nocb_lock) +static int rcu_nocb_queue_toggle_rdp(struct rcu_data *rdp) { - struct rcu_segcblist *cblist = &rdp->cblist; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; bool wake_gp = false; - - rcu_segcblist_offload(cblist, offload); - rcu_nocb_unlock_irqrestore(rdp, flags); + unsigned long flags; raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); // Queue this rdp for add/del to/from the list to iterate on rcuog @@ -1053,88 +1025,73 @@ static int rdp_offload_toggle(struct rcu_data *rdp, return wake_gp; } -static long rcu_nocb_rdp_deoffload(void *arg) +static bool rcu_nocb_rdp_deoffload_wait_cond(struct rcu_data *rdp) { - struct rcu_data *rdp = arg; - struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int wake_gp; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + bool ret; /* - * rcu_nocb_rdp_deoffload() may be called directly if - * rcuog/o[p] spawn failed, because at this time the rdp->cpu - * is not online yet. + * Locking makes sure rcuog is done handling this rdp before deoffloaded + * enqueue can happen. Also it keeps the SEGCBLIST_OFFLOADED flag stable + * while the ->nocb_lock is held. */ - WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu)); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + ret = !rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + return ret; +} + +static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +{ + unsigned long flags; + int wake_gp; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + /* CPU must be offline, unless it's early boot */ + WARN_ON_ONCE(cpu_online(rdp->cpu) && rdp->cpu != raw_smp_processor_id()); pr_info("De-offloading %d\n", rdp->cpu); - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Flush once and for all now. This suffices because we are - * running on the target CPU holding ->nocb_lock (thus having - * interrupts disabled), and because rdp_offload_toggle() - * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED. - * Thus future calls to rcu_segcblist_completely_offloaded() will - * return false, which means that future calls to rcu_nocb_try_bypass() - * will refuse to put anything into the bypass. - */ - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); + /* Flush all callbacks from segcblist and bypass */ + rcu_barrier(); + /* - * Start with invoking rcu_core() early. This way if the current thread - * happens to preempt an ongoing call to rcu_core() in the middle, - * leaving some work dismissed because rcu_core() still thinks the rdp is - * completely offloaded, we are guaranteed a nearby future instance of - * rcu_core() to catch up. + * Make sure the rcuoc kthread isn't in the middle of a nocb locked + * sequence while offloading is deactivated, along with nocb locking. */ - rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE); - invoke_rcu_core(); - wake_gp = rdp_offload_toggle(rdp, false, flags); + if (rdp->nocb_cb_kthread) + kthread_park(rdp->nocb_cb_kthread); + + rcu_nocb_lock_irqsave(rdp, flags); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_nocb_unlock_irqrestore(rdp, flags); + + wake_gp = rcu_nocb_queue_toggle_rdp(rdp); mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); + if (rdp_gp->nocb_gp_kthread) { if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, - SEGCBLIST_KTHREAD_GP)); - if (rdp->nocb_cb_kthread) - kthread_park(rdp->nocb_cb_kthread); + rcu_nocb_rdp_deoffload_wait_cond(rdp)); } else { /* * No kthread to clear the flags for us or remove the rdp from the nocb list * to iterate. Do it here instead. Locking doesn't look stricly necessary * but we stick to paranoia in this rare path. */ - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); - rcu_nocb_unlock_irqrestore(rdp, flags); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); list_del(&rdp->nocb_entry_rdp); } - mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - - /* - * Lock one last time to acquire latest callback updates from kthreads - * so we can later handle callbacks locally without locking. - */ - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Theoretically we could clear SEGCBLIST_LOCKING after the nocb - * lock is released but how about being paranoid for once? - */ - rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING); - /* - * Without SEGCBLIST_LOCKING, we can't use - * rcu_nocb_unlock_irqrestore() anymore. - */ - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - - /* Sanity check */ - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); return 0; } @@ -1145,33 +1102,42 @@ int rcu_nocb_cpu_deoffload(int cpu) int ret = 0; cpus_read_lock(); - mutex_lock(&rcu_state.barrier_mutex); + mutex_lock(&rcu_state.nocb_mutex); if (rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); + if (!cpu_online(cpu)) { + ret = rcu_nocb_rdp_deoffload(rdp); if (!ret) cpumask_clear_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu); + pr_info("NOCB: Cannot CB-deoffload online CPU %d\n", rdp->cpu); ret = -EINVAL; } } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); -static long rcu_nocb_rdp_offload(void *arg) +static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp) { - struct rcu_data *rdp = arg; - struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; + bool ret; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + ret = rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + return ret; +} + +static int rcu_nocb_rdp_offload(struct rcu_data *rdp) +{ int wake_gp; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + WARN_ON_ONCE(cpu_online(rdp->cpu)); /* * For now we only support re-offload, ie: the rdp must have been * offloaded on boot first. @@ -1184,44 +1150,17 @@ static long rcu_nocb_rdp_offload(void *arg) pr_info("Offloading %d\n", rdp->cpu); - /* - * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING - * is set. - */ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); - /* - * We didn't take the nocb lock while working on the - * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode). - * Every modifications that have been done previously on - * rdp->cblist must be visible remotely by the nocb kthreads - * upon wake up after reading the cblist flags. - * - * The layout against nocb_lock enforces that ordering: - * - * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() - * ------------------------- ---------------------------- - * WRITE callbacks rcu_nocb_lock() - * rcu_nocb_lock() READ flags - * WRITE flags READ callbacks - * rcu_nocb_unlock() rcu_nocb_unlock() - */ - wake_gp = rdp_offload_toggle(rdp, true, flags); + wake_gp = rcu_nocb_queue_toggle_rdp(rdp); if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); - kthread_unpark(rdp->nocb_cb_kthread); - swait_event_exclusive(rdp->nocb_state_wq, - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + rcu_nocb_rdp_offload_wait_cond(rdp)); - /* - * All kthreads are ready to work, we can finally relieve rcu_core() and - * enable nocb bypass. - */ - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); - rcu_nocb_unlock_irqrestore(rdp, flags); + kthread_unpark(rdp->nocb_cb_kthread); return 0; } @@ -1232,18 +1171,18 @@ int rcu_nocb_cpu_offload(int cpu) int ret = 0; cpus_read_lock(); - mutex_lock(&rcu_state.barrier_mutex); + mutex_lock(&rcu_state.nocb_mutex); if (!rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); + if (!cpu_online(cpu)) { + ret = rcu_nocb_rdp_offload(rdp); if (!ret) cpumask_set_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu); + pr_info("NOCB: Cannot CB-offload online CPU %d\n", rdp->cpu); ret = -EINVAL; } } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); return ret; @@ -1261,7 +1200,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) return 0; /* Protect rcu_nocb_mask against concurrent (de-)offloading. */ - if (!mutex_trylock(&rcu_state.barrier_mutex)) + if (!mutex_trylock(&rcu_state.nocb_mutex)) return 0; /* Snapshot count of all CPUs */ @@ -1271,7 +1210,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) count += READ_ONCE(rdp->lazy_len); } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); return count ? count : SHRINK_EMPTY; } @@ -1289,9 +1228,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) * Protect against concurrent (de-)offloading. Otherwise nocb locking * may be ignored or imbalanced. */ - if (!mutex_trylock(&rcu_state.barrier_mutex)) { + if (!mutex_trylock(&rcu_state.nocb_mutex)) { /* - * But really don't insist if barrier_mutex is contended since we + * But really don't insist if nocb_mutex is contended since we * can't guarantee that it will never engage in a dependency * chain involving memory allocation. The lock is seldom contended * anyway. @@ -1330,7 +1269,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); return count ? count : SHRINK_STOP; } @@ -1396,9 +1335,7 @@ void __init rcu_init_nohz(void) rdp = per_cpu_ptr(&rcu_data, cpu); if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); } rcu_organize_nocb_kthreads(); } @@ -1446,7 +1383,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) "rcuog/%d", rdp_gp->cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) { mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - goto end; + goto err; } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); if (kthread_prio) @@ -1458,7 +1395,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) t = kthread_create(rcu_nocb_cb_kthread, rdp, "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) - goto end; + goto err; if (rcu_rdp_is_offloaded(rdp)) wake_up_process(t); @@ -1471,13 +1408,21 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); return; -end: - mutex_lock(&rcu_state.barrier_mutex); + +err: + /* + * No need to protect against concurrent rcu_barrier() + * because the number of callbacks should be 0 for a non-boot CPU, + * therefore rcu_barrier() shouldn't even try to grab the nocb_lock. + * But hold nocb_mutex to avoid nocb_lock imbalance from shrinker. + */ + WARN_ON_ONCE(system_state > SYSTEM_BOOTING && rcu_segcblist_n_cbs(&rdp->cblist)); + mutex_lock(&rcu_state.nocb_mutex); if (rcu_rdp_is_offloaded(rdp)) { rcu_nocb_rdp_deoffload(rdp); cpumask_clear_cpu(cpu, rcu_nocb_mask); } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); } /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ @@ -1653,16 +1598,6 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return 0; -} - -static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) -{ - return false; -} - /* No ->nocb_lock to acquire. */ static void rcu_nocb_lock(struct rcu_data *rdp) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c569da65b421..1c7cbd145d5e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -24,10 +24,11 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) * timers have their own means of synchronization against the * offloaded state updaters. */ - RCU_LOCKDEP_WARN( + RCU_NOCB_LOCKDEP_WARN( !(lockdep_is_held(&rcu_state.barrier_mutex) || (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || - rcu_lockdep_is_held_nocb(rdp) || + lockdep_is_held(&rdp->nocb_lock) || + lockdep_is_held(&rcu_state.nocb_mutex) || (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) && rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), @@ -869,7 +870,7 @@ static void rcu_qs(void) /* * Register an urgently needed quiescent state. If there is an - * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * emergency, invoke rcu_momentary_eqs() to do a heavy-weight * dyntick-idle quiescent state visible to other CPUs, which will in * some cases serve for expedited as well as normal grace periods. * Either way, register a lightweight quiescent state. @@ -889,7 +890,7 @@ void rcu_all_qs(void) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); } rcu_qs(); @@ -909,7 +910,7 @@ void rcu_note_context_switch(bool preempt) goto out; this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); out: rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4b0e9d7c4c68..4432db6d0b99 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -7,8 +7,10 @@ * Author: Paul E. McKenney <paulmck@linux.ibm.com> */ +#include <linux/console.h> #include <linux/kvm_para.h> #include <linux/rcu_notifier.h> +#include <linux/smp.h> ////////////////////////////////////////////////////////////////////////////// // @@ -370,6 +372,7 @@ static void rcu_dump_cpu_stacks(void) struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) { + printk_deferred_enter(); raw_spin_lock_irqsave_rcu_node(rnp, flags); for_each_leaf_node_possible_cpu(rnp, cpu) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { @@ -379,6 +382,7 @@ static void rcu_dump_cpu_stacks(void) dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + printk_deferred_exit(); } } @@ -501,7 +505,7 @@ static void print_cpu_stall_info(int cpu) } delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && - rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu)); + rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu)); rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); if (rcuc_starved) // Print signed value, as negative values indicate a probable bug. @@ -515,8 +519,8 @@ static void print_cpu_stall_info(int cpu) rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : "!."[!delta], ticks_value, ticks_title, - ct_dynticks_cpu(cpu) & 0xffff, - ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), + ct_rcu_watching_cpu(cpu) & 0xffff, + ct_nesting_cpu(cpu), ct_nmi_nesting_cpu(cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, rcuc_starved ? buf : "", @@ -605,6 +609,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) if (rcu_stall_is_suppressed()) return; + nbcon_cpu_emergency_enter(); + /* * OK, time to rat on our buddy... * See Documentation/RCU/stallwarn.rst for info on how to debug @@ -657,6 +663,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); + nbcon_cpu_emergency_exit(); + panic_on_rcu_stall(); rcu_force_quiescent_state(); /* Kick them all. */ @@ -677,6 +685,8 @@ static void print_cpu_stall(unsigned long gps) if (rcu_stall_is_suppressed()) return; + nbcon_cpu_emergency_enter(); + /* * OK, time to rat on ourselves... * See Documentation/RCU/stallwarn.rst for info on how to debug @@ -706,6 +716,8 @@ static void print_cpu_stall(unsigned long gps) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + nbcon_cpu_emergency_exit(); + panic_on_rcu_stall(); /* @@ -719,6 +731,9 @@ static void print_cpu_stall(unsigned long gps) set_preempt_need_resched(); } +static bool csd_lock_suppress_rcu_stall; +module_param(csd_lock_suppress_rcu_stall, bool, 0644); + static void check_cpu_stall(struct rcu_data *rdp) { bool self_detected; @@ -791,7 +806,9 @@ static void check_cpu_stall(struct rcu_data *rdp) return; rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps); - if (self_detected) { + if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { + pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); + } else if (self_detected) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); } else { diff --git a/kernel/resource.c b/kernel/resource.c index 14777afb0a99..a83040fde236 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1826,8 +1826,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size, if (flags & GFR_DESCENDING) { resource_size_t end; - end = min_t(resource_size_t, base->end, - (1ULL << MAX_PHYSMEM_BITS) - 1); + end = min_t(resource_size_t, base->end, PHYSMEM_END); return end - size + 1; } @@ -1844,8 +1843,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr, * @size did not wrap 0. */ return addr > addr - size && - addr <= min_t(resource_size_t, base->end, - (1ULL << MAX_PHYSMEM_BITS) - 1); + addr <= min_t(resource_size_t, base->end, PHYSMEM_END); } static resource_size_t gfr_next(resource_size_t addr, resource_size_t size, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3951e4a55e5..1d7f5941bcdc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5762,7 +5762,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); - SCHED_WARN_ON(ct_state() == CONTEXT_USER); + SCHED_WARN_ON(ct_state() == CT_STATE_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -6658,7 +6658,7 @@ asmlinkage __visible void __sched schedule_user(void) * we find a better solution. * * NB: There are buggy callers of this function. Ideally we - * should warn if prev_state != CONTEXT_USER, but that will trigger + * should warn if prev_state != CT_STATE_USER, but that will trigger * too frequently to make sense yet. */ enum ctx_state prev_state = exception_enter(); @@ -9752,7 +9752,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { void dump_cpu_task(int cpu) { - if (cpu == smp_processor_id() && in_hardirq()) { + if (in_hardirq() && cpu == smp_processor_id()) { struct pt_regs *regs; regs = get_irq_regs(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9057584ec06d..8dc9385f6da4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12483,7 +12483,7 @@ out: * - indirectly from a remote scheduler_tick() for NOHZ idle balancing * through the SMP cross-call nohz_csd_func() */ -static __latent_entropy void sched_balance_softirq(struct softirq_action *h) +static __latent_entropy void sched_balance_softirq(void) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance; diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index ae1b42775ef9..195d2f2834a9 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -406,6 +406,14 @@ static void __setscheduler_params(struct task_struct *p, else if (fair_policy(policy)) p->static_prio = NICE_TO_PRIO(attr->sched_nice); + /* rt-policy tasks do not have a timerslack */ + if (task_is_realtime(p)) { + p->timer_slack_ns = 0; + } else if (p->timer_slack_ns == 0) { + /* when switching back to non-rt policy, restore timerslack */ + p->timer_slack_ns = p->default_timer_slack_ns; + } + /* * __sched_setscheduler() ensures attr->sched_priority == 0 when * !rt_policy. Always setting this ensures that things like diff --git a/kernel/signal.c b/kernel/signal.c index 60c737e423a1..6fe29715105b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -618,20 +618,18 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, } /* - * Dequeue a signal and return the element to the caller, which is - * expected to free it. - * - * All callers have to hold the siglock. + * Try to dequeue a signal. If a deliverable signal is found fill in the + * caller provided siginfo and return the signal number. Otherwise return + * 0. */ -int dequeue_signal(struct task_struct *tsk, sigset_t *mask, - kernel_siginfo_t *info, enum pid_type *type) +int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type) { + struct task_struct *tsk = current; bool resched_timer = false; int signr; - /* We only dequeue private signals from ourselves, we don't let - * signalfd steal them - */ + lockdep_assert_held(&tsk->sighand->siglock); + *type = PIDTYPE_PID; signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer); if (!signr) { @@ -1940,10 +1938,11 @@ struct sigqueue *sigqueue_alloc(void) void sigqueue_free(struct sigqueue *q) { - unsigned long flags; spinlock_t *lock = ¤t->sighand->siglock; + unsigned long flags; - BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); + if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC))) + return; /* * We must hold ->siglock while testing q->list * to serialize with collect_signal() or with @@ -1971,7 +1970,10 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) unsigned long flags; int ret, result; - BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); + if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC))) + return 0; + if (WARN_ON_ONCE(q->info.si_code != SI_TIMER)) + return 0; ret = -1; rcu_read_lock(); @@ -2006,7 +2008,6 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) * If an SI_TIMER entry is already queue just increment * the overrun count. */ - BUG_ON(q->info.si_code != SI_TIMER); q->info.si_overrun++; result = TRACE_SIGNAL_ALREADY_PENDING; goto out; @@ -2793,8 +2794,7 @@ relock: type = PIDTYPE_PID; signr = dequeue_synchronous_signal(&ksig->info); if (!signr) - signr = dequeue_signal(current, ¤t->blocked, - &ksig->info, &type); + signr = dequeue_signal(¤t->blocked, &ksig->info, &type); if (!signr) break; /* will return 0 */ @@ -2888,6 +2888,8 @@ relock: current->flags |= PF_SIGNALED; if (sig_kernel_coredump(signr)) { + int ret; + if (print_fatal_signals) print_fatal_signal(signr); proc_coredump_connector(current); @@ -2899,7 +2901,24 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(&ksig->info); + ret = do_coredump(&ksig->info); + if (ret) + coredump_report_failure("coredump has not been created, error %d", + ret); + else if (!IS_ENABLED(CONFIG_COREDUMP)) { + /* + * Coredumps are not available, can't fail collecting + * the coredump. + * + * Leave a note though that the coredump is going to be + * not created. This is not an error or a warning as disabling + * support in the kernel for coredumps isn't commonplace, and + * the user must've built the kernel with the custom config so + * let them know all works as desired. + */ + coredump_report("no coredump collected as " + "that is disabled in the kernel configuration"); + } } /* @@ -3648,7 +3667,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, signotset(&mask); spin_lock_irq(&tsk->sighand->siglock); - sig = dequeue_signal(tsk, &mask, info, &type); + sig = dequeue_signal(&mask, info, &type); if (!sig && timeout) { /* * None ready, temporarily unblock those we're interested @@ -3667,7 +3686,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); - sig = dequeue_signal(tsk, &mask, info, &type); + sig = dequeue_signal(&mask, info, &type); } spin_unlock_irq(&tsk->sighand->siglock); diff --git a/kernel/smp.c b/kernel/smp.c index aaffecdad319..f25e20617b7e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -208,12 +208,25 @@ static int csd_lock_wait_getcpu(call_single_data_t *csd) return -1; } +static atomic_t n_csd_lock_stuck; + +/** + * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long? + * + * Returns @true if a CSD-lock acquisition is stuck and has been stuck + * long enough for a "non-responsive CSD lock" message to be printed. + */ +bool csd_lock_is_stuck(void) +{ + return !!atomic_read(&n_csd_lock_stuck); +} + /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ -static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages) { int cpu = -1; int cpux; @@ -229,15 +242,26 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in cpu = csd_lock_wait_getcpu(csd); pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", *bug_id, raw_smp_processor_id(), cpu); + atomic_dec(&n_csd_lock_stuck); return true; } ts2 = sched_clock(); /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; - if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0)) + if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) * + (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) || + csd_lock_timeout_ns == 0)) return false; + if (ts0 > ts2) { + /* Our own sched_clock went backward; don't blame another CPU. */ + ts_delta = ts0 - ts2; + pr_alert("sched_clock on CPU %d went backward by %llu ns\n", raw_smp_processor_id(), ts_delta); + *ts1 = ts2; + return false; + } + firsttime = !*bug_id; if (firsttime) *bug_id = atomic_inc_return(&csd_bug_count); @@ -249,9 +273,12 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ /* How long since this CSD lock was stuck. */ ts_delta = ts2 - ts0; - pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", - firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta, + pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", + firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); + (*nmessages)++; + if (firsttime) + atomic_inc(&n_csd_lock_stuck); /* * If the CSD lock is still stuck after 5 minutes, it is unlikely * to become unstuck. Use a signed comparison to avoid triggering @@ -290,12 +317,13 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in */ static void __csd_lock_wait(call_single_data_t *csd) { + unsigned long nmessages = 0; int bug_id = 0; u64 ts0, ts1; ts1 = ts0 = sched_clock(); for (;;) { - if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id)) + if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages)) break; cpu_relax(); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 02582017759a..d082e7840f88 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -551,7 +551,7 @@ restart: kstat_incr_softirqs_this_cpu(vec_nr); trace_softirq_entry(vec_nr); - h->action(h); + h->action(); trace_softirq_exit(vec_nr); if (unlikely(prev_count != preempt_count())) { pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", @@ -700,7 +700,7 @@ void __raise_softirq_irqoff(unsigned int nr) or_softirq_pending(1UL << nr); } -void open_softirq(int nr, void (*action)(struct softirq_action *)) +void open_softirq(int nr, void (*action)(void)) { softirq_vec[nr].action = action; } @@ -760,8 +760,7 @@ static bool tasklet_clear_sched(struct tasklet_struct *t) return false; } -static void tasklet_action_common(struct softirq_action *a, - struct tasklet_head *tl_head, +static void tasklet_action_common(struct tasklet_head *tl_head, unsigned int softirq_nr) { struct tasklet_struct *list; @@ -805,16 +804,16 @@ static void tasklet_action_common(struct softirq_action *a, } } -static __latent_entropy void tasklet_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(void) { workqueue_softirq_action(false); - tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); + tasklet_action_common(this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); } -static __latent_entropy void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_hi_action(void) { workqueue_softirq_action(true); - tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); + tasklet_action_common(this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } void tasklet_setup(struct tasklet_struct *t, diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index cedb17ba158a..da821ce258ea 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -251,7 +251,7 @@ static int multi_cpu_stop(void *data) */ touch_nmi_watchdog(); } - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); } while (curstate != MULTI_STOP_EXIT); local_irq_restore(flags); diff --git a/kernel/sys.c b/kernel/sys.c index 3a2df1bd9f64..e3c4cffb520c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2557,6 +2557,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: + if (task_is_realtime(current)) + break; if (arg2 <= 0) current->timer_slack_ns = current->default_timer_slack_ns; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5abfa4390673..8bf888641694 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -493,7 +493,7 @@ static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throt * promised in the context of posix_timer_fn() never * materialized, but someone should really work on it. * - * To prevent DOS fake @now to be 1 jiffie out which keeps + * To prevent DOS fake @now to be 1 jiffy out which keeps * the overrun accounting correct but creates an * inconsistency vs. timer_gettime(2). */ @@ -574,15 +574,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, it.alarm.alarmtimer); enum alarmtimer_restart result = ALARMTIMER_NORESTART; unsigned long flags; - int si_private = 0; spin_lock_irqsave(&ptr->it_lock, flags); - ptr->it_active = 0; - if (ptr->it_interval) - si_private = ++ptr->it_requeue_pending; - - if (posix_timer_event(ptr, si_private) && ptr->it_interval) { + if (posix_timer_queue_signal(ptr) && ptr->it_interval) { /* * Handle ignored signals and rearm the timer. This will go * away once we handle ignored signals proper. Ensure that diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 60a6484831b1..78c7bd64d0dd 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -190,7 +190,7 @@ int clockevents_tick_resume(struct clock_event_device *dev) #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST -/* Limit min_delta to a jiffie */ +/* Limit min_delta to a jiffy */ #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) /** diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d0538a75f4c6..23336eecb4f4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -113,7 +113,6 @@ static u64 suspend_start; /* * Threshold: 0.0312s, when doubled: 0.0625s. - * Also a default for cs->uncertainty_margin when registering clocks. */ #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5) @@ -125,6 +124,13 @@ static u64 suspend_start; * * The default of 500 parts per million is based on NTP's limits. * If a clocksource is good enough for NTP, it is good enough for us! + * + * In other words, by default, even if a clocksource is extremely + * precise (for example, with a sub-nanosecond period), the maximum + * permissible skew between the clocksource watchdog and the clocksource + * under test is not permitted to go below the 500ppm minimum defined + * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the + * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option. */ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US #define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US @@ -132,6 +138,13 @@ static u64 suspend_start; #define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ) #endif +/* + * Default for maximum permissible skew when cs->uncertainty_margin is + * not specified, and the lower bound even when cs->uncertainty_margin + * is specified. This is also the default that is used when registering + * clocks with unspecifed cs->uncertainty_margin, so this macro is used + * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels. + */ #define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) #ifdef CONFIG_CLOCKSOURCE_WATCHDOG @@ -231,6 +244,7 @@ enum wd_read_status { static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) { + int64_t md = 2 * watchdog->uncertainty_margin; unsigned int nretries, max_retries; int64_t wd_delay, wd_seq_delay; u64 wd_end, wd_end2; @@ -245,7 +259,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, local_irq_enable(); wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end); - if (wd_delay <= WATCHDOG_MAX_SKEW) { + if (wd_delay <= md + cs->uncertainty_margin) { if (nretries > 1 && nretries >= max_retries) { pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", smp_processor_id(), watchdog->name, nretries); @@ -258,12 +272,12 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, * there is too much external interferences that cause * significant delay in reading both clocksource and watchdog. * - * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2, - * report system busy, reinit the watchdog and skip the current + * If consecutive WD read-back delay > md, report + * system busy, reinit the watchdog and skip the current * watchdog test. */ wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2); - if (wd_seq_delay > WATCHDOG_MAX_SKEW/2) + if (wd_seq_delay > md) goto skip_test; } @@ -1146,14 +1160,19 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq } /* - * If the uncertainty margin is not specified, calculate it. - * If both scale and freq are non-zero, calculate the clock - * period, but bound below at 2*WATCHDOG_MAX_SKEW. However, - * if either of scale or freq is zero, be very conservative and - * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the - * uncertainty margin. Allow stupidly small uncertainty margins - * to be specified by the caller for testing purposes, but warn - * to discourage production use of this capability. + * If the uncertainty margin is not specified, calculate it. If + * both scale and freq are non-zero, calculate the clock period, but + * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default. + * However, if either of scale or freq is zero, be very conservative + * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value + * for the uncertainty margin. Allow stupidly small uncertainty + * margins to be specified by the caller for testing purposes, + * but warn to discourage production use of this capability. + * + * Bottom line: The sum of the uncertainty margins of the + * watchdog clocksource and the clocksource under test will be at + * least 500ppm by default. For more information, please see the + * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above. */ if (scale && freq && !cs->uncertainty_margin) { cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index b8ee320208d4..12eb40d6290e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1177,7 +1177,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution - * (i.e. one jiffie) to prevent short timeouts. + * (i.e. one jiffy) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) @@ -1351,11 +1351,13 @@ static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) } static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) + __acquires(&base->softirq_expiry_lock) { spin_lock(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) + __releases(&base->softirq_expiry_lock) { spin_unlock(&base->softirq_expiry_lock); } @@ -1757,7 +1759,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, } } -static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) +static __latent_entropy void hrtimer_run_softirq(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; @@ -2072,14 +2074,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - u64 slack; - - slack = current->timer_slack_ns; - if (rt_task(current)) - slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); + hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; @@ -2249,7 +2246,7 @@ void __init hrtimers_init(void) /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks + * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * @clock_id: timer clock to be used */ @@ -2276,13 +2273,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return -EINTR; } - /* - * Override any slack passed by the user if under - * rt contraints. - */ - if (rt_task(current)) - delta = 0; - hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); @@ -2302,7 +2292,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks + * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8d2dd214ec68..802b336f4b8c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -660,9 +660,17 @@ rearm: sched_sync_hw_clock(offset_nsec, res != 0); } -void ntp_notify_cmos_timer(void) +void ntp_notify_cmos_timer(bool offset_set) { /* + * If the time jumped (using ADJ_SETOFFSET) cancels sync timer, + * which may have been running if the time was synchronized + * prior to the ADJ_SETOFFSET call. + */ + if (offset_set) + hrtimer_cancel(&sync_hrtimer); + + /* * When the work is currently executed but has not yet the timer * rearmed this queues the work immediately again. No big issue, * just a pointless work scheduled. diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 23d1b74c3065..5a633dce9057 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -14,9 +14,9 @@ extern int __do_adjtimex(struct __kernel_timex *txc, extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) -extern void ntp_notify_cmos_timer(void); +extern void ntp_notify_cmos_timer(bool offset_set); #else -static inline void ntp_notify_cmos_timer(void) { } +static inline void ntp_notify_cmos_timer(bool offset_set) { } #endif #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e9c6f9d0e42c..6bcee4704059 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -453,6 +453,7 @@ static void disarm_timer(struct k_itimer *timer, struct task_struct *p) struct cpu_timer *ctmr = &timer->it.cpu; struct posix_cputimer_base *base; + timer->it_active = 0; if (!cpu_timer_dequeue(ctmr)) return; @@ -559,6 +560,7 @@ static void arm_timer(struct k_itimer *timer, struct task_struct *p) struct cpu_timer *ctmr = &timer->it.cpu; u64 newexp = cpu_timer_getexpires(ctmr); + timer->it_active = 1; if (!cpu_timer_enqueue(&base->tqhead, ctmr)) return; @@ -584,12 +586,8 @@ static void cpu_timer_fire(struct k_itimer *timer) { struct cpu_timer *ctmr = &timer->it.cpu; - if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - /* - * User don't want any signal. - */ - cpu_timer_setexpires(ctmr, 0); - } else if (unlikely(timer->sigq == NULL)) { + timer->it_active = 0; + if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. @@ -600,9 +598,9 @@ static void cpu_timer_fire(struct k_itimer *timer) /* * One-shot timer. Clear it as soon as it's fired. */ - posix_timer_event(timer, 0); + posix_timer_queue_signal(timer); cpu_timer_setexpires(ctmr, 0); - } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { + } else if (posix_timer_queue_signal(timer)) { /* * The signal did not get queued because the signal * was ignored, so we won't get any callback to @@ -614,6 +612,8 @@ static void cpu_timer_fire(struct k_itimer *timer) } } +static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now); + /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -623,9 +623,10 @@ static void cpu_timer_fire(struct k_itimer *timer) static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, struct itimerspec64 *new, struct itimerspec64 *old) { + bool sigev_none = timer->it_sigev_notify == SIGEV_NONE; clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - u64 old_expires, new_expires, old_incr, val; struct cpu_timer *ctmr = &timer->it.cpu; + u64 old_expires, new_expires, now; struct sighand_struct *sighand; struct task_struct *p; unsigned long flags; @@ -662,10 +663,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, return -ESRCH; } - /* - * Disarm any old timer after extracting its expiry time. - */ - old_incr = timer->it_interval; + /* Retrieve the current expiry time before disarming the timer */ old_expires = cpu_timer_getexpires(ctmr); if (unlikely(timer->it.cpu.firing)) { @@ -673,157 +671,122 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, ret = TIMER_RETRY; } else { cpu_timer_dequeue(ctmr); + timer->it_active = 0; } /* - * We need to sample the current value to convert the new - * value from to relative and absolute, and to convert the - * old value from absolute to relative. To set a process - * timer, we need a sample to balance the thread expiry - * times (in arm_timer). With an absolute time, we must - * check if it's already passed. In short, we need a sample. + * Sample the current clock for saving the previous setting + * and for rearming the timer. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) - val = cpu_clock_sample(clkid, p); + now = cpu_clock_sample(clkid, p); else - val = cpu_clock_sample_group(clkid, p, true); + now = cpu_clock_sample_group(clkid, p, !sigev_none); + /* Retrieve the previous expiry value if requested. */ if (old) { - if (old_expires == 0) { - old->it_value.tv_sec = 0; - old->it_value.tv_nsec = 0; - } else { - /* - * Update the timer in case it has overrun already. - * If it has, we'll report it as having overrun and - * with the next reloaded timer already ticking, - * though we are swallowing that pending - * notification here to install the new setting. - */ - u64 exp = bump_cpu_timer(timer, val); - - if (val < exp) { - old_expires = exp - val; - old->it_value = ns_to_timespec64(old_expires); - } else { - old->it_value.tv_nsec = 1; - old->it_value.tv_sec = 0; - } - } + old->it_value = (struct timespec64){ }; + if (old_expires) + __posix_cpu_timer_get(timer, old, now); } + /* Retry if the timer expiry is running concurrently */ if (unlikely(ret)) { - /* - * We are colliding with the timer actually firing. - * Punt after filling in the timer's old value, and - * disable this firing since we are already reporting - * it as an overrun (thanks to bump_cpu_timer above). - */ unlock_task_sighand(p, &flags); goto out; } - if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) { - new_expires += val; - } + /* Convert relative expiry time to absolute */ + if (new_expires && !(timer_flags & TIMER_ABSTIME)) + new_expires += now; + + /* Set the new expiry time (might be 0) */ + cpu_timer_setexpires(ctmr, new_expires); /* - * Install the new expiry time (or zero). - * For a timer with no notification action, we don't actually - * arm the timer (we'll just fake it for timer_gettime). + * Arm the timer if it is not disabled, the new expiry value has + * not yet expired and the timer requires signal delivery. + * SIGEV_NONE timers are never armed. In case the timer is not + * armed, enforce the reevaluation of the timer base so that the + * process wide cputime counter can be disabled eventually. */ - cpu_timer_setexpires(ctmr, new_expires); - if (new_expires != 0 && val < new_expires) { - arm_timer(timer, p); + if (likely(!sigev_none)) { + if (new_expires && now < new_expires) + arm_timer(timer, p); + else + trigger_base_recalc_expires(timer, p); } unlock_task_sighand(p, &flags); + + posix_timer_set_common(timer, new); + /* - * Install the new reload setting, and - * set up the signal and overrun bookkeeping. + * If the new expiry time was already in the past the timer was not + * queued. Fire it immediately even if the thread never runs to + * accumulate more time on this clock. */ - timer->it_interval = timespec64_to_ktime(new->it_interval); + if (!sigev_none && new_expires && now >= new_expires) + cpu_timer_fire(timer); +out: + rcu_read_unlock(); + return ret; +} + +static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now) +{ + bool sigev_none = timer->it_sigev_notify == SIGEV_NONE; + u64 expires, iv = timer->it_interval; /* - * This acts as a modification timestamp for the timer, - * so any automatic reload attempt will punt on seeing - * that we have reset the timer manually. + * Make sure that interval timers are moved forward for the + * following cases: + * - SIGEV_NONE timers which are never armed + * - Timers which expired, but the signal has not yet been + * delivered */ - timer->it_requeue_pending = (timer->it_requeue_pending + 2) & - ~REQUEUE_PENDING; - timer->it_overrun_last = 0; - timer->it_overrun = -1; - - if (val >= new_expires) { - if (new_expires != 0) { - /* - * The designated time already passed, so we notify - * immediately, even if the thread never runs to - * accumulate more time on this clock. - */ - cpu_timer_fire(timer); - } + if (iv && ((timer->it_requeue_pending & REQUEUE_PENDING) || sigev_none)) + expires = bump_cpu_timer(timer, now); + else + expires = cpu_timer_getexpires(&timer->it.cpu); + /* + * Expired interval timers cannot have a remaining time <= 0. + * The kernel has to move them forward so that the next + * timer expiry is > @now. + */ + if (now < expires) { + itp->it_value = ns_to_timespec64(expires - now); + } else { /* - * Make sure we don't keep around the process wide cputime - * counter or the tick dependency if they are not necessary. + * A single shot SIGEV_NONE timer must return 0, when it is + * expired! Timers which have a real signal delivery mode + * must return a remaining time greater than 0 because the + * signal has not yet been delivered. */ - sighand = lock_task_sighand(p, &flags); - if (!sighand) - goto out; - - if (!cpu_timer_queued(ctmr)) - trigger_base_recalc_expires(timer, p); - - unlock_task_sighand(p, &flags); + if (!sigev_none) + itp->it_value.tv_nsec = 1; } - out: - rcu_read_unlock(); - if (old) - old->it_interval = ns_to_timespec64(old_incr); - - return ret; } static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) { clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - struct cpu_timer *ctmr = &timer->it.cpu; - u64 now, expires = cpu_timer_getexpires(ctmr); struct task_struct *p; + u64 now; rcu_read_lock(); p = cpu_timer_task_rcu(timer); - if (!p) - goto out; + if (p && cpu_timer_getexpires(&timer->it.cpu)) { + itp->it_interval = ktime_to_timespec64(timer->it_interval); - /* - * Easy part: convert the reload time. - */ - itp->it_interval = ktime_to_timespec64(timer->it_interval); - - if (!expires) - goto out; - - /* - * Sample the clock to take the difference with the expiry time. - */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) - now = cpu_clock_sample(clkid, p); - else - now = cpu_clock_sample_group(clkid, p, false); + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + now = cpu_clock_sample(clkid, p); + else + now = cpu_clock_sample_group(clkid, p, false); - if (now < expires) { - itp->it_value = ns_to_timespec64(expires - now); - } else { - /* - * The timer should have expired already, but the firing - * hasn't taken place yet. Say it's just about to expire. - */ - itp->it_value.tv_nsec = 1; - itp->it_value.tv_sec = 0; + __posix_cpu_timer_get(timer, itp, now); } -out: rcu_read_unlock(); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b924f0f096fa..4576aaed13b2 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -277,10 +277,17 @@ void posixtimer_rearm(struct kernel_siginfo *info) unlock_timer(timr, flags); } -int posix_timer_event(struct k_itimer *timr, int si_private) +int posix_timer_queue_signal(struct k_itimer *timr) { + int ret, si_private = 0; enum pid_type type; - int ret; + + lockdep_assert_held(&timr->it_lock); + + timr->it_active = 0; + if (timr->it_interval) + si_private = ++timr->it_requeue_pending; + /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->posixtimer_rearm(). @@ -309,19 +316,13 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { + struct k_itimer *timr = container_of(timer, struct k_itimer, it.real.timer); enum hrtimer_restart ret = HRTIMER_NORESTART; - struct k_itimer *timr; unsigned long flags; - int si_private = 0; - timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); - timr->it_active = 0; - if (timr->it_interval != 0) - si_private = ++timr->it_requeue_pending; - - if (posix_timer_event(timr, si_private)) { + if (posix_timer_queue_signal(timr)) { /* * The signal was not queued due to SIG_IGN. As a * consequence the timer is not going to be rearmed from @@ -338,14 +339,14 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) * change to the signal handling code. * * For now let timers with an interval less than a - * jiffie expire every jiffie and recheck for a + * jiffy expire every jiffy and recheck for a * valid signal handler. * * This avoids interrupt starvation in case of a * very small interval, which would expire the * timer immediately again. * - * Moving now ahead of time by one jiffie tricks + * Moving now ahead of time by one jiffy tricks * hrtimer_forward() to expire the timer later, * while it still maintains the overrun accuracy * for the price of a slight inconsistency in the @@ -515,7 +516,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, spin_lock_irq(¤t->sighand->siglock); /* This makes the timer valid in the hash table */ WRITE_ONCE(new_timer->it_signal, current->signal); - list_add(&new_timer->list, ¤t->signal->posix_timers); + hlist_add_head(&new_timer->list, ¤t->signal->posix_timers); spin_unlock_irq(¤t->sighand->siglock); /* * After unlocking sighand::siglock @new_timer is subject to @@ -856,6 +857,23 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer, return lock_timer(timer_id, flags); } +/* + * Set up the new interval and reset the signal delivery data + */ +void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_setting) +{ + if (new_setting->it_value.tv_sec || new_setting->it_value.tv_nsec) + timer->it_interval = timespec64_to_ktime(new_setting->it_interval); + else + timer->it_interval = 0; + + /* Prevent reloading in case there is a signal pending */ + timer->it_requeue_pending = (timer->it_requeue_pending + 2) & ~REQUEUE_PENDING; + /* Reset overrun accounting */ + timer->it_overrun_last = 0; + timer->it_overrun = -1LL; +} + /* Set a POSIX.1b interval timer. */ int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, @@ -878,15 +896,12 @@ int common_timer_set(struct k_itimer *timr, int flags, return TIMER_RETRY; timr->it_active = 0; - timr->it_requeue_pending = (timr->it_requeue_pending + 2) & - ~REQUEUE_PENDING; - timr->it_overrun_last = 0; + posix_timer_set_common(timr, new_setting); - /* Switch off the timer when it_value is zero */ + /* Keep timer disarmed when it_value is zero */ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); if (flags & TIMER_ABSTIME) expires = timens_ktime_to_host(timr->it_clock, expires); @@ -904,7 +919,7 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, const struct k_clock *kc; struct k_itimer *timr; unsigned long flags; - int error = 0; + int error; if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) @@ -918,6 +933,9 @@ retry: if (!timr) return -EINVAL; + if (old_spec64) + old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; @@ -1021,7 +1039,7 @@ retry_delete: } spin_lock(¤t->sighand->siglock); - list_del(&timer->list); + hlist_del(&timer->list); spin_unlock(¤t->sighand->siglock); /* * A concurrent lookup could check timer::it_signal lockless. It @@ -1071,7 +1089,7 @@ retry_delete: goto retry_delete; } - list_del(&timer->list); + hlist_del(&timer->list); /* * Setting timer::it_signal to NULL is technically not required @@ -1092,22 +1110,19 @@ retry_delete: */ void exit_itimers(struct task_struct *tsk) { - struct list_head timers; - struct k_itimer *tmr; + struct hlist_head timers; - if (list_empty(&tsk->signal->posix_timers)) + if (hlist_empty(&tsk->signal->posix_timers)) return; /* Protect against concurrent read via /proc/$PID/timers */ spin_lock_irq(&tsk->sighand->siglock); - list_replace_init(&tsk->signal->posix_timers, &timers); + hlist_move_list(&tsk->signal->posix_timers, &timers); spin_unlock_irq(&tsk->sighand->siglock); /* The timers are not longer accessible via tsk::signal */ - while (!list_empty(&timers)) { - tmr = list_first_entry(&timers, struct k_itimer, list); - itimer_delete(tmr); - } + while (!hlist_empty(&timers)) + itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); } SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index f32a2ebba9b8..4784ea65f685 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -36,10 +36,11 @@ extern const struct k_clock clock_process; extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; -int posix_timer_event(struct k_itimer *timr, int si_private); +int posix_timer_queue_signal(struct k_itimer *timr); void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting); int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, struct itimerspec64 *old_setting); +void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_setting); int common_timer_del(struct k_itimer *timer); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5391e4167d60..7e6f409bf311 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2553,6 +2553,7 @@ int do_adjtimex(struct __kernel_timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; struct audit_ntp_data ad; + bool offset_set = false; bool clock_set = false; struct timespec64 ts; unsigned long flags; @@ -2575,6 +2576,7 @@ int do_adjtimex(struct __kernel_timex *txc) if (ret) return ret; + offset_set = delta.tv_sec != 0; audit_tk_injoffset(delta); } @@ -2608,7 +2610,7 @@ int do_adjtimex(struct __kernel_timex *txc) if (clock_set) clock_was_set(CLOCK_SET_WALL); - ntp_notify_cmos_timer(); + ntp_notify_cmos_timer(offset_set); return ret; } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 64b0d8a0aa0f..0fc9d066a7be 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -365,7 +365,7 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, rem = j % HZ; /* - * If the target jiffie is just after a whole second (which can happen + * If the target jiffy is just after a whole second (which can happen * due to delays of the timer irq, long irq off times etc etc) then * we should round down to the whole second, not up. Use 1/4th second * as cutoff for this rounding as an extreme upper bound for this. @@ -672,7 +672,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, * Set the next expiry time and kick the CPU so it * can reevaluate the wheel: */ - base->next_expiry = bucket_expiry; + WRITE_ONCE(base->next_expiry, bucket_expiry); base->timers_pending = true; base->next_expiry_recalc = false; trigger_dyntick_cpu(base, timer); @@ -1561,6 +1561,8 @@ static inline void timer_base_unlock_expiry(struct timer_base *base) * the waiter to acquire the lock and make progress. */ static void timer_sync_wait_running(struct timer_base *base) + __releases(&base->lock) __releases(&base->expiry_lock) + __acquires(&base->expiry_lock) __acquires(&base->lock) { if (atomic_read(&base->timer_waiters)) { raw_spin_unlock_irq(&base->lock); @@ -1898,7 +1900,7 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset, * * Store next expiry time in base->next_expiry. */ -static void next_expiry_recalc(struct timer_base *base) +static void timer_recalc_next_expiry(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; @@ -1928,7 +1930,7 @@ static void next_expiry_recalc(struct timer_base *base) * bits are zero, we look at the next level as is. If not we * need to advance it by one because that's going to be the * next expiring bucket in that level. base->clk is the next - * expiring jiffie. So in case of: + * expiring jiffy. So in case of: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 0 @@ -1964,7 +1966,7 @@ static void next_expiry_recalc(struct timer_base *base) clk += adj; } - base->next_expiry = next; + WRITE_ONCE(base->next_expiry, next); base->next_expiry_recalc = false; base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); } @@ -1993,7 +1995,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) return basem; /* - * Round up to the next jiffie. High resolution timers are + * Round up to the next jiffy. High resolution timers are * off, so the hrtimers are expired in the tick and we need to * make sure that this tick really expires the timer to avoid * a ping pong of the nohz stop code. @@ -2007,7 +2009,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base, unsigned long basej) { if (base->next_expiry_recalc) - next_expiry_recalc(base); + timer_recalc_next_expiry(base); /* * Move next_expiry for the empty base into the future to prevent an @@ -2018,7 +2020,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base, * easy comparable to find out which base holds the first pending timer. */ if (!base->timers_pending) - base->next_expiry = basej + NEXT_TIMER_MAX_DELTA; + WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA); return base->next_expiry; } @@ -2252,7 +2254,7 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, base_global, &tevt); /* - * If the next event is only one jiffie ahead there is no need to call + * If the next event is only one jiffy ahead there is no need to call * timer migration hierarchy related functions. The value for the next * global timer in @tevt struct equals then KTIME_MAX. This is also * true, when the timer base is idle. @@ -2411,7 +2413,7 @@ static inline void __run_timers(struct timer_base *base) * jiffies to avoid endless requeuing to current jiffies. */ base->clk++; - next_expiry_recalc(base); + timer_recalc_next_expiry(base); while (levels--) expire_timers(base, heads + levels); @@ -2440,7 +2442,7 @@ static void run_timer_base(int index) /* * This function runs timers and the timer-tq in bottom half context. */ -static __latent_entropy void run_timer_softirq(struct softirq_action *h) +static __latent_entropy void run_timer_softirq(void) { run_timer_base(BASE_LOCAL); if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) { @@ -2462,8 +2464,40 @@ static void run_local_timers(void) hrtimer_run_queues(); for (int i = 0; i < NR_BASES; i++, base++) { - /* Raise the softirq only if required. */ - if (time_after_eq(jiffies, base->next_expiry) || + /* + * Raise the softirq only if required. + * + * timer_base::next_expiry can be written by a remote CPU while + * holding the lock. If this write happens at the same time than + * the lockless local read, sanity checker could complain about + * data corruption. + * + * There are two possible situations where + * timer_base::next_expiry is written by a remote CPU: + * + * 1. Remote CPU expires global timers of this CPU and updates + * timer_base::next_expiry of BASE_GLOBAL afterwards in + * next_timer_interrupt() or timer_recalc_next_expiry(). The + * worst outcome is a superfluous raise of the timer softirq + * when the not yet updated value is read. + * + * 2. A new first pinned timer is enqueued by a remote CPU + * and therefore timer_base::next_expiry of BASE_LOCAL is + * updated. When this update is missed, this isn't a + * problem, as an IPI is executed nevertheless when the CPU + * was idle before. When the CPU wasn't idle but the update + * is missed, then the timer would expire one jiffy late - + * bad luck. + * + * Those unlikely corner cases where the worst outcome is only a + * one jiffy delay or a superfluous raise of the softirq are + * not that expensive as doing the check always while holding + * the lock. + * + * Possible remote writers are using WRITE_ONCE(). Local reader + * uses therefore READ_ONCE(). + */ + if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) || (i == BASE_DEF && tmigr_requires_handle_remote())) { raise_softirq(TIMER_SOFTIRQ); return; @@ -2730,7 +2764,7 @@ void __init init_timers(void) */ void msleep(unsigned int msecs) { - unsigned long timeout = msecs_to_jiffies(msecs) + 1; + unsigned long timeout = msecs_to_jiffies(msecs); while (timeout) timeout = schedule_timeout_uninterruptible(timeout); @@ -2744,7 +2778,7 @@ EXPORT_SYMBOL(msleep); */ unsigned long msleep_interruptible(unsigned int msecs) { - unsigned long timeout = msecs_to_jiffies(msecs) + 1; + unsigned long timeout = msecs_to_jiffies(msecs); while (timeout && !signal_pending(current)) timeout = schedule_timeout_interruptible(timeout); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cd098846e251..ac0a01cc8634 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3160,6 +3160,7 @@ struct bpf_uprobe { loff_t offset; unsigned long ref_ctr_offset; u64 cookie; + struct uprobe *uprobe; struct uprobe_consumer consumer; }; @@ -3178,15 +3179,15 @@ struct bpf_uprobe_multi_run_ctx { struct bpf_uprobe *uprobe; }; -static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes, - u32 cnt) +static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) { u32 i; - for (i = 0; i < cnt; i++) { - uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset, - &uprobes[i].consumer); - } + for (i = 0; i < cnt; i++) + uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer); + + if (cnt) + uprobe_unregister_sync(); } static void bpf_uprobe_multi_link_release(struct bpf_link *link) @@ -3194,7 +3195,7 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link) struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); - bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); + bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt); if (umulti_link->task) put_task_struct(umulti_link->task); path_put(&umulti_link->path); @@ -3322,8 +3323,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, } static bool -uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx, - struct mm_struct *mm) +uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm) { struct bpf_uprobe *uprobe; @@ -3480,22 +3480,26 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr &bpf_uprobe_multi_link_lops, prog); for (i = 0; i < cnt; i++) { - err = uprobe_register_refctr(d_real_inode(link->path.dentry), - uprobes[i].offset, - uprobes[i].ref_ctr_offset, - &uprobes[i].consumer); - if (err) { - bpf_uprobe_unregister(&path, uprobes, i); - goto error_free; + uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), + uprobes[i].offset, + uprobes[i].ref_ctr_offset, + &uprobes[i].consumer); + if (IS_ERR(uprobes[i].uprobe)) { + err = PTR_ERR(uprobes[i].uprobe); + link->cnt = i; + goto error_unregister; } } err = bpf_link_prime(&link->link, &link_primer); if (err) - goto error_free; + goto error_unregister; return bpf_link_settle(&link_primer); +error_unregister: + bpf_uprobe_unregister(uprobes, link->cnt); + error_free: kvfree(uprobes); kfree(link); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index d1d5ea2d0a1b..d7d4fb403f6f 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1206,18 +1206,24 @@ static void init_task_vars(int idx) read_unlock(&tasklist_lock); } -static void ftrace_graph_enable_direct(bool enable_branch) +static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *gops) { trace_func_graph_ent_t func = NULL; trace_func_graph_ret_t retfunc = NULL; int i; - for_each_set_bit(i, &fgraph_array_bitmask, - sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) { - func = fgraph_array[i]->entryfunc; - retfunc = fgraph_array[i]->retfunc; - fgraph_direct_gops = fgraph_array[i]; - } + if (gops) { + func = gops->entryfunc; + retfunc = gops->retfunc; + fgraph_direct_gops = gops; + } else { + for_each_set_bit(i, &fgraph_array_bitmask, + sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) { + func = fgraph_array[i]->entryfunc; + retfunc = fgraph_array[i]->retfunc; + fgraph_direct_gops = fgraph_array[i]; + } + } if (WARN_ON_ONCE(!func)) return; @@ -1256,8 +1262,6 @@ int register_ftrace_graph(struct fgraph_ops *gops) ret = -ENOSPC; goto out; } - - fgraph_array[i] = gops; gops->idx = i; ftrace_graph_active++; @@ -1266,7 +1270,7 @@ int register_ftrace_graph(struct fgraph_ops *gops) ftrace_graph_disable_direct(true); if (ftrace_graph_active == 1) { - ftrace_graph_enable_direct(false); + ftrace_graph_enable_direct(false, gops); register_pm_notifier(&ftrace_suspend_notifier); ret = start_graph_tracing(); if (ret) @@ -1281,14 +1285,15 @@ int register_ftrace_graph(struct fgraph_ops *gops) } else { init_task_vars(gops->idx); } - /* Always save the function, and reset at unregistering */ gops->saved_func = gops->entryfunc; ret = ftrace_startup_subops(&graph_ops, &gops->ops, command); + if (!ret) + fgraph_array[i] = gops; + error: if (ret) { - fgraph_array[i] = &fgraph_stub; ftrace_graph_active--; gops->saved_func = NULL; fgraph_lru_release_index(i); @@ -1324,7 +1329,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) ftrace_shutdown_subops(&graph_ops, &gops->ops, command); if (ftrace_graph_active == 1) - ftrace_graph_enable_direct(true); + ftrace_graph_enable_direct(true, NULL); else if (!ftrace_graph_active) ftrace_graph_disable_direct(false); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebe7ce2f5f4a..c3b2c7dfadef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2226,10 +2226,6 @@ static __init int init_trace_selftests(void) } core_initcall(init_trace_selftests); #else -static inline int run_tracer_selftest(struct tracer *type) -{ - return 0; -} static inline int do_run_tracer_selftest(struct tracer *type) { return 0; @@ -3958,6 +3954,8 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) break; entries++; ring_buffer_iter_advance(buf_iter); + /* This could be a big loop */ + cond_resched(); } per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 66a871553d4a..1439064f65d6 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -228,6 +228,11 @@ static inline struct osnoise_variables *this_cpu_osn_var(void) return this_cpu_ptr(&per_cpu_osnoise_var); } +/* + * Protect the interface. + */ +static struct mutex interface_lock; + #ifdef CONFIG_TIMERLAT_TRACER /* * Runtime information for the timer mode. @@ -259,14 +264,20 @@ static inline void tlat_var_reset(void) { struct timerlat_variables *tlat_var; int cpu; + + /* Synchronize with the timerlat interfaces */ + mutex_lock(&interface_lock); /* * So far, all the values are initialized as 0, so * zeroing the structure is perfect. */ for_each_cpu(cpu, cpu_online_mask) { tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + if (tlat_var->kthread) + hrtimer_cancel(&tlat_var->timer); memset(tlat_var, 0, sizeof(*tlat_var)); } + mutex_unlock(&interface_lock); } #else /* CONFIG_TIMERLAT_TRACER */ #define tlat_var_reset() do {} while (0) @@ -332,11 +343,6 @@ struct timerlat_sample { #endif /* - * Protect the interface. - */ -static struct mutex interface_lock; - -/* * Tracer data. */ static struct osnoise_data { @@ -1535,7 +1541,7 @@ static int run_osnoise(void) * This will eventually cause unwarranted noise as PREEMPT_RCU * will force preemption as the means of ending the current * grace period. We avoid this problem by calling - * rcu_momentary_dyntick_idle(), which performs a zero duration + * rcu_momentary_eqs(), which performs a zero duration * EQS allowing PREEMPT_RCU to end the current grace period. * This call shouldn't be wrapped inside an RCU critical * section. @@ -1547,7 +1553,7 @@ static int run_osnoise(void) if (!disable_irq) local_irq_disable(); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); if (!disable_irq) local_irq_enable(); @@ -1612,6 +1618,7 @@ out: static struct cpumask osnoise_cpumask; static struct cpumask save_cpumask; +static struct cpumask kthread_cpumask; /* * osnoise_sleep - sleep until the next period @@ -1675,6 +1682,7 @@ static inline int osnoise_migration_pending(void) */ mutex_lock(&interface_lock); this_cpu_osn_var()->kthread = NULL; + cpumask_clear_cpu(smp_processor_id(), &kthread_cpumask); mutex_unlock(&interface_lock); return 1; @@ -1945,11 +1953,16 @@ static void stop_kthread(unsigned int cpu) { struct task_struct *kthread; + mutex_lock(&interface_lock); kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; if (kthread) { - if (test_bit(OSN_WORKLOAD, &osnoise_options)) { + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + mutex_unlock(&interface_lock); + + if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) && + !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) { kthread_stop(kthread); - } else { + } else if (!WARN_ON(test_bit(OSN_WORKLOAD, &osnoise_options))) { /* * This is a user thread waiting on the timerlat_fd. We need * to close all users, and the best way to guarantee this is @@ -1958,8 +1971,8 @@ static void stop_kthread(unsigned int cpu) kill_pid(kthread->thread_pid, SIGKILL, 1); put_task_struct(kthread); } - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; } else { + mutex_unlock(&interface_lock); /* if no workload, just return */ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { /* @@ -1967,7 +1980,6 @@ static void stop_kthread(unsigned int cpu) */ per_cpu(per_cpu_osnoise_var, cpu).sampling = false; barrier(); - return; } } } @@ -1982,12 +1994,8 @@ static void stop_per_cpu_kthreads(void) { int cpu; - cpus_read_lock(); - - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) stop_kthread(cpu); - - cpus_read_unlock(); } /* @@ -2021,6 +2029,7 @@ static int start_kthread(unsigned int cpu) } per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; + cpumask_set_cpu(cpu, &kthread_cpumask); return 0; } @@ -2048,8 +2057,16 @@ static int start_per_cpu_kthreads(void) */ cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask); - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { + if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) { + struct task_struct *kthread; + + kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + if (!WARN_ON(!kthread)) + kthread_stop(kthread); + } per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + } for_each_cpu(cpu, current_mask) { retval = start_kthread(cpu); @@ -2579,7 +2596,8 @@ static int timerlat_fd_release(struct inode *inode, struct file *file) osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); - hrtimer_cancel(&tlat_var->timer); + if (tlat_var->kthread) + hrtimer_cancel(&tlat_var->timer); memset(tlat_var, 0, sizeof(*tlat_var)); osn_var->sampling = 0; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 97f1e4bc47dc..c4ad7cd7e778 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -942,7 +942,7 @@ static __init int test_graph_storage_multi(void) { struct fgraph_fixture *fixture; bool printed = false; - int i, ret; + int i, j, ret; pr_cont("PASSED\n"); pr_info("Testing multiple fgraph storage on a function: "); @@ -953,22 +953,35 @@ static __init int test_graph_storage_multi(void) if (ret && ret != -ENODEV) { pr_cont("*Could not set filter* "); printed = true; - goto out; + goto out2; } + } + for (j = 0; j < ARRAY_SIZE(store_bytes); j++) { + fixture = &store_bytes[j]; ret = register_ftrace_graph(&fixture->gops); if (ret) { pr_warn("Failed to init store_bytes fgraph tracing\n"); printed = true; - goto out; + goto out1; } } DYN_FTRACE_TEST_NAME(); -out: +out1: + while (--j >= 0) { + fixture = &store_bytes[j]; + unregister_ftrace_graph(&fixture->gops); + + if (fixture->error_str && !printed) { + pr_cont("*** %s ***", fixture->error_str); + printed = true; + } + } +out2: while (--i >= 0) { fixture = &store_bytes[i]; - unregister_ftrace_graph(&fixture->gops); + ftrace_free_filter(&fixture->gops.ops); if (fixture->error_str && !printed) { pr_cont("*** %s ***", fixture->error_str); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c98e3b3386ba..f7443e996b1b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -58,8 +58,8 @@ struct trace_uprobe { struct dyn_event devent; struct uprobe_consumer consumer; struct path path; - struct inode *inode; char *filename; + struct uprobe *uprobe; unsigned long offset; unsigned long ref_ctr_offset; unsigned long nhit; @@ -1078,43 +1078,40 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e return trace_handle_return(s); } -typedef bool (*filter_func_t)(struct uprobe_consumer *self, - enum uprobe_filter_ctx ctx, - struct mm_struct *mm); +typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm); static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) { - int ret; + struct inode *inode = d_real_inode(tu->path.dentry); + struct uprobe *uprobe; tu->consumer.filter = filter; - tu->inode = d_real_inode(tu->path.dentry); - - if (tu->ref_ctr_offset) - ret = uprobe_register_refctr(tu->inode, tu->offset, - tu->ref_ctr_offset, &tu->consumer); - else - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); + if (IS_ERR(uprobe)) + return PTR_ERR(uprobe); - if (ret) - tu->inode = NULL; - - return ret; + tu->uprobe = uprobe; + return 0; } static void __probe_event_disable(struct trace_probe *tp) { struct trace_uprobe *tu; + bool sync = false; tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - if (!tu->inode) + if (!tu->uprobe) continue; - uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->inode = NULL; + uprobe_unregister_nosync(tu->uprobe, &tu->consumer); + sync = true; + tu->uprobe = NULL; } + if (sync) + uprobe_unregister_sync(); } static int probe_event_enable(struct trace_event_call *call, @@ -1310,7 +1307,7 @@ static int uprobe_perf_close(struct trace_event_call *call, return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + ret = uprobe_apply(tu->uprobe, &tu->consumer, false); if (ret) break; } @@ -1334,7 +1331,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + err = uprobe_apply(tu->uprobe, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); break; @@ -1344,8 +1341,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return err; } -static bool uprobe_perf_filter(struct uprobe_consumer *uc, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static bool uprobe_perf_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { struct trace_uprobe_filter *filter; struct trace_uprobe *tu; @@ -1431,7 +1427,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, struct uprobe_cpu_buffer **ucbp) { - if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + if (!uprobe_perf_filter(&tu->consumer, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) diff --git a/kernel/user.c b/kernel/user.c index aa1162deafe4..f46b1d41163b 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -36,33 +36,33 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc); */ struct user_namespace init_user_ns = { .uid_map = { - .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, + .nr_extents = 1, }, }, .gid_map = { - .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, + .nr_extents = 1, }, }, .projid_map = { - .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, + .nr_extents = 1, }, }, .ns.count = REFCOUNT_INIT(3), diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e7b005ff3750..9949ffad8df0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -364,7 +364,8 @@ struct workqueue_struct { #ifdef CONFIG_LOCKDEP char *lock_name; struct lock_class_key key; - struct lockdep_map lockdep_map; + struct lockdep_map __lockdep_map; + struct lockdep_map *lockdep_map; #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ @@ -476,16 +477,13 @@ static bool wq_debug_force_rr_cpu = false; module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* to raise softirq for the BH worker pools on other CPUs */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], - bh_pool_irq_works); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], bh_pool_irq_works); /* the BH worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - bh_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], bh_worker_pools); /* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ @@ -2709,7 +2707,6 @@ static void detach_worker(struct worker *worker) unbind_worker(worker); list_del(&worker->node); - worker->pool = NULL; } /** @@ -2729,6 +2726,7 @@ static void worker_detach_from_pool(struct worker *worker) mutex_lock(&wq_pool_attach_mutex); detach_worker(worker); + worker->pool = NULL; mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ @@ -3203,7 +3201,7 @@ __acquires(&pool->lock) lockdep_start_depth = lockdep_depth(current); /* see drain_dead_softirq_workfn() */ if (!bh_draining) - lock_map_acquire(&pwq->wq->lockdep_map); + lock_map_acquire(pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); /* * Strictly speaking we should mark the invariant state without holding @@ -3237,7 +3235,7 @@ __acquires(&pool->lock) pwq->stats[PWQ_STAT_COMPLETED]++; lock_map_release(&lockdep_map); if (!bh_draining) - lock_map_release(&pwq->wq->lockdep_map); + lock_map_release(pwq->wq->lockdep_map); if (unlikely((worker->task && in_atomic()) || lockdep_depth(current) != lockdep_start_depth || @@ -3349,7 +3347,11 @@ woke_up: if (unlikely(worker->flags & WORKER_DIE)) { raw_spin_unlock_irq(&pool->lock); set_pf_worker(false); - + /* + * The worker is dead and PF_WQ_WORKER is cleared, worker->pool + * shouldn't be accessed, reset it to NULL in case otherwise. + */ + worker->pool = NULL; ida_free(&pool->worker_ida, worker->id); return 0; } @@ -3869,11 +3871,14 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, static void touch_wq_lockdep_map(struct workqueue_struct *wq) { #ifdef CONFIG_LOCKDEP + if (unlikely(!wq->lockdep_map)) + return; + if (wq->flags & WQ_BH) local_bh_disable(); - lock_map_acquire(&wq->lockdep_map); - lock_map_release(&wq->lockdep_map); + lock_map_acquire(wq->lockdep_map); + lock_map_release(wq->lockdep_map); if (wq->flags & WQ_BH) local_bh_enable(); @@ -3907,7 +3912,7 @@ void __flush_workqueue(struct workqueue_struct *wq) struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), .flush_color = -1, - .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map), + .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, (*wq->lockdep_map)), }; int next_color; @@ -4772,16 +4777,23 @@ static void wq_init_lockdep(struct workqueue_struct *wq) lock_name = wq->name; wq->lock_name = lock_name; - lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0); + wq->lockdep_map = &wq->__lockdep_map; + lockdep_init_map(wq->lockdep_map, lock_name, &wq->key, 0); } static void wq_unregister_lockdep(struct workqueue_struct *wq) { + if (wq->lockdep_map != &wq->__lockdep_map) + return; + lockdep_unregister_key(&wq->key); } static void wq_free_lockdep(struct workqueue_struct *wq) { + if (wq->lockdep_map != &wq->__lockdep_map) + return; + if (wq->lock_name != wq->name) kfree(wq->lock_name); } @@ -5615,12 +5627,10 @@ static void wq_adjust_max_active(struct workqueue_struct *wq) } while (activated); } -__printf(1, 4) -struct workqueue_struct *alloc_workqueue(const char *fmt, - unsigned int flags, - int max_active, ...) +static struct workqueue_struct *__alloc_workqueue(const char *fmt, + unsigned int flags, + int max_active, va_list args) { - va_list args; struct workqueue_struct *wq; size_t wq_size; int name_len; @@ -5652,9 +5662,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, goto err_free_wq; } - va_start(args, max_active); name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); - va_end(args); if (name_len >= WQ_NAME_LEN) pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", @@ -5684,12 +5692,11 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, INIT_LIST_HEAD(&wq->flusher_overflow); INIT_LIST_HEAD(&wq->maydays); - wq_init_lockdep(wq); INIT_LIST_HEAD(&wq->list); if (flags & WQ_UNBOUND) { if (alloc_node_nr_active(wq->node_nr_active) < 0) - goto err_unreg_lockdep; + goto err_free_wq; } /* @@ -5728,9 +5735,6 @@ err_unlock_free_node_nr_active: kthread_flush_worker(pwq_release_worker); free_node_nr_active(wq->node_nr_active); } -err_unreg_lockdep: - wq_unregister_lockdep(wq); - wq_free_lockdep(wq); err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); @@ -5741,8 +5745,49 @@ err_destroy: destroy_workqueue(wq); return NULL; } + +__printf(1, 4) +struct workqueue_struct *alloc_workqueue(const char *fmt, + unsigned int flags, + int max_active, ...) +{ + struct workqueue_struct *wq; + va_list args; + + va_start(args, max_active); + wq = __alloc_workqueue(fmt, flags, max_active, args); + va_end(args); + if (!wq) + return NULL; + + wq_init_lockdep(wq); + + return wq; +} EXPORT_SYMBOL_GPL(alloc_workqueue); +#ifdef CONFIG_LOCKDEP +__printf(1, 5) +struct workqueue_struct * +alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, + int max_active, struct lockdep_map *lockdep_map, ...) +{ + struct workqueue_struct *wq; + va_list args; + + va_start(args, lockdep_map); + wq = __alloc_workqueue(fmt, flags, max_active, args); + va_end(args); + if (!wq) + return NULL; + + wq->lockdep_map = lockdep_map; + + return wq; +} +EXPORT_SYMBOL_GPL(alloc_workqueue_lockdep_map); +#endif + static bool pwq_busy(struct pool_workqueue *pwq) { int i; @@ -7402,6 +7447,9 @@ static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; +static unsigned int wq_panic_on_stall; +module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644); + /* * Show workers that might prevent the processing of pending work items. * The only candidates are CPU-bound workers in the running state. @@ -7453,6 +7501,16 @@ static void show_cpu_pools_hogs(void) rcu_read_unlock(); } +static void panic_on_wq_watchdog(void) +{ + static unsigned int wq_stall; + + if (wq_panic_on_stall) { + wq_stall++; + BUG_ON(wq_stall >= wq_panic_on_stall); + } +} + static void wq_watchdog_reset_touched(void) { int cpu; @@ -7525,6 +7583,9 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) if (cpu_pool_stall) show_cpu_pools_hogs(); + if (lockup_detected) + panic_on_wq_watchdog(); + wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh); } |