diff options
Diffstat (limited to 'kernel')
67 files changed, 1478 insertions, 660 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index 986c8214dabf..179848ad33e9 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -84,7 +84,6 @@ static struct ctl_table kern_acct_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static __init int kernel_acct_sysctls_init(void) diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 4100df44c665..17067dcb4386 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -43,7 +43,7 @@ config BPF_JIT bool "Enable BPF Just In Time compiler" depends on BPF depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT - depends on MODULES + select EXECMEM help BPF programs are normally handled by a BPF interpreter. This option allows the kernel to generate native code when a program is loaded diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 733fca2634b7..1a6c3faa6e4a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -22,7 +22,6 @@ #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/random.h> -#include <linux/moduleloader.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/objtool.h> @@ -38,6 +37,7 @@ #include <linux/nospec.h> #include <linux/bpf_mem_alloc.h> #include <linux/memcontrol.h> +#include <linux/execmem.h> #include <asm/barrier.h> #include <asm/unaligned.h> @@ -1065,12 +1065,12 @@ void bpf_jit_uncharge_modmem(u32 size) void *__weak bpf_jit_alloc_exec(unsigned long size) { - return module_alloc(size); + return execmem_alloc(EXECMEM_BPF, size); } void __weak bpf_jit_free_exec(void *addr) { - module_memfree(addr); + execmem_free(addr); } struct bpf_binary_header * diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf6285760aea..2b7d3c96c7ea 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6035,7 +6035,6 @@ static struct ctl_table bpf_syscall_table[] = { .mode = 0644, .proc_handler = bpf_stats_handler, }, - { } }; static int __init bpf_syscall_sysctl_init(void) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 520a11cb12f4..b9dbf6bf2779 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1335,6 +1335,7 @@ static int __init cgroup_no_v1(char *str) continue; cgroup_no_v1_mask |= 1 << i; + break; } } return 1; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a66c088c851c..e32b6972c478 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5368,7 +5368,8 @@ static void css_free_rwork_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup1_pidlist_destroy_all(cgrp); + if (!cgroup_on_dfl(cgrp)) + cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); bpf_cgrp_storage_free(cgrp); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4237c8748715..a10e4bd0c0c1 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -202,6 +202,14 @@ struct cpuset { }; /* + * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously + */ +struct cpuset_remove_tasks_struct { + struct work_struct work; + struct cpuset *cs; +}; + +/* * Exclusive CPUs distributed out to sub-partitions of top_cpuset */ static cpumask_var_t subpartitions_cpus; @@ -360,9 +368,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) } static struct cpuset top_cpuset = { - .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | - (1 << CS_MEM_EXCLUSIVE)), + .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, + .relax_domain_level = -1, .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; @@ -449,12 +458,6 @@ static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; -/* - * CPU / memory hotplug is handled asynchronously. - */ -static void cpuset_hotplug_workfn(struct work_struct *work); -static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); - static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); static inline void check_insane_mems_config(nodemask_t *nodes) @@ -540,22 +543,10 @@ static void guarantee_online_cpus(struct task_struct *tsk, rcu_read_lock(); cs = task_cs(tsk); - while (!cpumask_intersects(cs->effective_cpus, pmask)) { + while (!cpumask_intersects(cs->effective_cpus, pmask)) cs = parent_cs(cs); - if (unlikely(!cs)) { - /* - * The top cpuset doesn't have any online cpu as a - * consequence of a race between cpuset_hotplug_work - * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to be - * identical to cpu_online_mask. - */ - goto out_unlock; - } - } - cpumask_and(pmask, pmask, cs->effective_cpus); -out_unlock: + cpumask_and(pmask, pmask, cs->effective_cpus); rcu_read_unlock(); } @@ -1217,7 +1208,7 @@ static void rebuild_sched_domains_locked(void) /* * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. + * Anyways, cpuset_handle_hotplug() will rebuild sched domains. * * With no CPUs in any subpartitions, top_cpuset's effective CPUs * should be the same as the active CPUs, so checking only top_cpuset @@ -1260,12 +1251,17 @@ static void rebuild_sched_domains_locked(void) } #endif /* CONFIG_SMP */ -void rebuild_sched_domains(void) +static void rebuild_sched_domains_cpuslocked(void) { - cpus_read_lock(); mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); +} + +void rebuild_sched_domains(void) +{ + cpus_read_lock(); + rebuild_sched_domains_cpuslocked(); cpus_read_unlock(); } @@ -2079,14 +2075,11 @@ write_error: /* * For partcmd_update without newmask, it is being called from - * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. - * Update the load balance flag and scheduling domain if - * cpus_read_trylock() is successful. + * cpuset_handle_hotplug(). Update the load balance flag and + * scheduling domain accordingly. */ - if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { + if ((cmd == partcmd_update) && !newmask) update_partition_sd_lb(cs, old_prs); - cpus_read_unlock(); - } notify_partition_change(cs, old_prs); return 0; @@ -3599,8 +3592,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. * - * cpuset_hotplug_work calls back into cgroup core via - * cgroup_transfer_tasks() and waiting for it from a cgroupfs + * cpuset_handle_hotplug may call back into cgroup core asynchronously + * via cgroup_transfer_tasks() and waiting for it from a cgroupfs * operation like this one can lead to a deadlock through kernfs * active_ref protection. Let's break the protection. Losing the * protection is okay as we check whether @cs is online after @@ -3609,7 +3602,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, */ css_get(&cs->css); kernfs_break_active_protection(of->kn); - flush_work(&cpuset_hotplug_work); cpus_read_lock(); mutex_lock(&cpuset_mutex); @@ -3782,9 +3774,6 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, buf = strstrip(buf); - /* - * Convert "root" to ENABLED, and convert "member" to DISABLED. - */ if (!strcmp(buf, "root")) val = PRS_ROOT; else if (!strcmp(buf, "member")) @@ -4060,11 +4049,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->effective_mems; cs->use_parent_ecpus = true; parent->child_ecpus_count++; - /* - * Clear CS_SCHED_LOAD_BALANCE if parent is isolated - */ - if (!is_sched_load_balance(parent)) - clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } /* @@ -4318,8 +4302,6 @@ int __init cpuset_init(void) nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); - set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); - top_cpuset.relax_domain_level = -1; INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); @@ -4354,6 +4336,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } } +static void cpuset_migrate_tasks_workfn(struct work_struct *work) +{ + struct cpuset_remove_tasks_struct *s; + + s = container_of(work, struct cpuset_remove_tasks_struct, work); + remove_tasks_in_empty_cpuset(s->cs); + css_put(&s->cs->css); + kfree(s); +} + static void hotplug_update_tasks_legacy(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, @@ -4383,12 +4375,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs, /* * Move tasks to the nearest ancestor with execution resources, * This is full cgroup operation which will also call back into - * cpuset. Should be done outside any lock. + * cpuset. Execute it asynchronously using workqueue. */ - if (is_empty) { - mutex_unlock(&cpuset_mutex); - remove_tasks_in_empty_cpuset(cs); - mutex_lock(&cpuset_mutex); + if (is_empty && cs->css.cgroup->nr_populated_csets && + css_tryget_online(&cs->css)) { + struct cpuset_remove_tasks_struct *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (WARN_ON_ONCE(!s)) { + css_put(&cs->css); + return; + } + + s->cs = cs; + INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); + schedule_work(&s->work); } } @@ -4421,30 +4422,6 @@ void cpuset_force_rebuild(void) force_rebuild = true; } -/* - * Attempt to acquire a cpus_read_lock while a hotplug operation may be in - * progress. - * Return: true if successful, false otherwise - * - * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. - */ -static bool cpuset_hotplug_cpus_read_trylock(void) -{ - int retries = 0; - - while (!cpus_read_trylock()) { - /* - * CPU hotplug still in progress. Retry 5 times - * with a 10ms wait before bailing out. - */ - if (++retries > 5) - return false; - msleep(10); - } - return true; -} - /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest @@ -4493,13 +4470,11 @@ retry: compute_partition_effective_cpumask(cs, &new_cpus); if (remote && cpumask_empty(&new_cpus) && - partition_is_populated(cs, NULL) && - cpuset_hotplug_cpus_read_trylock()) { + partition_is_populated(cs, NULL)) { remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; cpuset_force_rebuild(); - cpus_read_unlock(); } /* @@ -4519,18 +4494,8 @@ retry: else if (is_partition_valid(parent) && is_partition_invalid(cs)) partcmd = partcmd_update; - /* - * cpus_read_lock needs to be held before calling - * update_parent_effective_cpumask(). To avoid circular lock - * dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. - */ if (partcmd >= 0) { - if (!cpuset_hotplug_cpus_read_trylock()) - goto update_tasks; - update_parent_effective_cpumask(cs, partcmd, NULL, tmp); - cpus_read_unlock(); if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); @@ -4558,8 +4523,7 @@ unlock: } /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset - * @work: unused + * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -4573,8 +4537,10 @@ unlock: * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. + * + * CPU / memory hotplug is handled synchronously. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_handle_hotplug(void) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -4585,6 +4551,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; + lockdep_assert_cpus_held(); mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ @@ -4666,7 +4633,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* rebuild sched domains if cpus_allowed has changed */ if (cpus_updated || force_rebuild) { force_rebuild = false; - rebuild_sched_domains(); + rebuild_sched_domains_cpuslocked(); } free_cpumasks(NULL, ptmp); @@ -4679,12 +4646,7 @@ void cpuset_update_active_cpus(void) * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. */ - schedule_work(&cpuset_hotplug_work); -} - -void cpuset_wait_for_hotplug(void) -{ - flush_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); } /* @@ -4695,7 +4657,7 @@ void cpuset_wait_for_hotplug(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - schedule_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); return NOTIFY_OK; } diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 66d1708042a7..074653f964c1 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -106,8 +106,7 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) * @css: css being created * * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. + * parent's freezing state while holding cpus read lock and freezer_mutex. */ static int freezer_css_online(struct cgroup_subsys_state *css) { @@ -133,7 +132,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) * freezer_css_offline - initiate destruction of a freezer css * @css: css being destroyed * - * @css is going away. Mark it dead and decrement system_freezing_count if + * @css is going away. Mark it dead and decrement freezer_active if * it was holding one. */ static void freezer_css_offline(struct cgroup_subsys_state *css) diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 7695e60bcb40..0e5ec7d59b4d 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -75,9 +75,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent) if (!pids) return ERR_PTR(-ENOMEM); - atomic64_set(&pids->counter, 0); atomic64_set(&pids->limit, PIDS_MAX); - atomic64_set(&pids->events_limit, 0); return &pids->css; } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 07e2284bb499..fb8b49437573 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -7,6 +7,8 @@ #include <linux/btf.h> #include <linux/btf_ids.h> +#include <trace/events/cgroup.h> + static DEFINE_SPINLOCK(cgroup_rstat_lock); static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); @@ -17,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) return per_cpu_ptr(cgrp->rstat_cpu, cpu); } +/* + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock). + * + * This makes it easier to diagnose locking issues and contention in + * production environments. The parameter @fast_path determine the + * tracepoints being added, allowing us to diagnose "flush" related + * operations without handling high-frequency fast-path "update" events. + */ +static __always_inline +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup *cgrp, const bool fast_path) +{ + unsigned long flags; + bool contended; + + /* + * The _irqsave() is needed because cgroup_rstat_lock is + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring + * this lock with the _irq() suffix only disables interrupts on + * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables + * interrupts on both configurations. The _irqsave() ensures + * that interrupts are always disabled and later restored. + */ + contended = !raw_spin_trylock_irqsave(cpu_lock, flags); + if (contended) { + if (fast_path) + trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended); + else + trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended); + + raw_spin_lock_irqsave(cpu_lock, flags); + } + + if (fast_path) + trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended); + else + trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended); + + return flags; +} + +static __always_inline +void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup *cgrp, unsigned long flags, + const bool fast_path) +{ + if (fast_path) + trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); + else + trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} + /** * cgroup_rstat_updated - keep track of updated rstat_cpu * @cgrp: target cgroup @@ -42,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) return; - raw_spin_lock_irqsave(cpu_lock, flags); + flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true); /* put @cgrp and all ancestors on the corresponding updated lists */ while (true) { @@ -70,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) cgrp = parent; } - raw_spin_unlock_irqrestore(cpu_lock, flags); + _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true); } /** @@ -151,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) struct cgroup *head = NULL, *parent, *child; unsigned long flags; - /* - * The _irqsave() is needed because cgroup_rstat_lock is - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring - * this lock with the _irq() suffix only disables interrupts on - * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables - * interrupts on both configurations. The _irqsave() ensures - * that interrupts are always disabled and later restored. - */ - raw_spin_lock_irqsave(cpu_lock, flags); + flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) @@ -196,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) if (child != root) head = cgroup_rstat_push_children(head, child, cpu); unlock_ret: - raw_spin_unlock_irqrestore(cpu_lock, flags); + _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false); return head; } @@ -222,6 +270,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, __bpf_hook_end(); +/* + * Helper functions for locking cgroup_rstat_lock. + * + * This makes it easier to diagnose locking issues and contention in + * production environments. The parameter @cpu_in_loop indicate lock + * was released and re-taken when collection data from the CPUs. The + * value -1 is used when obtaining the main lock else this is the CPU + * number processed last. + */ +static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop) + __acquires(&cgroup_rstat_lock) +{ + bool contended; + + contended = !spin_trylock_irq(&cgroup_rstat_lock); + if (contended) { + trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); + spin_lock_irq(&cgroup_rstat_lock); + } + trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); +} + +static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) + __releases(&cgroup_rstat_lock) +{ + trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); + spin_unlock_irq(&cgroup_rstat_lock); +} + /* see cgroup_rstat_flush() */ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) @@ -248,10 +325,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) /* play nice and yield if necessary */ if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, cpu); if (!cond_resched()) cpu_relax(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, cpu); } } } @@ -273,9 +350,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) { might_sleep(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, -1); cgroup_rstat_flush_locked(cgrp); - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, -1); } /** @@ -291,17 +368,18 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp) __acquires(&cgroup_rstat_lock) { might_sleep(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, -1); cgroup_rstat_flush_locked(cgrp); } /** * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() + * @cgrp: cgroup used by tracepoint */ -void cgroup_rstat_flush_release(void) +void cgroup_rstat_flush_release(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) { - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, -1); } int cgroup_rstat_init(struct cgroup *cgrp) @@ -533,7 +611,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) #ifdef CONFIG_SCHED_CORE forceidle_time = cgrp->bstat.forceidle_sum; #endif - cgroup_rstat_flush_release(); + cgroup_rstat_flush_release(cgrp); } else { root_cgroup_cputime(&bstat); usage = bstat.cputime.sum_exec_runtime; diff --git a/kernel/cpu.c b/kernel/cpu.c index 63447eb85dab..563877d6c28b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1208,52 +1208,6 @@ void __init cpuhp_threads_init(void) kthread_unpark(this_cpu_read(cpuhp_state.thread)); } -/* - * - * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock - * protected region. - * - * The operation is still serialized against concurrent CPU hotplug via - * cpu_add_remove_lock, i.e. CPU map protection. But it is _not_ - * serialized against other hotplug related activity like adding or - * removing of state callbacks and state instances, which invoke either the - * startup or the teardown callback of the affected state. - * - * This is required for subsystems which are unfixable vs. CPU hotplug and - * evade lock inversion problems by scheduling work which has to be - * completed _before_ cpu_up()/_cpu_down() returns. - * - * Don't even think about adding anything to this for any new code or even - * drivers. It's only purpose is to keep existing lock order trainwrecks - * working. - * - * For cpu_down() there might be valid reasons to finish cleanups which are - * not required to be done under cpu_hotplug_lock, but that's a different - * story and would be not invoked via this. - */ -static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen) -{ - /* - * cpusets delegate hotplug operations to a worker to "solve" the - * lock order problems. Wait for the worker, but only if tasks are - * _not_ frozen (suspend, hibernate) as that would wait forever. - * - * The wait is required because otherwise the hotplug operation - * returns with inconsistent state, which could even be observed in - * user space when a new CPU is brought up. The CPU plug uevent - * would be delivered and user space reacting on it would fail to - * move tasks to the newly plugged CPU up to the point where the - * work has finished because up to that point the newly plugged CPU - * is not assignable in cpusets/cgroups. On unplug that's not - * necessarily a visible issue, but it is still inconsistent state, - * which is the real problem which needs to be "fixed". This can't - * prevent the transient state between scheduling the work and - * returning from waiting for it. - */ - if (!tasks_frozen) - cpuset_wait_for_hotplug(); -} - #ifdef CONFIG_HOTPLUG_CPU #ifndef arch_clear_mm_cpumask_cpu #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm)) @@ -1494,7 +1448,6 @@ out: */ lockup_detector_cleanup(); arch_smt_update(); - cpu_up_down_serialize_trainwrecks(tasks_frozen); return ret; } @@ -1728,7 +1681,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) out: cpus_write_unlock(); arch_smt_update(); - cpu_up_down_serialize_trainwrecks(tasks_frozen); return ret; } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 78b5dc7cee3a..394db3ebe835 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -493,10 +493,10 @@ static DEFINE_MUTEX(__crash_hotplug_lock); /* * This routine utilized when the crash_hotplug sysfs node is read. - * It reflects the kernel's ability/permission to update the crash - * elfcorehdr directly. + * It reflects the kernel's ability/permission to update the kdump + * image directly. */ -int crash_check_update_elfcorehdr(void) +int crash_check_hotplug_support(void) { int rc = 0; @@ -508,10 +508,7 @@ int crash_check_update_elfcorehdr(void) return 0; } if (kexec_crash_image) { - if (kexec_crash_image->file_mode) - rc = 1; - else - rc = kexec_crash_image->update_elfcorehdr; + rc = kexec_crash_image->hotplug_support; } /* Release lock now that update complete */ kexec_unlock(); @@ -534,7 +531,7 @@ int crash_check_update_elfcorehdr(void) * list of segments it checks (since the elfcorehdr changes and thus * would require an update to purgatory itself to update the digest). */ -static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) +static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu, void *arg) { struct kimage *image; @@ -552,8 +549,8 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) image = kexec_crash_image; - /* Check that updating elfcorehdr is permitted */ - if (!(image->file_mode || image->update_elfcorehdr)) + /* Check that kexec segments update is permitted */ + if (!image->hotplug_support) goto out; if (hp_action == KEXEC_CRASH_HP_ADD_CPU || @@ -596,7 +593,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) image->hp_action = hp_action; /* Now invoke arch-specific update handler */ - arch_crash_handle_hotplug_event(image); + arch_crash_handle_hotplug_event(image, arg); /* No longer handling a hotplug event */ image->hp_action = KEXEC_CRASH_HP_NONE; @@ -612,17 +609,17 @@ out: crash_hotplug_unlock(); } -static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v) +static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *arg) { switch (val) { case MEM_ONLINE: crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY, - KEXEC_CRASH_HP_INVALID_CPU); + KEXEC_CRASH_HP_INVALID_CPU, arg); break; case MEM_OFFLINE: crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY, - KEXEC_CRASH_HP_INVALID_CPU); + KEXEC_CRASH_HP_INVALID_CPU, arg); break; } return NOTIFY_OK; @@ -635,13 +632,13 @@ static struct notifier_block crash_memhp_nb = { static int crash_cpuhp_online(unsigned int cpu) { - crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu); + crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu, NULL); return 0; } static int crash_cpuhp_offline(unsigned int cpu) { - crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu); + crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu, NULL); return 0; } diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 6f0c358e73d8..e039b0f99a0b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -74,7 +74,6 @@ static struct ctl_table kern_delayacct_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; static __init int kernel_delayacct_sysctls_init(void) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e4834d23e1d1..2c83ba776fc7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -18,7 +18,7 @@ #include <linux/sched/coredump.h> #include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ -#include <linux/mmu_notifier.h> /* set_pte_at_notify */ +#include <linux/mmu_notifier.h> #include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT; */ #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) -static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ +static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ @@ -195,8 +195,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); ptep_clear_flush(vma, addr, pvmw.pte); if (new_page) - set_pte_at_notify(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); + set_pte_at(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); folio_remove_rmap_pte(old_folio, old_page, vma); if (!folio_mapped(old_folio)) @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); uprobe = __find_uprobe(inode, offset); - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); return uprobe; } @@ -701,9 +701,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; - spin_lock(&uprobes_treelock); + write_lock(&uprobes_treelock); u = __insert_uprobe(uprobe); - spin_unlock(&uprobes_treelock); + write_unlock(&uprobes_treelock); return u; } @@ -935,9 +935,9 @@ static void delete_uprobe(struct uprobe *uprobe) if (WARN_ON(!uprobe_is_active(uprobe))) return; - spin_lock(&uprobes_treelock); + write_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); - spin_unlock(&uprobes_treelock); + write_unlock(&uprobes_treelock); RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ put_uprobe(uprobe); } @@ -1298,7 +1298,7 @@ static void build_probe_list(struct inode *inode, min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { @@ -1316,7 +1316,7 @@ static void build_probe_list(struct inode *inode, get_uprobe(u); } } - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); } /* @vma contains reference counter, not the probed instruction. */ @@ -1407,9 +1407,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); return !!n; } diff --git a/kernel/exit.c b/kernel/exit.c index 41a12630cbbc..cd3aa9042f1a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -94,7 +94,6 @@ static struct ctl_table kern_exit_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, - { } }; static __init int kernel_exit_sysctls_init(void) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 1e78ef24321e..06a1f091be81 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -1150,7 +1150,7 @@ static int __init futex_init(void) unsigned int futex_shift; unsigned long i; -#if CONFIG_BASE_SMALL +#ifdef CONFIG_BASE_SMALL futex_hashsize = 16; #else futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b2fc2727d654..1d92016b0b3c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -314,7 +314,6 @@ static struct ctl_table hung_task_sysctls[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_NEG_ONE, }, - {} }; static void __init hung_task_sysctl_init(void) diff --git a/kernel/kexec.c b/kernel/kexec.c index bab542fc1463..a6b3f96bb50c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -135,8 +135,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, image->preserve_context = 1; #ifdef CONFIG_CRASH_HOTPLUG - if (flags & KEXEC_UPDATE_ELFCOREHDR) - image->update_elfcorehdr = 1; + if ((flags & KEXEC_ON_CRASH) && arch_crash_hotplug_support(image, flags)) + image->hotplug_support = 1; #endif ret = machine_kexec_prepare(image); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 0e96f6b24344..9112d69d68b0 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -948,7 +948,6 @@ static struct ctl_table kexec_core_sysctls[] = { .mode = 0644, .proc_handler = kexec_limit_handler, }, - { } }; static int __init kexec_core_sysctl_init(void) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 2d1db05fbf04..3d64290d24c9 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -376,6 +376,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; +#ifdef CONFIG_CRASH_HOTPLUG + if ((flags & KEXEC_FILE_ON_CRASH) && arch_crash_hotplug_support(image, flags)) + image->hotplug_support = 1; +#endif + ret = machine_kexec_prepare(image); if (ret) goto out; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 65adc815fc6e..f2cea3c80cb5 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -26,7 +26,6 @@ #include <linux/slab.h> #include <linux/stddef.h> #include <linux/export.h> -#include <linux/moduleloader.h> #include <linux/kallsyms.h> #include <linux/freezer.h> #include <linux/seq_file.h> @@ -39,6 +38,7 @@ #include <linux/jump_label.h> #include <linux/static_call.h> #include <linux/perf_event.h> +#include <linux/execmem.h> #include <asm/sections.h> #include <asm/cacheflush.h> @@ -113,17 +113,17 @@ enum kprobe_slot_state { void __weak *alloc_insn_page(void) { /* - * Use module_alloc() so this page is within +/- 2GB of where the + * Use execmem_alloc() so this page is within +/- 2GB of where the * kernel image and loaded module images reside. This is required * for most of the architectures. * (e.g. x86-64 needs this to handle the %rip-relative fixups.) */ - return module_alloc(PAGE_SIZE); + return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); } static void free_insn_page(void *page) { - module_memfree(page); + execmem_free(page); } struct kprobe_insn_cache kprobe_insn_slots = { @@ -968,7 +968,6 @@ static struct ctl_table kprobe_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static void __init kprobe_sysctls_init(void) @@ -1068,6 +1067,7 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; +bool kprobe_ftrace_disabled; static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) @@ -1136,6 +1136,11 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops, ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } + +void kprobe_ftrace_kill() +{ + kprobe_ftrace_disabled = true; +} #else /* !CONFIG_KPROBES_ON_FTRACE */ static inline int arm_kprobe_ftrace(struct kprobe *p) { @@ -1588,7 +1593,7 @@ static int check_kprobe_address_safe(struct kprobe *p, } /* Get module refcount and reject __init functions for loaded modules. */ - if (*probed_mod) { + if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) { /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. @@ -1603,12 +1608,13 @@ static int check_kprobe_address_safe(struct kprobe *p, * kprobes in there. */ if (within_module_init((unsigned long)p->addr, *probed_mod) && - (*probed_mod)->state != MODULE_STATE_COMING) { + !module_is_coming(*probed_mod)) { module_put(*probed_mod); *probed_mod = NULL; ret = -ENOENT; } } + out: preempt_enable(); jump_label_unlock(); @@ -2488,24 +2494,6 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } -/* Remove all symbols in given area from kprobe blacklist */ -static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) -{ - struct kprobe_blacklist_entry *ent, *n; - - list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { - if (ent->start_addr < start || ent->start_addr >= end) - continue; - list_del(&ent->list); - kfree(ent); - } -} - -static void kprobe_remove_ksym_blacklist(unsigned long entry) -{ - kprobe_remove_area_blacklist(entry, entry + 1); -} - int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, char *type, char *sym) { @@ -2570,6 +2558,25 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return ret ? : arch_populate_kprobe_blacklist(); } +#ifdef CONFIG_MODULES +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(&ent->list); + kfree(ent); + } +} + +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + static void add_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; @@ -2672,6 +2679,17 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; +static int kprobe_register_module_notifier(void) +{ + return register_module_notifier(&kprobe_module_nb); +} +#else +static int kprobe_register_module_notifier(void) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + void kprobe_free_init_mem(void) { void *start = (void *)(&__init_begin); @@ -2731,7 +2749,7 @@ static int __init init_kprobes(void) if (!err) err = register_die_notifier(&kprobe_exceptions_nb); if (!err) - err = register_module_notifier(&kprobe_module_nb); + err = kprobe_register_module_notifier(); kprobes_initialized = (err == 0); kprobe_sysctls_init(); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 781249098cb6..84c53285f499 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -85,7 +85,6 @@ static struct ctl_table latencytop_sysctl[] = { .mode = 0644, .proc_handler = sysctl_latencytop, }, - {} }; #endif diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index ecbc9b6aba3a..52426665eecc 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -973,7 +973,7 @@ static int __klp_disable_patch(struct klp_patch *patch) if (klp_transition_patch) return -EBUSY; - klp_init_transition(patch, KLP_UNPATCHED); + klp_init_transition(patch, KLP_TRANSITION_UNPATCHED); klp_for_each_object(patch, obj) if (obj->patched) @@ -1008,7 +1008,7 @@ static int __klp_enable_patch(struct klp_patch *patch) pr_notice("enabling patch '%s'\n", patch->mod->name); - klp_init_transition(patch, KLP_PATCHED); + klp_init_transition(patch, KLP_TRANSITION_PATCHED); /* * Enforce the order of the func->transition writes in diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 4152c71507e2..90408500e5a3 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -95,9 +95,9 @@ static void notrace klp_ftrace_handler(unsigned long ip, patch_state = current->patch_state; - WARN_ON_ONCE(patch_state == KLP_UNDEFINED); + WARN_ON_ONCE(patch_state == KLP_TRANSITION_IDLE); - if (patch_state == KLP_UNPATCHED) { + if (patch_state == KLP_TRANSITION_UNPATCHED) { /* * Use the previously patched version of the function. * If no previous patches exist, continue with the diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index e54c3d60a904..ba069459c101 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -23,7 +23,7 @@ static DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries); struct klp_patch *klp_transition_patch; -static int klp_target_state = KLP_UNDEFINED; +static int klp_target_state = KLP_TRANSITION_IDLE; static unsigned int klp_signals_cnt; @@ -96,16 +96,16 @@ static void klp_complete_transition(void) pr_debug("'%s': completing %s transition\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching"); - if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) { + if (klp_transition_patch->replace && klp_target_state == KLP_TRANSITION_PATCHED) { klp_unpatch_replaced_patches(klp_transition_patch); klp_discard_nops(klp_transition_patch); } - if (klp_target_state == KLP_UNPATCHED) { + if (klp_target_state == KLP_TRANSITION_UNPATCHED) { /* - * All tasks have transitioned to KLP_UNPATCHED so we can now + * All tasks have transitioned to KLP_TRANSITION_UNPATCHED so we can now * remove the new functions from the func_stack. */ klp_unpatch_objects(klp_transition_patch); @@ -123,36 +123,36 @@ static void klp_complete_transition(void) klp_for_each_func(obj, func) func->transition = false; - /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ - if (klp_target_state == KLP_PATCHED) + /* Prevent klp_ftrace_handler() from seeing KLP_TRANSITION_IDLE state */ + if (klp_target_state == KLP_TRANSITION_PATCHED) klp_synchronize_transition(); read_lock(&tasklist_lock); for_each_process_thread(g, task) { WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); - task->patch_state = KLP_UNDEFINED; + task->patch_state = KLP_TRANSITION_IDLE; } read_unlock(&tasklist_lock); for_each_possible_cpu(cpu) { task = idle_task(cpu); WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); - task->patch_state = KLP_UNDEFINED; + task->patch_state = KLP_TRANSITION_IDLE; } klp_for_each_object(klp_transition_patch, obj) { if (!klp_is_object_loaded(obj)) continue; - if (klp_target_state == KLP_PATCHED) + if (klp_target_state == KLP_TRANSITION_PATCHED) klp_post_patch_callback(obj); - else if (klp_target_state == KLP_UNPATCHED) + else if (klp_target_state == KLP_TRANSITION_UNPATCHED) klp_post_unpatch_callback(obj); } pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching"); - klp_target_state = KLP_UNDEFINED; + klp_target_state = KLP_TRANSITION_IDLE; klp_transition_patch = NULL; } @@ -164,13 +164,13 @@ static void klp_complete_transition(void) */ void klp_cancel_transition(void) { - if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED)) + if (WARN_ON_ONCE(klp_target_state != KLP_TRANSITION_PATCHED)) return; pr_debug("'%s': canceling patching transition, going to unpatch\n", klp_transition_patch->mod->name); - klp_target_state = KLP_UNPATCHED; + klp_target_state = KLP_TRANSITION_UNPATCHED; klp_complete_transition(); } @@ -218,7 +218,7 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, struct klp_ops *ops; int i; - if (klp_target_state == KLP_UNPATCHED) { + if (klp_target_state == KLP_TRANSITION_UNPATCHED) { /* * Check for the to-be-unpatched function * (the func itself). @@ -455,7 +455,7 @@ void klp_try_complete_transition(void) struct klp_patch *patch; bool complete = true; - WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); + WARN_ON_ONCE(klp_target_state == KLP_TRANSITION_IDLE); /* * Try to switch the tasks to the target patch state by walking their @@ -532,11 +532,11 @@ void klp_start_transition(void) struct task_struct *g, *task; unsigned int cpu; - WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); + WARN_ON_ONCE(klp_target_state == KLP_TRANSITION_IDLE); pr_notice("'%s': starting %s transition\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching"); /* * Mark all normal tasks as needing a patch state update. They'll @@ -578,7 +578,7 @@ void klp_init_transition(struct klp_patch *patch, int state) struct klp_func *func; int initial_state = !state; - WARN_ON_ONCE(klp_target_state != KLP_UNDEFINED); + WARN_ON_ONCE(klp_target_state != KLP_TRANSITION_IDLE); klp_transition_patch = patch; @@ -589,7 +589,7 @@ void klp_init_transition(struct klp_patch *patch, int state) klp_target_state = state; pr_debug("'%s': initializing %s transition\n", patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching"); /* * Initialize all tasks to the initial patch state to prepare them for @@ -597,7 +597,7 @@ void klp_init_transition(struct klp_patch *patch, int state) */ read_lock(&tasklist_lock); for_each_process_thread(g, task) { - WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); + WARN_ON_ONCE(task->patch_state != KLP_TRANSITION_IDLE); task->patch_state = initial_state; } read_unlock(&tasklist_lock); @@ -607,19 +607,19 @@ void klp_init_transition(struct klp_patch *patch, int state) */ for_each_possible_cpu(cpu) { task = idle_task(cpu); - WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); + WARN_ON_ONCE(task->patch_state != KLP_TRANSITION_IDLE); task->patch_state = initial_state; } /* * Enforce the order of the task->patch_state initializations and the * func->transition updates to ensure that klp_ftrace_handler() doesn't - * see a func in transition with a task->patch_state of KLP_UNDEFINED. + * see a func in transition with a task->patch_state of KLP_TRANSITION_IDLE. * * Also enforce the order of the klp_target_state write and future * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and * __klp_sched_try_switch() don't set a task->patch_state to - * KLP_UNDEFINED. + * KLP_TRANSITION_IDLE. */ smp_wmb(); @@ -652,7 +652,7 @@ void klp_reverse_transition(void) pr_debug("'%s': reversing transition from %s\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching to unpatching" : + klp_target_state == KLP_TRANSITION_PATCHED ? "patching to unpatching" : "unpatching to patching"); /* @@ -741,7 +741,7 @@ void klp_force_transition(void) klp_update_patch_state(idle_task(cpu)); /* Set forced flag for patches being removed. */ - if (klp_target_state == KLP_UNPATCHED) + if (klp_target_state == KLP_TRANSITION_UNPATCHED) klp_transition_patch->forced = true; else if (klp_transition_patch->replace) { klp_for_each_patch(patch) { diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index f3e0329337f6..4047b6d48255 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -2,6 +2,7 @@ menuconfig MODULES bool "Enable loadable module support" modules + select EXECMEM help Kernel modules are small pieces of compiled code which can be inserted in the running kernel, rather than being @@ -392,7 +393,7 @@ config UNUSED_KSYMS_WHITELIST exported at all times, even in absence of in-tree users. The value to set here is the path to a text file containing the list of symbols, one per line. The path can be absolute, or relative to the kernel - source tree. + source or obj tree. config MODULES_TREE_LOOKUP def_bool y diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index ef73ae7c8909..62fb57bb9f16 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -348,7 +348,7 @@ const char *module_address_lookup(unsigned long addr, } /* Make a copy in here where it's safe */ if (ret) { - strncpy(namebuf, ret, KSYM_NAME_LEN - 1); + strscpy(namebuf, ret, KSYM_NAME_LEN); ret = namebuf; } preempt_enable(); diff --git a/kernel/module/main.c b/kernel/module/main.c index e1e8a7a9d6c1..91e185607d4b 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -57,6 +57,7 @@ #include <linux/audit.h> #include <linux/cfi.h> #include <linux/debugfs.h> +#include <linux/execmem.h> #include <uapi/linux/module.h> #include "internal.h" @@ -1179,16 +1180,6 @@ resolve_symbol_wait(struct module *mod, return ksym; } -void __weak module_memfree(void *module_region) -{ - /* - * This memory may be RO, and freeing RO memory in an interrupt is not - * supported by vmalloc. - */ - WARN_ON(in_interrupt()); - vfree(module_region); -} - void __weak module_arch_cleanup(struct module *mod) { } @@ -1197,25 +1188,47 @@ void __weak module_arch_freeing_init(struct module *mod) { } -static bool mod_mem_use_vmalloc(enum mod_mem_type type) +static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { - return IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC) && - mod_mem_type_is_core_data(type); -} + unsigned int size = PAGE_ALIGN(mod->mem[type].size); + enum execmem_type execmem_type; + void *ptr; -static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) -{ - if (mod_mem_use_vmalloc(type)) - return vzalloc(size); - return module_alloc(size); + mod->mem[type].size = size; + + if (mod_mem_type_is_data(type)) + execmem_type = EXECMEM_MODULE_DATA; + else + execmem_type = EXECMEM_MODULE_TEXT; + + ptr = execmem_alloc(execmem_type, size); + if (!ptr) + return -ENOMEM; + + /* + * The pointer to these blocks of memory are stored on the module + * structure and we keep that around so long as the module is + * around. We only free that memory when we unload the module. + * Just mark them as not being a leak then. The .init* ELF + * sections *do* get freed after boot so we *could* treat them + * slightly differently with kmemleak_ignore() and only grey + * them out as they work as typical memory allocations which + * *do* eventually get freed, but let's just keep things simple + * and avoid *any* false positives. + */ + kmemleak_not_leak(ptr); + + memset(ptr, 0, size); + mod->mem[type].base = ptr; + + return 0; } -static void module_memory_free(void *ptr, enum mod_mem_type type) +static void module_memory_free(struct module *mod, enum mod_mem_type type) { - if (mod_mem_use_vmalloc(type)) - vfree(ptr); - else - module_memfree(ptr); + void *ptr = mod->mem[type].base; + + execmem_free(ptr); } static void free_mod_mem(struct module *mod) @@ -1229,12 +1242,12 @@ static void free_mod_mem(struct module *mod) /* Free lock-classes; relies on the preceding sync_rcu(). */ lockdep_free_key_range(mod_mem->base, mod_mem->size); if (mod_mem->size) - module_memory_free(mod_mem->base, type); + module_memory_free(mod, type); } /* MOD_DATA hosts mod, so free it at last */ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); - module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA); + module_memory_free(mod, MOD_DATA); } /* Free a module, remove from lists, etc. */ @@ -1610,13 +1623,6 @@ static void free_modinfo(struct module *mod) } } -void * __weak module_alloc(unsigned long size) -{ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - bool __weak module_init_section(const char *name) { return strstarts(name, ".init"); @@ -2225,7 +2231,6 @@ static int find_module_sections(struct module *mod, struct load_info *info) static int move_module(struct module *mod, struct load_info *info) { int i; - void *ptr; enum mod_mem_type t = 0; int ret = -ENOMEM; @@ -2234,26 +2239,12 @@ static int move_module(struct module *mod, struct load_info *info) mod->mem[type].base = NULL; continue; } - mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size); - ptr = module_memory_alloc(mod->mem[type].size, type); - /* - * The pointer to these blocks of memory are stored on the module - * structure and we keep that around so long as the module is - * around. We only free that memory when we unload the module. - * Just mark them as not being a leak then. The .init* ELF - * sections *do* get freed after boot so we *could* treat them - * slightly differently with kmemleak_ignore() and only grey - * them out as they work as typical memory allocations which - * *do* eventually get freed, but let's just keep things simple - * and avoid *any* false positives. - */ - kmemleak_not_leak(ptr); - if (!ptr) { + + ret = module_memory_alloc(mod, type); + if (ret) { t = type; goto out_enomem; } - memset(ptr, 0, mod->mem[type].size); - mod->mem[type].base = ptr; } /* Transfer each section which specifies SHF_ALLOC */ @@ -2296,7 +2287,7 @@ static int move_module(struct module *mod, struct load_info *info) return 0; out_enomem: for (t--; t >= 0; t--) - module_memory_free(mod->mem[t].base, t); + module_memory_free(mod, t); return ret; } @@ -2482,9 +2473,9 @@ static void do_free_init(struct work_struct *w) llist_for_each_safe(pos, n, list) { initfree = container_of(pos, struct mod_initfree, node); - module_memfree(initfree->init_text); - module_memfree(initfree->init_data); - module_memfree(initfree->init_rodata); + execmem_free(initfree->init_text); + execmem_free(initfree->init_data); + execmem_free(initfree->init_rodata); kfree(initfree); } } @@ -2594,10 +2585,10 @@ static noinline int do_init_module(struct module *mod) * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we * call synchronize_rcu(), but we don't want to slow down the success - * path. module_memfree() cannot be called in an interrupt, so do the + * path. execmem_free() cannot be called in an interrupt, so do the * work and call synchronize_rcu() in a work queue. * - * Note that module_alloc() on most architectures creates W+X page + * Note that execmem_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. Any * code such as mark_rodata_ro() which depends on those mappings to * be cleaned up needs to sync with the queued work by invoking diff --git a/kernel/panic.c b/kernel/panic.c index 747c3f3d289a..8bff183d6180 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -100,7 +100,6 @@ static struct ctl_table kern_panic_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, - { } }; static __init int kernel_panic_sysctls_init(void) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7ade20e95232..dc48fecfa1dc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -307,7 +307,6 @@ static struct ctl_table pid_ns_ctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &pid_max, }, - { } }; #endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h index 2ee41a3a1dfd..fe9fb991dc42 100644 --- a/kernel/pid_sysctl.h +++ b/kernel/pid_sysctl.h @@ -41,7 +41,6 @@ static struct ctl_table pid_ns_ctl_table_vm[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { } }; static inline void register_pid_ns_sysctl_table_vm(void) { diff --git a/kernel/power/process.c b/kernel/power/process.c index cae81a87cc91..66ac067d9ae6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -194,8 +194,6 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); - cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index adf99c05adca..3160d287d4cf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -178,9 +178,9 @@ static int __init control_devkmsg(char *str) * Set sysctl string accordingly: */ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) - strcpy(devkmsg_log_str, "on"); + strscpy(devkmsg_log_str, "on"); else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) - strcpy(devkmsg_log_str, "off"); + strscpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* @@ -209,7 +209,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, return -EINVAL; old = devkmsg_log; - strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE); + strscpy(old_str, devkmsg_log_str); } err = proc_dostring(table, write, buffer, lenp, ppos); @@ -227,7 +227,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, /* ... and restore old setting. */ devkmsg_log = old; - strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE); + strscpy(devkmsg_log_str, old_str); return -EINVAL; } @@ -2506,22 +2506,22 @@ static int __init console_setup(char *str) /* * Decode str into name, index, options. */ - if (str[0] >= '0' && str[0] <= '9') { - strcpy(buf, "ttyS"); - strncpy(buf + 4, str, sizeof(buf) - 5); - } else { - strncpy(buf, str, sizeof(buf) - 1); - } - buf[sizeof(buf) - 1] = 0; + if (isdigit(str[0])) + scnprintf(buf, sizeof(buf), "ttyS%s", str); + else + strscpy(buf, str); + options = strchr(str, ','); if (options) *(options++) = 0; + #ifdef __sparc__ if (!strcmp(str, "ttya")) - strcpy(buf, "ttyS0"); + strscpy(buf, "ttyS0"); if (!strcmp(str, "ttyb")) - strcpy(buf, "ttyS1"); + strscpy(buf, "ttyS1"); #endif + for (s = buf; *s; s++) if (isdigit(*s) || *s == ',') break; diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c index c228343eeb97..3e47dedce9e5 100644 --- a/kernel/printk/sysctl.c +++ b/kernel/printk/sysctl.c @@ -76,7 +76,6 @@ static struct ctl_table printk_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - {} }; void __init printk_sysctl_init(void) diff --git a/kernel/reboot.c b/kernel/reboot.c index 22c16e2564cc..f05dbde2c93f 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -1295,7 +1295,6 @@ static struct ctl_table kern_reboot_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static void __init kernel_reboot_sysctls_init(void) diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 991fc9002535..db68a964e34e 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -19,7 +19,6 @@ static struct ctl_table sched_autogroup_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static void __init sched_autogroup_sysctl_init(void) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a914388144a..373eaeaf63b8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4741,7 +4741,6 @@ static struct ctl_table sched_core_sysctls[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ - {} }; static int __init sched_core_sysctl_init(void) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a04a436af8cc..c75d1307d86d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,7 +43,6 @@ static struct ctl_table sched_dl_sysctls[] = { .proc_handler = proc_douintvec_minmax, .extra2 = (void *)&sysctl_sched_dl_period_max, }, - {} }; static int __init sched_dl_sysctl_init(void) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 146ecf9cc3af..4214df32ba45 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -151,7 +151,6 @@ static struct ctl_table sched_fair_sysctls[] = { .extra1 = SYSCTL_ZERO, }, #endif /* CONFIG_NUMA_BALANCING */ - {} }; static int __init sched_fair_sysctl_init(void) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3261b067b67e..aa4c1c874fa4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -56,7 +56,6 @@ static struct ctl_table sched_rt_sysctls[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, - {} }; static int __init sched_rt_sysctl_init(void) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 63aecd2a7a9f..683559831656 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -322,7 +322,6 @@ static struct ctl_table sched_energy_aware_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init sched_energy_aware_sysctl_init(void) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f70e031e06a8..e30b60b57614 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -2445,7 +2445,6 @@ static struct ctl_table seccomp_sysctl_table[] = { .mode = 0644, .proc_handler = seccomp_actions_logged_handler, }, - { } }; static int __init seccomp_sysctl_init(void) diff --git a/kernel/signal.c b/kernel/signal.c index 7bdbcf1b78d0..01c4c46a51a8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4840,7 +4840,6 @@ static struct ctl_table signal_debug_table[] = { .proc_handler = proc_dointvec }, #endif - { } }; static int __init init_signal_sysctls(void) diff --git a/kernel/stackleak.c b/kernel/stackleak.c index 59cdfaf5118e..0f9712584913 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -54,7 +54,6 @@ static struct ctl_table stackleak_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init stackleak_sysctls_init(void) diff --git a/kernel/sys.c b/kernel/sys.c index 8bb106a56b3a..f9c95410278c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -146,6 +146,12 @@ #ifndef RISCV_V_GET_CONTROL # define RISCV_V_GET_CONTROL() (-EINVAL) #endif +#ifndef PPC_GET_DEXCR_ASPECT +# define PPC_GET_DEXCR_ASPECT(a, b) (-EINVAL) +#endif +#ifndef PPC_SET_DEXCR_ASPECT +# define PPC_SET_DEXCR_ASPECT(a, b, c) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2726,6 +2732,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_MDWE: error = prctl_get_mdwe(arg2, arg3, arg4, arg5); break; + case PR_PPC_GET_DEXCR: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = PPC_GET_DEXCR_ASPECT(me, arg2); + break; + case PR_PPC_SET_DEXCR: + if (arg4 || arg5) + return -EINVAL; + error = PPC_SET_DEXCR_ASPECT(me, arg2, arg3); + break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 81cc974913bb..e0b917328cf9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2034,7 +2034,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_INT_MAX, }, #endif - { } }; static struct ctl_table vm_table[] = { @@ -2240,7 +2239,6 @@ static struct ctl_table vm_table[] = { .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif - { } }; int __init sysctl_init_bases(void) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e394d6d5b9b5..48288dd4a102 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -312,7 +312,6 @@ static struct ctl_table timer_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init timer_sysctl_init(void) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b3d7f62ac581..166ad5444eea 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -974,6 +974,19 @@ config FTRACE_RECORD_RECURSION_SIZE This file can be reset, but the limit can not change in size at runtime. +config FTRACE_VALIDATE_RCU_IS_WATCHING + bool "Validate RCU is on during ftrace execution" + depends on FUNCTION_TRACER + depends on ARCH_WANTS_NO_INSTR + help + All callbacks that attach to the function tracing have some sort of + protection against recursion. This option is only to verify that + ftrace (and other users of ftrace_test_recursion_trylock()) are not + called outside of RCU, as if they are, it can cause a race. But it + also has a noticeable overhead when enabled. + + If unsure, say N + config RING_BUFFER_RECORD_RECURSION bool "Record functions that recurse in the ring buffer" depends on FTRACE_RECORD_RECURSION diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index c83c005e654e..a130b2d898f7 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -125,17 +125,6 @@ int function_graph_enter(unsigned long ret, unsigned long func, { struct ftrace_graph_ent trace; -#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS - /* - * Skip graph tracing if the return location is served by direct trampoline, - * since call sequence and return addresses are unpredictable anyway. - * Ex: BPF trampoline may call original function and may skip frame - * depending on type of BPF programs attached. - */ - if (ftrace_direct_func_count && - ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE)) - return -EBUSY; -#endif trace.func = func; trace.depth = ++current->curr_ret_depth; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c96b30f3d63..65208d3b5ed9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1595,12 +1595,15 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) unsigned long ftrace_location_range(unsigned long start, unsigned long end) { struct dyn_ftrace *rec; + unsigned long ip = 0; + rcu_read_lock(); rec = lookup_rec(start, end); if (rec) - return rec->ip; + ip = rec->ip; + rcu_read_unlock(); - return 0; + return ip; } /** @@ -1614,25 +1617,22 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) */ unsigned long ftrace_location(unsigned long ip) { - struct dyn_ftrace *rec; + unsigned long loc; unsigned long offset; unsigned long size; - rec = lookup_rec(ip, ip); - if (!rec) { + loc = ftrace_location_range(ip, ip); + if (!loc) { if (!kallsyms_lookup_size_offset(ip, &size, &offset)) goto out; /* map sym+0 to __fentry__ */ if (!offset) - rec = lookup_rec(ip, ip + size - 1); + loc = ftrace_location_range(ip, ip + size - 1); } - if (rec) - return rec->ip; - out: - return 0; + return loc; } /** @@ -2538,7 +2538,6 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec) /* Protected by rcu_tasks for reading, and direct_mutex for writing */ static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH; static DEFINE_MUTEX(direct_mutex); -int ftrace_direct_func_count; /* * Search the direct_functions hash to see if the given instruction pointer @@ -4201,12 +4200,12 @@ static int add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, int clear_filter) { - long index = simple_strtoul(func_g->search, NULL, 0); + long index; struct ftrace_page *pg; struct dyn_ftrace *rec; /* The index starts at 1 */ - if (--index < 0) + if (kstrtoul(func_g->search, 0, &index) || --index < 0) return 0; do_for_each_ftrace_rec(pg, rec) { @@ -5317,14 +5316,6 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -struct ftrace_direct_func { - struct list_head next; - unsigned long addr; - int count; -}; - -static LIST_HEAD(ftrace_direct_funcs); - static int register_ftrace_function_nolock(struct ftrace_ops *ops); /* @@ -5365,6 +5356,13 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long } } +static void register_ftrace_direct_cb(struct rcu_head *rhp) +{ + struct ftrace_hash *fhp = container_of(rhp, struct ftrace_hash, rcu); + + free_ftrace_hash(fhp); +} + /** * register_ftrace_direct - Call a custom trampoline directly * for multiple functions registered in @ops @@ -5463,10 +5461,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) out_unlock: mutex_unlock(&direct_mutex); - if (free_hash && free_hash != EMPTY_HASH) { - synchronize_rcu_tasks(); - free_ftrace_hash(free_hash); - } + if (free_hash && free_hash != EMPTY_HASH) + call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb); if (new_hash) free_ftrace_hash(new_hash); @@ -5816,9 +5812,8 @@ __setup("ftrace_graph_notrace=", set_graph_notrace_function); static int __init set_graph_max_depth_function(char *str) { - if (!str) + if (!str || kstrtouint(str, 0, &fgraph_max_depth)) return 0; - fgraph_max_depth = simple_strtoul(str, NULL, 0); return 1; } __setup("ftrace_graph_max_depth=", set_graph_max_depth_function); @@ -6595,6 +6590,8 @@ static int ftrace_process_locs(struct module *mod, /* We should have used all pages unless we skipped some */ if (pg_unuse) { WARN_ON(!skipped); + /* Need to synchronize with ftrace_location_range() */ + synchronize_rcu(); ftrace_free_pages(pg_unuse); } return ret; @@ -6808,6 +6805,9 @@ void ftrace_release_mod(struct module *mod) out_unlock: mutex_unlock(&ftrace_lock); + /* Need to synchronize with ftrace_location_range() */ + if (tmp_page) + synchronize_rcu(); for (pg = tmp_page; pg; pg = tmp_page) { /* Needs to be called outside of ftrace_lock */ @@ -7141,6 +7141,7 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) unsigned long start = (unsigned long)(start_ptr); unsigned long end = (unsigned long)(end_ptr); struct ftrace_page **last_pg = &ftrace_pages_start; + struct ftrace_page *tmp_page = NULL; struct ftrace_page *pg; struct dyn_ftrace *rec; struct dyn_ftrace key; @@ -7182,12 +7183,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) ftrace_update_tot_cnt--; if (!pg->index) { *last_pg = pg->next; - if (pg->records) { - free_pages((unsigned long)pg->records, pg->order); - ftrace_number_of_pages -= 1 << pg->order; - } - ftrace_number_of_groups--; - kfree(pg); + pg->next = tmp_page; + tmp_page = pg; pg = container_of(last_pg, struct ftrace_page, next); if (!(*last_pg)) ftrace_pages = pg; @@ -7204,6 +7201,11 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) clear_func_from_hashes(func); kfree(func); } + /* Need to synchronize with ftrace_location_range() */ + if (tmp_page) { + synchronize_rcu(); + ftrace_free_pages(tmp_page); + } } void __init ftrace_free_init_mem(void) @@ -7894,6 +7896,7 @@ void ftrace_kill(void) ftrace_disabled = 1; ftrace_enabled = 0; ftrace_trace_function = ftrace_stub; + kprobe_ftrace_kill(); } /** @@ -8269,7 +8272,6 @@ static struct ctl_table ftrace_sysctls[] = { .mode = 0644, .proc_handler = ftrace_enable_sysctl, }, - {} }; static int __init ftrace_sysctl_init(void) diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index fa03094e9e69..30d224946881 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -166,6 +166,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh) if (unlikely(!handler)) return NULL; +#if defined(CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING) || defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) /* * This expects the caller will set up a rethook on a function entry. * When the function returns, the rethook will eventually be reclaimed @@ -174,6 +175,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh) */ if (unlikely(!rcu_is_watching())) return NULL; +#endif return (struct rethook_node *)objpool_pop(&rh->pool); } @@ -248,7 +250,7 @@ unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame if (WARN_ON_ONCE(!cur)) return 0; - if (WARN_ON_ONCE(tsk != current && task_is_running(tsk))) + if (tsk != current && task_is_running(tsk)) return 0; do { diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6511dc3a00da..7345a8b625fb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -9,6 +9,7 @@ #include <linux/ring_buffer.h> #include <linux/trace_clock.h> #include <linux/sched/clock.h> +#include <linux/cacheflush.h> #include <linux/trace_seq.h> #include <linux/spinlock.h> #include <linux/irq_work.h> @@ -26,6 +27,7 @@ #include <linux/list.h> #include <linux/cpu.h> #include <linux/oom.h> +#include <linux/mm.h> #include <asm/local64.h> #include <asm/local.h> @@ -312,6 +314,8 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) +#define RB_MISSED_MASK (3 << 30) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -338,6 +342,7 @@ struct buffer_page { local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ unsigned order; /* order of the page */ + u32 id; /* ID for external mapping */ struct buffer_data_page *page; /* Actual data page */ }; @@ -484,6 +489,12 @@ struct ring_buffer_per_cpu { u64 read_stamp; /* pages removed since last reset */ unsigned long pages_removed; + + unsigned int mapped; + struct mutex mapping_lock; + unsigned long *subbuf_ids; /* ID to subbuf VA */ + struct trace_buffer_meta *meta_page; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -1524,7 +1535,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, pages); page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - mflags | __GFP_ZERO, + mflags | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto free_pages; @@ -1599,6 +1610,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); + mutex_init(&cpu_buffer->mapping_lock); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1609,7 +1621,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO, + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto fail_free_reader; @@ -1789,8 +1801,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) return buffer->time_stamp_abs; } -static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); - static inline unsigned long rb_page_entries(struct buffer_page *bpage) { return local_read(&bpage->entries) & RB_WRITE_MASK; @@ -2318,7 +2328,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) /* Size is determined by what has been committed */ static __always_inline unsigned rb_page_size(struct buffer_page *bpage) { - return rb_page_commit(bpage); + return rb_page_commit(bpage) & ~RB_MISSED_MASK; } static __always_inline unsigned @@ -3945,7 +3955,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) return true; /* Reader should exhaust content in reader page */ - if (reader->read != rb_page_commit(reader)) + if (reader->read != rb_page_size(reader)) return false; /* @@ -4416,7 +4426,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) return ((iter->head_page == commit_page && iter->head >= commit) || (iter->head_page == reader && commit_page == head_page && head_page->read == commit && - iter->head == rb_page_commit(cpu_buffer->reader_page))); + iter->head == rb_page_size(cpu_buffer->reader_page))); } EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); @@ -5211,6 +5221,22 @@ static void rb_clear_buffer_page(struct buffer_page *page) page->read = 0; } +static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct trace_buffer_meta *meta = cpu_buffer->meta_page; + + meta->reader.read = cpu_buffer->reader_page->read; + meta->reader.id = cpu_buffer->reader_page->id; + meta->reader.lost_events = cpu_buffer->lost_events; + + meta->entries = local_read(&cpu_buffer->entries); + meta->overrun = local_read(&cpu_buffer->overrun); + meta->read = cpu_buffer->read; + + /* Some archs do not have data cache coherency between kernel and user-space */ + flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { @@ -5255,6 +5281,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; + if (cpu_buffer->mapped) + rb_update_meta_page(cpu_buffer); + rb_head_page_activate(cpu_buffer); cpu_buffer->pages_removed = 0; } @@ -5469,6 +5498,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; + /* It's up to the callers to not try to swap mapped buffers */ + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { + ret = -EBUSY; + goto out; + } + /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; @@ -5579,7 +5614,7 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) goto out; page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY | __GFP_ZERO, + GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) { kfree(bpage); @@ -5720,7 +5755,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, event = rb_reader_event(cpu_buffer); read = reader->read; - commit = rb_page_commit(reader); + commit = rb_page_size(reader); /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; @@ -5733,7 +5768,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, * Otherwise, we can simply swap the page with the one passed in. */ if (read || (len < (commit - read)) || - cpu_buffer->reader_page == cpu_buffer->commit_page) { + cpu_buffer->reader_page == cpu_buffer->commit_page || + cpu_buffer->mapped) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -5796,7 +5832,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); - cpu_buffer->read_bytes += rb_page_commit(reader); + cpu_buffer->read_bytes += rb_page_size(reader); /* swap the pages */ rb_init_page(bpage); @@ -5956,6 +5992,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) cpu_buffer = buffer->buffers[cpu]; + if (cpu_buffer->mapped) { + err = -EBUSY; + goto error; + } + /* Update the number of pages to match the new size */ nr_pages = old_size * buffer->buffers[cpu]->nr_pages; nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); @@ -6057,6 +6098,414 @@ error: } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); +static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct page *page; + + if (cpu_buffer->meta_page) + return 0; + + page = alloc_page(GFP_USER | __GFP_ZERO); + if (!page) + return -ENOMEM; + + cpu_buffer->meta_page = page_to_virt(page); + + return 0; +} + +static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long addr = (unsigned long)cpu_buffer->meta_page; + + free_page(addr); + cpu_buffer->meta_page = NULL; +} + +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long *subbuf_ids) +{ + struct trace_buffer_meta *meta = cpu_buffer->meta_page; + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; + struct buffer_page *first_subbuf, *subbuf; + int id = 0; + + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; + cpu_buffer->reader_page->id = id++; + + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); + do { + if (WARN_ON(id >= nr_subbufs)) + break; + + subbuf_ids[id] = (unsigned long)subbuf->page; + subbuf->id = id; + + rb_inc_page(&subbuf); + id++; + } while (subbuf != first_subbuf); + + /* install subbuf ID to kern VA translation */ + cpu_buffer->subbuf_ids = subbuf_ids; + + meta->meta_page_size = PAGE_SIZE; + meta->meta_struct_len = sizeof(*meta); + meta->nr_subbufs = nr_subbufs; + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; + + rb_update_meta_page(cpu_buffer); +} + +static struct ring_buffer_per_cpu * +rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return ERR_PTR(-EINVAL); + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (!cpu_buffer->mapped) { + mutex_unlock(&cpu_buffer->mapping_lock); + return ERR_PTR(-ENODEV); + } + + return cpu_buffer; +} + +static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + mutex_unlock(&cpu_buffer->mapping_lock); +} + +/* + * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need + * to be set-up or torn-down. + */ +static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, + bool inc) +{ + unsigned long flags; + + lockdep_assert_held(&cpu_buffer->mapping_lock); + + if (inc && cpu_buffer->mapped == UINT_MAX) + return -EBUSY; + + if (WARN_ON(!inc && cpu_buffer->mapped == 0)) + return -EINVAL; + + mutex_lock(&cpu_buffer->buffer->mutex); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + if (inc) + cpu_buffer->mapped++; + else + cpu_buffer->mapped--; + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + mutex_unlock(&cpu_buffer->buffer->mutex); + + return 0; +} + +/* + * +--------------+ pgoff == 0 + * | meta page | + * +--------------+ pgoff == 1 + * | subbuffer 0 | + * | | + * +--------------+ pgoff == (1 + (1 << subbuf_order)) + * | subbuffer 1 | + * | | + * ... + */ +#ifdef CONFIG_MMU +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, + struct vm_area_struct *vma) +{ + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff; + unsigned int subbuf_pages, subbuf_order; + struct page **pages; + int p = 0, s = 0; + int err; + + /* Refuse MP_PRIVATE or writable mappings */ + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC || + !(vma->vm_flags & VM_MAYSHARE)) + return -EPERM; + + /* + * Make sure the mapping cannot become writable later. Also tell the VM + * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). + */ + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP, + VM_MAYWRITE); + + lockdep_assert_held(&cpu_buffer->mapping_lock); + + subbuf_order = cpu_buffer->buffer->subbuf_order; + subbuf_pages = 1 << subbuf_order; + + nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ + nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */ + + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + if (!vma_pages || vma_pages > nr_pages) + return -EINVAL; + + nr_pages = vma_pages; + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + if (!pgoff) { + pages[p++] = virt_to_page(cpu_buffer->meta_page); + + /* + * TODO: Align sub-buffers on their size, once + * vm_insert_pages() supports the zero-page. + */ + } else { + /* Skip the meta-page */ + pgoff--; + + if (pgoff % subbuf_pages) { + err = -EINVAL; + goto out; + } + + s += pgoff / subbuf_pages; + } + + while (p < nr_pages) { + struct page *page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); + int off = 0; + + if (WARN_ON_ONCE(s >= nr_subbufs)) { + err = -EINVAL; + goto out; + } + + for (; off < (1 << (subbuf_order)); off++, page++) { + if (p >= nr_pages) + break; + + pages[p++] = page; + } + s++; + } + + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); + +out: + kfree(pages); + + return err; +} +#else +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, + struct vm_area_struct *vma) +{ + return -EOPNOTSUPP; +} +#endif + +int ring_buffer_map(struct trace_buffer *buffer, int cpu, + struct vm_area_struct *vma) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags, *subbuf_ids; + int err = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (cpu_buffer->mapped) { + err = __rb_map_vma(cpu_buffer, vma); + if (!err) + err = __rb_inc_dec_mapped(cpu_buffer, true); + mutex_unlock(&cpu_buffer->mapping_lock); + return err; + } + + /* prevent another thread from changing buffer/sub-buffer sizes */ + mutex_lock(&buffer->mutex); + + err = rb_alloc_meta_page(cpu_buffer); + if (err) + goto unlock; + + /* subbuf_ids include the reader while nr_pages does not */ + subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); + if (!subbuf_ids) { + rb_free_meta_page(cpu_buffer); + err = -ENOMEM; + goto unlock; + } + + atomic_inc(&cpu_buffer->resize_disabled); + + /* + * Lock all readers to block any subbuf swap until the subbuf IDs are + * assigned. + */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_setup_ids_meta_page(cpu_buffer, subbuf_ids); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + err = __rb_map_vma(cpu_buffer, vma); + if (!err) { + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + cpu_buffer->mapped = 1; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + } else { + kfree(cpu_buffer->subbuf_ids); + cpu_buffer->subbuf_ids = NULL; + rb_free_meta_page(cpu_buffer); + } + +unlock: + mutex_unlock(&buffer->mutex); + mutex_unlock(&cpu_buffer->mapping_lock); + + return err; +} + +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + int err = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (!cpu_buffer->mapped) { + err = -ENODEV; + goto out; + } else if (cpu_buffer->mapped > 1) { + __rb_inc_dec_mapped(cpu_buffer, false); + goto out; + } + + mutex_lock(&buffer->mutex); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + cpu_buffer->mapped = 0; + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + kfree(cpu_buffer->subbuf_ids); + cpu_buffer->subbuf_ids = NULL; + rb_free_meta_page(cpu_buffer); + atomic_dec(&cpu_buffer->resize_disabled); + + mutex_unlock(&buffer->mutex); + +out: + mutex_unlock(&cpu_buffer->mapping_lock); + + return err; +} + +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *reader; + unsigned long missed_events; + unsigned long reader_size; + unsigned long flags; + + cpu_buffer = rb_get_mapped_buffer(buffer, cpu); + if (IS_ERR(cpu_buffer)) + return (int)PTR_ERR(cpu_buffer); + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + +consume: + if (rb_per_cpu_empty(cpu_buffer)) + goto out; + + reader_size = rb_page_size(cpu_buffer->reader_page); + + /* + * There are data to be read on the current reader page, we can + * return to the caller. But before that, we assume the latter will read + * everything. Let's update the kernel reader accordingly. + */ + if (cpu_buffer->reader_page->read < reader_size) { + while (cpu_buffer->reader_page->read < reader_size) + rb_advance_reader(cpu_buffer); + goto out; + } + + reader = rb_get_reader_page(cpu_buffer); + if (WARN_ON(!reader)) + goto out; + + /* Check if any events were dropped */ + missed_events = cpu_buffer->lost_events; + + if (cpu_buffer->reader_page != cpu_buffer->commit_page) { + if (missed_events) { + struct buffer_data_page *bpage = reader->page; + unsigned int commit; + /* + * Use the real_end for the data size, + * This gives us a chance to store the lost events + * on the page. + */ + if (reader->real_end) + local_set(&bpage->commit, reader->real_end); + /* + * If there is room at the end of the page to save the + * missed events, then record it there. + */ + commit = rb_page_size(reader); + if (buffer->subbuf_size - commit >= sizeof(missed_events)) { + memcpy(&bpage->data[commit], &missed_events, + sizeof(missed_events)); + local_add(RB_MISSED_STORED, &bpage->commit); + } + local_add(RB_MISSED_EVENTS, &bpage->commit); + } + } else { + /* + * There really shouldn't be any missed events if the commit + * is on the reader page. + */ + WARN_ON_ONCE(missed_events); + } + + cpu_buffer->lost_events = 0; + + goto consume; + +out: + /* Some archs do not have data cache coherency between kernel and user-space */ + flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); + + rb_update_meta_page(cpu_buffer); + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + rb_put_mapped_buffer(cpu_buffer); + + return 0; +} + /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 233d1af39fff..578a49ff5c32 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1191,6 +1191,12 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr, return; } + if (tr->mapped) { + trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); + return; + } + local_irq_save(flags); update_max_tr(tr, current, smp_processor_id(), cond_data); local_irq_restore(flags); @@ -1323,7 +1329,7 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr) lockdep_assert_held(&trace_types_lock); spin_lock(&tr->snapshot_trigger_lock); - if (tr->snapshot == UINT_MAX) { + if (tr->snapshot == UINT_MAX || tr->mapped) { spin_unlock(&tr->snapshot_trigger_lock); return -EBUSY; } @@ -5540,7 +5546,7 @@ static const char readme_msg[] = "\t kernel return probes support: $retval, $arg<N>, $comm\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" - "\t symstr, <type>\\[<array-size>\\]\n" + "\t symstr, %pd/%pD, <type>\\[<array-size>\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: <stype> <name>;\n" "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" @@ -6068,7 +6074,7 @@ static void tracing_set_nop(struct trace_array *tr) { if (tr->current_trace == &nop_trace) return; - + tr->current_trace->enabled--; if (tr->current_trace->reset) @@ -8194,15 +8200,32 @@ out: return ret; } -/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; + int err; + + if (cmd == TRACE_MMAP_IOCTL_GET_READER) { + if (!(file->f_flags & O_NONBLOCK)) { + err = ring_buffer_wait(iter->array_buffer->buffer, + iter->cpu_file, + iter->tr->buffer_percent, + NULL, NULL); + if (err) + return err; + } - if (cmd) - return -ENOIOCTLCMD; + return ring_buffer_map_get_reader(iter->array_buffer->buffer, + iter->cpu_file); + } else if (cmd) { + return -ENOTTY; + } + /* + * An ioctl call with cmd 0 to the ring buffer file will wake up all + * waiters + */ mutex_lock(&trace_types_lock); /* Make sure the waiters see the new wait_index */ @@ -8214,6 +8237,76 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned return 0; } +#ifdef CONFIG_TRACER_MAX_TRACE +static int get_snapshot_map(struct trace_array *tr) +{ + int err = 0; + + /* + * Called with mmap_lock held. lockdep would be unhappy if we would now + * take trace_types_lock. Instead use the specific + * snapshot_trigger_lock. + */ + spin_lock(&tr->snapshot_trigger_lock); + + if (tr->snapshot || tr->mapped == UINT_MAX) + err = -EBUSY; + else + tr->mapped++; + + spin_unlock(&tr->snapshot_trigger_lock); + + /* Wait for update_max_tr() to observe iter->tr->mapped */ + if (tr->mapped == 1) + synchronize_rcu(); + + return err; + +} +static void put_snapshot_map(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->mapped)) + tr->mapped--; + spin_unlock(&tr->snapshot_trigger_lock); +} +#else +static inline int get_snapshot_map(struct trace_array *tr) { return 0; } +static inline void put_snapshot_map(struct trace_array *tr) { } +#endif + +static void tracing_buffers_mmap_close(struct vm_area_struct *vma) +{ + struct ftrace_buffer_info *info = vma->vm_file->private_data; + struct trace_iterator *iter = &info->iter; + + WARN_ON(ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file)); + put_snapshot_map(iter->tr); +} + +static const struct vm_operations_struct tracing_buffers_vmops = { + .close = tracing_buffers_mmap_close, +}; + +static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; + int ret = 0; + + ret = get_snapshot_map(iter->tr); + if (ret) + return ret; + + ret = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma); + if (ret) + put_snapshot_map(iter->tr); + + vma->vm_ops = &tracing_buffers_vmops; + + return ret; +} + static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, @@ -8223,6 +8316,7 @@ static const struct file_operations tracing_buffers_fops = { .splice_read = tracing_buffers_splice_read, .unlocked_ioctl = tracing_buffers_ioctl, .llseek = no_llseek, + .mmap = tracing_buffers_mmap, }; static ssize_t diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 64450615ca0c..749a182dab48 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -336,6 +336,7 @@ struct trace_array { bool allocated_snapshot; spinlock_t snapshot_trigger_lock; unsigned int snapshot; + unsigned int mapped; unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 811b08439406..e19c32f2a938 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -104,7 +104,7 @@ static void trace_do_benchmark(void) stddev = 0; delta = bm_total; - delta = div64_u64(delta, bm_cnt); + do_div(delta, (u32)bm_cnt); avg = delta; if (stddev > 0) { diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 70d428c394b6..3a2b46847c8b 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1990,6 +1990,80 @@ static int user_event_set_tp_name(struct user_event *user) } /* + * Counts how many ';' without a trailing space are in the args. + */ +static int count_semis_no_space(char *args) +{ + int count = 0; + + while ((args = strchr(args, ';'))) { + args++; + + if (!isspace(*args)) + count++; + } + + return count; +} + +/* + * Copies the arguments while ensuring all ';' have a trailing space. + */ +static char *insert_space_after_semis(char *args, int count) +{ + char *fixed, *pos; + int len; + + len = strlen(args) + count; + fixed = kmalloc(len + 1, GFP_KERNEL); + + if (!fixed) + return NULL; + + pos = fixed; + + /* Insert a space after ';' if there is no trailing space. */ + while (*args) { + *pos = *args++; + + if (*pos++ == ';' && !isspace(*args)) + *pos++ = ' '; + } + + *pos = '\0'; + + return fixed; +} + +static char **user_event_argv_split(char *args, int *argc) +{ + char **split; + char *fixed; + int count; + + /* Count how many ';' without a trailing space */ + count = count_semis_no_space(args); + + /* No fixup is required */ + if (!count) + return argv_split(GFP_KERNEL, args, argc); + + /* We must fixup 'field;field' to 'field; field' */ + fixed = insert_space_after_semis(args, count); + + if (!fixed) + return NULL; + + /* We do a normal split afterwards */ + split = argv_split(GFP_KERNEL, fixed, argc); + + /* We can free since argv_split makes a copy */ + kfree(fixed); + + return split; +} + +/* * Parses the event name, arguments and flags then registers if successful. * The name buffer lifetime is owned by this method for success cases only. * Upon success the returned user_event has its ref count increased by 1. @@ -2012,7 +2086,7 @@ static int user_event_parse(struct user_event_group *group, char *name, return -EPERM; if (args) { - argv = argv_split(GFP_KERNEL, args, &argc); + argv = user_event_argv_split(args, &argc); if (!argv) return -ENOMEM; @@ -2833,7 +2907,6 @@ static struct ctl_table user_event_sysctls[] = { .mode = 0644, .proc_handler = set_max_user_events_sysctl, }, - {} }; static int __init trace_events_user_init(void) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 4f4280815522..62e6a8f4aae9 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -994,6 +994,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) char gbuf[MAX_EVENT_NAME_LEN]; char sbuf[KSYM_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; + char *dbuf = NULL; bool is_tracepoint = false; struct tracepoint *tpoint = NULL; struct traceprobe_parse_context ctx = { @@ -1104,6 +1105,10 @@ static int __trace_fprobe_create(int argc, const char *argv[]) argv = new_argv; } + ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); + if (ret) + goto out; + /* setup a probe */ tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, argc, is_return); @@ -1154,6 +1159,7 @@ out: trace_probe_log_clear(); kfree(new_argv); kfree(symbol); + kfree(dbuf); return ret; parse_error: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 14099cc17fc9..16383247bdbf 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -111,6 +111,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; } +#ifdef CONFIG_MODULES static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) { char *p; @@ -129,6 +130,12 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) return ret; } +#else +static inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) +{ + return false; +} +#endif static bool trace_kprobe_is_busy(struct dyn_event *ev) { @@ -670,6 +677,7 @@ end: return ret; } +#ifdef CONFIG_MODULES /* Module notifier call back, checking event on the module */ static int trace_kprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -704,6 +712,16 @@ static struct notifier_block trace_kprobe_module_nb = { .notifier_call = trace_kprobe_module_callback, .priority = 1 /* Invoked after kprobe module callback */ }; +static int trace_kprobe_register_module_notifier(void) +{ + return register_module_notifier(&trace_kprobe_module_nb); +} +#else +static int trace_kprobe_register_module_notifier(void) +{ + return 0; +} +#endif /* CONFIG_MODULES */ static int count_symbols(void *data, unsigned long unused) { @@ -782,6 +800,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; + char *dbuf = NULL; struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; switch (argv[0][0]) { @@ -933,6 +952,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) argv = new_argv; } + ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); + if (ret) + goto out; + /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, argc, is_return); @@ -979,6 +1002,7 @@ out: trace_probe_log_clear(); kfree(new_argv); kfree(symbol); + kfree(dbuf); return ret; parse_error: @@ -1933,7 +1957,7 @@ static __init int init_kprobe_trace_early(void) if (ret) return ret; - if (register_module_notifier(&trace_kprobe_module_nb)) + if (trace_kprobe_register_module_notifier()) return -EINVAL; return 0; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index c3f2937b434a..5e263c141574 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "trace_probe: " fmt #include <linux/bpf.h> +#include <linux/fs.h> #include "trace_btf.h" #include "trace_probe.h" @@ -1737,6 +1738,68 @@ error: return ERR_PTR(ret); } +/* @buf: *buf must be equal to NULL. Caller must to free *buf */ +int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) +{ + int i, used, ret; + const int bufsize = MAX_DENTRY_ARGS_LEN; + char *tmpbuf = NULL; + + if (*buf) + return -EINVAL; + + used = 0; + for (i = 0; i < argc; i++) { + char *tmp; + char *equal; + size_t arg_len; + + if (!glob_match("*:%p[dD]", argv[i])) + continue; + + if (!tmpbuf) { + tmpbuf = kmalloc(bufsize, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + } + + tmp = kstrdup(argv[i], GFP_KERNEL); + if (!tmp) + goto nomem; + + equal = strchr(tmp, '='); + if (equal) + *equal = '\0'; + arg_len = strlen(argv[i]); + tmp[arg_len - 4] = '\0'; + if (argv[i][arg_len - 1] == 'd') + ret = snprintf(tmpbuf + used, bufsize - used, + "%s%s+0x0(+0x%zx(%s)):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + equal ? equal + 1 : tmp); + else + ret = snprintf(tmpbuf + used, bufsize - used, + "%s%s+0x0(+0x%zx(+0x%zx(%s))):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + offsetof(struct file, f_path.dentry), + equal ? equal + 1 : tmp); + + kfree(tmp); + if (ret >= bufsize - used) + goto nomem; + argv[i] = tmpbuf + used; + used += ret + 1; + } + + *buf = tmpbuf; + return 0; +nomem: + kfree(tmpbuf); + return -ENOMEM; +} + void traceprobe_finish_parse(struct traceprobe_parse_context *ctx) { clear_btf_context(ctx); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index cef3a50628a3..5803e6a41570 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -34,6 +34,7 @@ #define MAX_ARRAY_LEN 64 #define MAX_ARG_NAME_LEN 32 #define MAX_BTF_ARGS_LEN 128 +#define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX #define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) @@ -428,6 +429,7 @@ extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char **traceprobe_expand_meta_args(int argc, const char *argv[], int *new_argc, char *buf, int bufsize, struct traceprobe_parse_context *ctx); +extern int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9e461362450a..8541fa1494ae 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -854,6 +854,7 @@ static const struct file_operations uprobe_profile_ops = { struct uprobe_cpu_buffer { struct mutex mutex; void *buf; + int dsize; }; static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; static int uprobe_buffer_refcnt; @@ -940,30 +941,56 @@ static struct uprobe_cpu_buffer *uprobe_buffer_get(void) static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) { + if (!ucb) + return; mutex_unlock(&ucb->mutex); } +static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, + struct pt_regs *regs, + struct uprobe_cpu_buffer **ucbp) +{ + struct uprobe_cpu_buffer *ucb; + int dsize, esize; + + if (*ucbp) + return *ucbp; + + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + dsize = __get_data_size(&tu->tp, regs, NULL); + + ucb = uprobe_buffer_get(); + ucb->dsize = tu->tp.size + dsize; + + store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); + + *ucbp = ucb; + return ucb; +} + static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize, + struct uprobe_cpu_buffer **ucbp, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; struct trace_event_buffer fbuffer; + struct uprobe_cpu_buffer *ucb; void *data; int size, esize; struct trace_event_call *call = trace_probe_event_call(&tu->tp); WARN_ON(call != trace_file->event_call); - if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) + ucb = prepare_uprobe_buffer(tu, regs, ucbp); + if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE)) return; if (trace_trigger_soft_disabled(trace_file)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = esize + tu->tp.size + dsize; + size = esize + ucb->dsize; entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); if (!entry) return; @@ -977,14 +1004,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - memcpy(data, ucb->buf, tu->tp.size + dsize); + memcpy(data, ucb->buf, ucb->dsize); trace_event_buffer_commit(&fbuffer); } /* uprobe handler */ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; @@ -993,7 +1020,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); + __uprobe_trace_func(tu, 0, regs, ucbp, link->file); rcu_read_unlock(); return 0; @@ -1001,13 +1028,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); + __uprobe_trace_func(tu, func, regs, ucbp, link->file); rcu_read_unlock(); } @@ -1199,9 +1226,6 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) { struct perf_event *event; - if (filter->nr_systemwide) - return true; - list_for_each_entry(event, &filter->perf_events, hw.tp_list) { if (event->hw.target->mm == mm) return true; @@ -1326,6 +1350,13 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, tu = container_of(uc, struct trace_uprobe, consumer); filter = tu->tp.event->filter; + /* + * speculative short-circuiting check to avoid unnecessarily taking + * filter->rwlock below, if the uprobe has system-wide consumer + */ + if (READ_ONCE(filter->nr_systemwide)) + return true; + read_lock(&filter->rwlock); ret = __uprobe_perf_filter(filter, mm); read_unlock(&filter->rwlock); @@ -1335,10 +1366,11 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { struct trace_event_call *call = trace_probe_event_call(&tu->tp); struct uprobe_trace_entry_head *entry; + struct uprobe_cpu_buffer *ucb; struct hlist_head *head; void *data; int size, esize; @@ -1356,7 +1388,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = esize + tu->tp.size + dsize; + ucb = prepare_uprobe_buffer(tu, regs, ucbp); + size = esize + ucb->dsize; size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; @@ -1379,13 +1412,10 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - memcpy(data, ucb->buf, tu->tp.size + dsize); - - if (size - esize > tu->tp.size + dsize) { - int len = tu->tp.size + dsize; + memcpy(data, ucb->buf, ucb->dsize); - memset(data + len, 0, size - esize - len); - } + if (size - esize > ucb->dsize) + memset(data + ucb->dsize, 0, size - esize - ucb->dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); @@ -1395,21 +1425,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, /* uprobe profile handler */ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - __uprobe_perf_func(tu, 0, regs, ucb, dsize); + __uprobe_perf_func(tu, 0, regs, ucbp); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { - __uprobe_perf_func(tu, func, regs, ucb, dsize); + __uprobe_perf_func(tu, func, regs, ucbp); } int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, @@ -1474,11 +1504,9 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; - struct uprobe_cpu_buffer *ucb; - int dsize, esize; + struct uprobe_cpu_buffer *ucb = NULL; int ret = 0; - tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; @@ -1490,18 +1518,12 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs, NULL); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - ret |= uprobe_trace_func(tu, regs, ucb, dsize); + ret |= uprobe_trace_func(tu, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - ret |= uprobe_perf_func(tu, regs, ucb, dsize); + ret |= uprobe_perf_func(tu, regs, &ucb); #endif uprobe_buffer_put(ucb); return ret; @@ -1512,8 +1534,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; - struct uprobe_cpu_buffer *ucb; - int dsize, esize; + struct uprobe_cpu_buffer *ucb = NULL; tu = container_of(con, struct trace_uprobe, consumer); @@ -1525,18 +1546,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs, NULL); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - uretprobe_trace_func(tu, func, regs, ucb, dsize); + uretprobe_trace_func(tu, func, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - uretprobe_perf_func(tu, func, regs, ucb, dsize); + uretprobe_perf_func(tu, func, regs, &ucb); #endif uprobe_buffer_put(ucb); return 0; diff --git a/kernel/ucount.c b/kernel/ucount.c index d9e283600f5c..8c07714ff27d 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -38,7 +38,7 @@ static int set_is_seen(struct ctl_table_set *set) } static int set_permissions(struct ctl_table_header *head, - struct ctl_table *table) + const struct ctl_table *table) { struct user_namespace *user_ns = container_of(head->set, struct user_namespace, set); @@ -87,7 +87,6 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_fanotify_groups"), UCOUNT_ENTRY("max_fanotify_marks"), #endif - { } }; #endif /* CONFIG_SYSCTL */ @@ -96,7 +95,7 @@ bool setup_userns_sysctls(struct user_namespace *ns) #ifdef CONFIG_SYSCTL struct ctl_table *tbl; - BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS + 1); + BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS); setup_sysctl_set(&ns->set, &set_root, set_is_seen); tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); if (tbl) { diff --git a/kernel/umh.c b/kernel/umh.c index 1b13c5d34624..598b3ffe1522 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -560,7 +560,6 @@ static struct ctl_table usermodehelper_table[] = { .mode = 0600, .proc_handler = proc_cap_handler, }, - { } }; static int __init init_umh_sysctls(void) diff --git a/kernel/user.c b/kernel/user.c index 03cedc366dc9..aa1162deafe4 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -88,7 +88,7 @@ EXPORT_SYMBOL_GPL(init_user_ns); * when changing user ID's (ie setuid() and friends). */ -#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) +#define UIDHASH_BITS (IS_ENABLED(CONFIG_BASE_SMALL) ? 3 : 7) #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 019e3a1566cf..76a772072557 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -120,7 +120,6 @@ static struct ctl_table uts_kern_table[] = { .proc_handler = proc_do_uts_string, .poll = &domainname_poll, }, - {} }; #ifdef CONFIG_PROC_SYSCTL diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d12ff74889ed..941236828de8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -1155,7 +1155,6 @@ static struct ctl_table watchdog_sysctls[] = { }, #endif /* CONFIG_SMP */ #endif - {} }; static struct ctl_table watchdog_hardlockup_sysctl[] = { @@ -1168,7 +1167,6 @@ static struct ctl_table watchdog_hardlockup_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static void __init watchdog_sysctl_init(void) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 80882ae43261..003474c9a77d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -99,6 +99,7 @@ enum worker_flags { enum work_cancel_flags { WORK_CANCEL_DELAYED = 1 << 0, /* canceling a delayed_work */ + WORK_CANCEL_DISABLE = 1 << 1, /* canceling to disable */ }; enum wq_internal_consts { @@ -392,6 +393,12 @@ struct wq_pod_type { int *cpu_pod; /* cpu -> pod */ }; +struct work_offq_data { + u32 pool_id; + u32 disable; + u32 flags; +}; + static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { [WQ_AFFN_DFL] = "default", [WQ_AFFN_CPU] = "cpu", @@ -490,12 +497,6 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; /* - * Used to synchronize multiple cancel_sync attempts on the same work item. See - * work_grab_pending() and __cancel_work_sync(). - */ -static DECLARE_WAIT_QUEUE_HEAD(wq_cancel_waitq); - -/* * I: kthread_worker to release pwq's. pwq release needs to be bounced to a * process context while holding a pool lock. Bounce to a dedicated kthread * worker to avoid A-A deadlocks. @@ -763,6 +764,11 @@ static int work_next_color(int color) return (color + 1) % WORK_NR_COLORS; } +static unsigned long pool_offq_flags(struct worker_pool *pool) +{ + return (pool->flags & POOL_BH) ? WORK_OFFQ_BH : 0; +} + /* * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data * contain the pointer to the queued pwq. Once execution starts, the flag @@ -776,11 +782,6 @@ static int work_next_color(int color) * corresponding to a work. Pool is available once the work has been * queued anywhere after initialization until it is sync canceled. pwq is * available only while the work item is queued. - * - * %WORK_OFFQ_CANCELING is used to mark a work item which is being - * canceled. While being canceled, a work item may have its PENDING set - * but stay off timer and worklist for arbitrarily long and nobody should - * try to steal the PENDING bit. */ static inline void set_work_data(struct work_struct *work, unsigned long data) { @@ -892,36 +893,26 @@ static struct worker_pool *get_work_pool(struct work_struct *work) return idr_find(&worker_pool_idr, pool_id); } -/** - * get_work_pool_id - return the worker pool ID a given work is associated with - * @work: the work item of interest - * - * Return: The worker_pool ID @work was last associated with. - * %WORK_OFFQ_POOL_NONE if none. - */ -static int get_work_pool_id(struct work_struct *work) +static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits) { - unsigned long data = atomic_long_read(&work->data); - - if (data & WORK_STRUCT_PWQ) - return work_struct_pwq(data)->pool->id; - - return data >> WORK_OFFQ_POOL_SHIFT; + return (v >> shift) & ((1 << bits) - 1); } -static void mark_work_canceling(struct work_struct *work) +static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data) { - unsigned long pool_id = get_work_pool_id(work); + WARN_ON_ONCE(data & WORK_STRUCT_PWQ); - pool_id <<= WORK_OFFQ_POOL_SHIFT; - set_work_data(work, pool_id | WORK_STRUCT_PENDING | WORK_OFFQ_CANCELING); + offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT, + WORK_OFFQ_POOL_BITS); + offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT, + WORK_OFFQ_DISABLE_BITS); + offqd->flags = data & WORK_OFFQ_FLAG_MASK; } -static bool work_is_canceling(struct work_struct *work) +static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd) { - unsigned long data = atomic_long_read(&work->data); - - return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING); + return ((unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT) | + ((unsigned long)offqd->flags); } /* @@ -2067,8 +2058,6 @@ out_put: * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry - * -ENOENT if someone else is canceling @work, this state may persist - * for arbitrarily long * ======== ================================================================ * * Note: @@ -2151,7 +2140,8 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, * this destroys work->data needed by the next step, stash it. */ work_data = *work_data_bits(work); - set_work_pool_and_keep_pending(work, pool->id, 0); + set_work_pool_and_keep_pending(work, pool->id, + pool_offq_flags(pool)); /* must be the last step, see the function comment */ pwq_dec_nr_in_flight(pwq, work_data); @@ -2164,26 +2154,9 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, fail: rcu_read_unlock(); local_irq_restore(*irq_flags); - if (work_is_canceling(work)) - return -ENOENT; - cpu_relax(); return -EAGAIN; } -struct cwt_wait { - wait_queue_entry_t wait; - struct work_struct *work; -}; - -static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) -{ - struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); - - if (cwait->work != key) - return 0; - return autoremove_wake_function(wait, mode, sync, key); -} - /** * work_grab_pending - steal work item from worklist and disable irq * @work: work item to steal @@ -2193,7 +2166,7 @@ static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *k * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer * or on worklist. * - * Must be called in process context. IRQ is disabled on return with IRQ state + * Can be called from any context. IRQ is disabled on return with IRQ state * stored in *@irq_flags. The caller is responsible for re-enabling it using * local_irq_restore(). * @@ -2202,41 +2175,14 @@ static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *k static bool work_grab_pending(struct work_struct *work, u32 cflags, unsigned long *irq_flags) { - struct cwt_wait cwait; int ret; - might_sleep(); -repeat: - ret = try_to_grab_pending(work, cflags, irq_flags); - if (likely(ret >= 0)) - return ret; - if (ret != -ENOENT) - goto repeat; - - /* - * Someone is already canceling. Wait for it to finish. flush_work() - * doesn't work for PREEMPT_NONE because we may get woken up between - * @work's completion and the other canceling task resuming and clearing - * CANCELING - flush_work() will return false immediately as @work is no - * longer busy, try_to_grab_pending() will return -ENOENT as @work is - * still being canceled and the other canceling task won't be able to - * clear CANCELING as we're hogging the CPU. - * - * Let's wait for completion using a waitqueue. As this may lead to the - * thundering herd problem, use a custom wake function which matches - * @work along with exclusive wait and wakeup. - */ - init_wait(&cwait.wait); - cwait.wait.func = cwt_wakefn; - cwait.work = work; - - prepare_to_wait_exclusive(&wq_cancel_waitq, &cwait.wait, - TASK_UNINTERRUPTIBLE); - if (work_is_canceling(work)) - schedule(); - finish_wait(&wq_cancel_waitq, &cwait.wait); - - goto repeat; + while (true) { + ret = try_to_grab_pending(work, cflags, irq_flags); + if (ret >= 0) + return ret; + cpu_relax(); + } } /** @@ -2422,6 +2368,21 @@ out: rcu_read_unlock(); } +static bool clear_pending_if_disabled(struct work_struct *work) +{ + unsigned long data = *work_data_bits(work); + struct work_offq_data offqd; + + if (likely((data & WORK_STRUCT_PWQ) || + !(data & WORK_OFFQ_DISABLE_MASK))) + return false; + + work_offqd_unpack(&offqd, data); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); + return true; +} + /** * queue_work_on - queue work on specific cpu * @cpu: CPU number to execute work on @@ -2444,7 +2405,8 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, local_irq_save(irq_flags); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !clear_pending_if_disabled(work)) { __queue_work(cpu, wq, work); ret = true; } @@ -2522,7 +2484,8 @@ bool queue_work_node(int node, struct workqueue_struct *wq, local_irq_save(irq_flags); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !clear_pending_if_disabled(work)) { int cpu = select_numa_node_cpu(node); __queue_work(cpu, wq, work); @@ -2604,7 +2567,8 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, /* read the comment in __queue_work() */ local_irq_save(irq_flags); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !clear_pending_if_disabled(work)) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } @@ -2636,19 +2600,14 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { unsigned long irq_flags; - int ret; + bool ret; - do { - ret = try_to_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, - &irq_flags); - } while (unlikely(ret == -EAGAIN)); + ret = work_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, &irq_flags); - if (likely(ret >= 0)) { + if (!clear_pending_if_disabled(&dwork->work)) __queue_delayed_work(cpu, wq, dwork, delay); - local_irq_restore(irq_flags); - } - /* -ENOENT from try_to_grab_pending() becomes %true */ + local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL_GPL(mod_delayed_work_on); @@ -2677,7 +2636,12 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) { struct work_struct *work = &rwork->work; - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + /* + * rcu_work can't be canceled or disabled. Warn if the user reached + * inside @rwork and disabled the inner work. + */ + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !WARN_ON_ONCE(clear_pending_if_disabled(work))) { rwork->wq = wq; call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); return true; @@ -2953,7 +2917,7 @@ static void idle_worker_timeout(struct timer_list *t) unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ - worker = list_entry(pool->idle_list.prev, struct worker, entry); + worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; do_cull = !time_before(jiffies, expires); @@ -2995,7 +2959,7 @@ static void idle_cull_fn(struct work_struct *work) struct worker *worker; unsigned long expires; - worker = list_entry(pool->idle_list.prev, struct worker, entry); + worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { @@ -3230,7 +3194,7 @@ __acquires(&pool->lock) * PENDING and queued state changes happen together while IRQ is * disabled. */ - set_work_pool_and_clear_pending(work, pool->id, 0); + set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool)); pwq->stats[PWQ_STAT_STARTED]++; raw_spin_unlock_irq(&pool->lock); @@ -3700,7 +3664,7 @@ void workqueue_softirq_dead(unsigned int cpu) if (!need_more_worker(pool)) continue; - INIT_WORK(&dead_work.work, drain_dead_softirq_workfn); + INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn); dead_work.pool = pool; init_completion(&dead_work.done); @@ -3710,6 +3674,7 @@ void workqueue_softirq_dead(unsigned int cpu) queue_work(system_bh_wq, &dead_work.work); wait_for_completion(&dead_work.done); + destroy_work_on_stack(&dead_work.work); } } @@ -4154,8 +4119,6 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, struct pool_workqueue *pwq; struct workqueue_struct *wq; - might_sleep(); - rcu_read_lock(); pool = get_work_pool(work); if (!pool) { @@ -4207,6 +4170,7 @@ already_gone: static bool __flush_work(struct work_struct *work, bool from_cancel) { struct wq_barrier barr; + unsigned long data; if (WARN_ON(!wq_online)) return false; @@ -4214,13 +4178,41 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!work->func)) return false; - if (start_flush_work(work, &barr, from_cancel)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else { + if (!start_flush_work(work, &barr, from_cancel)) return false; + + /* + * start_flush_work() returned %true. If @from_cancel is set, we know + * that @work must have been executing during start_flush_work() and + * can't currently be queued. Its data must contain OFFQ bits. If @work + * was queued on a BH workqueue, we also know that it was running in the + * BH context and thus can be busy-waited. + */ + data = *work_data_bits(work); + if (from_cancel && + !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) { + /* + * On RT, prevent a live lock when %current preempted soft + * interrupt processing or prevents ksoftirqd from running by + * keeping flipping BH. If the BH work item runs on a different + * CPU then this has no effect other than doing the BH + * disable/enable dance for nothing. This is copied from + * kernel/softirq.c::tasklet_unlock_spin_wait(). + */ + while (!try_wait_for_completion(&barr.done)) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + local_bh_disable(); + local_bh_enable(); + } else { + cpu_relax(); + } + } + } else { + wait_for_completion(&barr.done); } + + destroy_work_on_stack(&barr.work); + return true; } /** @@ -4236,6 +4228,7 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) */ bool flush_work(struct work_struct *work) { + might_sleep(); return __flush_work(work, false); } EXPORT_SYMBOL_GPL(flush_work); @@ -4282,32 +4275,53 @@ bool flush_rcu_work(struct rcu_work *rwork) } EXPORT_SYMBOL(flush_rcu_work); +static void work_offqd_disable(struct work_offq_data *offqd) +{ + const unsigned long max = (1lu << WORK_OFFQ_DISABLE_BITS) - 1; + + if (likely(offqd->disable < max)) + offqd->disable++; + else + WARN_ONCE(true, "workqueue: work disable count overflowed\n"); +} + +static void work_offqd_enable(struct work_offq_data *offqd) +{ + if (likely(offqd->disable > 0)) + offqd->disable--; + else + WARN_ONCE(true, "workqueue: work disable count underflowed\n"); +} + static bool __cancel_work(struct work_struct *work, u32 cflags) { + struct work_offq_data offqd; unsigned long irq_flags; int ret; - do { - ret = try_to_grab_pending(work, cflags, &irq_flags); - } while (unlikely(ret == -EAGAIN)); + ret = work_grab_pending(work, cflags, &irq_flags); - if (unlikely(ret < 0)) - return false; + work_offqd_unpack(&offqd, *work_data_bits(work)); - set_work_pool_and_clear_pending(work, get_work_pool_id(work), 0); + if (cflags & WORK_CANCEL_DISABLE) + work_offqd_disable(&offqd); + + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); return ret; } static bool __cancel_work_sync(struct work_struct *work, u32 cflags) { - unsigned long irq_flags; bool ret; - /* claim @work and tell other tasks trying to grab @work to back off */ - ret = work_grab_pending(work, cflags, &irq_flags); - mark_work_canceling(work); - local_irq_restore(irq_flags); + ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE); + + if (*work_data_bits(work) & WORK_OFFQ_BH) + WARN_ON_ONCE(in_hardirq()); + else + might_sleep(); /* * Skip __flush_work() during early boot when we know that @work isn't @@ -4316,15 +4330,8 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags) if (wq_online) __flush_work(work, true); - /* - * smp_mb() at the end of set_work_pool_and_clear_pending() is paired - * with prepare_to_wait() above so that either waitqueue_active() is - * visible here or !work_is_canceling() is visible there. - */ - set_work_pool_and_clear_pending(work, WORK_OFFQ_POOL_NONE, 0); - - if (waitqueue_active(&wq_cancel_waitq)) - __wake_up(&wq_cancel_waitq, TASK_NORMAL, 1, work); + if (!(cflags & WORK_CANCEL_DISABLE)) + enable_work(work); return ret; } @@ -4342,19 +4349,19 @@ EXPORT_SYMBOL(cancel_work); * cancel_work_sync - cancel a work and wait for it to finish * @work: the work to cancel * - * Cancel @work and wait for its execution to finish. This function - * can be used even if the work re-queues itself or migrates to - * another workqueue. On return from this function, @work is - * guaranteed to be not pending or executing on any CPU. + * Cancel @work and wait for its execution to finish. This function can be used + * even if the work re-queues itself or migrates to another workqueue. On return + * from this function, @work is guaranteed to be not pending or executing on any + * CPU as long as there aren't racing enqueues. * - * cancel_work_sync(&delayed_work->work) must not be used for - * delayed_work's. Use cancel_delayed_work_sync() instead. + * cancel_work_sync(&delayed_work->work) must not be used for delayed_work's. + * Use cancel_delayed_work_sync() instead. * - * The caller must ensure that the workqueue on which @work was last - * queued can't be destroyed before this function returns. + * Must be called from a sleepable context if @work was last queued on a non-BH + * workqueue. Can also be called from non-hardirq atomic contexts including BH + * if @work was last queued on a BH workqueue. * - * Return: - * %true if @work was pending, %false otherwise. + * Returns %true if @work was pending, %false otherwise. */ bool cancel_work_sync(struct work_struct *work) { @@ -4400,6 +4407,108 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** + * disable_work - Disable and cancel a work item + * @work: work item to disable + * + * Disable @work by incrementing its disable count and cancel it if currently + * pending. As long as the disable count is non-zero, any attempt to queue @work + * will fail and return %false. The maximum supported disable depth is 2 to the + * power of %WORK_OFFQ_DISABLE_BITS, currently 65536. + * + * Can be called from any context. Returns %true if @work was pending, %false + * otherwise. + */ +bool disable_work(struct work_struct *work) +{ + return __cancel_work(work, WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_work); + +/** + * disable_work_sync - Disable, cancel and drain a work item + * @work: work item to disable + * + * Similar to disable_work() but also wait for @work to finish if currently + * executing. + * + * Must be called from a sleepable context if @work was last queued on a non-BH + * workqueue. Can also be called from non-hardirq atomic contexts including BH + * if @work was last queued on a BH workqueue. + * + * Returns %true if @work was pending, %false otherwise. + */ +bool disable_work_sync(struct work_struct *work) +{ + return __cancel_work_sync(work, WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_work_sync); + +/** + * enable_work - Enable a work item + * @work: work item to enable + * + * Undo disable_work[_sync]() by decrementing @work's disable count. @work can + * only be queued if its disable count is 0. + * + * Can be called from any context. Returns %true if the disable count reached 0. + * Otherwise, %false. + */ +bool enable_work(struct work_struct *work) +{ + struct work_offq_data offqd; + unsigned long irq_flags; + + work_grab_pending(work, 0, &irq_flags); + + work_offqd_unpack(&offqd, *work_data_bits(work)); + work_offqd_enable(&offqd); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); + local_irq_restore(irq_flags); + + return !offqd.disable; +} +EXPORT_SYMBOL_GPL(enable_work); + +/** + * disable_delayed_work - Disable and cancel a delayed work item + * @dwork: delayed work item to disable + * + * disable_work() for delayed work items. + */ +bool disable_delayed_work(struct delayed_work *dwork) +{ + return __cancel_work(&dwork->work, + WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_delayed_work); + +/** + * disable_delayed_work_sync - Disable, cancel and drain a delayed work item + * @dwork: delayed work item to disable + * + * disable_work_sync() for delayed work items. + */ +bool disable_delayed_work_sync(struct delayed_work *dwork) +{ + return __cancel_work_sync(&dwork->work, + WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_delayed_work_sync); + +/** + * enable_delayed_work - Enable a delayed work item + * @dwork: delayed work item to enable + * + * enable_work() for delayed work items. + */ +bool enable_delayed_work(struct delayed_work *dwork) +{ + return enable_work(&dwork->work); +} +EXPORT_SYMBOL_GPL(enable_delayed_work); + +/** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call * @@ -4530,6 +4639,8 @@ static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) { attrs->affn_scope = WQ_AFFN_NR_TYPES; attrs->ordered = false; + if (attrs->affn_strict) + cpumask_copy(attrs->cpumask, cpu_possible_mask); } /* hash value of the content of @attr */ @@ -4538,11 +4649,12 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) u32 hash = 0; hash = jhash_1word(attrs->nice, hash); - hash = jhash(cpumask_bits(attrs->cpumask), - BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + hash = jhash_1word(attrs->affn_strict, hash); hash = jhash(cpumask_bits(attrs->__pod_cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); - hash = jhash_1word(attrs->affn_strict, hash); + if (!attrs->affn_strict) + hash = jhash(cpumask_bits(attrs->cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); return hash; } @@ -4552,11 +4664,11 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, { if (a->nice != b->nice) return false; - if (!cpumask_equal(a->cpumask, b->cpumask)) + if (a->affn_strict != b->affn_strict) return false; if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) return false; - if (a->affn_strict != b->affn_strict) + if (!a->affn_strict && !cpumask_equal(a->cpumask, b->cpumask)) return false; return true; } @@ -7148,25 +7260,27 @@ static ssize_t __wq_cpumask_show(struct device *dev, return written; } -static ssize_t wq_unbound_cpumask_show(struct device *dev, +static ssize_t cpumask_requested_show(struct device *dev, struct device_attribute *attr, char *buf) { - return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); + return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); } +static DEVICE_ATTR_RO(cpumask_requested); -static ssize_t wq_requested_cpumask_show(struct device *dev, +static ssize_t cpumask_isolated_show(struct device *dev, struct device_attribute *attr, char *buf) { - return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); + return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); } +static DEVICE_ATTR_RO(cpumask_isolated); -static ssize_t wq_isolated_cpumask_show(struct device *dev, +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { - return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); + return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); } -static ssize_t wq_unbound_cpumask_store(struct device *dev, +static ssize_t cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { cpumask_var_t cpumask; @@ -7182,36 +7296,19 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev, free_cpumask_var(cpumask); return ret ? ret : count; } +static DEVICE_ATTR_RW(cpumask); -static struct device_attribute wq_sysfs_cpumask_attrs[] = { - __ATTR(cpumask, 0644, wq_unbound_cpumask_show, - wq_unbound_cpumask_store), - __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL), - __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL), - __ATTR_NULL, +static struct attribute *wq_sysfs_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + &dev_attr_cpumask_requested.attr, + &dev_attr_cpumask_isolated.attr, + NULL, }; +ATTRIBUTE_GROUPS(wq_sysfs_cpumask); static int __init wq_sysfs_init(void) { - struct device *dev_root; - int err; - - err = subsys_virtual_register(&wq_subsys, NULL); - if (err) - return err; - - dev_root = bus_get_dev_root(&wq_subsys); - if (dev_root) { - struct device_attribute *attr; - - for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) { - err = device_create_file(dev_root, attr); - if (err) - break; - } - put_device(dev_root); - } - return err; + return subsys_virtual_register(&wq_subsys, wq_sysfs_cpumask_groups); } core_initcall(wq_sysfs_init); |