aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/acct.c17
-rw-r--r--kernel/audit.c8
-rw-r--r--kernel/audit.h4
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/auditsc.c85
-rw-r--r--kernel/bounds.c7
-rw-r--r--kernel/bpf/Makefile5
-rw-r--r--kernel/bpf/arraymap.c54
-rw-r--r--kernel/bpf/bloom_filter.c2
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c246
-rw-r--r--kernel/bpf/bpf_inode_storage.c42
-rw-r--r--kernel/bpf/bpf_iter.c10
-rw-r--r--kernel/bpf/bpf_local_storage.c212
-rw-r--r--kernel/bpf/bpf_lsm.c46
-rw-r--r--kernel/bpf/bpf_struct_ops.c3
-rw-r--r--kernel/bpf/bpf_task_storage.c169
-rw-r--r--kernel/bpf/btf.c1324
-rw-r--r--kernel/bpf/cgroup.c185
-rw-r--r--kernel/bpf/cgroup_iter.c296
-rw-r--r--kernel/bpf/core.c53
-rw-r--r--kernel/bpf/cpumap.c37
-rw-r--r--kernel/bpf/devmap.c10
-rw-r--r--kernel/bpf/dispatcher.c52
-rw-r--r--kernel/bpf/hashtab.c245
-rw-r--r--kernel/bpf/helpers.c547
-rw-r--r--kernel/bpf/local_storage.c7
-rw-r--r--kernel/bpf/lpm_trie.c4
-rw-r--r--kernel/bpf/map_in_map.c61
-rw-r--r--kernel/bpf/memalloc.c677
-rw-r--r--kernel/bpf/offload.c6
-rw-r--r--kernel/bpf/percpu_freelist.c71
-rw-r--r--kernel/bpf/queue_stack_maps.c2
-rw-r--r--kernel/bpf/ringbuf.c259
-rw-r--r--kernel/bpf/stackmap.c4
-rw-r--r--kernel/bpf/syscall.c515
-rw-r--r--kernel/bpf/task_iter.c234
-rw-r--r--kernel/bpf/trampoline.c139
-rw-r--r--kernel/bpf/verifier.c3175
-rw-r--r--kernel/capability.c4
-rw-r--r--kernel/cfi.c352
-rw-r--r--kernel/cgroup/cgroup-internal.h4
-rw-r--r--kernel/cgroup/cgroup-v1.c6
-rw-r--r--kernel/cgroup/cgroup.c359
-rw-r--r--kernel/cgroup/cpuset.c870
-rw-r--r--kernel/cgroup/legacy_freezer.c23
-rw-r--r--kernel/cgroup/pids.c37
-rw-r--r--kernel/cgroup/rstat.c48
-rw-r--r--kernel/configs/rust.config1
-rw-r--r--kernel/configs/tiny.config5
-rw-r--r--kernel/cpu.c61
-rw-r--r--kernel/crash_core.c3
-rw-r--r--kernel/cred.c15
-rw-r--r--kernel/debug/debug_core.c12
-rw-r--r--kernel/debug/kdb/kdb_io.c18
-rw-r--r--kernel/delayacct.c13
-rw-r--r--kernel/dma/mapping.c22
-rw-r--r--kernel/dma/swiotlb.c93
-rw-r--r--kernel/entry/common.c5
-rw-r--r--kernel/events/Makefile1
-rw-r--r--kernel/events/core.c2389
-rw-r--r--kernel/events/hw_breakpoint.c648
-rw-r--r--kernel/events/hw_breakpoint_test.c333
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/events/uprobes.c39
-rw-r--r--kernel/exit.c93
-rw-r--r--kernel/fail_function.c26
-rw-r--r--kernel/fork.c188
-rw-r--r--kernel/freezer.c133
-rw-r--r--kernel/futex/core.c26
-rw-r--r--kernel/futex/waitwake.c8
-rw-r--r--kernel/gcov/clang.c2
-rw-r--r--kernel/gcov/gcc_4_7.c23
-rwxr-xr-xkernel/gen_kheaders.sh6
-rw-r--r--kernel/hung_task.c16
-rw-r--r--kernel/irq/Kconfig7
-rw-r--r--kernel/irq/chip.c8
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/irqdesc.c39
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/irq/msi.c915
-rw-r--r--kernel/jump_label.c58
-rw-r--r--kernel/kallsyms.c155
-rw-r--r--kernel/kallsyms_internal.h1
-rw-r--r--kernel/kallsyms_selftest.c485
-rw-r--r--kernel/kallsyms_selftest.h13
-rw-r--r--kernel/kcov.c7
-rw-r--r--kernel/kcsan/Makefile1
-rw-r--r--kernel/kcsan/core.c50
-rw-r--r--kernel/kcsan/report.c3
-rw-r--r--kernel/kcsan/selftest.c14
-rw-r--r--kernel/kexec.c11
-rw-r--r--kernel/kexec_core.c46
-rw-r--r--kernel/kexec_file.c6
-rw-r--r--kernel/kexec_internal.h15
-rw-r--r--kernel/kprobes.c43
-rw-r--r--kernel/ksysfs.c25
-rw-r--r--kernel/kthread.c4
-rw-r--r--kernel/latencytop.c4
-rw-r--r--kernel/livepatch/core.c55
-rw-r--r--kernel/livepatch/patch.c2
-rw-r--r--kernel/livepatch/transition.c72
-rw-r--r--kernel/locking/Makefile2
-rw-r--r--kernel/locking/lockdep.c6
-rw-r--r--kernel/locking/percpu-rwsem.c6
-rw-r--r--kernel/locking/qrwlock.c4
-rw-r--r--kernel/locking/qspinlock.c2
-rw-r--r--kernel/locking/qspinlock_paravirt.h4
-rw-r--r--kernel/locking/rwsem.c14
-rw-r--r--kernel/locking/semaphore.c12
-rw-r--r--kernel/locking/spinlock.c56
-rw-r--r--kernel/locking/test-ww_mutex.c4
-rw-r--r--kernel/module/Kconfig3
-rw-r--r--kernel/module/decompress.c102
-rw-r--r--kernel/module/internal.h4
-rw-r--r--kernel/module/kallsyms.c2
-rw-r--r--kernel/module/main.c83
-rw-r--r--kernel/module/sysfs.c2
-rw-r--r--kernel/module/tracking.c71
-rw-r--r--kernel/notifier.c6
-rw-r--r--kernel/nsproxy.c20
-rw-r--r--kernel/padata.c27
-rw-r--r--kernel/panic.c56
-rw-r--r--kernel/params.c25
-rw-r--r--kernel/pid.c1
-rw-r--r--kernel/power/hibernate.c37
-rw-r--r--kernel/power/main.c18
-rw-r--r--kernel/power/process.c35
-rw-r--r--kernel/power/snapshot.c12
-rw-r--r--kernel/power/suspend.c15
-rw-r--r--kernel/power/user.c24
-rw-r--r--kernel/printk/printk.c511
-rw-r--r--kernel/printk/printk_ringbuffer.c2
-rw-r--r--kernel/profile.c32
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rcu/Kconfig22
-rw-r--r--kernel/rcu/Kconfig.debug3
-rw-r--r--kernel/rcu/rcu.h16
-rw-r--r--kernel/rcu/rcuscale.c69
-rw-r--r--kernel/rcu/rcutorture.c358
-rw-r--r--kernel/rcu/srcutiny.c24
-rw-r--r--kernel/rcu/srcutree.c100
-rw-r--r--kernel/rcu/sync.c2
-rw-r--r--kernel/rcu/tasks.h9
-rw-r--r--kernel/rcu/tiny.c29
-rw-r--r--kernel/rcu/tree.c500
-rw-r--r--kernel/rcu/tree.h12
-rw-r--r--kernel/rcu/tree_exp.h59
-rw-r--r--kernel/rcu/tree_nocb.h269
-rw-r--r--kernel/rcu/tree_plugin.h31
-rw-r--r--kernel/rcu/tree_stall.h5
-rw-r--r--kernel/rcu/update.c18
-rw-r--r--kernel/reboot.c17
-rw-r--r--kernel/relay.c13
-rw-r--r--kernel/resource.c30
-rw-r--r--kernel/rseq.c19
-rw-r--r--kernel/sched/autogroup.c3
-rw-r--r--kernel/sched/completion.c12
-rw-r--r--kernel/sched/core.c489
-rw-r--r--kernel/sched/core_sched.c4
-rw-r--r--kernel/sched/cpudeadline.c2
-rw-r--r--kernel/sched/cpupri.c2
-rw-r--r--kernel/sched/deadline.c126
-rw-r--r--kernel/sched/debug.c1
-rw-r--r--kernel/sched/fair.c719
-rw-r--r--kernel/sched/psi.c380
-rw-r--r--kernel/sched/rt.c22
-rw-r--r--kernel/sched/sched.h190
-rw-r--r--kernel/sched/stats.h28
-rw-r--r--kernel/sched/stop_task.c11
-rw-r--r--kernel/sched/wait.c18
-rw-r--r--kernel/scs.c14
-rw-r--r--kernel/signal.c20
-rw-r--r--kernel/smp.c9
-rw-r--r--kernel/smpboot.c15
-rw-r--r--kernel/static_call_inline.c23
-rw-r--r--kernel/sys.c5
-rw-r--r--kernel/sysctl-test.c43
-rw-r--r--kernel/sysctl.c64
-rw-r--r--kernel/task_work.c16
-rw-r--r--kernel/taskstats.c1
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/hrtimer.c4
-rw-r--r--kernel/time/namespace.c18
-rw-r--r--kernel/time/timer.c427
-rw-r--r--kernel/trace/Kconfig21
-rw-r--r--kernel/trace/blktrace.c89
-rw-r--r--kernel/trace/bpf_trace.c329
-rw-r--r--kernel/trace/fprobe.c5
-rw-r--r--kernel/trace/ftrace.c110
-rw-r--r--kernel/trace/kprobe_event_gen_test.c113
-rw-r--r--kernel/trace/rethook.c4
-rw-r--r--kernel/trace/ring_buffer.c219
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/rv/monitors/wip/wip.c8
-rw-r--r--kernel/trace/rv/monitors/wip/wip.h2
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.c8
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.h2
-rw-r--r--kernel/trace/synth_event_gen_test.c16
-rw-r--r--kernel/trace/trace.c207
-rw-r--r--kernel/trace/trace.h45
-rw-r--r--kernel/trace/trace_benchmark.c2
-rw-r--r--kernel/trace/trace_benchmark.h8
-rw-r--r--kernel/trace/trace_dynevent.c2
-rw-r--r--kernel/trace/trace_eprobe.c171
-rw-r--r--kernel/trace/trace_event_perf.c16
-rw-r--r--kernel/trace/trace_events.c79
-rw-r--r--kernel/trace/trace_events_filter.c239
-rw-r--r--kernel/trace/trace_events_hist.c443
-rw-r--r--kernel/trace/trace_events_synth.c32
-rw-r--r--kernel/trace/trace_events_trigger.c19
-rw-r--r--kernel/trace/trace_events_user.c571
-rw-r--r--kernel/trace/trace_kprobe.c62
-rw-r--r--kernel/trace/trace_osnoise.c253
-rw-r--r--kernel/trace/trace_output.c71
-rw-r--r--kernel/trace/trace_probe.c67
-rw-r--r--kernel/trace/trace_probe.h22
-rw-r--r--kernel/trace/trace_probe_kernel.h115
-rw-r--r--kernel/trace/trace_probe_tmpl.h47
-rw-r--r--kernel/trace/trace_selftest.c9
-rw-r--r--kernel/trace/trace_syscalls.c2
-rw-r--r--kernel/trace/trace_uprobe.c3
-rw-r--r--kernel/trace/tracing_map.c5
-rw-r--r--kernel/tracepoint.c14
-rw-r--r--kernel/ucount.c34
-rw-r--r--kernel/umh.c18
-rw-r--r--kernel/user.c1
-rw-r--r--kernel/user_namespace.c15
-rw-r--r--kernel/utsname_sysctl.c10
-rw-r--r--kernel/workqueue.c10
231 files changed, 18622 insertions, 7122 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 318789c728d3..10ef068f598d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -38,11 +38,9 @@ KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
KCSAN_SANITIZE_kcov.o := n
UBSAN_SANITIZE_kcov.o := n
+KMSAN_SANITIZE_kcov.o := n
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
-# Don't instrument error handlers
-CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI)
-
obj-y += sched/
obj-y += locking/
obj-y += power/
@@ -68,6 +66,7 @@ endif
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 13706356ec54..010667ce6080 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -320,7 +320,7 @@ void acct_exit_ns(struct pid_namespace *ns)
}
/*
- * encode an unsigned long into a comp_t
+ * encode an u64 into a comp_t
*
* This routine has been adopted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
@@ -331,7 +331,7 @@ void acct_exit_ns(struct pid_namespace *ns)
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
-static comp_t encode_comp_t(unsigned long value)
+static comp_t encode_comp_t(u64 value)
{
int exp, rnd;
@@ -350,6 +350,8 @@ static comp_t encode_comp_t(unsigned long value)
exp++;
}
+ if (exp > (((comp_t) ~0U) >> MANTSIZE))
+ return (comp_t) ~0U;
/*
* Clean it up and polish it off.
*/
@@ -555,15 +557,14 @@ void acct_collect(long exitcode, int group_dead)
unsigned long vsize = 0;
if (group_dead && current->mm) {
+ struct mm_struct *mm = current->mm;
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- mmap_read_lock(current->mm);
- vma = current->mm->mmap;
- while (vma) {
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
- }
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}
spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/audit.c b/kernel/audit.c
index a75978ae38ad..9bc0b0301198 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -321,7 +321,6 @@ static inline int audit_rate_check(void)
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
- unsigned long elapsed;
int retval = 0;
if (!audit_rate_limit) return 1;
@@ -330,9 +329,8 @@ static inline int audit_rate_check(void)
if (++messages < audit_rate_limit) {
retval = 1;
} else {
- now = jiffies;
- elapsed = now - last_check;
- if (elapsed > HZ) {
+ now = jiffies;
+ if (time_after(now, last_check + HZ)) {
last_check = now;
messages = 0;
retval = 1;
@@ -366,7 +364,7 @@ void audit_log_lost(const char *message)
if (!print) {
spin_lock_irqsave(&lock, flags);
now = jiffies;
- if (now - last_msg > HZ) {
+ if (time_after(now, last_msg + HZ)) {
print = 1;
last_msg = now;
}
diff --git a/kernel/audit.h b/kernel/audit.h
index 58b66543b4d5..c57b008b9914 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -133,7 +133,7 @@ struct audit_context {
struct sockaddr_storage *sockaddr;
size_t sockaddr_len;
/* Save things to print about task_struct */
- pid_t pid, ppid;
+ pid_t ppid;
kuid_t uid, euid, suid, fsuid;
kgid_t gid, egid, sgid, fsgid;
unsigned long personality;
@@ -245,8 +245,6 @@ struct audit_netlink_list {
int audit_send_list_thread(void *_dest);
-extern int selinux_audit_rule_update(void);
-
extern struct mutex audit_filter_mutex;
extern int audit_del_rule(struct audit_entry *entry);
extern void audit_free_rule_rcu(struct rcu_head *head);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 4b0957aa2cd4..65075f1e4ac8 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -133,7 +133,7 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
}
/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct path *path)
+static struct audit_parent *audit_init_parent(const struct path *path)
{
struct inode *inode = d_backing_inode(path->dentry);
struct audit_parent *parent;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 79a5da1bc5bb..547c88be8a28 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -806,30 +806,53 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
}
/**
- * audit_filter_uring - apply filters to an io_uring operation
+ * __audit_filter_op - common filter helper for operations (syscall/uring/etc)
* @tsk: associated task
* @ctx: audit context
+ * @list: audit filter list
+ * @name: audit_name (can be NULL)
+ * @op: current syscall/uring_op
+ *
+ * Run the udit filters specified in @list against @tsk using @ctx,
+ * @name, and @op, as necessary; the caller is responsible for ensuring
+ * that the call is made while the RCU read lock is held. The @name
+ * parameter can be NULL, but all others must be specified.
+ * Returns 1/true if the filter finds a match, 0/false if none are found.
*/
-static void audit_filter_uring(struct task_struct *tsk,
- struct audit_context *ctx)
+static int __audit_filter_op(struct task_struct *tsk,
+ struct audit_context *ctx,
+ struct list_head *list,
+ struct audit_names *name,
+ unsigned long op)
{
struct audit_entry *e;
enum audit_state state;
+ list_for_each_entry_rcu(e, list, list) {
+ if (audit_in_mask(&e->rule, op) &&
+ audit_filter_rules(tsk, &e->rule, ctx, name,
+ &state, false)) {
+ ctx->current_state = state;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/**
+ * audit_filter_uring - apply filters to an io_uring operation
+ * @tsk: associated task
+ * @ctx: audit context
+ */
+static void audit_filter_uring(struct task_struct *tsk,
+ struct audit_context *ctx)
+{
if (auditd_test_task(tsk))
return;
rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
- list) {
- if (audit_in_mask(&e->rule, ctx->uring_op) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL, &state,
- false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return;
- }
- }
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
+ NULL, ctx->uring_op);
rcu_read_unlock();
}
@@ -841,24 +864,13 @@ static void audit_filter_uring(struct task_struct *tsk,
static void audit_filter_syscall(struct task_struct *tsk,
struct audit_context *ctx)
{
- struct audit_entry *e;
- enum audit_state state;
-
if (auditd_test_task(tsk))
return;
rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state, false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return;
- }
- }
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_EXIT],
+ NULL, ctx->major);
rcu_read_unlock();
- return;
}
/*
@@ -870,17 +882,8 @@ static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_context *ctx) {
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
- struct audit_entry *e;
- enum audit_state state;
- list_for_each_entry_rcu(e, list, list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
- ctx->current_state = state;
- return 1;
- }
- }
- return 0;
+ return __audit_filter_op(tsk, ctx, list, n, ctx->major);
}
/* At syscall exit time, this filter is called if any audit_names have been
@@ -965,7 +968,7 @@ static void audit_reset_context(struct audit_context *ctx)
if (!ctx)
return;
- /* if ctx is non-null, reset the "ctx->state" regardless */
+ /* if ctx is non-null, reset the "ctx->context" regardless */
ctx->context = AUDIT_CTX_UNUSED;
if (ctx->dummy)
return;
@@ -1002,7 +1005,7 @@ static void audit_reset_context(struct audit_context *ctx)
kfree(ctx->sockaddr);
ctx->sockaddr = NULL;
ctx->sockaddr_len = 0;
- ctx->pid = ctx->ppid = 0;
+ ctx->ppid = 0;
ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0);
ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0);
ctx->personality = 0;
@@ -1016,7 +1019,6 @@ static void audit_reset_context(struct audit_context *ctx)
WARN_ON(!list_empty(&ctx->killed_trees));
audit_free_module(ctx);
ctx->fds[0] = -1;
- audit_proctitle_free(ctx);
ctx->type = 0; /* reset last for audit_free_*() */
}
@@ -1077,6 +1079,7 @@ static inline void audit_free_context(struct audit_context *context)
{
/* resetting is extra work, but it is likely just noise */
audit_reset_context(context);
+ audit_proctitle_free(context);
free_tree_refs(context);
kfree(context->filterkey);
kfree(context);
@@ -1833,7 +1836,7 @@ void __audit_free(struct task_struct *tsk)
/* We are called either by do_exit() or the fork() error handling code;
* in the former case tsk == current and in the latter tsk is a
- * random task_struct that doesn't doesn't have any meaningful data we
+ * random task_struct that doesn't have any meaningful data we
* need to log via audit_log_exit().
*/
if (tsk == current && !context->dummy) {
@@ -2069,7 +2072,7 @@ void __audit_syscall_exit(int success, long return_code)
/* run through both filters to ensure we set the filterkey properly */
audit_filter_syscall(current, context);
audit_filter_inodes(current, context);
- if (context->current_state < AUDIT_STATE_RECORD)
+ if (context->current_state != AUDIT_STATE_RECORD)
goto out;
audit_log_exit();
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9795d75b09b2..b529182e8b04 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,13 @@ int main(void)
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
+ DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
+#else
+ DEFINE(LRU_GEN_WIDTH, 0);
+ DEFINE(__LRU_REFS_WIDTH, 0);
+#endif
/* End of constants */
return 0;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 057ba8e01e70..3a12e6b400a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
-obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
@@ -24,6 +24,9 @@ endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
+ifeq ($(CONFIG_CGROUPS),y)
+obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o bpf_cgrp_storage.o
+endif
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 624527401d4d..484706959556 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -279,7 +279,8 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value + off);
off += size;
}
rcu_read_unlock();
@@ -305,14 +306,6 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
return 0;
}
-static void check_and_free_fields(struct bpf_array *arr, void *val)
-{
- if (map_value_has_timer(&arr->map))
- bpf_timer_cancel_and_free(val + arr->map.timer_off);
- if (map_value_has_kptrs(&arr->map))
- bpf_map_free_kptrs(&arr->map, val);
-}
-
/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
@@ -334,12 +327,13 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EEXIST;
if (unlikely((map_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
- value, map->value_size);
+ val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
+ copy_map_value(map, val, value);
+ bpf_obj_free_fields(array->map.record, val);
} else {
val = array->value +
(u64)array->elem_size * (index & array->index_mask);
@@ -347,7 +341,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
- check_and_free_fields(array, val);
+ bpf_obj_free_fields(array->map.record, val);
}
return 0;
}
@@ -383,7 +377,8 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+ copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
+ bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
off += size;
}
rcu_read_unlock();
@@ -406,12 +401,12 @@ static void array_map_free_timers(struct bpf_map *map)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- /* We don't reset or free kptr on uref dropping to zero. */
- if (!map_value_has_timer(map))
+ /* We don't reset or free fields other than timer on uref dropping to zero. */
+ if (!btf_record_has_field(map->record, BPF_TIMER))
return;
for (i = 0; i < array->map.max_entries; i++)
- bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off);
+ bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -420,10 +415,21 @@ static void array_map_free(struct bpf_map *map)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- if (map_value_has_kptrs(map)) {
- for (i = 0; i < array->map.max_entries; i++)
- bpf_map_free_kptrs(map, array_map_elem_ptr(array, i));
- bpf_map_free_kptr_off_tab(map);
+ if (!IS_ERR_OR_NULL(map->record)) {
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ for (i = 0; i < array->map.max_entries; i++) {
+ void __percpu *pptr = array->pptrs[i & array->index_mask];
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu));
+ cond_resched();
+ }
+ }
+ } else {
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i));
+ }
}
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
@@ -608,9 +614,9 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
pptr = v;
size = array->elem_size;
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(info->percpu_value_buf + off,
- per_cpu_ptr(pptr, cpu),
- size);
+ copy_map_value_long(map, info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, info->percpu_value_buf + off);
off += size;
}
ctx.value = info->percpu_value_buf;
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index b9ea539a5561..48ee750849f2 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -158,7 +158,7 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
attr->value_size / sizeof(u32);
if (!(attr->map_flags & BPF_F_ZERO_SEED))
- bloom->hash_seed = get_random_int();
+ bloom->hash_seed = get_random_u32();
return &bloom->map;
}
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
new file mode 100644
index 000000000000..6cdf6d9ed91d
--- /dev/null
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+
+DEFINE_BPF_STORAGE_CACHE(cgroup_cache);
+
+static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
+
+static void bpf_cgrp_storage_lock(void)
+{
+ migrate_disable();
+ this_cpu_inc(bpf_cgrp_storage_busy);
+}
+
+static void bpf_cgrp_storage_unlock(void)
+{
+ this_cpu_dec(bpf_cgrp_storage_busy);
+ migrate_enable();
+}
+
+static bool bpf_cgrp_storage_trylock(void)
+{
+ migrate_disable();
+ if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
+ this_cpu_dec(bpf_cgrp_storage_busy);
+ migrate_enable();
+ return false;
+ }
+ return true;
+}
+
+static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
+{
+ struct cgroup *cg = owner;
+
+ return &cg->bpf_cgrp_storage;
+}
+
+void bpf_cgrp_storage_free(struct cgroup *cgroup)
+{
+ struct bpf_local_storage *local_storage;
+ bool free_cgroup_storage = false;
+ unsigned long flags;
+
+ rcu_read_lock();
+ local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
+ if (!local_storage) {
+ rcu_read_unlock();
+ return;
+ }
+
+ bpf_cgrp_storage_lock();
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_cgrp_storage_unlock();
+ rcu_read_unlock();
+
+ if (free_cgroup_storage)
+ kfree_rcu(local_storage, rcu);
+}
+
+static struct bpf_local_storage_data *
+cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit)
+{
+ struct bpf_local_storage *cgroup_storage;
+ struct bpf_local_storage_map *smap;
+
+ cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage,
+ bpf_rcu_lock_held());
+ if (!cgroup_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit);
+}
+
+static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return ERR_CAST(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = cgroup_storage_lookup(cgroup, map, true);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return sdata ? sdata->data : NULL;
+}
+
+static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, map_flags, GFP_ATOMIC);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return PTR_ERR_OR_ZERO(sdata);
+}
+
+static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = cgroup_storage_lookup(cgroup, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata), true);
+ return 0;
+}
+
+static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct cgroup *cgroup;
+ int err, fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ err = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return err;
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
+{
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache);
+}
+
+static void cgroup_storage_map_free(struct bpf_map *map)
+{
+ bpf_local_storage_map_free(map, &cgroup_cache, NULL);
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ struct bpf_local_storage_data *sdata;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
+ return (unsigned long)NULL;
+
+ if (!cgroup)
+ return (unsigned long)NULL;
+
+ if (!bpf_cgrp_storage_trylock())
+ return (unsigned long)NULL;
+
+ sdata = cgroup_storage_lookup(cgroup, map, true);
+ if (sdata)
+ goto unlock;
+
+ /* only allocate new storage, when the cgroup is refcounted */
+ if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, BPF_NOEXIST, gfp_flags);
+
+unlock:
+ bpf_cgrp_storage_unlock();
+ return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
+}
+
+BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!cgroup)
+ return -EINVAL;
+
+ if (!bpf_cgrp_storage_trylock())
+ return -EBUSY;
+
+ ret = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ return ret;
+}
+
+const struct bpf_map_ops cgrp_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = cgroup_storage_map_alloc,
+ .map_free = cgroup_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_cgrp_storage_lookup_elem,
+ .map_update_elem = bpf_cgrp_storage_update_elem,
+ .map_delete_elem = bpf_cgrp_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
+ .map_owner_storage_ptr = cgroup_storage_ptr,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_get_proto = {
+ .func = bpf_cgrp_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_delete_proto = {
+ .func = bpf_cgrp_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+};
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 5f7683b19199..05f4c66c9089 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -56,11 +56,9 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
void bpf_inode_storage_free(struct inode *inode)
{
- struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
bool free_inode_storage = false;
struct bpf_storage_blob *bsb;
- struct hlist_node *n;
bsb = bpf_inode(inode);
if (!bsb)
@@ -74,30 +72,11 @@ void bpf_inode_storage_free(struct inode *inode)
return;
}
- /* Neither the bpf_prog nor the bpf-map's syscall
- * could be modifying the local_storage->list now.
- * Thus, no elem can be added-to or deleted-from the
- * local_storage->list by the bpf_prog or by the bpf-map's syscall.
- *
- * It is racing with bpf_local_storage_map_free() alone
- * when unlinking elem from the local_storage->list and
- * the map's bucket->list.
- */
raw_spin_lock_bh(&local_storage->lock);
- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * local_storage.
- */
- bpf_selem_unlink_map(selem);
- free_inode_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false, false);
- }
+ free_inode_storage = bpf_local_storage_unlink_nolock(local_storage);
raw_spin_unlock_bh(&local_storage->lock);
rcu_read_unlock();
- /* free_inoode_storage should always be true as long as
- * local_storage->list was non-empty.
- */
if (free_inode_storage)
kfree_rcu(local_storage, rcu);
}
@@ -226,27 +205,14 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_local_storage_map *smap;
-
- smap = bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
-
- smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache);
- return &smap->map;
+ return bpf_local_storage_map_alloc(attr, &inode_cache);
}
static void inode_storage_map_free(struct bpf_map *map)
{
- struct bpf_local_storage_map *smap;
-
- smap = (struct bpf_local_storage_map *)map;
- bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap, NULL);
+ bpf_local_storage_map_free(map, &inode_cache, NULL);
}
-BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct,
- bpf_local_storage_map)
const struct bpf_map_ops inode_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -257,7 +223,7 @@ const struct bpf_map_ops inode_storage_map_ops = {
.map_update_elem = bpf_fd_inode_storage_update_elem,
.map_delete_elem = bpf_fd_inode_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
- .map_btf_id = &inode_storage_map_btf_ids[0],
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = inode_storage_ptr,
};
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 24b755eca0b3..5dc307bdeaeb 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -202,6 +202,11 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
}
stop:
offs = seq->count;
+ if (IS_ERR(p)) {
+ seq->op->stop(seq, NULL);
+ err = PTR_ERR(p);
+ goto done;
+ }
/* bpf program called if !p */
seq->op->stop(seq, p);
if (!p) {
@@ -689,19 +694,24 @@ struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
+ struct bpf_run_ctx run_ctx, *old_run_ctx;
int ret;
if (prog->aux->sleepable) {
rcu_read_lock_trace();
migrate_disable();
might_fault();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
migrate_enable();
rcu_read_unlock_trace();
} else {
rcu_read_lock();
migrate_disable();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
migrate_enable();
rcu_read_unlock();
}
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 8ce40fd869f6..b39a46e8fb08 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -74,7 +74,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
gfp_flags | __GFP_NOWARN);
if (selem) {
if (value)
- memcpy(SDATA(selem)->data, value, smap->map.value_size);
+ copy_map_value(&smap->map, SDATA(selem)->data, value);
return selem;
}
@@ -88,8 +88,14 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
+ /* If RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree(), else do kfree_rcu().
+ */
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
- kfree_rcu(local_storage, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(local_storage);
+ else
+ kfree_rcu(local_storage, rcu);
}
static void bpf_selem_free_rcu(struct rcu_head *rcu)
@@ -97,16 +103,19 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
struct bpf_local_storage_elem *selem;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
- kfree_rcu(selem, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(selem);
+ else
+ kfree_rcu(selem, rcu);
}
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
*/
-bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_elem *selem,
- bool uncharge_mem, bool use_trace_rcu)
+static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem,
+ bool uncharge_mem, bool use_trace_rcu)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -233,6 +242,7 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
__bpf_selem_unlink_storage(selem, use_trace_rcu);
}
+/* If cacheit_lockit is false, this lookup function is lockless */
struct bpf_local_storage_data *
bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
struct bpf_local_storage_map *smap,
@@ -372,7 +382,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
/* BPF_F_LOCK can only be used in a value with spin_lock */
unlikely((map_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(&smap->map)))
+ !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
return ERR_PTR(-EINVAL);
if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
@@ -491,7 +501,7 @@ unlock_err:
return ERR_PTR(err);
}
-u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
+static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
{
u64 min_usage = U64_MAX;
u16 i, res = 0;
@@ -515,76 +525,14 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
return res;
}
-void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
- u16 idx)
+static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+ u16 idx)
{
spin_lock(&cache->idx_lock);
cache->idx_usage_counts[idx]--;
spin_unlock(&cache->idx_lock);
}
-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
- int __percpu *busy_counter)
-{
- struct bpf_local_storage_elem *selem;
- struct bpf_local_storage_map_bucket *b;
- unsigned int i;
-
- /* Note that this map might be concurrently cloned from
- * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
- * RCU read section to finish before proceeding. New RCU
- * read sections should be prevented via bpf_map_inc_not_zero.
- */
- synchronize_rcu();
-
- /* bpf prog and the userspace can no longer access this map
- * now. No new selem (of this map) can be added
- * to the owner->storage or to the map bucket's list.
- *
- * The elem of this map can be cleaned up here
- * or when the storage is freed e.g.
- * by bpf_sk_storage_free() during __sk_destruct().
- */
- for (i = 0; i < (1U << smap->bucket_log); i++) {
- b = &smap->buckets[i];
-
- rcu_read_lock();
- /* No one is adding to b->list now */
- while ((selem = hlist_entry_safe(
- rcu_dereference_raw(hlist_first_rcu(&b->list)),
- struct bpf_local_storage_elem, map_node))) {
- if (busy_counter) {
- migrate_disable();
- __this_cpu_inc(*busy_counter);
- }
- bpf_selem_unlink(selem, false);
- if (busy_counter) {
- __this_cpu_dec(*busy_counter);
- migrate_enable();
- }
- cond_resched_rcu();
- }
- rcu_read_unlock();
- }
-
- /* While freeing the storage we may still need to access the map.
- *
- * e.g. when bpf_sk_storage_free() has unlinked selem from the map
- * which then made the above while((selem = ...)) loop
- * exit immediately.
- *
- * However, while freeing the storage one still needs to access the
- * smap->elem_size to do the uncharging in
- * bpf_selem_unlink_storage_nolock().
- *
- * Hence, wait another rcu grace period for the storage to be freed.
- */
- synchronize_rcu();
-
- kvfree(smap->buckets);
- kfree(smap);
-}
-
int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
{
if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
@@ -604,13 +552,13 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
return 0;
}
-struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
+static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr)
{
struct bpf_local_storage_map *smap;
unsigned int i;
u32 nbuckets;
- smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
+ smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
if (!smap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&smap->map, attr);
@@ -623,7 +571,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
if (!smap->buckets) {
- kfree(smap);
+ bpf_map_area_free(smap);
return ERR_PTR(-ENOMEM);
}
@@ -654,3 +602,117 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
return 0;
}
+
+bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
+{
+ struct bpf_local_storage_elem *selem;
+ bool free_storage = false;
+ struct hlist_node *n;
+
+ /* Neither the bpf_prog nor the bpf_map's syscall
+ * could be modifying the local_storage->list now.
+ * Thus, no elem can be added to or deleted from the
+ * local_storage->list by the bpf_prog or by the bpf_map's syscall.
+ *
+ * It is racing with bpf_local_storage_map_free() alone
+ * when unlinking elem from the local_storage->list and
+ * the map's bucket->list.
+ */
+ hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
+ /* Always unlink from map before unlinking from
+ * local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ /* If local_storage list has only one element, the
+ * bpf_selem_unlink_storage_nolock() will return true.
+ * Otherwise, it will return false. The current loop iteration
+ * intends to remove all local storage. So the last iteration
+ * of the loop will set the free_cgroup_storage to true.
+ */
+ free_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, false, false);
+ }
+
+ return free_storage;
+}
+
+struct bpf_map *
+bpf_local_storage_map_alloc(union bpf_attr *attr,
+ struct bpf_local_storage_cache *cache)
+{
+ struct bpf_local_storage_map *smap;
+
+ smap = __bpf_local_storage_map_alloc(attr);
+ if (IS_ERR(smap))
+ return ERR_CAST(smap);
+
+ smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
+ return &smap->map;
+}
+
+void bpf_local_storage_map_free(struct bpf_map *map,
+ struct bpf_local_storage_cache *cache,
+ int __percpu *busy_counter)
+{
+ struct bpf_local_storage_map_bucket *b;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
+ unsigned int i;
+
+ smap = (struct bpf_local_storage_map *)map;
+ bpf_local_storage_cache_idx_free(cache, smap->cache_idx);
+
+ /* Note that this map might be concurrently cloned from
+ * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
+ * RCU read section to finish before proceeding. New RCU
+ * read sections should be prevented via bpf_map_inc_not_zero.
+ */
+ synchronize_rcu();
+
+ /* bpf prog and the userspace can no longer access this map
+ * now. No new selem (of this map) can be added
+ * to the owner->storage or to the map bucket's list.
+ *
+ * The elem of this map can be cleaned up here
+ * or when the storage is freed e.g.
+ * by bpf_sk_storage_free() during __sk_destruct().
+ */
+ for (i = 0; i < (1U << smap->bucket_log); i++) {
+ b = &smap->buckets[i];
+
+ rcu_read_lock();
+ /* No one is adding to b->list now */
+ while ((selem = hlist_entry_safe(
+ rcu_dereference_raw(hlist_first_rcu(&b->list)),
+ struct bpf_local_storage_elem, map_node))) {
+ if (busy_counter) {
+ migrate_disable();
+ this_cpu_inc(*busy_counter);
+ }
+ bpf_selem_unlink(selem, false);
+ if (busy_counter) {
+ this_cpu_dec(*busy_counter);
+ migrate_enable();
+ }
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+ }
+
+ /* While freeing the storage we may still need to access the map.
+ *
+ * e.g. when bpf_sk_storage_free() has unlinked selem from the map
+ * which then made the above while((selem = ...)) loop
+ * exit immediately.
+ *
+ * However, while freeing the storage one still needs to access the
+ * smap->elem_size to do the uncharging in
+ * bpf_selem_unlink_storage_nolock().
+ *
+ * Hence, wait another rcu grace period for the storage to be freed.
+ */
+ synchronize_rcu();
+
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
+}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index fa71d58b7ded..9ea42a45da47 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -41,17 +41,21 @@ BTF_SET_END(bpf_lsm_hooks)
*/
BTF_SET_START(bpf_lsm_current_hooks)
/* operate on freshly allocated sk without any cgroup association */
+#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_sk_alloc_security)
BTF_ID(func, bpf_lsm_sk_free_security)
+#endif
BTF_SET_END(bpf_lsm_current_hooks)
/* List of LSM hooks that trigger while the socket is properly locked.
*/
BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_socket_sock_rcv_skb)
BTF_ID(func, bpf_lsm_sock_graft)
BTF_ID(func, bpf_lsm_inet_csk_clone)
BTF_ID(func, bpf_lsm_inet_conn_established)
+#endif
BTF_SET_END(bpf_lsm_locked_sockopt_hooks)
/* List of LSM hooks that trigger while the socket is _not_ locked,
@@ -59,8 +63,10 @@ BTF_SET_END(bpf_lsm_locked_sockopt_hooks)
* in the early init phase.
*/
BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_socket_post_create)
BTF_ID(func, bpf_lsm_socket_socketpair)
+#endif
BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks)
#ifdef CONFIG_CGROUP_BPF
@@ -145,6 +151,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)
static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
.func = bpf_ima_inode_hash,
.gpl_only = false,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0],
@@ -163,6 +170,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file)
static const struct bpf_func_proto bpf_ima_file_hash_proto = {
.func = bpf_ima_file_hash,
.gpl_only = false,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_ima_file_hash_btf_ids[0],
@@ -189,6 +197,14 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto = {
static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ if (prog->expected_attach_type == BPF_LSM_CGROUP) {
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+ }
+
switch (func_id) {
case BPF_FUNC_inode_storage_get:
return &bpf_inode_storage_get_proto;
@@ -207,20 +223,11 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_bprm_opts_set:
return &bpf_bprm_opts_set_proto;
case BPF_FUNC_ima_inode_hash:
- return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
+ return &bpf_ima_inode_hash_proto;
case BPF_FUNC_ima_file_hash:
- return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
+ return &bpf_ima_file_hash_proto;
case BPF_FUNC_get_attach_cookie:
return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
- case BPF_FUNC_get_local_storage:
- return prog->expected_attach_type == BPF_LSM_CGROUP ?
- &bpf_get_local_storage_proto : NULL;
- case BPF_FUNC_set_retval:
- return prog->expected_attach_type == BPF_LSM_CGROUP ?
- &bpf_set_retval_proto : NULL;
- case BPF_FUNC_get_retval:
- return prog->expected_attach_type == BPF_LSM_CGROUP ?
- &bpf_get_retval_proto : NULL;
#ifdef CONFIG_NET
case BPF_FUNC_setsockopt:
if (prog->expected_attach_type != BPF_LSM_CGROUP)
@@ -335,13 +342,30 @@ BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
+BTF_ID(func, bpf_lsm_userns_create)
BTF_SET_END(sleepable_lsm_hooks)
+BTF_SET_START(untrusted_lsm_hooks)
+BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
+BTF_ID(func, bpf_lsm_bpf_prog_free_security)
+BTF_ID(func, bpf_lsm_file_alloc_security)
+BTF_ID(func, bpf_lsm_file_free_security)
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+BTF_ID(func, bpf_lsm_task_free)
+BTF_SET_END(untrusted_lsm_hooks)
+
bool bpf_lsm_is_sleepable_hook(u32 btf_id)
{
return btf_id_set_contains(&sleepable_lsm_hooks, btf_id);
}
+bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
+{
+ return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id);
+}
+
const struct bpf_prog_ops lsm_prog_ops = {
};
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 84b2d9dba79a..ece9870cab68 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -494,8 +494,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
refcount_set(&kvalue->refcnt, 1);
bpf_map_inc(map);
- set_memory_ro((long)st_map->image, 1);
- set_memory_x((long)st_map->image, 1);
+ set_memory_rox((long)st_map->image, 1);
err = st_ops->reg(kdata);
if (likely(!err)) {
/* Pair with smp_load_acquire() during lookup_elem().
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index e9014dc62682..1e486055a523 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -26,20 +26,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy);
static void bpf_task_storage_lock(void)
{
migrate_disable();
- __this_cpu_inc(bpf_task_storage_busy);
+ this_cpu_inc(bpf_task_storage_busy);
}
static void bpf_task_storage_unlock(void)
{
- __this_cpu_dec(bpf_task_storage_busy);
+ this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
}
static bool bpf_task_storage_trylock(void)
{
migrate_disable();
- if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
- __this_cpu_dec(bpf_task_storage_busy);
+ if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+ this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
return false;
}
@@ -71,10 +71,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
void bpf_task_storage_free(struct task_struct *task)
{
- struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
bool free_task_storage = false;
- struct hlist_node *n;
unsigned long flags;
rcu_read_lock();
@@ -85,32 +83,13 @@ void bpf_task_storage_free(struct task_struct *task)
return;
}
- /* Neither the bpf_prog nor the bpf-map's syscall
- * could be modifying the local_storage->list now.
- * Thus, no elem can be added-to or deleted-from the
- * local_storage->list by the bpf_prog or by the bpf-map's syscall.
- *
- * It is racing with bpf_local_storage_map_free() alone
- * when unlinking elem from the local_storage->list and
- * the map's bucket->list.
- */
bpf_task_storage_lock();
raw_spin_lock_irqsave(&local_storage->lock, flags);
- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * local_storage.
- */
- bpf_selem_unlink_map(selem);
- free_task_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false, false);
- }
+ free_task_storage = bpf_local_storage_unlink_nolock(local_storage);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_task_storage_unlock();
rcu_read_unlock();
- /* free_task_storage should always be true as long as
- * local_storage->list was non-empty.
- */
if (free_task_storage)
kfree_rcu(local_storage, rcu);
}
@@ -184,7 +163,8 @@ out:
return err;
}
-static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
+static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
+ bool nobusy)
{
struct bpf_local_storage_data *sdata;
@@ -192,6 +172,9 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
if (!sdata)
return -ENOENT;
+ if (!nobusy)
+ return -EBUSY;
+
bpf_selem_unlink(SELEM(sdata), true);
return 0;
@@ -220,63 +203,108 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
}
bpf_task_storage_lock();
- err = task_storage_delete(task, map);
+ err = task_storage_delete(task, map, true);
bpf_task_storage_unlock();
out:
put_pid(pid);
return err;
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags, gfp_t, gfp_flags)
+/* Called by bpf_task_storage_get*() helpers */
+static void *__bpf_task_storage_get(struct bpf_map *map,
+ struct task_struct *task, void *value,
+ u64 flags, gfp_t gfp_flags, bool nobusy)
{
struct bpf_local_storage_data *sdata;
- WARN_ON_ONCE(!bpf_rcu_lock_held());
- if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
- return (unsigned long)NULL;
-
- if (!task)
- return (unsigned long)NULL;
-
- if (!bpf_task_storage_trylock())
- return (unsigned long)NULL;
-
- sdata = task_storage_lookup(task, map, true);
+ sdata = task_storage_lookup(task, map, nobusy);
if (sdata)
- goto unlock;
+ return sdata->data;
/* only allocate new storage, when the task is refcounted */
if (refcount_read(&task->usage) &&
- (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
BPF_NOEXIST, gfp_flags);
+ return IS_ERR(sdata) ? NULL : sdata->data;
+ }
+
+ return NULL;
+}
-unlock:
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ bool nobusy;
+ void *data;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ nobusy = bpf_task_storage_trylock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return (unsigned long)data;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ void *data;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ bpf_task_storage_lock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, true);
bpf_task_storage_unlock();
- return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL :
- (unsigned long)sdata->data;
+ return (unsigned long)data;
}
-BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
+BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *,
task)
{
+ bool nobusy;
int ret;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!task)
return -EINVAL;
- if (!bpf_task_storage_trylock())
- return -EBUSY;
+ nobusy = bpf_task_storage_trylock();
+ /* This helper must only be called from places where the lifetime of the task
+ * is guaranteed. Either by being refcounted or by being protected
+ * by an RCU read-side critical section.
+ */
+ ret = task_storage_delete(task, map, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return ret;
+}
+
+BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
+ task)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!task)
+ return -EINVAL;
+ bpf_task_storage_lock();
/* This helper must only be called from places where the lifetime of the task
* is guaranteed. Either by being refcounted or by being protected
* by an RCU read-side critical section.
*/
- ret = task_storage_delete(task, map);
+ ret = task_storage_delete(task, map, true);
bpf_task_storage_unlock();
return ret;
}
@@ -288,26 +316,15 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_local_storage_map *smap;
-
- smap = bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
-
- smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache);
- return &smap->map;
+ return bpf_local_storage_map_alloc(attr, &task_cache);
}
static void task_storage_map_free(struct bpf_map *map)
{
- struct bpf_local_storage_map *smap;
-
- smap = (struct bpf_local_storage_map *)map;
- bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
+ bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy);
}
-BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map)
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
const struct bpf_map_ops task_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -318,10 +335,21 @@ const struct bpf_map_ops task_storage_map_ops = {
.map_update_elem = bpf_pid_task_storage_update_elem,
.map_delete_elem = bpf_pid_task_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
- .map_btf_id = &task_storage_map_btf_ids[0],
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = task_storage_ptr,
};
+const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
+ .func = bpf_task_storage_get_recur,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
const struct bpf_func_proto bpf_task_storage_get_proto = {
.func = bpf_task_storage_get,
.gpl_only = false,
@@ -333,6 +361,15 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
.arg4_type = ARG_ANYTHING,
};
+const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
+ .func = bpf_task_storage_delete_recur,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+};
+
const struct bpf_func_proto bpf_task_storage_delete_proto = {
.func = bpf_task_storage_delete,
.gpl_only = false,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7e64447659f3..f7dd8af06413 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -19,6 +19,7 @@
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
+#include <linux/bpf_lsm.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
@@ -199,16 +200,18 @@ DEFINE_IDR(btf_idr);
DEFINE_SPINLOCK(btf_idr_lock);
enum btf_kfunc_hook {
+ BTF_KFUNC_HOOK_COMMON,
BTF_KFUNC_HOOK_XDP,
BTF_KFUNC_HOOK_TC,
BTF_KFUNC_HOOK_STRUCT_OPS,
BTF_KFUNC_HOOK_TRACING,
BTF_KFUNC_HOOK_SYSCALL,
+ BTF_KFUNC_HOOK_FMODRET,
BTF_KFUNC_HOOK_MAX,
};
enum {
- BTF_KFUNC_SET_MAX_CNT = 32,
+ BTF_KFUNC_SET_MAX_CNT = 256,
BTF_DTOR_KFUNC_MAX_CNT = 256,
};
@@ -237,6 +240,7 @@ struct btf {
struct rcu_head rcu;
struct btf_kfunc_set_tab *kfunc_set_tab;
struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
+ struct btf_struct_metas *struct_meta_tab;
/* split BTF support */
struct btf *base_btf;
@@ -477,16 +481,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t)
return !t || btf_type_nosize(t);
}
-static bool __btf_type_is_struct(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
-}
-
-static bool btf_type_is_array(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
-}
-
static bool btf_type_is_datasec(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
@@ -818,6 +812,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
return NULL;
return btf->types[type_id];
}
+EXPORT_SYMBOL_GPL(btf_type_by_id);
/*
* Regular int is not a bit field and it must be either
@@ -1396,7 +1391,6 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
const char *fmt, ...)
{
struct bpf_verifier_log *log = &env->log;
- u8 kind = BTF_INFO_KIND(t->info);
struct btf *btf = env->btf;
va_list args;
@@ -1412,7 +1406,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
__btf_verifier_log(log, "[%u] %s %s%s",
env->log_type_id,
- btf_kind_str[kind],
+ btf_type_str(t),
__btf_name_by_offset(btf, t->name_off),
log_details ? " " : "");
@@ -1642,8 +1636,30 @@ static void btf_free_dtor_kfunc_tab(struct btf *btf)
btf->dtor_kfunc_tab = NULL;
}
+static void btf_struct_metas_free(struct btf_struct_metas *tab)
+{
+ int i;
+
+ if (!tab)
+ return;
+ for (i = 0; i < tab->cnt; i++) {
+ btf_record_free(tab->types[i].record);
+ kfree(tab->types[i].field_offs);
+ }
+ kfree(tab);
+}
+
+static void btf_free_struct_meta_tab(struct btf *btf)
+{
+ struct btf_struct_metas *tab = btf->struct_meta_tab;
+
+ btf_struct_metas_free(tab);
+ btf->struct_meta_tab = NULL;
+}
+
static void btf_free(struct btf *btf)
{
+ btf_free_struct_meta_tab(btf);
btf_free_dtor_kfunc_tab(btf);
btf_free_kfunc_set_tab(btf);
kvfree(btf->types);
@@ -3128,7 +3144,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
if (v->next_member) {
const struct btf_type *last_member_type;
const struct btf_member *last_member;
- u16 last_member_type_id;
+ u32 last_member_type_id;
last_member = btf_type_member(v->t) + v->next_member - 1;
last_member_type_id = last_member->type;
@@ -3191,7 +3207,7 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-enum btf_field_type {
+enum btf_field_info_type {
BTF_FIELD_SPIN_LOCK,
BTF_FIELD_TIMER,
BTF_FIELD_KPTR,
@@ -3203,18 +3219,28 @@ enum {
};
struct btf_field_info {
- u32 type_id;
+ enum btf_field_type type;
u32 off;
- enum bpf_kptr_type type;
+ union {
+ struct {
+ u32 type_id;
+ } kptr;
+ struct {
+ const char *node_name;
+ u32 value_btf_id;
+ } list_head;
+ };
};
static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
- u32 off, int sz, struct btf_field_info *info)
+ u32 off, int sz, enum btf_field_type field_type,
+ struct btf_field_info *info)
{
if (!__btf_type_is_struct(t))
return BTF_FIELD_IGNORE;
if (t->size != sz)
return BTF_FIELD_IGNORE;
+ info->type = field_type;
info->off = off;
return BTF_FIELD_FOUND;
}
@@ -3222,9 +3248,12 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
u32 off, int sz, struct btf_field_info *info)
{
- enum bpf_kptr_type type;
+ enum btf_field_type type;
u32 res_id;
+ /* Permit modifiers on the pointer itself */
+ if (btf_type_is_volatile(t))
+ t = btf_type_by_id(btf, t->type);
/* For PTR, sz is always == 8 */
if (!btf_type_is_ptr(t))
return BTF_FIELD_IGNORE;
@@ -3248,28 +3277,135 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
if (!__btf_type_is_struct(t))
return -EINVAL;
- info->type_id = res_id;
- info->off = off;
info->type = type;
+ info->off = off;
+ info->kptr.type_id = res_id;
+ return BTF_FIELD_FOUND;
+}
+
+static const char *btf_find_decl_tag_value(const struct btf *btf,
+ const struct btf_type *pt,
+ int comp_idx, const char *tag_key)
+{
+ int i;
+
+ for (i = 1; i < btf_nr_types(btf); i++) {
+ const struct btf_type *t = btf_type_by_id(btf, i);
+ int len = strlen(tag_key);
+
+ if (!btf_type_is_decl_tag(t))
+ continue;
+ if (pt != btf_type_by_id(btf, t->type) ||
+ btf_type_decl_tag(t)->component_idx != comp_idx)
+ continue;
+ if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
+ continue;
+ return __btf_name_by_offset(btf, t->name_off) + len;
+ }
+ return NULL;
+}
+
+static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt,
+ const struct btf_type *t, int comp_idx,
+ u32 off, int sz, struct btf_field_info *info)
+{
+ const char *value_type;
+ const char *list_node;
+ s32 id;
+
+ if (!__btf_type_is_struct(t))
+ return BTF_FIELD_IGNORE;
+ if (t->size != sz)
+ return BTF_FIELD_IGNORE;
+ value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
+ if (!value_type)
+ return -EINVAL;
+ list_node = strstr(value_type, ":");
+ if (!list_node)
+ return -EINVAL;
+ value_type = kstrndup(value_type, list_node - value_type, GFP_KERNEL | __GFP_NOWARN);
+ if (!value_type)
+ return -ENOMEM;
+ id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
+ kfree(value_type);
+ if (id < 0)
+ return id;
+ list_node++;
+ if (str_is_empty(list_node))
+ return -EINVAL;
+ info->type = BPF_LIST_HEAD;
+ info->off = off;
+ info->list_head.value_btf_id = id;
+ info->list_head.node_name = list_node;
return BTF_FIELD_FOUND;
}
-static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
- const char *name, int sz, int align,
- enum btf_field_type field_type,
+static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
+ int *align, int *sz)
+{
+ int type = 0;
+
+ if (field_mask & BPF_SPIN_LOCK) {
+ if (!strcmp(name, "bpf_spin_lock")) {
+ if (*seen_mask & BPF_SPIN_LOCK)
+ return -E2BIG;
+ *seen_mask |= BPF_SPIN_LOCK;
+ type = BPF_SPIN_LOCK;
+ goto end;
+ }
+ }
+ if (field_mask & BPF_TIMER) {
+ if (!strcmp(name, "bpf_timer")) {
+ if (*seen_mask & BPF_TIMER)
+ return -E2BIG;
+ *seen_mask |= BPF_TIMER;
+ type = BPF_TIMER;
+ goto end;
+ }
+ }
+ if (field_mask & BPF_LIST_HEAD) {
+ if (!strcmp(name, "bpf_list_head")) {
+ type = BPF_LIST_HEAD;
+ goto end;
+ }
+ }
+ if (field_mask & BPF_LIST_NODE) {
+ if (!strcmp(name, "bpf_list_node")) {
+ type = BPF_LIST_NODE;
+ goto end;
+ }
+ }
+ /* Only return BPF_KPTR when all other types with matchable names fail */
+ if (field_mask & BPF_KPTR) {
+ type = BPF_KPTR_REF;
+ goto end;
+ }
+ return 0;
+end:
+ *sz = btf_field_type_size(type);
+ *align = btf_field_type_align(type);
+ return type;
+}
+
+static int btf_find_struct_field(const struct btf *btf,
+ const struct btf_type *t, u32 field_mask,
struct btf_field_info *info, int info_cnt)
{
+ int ret, idx = 0, align, sz, field_type;
const struct btf_member *member;
struct btf_field_info tmp;
- int ret, idx = 0;
- u32 i, off;
+ u32 i, off, seen_mask = 0;
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
- if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
+ field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off),
+ field_mask, &seen_mask, &align, &sz);
+ if (field_type == 0)
continue;
+ if (field_type < 0)
+ return field_type;
off = __btf_member_bit_offset(t, member);
if (off % 8)
@@ -3277,22 +3413,30 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
return -EINVAL;
off /= 8;
if (off % align)
- return -EINVAL;
+ continue;
switch (field_type) {
- case BTF_FIELD_SPIN_LOCK:
- case BTF_FIELD_TIMER:
- ret = btf_find_struct(btf, member_type, off, sz,
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_LIST_NODE:
+ ret = btf_find_struct(btf, member_type, off, sz, field_type,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
- case BTF_FIELD_KPTR:
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
ret = btf_find_kptr(btf, member_type, off, sz,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
+ case BPF_LIST_HEAD:
+ ret = btf_find_list_head(btf, t, member_type, i, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
default:
return -EFAULT;
}
@@ -3307,42 +3451,53 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
}
static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
- const char *name, int sz, int align,
- enum btf_field_type field_type,
- struct btf_field_info *info, int info_cnt)
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt)
{
+ int ret, idx = 0, align, sz, field_type;
const struct btf_var_secinfo *vsi;
struct btf_field_info tmp;
- int ret, idx = 0;
- u32 i, off;
+ u32 i, off, seen_mask = 0;
for_each_vsi(i, t, vsi) {
const struct btf_type *var = btf_type_by_id(btf, vsi->type);
const struct btf_type *var_type = btf_type_by_id(btf, var->type);
- off = vsi->offset;
-
- if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
+ field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off),
+ field_mask, &seen_mask, &align, &sz);
+ if (field_type == 0)
continue;
+ if (field_type < 0)
+ return field_type;
+
+ off = vsi->offset;
if (vsi->size != sz)
continue;
if (off % align)
- return -EINVAL;
+ continue;
switch (field_type) {
- case BTF_FIELD_SPIN_LOCK:
- case BTF_FIELD_TIMER:
- ret = btf_find_struct(btf, var_type, off, sz,
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_LIST_NODE:
+ ret = btf_find_struct(btf, var_type, off, sz, field_type,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
- case BTF_FIELD_KPTR:
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
ret = btf_find_kptr(btf, var_type, off, sz,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
+ case BPF_LIST_HEAD:
+ ret = btf_find_list_head(btf, var, var_type, -1, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
default:
return -EFAULT;
}
@@ -3357,169 +3512,327 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
}
static int btf_find_field(const struct btf *btf, const struct btf_type *t,
- enum btf_field_type field_type,
- struct btf_field_info *info, int info_cnt)
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt)
{
- const char *name;
- int sz, align;
-
- switch (field_type) {
- case BTF_FIELD_SPIN_LOCK:
- name = "bpf_spin_lock";
- sz = sizeof(struct bpf_spin_lock);
- align = __alignof__(struct bpf_spin_lock);
- break;
- case BTF_FIELD_TIMER:
- name = "bpf_timer";
- sz = sizeof(struct bpf_timer);
- align = __alignof__(struct bpf_timer);
- break;
- case BTF_FIELD_KPTR:
- name = NULL;
- sz = sizeof(u64);
- align = 8;
- break;
- default:
- return -EFAULT;
- }
-
if (__btf_type_is_struct(t))
- return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt);
+ return btf_find_struct_field(btf, t, field_mask, info, info_cnt);
else if (btf_type_is_datasec(t))
- return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt);
+ return btf_find_datasec_var(btf, t, field_mask, info, info_cnt);
return -EINVAL;
}
-/* find 'struct bpf_spin_lock' in map value.
- * return >= 0 offset if found
- * and < 0 in case of error
- */
-int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
{
- struct btf_field_info info;
+ struct module *mod = NULL;
+ const struct btf_type *t;
+ struct btf *kernel_btf;
int ret;
+ s32 id;
- ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1);
- if (ret < 0)
- return ret;
- if (!ret)
- return -ENOENT;
- return info.off;
+ /* Find type in map BTF, and use it to look up the matching type
+ * in vmlinux or module BTFs, by name and kind.
+ */
+ t = btf_type_by_id(btf, info->kptr.type_id);
+ id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
+ &kernel_btf);
+ if (id < 0)
+ return id;
+
+ /* Find and stash the function pointer for the destruction function that
+ * needs to be eventually invoked from the map free path.
+ */
+ if (info->type == BPF_KPTR_REF) {
+ const struct btf_type *dtor_func;
+ const char *dtor_func_name;
+ unsigned long addr;
+ s32 dtor_btf_id;
+
+ /* This call also serves as a whitelist of allowed objects that
+ * can be used as a referenced pointer and be stored in a map at
+ * the same time.
+ */
+ dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
+ if (dtor_btf_id < 0) {
+ ret = dtor_btf_id;
+ goto end_btf;
+ }
+
+ dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
+ if (!dtor_func) {
+ ret = -ENOENT;
+ goto end_btf;
+ }
+
+ if (btf_is_module(kernel_btf)) {
+ mod = btf_try_get_module(kernel_btf);
+ if (!mod) {
+ ret = -ENXIO;
+ goto end_btf;
+ }
+ }
+
+ /* We already verified dtor_func to be btf_type_is_func
+ * in register_btf_id_dtor_kfuncs.
+ */
+ dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
+ addr = kallsyms_lookup_name(dtor_func_name);
+ if (!addr) {
+ ret = -EINVAL;
+ goto end_mod;
+ }
+ field->kptr.dtor = (void *)addr;
+ }
+
+ field->kptr.btf_id = id;
+ field->kptr.btf = kernel_btf;
+ field->kptr.module = mod;
+ return 0;
+end_mod:
+ module_put(mod);
+end_btf:
+ btf_put(kernel_btf);
+ return ret;
}
-int btf_find_timer(const struct btf *btf, const struct btf_type *t)
+static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
{
- struct btf_field_info info;
- int ret;
+ const struct btf_type *t, *n = NULL;
+ const struct btf_member *member;
+ u32 offset;
+ int i;
- ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1);
- if (ret < 0)
- return ret;
- if (!ret)
+ t = btf_type_by_id(btf, info->list_head.value_btf_id);
+ /* We've already checked that value_btf_id is a struct type. We
+ * just need to figure out the offset of the list_node, and
+ * verify its type.
+ */
+ for_each_member(i, t, member) {
+ if (strcmp(info->list_head.node_name, __btf_name_by_offset(btf, member->name_off)))
+ continue;
+ /* Invalid BTF, two members with same name */
+ if (n)
+ return -EINVAL;
+ n = btf_type_by_id(btf, member->type);
+ if (!__btf_type_is_struct(n))
+ return -EINVAL;
+ if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off)))
+ return -EINVAL;
+ offset = __btf_member_bit_offset(n, member);
+ if (offset % 8)
+ return -EINVAL;
+ offset /= 8;
+ if (offset % __alignof__(struct bpf_list_node))
+ return -EINVAL;
+
+ field->list_head.btf = (struct btf *)btf;
+ field->list_head.value_btf_id = info->list_head.value_btf_id;
+ field->list_head.node_offset = offset;
+ }
+ if (!n)
return -ENOENT;
- return info.off;
+ return 0;
}
-struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
- const struct btf_type *t)
+struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
+ u32 field_mask, u32 value_size)
{
- struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX];
- struct bpf_map_value_off *tab;
- struct btf *kernel_btf = NULL;
- struct module *mod = NULL;
- int ret, i, nr_off;
+ struct btf_field_info info_arr[BTF_FIELDS_MAX];
+ struct btf_record *rec;
+ u32 next_off = 0;
+ int ret, i, cnt;
- ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr));
+ ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
if (ret < 0)
return ERR_PTR(ret);
if (!ret)
return NULL;
- nr_off = ret;
- tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN);
- if (!tab)
+ cnt = ret;
+ /* This needs to be kzalloc to zero out padding and unused fields, see
+ * comment in btf_record_equal.
+ */
+ rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN);
+ if (!rec)
return ERR_PTR(-ENOMEM);
- for (i = 0; i < nr_off; i++) {
- const struct btf_type *t;
- s32 id;
+ rec->spin_lock_off = -EINVAL;
+ rec->timer_off = -EINVAL;
+ for (i = 0; i < cnt; i++) {
+ if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) {
+ WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size);
+ ret = -EFAULT;
+ goto end;
+ }
+ if (info_arr[i].off < next_off) {
+ ret = -EEXIST;
+ goto end;
+ }
+ next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type);
- /* Find type in map BTF, and use it to look up the matching type
- * in vmlinux or module BTFs, by name and kind.
- */
- t = btf_type_by_id(btf, info_arr[i].type_id);
- id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
- &kernel_btf);
- if (id < 0) {
- ret = id;
+ rec->field_mask |= info_arr[i].type;
+ rec->fields[i].offset = info_arr[i].off;
+ rec->fields[i].type = info_arr[i].type;
+
+ switch (info_arr[i].type) {
+ case BPF_SPIN_LOCK:
+ WARN_ON_ONCE(rec->spin_lock_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->spin_lock_off = rec->fields[i].offset;
+ break;
+ case BPF_TIMER:
+ WARN_ON_ONCE(rec->timer_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->timer_off = rec->fields[i].offset;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_HEAD:
+ ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_NODE:
+ break;
+ default:
+ ret = -EFAULT;
goto end;
}
+ rec->cnt++;
+ }
- /* Find and stash the function pointer for the destruction function that
- * needs to be eventually invoked from the map free path.
- */
- if (info_arr[i].type == BPF_KPTR_REF) {
- const struct btf_type *dtor_func;
- const char *dtor_func_name;
- unsigned long addr;
- s32 dtor_btf_id;
-
- /* This call also serves as a whitelist of allowed objects that
- * can be used as a referenced pointer and be stored in a map at
- * the same time.
- */
- dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
- if (dtor_btf_id < 0) {
- ret = dtor_btf_id;
- goto end_btf;
- }
+ /* bpf_list_head requires bpf_spin_lock */
+ if (btf_record_has_field(rec, BPF_LIST_HEAD) && rec->spin_lock_off < 0) {
+ ret = -EINVAL;
+ goto end;
+ }
- dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
- if (!dtor_func) {
- ret = -ENOENT;
- goto end_btf;
- }
+ return rec;
+end:
+ btf_record_free(rec);
+ return ERR_PTR(ret);
+}
- if (btf_is_module(kernel_btf)) {
- mod = btf_try_get_module(kernel_btf);
- if (!mod) {
- ret = -ENXIO;
- goto end_btf;
- }
- }
+int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
+{
+ int i;
- /* We already verified dtor_func to be btf_type_is_func
- * in register_btf_id_dtor_kfuncs.
- */
- dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
- addr = kallsyms_lookup_name(dtor_func_name);
- if (!addr) {
- ret = -EINVAL;
- goto end_mod;
- }
- tab->off[i].kptr.dtor = (void *)addr;
- }
+ /* There are two owning types, kptr_ref and bpf_list_head. The former
+ * only supports storing kernel types, which can never store references
+ * to program allocated local types, atleast not yet. Hence we only need
+ * to ensure that bpf_list_head ownership does not form cycles.
+ */
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_LIST_HEAD))
+ return 0;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *meta;
+ u32 btf_id;
+
+ if (!(rec->fields[i].type & BPF_LIST_HEAD))
+ continue;
+ btf_id = rec->fields[i].list_head.value_btf_id;
+ meta = btf_find_struct_meta(btf, btf_id);
+ if (!meta)
+ return -EFAULT;
+ rec->fields[i].list_head.value_rec = meta->record;
- tab->off[i].offset = info_arr[i].off;
- tab->off[i].type = info_arr[i].type;
- tab->off[i].kptr.btf_id = id;
- tab->off[i].kptr.btf = kernel_btf;
- tab->off[i].kptr.module = mod;
+ if (!(rec->field_mask & BPF_LIST_NODE))
+ continue;
+
+ /* We need to ensure ownership acyclicity among all types. The
+ * proper way to do it would be to topologically sort all BTF
+ * IDs based on the ownership edges, since there can be multiple
+ * bpf_list_head in a type. Instead, we use the following
+ * reasoning:
+ *
+ * - A type can only be owned by another type in user BTF if it
+ * has a bpf_list_node.
+ * - A type can only _own_ another type in user BTF if it has a
+ * bpf_list_head.
+ *
+ * We ensure that if a type has both bpf_list_head and
+ * bpf_list_node, its element types cannot be owning types.
+ *
+ * To ensure acyclicity:
+ *
+ * When A only has bpf_list_head, ownership chain can be:
+ * A -> B -> C
+ * Where:
+ * - B has both bpf_list_head and bpf_list_node.
+ * - C only has bpf_list_node.
+ *
+ * When A has both bpf_list_head and bpf_list_node, some other
+ * type already owns it in the BTF domain, hence it can not own
+ * another owning type through any of the bpf_list_head edges.
+ * A -> B
+ * Where:
+ * - B only has bpf_list_node.
+ */
+ if (meta->record->field_mask & BPF_LIST_HEAD)
+ return -ELOOP;
}
- tab->nr_off = nr_off;
- return tab;
-end_mod:
- module_put(mod);
-end_btf:
- btf_put(kernel_btf);
-end:
- while (i--) {
- btf_put(tab->off[i].kptr.btf);
- if (tab->off[i].kptr.module)
- module_put(tab->off[i].kptr.module);
+ return 0;
+}
+
+static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv)
+{
+ const u32 a = *(const u32 *)_a;
+ const u32 b = *(const u32 *)_b;
+
+ if (a < b)
+ return -1;
+ else if (a > b)
+ return 1;
+ return 0;
+}
+
+static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv)
+{
+ struct btf_field_offs *foffs = (void *)priv;
+ u32 *off_base = foffs->field_off;
+ u32 *a = _a, *b = _b;
+ u8 *sz_a, *sz_b;
+
+ sz_a = foffs->field_sz + (a - off_base);
+ sz_b = foffs->field_sz + (b - off_base);
+
+ swap(*a, *b);
+ swap(*sz_a, *sz_b);
+}
+
+struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec)
+{
+ struct btf_field_offs *foffs;
+ u32 i, *off;
+ u8 *sz;
+
+ BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz));
+ if (IS_ERR_OR_NULL(rec))
+ return NULL;
+
+ foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN);
+ if (!foffs)
+ return ERR_PTR(-ENOMEM);
+
+ off = foffs->field_off;
+ sz = foffs->field_sz;
+ for (i = 0; i < rec->cnt; i++) {
+ off[i] = rec->fields[i].offset;
+ sz[i] = btf_field_type_size(rec->fields[i].type);
}
- kfree(tab);
- return ERR_PTR(ret);
+ foffs->cnt = rec->cnt;
+
+ if (foffs->cnt == 1)
+ return foffs;
+ sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]),
+ btf_field_offs_cmp, btf_field_offs_swap, foffs);
+ return foffs;
}
static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
@@ -4436,6 +4749,11 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
return -EINVAL;
}
+ if (btf_type_is_resolve_source_only(ret_type)) {
+ btf_verifier_log_type(env, t, "Invalid return type");
+ return -EINVAL;
+ }
+
if (btf_type_needs_resolve(ret_type) &&
!env_type_is_resolved(env, ret_type_id)) {
err = btf_resolve(env, ret_type, ret_type_id);
@@ -4463,7 +4781,6 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
nr_args--;
}
- err = 0;
for (i = 0; i < nr_args; i++) {
const struct btf_type *arg_type;
u32 arg_type_id;
@@ -4472,8 +4789,12 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
arg_type = btf_type_by_id(btf, arg_type_id);
if (!arg_type) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
+ }
+
+ if (btf_type_is_resolve_source_only(arg_type)) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ return -EINVAL;
}
if (args[i].name_off &&
@@ -4481,25 +4802,23 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
!btf_name_valid_identifier(btf, args[i].name_off))) {
btf_verifier_log_type(env, t,
"Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
}
if (btf_type_needs_resolve(arg_type) &&
!env_type_is_resolved(env, arg_type_id)) {
err = btf_resolve(env, arg_type, arg_type_id);
if (err)
- break;
+ return err;
}
if (!btf_type_id_size(btf, &arg_type_id, NULL)) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
}
}
- return err;
+ return 0;
}
static int btf_func_check(struct btf_verifier_env *env,
@@ -4854,7 +5173,6 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
u32 hdr_len, hdr_copy, btf_data_size;
const struct btf_header *hdr;
struct btf *btf;
- int err;
btf = env->btf;
btf_data_size = btf->data_size;
@@ -4911,11 +5229,120 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
return -EINVAL;
}
- err = btf_check_sec_info(env, btf_data_size);
- if (err)
- return err;
+ return btf_check_sec_info(env, btf_data_size);
+}
- return 0;
+static const char *alloc_obj_fields[] = {
+ "bpf_spin_lock",
+ "bpf_list_head",
+ "bpf_list_node",
+};
+
+static struct btf_struct_metas *
+btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
+{
+ union {
+ struct btf_id_set set;
+ struct {
+ u32 _cnt;
+ u32 _ids[ARRAY_SIZE(alloc_obj_fields)];
+ } _arr;
+ } aof;
+ struct btf_struct_metas *tab = NULL;
+ int i, n, id, ret;
+
+ BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0);
+ BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32));
+
+ memset(&aof, 0, sizeof(aof));
+ for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) {
+ /* Try to find whether this special type exists in user BTF, and
+ * if so remember its ID so we can easily find it among members
+ * of structs that we iterate in the next loop.
+ */
+ id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
+ if (id < 0)
+ continue;
+ aof.set.ids[aof.set.cnt++] = id;
+ }
+
+ if (!aof.set.cnt)
+ return NULL;
+ sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL);
+
+ n = btf_nr_types(btf);
+ for (i = 1; i < n; i++) {
+ struct btf_struct_metas *new_tab;
+ const struct btf_member *member;
+ struct btf_field_offs *foffs;
+ struct btf_struct_meta *type;
+ struct btf_record *record;
+ const struct btf_type *t;
+ int j, tab_cnt;
+
+ t = btf_type_by_id(btf, i);
+ if (!t) {
+ ret = -EINVAL;
+ goto free;
+ }
+ if (!__btf_type_is_struct(t))
+ continue;
+
+ cond_resched();
+
+ for_each_member(j, t, member) {
+ if (btf_id_set_contains(&aof.set, member->type))
+ goto parse;
+ }
+ continue;
+ parse:
+ tab_cnt = tab ? tab->cnt : 0;
+ new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_tab) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ if (!tab)
+ new_tab->cnt = 0;
+ tab = new_tab;
+
+ type = &tab->types[tab->cnt];
+ type->btf_id = i;
+ record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE, t->size);
+ /* The record cannot be unset, treat it as an error if so */
+ if (IS_ERR_OR_NULL(record)) {
+ ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
+ goto free;
+ }
+ foffs = btf_parse_field_offs(record);
+ /* We need the field_offs to be valid for a valid record,
+ * either both should be set or both should be unset.
+ */
+ if (IS_ERR_OR_NULL(foffs)) {
+ btf_record_free(record);
+ ret = -EFAULT;
+ goto free;
+ }
+ type->record = record;
+ type->field_offs = foffs;
+ tab->cnt++;
+ }
+ return tab;
+free:
+ btf_struct_metas_free(tab);
+ return ERR_PTR(ret);
+}
+
+struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id)
+{
+ struct btf_struct_metas *tab;
+
+ BUILD_BUG_ON(offsetof(struct btf_struct_meta, btf_id) != 0);
+ tab = btf->struct_meta_tab;
+ if (!tab)
+ return NULL;
+ return bsearch(&btf_id, tab->types, tab->cnt, sizeof(tab->types[0]), btf_id_cmp_func);
}
static int btf_check_type_tags(struct btf_verifier_env *env,
@@ -4968,6 +5395,7 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
u32 log_level, char __user *log_ubuf, u32 log_size)
{
+ struct btf_struct_metas *struct_meta_tab;
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
struct btf *btf = NULL;
@@ -5036,15 +5464,34 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
if (err)
goto errout;
+ struct_meta_tab = btf_parse_struct_metas(log, btf);
+ if (IS_ERR(struct_meta_tab)) {
+ err = PTR_ERR(struct_meta_tab);
+ goto errout;
+ }
+ btf->struct_meta_tab = struct_meta_tab;
+
+ if (struct_meta_tab) {
+ int i;
+
+ for (i = 0; i < struct_meta_tab->cnt; i++) {
+ err = btf_check_and_fixup_fields(btf, struct_meta_tab->types[i].record);
+ if (err < 0)
+ goto errout_meta;
+ }
+ }
+
if (log->level && bpf_verifier_log_full(log)) {
err = -ENOSPC;
- goto errout;
+ goto errout_meta;
}
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
+errout_meta:
+ btf_free_struct_meta_tab(btf);
errout:
btf_verifier_env_free(env);
if (btf)
@@ -5086,7 +5533,7 @@ static u8 bpf_ctx_convert_map[] = {
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
-static const struct btf_member *
+const struct btf_member *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg)
@@ -5159,6 +5606,26 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
return kern_ctx_type->type;
}
+int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type)
+{
+ const struct btf_member *kctx_member;
+ const struct btf_type *conv_struct;
+ const struct btf_type *kctx_type;
+ u32 kctx_type_id;
+
+ conv_struct = bpf_ctx_convert.t;
+ /* get member for kernel ctx type */
+ kctx_member = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+ kctx_type_id = kctx_member->type;
+ kctx_type = btf_type_by_id(btf_vmlinux, kctx_type_id);
+ if (!btf_type_is_struct(kctx_type)) {
+ bpf_log(log, "kern ctx type id %u is not a struct\n", kctx_type_id);
+ return -EINVAL;
+ }
+
+ return kctx_type_id;
+}
+
BTF_ID_LIST(bpf_ctx_convert_btf_id)
BTF_ID(struct, bpf_ctx_convert)
@@ -5328,6 +5795,50 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
return btf_type_is_int(t);
}
+static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
+ int off)
+{
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 offset = 0, nr_args;
+ int i;
+
+ if (!func_proto)
+ return off / 8;
+
+ nr_args = btf_type_vlen(func_proto);
+ args = (const struct btf_param *)(func_proto + 1);
+ for (i = 0; i < nr_args; i++) {
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return i;
+ }
+
+ t = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return nr_args;
+
+ return nr_args + 1;
+}
+
+static bool prog_args_trusted(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type atype = prog->expected_attach_type;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER;
+ case BPF_PROG_TYPE_LSM:
+ return bpf_lsm_is_trusted(prog);
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -5347,7 +5858,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
tname, off);
return false;
}
- arg = off / 8;
+ arg = get_ctx_arg_idx(btf, t, off);
args = (const struct btf_param *)(t + 1);
/* if (t == NULL) Fall back to default BPF prog with
* MAX_BPF_FUNC_REG_ARGS u64 arguments.
@@ -5398,7 +5909,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (!btf_type_is_small_int(t)) {
bpf_log(log,
"ret type %s not allowed for fmod_ret\n",
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ btf_type_str(t));
return false;
}
break;
@@ -5417,7 +5928,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_small_int(t) || btf_is_any_enum(t))
+ if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -5425,7 +5936,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
"func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
tname, arg,
__btf_name_by_offset(btf, t->name_off),
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ btf_type_str(t));
return false;
}
@@ -5471,6 +5982,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
info->reg_type = PTR_TO_BTF_ID;
+ if (prog_args_trusted(prog))
+ info->reg_type |= PTR_TRUSTED;
+
if (tgt_prog) {
enum bpf_prog_type tgt_type;
@@ -5509,11 +6023,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (!btf_type_is_struct(t)) {
bpf_log(log,
"func '%s' arg%d type %s is not a struct\n",
- tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, arg, btf_type_str(t));
return false;
}
bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
- tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)],
+ tname, arg, info->btf_id, btf_type_str(t),
__btf_name_by_offset(btf, t->name_off));
return true;
}
@@ -5737,6 +6251,9 @@ error:
/* check __percpu tag */
if (strcmp(tag_value, "percpu") == 0)
tmp_flag = MEM_PERCPU;
+ /* check __rcu tag */
+ if (strcmp(tag_value, "rcu") == 0)
+ tmp_flag = MEM_RCU;
}
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
@@ -5766,20 +6283,50 @@ error:
return -EINVAL;
}
-int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
- const struct btf_type *t, int off, int size,
- enum bpf_access_type atype __maybe_unused,
+int btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype __maybe_unused,
u32 *next_btf_id, enum bpf_type_flag *flag)
{
+ const struct btf *btf = reg->btf;
enum bpf_type_flag tmp_flag = 0;
+ const struct btf_type *t;
+ u32 id = reg->btf_id;
int err;
- u32 id;
+ while (type_is_alloc(reg->type)) {
+ struct btf_struct_meta *meta;
+ struct btf_record *rec;
+ int i;
+
+ meta = btf_find_struct_meta(btf, id);
+ if (!meta)
+ break;
+ rec = meta->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 offset = field->offset;
+ if (off < offset + btf_field_type_size(field->type) && offset < off + size) {
+ bpf_log(log,
+ "direct access to %s is disallowed\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
+ }
+ break;
+ }
+
+ t = btf_type_by_id(btf, id);
do {
err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
switch (err) {
case WALK_PTR:
+ /* For local types, the destination register cannot
+ * become a pointer again.
+ */
+ if (type_is_alloc(reg->type))
+ return SCALAR_VALUE;
/* If we found the pointer or scalar on t+off,
* we're done.
*/
@@ -5814,8 +6361,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
* end up with two different module BTFs, but IDs point to the common type in
* vmlinux BTF.
*/
-static bool btf_types_are_same(const struct btf *btf1, u32 id1,
- const struct btf *btf2, u32 id2)
+bool btf_types_are_same(const struct btf *btf1, u32 id1,
+ const struct btf *btf2, u32 id2)
{
if (id1 != id2)
return false;
@@ -5864,26 +6411,25 @@ again:
}
static int __get_type_size(struct btf *btf, u32 btf_id,
- const struct btf_type **bad_type)
+ const struct btf_type **ret_type)
{
const struct btf_type *t;
+ *ret_type = btf_type_by_id(btf, 0);
if (!btf_id)
/* void */
return 0;
t = btf_type_by_id(btf, btf_id);
while (t && btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (!t) {
- *bad_type = btf_type_by_id(btf, 0);
+ if (!t)
return -EINVAL;
- }
+ *ret_type = t;
if (btf_type_is_ptr(t))
/* kernel size of pointer. Not BPF's size of pointer*/
return sizeof(void *);
- if (btf_type_is_int(t) || btf_is_any_enum(t))
+ if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
return t->size;
- *bad_type = t;
return -EINVAL;
}
@@ -5902,8 +6448,10 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
/* BTF function prototype doesn't match the verifier types.
* Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
*/
- for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
m->arg_size[i] = 8;
+ m->arg_flags[i] = 0;
+ }
m->ret_size = 8;
m->nr_args = MAX_BPF_FUNC_REG_ARGS;
return 0;
@@ -5917,10 +6465,10 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = __get_type_size(btf, func->type, &t);
- if (ret < 0) {
+ if (ret < 0 || __btf_type_is_struct(t)) {
bpf_log(log,
"The function %s return type %s is unsupported.\n",
- tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, btf_type_str(t));
return -EINVAL;
}
m->ret_size = ret;
@@ -5933,10 +6481,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = __get_type_size(btf, args[i].type, &t);
- if (ret < 0) {
+
+ /* No support of struct argument size greater than 16 bytes */
+ if (ret < 0 || ret > 16) {
bpf_log(log,
"The function %s arg%d type %s is unsupported.\n",
- tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, i, btf_type_str(t));
return -EINVAL;
}
if (ret == 0) {
@@ -5946,6 +6496,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
m->arg_size[i] = ret;
+ m->arg_flags[i] = __btf_type_is_struct(t) ? BTF_FMODEL_STRUCT_ARG : 0;
}
m->nr_args = nargs;
return 0;
@@ -6093,95 +6644,19 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
-static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
-#ifdef CONFIG_NET
- [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
- [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
- [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
-#endif
-};
-
-/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
-static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
- const struct btf *btf,
- const struct btf_type *t, int rec)
-{
- const struct btf_type *member_type;
- const struct btf_member *member;
- u32 i;
-
- if (!btf_type_is_struct(t))
- return false;
-
- for_each_member(i, t, member) {
- const struct btf_array *array;
-
- member_type = btf_type_skip_modifiers(btf, member->type, NULL);
- if (btf_type_is_struct(member_type)) {
- if (rec >= 3) {
- bpf_log(log, "max struct nesting depth exceeded\n");
- return false;
- }
- if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1))
- return false;
- continue;
- }
- if (btf_type_is_array(member_type)) {
- array = btf_type_array(member_type);
- if (!array->nelems)
- return false;
- member_type = btf_type_skip_modifiers(btf, array->type, NULL);
- if (!btf_type_is_scalar(member_type))
- return false;
- continue;
- }
- if (!btf_type_is_scalar(member_type))
- return false;
- }
- return true;
-}
-
-static bool is_kfunc_arg_mem_size(const struct btf *btf,
- const struct btf_param *arg,
- const struct bpf_reg_state *reg)
-{
- int len, sfx_len = sizeof("__sz") - 1;
- const struct btf_type *t;
- const char *param_name;
-
- t = btf_type_skip_modifiers(btf, arg->type, NULL);
- if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
- return false;
-
- /* In the future, this can be ported to use BTF tagging */
- param_name = btf_name_by_offset(btf, arg->name_off);
- if (str_is_empty(param_name))
- return false;
- len = strlen(param_name);
- if (len < sfx_len)
- return false;
- param_name += len - sfx_len;
- if (strncmp(param_name, "__sz", sfx_len))
- return false;
-
- return true;
-}
-
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
bool ptr_to_mem_ok,
- u32 kfunc_flags)
+ bool processing_call)
{
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
- bool rel = false, kptr_get = false, trusted_arg = false;
struct bpf_verifier_log *log = &env->log;
- u32 i, nargs, ref_id, ref_obj_id = 0;
- bool is_kfunc = btf_is_kernel(btf);
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
- int ref_regno = 0, ret;
+ u32 i, nargs, ref_id;
+ int ret;
t = btf_type_by_id(btf, func_id);
if (!t || !btf_type_is_func(t)) {
@@ -6207,13 +6682,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
- if (is_kfunc) {
- /* Only kfunc can be release func */
- rel = kfunc_flags & KF_RELEASE;
- kptr_get = kfunc_flags & KF_KPTR_GET;
- trusted_arg = kfunc_flags & KF_TRUSTED_ARGS;
- }
-
/* check that BTF function arguments match actual types that the
* verifier sees.
*/
@@ -6236,70 +6704,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
- /* Check if argument must be a referenced pointer, args + i has
- * been verified to be a pointer (after skipping modifiers).
- */
- if (is_kfunc && trusted_arg && !reg->ref_obj_id) {
- bpf_log(log, "R%d must be referenced\n", regno);
- return -EINVAL;
- }
-
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
- /* Trusted args have the same offset checks as release arguments */
- if (trusted_arg || (rel && reg->ref_obj_id))
- arg_type |= OBJ_RELEASE;
ret = check_func_arg_reg_off(env, reg, regno, arg_type);
if (ret < 0)
return ret;
- /* kptr_get is only true for kfunc */
- if (i == 0 && kptr_get) {
- struct bpf_map_value_off_desc *off_desc;
-
- if (reg->type != PTR_TO_MAP_VALUE) {
- bpf_log(log, "arg#0 expected pointer to map value\n");
- return -EINVAL;
- }
-
- /* check_func_arg_reg_off allows var_off for
- * PTR_TO_MAP_VALUE, but we need fixed offset to find
- * off_desc.
- */
- if (!tnum_is_const(reg->var_off)) {
- bpf_log(log, "arg#0 must have constant offset\n");
- return -EINVAL;
- }
-
- off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value);
- if (!off_desc || off_desc->type != BPF_KPTR_REF) {
- bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n",
- reg->off + reg->var_off.value);
- return -EINVAL;
- }
-
- if (!btf_type_is_ptr(ref_t)) {
- bpf_log(log, "arg#0 BTF type must be a double pointer\n");
- return -EINVAL;
- }
-
- ref_t = btf_type_skip_modifiers(btf, ref_t->type, &ref_id);
- ref_tname = btf_name_by_offset(btf, ref_t->name_off);
-
- if (!btf_type_is_struct(ref_t)) {
- bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
- func_name, i, btf_type_str(ref_t), ref_tname);
- return -EINVAL;
- }
- if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf,
- off_desc->kptr.btf_id, true)) {
- bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n",
- func_name, i, btf_type_str(ref_t), ref_tname);
- return -EINVAL;
- }
- /* rest of the arguments can be anything, like normal kfunc */
- } else if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+ if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
@@ -6309,86 +6721,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
i, btf_type_str(t));
return -EINVAL;
}
- } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID ||
- (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) {
- const struct btf_type *reg_ref_t;
- const struct btf *reg_btf;
- const char *reg_ref_tname;
- u32 reg_ref_id;
-
- if (!btf_type_is_struct(ref_t)) {
- bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
- func_name, i, btf_type_str(ref_t),
- ref_tname);
- return -EINVAL;
- }
-
- if (reg->type == PTR_TO_BTF_ID) {
- reg_btf = reg->btf;
- reg_ref_id = reg->btf_id;
- /* Ensure only one argument is referenced PTR_TO_BTF_ID */
- if (reg->ref_obj_id) {
- if (ref_obj_id) {
- bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
- regno, reg->ref_obj_id, ref_obj_id);
- return -EFAULT;
- }
- ref_regno = regno;
- ref_obj_id = reg->ref_obj_id;
- }
- } else {
- reg_btf = btf_vmlinux;
- reg_ref_id = *reg2btf_ids[base_type(reg->type)];
- }
-
- reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
- &reg_ref_id);
- reg_ref_tname = btf_name_by_offset(reg_btf,
- reg_ref_t->name_off);
- if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
- reg->off, btf, ref_id,
- trusted_arg || (rel && reg->ref_obj_id))) {
- bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
- func_name, i,
- btf_type_str(ref_t), ref_tname,
- regno, btf_type_str(reg_ref_t),
- reg_ref_tname);
- return -EINVAL;
- }
- } else if (ptr_to_mem_ok) {
+ } else if (ptr_to_mem_ok && processing_call) {
const struct btf_type *resolve_ret;
u32 type_size;
- if (is_kfunc) {
- bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], &regs[regno + 1]);
-
- /* Permit pointer to mem, but only when argument
- * type is pointer to scalar, or struct composed
- * (recursively) of scalars.
- * When arg_mem_size is true, the pointer can be
- * void *.
- */
- if (!btf_type_is_scalar(ref_t) &&
- !__btf_type_is_scalar_struct(log, btf, ref_t, 0) &&
- (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
- bpf_log(log,
- "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
- i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
- return -EINVAL;
- }
-
- /* Check for mem, len pair */
- if (arg_mem_size) {
- if (check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1)) {
- bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n",
- i, i + 1);
- return -EINVAL;
- }
- i++;
- continue;
- }
- }
-
resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
if (IS_ERR(resolve_ret)) {
bpf_log(log,
@@ -6401,29 +6737,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (check_mem_reg(env, reg, regno, type_size))
return -EINVAL;
} else {
- bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i,
- is_kfunc ? "kernel " : "", func_name, func_id);
+ bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i,
+ func_name, func_id);
return -EINVAL;
}
}
- /* Either both are set, or neither */
- WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno));
- /* We already made sure ref_obj_id is set only for one argument. We do
- * allow (!rel && ref_obj_id), so that passing such referenced
- * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when
- * is_kfunc is true.
- */
- if (rel && !ref_obj_id) {
- bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
- func_name);
- return -EINVAL;
- }
- /* returns argument register number > 0 in case of reference release kfunc */
- return rel ? ref_regno : 0;
+ return 0;
}
-/* Compare BTF of a function with given bpf_reg_state.
+/* Compare BTF of a function declaration with given bpf_reg_state.
* Returns:
* EFAULT - there is a verifier bug. Abort verification.
* EINVAL - there is a type mismatch or BTF is not available.
@@ -6450,7 +6773,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0);
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, false);
/* Compiler optimizations can remove arguments from static functions
* or mismatched type can be passed into a global function.
@@ -6461,12 +6784,47 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
return err;
}
-int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
- const struct btf *btf, u32 func_id,
- struct bpf_reg_state *regs,
- u32 kfunc_flags)
+/* Compare BTF of a function call with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ *
+ * NOTE: the code is duplicated from btf_check_subprog_arg_match()
+ * because btf_check_func_arg_match() is still doing both. Once that
+ * function is split in 2, we can call from here btf_check_subprog_arg_match()
+ * first, and then treat the calling part in a new code path.
+ */
+int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *regs)
{
- return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags);
+ struct bpf_prog *prog = env->prog;
+ struct btf *btf = prog->aux->btf;
+ bool is_global;
+ u32 btf_id;
+ int err;
+
+ if (!prog->aux->func_info)
+ return -EINVAL;
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ if (prog->aux->func_info_aux[subprog].unreliable)
+ return -EINVAL;
+
+ is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, true);
+
+ /* Compiler optimizations can remove arguments from static functions
+ * or mismatched type can be passed into a global function.
+ * In such cases mark the function as unreliable from BTF point of view.
+ */
+ if (err)
+ prog->aux->func_info_aux[subprog].unreliable = true;
+ return err;
}
/* Convert BTF of a function into bpf_reg_state if possible
@@ -6580,7 +6938,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
continue;
}
bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
+ i, btf_type_str(t), tname);
return -EINVAL;
}
return 0;
@@ -6851,23 +7209,6 @@ bool btf_is_module(const struct btf *btf)
return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0;
}
-static int btf_id_cmp_func(const void *a, const void *b)
-{
- const int *pa = a, *pb = b;
-
- return *pa - *pb;
-}
-
-bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
-{
- return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
-}
-
-static void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id)
-{
- return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func);
-}
-
enum {
BTF_MODULE_F_LIVE = (1 << 0),
};
@@ -7228,6 +7569,8 @@ static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
{
switch (prog_type) {
+ case BPF_PROG_TYPE_UNSPEC:
+ return BTF_KFUNC_HOOK_COMMON;
case BPF_PROG_TYPE_XDP:
return BTF_KFUNC_HOOK_XDP;
case BPF_PROG_TYPE_SCHED_CLS:
@@ -7235,6 +7578,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_STRUCT_OPS:
return BTF_KFUNC_HOOK_STRUCT_OPS;
case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
return BTF_KFUNC_HOOK_TRACING;
case BPF_PROG_TYPE_SYSCALL:
return BTF_KFUNC_HOOK_SYSCALL;
@@ -7255,16 +7599,24 @@ u32 *btf_kfunc_id_set_contains(const struct btf *btf,
u32 kfunc_btf_id)
{
enum btf_kfunc_hook hook;
+ u32 *kfunc_flags;
+
+ kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id);
+ if (kfunc_flags)
+ return kfunc_flags;
hook = bpf_prog_type_to_kfunc_hook(prog_type);
return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
}
-/* This function must be invoked only from initcalls/module init functions */
-int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
- const struct btf_kfunc_id_set *kset)
+u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id)
+{
+ return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id);
+}
+
+static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
+ const struct btf_kfunc_id_set *kset)
{
- enum btf_kfunc_hook hook;
struct btf *btf;
int ret;
@@ -7283,13 +7635,29 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
if (IS_ERR(btf))
return PTR_ERR(btf);
- hook = bpf_prog_type_to_kfunc_hook(prog_type);
ret = btf_populate_kfunc_set(btf, hook, kset->set);
btf_put(btf);
return ret;
}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+ const struct btf_kfunc_id_set *kset)
+{
+ enum btf_kfunc_hook hook;
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __register_btf_kfunc_id_set(hook, kset);
+}
EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset)
+{
+ return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset);
+}
+EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set);
+
s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id)
{
struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 4a400cd63731..bf2fdb33fb31 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1020,6 +1020,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
__u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+ bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
enum bpf_attach_type type = attr->query.attach_type;
enum cgroup_bpf_attach_type from_atype, to_atype;
@@ -1029,8 +1030,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
int total_cnt = 0;
u32 flags;
+ if (effective_query && prog_attach_flags)
+ return -EINVAL;
+
if (type == BPF_LSM_CGROUP) {
- if (attr->query.prog_cnt && prog_ids && !prog_attach_flags)
+ if (!effective_query && attr->query.prog_cnt &&
+ prog_ids && !prog_attach_flags)
return -EINVAL;
from_atype = CGROUP_LSM_START;
@@ -1045,7 +1050,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
}
for (atype = from_atype; atype <= to_atype; atype++) {
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+ if (effective_query) {
effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
total_cnt += bpf_prog_array_length(effective);
@@ -1054,6 +1059,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
}
}
+ /* always output uattr->query.attach_flags as 0 during effective query */
+ flags = effective_query ? 0 : flags;
if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
return -EFAULT;
if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
@@ -1068,7 +1075,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
}
for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+ if (effective_query) {
effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
@@ -1090,15 +1097,16 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
if (++i == cnt)
break;
}
- }
- if (prog_attach_flags) {
- flags = cgrp->bpf.flags[atype];
+ if (prog_attach_flags) {
+ flags = cgrp->bpf.flags[atype];
- for (i = 0; i < cnt; i++)
- if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags)))
- return -EFAULT;
- prog_attach_flags += cnt;
+ for (i = 0; i < cnt; i++)
+ if (copy_to_user(prog_attach_flags + i,
+ &flags, sizeof(flags)))
+ return -EFAULT;
+ prog_attach_flags += cnt;
+ }
}
prog_ids += cnt;
@@ -1529,6 +1537,37 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
return ret;
}
+BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
+{
+ /* flags argument is not used now,
+ * but provides an ability to extend the API.
+ * verifier checks that its value is correct.
+ */
+ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+ struct bpf_cgroup_storage *storage;
+ struct bpf_cg_run_ctx *ctx;
+ void *ptr;
+
+ /* get current cgroup storage from BPF run context */
+ ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+ storage = ctx->prog_item->cgroup_storage[stype];
+
+ if (stype == BPF_CGROUP_STORAGE_SHARED)
+ ptr = &READ_ONCE(storage->buf)->data[0];
+ else
+ ptr = this_cpu_ptr(storage->percpu_buf);
+
+ return (unsigned long)ptr;
+}
+
+const struct bpf_func_proto bpf_get_local_storage_proto = {
+ .func = bpf_get_local_storage,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
BPF_CALL_0(bpf_get_retval)
{
struct bpf_cg_run_ctx *ctx =
@@ -1560,32 +1599,26 @@ const struct bpf_func_proto bpf_set_retval_proto = {
};
static const struct bpf_func_proto *
-cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
- case BPF_FUNC_get_retval:
- return &bpf_get_retval_proto;
- case BPF_FUNC_set_retval:
- return &bpf_set_retval_proto;
default:
return bpf_base_func_proto(func_id);
}
}
-static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- return cgroup_base_func_proto(func_id, prog);
-}
-
static bool cgroup_dev_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -2098,11 +2131,17 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_strtol:
- return &bpf_strtol_proto;
- case BPF_FUNC_strtoul:
- return &bpf_strtoul_proto;
case BPF_FUNC_sysctl_get_name:
return &bpf_sysctl_get_name_proto;
case BPF_FUNC_sysctl_get_current_value:
@@ -2113,8 +2152,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sysctl_set_new_value_proto;
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id);
}
}
@@ -2235,6 +2276,16 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
#ifdef CONFIG_NET
case BPF_FUNC_get_netns_cookie:
@@ -2256,8 +2307,10 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id);
}
}
@@ -2422,3 +2475,69 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
const struct bpf_prog_ops cg_sockopt_prog_ops = {
};
+
+/* Common helpers for cgroup hooks. */
+const struct bpf_func_proto *
+cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_get_retval_proto;
+ }
+ case BPF_FUNC_set_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_set_retval_proto;
+ }
+ default:
+ return NULL;
+ }
+}
+
+/* Common helpers for cgroup hooks with valid process context. */
+const struct bpf_func_proto *
+cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
+ default:
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
new file mode 100644
index 000000000000..06989d278846
--- /dev/null
+++ b/kernel/bpf/cgroup_iter.c
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/cgroup.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
+
+/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
+ *
+ * 1. Walk the descendants of a cgroup in pre-order.
+ * 2. Walk the descendants of a cgroup in post-order.
+ * 3. Walk the ancestors of a cgroup.
+ * 4. Show the given cgroup only.
+ *
+ * For walking descendants, cgroup_iter can walk in either pre-order or
+ * post-order. For walking ancestors, the iter walks up from a cgroup to
+ * the root.
+ *
+ * The iter program can terminate the walk early by returning 1. Walk
+ * continues if prog returns 0.
+ *
+ * The prog can check (seq->num == 0) to determine whether this is
+ * the first element. The prog may also be passed a NULL cgroup,
+ * which means the walk has completed and the prog has a chance to
+ * do post-processing, such as outputting an epilogue.
+ *
+ * Note: the iter_prog is called with cgroup_mutex held.
+ *
+ * Currently only one session is supported, which means, depending on the
+ * volume of data bpf program intends to send to user space, the number
+ * of cgroups that can be walked is limited. For example, given the current
+ * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each
+ * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can
+ * be walked is 512. This is a limitation of cgroup_iter. If the output data
+ * is larger than the kernel buffer size, after all data in the kernel buffer
+ * is consumed by user space, the subsequent read() syscall will signal
+ * EOPNOTSUPP. In order to work around, the user may have to update their
+ * program to reduce the volume of data sent to output. For example, skip
+ * some uninteresting cgroups.
+ */
+
+struct bpf_iter__cgroup {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct cgroup *, cgroup);
+};
+
+struct cgroup_iter_priv {
+ struct cgroup_subsys_state *start_css;
+ bool visited_all;
+ bool terminate;
+ int order;
+};
+
+static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ mutex_lock(&cgroup_mutex);
+
+ /* cgroup_iter doesn't support read across multiple sessions. */
+ if (*pos > 0) {
+ if (p->visited_all)
+ return NULL;
+
+ /* Haven't visited all, but because cgroup_mutex has dropped,
+ * return -EOPNOTSUPP to indicate incomplete iteration.
+ */
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ ++*pos;
+ p->terminate = false;
+ p->visited_all = false;
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(NULL, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(NULL, p->start_css);
+ else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
+ return p->start_css;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop);
+
+static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ mutex_unlock(&cgroup_mutex);
+
+ /* pass NULL to the prog for post-processing */
+ if (!v) {
+ __cgroup_iter_seq_show(seq, NULL, true);
+ p->visited_all = true;
+ }
+}
+
+static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
+ struct cgroup_iter_priv *p = seq->private;
+
+ ++*pos;
+ if (p->terminate)
+ return NULL;
+
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ return curr->parent;
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ return NULL;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop)
+{
+ struct cgroup_iter_priv *p = seq->private;
+ struct bpf_iter__cgroup ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ /* cgroup is dead, skip this element */
+ if (css && cgroup_is_dead(css->cgroup))
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.cgroup = css ? css->cgroup : NULL;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ /* if prog returns > 0, terminate after this element. */
+ if (ret != 0)
+ p->terminate = true;
+
+ return 0;
+}
+
+static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
+ false);
+}
+
+static const struct seq_operations cgroup_iter_seq_ops = {
+ .start = cgroup_iter_seq_start,
+ .next = cgroup_iter_seq_next,
+ .stop = cgroup_iter_seq_stop,
+ .show = cgroup_iter_seq_show,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
+
+static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+ struct cgroup *cgrp = aux->cgroup.start;
+
+ /* bpf_iter_attach_cgroup() has already acquired an extra reference
+ * for the start cgroup, but the reference may be released after
+ * cgroup_iter_seq_init(), so acquire another reference for the
+ * start cgroup.
+ */
+ p->start_css = &cgrp->self;
+ css_get(p->start_css);
+ p->terminate = false;
+ p->visited_all = false;
+ p->order = aux->cgroup.order;
+ return 0;
+}
+
+static void cgroup_iter_seq_fini(void *priv)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+
+ css_put(p->start_css);
+}
+
+static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
+ .seq_ops = &cgroup_iter_seq_ops,
+ .init_seq_private = cgroup_iter_seq_init,
+ .fini_seq_private = cgroup_iter_seq_fini,
+ .seq_priv_size = sizeof(struct cgroup_iter_priv),
+};
+
+static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ int fd = linfo->cgroup.cgroup_fd;
+ u64 id = linfo->cgroup.cgroup_id;
+ int order = linfo->cgroup.order;
+ struct cgroup *cgrp;
+
+ if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
+ order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
+ order != BPF_CGROUP_ITER_ANCESTORS_UP &&
+ order != BPF_CGROUP_ITER_SELF_ONLY)
+ return -EINVAL;
+
+ if (fd && id)
+ return -EINVAL;
+
+ if (fd)
+ cgrp = cgroup_v1v2_get_from_fd(fd);
+ else if (id)
+ cgrp = cgroup_get_from_id(id);
+ else /* walk the entire hierarchy by default. */
+ cgrp = cgroup_get_from_path("/");
+
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ aux->cgroup.start = cgrp;
+ aux->cgroup.order = order;
+ return 0;
+}
+
+static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
+{
+ cgroup_put(aux->cgroup.start);
+}
+
+static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ char *buf;
+
+ buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ seq_puts(seq, "cgroup_path:\t<unknown>\n");
+ goto show_order;
+ }
+
+ /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
+ * will print nothing.
+ *
+ * Path is in the calling process's cgroup namespace.
+ */
+ cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ seq_printf(seq, "cgroup_path:\t%s\n", buf);
+ kfree(buf);
+
+show_order:
+ if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ seq_puts(seq, "order: descendants_pre\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ seq_puts(seq, "order: descendants_post\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ seq_puts(seq, "order: ancestors_up\n");
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ seq_puts(seq, "order: self_only\n");
+}
+
+static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info)
+{
+ info->iter.cgroup.order = aux->cgroup.order;
+ info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start);
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta,
+ struct cgroup *cgroup)
+
+static struct bpf_iter_reg bpf_cgroup_reg_info = {
+ .target = "cgroup",
+ .feature = BPF_ITER_RESCHED,
+ .attach_target = bpf_iter_attach_cgroup,
+ .detach_target = bpf_iter_detach_cgroup,
+ .show_fdinfo = bpf_iter_cgroup_show_fdinfo,
+ .fill_link_info = bpf_iter_cgroup_fill_link_info,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__cgroup, cgroup),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &cgroup_iter_seq_info,
+};
+
+static int __init bpf_cgroup_iter_init(void)
+{
+ bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0];
+ return bpf_iter_reg_target(&bpf_cgroup_reg_info);
+}
+
+late_initcall(bpf_cgroup_iter_init);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3d9eb3ae334c..ba3fff17e2f9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -34,6 +34,7 @@
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
#include <linux/nodemask.h>
+#include <linux/bpf_mem_alloc.h>
#include <asm/barrier.h>
#include <asm/unaligned.h>
@@ -60,6 +61,9 @@
#define CTX regs[BPF_REG_CTX]
#define IMM insn->imm
+struct bpf_mem_alloc bpf_global_ma;
+bool bpf_global_ma_set;
+
/* No hurry in this branch
*
* Exported for the bpf jit load helper.
@@ -825,6 +829,11 @@ struct bpf_prog_pack {
unsigned long bitmap[];
};
+void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
+{
+ memset(area, 0, size);
+}
+
#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
static DEFINE_MUTEX(pack_mutex);
@@ -859,12 +868,11 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins
list_add_tail(&pack->list, &pack_list);
set_vm_flush_reset_perms(pack->ptr);
- set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
- set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
return pack;
}
-static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
+void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
struct bpf_prog_pack *pack;
@@ -878,8 +886,7 @@ static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insn
if (ptr) {
bpf_fill_ill_insns(ptr, size);
set_vm_flush_reset_perms(ptr);
- set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
- set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
+ set_memory_rox((unsigned long)ptr, size / PAGE_SIZE);
}
goto out;
}
@@ -905,7 +912,7 @@ out:
return ptr;
}
-static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+void bpf_prog_pack_free(struct bpf_binary_header *hdr)
{
struct bpf_prog_pack *pack = NULL, *tmp;
unsigned int nbits;
@@ -1027,7 +1034,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
- start = (get_random_int() % hole) & ~(alignment - 1);
+ start = get_random_u32_below(hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
*image_ptr = &hdr->image[start];
@@ -1089,7 +1096,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
- start = (get_random_int() % hole) & ~(alignment - 1);
+ start = get_random_u32_below(hole) & ~(alignment - 1);
*image_ptr = &ro_header->image[start];
*rw_image = &(*rw_header)->image[start];
@@ -1211,7 +1218,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
bool emit_zext)
{
struct bpf_insn *to = to_buff;
- u32 imm_rnd = get_random_int();
+ u32 imm_rnd = get_random_u32();
s16 off;
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
@@ -2002,7 +2009,7 @@ out:
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
u64 stack[stack_size / sizeof(u64)]; \
- u64 regs[MAX_BPF_EXT_REG]; \
+ u64 regs[MAX_BPF_EXT_REG] = {}; \
\
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
@@ -2083,6 +2090,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
bool bpf_prog_map_compatible(struct bpf_map *map,
const struct bpf_prog *fp)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(fp);
bool ret;
if (fp->kprobe_override)
@@ -2093,12 +2101,12 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
/* There's no owner yet where we could check for
* compatibility.
*/
- map->owner.type = fp->type;
+ map->owner.type = prog_type;
map->owner.jited = fp->jited;
map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
ret = true;
} else {
- ret = map->owner.type == fp->type &&
+ ret = map->owner.type == prog_type &&
map->owner.jited == fp->jited &&
map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
}
@@ -2246,8 +2254,14 @@ static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
{
struct bpf_prog_array *progs;
+ /* If RCU Tasks Trace grace period implies RCU grace period, there is
+ * no need to call kfree_rcu(), just call kfree() directly.
+ */
progs = container_of(rcu, struct bpf_prog_array, rcu);
- kfree_rcu(progs, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(progs);
+ else
+ kfree_rcu(progs, rcu);
}
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
@@ -2623,6 +2637,7 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
@@ -2734,6 +2749,18 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len)
return -ENOTSUPP;
}
+#ifdef CONFIG_BPF_SYSCALL
+static int __init bpf_global_ma_init(void)
+{
+ int ret;
+
+ ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
+ bpf_global_ma_set = !ret;
+ return ret;
+}
+late_initcall(bpf_global_ma_init);
+#endif
+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index f4860ac756cd..e0b2d016f0bf 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -4,13 +4,16 @@
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
*/
-/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+/**
+ * DOC: cpu map
+ * The 'cpumap' is primarily used as a backend map for XDP BPF helper
* call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
*
- * Unlike devmap which redirects XDP frames out another NIC device,
+ * Unlike devmap which redirects XDP frames out to another NIC device,
* this map type redirects raw XDP frames to another CPU. The remote
* CPU will do SKB-allocation and call the normal network stack.
- *
+ */
+/*
* This is a scalability and isolation mechanism, that allow
* separating the early driver network XDP layer, from the rest of the
* netstack, and assigning dedicated CPUs for this stage. This
@@ -85,7 +88,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
struct bpf_cpu_map *cmap;
- int err = -ENOMEM;
if (!bpf_capable())
return ERR_PTR(-EPERM);
@@ -97,29 +99,26 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
- cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT);
+ /* Pre-limit array size based on NR_CPUS, not final CPU check */
+ if (attr->max_entries > NR_CPUS)
+ return ERR_PTR(-E2BIG);
+
+ cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE);
if (!cmap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&cmap->map, attr);
- /* Pre-limit array size based on NR_CPUS, not final CPU check */
- if (cmap->map.max_entries > NR_CPUS) {
- err = -E2BIG;
- goto free_cmap;
- }
-
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
cmap->map.numa_node);
- if (!cmap->cpu_map)
- goto free_cmap;
+ if (!cmap->cpu_map) {
+ bpf_map_area_free(cmap);
+ return ERR_PTR(-ENOMEM);
+ }
return &cmap->map;
-free_cmap:
- kfree(cmap);
- return ERR_PTR(err);
}
static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
@@ -623,7 +622,7 @@ static void cpu_map_free(struct bpf_map *map)
__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
}
bpf_map_area_free(cmap->cpu_map);
- kfree(cmap);
+ bpf_map_area_free(cmap);
}
/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
@@ -668,9 +667,9 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+ return __bpf_xdp_redirect_map(map, index, flags, 0,
__cpu_map_lookup_elem);
}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index a0e02b009487..d01e4c55b376 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -163,13 +163,13 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
- dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT);
+ dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
if (!dtab)
return ERR_PTR(-ENOMEM);
err = dev_map_init_map(dtab, attr);
if (err) {
- kfree(dtab);
+ bpf_map_area_free(dtab);
return ERR_PTR(err);
}
@@ -240,7 +240,7 @@ static void dev_map_free(struct bpf_map *map)
bpf_map_area_free(dtab->netdev_map);
}
- kfree(dtab);
+ bpf_map_area_free(dtab);
}
static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -992,14 +992,14 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
map, key, value, map_flags);
}
-static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
__dev_map_lookup_elem);
}
-static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 2444bd15cc2d..fa3e9225aedc 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -4,6 +4,7 @@
#include <linux/hash.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/static_call.h>
/* The BPF dispatcher is a multiway branch code generator. The
* dispatcher is a mechanism to avoid the performance penalty of an
@@ -85,12 +86,12 @@ static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
return false;
}
-int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
{
return -ENOTSUPP;
}
-static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf)
{
s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
int i;
@@ -99,34 +100,38 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
if (d->progs[i].prog)
*ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
}
- return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs);
+ return arch_prepare_bpf_dispatcher(image, buf, &ips[0], d->num_progs);
}
static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
{
- void *old, *new;
- u32 noff;
- int err;
-
- if (!prev_num_progs) {
- old = NULL;
- noff = 0;
- } else {
- old = d->image + d->image_off;
+ void *new, *tmp;
+ u32 noff = 0;
+
+ if (prev_num_progs)
noff = d->image_off ^ (PAGE_SIZE / 2);
- }
new = d->num_progs ? d->image + noff : NULL;
+ tmp = d->num_progs ? d->rw_image + noff : NULL;
if (new) {
- if (bpf_dispatcher_prepare(d, new))
+ /* Prepare the dispatcher in d->rw_image. Then use
+ * bpf_arch_text_copy to update d->image, which is RO+X.
+ */
+ if (bpf_dispatcher_prepare(d, new, tmp))
+ return;
+ if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2)))
return;
}
- err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new);
- if (err || !new)
- return;
+ __BPF_DISPATCHER_UPDATE(d, new ?: (void *)&bpf_dispatcher_nop_func);
- d->image_off = noff;
+ /* Make sure all the callers executing the previous/old half of the
+ * image leave it, so following update call can modify it safely.
+ */
+ synchronize_rcu();
+
+ if (new)
+ d->image_off = noff;
}
void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
@@ -140,9 +145,18 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
mutex_lock(&d->mutex);
if (!d->image) {
- d->image = bpf_jit_alloc_exec_page();
+ d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero);
if (!d->image)
goto out;
+ d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!d->rw_image) {
+ u32 size = PAGE_SIZE;
+
+ bpf_arch_text_copy(d->image, &size, sizeof(size));
+ bpf_prog_pack_free((struct bpf_binary_header *)d->image);
+ d->image = NULL;
+ goto out;
+ }
bpf_image_ksym_add(d->image, &d->ksym);
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 6c530a5e560a..5aa2b5525f79 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -14,6 +14,7 @@
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
+#include <linux/bpf_mem_alloc.h>
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
@@ -67,24 +68,16 @@
* In theory the BPF locks could be converted to regular spinlocks as well,
* but the bucket locks and percpu_freelist locks can be taken from
* arbitrary contexts (perf, kprobes, tracepoints) which are required to be
- * atomic contexts even on RT. These mechanisms require preallocated maps,
- * so there is no need to invoke memory allocations within the lock held
- * sections.
- *
- * BPF maps which need dynamic allocation are only used from (forced)
- * thread context on RT and can therefore use regular spinlocks which in
- * turn allows to invoke memory allocations from the lock held section.
- *
- * On a non RT kernel this distinction is neither possible nor required.
- * spinlock maps to raw_spinlock and the extra code is optimized out by the
- * compiler.
+ * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
+ * it is only safe to use raw spinlock for preallocated hash map on a RT kernel,
+ * because there is no memory allocation within the lock held sections. However
+ * after hash map was fully converted to use bpf_mem_alloc, there will be
+ * non-synchronous memory allocation for non-preallocated hash map, so it is
+ * safe to always use raw spinlock for bucket lock.
*/
struct bucket {
struct hlist_nulls_head head;
- union {
- raw_spinlock_t raw_lock;
- spinlock_t lock;
- };
+ raw_spinlock_t raw_lock;
};
#define HASHTAB_MAP_LOCK_COUNT 8
@@ -92,6 +85,8 @@ struct bucket {
struct bpf_htab {
struct bpf_map map;
+ struct bpf_mem_alloc ma;
+ struct bpf_mem_alloc pcpu_ma;
struct bucket *buckets;
void *elems;
union {
@@ -99,7 +94,12 @@ struct bpf_htab {
struct bpf_lru lru;
};
struct htab_elem *__percpu *extra_elems;
- atomic_t count; /* number of elements in this hashtable */
+ /* number of elements in non-preallocated hashtable are kept
+ * in either pcount or count
+ */
+ struct percpu_counter pcount;
+ atomic_t count;
+ bool use_percpu_counter;
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
u32 hashrnd;
@@ -114,14 +114,14 @@ struct htab_elem {
struct {
void *padding;
union {
- struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
struct htab_elem *batch_flink;
};
};
};
union {
- struct rcu_head rcu;
+ /* pointer to per-cpu pointer */
+ void *ptr_to_pptr;
struct bpf_lru_node lru_node;
};
u32 hash;
@@ -133,26 +133,15 @@ static inline bool htab_is_prealloc(const struct bpf_htab *htab)
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}
-static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
-{
- return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
-}
-
static void htab_init_buckets(struct bpf_htab *htab)
{
unsigned int i;
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
- if (htab_use_raw_lock(htab)) {
- raw_spin_lock_init(&htab->buckets[i].raw_lock);
- lockdep_set_class(&htab->buckets[i].raw_lock,
+ raw_spin_lock_init(&htab->buckets[i].raw_lock);
+ lockdep_set_class(&htab->buckets[i].raw_lock,
&htab->lockdep_key);
- } else {
- spin_lock_init(&htab->buckets[i].lock);
- lockdep_set_class(&htab->buckets[i].lock,
- &htab->lockdep_key);
- }
cond_resched();
}
}
@@ -165,17 +154,14 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab,
hash = hash & HASHTAB_MAP_LOCK_MASK;
- migrate_disable();
+ preempt_disable();
if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
__this_cpu_dec(*(htab->map_locked[hash]));
- migrate_enable();
+ preempt_enable();
return -EBUSY;
}
- if (htab_use_raw_lock(htab))
- raw_spin_lock_irqsave(&b->raw_lock, flags);
- else
- spin_lock_irqsave(&b->lock, flags);
+ raw_spin_lock_irqsave(&b->raw_lock, flags);
*pflags = flags;
return 0;
@@ -186,12 +172,9 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,
unsigned long flags)
{
hash = hash & HASHTAB_MAP_LOCK_MASK;
- if (htab_use_raw_lock(htab))
- raw_spin_unlock_irqrestore(&b->raw_lock, flags);
- else
- spin_unlock_irqrestore(&b->lock, flags);
+ raw_spin_unlock_irqrestore(&b->raw_lock, flags);
__this_cpu_dec(*(htab->map_locked[hash]));
- migrate_enable();
+ preempt_enable();
}
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
@@ -239,7 +222,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
u32 num_entries = htab->map.max_entries;
int i;
- if (!map_value_has_timer(&htab->map))
+ if (!btf_record_has_field(htab->map.record, BPF_TIMER))
return;
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
@@ -248,28 +231,25 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- bpf_timer_cancel_and_free(elem->key +
- round_up(htab->map.key_size, 8) +
- htab->map.timer_off);
+ bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
cond_resched();
}
}
-static void htab_free_prealloced_kptrs(struct bpf_htab *htab)
+static void htab_free_prealloced_fields(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int i;
- if (!map_value_has_kptrs(&htab->map))
+ if (IS_ERR_OR_NULL(htab->map.record))
return;
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
-
for (i = 0; i < num_entries; i++) {
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8));
+ bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
cond_resched();
}
}
@@ -428,8 +408,6 @@ static int htab_map_alloc_check(union bpf_attr *attr)
bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
int numa_node = bpf_map_attr_numa_node(attr);
- BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
- offsetof(struct htab_elem, hash_node.pprev));
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
@@ -491,7 +469,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
struct bpf_htab *htab;
int err, i;
- htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
+ htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -546,10 +524,33 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (htab->map.map_flags & BPF_F_ZERO_SEED)
htab->hashrnd = 0;
else
- htab->hashrnd = get_random_int();
+ htab->hashrnd = get_random_u32();
htab_init_buckets(htab);
+/* compute_batch_value() computes batch value as num_online_cpus() * 2
+ * and __percpu_counter_compare() needs
+ * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
+ * for percpu_counter to be faster than atomic_t. In practice the average bpf
+ * hash map size is 10k, which means that a system with 64 cpus will fill
+ * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
+ * define our own batch count as 32 then 10k hash map can be filled up to 80%:
+ * 10k - 8k > 32 _batch_ * 64 _cpus_
+ * and __percpu_counter_compare() will still be fast. At that point hash map
+ * collisions will dominate its performance anyway. Assume that hash map filled
+ * to 50+% isn't going to be O(1) and use the following formula to choose
+ * between percpu_counter and atomic_t.
+ */
+#define PERCPU_COUNTER_BATCH 32
+ if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
+ htab->use_percpu_counter = true;
+
+ if (htab->use_percpu_counter) {
+ err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
+ if (err)
+ goto free_map_locked;
+ }
+
if (prealloc) {
err = prealloc_init(htab);
if (err)
@@ -563,6 +564,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (err)
goto free_prealloc;
}
+ } else {
+ err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
+ if (err)
+ goto free_map_locked;
+ if (percpu) {
+ err = bpf_mem_alloc_init(&htab->pcpu_ma,
+ round_up(htab->map.value_size, 8), true);
+ if (err)
+ goto free_map_locked;
+ }
}
return &htab->map;
@@ -570,12 +581,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
free_prealloc:
prealloc_destroy(htab);
free_map_locked:
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
free_percpu(htab->map_locked[i]);
bpf_map_area_free(htab->buckets);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
free_htab:
lockdep_unregister_key(&htab->lockdep_key);
- kfree(htab);
+ bpf_map_area_free(htab);
return ERR_PTR(err);
}
@@ -746,10 +761,7 @@ static void check_and_free_fields(struct bpf_htab *htab,
{
void *map_value = elem->key + round_up(htab->map.key_size, 8);
- if (map_value_has_timer(&htab->map))
- bpf_timer_cancel_and_free(map_value + htab->map.timer_off);
- if (map_value_has_kptrs(&htab->map))
- bpf_map_free_kptrs(&htab->map, map_value);
+ bpf_obj_free_fields(htab->map.record, map_value);
}
/* It is called from the bpf_lru_list when the LRU needs to delete
@@ -847,17 +859,9 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
- free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+ bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
check_and_free_fields(htab, l);
- kfree(l);
-}
-
-static void htab_elem_free_rcu(struct rcu_head *head)
-{
- struct htab_elem *l = container_of(head, struct htab_elem, rcu);
- struct bpf_htab *htab = l->htab;
-
- htab_elem_free(htab, l);
+ bpf_mem_cache_free(&htab->ma, l);
}
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -871,6 +875,31 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
}
}
+static bool is_map_full(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
+ PERCPU_COUNTER_BATCH) >= 0;
+ return atomic_read(&htab->count) >= htab->map.max_entries;
+}
+
+static void inc_elem_count(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_inc(&htab->count);
+}
+
+static void dec_elem_count(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_dec(&htab->count);
+}
+
+
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
htab_put_fd_value(htab, l);
@@ -879,9 +908,8 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
check_and_free_fields(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
- atomic_dec(&htab->count);
- l->htab = htab;
- call_rcu(&l->rcu, htab_elem_free_rcu);
+ dec_elem_count(htab);
+ htab_elem_free(htab, l);
}
}
@@ -906,13 +934,12 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
void *value, bool onallcpus)
{
- /* When using prealloc and not setting the initial value on all cpus,
- * zero-fill element values for other cpus (just as what happens when
- * not using prealloc). Otherwise, bpf program has no way to ensure
+ /* When not setting the initial value on all cpus, zero-fill element
+ * values for other cpus. Otherwise, bpf program has no way to ensure
* known initial values for cpus other than current one
* (onallcpus=false always when coming from bpf prog).
*/
- if (htab_is_prealloc(htab) && !onallcpus) {
+ if (!onallcpus) {
u32 size = round_up(htab->map.value_size, 8);
int current_cpu = raw_smp_processor_id();
int cpu;
@@ -963,19 +990,16 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = container_of(l, struct htab_elem, fnode);
}
} else {
- if (atomic_inc_return(&htab->count) > htab->map.max_entries)
- if (!old_elem) {
+ if (is_map_full(htab))
+ if (!old_elem)
/* when map is full and update() is replacing
* old element, it's ok to allocate, since
* old element will be freed immediately.
* Otherwise return an error
*/
- l_new = ERR_PTR(-E2BIG);
- goto dec_count;
- }
- l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
- GFP_NOWAIT | __GFP_NOWARN,
- htab->map.numa_node);
+ return ERR_PTR(-E2BIG);
+ inc_elem_count(htab);
+ l_new = bpf_mem_cache_alloc(&htab->ma);
if (!l_new) {
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
@@ -986,18 +1010,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
memcpy(l_new->key, key, key_size);
if (percpu) {
- size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
/* alloc_percpu zero-fills */
- pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
- GFP_NOWAIT | __GFP_NOWARN);
+ pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
if (!pptr) {
- kfree(l_new);
+ bpf_mem_cache_free(&htab->ma, l_new);
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
+ l_new->ptr_to_pptr = pptr;
+ pptr = *(void **)pptr;
}
pcpu_init_value(htab, pptr, value, onallcpus);
@@ -1016,7 +1040,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new->hash = hash;
return l_new;
dec_count:
- atomic_dec(&htab->count);
+ dec_elem_count(htab);
return l_new;
}
@@ -1061,7 +1085,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
head = &b->head;
if (unlikely(map_flags & BPF_F_LOCK)) {
- if (unlikely(!map_value_has_spin_lock(map)))
+ if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
/* find an element without taking the bucket lock */
l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
@@ -1416,6 +1440,10 @@ static void delete_all_elements(struct bpf_htab *htab)
{
int i;
+ /* It's called from a worker thread, so disable migration here,
+ * since bpf_mem_cache_free() relies on that.
+ */
+ migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
struct hlist_nulls_head *head = select_bucket(htab, i);
struct hlist_nulls_node *n;
@@ -1426,6 +1454,7 @@ static void delete_all_elements(struct bpf_htab *htab)
htab_elem_free(htab, l);
}
}
+ migrate_enable();
}
static void htab_free_malloced_timers(struct bpf_htab *htab)
@@ -1439,12 +1468,8 @@ static void htab_free_malloced_timers(struct bpf_htab *htab)
struct htab_elem *l;
hlist_nulls_for_each_entry(l, n, head, hash_node) {
- /* We don't reset or free kptr on uref dropping to zero,
- * hence just free timer.
- */
- bpf_timer_cancel_and_free(l->key +
- round_up(htab->map.key_size, 8) +
- htab->map.timer_off);
+ /* We only free timer on uref dropping to zero */
+ bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
}
cond_resched_rcu();
}
@@ -1455,8 +1480,8 @@ static void htab_map_free_timers(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- /* We don't reset or free kptr on uref dropping to zero. */
- if (!map_value_has_timer(&htab->map))
+ /* We only free timer on uref dropping to zero */
+ if (!btf_record_has_field(htab->map.record, BPF_TIMER))
return;
if (!htab_is_prealloc(htab))
htab_free_malloced_timers(htab);
@@ -1475,24 +1500,27 @@ static void htab_map_free(struct bpf_map *map)
* There is no need to synchronize_rcu() here to protect map elements.
*/
- /* some of free_htab_elem() callbacks for elements of this map may
- * not have executed. Wait for them.
+ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
+ * underneath and is reponsible for waiting for callbacks to finish
+ * during bpf_mem_alloc_destroy().
*/
- rcu_barrier();
if (!htab_is_prealloc(htab)) {
delete_all_elements(htab);
} else {
- htab_free_prealloced_kptrs(htab);
+ htab_free_prealloced_fields(htab);
prealloc_destroy(htab);
}
- bpf_map_free_kptr_off_tab(map);
free_percpu(htab->extra_elems);
bpf_map_area_free(htab->buckets);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
free_percpu(htab->map_locked[i]);
lockdep_unregister_key(&htab->lockdep_key);
- kfree(htab);
+ bpf_map_area_free(htab);
}
static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -1636,7 +1664,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
elem_map_flags = attr->batch.elem_flags;
if ((elem_map_flags & ~BPF_F_LOCK) ||
- ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+ ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
map_flags = attr->batch.flags;
@@ -1691,8 +1719,11 @@ again_nocopy:
/* do not grab the lock unless need it (bucket_cnt > 0). */
if (locked) {
ret = htab_lock_bucket(htab, b, batch, &flags);
- if (ret)
- goto next_batch;
+ if (ret) {
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ goto after_loop;
+ }
}
bucket_cnt = 0;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1f961f9982d2..af30c6cbd65d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -4,6 +4,7 @@
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/bpf-cgroup.h>
+#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
@@ -15,9 +16,11 @@
#include <linux/ctype.h>
#include <linux/jiffies.h>
#include <linux/pid_namespace.h>
+#include <linux/poison.h>
#include <linux/proc_ns.h>
#include <linux/security.h>
#include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
#include "../../lib/kstrtox.h"
@@ -198,6 +201,18 @@ const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
.ret_type = RET_INTEGER,
};
+BPF_CALL_0(bpf_ktime_get_tai_ns)
+{
+ /* NMI safe access to clock tai */
+ return ktime_get_tai_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
+ .func = bpf_ktime_get_tai_ns,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
BPF_CALL_0(bpf_get_current_pid_tgid)
{
struct task_struct *task = current;
@@ -323,6 +338,7 @@ const struct bpf_func_proto bpf_spin_lock_proto = {
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
};
static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
@@ -345,6 +361,7 @@ const struct bpf_func_proto bpf_spin_unlock_proto = {
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
};
void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
@@ -353,9 +370,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
struct bpf_spin_lock *lock;
if (lock_src)
- lock = src + map->spin_lock_off;
+ lock = src + map->record->spin_lock_off;
else
- lock = dst + map->spin_lock_off;
+ lock = dst + map->record->spin_lock_off;
preempt_disable();
__bpf_spin_lock_irqsave(lock);
copy_map_value(map, dst, src);
@@ -415,40 +432,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
};
-
-#ifdef CONFIG_CGROUP_BPF
-
-BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
-{
- /* flags argument is not used now,
- * but provides an ability to extend the API.
- * verifier checks that its value is correct.
- */
- enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage *storage;
- struct bpf_cg_run_ctx *ctx;
- void *ptr;
-
- /* get current cgroup storage from BPF run context */
- ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
- storage = ctx->prog_item->cgroup_storage[stype];
-
- if (stype == BPF_CGROUP_STORAGE_SHARED)
- ptr = &READ_ONCE(storage->buf)->data[0];
- else
- ptr = this_cpu_ptr(storage->percpu_buf);
-
- return (unsigned long)ptr;
-}
-
-const struct bpf_func_proto bpf_get_local_storage_proto = {
- .func = bpf_get_local_storage,
- .gpl_only = false,
- .ret_type = RET_PTR_TO_MAP_VALUE,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_ANYTHING,
-};
-#endif
+#endif /* CONFIG_CGROUPS */
#define BPF_STRTOX_BASE_MASK 0x1F
@@ -577,7 +561,6 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
-#endif
BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
{
@@ -678,6 +661,7 @@ BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
const struct bpf_func_proto bpf_copy_from_user_proto = {
.func = bpf_copy_from_user,
.gpl_only = false,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
@@ -708,6 +692,7 @@ BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
const struct bpf_func_proto bpf_copy_from_user_task_proto = {
.func = bpf_copy_from_user_task,
.gpl_only = true,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
@@ -1190,7 +1175,7 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
ret = -ENOMEM;
goto out;
}
- t->value = (void *)timer - map->timer_off;
+ t->value = (void *)timer - map->record->timer_off;
t->map = map;
t->prog = NULL;
rcu_assign_pointer(t->callback_fn, NULL);
@@ -1398,10 +1383,9 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
}
/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
- * helper is determined dynamically by the verifier.
+ * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
+ * denote type that verifier will determine.
*/
-#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
-
static const struct bpf_func_proto bpf_kptr_xchg_proto = {
.func = bpf_kptr_xchg,
.gpl_only = false,
@@ -1420,7 +1404,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
#define DYNPTR_SIZE_MASK 0xFFFFFF
#define DYNPTR_RDONLY_BIT BIT(31)
-static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_RDONLY_BIT;
}
@@ -1430,7 +1414,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ
ptr->size |= type << DYNPTR_TYPE_SHIFT;
}
-static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
+u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_SIZE_MASK;
}
@@ -1454,7 +1438,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
memset(ptr, 0, sizeof(*ptr));
}
-static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
+static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
{
u32 size = bpf_dynptr_get_size(ptr);
@@ -1468,6 +1452,8 @@ BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_
{
int err;
+ BTF_TYPE_EMIT(struct bpf_dynptr);
+
err = bpf_dynptr_check_size(size);
if (err)
goto error;
@@ -1497,7 +1483,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
};
-BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src,
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
u32, offset, u64, flags)
{
int err;
@@ -1509,7 +1495,11 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src
if (err)
return err;
- memcpy(dst, src->data + src->offset + offset, len);
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst, src->data + src->offset + offset, len);
return 0;
}
@@ -1520,12 +1510,12 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
- .arg3_type = ARG_PTR_TO_DYNPTR,
+ .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
-BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
+BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
u32, len, u64, flags)
{
int err;
@@ -1537,7 +1527,11 @@ BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *,
if (err)
return err;
- memcpy(dst->data + dst->offset + offset, src, len);
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst->data + dst->offset + offset, src, len);
return 0;
}
@@ -1546,14 +1540,14 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
.func = bpf_dynptr_write,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_DYNPTR,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE_OR_ZERO,
.arg5_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
+BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
{
int err;
@@ -1574,7 +1568,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = {
.func = bpf_dynptr_data,
.gpl_only = false,
.ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL,
- .arg1_type = ARG_PTR_TO_DYNPTR,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
};
@@ -1617,6 +1611,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
+ case BPF_FUNC_ktime_get_tai_ns:
+ return &bpf_ktime_get_tai_ns_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
@@ -1627,26 +1623,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
- case BPF_FUNC_ringbuf_reserve_dynptr:
- return &bpf_ringbuf_reserve_dynptr_proto;
- case BPF_FUNC_ringbuf_submit_dynptr:
- return &bpf_ringbuf_submit_dynptr_proto;
- case BPF_FUNC_ringbuf_discard_dynptr:
- return &bpf_ringbuf_discard_dynptr_proto;
- case BPF_FUNC_for_each_map_elem:
- return &bpf_for_each_map_elem_proto;
- case BPF_FUNC_loop:
- return &bpf_loop_proto;
case BPF_FUNC_strncmp:
return &bpf_strncmp_proto;
- case BPF_FUNC_dynptr_from_mem:
- return &bpf_dynptr_from_mem_proto;
- case BPF_FUNC_dynptr_read:
- return &bpf_dynptr_read_proto;
- case BPF_FUNC_dynptr_write:
- return &bpf_dynptr_write_proto;
- case BPF_FUNC_dynptr_data:
- return &bpf_dynptr_data_proto;
+ case BPF_FUNC_strtol:
+ return &bpf_strtol_proto;
+ case BPF_FUNC_strtoul:
+ return &bpf_strtoul_proto;
default:
break;
}
@@ -1675,6 +1657,32 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_timer_cancel_proto;
case BPF_FUNC_kptr_xchg:
return &bpf_kptr_xchg_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_loop:
+ return &bpf_loop_proto;
+ case BPF_FUNC_user_ringbuf_drain:
+ return &bpf_user_ringbuf_drain_proto;
+ case BPF_FUNC_ringbuf_reserve_dynptr:
+ return &bpf_ringbuf_reserve_dynptr_proto;
+ case BPF_FUNC_ringbuf_submit_dynptr:
+ return &bpf_ringbuf_submit_dynptr_proto;
+ case BPF_FUNC_ringbuf_discard_dynptr:
+ return &bpf_ringbuf_discard_dynptr_proto;
+ case BPF_FUNC_dynptr_from_mem:
+ return &bpf_dynptr_from_mem_proto;
+ case BPF_FUNC_dynptr_read:
+ return &bpf_dynptr_read_proto;
+ case BPF_FUNC_dynptr_write:
+ return &bpf_dynptr_write_proto;
+ case BPF_FUNC_dynptr_data:
+ return &bpf_dynptr_data_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_cgrp_storage_get:
+ return &bpf_cgrp_storage_get_proto;
+ case BPF_FUNC_cgrp_storage_delete:
+ return &bpf_cgrp_storage_delete_proto;
+#endif
default:
break;
}
@@ -1711,3 +1719,402 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return NULL;
}
}
+
+void bpf_list_head_free(const struct btf_field *field, void *list_head,
+ struct bpf_spin_lock *spin_lock)
+{
+ struct list_head *head = list_head, *orig_head = list_head;
+
+ BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
+ BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
+
+ /* Do the actual list draining outside the lock to not hold the lock for
+ * too long, and also prevent deadlocks if tracing programs end up
+ * executing on entry/exit of functions called inside the critical
+ * section, and end up doing map ops that call bpf_list_head_free for
+ * the same map value again.
+ */
+ __bpf_spin_lock_irqsave(spin_lock);
+ if (!head->next || list_empty(head))
+ goto unlock;
+ head = head->next;
+unlock:
+ INIT_LIST_HEAD(orig_head);
+ __bpf_spin_unlock_irqrestore(spin_lock);
+
+ while (head != orig_head) {
+ void *obj = head;
+
+ obj -= field->list_head.node_offset;
+ head = head->next;
+ /* The contained type can also have resources, including a
+ * bpf_list_head which needs to be freed.
+ */
+ bpf_obj_free_fields(field->list_head.value_rec, obj);
+ /* bpf_mem_free requires migrate_disable(), since we can be
+ * called from map free path as well apart from BPF program (as
+ * part of map ops doing bpf_obj_free_fields).
+ */
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, obj);
+ migrate_enable();
+ }
+}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ u64 size = local_type_id__k;
+ void *p;
+
+ p = bpf_mem_alloc(&bpf_global_ma, size);
+ if (!p)
+ return NULL;
+ if (meta)
+ bpf_obj_init(meta->field_offs, p);
+ return p;
+}
+
+void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ void *p = p__alloc;
+
+ if (meta)
+ bpf_obj_free_fields(meta->record, p);
+ bpf_mem_free(&bpf_global_ma, p);
+}
+
+static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail)
+{
+ struct list_head *n = (void *)node, *h = (void *)head;
+
+ if (unlikely(!h->next))
+ INIT_LIST_HEAD(h);
+ if (unlikely(!n->next))
+ INIT_LIST_HEAD(n);
+ tail ? list_add_tail(n, h) : list_add(n, h);
+}
+
+void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node)
+{
+ return __bpf_list_add(node, head, false);
+}
+
+void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node)
+{
+ return __bpf_list_add(node, head, true);
+}
+
+static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
+{
+ struct list_head *n, *h = (void *)head;
+
+ if (unlikely(!h->next))
+ INIT_LIST_HEAD(h);
+ if (list_empty(h))
+ return NULL;
+ n = tail ? h->prev : h->next;
+ list_del_init(n);
+ return (struct bpf_list_node *)n;
+}
+
+struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
+{
+ return __bpf_list_del(head, false);
+}
+
+struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
+{
+ return __bpf_list_del(head, true);
+}
+
+/**
+ * bpf_task_acquire - Acquire a reference to a task. A task acquired by this
+ * kfunc which is not stored in a map as a kptr, must be released by calling
+ * bpf_task_release().
+ * @p: The task on which a reference is being acquired.
+ */
+struct task_struct *bpf_task_acquire(struct task_struct *p)
+{
+ return get_task_struct(p);
+}
+
+/**
+ * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task
+ * acquired by this kfunc which is not stored in a map as a kptr, must be
+ * released by calling bpf_task_release().
+ * @p: The task on which a reference is being acquired.
+ */
+struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
+{
+ /* For the time being this function returns NULL, as it's not currently
+ * possible to safely acquire a reference to a task with RCU protection
+ * using get_task_struct() and put_task_struct(). This is due to the
+ * slightly odd mechanics of p->rcu_users, and how task RCU protection
+ * works.
+ *
+ * A struct task_struct is refcounted by two different refcount_t
+ * fields:
+ *
+ * 1. p->usage: The "true" refcount field which tracks a task's
+ * lifetime. The task is freed as soon as this
+ * refcount drops to 0.
+ *
+ * 2. p->rcu_users: An "RCU users" refcount field which is statically
+ * initialized to 2, and is co-located in a union with
+ * a struct rcu_head field (p->rcu). p->rcu_users
+ * essentially encapsulates a single p->usage
+ * refcount, and when p->rcu_users goes to 0, an RCU
+ * callback is scheduled on the struct rcu_head which
+ * decrements the p->usage refcount.
+ *
+ * There are two important implications to this task refcounting logic
+ * described above. The first is that
+ * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as
+ * after the refcount goes to 0, the RCU callback being scheduled will
+ * cause the memory backing the refcount to again be nonzero due to the
+ * fields sharing a union. The other is that we can't rely on RCU to
+ * guarantee that a task is valid in a BPF program. This is because a
+ * task could have already transitioned to being in the TASK_DEAD
+ * state, had its rcu_users refcount go to 0, and its rcu callback
+ * invoked in which it drops its single p->usage reference. At this
+ * point the task will be freed as soon as the last p->usage reference
+ * goes to 0, without waiting for another RCU gp to elapse. The only
+ * way that a BPF program can guarantee that a task is valid is in this
+ * scenario is to hold a p->usage refcount itself.
+ *
+ * Until we're able to resolve this issue, either by pulling
+ * p->rcu_users and p->rcu out of the union, or by getting rid of
+ * p->usage and just using p->rcu_users for refcounting, we'll just
+ * return NULL here.
+ */
+ return NULL;
+}
+
+/**
+ * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task
+ * kptr acquired by this kfunc which is not subsequently stored in a map, must
+ * be released by calling bpf_task_release().
+ * @pp: A pointer to a task kptr on which a reference is being acquired.
+ */
+struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
+{
+ /* We must return NULL here until we have clarity on how to properly
+ * leverage RCU for ensuring a task's lifetime. See the comment above
+ * in bpf_task_acquire_not_zero() for more details.
+ */
+ return NULL;
+}
+
+/**
+ * bpf_task_release - Release the reference acquired on a task.
+ * @p: The task on which a reference is being released.
+ */
+void bpf_task_release(struct task_struct *p)
+{
+ if (!p)
+ return;
+
+ put_task_struct(p);
+}
+
+#ifdef CONFIG_CGROUPS
+/**
+ * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
+ * this kfunc which is not stored in a map as a kptr, must be released by
+ * calling bpf_cgroup_release().
+ * @cgrp: The cgroup on which a reference is being acquired.
+ */
+struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
+{
+ cgroup_get(cgrp);
+ return cgrp;
+}
+
+/**
+ * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. A cgroup
+ * kptr acquired by this kfunc which is not subsequently stored in a map, must
+ * be released by calling bpf_cgroup_release().
+ * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired.
+ */
+struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
+{
+ struct cgroup *cgrp;
+
+ rcu_read_lock();
+ /* Another context could remove the cgroup from the map and release it
+ * at any time, including after we've done the lookup above. This is
+ * safe because we're in an RCU read region, so the cgroup is
+ * guaranteed to remain valid until at least the rcu_read_unlock()
+ * below.
+ */
+ cgrp = READ_ONCE(*cgrpp);
+
+ if (cgrp && !cgroup_tryget(cgrp))
+ /* If the cgroup had been removed from the map and freed as
+ * described above, cgroup_tryget() will return false. The
+ * cgroup will be freed at some point after the current RCU gp
+ * has ended, so just return NULL to the user.
+ */
+ cgrp = NULL;
+ rcu_read_unlock();
+
+ return cgrp;
+}
+
+/**
+ * bpf_cgroup_release - Release the reference acquired on a cgroup.
+ * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
+ * not be freed until the current grace period has ended, even if its refcount
+ * drops to 0.
+ * @cgrp: The cgroup on which a reference is being released.
+ */
+void bpf_cgroup_release(struct cgroup *cgrp)
+{
+ if (!cgrp)
+ return;
+
+ cgroup_put(cgrp);
+}
+
+/**
+ * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
+ * array. A cgroup returned by this kfunc which is not subsequently stored in a
+ * map, must be released by calling bpf_cgroup_release().
+ * @cgrp: The cgroup for which we're performing a lookup.
+ * @level: The level of ancestor to look up.
+ */
+struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
+{
+ struct cgroup *ancestor;
+
+ if (level > cgrp->level || level < 0)
+ return NULL;
+
+ ancestor = cgrp->ancestors[level];
+ cgroup_get(ancestor);
+ return ancestor;
+}
+#endif /* CONFIG_CGROUPS */
+
+/**
+ * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up
+ * in the root pid namespace idr. If a task is returned, it must either be
+ * stored in a map, or released with bpf_task_release().
+ * @pid: The pid of the task being looked up.
+ */
+struct task_struct *bpf_task_from_pid(s32 pid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (p)
+ bpf_task_acquire(p);
+ rcu_read_unlock();
+
+ return p;
+}
+
+void *bpf_cast_to_kern_ctx(void *obj)
+{
+ return obj;
+}
+
+void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
+{
+ return obj__ign;
+}
+
+void bpf_rcu_read_lock(void)
+{
+ rcu_read_lock();
+}
+
+void bpf_rcu_read_unlock(void)
+{
+ rcu_read_unlock();
+}
+
+__diag_pop();
+
+BTF_SET8_START(generic_btf_ids)
+#ifdef CONFIG_KEXEC_CORE
+BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
+#endif
+BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_list_push_front)
+BTF_ID_FLAGS(func, bpf_list_push_back)
+BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
+#ifdef CONFIG_CGROUPS
+BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
+#endif
+BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_SET8_END(generic_btf_ids)
+
+static const struct btf_kfunc_id_set generic_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &generic_btf_ids,
+};
+
+
+BTF_ID_LIST(generic_dtor_ids)
+BTF_ID(struct, task_struct)
+BTF_ID(func, bpf_task_release)
+#ifdef CONFIG_CGROUPS
+BTF_ID(struct, cgroup)
+BTF_ID(func, bpf_cgroup_release)
+#endif
+
+BTF_SET8_START(common_btf_ids)
+BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
+BTF_ID_FLAGS(func, bpf_rdonly_cast)
+BTF_ID_FLAGS(func, bpf_rcu_read_lock)
+BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
+BTF_SET8_END(common_btf_ids)
+
+static const struct btf_kfunc_id_set common_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &common_btf_ids,
+};
+
+static int __init kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc generic_dtors[] = {
+ {
+ .btf_id = generic_dtor_ids[0],
+ .kfunc_btf_id = generic_dtor_ids[1]
+ },
+#ifdef CONFIG_CGROUPS
+ {
+ .btf_id = generic_dtor_ids[2],
+ .kfunc_btf_id = generic_dtor_ids[3]
+ },
+#endif
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+ ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
+ ARRAY_SIZE(generic_dtors),
+ THIS_MODULE);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
+}
+
+late_initcall(kfunc_init);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 49ef0ce040c7..e90d9f63edc5 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -151,7 +151,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return -EINVAL;
if (unlikely((flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
@@ -313,8 +313,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
/* max_entries is not used and enforced to be 0 */
return ERR_PTR(-EINVAL);
- map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
- __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node);
+ map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node);
if (!map)
return ERR_PTR(-ENOMEM);
@@ -346,7 +345,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map)
WARN_ON(!RB_EMPTY_ROOT(&map->root));
WARN_ON(!list_empty(&map->list));
- kfree(map);
+ bpf_map_area_free(map);
}
static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index d789e3b831ad..d833496e9e42 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -558,7 +558,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
attr->value_size > LPM_VAL_SIZE_MAX)
return ERR_PTR(-EINVAL);
- trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
+ trie = bpf_map_area_alloc(sizeof(*trie), NUMA_NO_NODE);
if (!trie)
return ERR_PTR(-ENOMEM);
@@ -609,7 +609,7 @@ static void trie_free(struct bpf_map *map)
}
out:
- kfree(trie);
+ bpf_map_area_free(trie);
}
static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 135205d0d560..38136ec4e095 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -12,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
struct bpf_map *inner_map, *inner_map_meta;
u32 inner_map_meta_size;
struct fd f;
+ int ret;
f = fdget(inner_map_ufd);
inner_map = __bpf_map_get(f);
@@ -20,18 +21,13 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
/* Does not support >1 level map-in-map */
if (inner_map->inner_map_meta) {
- fdput(f);
- return ERR_PTR(-EINVAL);
+ ret = -EINVAL;
+ goto put;
}
if (!inner_map->ops->map_meta_equal) {
- fdput(f);
- return ERR_PTR(-ENOTSUPP);
- }
-
- if (map_value_has_spin_lock(inner_map)) {
- fdput(f);
- return ERR_PTR(-ENOTSUPP);
+ ret = -ENOTSUPP;
+ goto put;
}
inner_map_meta_size = sizeof(*inner_map_meta);
@@ -41,8 +37,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER);
if (!inner_map_meta) {
- fdput(f);
- return ERR_PTR(-ENOMEM);
+ ret = -ENOMEM;
+ goto put;
}
inner_map_meta->map_type = inner_map->map_type;
@@ -50,9 +46,33 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->value_size = inner_map->value_size;
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->max_entries = inner_map->max_entries;
- inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
- inner_map_meta->timer_off = inner_map->timer_off;
- inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map);
+
+ inner_map_meta->record = btf_record_dup(inner_map->record);
+ if (IS_ERR(inner_map_meta->record)) {
+ /* btf_record_dup returns NULL or valid pointer in case of
+ * invalid/empty/valid, but ERR_PTR in case of errors. During
+ * equality NULL or IS_ERR is equivalent.
+ */
+ ret = PTR_ERR(inner_map_meta->record);
+ goto free;
+ }
+ if (inner_map_meta->record) {
+ struct btf_field_offs *field_offs;
+ /* If btf_record is !IS_ERR_OR_NULL, then field_offs is always
+ * valid.
+ */
+ field_offs = kmemdup(inner_map->field_offs, sizeof(*inner_map->field_offs), GFP_KERNEL | __GFP_NOWARN);
+ if (!field_offs) {
+ ret = -ENOMEM;
+ goto free_rec;
+ }
+ inner_map_meta->field_offs = field_offs;
+ }
+ /* Note: We must use the same BTF, as we also used btf_record_dup above
+ * which relies on BTF being same for both maps, as some members like
+ * record->fields.list_head have pointers like value_rec pointing into
+ * inner_map->btf.
+ */
if (inner_map->btf) {
btf_get(inner_map->btf);
inner_map_meta->btf = inner_map->btf;
@@ -68,11 +88,19 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
fdput(f);
return inner_map_meta;
+free_rec:
+ btf_record_free(inner_map_meta->record);
+free:
+ kfree(inner_map_meta);
+put:
+ fdput(f);
+ return ERR_PTR(ret);
}
void bpf_map_meta_free(struct bpf_map *map_meta)
{
- bpf_map_free_kptr_off_tab(map_meta);
+ kfree(map_meta->field_offs);
+ bpf_map_free_record(map_meta);
btf_put(map_meta->btf);
kfree(map_meta);
}
@@ -84,9 +112,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
return meta0->map_type == meta1->map_type &&
meta0->key_size == meta1->key_size &&
meta0->value_size == meta1->value_size &&
- meta0->timer_off == meta1->timer_off &&
meta0->map_flags == meta1->map_flags &&
- bpf_map_equal_kptr_off_tab(meta0, meta1);
+ btf_record_equal(meta0->record, meta1->record);
}
void *bpf_map_fd_get_ptr(struct bpf_map *map,
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
new file mode 100644
index 000000000000..ebcc3dd0fa19
--- /dev/null
+++ b/kernel/bpf/memalloc.c
@@ -0,0 +1,677 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <linux/mm.h>
+#include <linux/llist.h>
+#include <linux/bpf.h>
+#include <linux/irq_work.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
+#include <asm/local.h>
+
+/* Any context (including NMI) BPF specific memory allocator.
+ *
+ * Tracing BPF programs can attach to kprobe and fentry. Hence they
+ * run in unknown context where calling plain kmalloc() might not be safe.
+ *
+ * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
+ * Refill this cache asynchronously from irq_work.
+ *
+ * CPU_0 buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ * ...
+ * CPU_N buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ *
+ * The buckets are prefilled at the start.
+ * BPF programs always run with migration disabled.
+ * It's safe to allocate from cache of the current cpu with irqs disabled.
+ * Free-ing is always done into bucket of the current cpu as well.
+ * irq_work trims extra free elements from buckets with kfree
+ * and refills them with kmalloc, so global kmalloc logic takes care
+ * of freeing objects allocated by one cpu and freed on another.
+ *
+ * Every allocated objected is padded with extra 8 bytes that contains
+ * struct llist_node.
+ */
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+/* similar to kmalloc, but sizeof == 8 bucket is gone */
+static u8 size_index[24] __ro_after_init = {
+ 3, /* 8 */
+ 3, /* 16 */
+ 4, /* 24 */
+ 4, /* 32 */
+ 5, /* 40 */
+ 5, /* 48 */
+ 5, /* 56 */
+ 5, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 6, /* 104 */
+ 6, /* 112 */
+ 6, /* 120 */
+ 6, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
+
+static int bpf_mem_cache_idx(size_t size)
+{
+ if (!size || size > 4096)
+ return -1;
+
+ if (size <= 192)
+ return size_index[(size - 1) / 8] - 1;
+
+ return fls(size - 1) - 1;
+}
+
+#define NUM_CACHES 11
+
+struct bpf_mem_cache {
+ /* per-cpu list of free objects of size 'unit_size'.
+ * All accesses are done with interrupts disabled and 'active' counter
+ * protection with __llist_add() and __llist_del_first().
+ */
+ struct llist_head free_llist;
+ local_t active;
+
+ /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
+ * are sequenced by per-cpu 'active' counter. But unit_free() cannot
+ * fail. When 'active' is busy the unit_free() will add an object to
+ * free_llist_extra.
+ */
+ struct llist_head free_llist_extra;
+
+ struct irq_work refill_work;
+ struct obj_cgroup *objcg;
+ int unit_size;
+ /* count of objects in free_llist */
+ int free_cnt;
+ int low_watermark, high_watermark, batch;
+ int percpu_size;
+
+ struct rcu_head rcu;
+ struct llist_head free_by_rcu;
+ struct llist_head waiting_for_gp;
+ atomic_t call_rcu_in_progress;
+};
+
+struct bpf_mem_caches {
+ struct bpf_mem_cache cache[NUM_CACHES];
+};
+
+static struct llist_node notrace *__llist_del_first(struct llist_head *head)
+{
+ struct llist_node *entry, *next;
+
+ entry = head->first;
+ if (!entry)
+ return NULL;
+ next = entry->next;
+ head->first = next;
+ return entry;
+}
+
+static void *__alloc(struct bpf_mem_cache *c, int node)
+{
+ /* Allocate, but don't deplete atomic reserves that typical
+ * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+ * will allocate from the current numa node which is what we
+ * want here.
+ */
+ gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
+
+ if (c->percpu_size) {
+ void **obj = kmalloc_node(c->percpu_size, flags, node);
+ void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
+
+ if (!obj || !pptr) {
+ free_percpu(pptr);
+ kfree(obj);
+ return NULL;
+ }
+ obj[1] = pptr;
+ return obj;
+ }
+
+ return kmalloc_node(c->unit_size, flags, node);
+}
+
+static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ if (c->objcg)
+ return get_mem_cgroup_from_objcg(c->objcg);
+#endif
+
+#ifdef CONFIG_MEMCG
+ return root_mem_cgroup;
+#else
+ return NULL;
+#endif
+}
+
+/* Mostly runs from irq_work except __init phase. */
+static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
+{
+ struct mem_cgroup *memcg = NULL, *old_memcg;
+ unsigned long flags;
+ void *obj;
+ int i;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ for (i = 0; i < cnt; i++) {
+ /*
+ * free_by_rcu is only manipulated by irq work refill_work().
+ * IRQ works on the same CPU are called sequentially, so it is
+ * safe to use __llist_del_first() here. If alloc_bulk() is
+ * invoked by the initial prefill, there will be no running
+ * refill_work(), so __llist_del_first() is fine as well.
+ *
+ * In most cases, objects on free_by_rcu are from the same CPU.
+ * If some objects come from other CPUs, it doesn't incur any
+ * harm because NUMA_NO_NODE means the preference for current
+ * numa node and it is not a guarantee.
+ */
+ obj = __llist_del_first(&c->free_by_rcu);
+ if (!obj) {
+ obj = __alloc(c, node);
+ if (!obj)
+ break;
+ }
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ /* In RT irq_work runs in per-cpu kthread, so disable
+ * interrupts to avoid preemption and interrupts and
+ * reduce the chance of bpf prog executing on this cpu
+ * when active counter is busy.
+ */
+ local_irq_save(flags);
+ /* alloc_bulk runs from irq_work which will not preempt a bpf
+ * program that does unit_alloc/unit_free since IRQs are
+ * disabled there. There is no race to increment 'active'
+ * counter. It protects free_llist from corruption in case NMI
+ * bpf prog preempted this loop.
+ */
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+ __llist_add(obj, &c->free_llist);
+ c->free_cnt++;
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(flags);
+ }
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+}
+
+static void free_one(struct bpf_mem_cache *c, void *obj)
+{
+ if (c->percpu_size) {
+ free_percpu(((void **)obj)[1]);
+ kfree(obj);
+ return;
+ }
+
+ kfree(obj);
+}
+
+static void __free_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+ struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
+ struct llist_node *pos, *t;
+
+ llist_for_each_safe(pos, t, llnode)
+ free_one(c, pos);
+ atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void __free_rcu_tasks_trace(struct rcu_head *head)
+{
+ /* If RCU Tasks Trace grace period implies RCU grace period,
+ * there is no need to invoke call_rcu().
+ */
+ if (rcu_trace_implies_rcu_gp())
+ __free_rcu(head);
+ else
+ call_rcu(head, __free_rcu);
+}
+
+static void enque_to_free(struct bpf_mem_cache *c, void *obj)
+{
+ struct llist_node *llnode = obj;
+
+ /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
+ * Nothing races to add to free_by_rcu list.
+ */
+ __llist_add(llnode, &c->free_by_rcu);
+}
+
+static void do_call_rcu(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+
+ if (atomic_xchg(&c->call_rcu_in_progress, 1))
+ return;
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
+ /* There is no concurrent __llist_add(waiting_for_gp) access.
+ * It doesn't race with llist_del_all either.
+ * But there could be two concurrent llist_del_all(waiting_for_gp):
+ * from __free_rcu() and from drain_mem_cache().
+ */
+ __llist_add(llnode, &c->waiting_for_gp);
+ /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * If RCU Tasks Trace grace period implies RCU grace period, free
+ * these elements directly, else use call_rcu() to wait for normal
+ * progs to finish and finally do free_one() on each element.
+ */
+ call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
+}
+
+static void free_bulk(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+ unsigned long flags;
+ int cnt;
+
+ do {
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_save(flags);
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode)
+ cnt = --c->free_cnt;
+ else
+ cnt = 0;
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(flags);
+ if (llnode)
+ enque_to_free(c, llnode);
+ } while (cnt > (c->high_watermark + c->low_watermark) / 2);
+
+ /* and drain free_llist_extra */
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
+ enque_to_free(c, llnode);
+ do_call_rcu(c);
+}
+
+static void bpf_mem_refill(struct irq_work *work)
+{
+ struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
+ int cnt;
+
+ /* Racy access to free_cnt. It doesn't need to be 100% accurate */
+ cnt = c->free_cnt;
+ if (cnt < c->low_watermark)
+ /* irq_work runs on this cpu and kmalloc will allocate
+ * from the current numa node which is what we want here.
+ */
+ alloc_bulk(c, c->batch, NUMA_NO_NODE);
+ else if (cnt > c->high_watermark)
+ free_bulk(c);
+}
+
+static void notrace irq_work_raise(struct bpf_mem_cache *c)
+{
+ irq_work_queue(&c->refill_work);
+}
+
+/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
+ * the freelist cache will be elem_size * 64 (or less) on each cpu.
+ *
+ * For bpf programs that don't have statically known allocation sizes and
+ * assuming (low_mark + high_mark) / 2 as an average number of elements per
+ * bucket and all buckets are used the total amount of memory in freelists
+ * on each cpu will be:
+ * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
+ * == ~ 116 Kbyte using below heuristic.
+ * Initialized, but unused bpf allocator (not bpf map specific one) will
+ * consume ~ 11 Kbyte per cpu.
+ * Typical case will be between 11K and 116K closer to 11K.
+ * bpf progs can and should share bpf_mem_cache when possible.
+ */
+
+static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+{
+ init_irq_work(&c->refill_work, bpf_mem_refill);
+ if (c->unit_size <= 256) {
+ c->low_watermark = 32;
+ c->high_watermark = 96;
+ } else {
+ /* When page_size == 4k, order-0 cache will have low_mark == 2
+ * and high_mark == 6 with batch alloc of 3 individual pages at
+ * a time.
+ * 8k allocs and above low == 1, high == 3, batch == 1.
+ */
+ c->low_watermark = max(32 * 256 / c->unit_size, 1);
+ c->high_watermark = max(96 * 256 / c->unit_size, 3);
+ }
+ c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
+
+ /* To avoid consuming memory assume that 1st run of bpf
+ * prog won't be doing more than 4 map_update_elem from
+ * irq disabled region
+ */
+ alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
+}
+
+/* When size != 0 bpf_mem_cache for each cpu.
+ * This is typical bpf hash map use case when all elements have equal size.
+ *
+ * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
+ * kmalloc/kfree. Max allocation size is 4096 in this case.
+ * This is bpf_dynptr and bpf_kptr use case.
+ */
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
+{
+ static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+ struct bpf_mem_caches *cc, __percpu *pcc;
+ struct bpf_mem_cache *c, __percpu *pc;
+ struct obj_cgroup *objcg = NULL;
+ int cpu, i, unit_size, percpu_size = 0;
+
+ if (size) {
+ pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
+ if (!pc)
+ return -ENOMEM;
+
+ if (percpu)
+ /* room for llist_node and per-cpu pointer */
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+ else
+ size += LLIST_NODE_SZ; /* room for llist_node */
+ unit_size = size;
+
+#ifdef CONFIG_MEMCG_KMEM
+ objcg = get_obj_cgroup_from_current();
+#endif
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(pc, cpu);
+ c->unit_size = unit_size;
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ prefill_mem_cache(c, cpu);
+ }
+ ma->cache = pc;
+ return 0;
+ }
+
+ /* size == 0 && percpu is an invalid combination */
+ if (WARN_ON_ONCE(percpu))
+ return -EINVAL;
+
+ pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
+ if (!pcc)
+ return -ENOMEM;
+#ifdef CONFIG_MEMCG_KMEM
+ objcg = get_obj_cgroup_from_current();
+#endif
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(pcc, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ c->unit_size = sizes[i];
+ c->objcg = objcg;
+ prefill_mem_cache(c, cpu);
+ }
+ }
+ ma->caches = pcc;
+ return 0;
+}
+
+static void drain_mem_cache(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+
+ /* No progs are using this bpf_mem_cache, but htab_map_free() called
+ * bpf_mem_cache_free() for all remaining elements and they can be in
+ * free_by_rcu or in waiting_for_gp lists, so drain those lists now.
+ *
+ * Except for waiting_for_gp list, there are no concurrent operations
+ * on these lists, so it is safe to use __llist_del_all().
+ */
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist_extra))
+ free_one(c, llnode);
+}
+
+static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
+{
+ free_percpu(ma->cache);
+ free_percpu(ma->caches);
+ ma->cache = NULL;
+ ma->caches = NULL;
+}
+
+static void free_mem_alloc(struct bpf_mem_alloc *ma)
+{
+ /* waiting_for_gp lists was drained, but __free_rcu might
+ * still execute. Wait for it now before we freeing percpu caches.
+ *
+ * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
+ * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
+ * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
+ * so if call_rcu(head, __free_rcu) is skipped due to
+ * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
+ * using rcu_trace_implies_rcu_gp() as well.
+ */
+ rcu_barrier_tasks_trace();
+ if (!rcu_trace_implies_rcu_gp())
+ rcu_barrier();
+ free_mem_alloc_no_barrier(ma);
+}
+
+static void free_mem_alloc_deferred(struct work_struct *work)
+{
+ struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);
+
+ free_mem_alloc(ma);
+ kfree(ma);
+}
+
+static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
+{
+ struct bpf_mem_alloc *copy;
+
+ if (!rcu_in_progress) {
+ /* Fast path. No callbacks are pending, hence no need to do
+ * rcu_barrier-s.
+ */
+ free_mem_alloc_no_barrier(ma);
+ return;
+ }
+
+ copy = kmalloc(sizeof(*ma), GFP_KERNEL);
+ if (!copy) {
+ /* Slow path with inline barrier-s */
+ free_mem_alloc(ma);
+ return;
+ }
+
+ /* Defer barriers into worker to let the rest of map memory to be freed */
+ copy->cache = ma->cache;
+ ma->cache = NULL;
+ copy->caches = ma->caches;
+ ma->caches = NULL;
+ INIT_WORK(&copy->work, free_mem_alloc_deferred);
+ queue_work(system_unbound_wq, &copy->work);
+}
+
+void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i, rcu_in_progress;
+
+ if (ma->cache) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ /*
+ * refill_work may be unfinished for PREEMPT_RT kernel
+ * in which irq work is invoked in a per-CPU RT thread.
+ * It is also possible for kernel with
+ * arch_irq_work_has_interrupt() being false and irq
+ * work is invoked in timer interrupt. So waiting for
+ * the completion of irq work to ease the handling of
+ * concurrency.
+ */
+ irq_work_sync(&c->refill_work);
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ /* objcg is the same across cpus */
+ if (c->objcg)
+ obj_cgroup_put(c->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+ if (ma->caches) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ irq_work_sync(&c->refill_work);
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ }
+ if (c->objcg)
+ obj_cgroup_put(c->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+}
+
+/* notrace is necessary here and in other functions to make sure
+ * bpf programs cannot attach to them and cause llist corruptions.
+ */
+static void notrace *unit_alloc(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode = NULL;
+ unsigned long flags;
+ int cnt = 0;
+
+ /* Disable irqs to prevent the following race for majority of prog types:
+ * prog_A
+ * bpf_mem_alloc
+ * preemption or irq -> prog_B
+ * bpf_mem_alloc
+ *
+ * but prog_B could be a perf_event NMI prog.
+ * Use per-cpu 'active' counter to order free_list access between
+ * unit_alloc/unit_free/bpf_mem_refill.
+ */
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode)
+ cnt = --c->free_cnt;
+ }
+ local_dec(&c->active);
+ local_irq_restore(flags);
+
+ WARN_ON(cnt < 0);
+
+ if (cnt < c->low_watermark)
+ irq_work_raise(c);
+ return llnode;
+}
+
+/* Though 'ptr' object could have been allocated on a different cpu
+ * add it to the free_llist of the current cpu.
+ * Let kfree() logic deal with it when it's later called from irq_work.
+ */
+static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+ unsigned long flags;
+ int cnt = 0;
+
+ BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ __llist_add(llnode, &c->free_llist);
+ cnt = ++c->free_cnt;
+ } else {
+ /* unit_free() cannot fail. Therefore add an object to atomic
+ * llist. free_bulk() will drain it. Though free_llist_extra is
+ * a per-cpu list we have to use atomic llist_add here, since
+ * it also can be interrupted by bpf nmi prog that does another
+ * unit_free() into the same free_llist_extra.
+ */
+ llist_add(llnode, &c->free_llist_extra);
+ }
+ local_dec(&c->active);
+ local_irq_restore(flags);
+
+ if (cnt > c->high_watermark)
+ /* free few objects from current cpu into global kmalloc pool */
+ irq_work_raise(c);
+}
+
+/* Called from BPF program or from sys_bpf syscall.
+ * In both cases migration is disabled.
+ */
+void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
+{
+ int idx;
+ void *ret;
+
+ if (!size)
+ return ZERO_SIZE_PTR;
+
+ idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
+ if (idx < 0)
+ return NULL;
+
+ ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ int idx;
+
+ if (!ptr)
+ return;
+
+ idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+ if (idx < 0)
+ return;
+
+ unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
+void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
+{
+ void *ret;
+
+ ret = unit_alloc(this_cpu_ptr(ma->cache));
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ if (!ptr)
+ return;
+
+ unit_free(this_cpu_ptr(ma->cache), ptr);
+}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index bd09290e3648..13e4efc971e6 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -372,7 +372,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
attr->map_type != BPF_MAP_TYPE_HASH)
return ERR_PTR(-EINVAL);
- offmap = kzalloc(sizeof(*offmap), GFP_USER);
+ offmap = bpf_map_area_alloc(sizeof(*offmap), NUMA_NO_NODE);
if (!offmap)
return ERR_PTR(-ENOMEM);
@@ -404,7 +404,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
err_unlock:
up_write(&bpf_devs_lock);
rtnl_unlock();
- kfree(offmap);
+ bpf_map_area_free(offmap);
return ERR_PTR(err);
}
@@ -428,7 +428,7 @@ void bpf_map_offload_map_free(struct bpf_map *map)
up_write(&bpf_devs_lock);
rtnl_unlock();
- kfree(offmap);
+ bpf_map_area_free(offmap);
}
int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 00b874c8e889..034cf87b54e9 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -58,23 +58,21 @@ static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
{
int cpu, orig_cpu;
- orig_cpu = cpu = raw_smp_processor_id();
+ orig_cpu = raw_smp_processor_id();
while (1) {
- struct pcpu_freelist_head *head;
+ for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) {
+ struct pcpu_freelist_head *head;
- head = per_cpu_ptr(s->freelist, cpu);
- if (raw_spin_trylock(&head->lock)) {
- pcpu_freelist_push_node(head, node);
- raw_spin_unlock(&head->lock);
- return;
+ head = per_cpu_ptr(s->freelist, cpu);
+ if (raw_spin_trylock(&head->lock)) {
+ pcpu_freelist_push_node(head, node);
+ raw_spin_unlock(&head->lock);
+ return;
+ }
}
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
/* cannot lock any per cpu lock, try extralist */
- if (cpu == orig_cpu &&
- pcpu_freelist_try_push_extra(s, node))
+ if (pcpu_freelist_try_push_extra(s, node))
return;
}
}
@@ -102,22 +100,21 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
u32 nr_elems)
{
struct pcpu_freelist_head *head;
- int i, cpu, pcpu_entries;
+ unsigned int cpu, cpu_idx, i, j, n, m;
- pcpu_entries = nr_elems / num_possible_cpus() + 1;
- i = 0;
+ n = nr_elems / num_possible_cpus();
+ m = nr_elems % num_possible_cpus();
+ cpu_idx = 0;
for_each_possible_cpu(cpu) {
-again:
head = per_cpu_ptr(s->freelist, cpu);
- /* No locking required as this is not visible yet. */
- pcpu_freelist_push_node(head, buf);
- i++;
- buf += elem_size;
- if (i == nr_elems)
- break;
- if (i % pcpu_entries)
- goto again;
+ j = n + (cpu_idx < m ? 1 : 0);
+ for (i = 0; i < j; i++) {
+ /* No locking required as this is not visible yet. */
+ pcpu_freelist_push_node(head, buf);
+ buf += elem_size;
+ }
+ cpu_idx++;
}
}
@@ -125,13 +122,12 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
- int orig_cpu, cpu;
+ int cpu;
- orig_cpu = cpu = raw_smp_processor_id();
- while (1) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
head = per_cpu_ptr(s->freelist, cpu);
if (!READ_ONCE(head->first))
- goto next_cpu;
+ continue;
raw_spin_lock(&head->lock);
node = head->first;
if (node) {
@@ -140,12 +136,6 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
return node;
}
raw_spin_unlock(&head->lock);
-next_cpu:
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
- if (cpu == orig_cpu)
- break;
}
/* per cpu lists are all empty, try extralist */
@@ -164,13 +154,12 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
- int orig_cpu, cpu;
+ int cpu;
- orig_cpu = cpu = raw_smp_processor_id();
- while (1) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
head = per_cpu_ptr(s->freelist, cpu);
if (!READ_ONCE(head->first))
- goto next_cpu;
+ continue;
if (raw_spin_trylock(&head->lock)) {
node = head->first;
if (node) {
@@ -180,12 +169,6 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
}
raw_spin_unlock(&head->lock);
}
-next_cpu:
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
- if (cpu == orig_cpu)
- break;
}
/* cannot pop from per cpu lists, try extralist */
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index a1c0794ae49d..8a5e060de63b 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -78,8 +78,6 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
if (!qs)
return ERR_PTR(-ENOMEM);
- memset(qs, 0, sizeof(*qs));
-
bpf_map_init_from_attr(&qs->map, attr);
qs->size = size;
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index ded4faeca192..80f4b4d88aaf 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -38,10 +38,43 @@ struct bpf_ringbuf {
struct page **pages;
int nr_pages;
spinlock_t spinlock ____cacheline_aligned_in_smp;
- /* Consumer and producer counters are put into separate pages to allow
- * mapping consumer page as r/w, but restrict producer page to r/o.
- * This protects producer position from being modified by user-space
- * application and ruining in-kernel position tracking.
+ /* For user-space producer ring buffers, an atomic_t busy bit is used
+ * to synchronize access to the ring buffers in the kernel, rather than
+ * the spinlock that is used for kernel-producer ring buffers. This is
+ * done because the ring buffer must hold a lock across a BPF program's
+ * callback:
+ *
+ * __bpf_user_ringbuf_peek() // lock acquired
+ * -> program callback_fn()
+ * -> __bpf_user_ringbuf_sample_release() // lock released
+ *
+ * It is unsafe and incorrect to hold an IRQ spinlock across what could
+ * be a long execution window, so we instead simply disallow concurrent
+ * access to the ring buffer by kernel consumers, and return -EBUSY from
+ * __bpf_user_ringbuf_peek() if the busy bit is held by another task.
+ */
+ atomic_t busy ____cacheline_aligned_in_smp;
+ /* Consumer and producer counters are put into separate pages to
+ * allow each position to be mapped with different permissions.
+ * This prevents a user-space application from modifying the
+ * position and ruining in-kernel tracking. The permissions of the
+ * pages depend on who is producing samples: user-space or the
+ * kernel.
+ *
+ * Kernel-producer
+ * ---------------
+ * The producer position and data pages are mapped as r/o in
+ * userspace. For this approach, bits in the header of samples are
+ * used to signal to user-space, and to other producers, whether a
+ * sample is currently being written.
+ *
+ * User-space producer
+ * -------------------
+ * Only the page containing the consumer position is mapped r/o in
+ * user-space. User-space producers also use bits of the header to
+ * communicate to the kernel, but the kernel must carefully check and
+ * validate each sample to ensure that they're correctly formatted, and
+ * fully contained within the ring buffer.
*/
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
@@ -116,7 +149,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
err_free_pages:
for (i = 0; i < nr_pages; i++)
__free_page(pages[i]);
- kvfree(pages);
+ bpf_map_area_free(pages);
return NULL;
}
@@ -136,6 +169,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
return NULL;
spin_lock_init(&rb->spinlock);
+ atomic_set(&rb->busy, 0);
init_waitqueue_head(&rb->waitq);
init_irq_work(&rb->work, bpf_ringbuf_notify);
@@ -164,7 +198,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
return ERR_PTR(-E2BIG);
#endif
- rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT);
+ rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
if (!rb_map)
return ERR_PTR(-ENOMEM);
@@ -172,7 +206,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
if (!rb_map->rb) {
- kfree(rb_map);
+ bpf_map_area_free(rb_map);
return ERR_PTR(-ENOMEM);
}
@@ -190,7 +224,7 @@ static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
vunmap(rb);
for (i = 0; i < nr_pages; i++)
__free_page(pages[i]);
- kvfree(pages);
+ bpf_map_area_free(pages);
}
static void ringbuf_map_free(struct bpf_map *map)
@@ -199,7 +233,7 @@ static void ringbuf_map_free(struct bpf_map *map)
rb_map = container_of(map, struct bpf_ringbuf_map, map);
bpf_ringbuf_free(rb_map->rb);
- kfree(rb_map);
+ bpf_map_area_free(rb_map);
}
static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
@@ -224,7 +258,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
return -ENOTSUPP;
}
-static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
{
struct bpf_ringbuf_map *rb_map;
@@ -242,6 +276,26 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
vma->vm_pgoff + RINGBUF_PGOFF);
}
+static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+ if (vma->vm_flags & VM_WRITE) {
+ if (vma->vm_pgoff == 0)
+ /* Disallow writable mappings to the consumer pointer,
+ * and allow writable mappings to both the producer
+ * position, and the ring buffer data itself.
+ */
+ return -EPERM;
+ } else {
+ vma->vm_flags &= ~VM_MAYWRITE;
+ }
+ /* remap_vmalloc_range() checks size and offset constraints */
+ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
+}
+
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
unsigned long cons_pos, prod_pos;
@@ -251,8 +305,13 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
return prod_pos - cons_pos;
}
-static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
- struct poll_table_struct *pts)
+static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
+{
+ return rb->mask + 1;
+}
+
+static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
{
struct bpf_ringbuf_map *rb_map;
@@ -264,13 +323,26 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
return 0;
}
+static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ poll_wait(filp, &rb_map->rb->waitq, pts);
+
+ if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb))
+ return EPOLLOUT | EPOLLWRNORM;
+ return 0;
+}
+
BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = ringbuf_map_alloc,
.map_free = ringbuf_map_free,
- .map_mmap = ringbuf_map_mmap,
- .map_poll = ringbuf_map_poll,
+ .map_mmap = ringbuf_map_mmap_kern,
+ .map_poll = ringbuf_map_poll_kern,
.map_lookup_elem = ringbuf_map_lookup_elem,
.map_update_elem = ringbuf_map_update_elem,
.map_delete_elem = ringbuf_map_delete_elem,
@@ -278,6 +350,20 @@ const struct bpf_map_ops ringbuf_map_ops = {
.map_btf_id = &ringbuf_map_btf_ids[0],
};
+BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
+const struct bpf_map_ops user_ringbuf_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = ringbuf_map_alloc,
+ .map_free = ringbuf_map_free,
+ .map_mmap = ringbuf_map_mmap_user,
+ .map_poll = ringbuf_map_poll_user,
+ .map_lookup_elem = ringbuf_map_lookup_elem,
+ .map_update_elem = ringbuf_map_update_elem,
+ .map_delete_elem = ringbuf_map_delete_elem,
+ .map_get_next_key = ringbuf_map_get_next_key,
+ .map_btf_id = &user_ringbuf_map_btf_ids[0],
+};
+
/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
* calculate offset from record metadata to ring buffer in pages, rounded
* down. This page offset is stored as part of record metadata and allows to
@@ -312,7 +398,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
return NULL;
len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
- if (len > rb->mask + 1)
+ if (len > ringbuf_total_data_sz(rb))
return NULL;
cons_pos = smp_load_acquire(&rb->consumer_pos);
@@ -361,7 +447,7 @@ BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
.func = bpf_ringbuf_reserve,
- .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL,
+ .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
@@ -404,7 +490,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_submit_proto = {
.func = bpf_ringbuf_submit,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
@@ -417,7 +503,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_discard_proto = {
.func = bpf_ringbuf_discard,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
@@ -459,7 +545,7 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
case BPF_RB_AVAIL_DATA:
return ringbuf_avail_data_sz(rb);
case BPF_RB_RING_SIZE:
- return rb->mask + 1;
+ return ringbuf_total_data_sz(rb);
case BPF_RB_CONS_POS:
return smp_load_acquire(&rb->consumer_pos);
case BPF_RB_PROD_POS:
@@ -553,3 +639,138 @@ const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
+
+static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
+{
+ int err;
+ u32 hdr_len, sample_len, total_len, flags, *hdr;
+ u64 cons_pos, prod_pos;
+
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ if (prod_pos % 8)
+ return -EINVAL;
+
+ /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
+ cons_pos = smp_load_acquire(&rb->consumer_pos);
+ if (cons_pos >= prod_pos)
+ return -ENODATA;
+
+ hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ hdr_len = smp_load_acquire(hdr);
+ flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
+ sample_len = hdr_len & ~flags;
+ total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* The sample must fit within the region advertised by the producer position. */
+ if (total_len > prod_pos - cons_pos)
+ return -EINVAL;
+
+ /* The sample must fit within the data region of the ring buffer. */
+ if (total_len > ringbuf_total_data_sz(rb))
+ return -E2BIG;
+
+ /* The sample must fit into a struct bpf_dynptr. */
+ err = bpf_dynptr_check_size(sample_len);
+ if (err)
+ return -E2BIG;
+
+ if (flags & BPF_RINGBUF_DISCARD_BIT) {
+ /* If the discard bit is set, the sample should be skipped.
+ *
+ * Update the consumer pos, and return -EAGAIN so the caller
+ * knows to skip this sample and try to read the next one.
+ */
+ smp_store_release(&rb->consumer_pos, cons_pos + total_len);
+ return -EAGAIN;
+ }
+
+ if (flags & BPF_RINGBUF_BUSY_BIT)
+ return -ENODATA;
+
+ *sample = (void *)((uintptr_t)rb->data +
+ (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
+ *size = sample_len;
+ return 0;
+}
+
+static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
+{
+ u64 consumer_pos;
+ u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* Using smp_load_acquire() is unnecessary here, as the busy-bit
+ * prevents another task from writing to consumer_pos after it was read
+ * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
+ */
+ consumer_pos = rb->consumer_pos;
+ /* Synchronizes with smp_load_acquire() in user-space producer. */
+ smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
+}
+
+BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
+ void *, callback_fn, void *, callback_ctx, u64, flags)
+{
+ struct bpf_ringbuf *rb;
+ long samples, discarded_samples = 0, ret = 0;
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP;
+ int busy = 0;
+
+ if (unlikely(flags & ~wakeup_flags))
+ return -EINVAL;
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+
+ /* If another consumer is already consuming a sample, wait for them to finish. */
+ if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
+ return -EBUSY;
+
+ for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) {
+ int err;
+ u32 size;
+ void *sample;
+ struct bpf_dynptr_kern dynptr;
+
+ err = __bpf_user_ringbuf_peek(rb, &sample, &size);
+ if (err) {
+ if (err == -ENODATA) {
+ break;
+ } else if (err == -EAGAIN) {
+ discarded_samples++;
+ continue;
+ } else {
+ ret = err;
+ goto schedule_work_return;
+ }
+ }
+
+ bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+ ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
+ __bpf_user_ringbuf_sample_release(rb, size, flags);
+ }
+ ret = samples - discarded_samples;
+
+schedule_work_return:
+ /* Prevent the clearing of the busy-bit from being reordered before the
+ * storing of any rb consumer or producer positions.
+ */
+ smp_mb__before_atomic();
+ atomic_set(&rb->busy, 0);
+
+ if (flags & BPF_RB_FORCE_WAKEUP)
+ irq_work_queue(&rb->work);
+ else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0)
+ irq_work_queue(&rb->work);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
+ .func = bpf_user_ringbuf_drain,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 1adbe67cdb95..aecea7451b61 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -338,7 +338,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
int ret;
/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
- if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
return bpf_get_stackid((unsigned long)(ctx->regs),
(unsigned long) map, flags, 0, 0);
@@ -506,7 +506,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
int err = -EINVAL;
__u64 nr_kernel;
- if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 27760627370d..64131f88c553 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -175,8 +175,8 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
synchronize_rcu();
}
-static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
- void *value, __u64 flags)
+static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, __u64 flags)
{
int err;
@@ -190,7 +190,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
map->map_type == BPF_MAP_TYPE_SOCKMAP) {
return sock_map_update_elem_sys(map, key, value, flags);
} else if (IS_FD_PROG_ARRAY(map)) {
- return bpf_fd_array_map_update_elem(map, f.file, key, value,
+ return bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
}
@@ -205,12 +205,12 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
flags);
} else if (IS_FD_ARRAY(map)) {
rcu_read_lock();
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ err = bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
rcu_read_lock();
- err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
+ err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
flags);
rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
@@ -495,114 +495,181 @@ static void bpf_map_release_memcg(struct bpf_map *map)
}
#endif
-static int bpf_map_kptr_off_cmp(const void *a, const void *b)
+static int btf_field_cmp(const void *a, const void *b)
{
- const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b;
+ const struct btf_field *f1 = a, *f2 = b;
- if (off_desc1->offset < off_desc2->offset)
+ if (f1->offset < f2->offset)
return -1;
- else if (off_desc1->offset > off_desc2->offset)
+ else if (f1->offset > f2->offset)
return 1;
return 0;
}
-struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset)
+struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
+ enum btf_field_type type)
{
- /* Since members are iterated in btf_find_field in increasing order,
- * offsets appended to kptr_off_tab are in increasing order, so we can
- * do bsearch to find exact match.
- */
- struct bpf_map_value_off *tab;
+ struct btf_field *field;
- if (!map_value_has_kptrs(map))
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type))
+ return NULL;
+ field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
+ if (!field || !(field->type & type))
return NULL;
- tab = map->kptr_off_tab;
- return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp);
+ return field;
}
-void bpf_map_free_kptr_off_tab(struct bpf_map *map)
+void btf_record_free(struct btf_record *rec)
{
- struct bpf_map_value_off *tab = map->kptr_off_tab;
int i;
- if (!map_value_has_kptrs(map))
+ if (IS_ERR_OR_NULL(rec))
return;
- for (i = 0; i < tab->nr_off; i++) {
- if (tab->off[i].kptr.module)
- module_put(tab->off[i].kptr.module);
- btf_put(tab->off[i].kptr.btf);
+ for (i = 0; i < rec->cnt; i++) {
+ switch (rec->fields[i].type) {
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ if (rec->fields[i].kptr.module)
+ module_put(rec->fields[i].kptr.module);
+ btf_put(rec->fields[i].kptr.btf);
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ /* Nothing to release for bpf_list_head */
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
}
- kfree(tab);
- map->kptr_off_tab = NULL;
+ kfree(rec);
}
-struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map)
+void bpf_map_free_record(struct bpf_map *map)
{
- struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab;
- int size, i;
+ btf_record_free(map->record);
+ map->record = NULL;
+}
- if (!map_value_has_kptrs(map))
- return ERR_PTR(-ENOENT);
- size = offsetof(struct bpf_map_value_off, off[tab->nr_off]);
- new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN);
- if (!new_tab)
+struct btf_record *btf_record_dup(const struct btf_record *rec)
+{
+ const struct btf_field *fields;
+ struct btf_record *new_rec;
+ int ret, size, i;
+
+ if (IS_ERR_OR_NULL(rec))
+ return NULL;
+ size = offsetof(struct btf_record, fields[rec->cnt]);
+ new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
+ if (!new_rec)
return ERR_PTR(-ENOMEM);
- /* Do a deep copy of the kptr_off_tab */
- for (i = 0; i < tab->nr_off; i++) {
- btf_get(tab->off[i].kptr.btf);
- if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) {
- while (i--) {
- if (tab->off[i].kptr.module)
- module_put(tab->off[i].kptr.module);
- btf_put(tab->off[i].kptr.btf);
+ /* Do a deep copy of the btf_record */
+ fields = rec->fields;
+ new_rec->cnt = 0;
+ for (i = 0; i < rec->cnt; i++) {
+ switch (fields[i].type) {
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ btf_get(fields[i].kptr.btf);
+ if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
+ ret = -ENXIO;
+ goto free;
}
- kfree(new_tab);
- return ERR_PTR(-ENXIO);
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ /* Nothing to acquire for bpf_list_head */
+ break;
+ default:
+ ret = -EFAULT;
+ WARN_ON_ONCE(1);
+ goto free;
}
+ new_rec->cnt++;
}
- return new_tab;
+ return new_rec;
+free:
+ btf_record_free(new_rec);
+ return ERR_PTR(ret);
}
-bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b)
+bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
{
- struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab;
- bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b);
+ bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
int size;
- if (!a_has_kptr && !b_has_kptr)
+ if (!a_has_fields && !b_has_fields)
return true;
- if (a_has_kptr != b_has_kptr)
+ if (a_has_fields != b_has_fields)
return false;
- if (tab_a->nr_off != tab_b->nr_off)
+ if (rec_a->cnt != rec_b->cnt)
return false;
- size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]);
- return !memcmp(tab_a, tab_b, size);
+ size = offsetof(struct btf_record, fields[rec_a->cnt]);
+ /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
+ * members are zeroed out. So memcmp is safe to do without worrying
+ * about padding/unused fields.
+ *
+ * While spin_lock, timer, and kptr have no relation to map BTF,
+ * list_head metadata is specific to map BTF, the btf and value_rec
+ * members in particular. btf is the map BTF, while value_rec points to
+ * btf_record in that map BTF.
+ *
+ * So while by default, we don't rely on the map BTF (which the records
+ * were parsed from) matching for both records, which is not backwards
+ * compatible, in case list_head is part of it, we implicitly rely on
+ * that by way of depending on memcmp succeeding for it.
+ */
+ return !memcmp(rec_a, rec_b, size);
}
-/* Caller must ensure map_value_has_kptrs is true. Note that this function can
- * be called on a map value while the map_value is visible to BPF programs, as
- * it ensures the correct synchronization, and we already enforce the same using
- * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs.
- */
-void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
+void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
{
- struct bpf_map_value_off *tab = map->kptr_off_tab;
- unsigned long *btf_id_ptr;
- int i;
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
+ return;
+ bpf_timer_cancel_and_free(obj + rec->timer_off);
+}
- for (i = 0; i < tab->nr_off; i++) {
- struct bpf_map_value_off_desc *off_desc = &tab->off[i];
- unsigned long old_ptr;
+void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
+{
+ const struct btf_field *fields;
+ int i;
- btf_id_ptr = map_value + off_desc->offset;
- if (off_desc->type == BPF_KPTR_UNREF) {
- u64 *p = (u64 *)btf_id_ptr;
+ if (IS_ERR_OR_NULL(rec))
+ return;
+ fields = rec->fields;
+ for (i = 0; i < rec->cnt; i++) {
+ const struct btf_field *field = &fields[i];
+ void *field_ptr = obj + field->offset;
- WRITE_ONCE(p, 0);
+ switch (fields[i].type) {
+ case BPF_SPIN_LOCK:
+ break;
+ case BPF_TIMER:
+ bpf_timer_cancel_and_free(field_ptr);
+ break;
+ case BPF_KPTR_UNREF:
+ WRITE_ONCE(*(u64 *)field_ptr, 0);
+ break;
+ case BPF_KPTR_REF:
+ field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
+ break;
+ case BPF_LIST_HEAD:
+ if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+ continue;
+ bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
+ break;
+ case BPF_LIST_NODE:
+ break;
+ default:
+ WARN_ON_ONCE(1);
continue;
}
- old_ptr = xchg(btf_id_ptr, 0);
- off_desc->kptr.dtor((void *)old_ptr);
}
}
@@ -610,14 +677,24 @@ void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
+ struct btf_field_offs *foffs = map->field_offs;
+ struct btf_record *rec = map->record;
security_bpf_map_free(map);
- kfree(map->off_arr);
bpf_map_release_memcg(map);
- /* implementation dependent freeing, map_free callback also does
- * bpf_map_free_kptr_off_tab, if needed.
- */
+ /* implementation dependent freeing */
map->ops->map_free(map);
+ /* Delay freeing of field_offs and btf_record for maps, as map_free
+ * callback usually needs access to them. It is better to do it here
+ * than require each callback to do the free itself manually.
+ *
+ * Note that the btf_record stashed in map->inner_map_meta->record was
+ * already freed using the map_free callback for map in map case which
+ * eventually calls bpf_map_free_meta, since inner_map_meta is only a
+ * template bpf_map struct used during verification.
+ */
+ kfree(foffs);
+ btf_record_free(rec);
}
static void bpf_map_put_uref(struct bpf_map *map)
@@ -638,7 +715,10 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
bpf_map_free_id(map, do_idr_lock);
btf_put(map->btf);
INIT_WORK(&map->work, bpf_map_free_deferred);
- schedule_work(&map->work);
+ /* Avoid spawning kworkers, since they all might contend
+ * for the same mutex like slab_mutex.
+ */
+ queue_work(system_unbound_wq, &map->work);
}
}
@@ -775,8 +855,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
struct bpf_map *map = filp->private_data;
int err;
- if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
- map_value_has_timer(map) || map_value_has_kptrs(map))
+ if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
@@ -903,84 +982,6 @@ int map_check_no_btf(const struct bpf_map *map,
return -ENOTSUPP;
}
-static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv)
-{
- const u32 a = *(const u32 *)_a;
- const u32 b = *(const u32 *)_b;
-
- if (a < b)
- return -1;
- else if (a > b)
- return 1;
- return 0;
-}
-
-static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv)
-{
- struct bpf_map *map = (struct bpf_map *)priv;
- u32 *off_base = map->off_arr->field_off;
- u32 *a = _a, *b = _b;
- u8 *sz_a, *sz_b;
-
- sz_a = map->off_arr->field_sz + (a - off_base);
- sz_b = map->off_arr->field_sz + (b - off_base);
-
- swap(*a, *b);
- swap(*sz_a, *sz_b);
-}
-
-static int bpf_map_alloc_off_arr(struct bpf_map *map)
-{
- bool has_spin_lock = map_value_has_spin_lock(map);
- bool has_timer = map_value_has_timer(map);
- bool has_kptrs = map_value_has_kptrs(map);
- struct bpf_map_off_arr *off_arr;
- u32 i;
-
- if (!has_spin_lock && !has_timer && !has_kptrs) {
- map->off_arr = NULL;
- return 0;
- }
-
- off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN);
- if (!off_arr)
- return -ENOMEM;
- map->off_arr = off_arr;
-
- off_arr->cnt = 0;
- if (has_spin_lock) {
- i = off_arr->cnt;
-
- off_arr->field_off[i] = map->spin_lock_off;
- off_arr->field_sz[i] = sizeof(struct bpf_spin_lock);
- off_arr->cnt++;
- }
- if (has_timer) {
- i = off_arr->cnt;
-
- off_arr->field_off[i] = map->timer_off;
- off_arr->field_sz[i] = sizeof(struct bpf_timer);
- off_arr->cnt++;
- }
- if (has_kptrs) {
- struct bpf_map_value_off *tab = map->kptr_off_tab;
- u32 *off = &off_arr->field_off[off_arr->cnt];
- u8 *sz = &off_arr->field_sz[off_arr->cnt];
-
- for (i = 0; i < tab->nr_off; i++) {
- *off++ = tab->off[i].offset;
- *sz++ = sizeof(u64);
- }
- off_arr->cnt += tab->nr_off;
- }
-
- if (off_arr->cnt == 1)
- return 0;
- sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]),
- map_off_arr_cmp, map_off_arr_swap, map);
- return 0;
-}
-
static int map_check_btf(struct bpf_map *map, const struct btf *btf,
u32 btf_key_id, u32 btf_value_id)
{
@@ -1003,39 +1004,12 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
if (!value_type || value_size != map->value_size)
return -EINVAL;
- map->spin_lock_off = btf_find_spin_lock(btf, value_type);
-
- if (map_value_has_spin_lock(map)) {
- if (map->map_flags & BPF_F_RDONLY_PROG)
- return -EACCES;
- if (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY &&
- map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
- map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
- map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
- map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
- return -ENOTSUPP;
- if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
- map->value_size) {
- WARN_ONCE(1,
- "verifier bug spin_lock_off %d value_size %d\n",
- map->spin_lock_off, map->value_size);
- return -EFAULT;
- }
- }
-
- map->timer_off = btf_find_timer(btf, value_type);
- if (map_value_has_timer(map)) {
- if (map->map_flags & BPF_F_RDONLY_PROG)
- return -EACCES;
- if (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_LRU_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY)
- return -EOPNOTSUPP;
- }
+ map->record = btf_parse_fields(btf, value_type,
+ BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD,
+ map->value_size);
+ if (!IS_ERR_OR_NULL(map->record)) {
+ int i;
- map->kptr_off_tab = btf_parse_kptrs(btf, value_type);
- if (map_value_has_kptrs(map)) {
if (!bpf_capable()) {
ret = -EPERM;
goto free_map_tab;
@@ -1044,14 +1018,60 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
ret = -EACCES;
goto free_map_tab;
}
- if (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_LRU_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY) {
- ret = -EOPNOTSUPP;
- goto free_map_tab;
+ for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
+ switch (map->record->field_mask & (1 << i)) {
+ case 0:
+ continue;
+ case BPF_SPIN_LOCK:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_TIMER:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_LIST_HEAD:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ default:
+ /* Fail if map_type checks are missing for a field type */
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
}
}
+ ret = btf_check_and_fixup_fields(btf, map->record);
+ if (ret < 0)
+ goto free_map_tab;
+
if (map->ops->map_check_btf) {
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
if (ret < 0)
@@ -1060,7 +1080,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
return ret;
free_map_tab:
- bpf_map_free_kptr_off_tab(map);
+ bpf_map_free_record(map);
return ret;
}
@@ -1069,6 +1089,7 @@ free_map_tab:
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
+ struct btf_field_offs *foffs;
struct bpf_map *map;
int f_flags;
int err;
@@ -1113,8 +1134,6 @@ static int map_create(union bpf_attr *attr)
mutex_init(&map->freeze_mutex);
spin_lock_init(&map->owner.lock);
- map->spin_lock_off = -EINVAL;
- map->timer_off = -EINVAL;
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
* the bpf_prog.o must have BTF to begin with
@@ -1150,13 +1169,17 @@ static int map_create(union bpf_attr *attr)
attr->btf_vmlinux_value_type_id;
}
- err = bpf_map_alloc_off_arr(map);
- if (err)
+
+ foffs = btf_parse_field_offs(map->record);
+ if (IS_ERR(foffs)) {
+ err = PTR_ERR(foffs);
goto free_map;
+ }
+ map->field_offs = foffs;
err = security_bpf_map_alloc(map);
if (err)
- goto free_map_off_arr;
+ goto free_map_field_offs;
err = bpf_map_alloc_id(map);
if (err)
@@ -1180,8 +1203,8 @@ static int map_create(union bpf_attr *attr)
free_map_sec:
security_bpf_map_free(map);
-free_map_off_arr:
- kfree(map->off_arr);
+free_map_field_offs:
+ kfree(map->field_offs);
free_map:
btf_put(map->btf);
map->ops->map_free(map);
@@ -1328,7 +1351,7 @@ static int map_lookup_elem(union bpf_attr *attr)
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1401,7 +1424,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1413,19 +1436,14 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
}
value_size = bpf_map_value_size(map);
-
- err = -ENOMEM;
- value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
- if (!value)
+ value = kvmemdup_bpfptr(uvalue, value_size);
+ if (IS_ERR(value)) {
+ err = PTR_ERR(value);
goto free_key;
+ }
- err = -EFAULT;
- if (copy_from_bpfptr(value, uvalue, value_size) != 0)
- goto free_value;
-
- err = bpf_map_update_value(map, f, key, value, attr->flags);
+ err = bpf_map_update_value(map, f.file, key, value, attr->flags);
-free_value:
kvfree(value);
free_key:
kvfree(key);
@@ -1437,9 +1455,9 @@ err_put:
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
-static int map_delete_elem(union bpf_attr *attr)
+static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
- void __user *ukey = u64_to_user_ptr(attr->key);
+ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
int ufd = attr->map_fd;
struct bpf_map *map;
struct fd f;
@@ -1459,7 +1477,7 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}
- key = __bpf_copy_key(ukey, map->key_size);
+ key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
@@ -1569,7 +1587,7 @@ int generic_map_delete_batch(struct bpf_map *map,
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
return -EINVAL;
}
@@ -1610,23 +1628,21 @@ int generic_map_delete_batch(struct bpf_map *map,
return err;
}
-int generic_map_update_batch(struct bpf_map *map,
+int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
u32 value_size, cp, max_count;
- int ufd = attr->batch.map_fd;
void *key, *value;
- struct fd f;
int err = 0;
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
return -EINVAL;
}
@@ -1646,7 +1662,6 @@ int generic_map_update_batch(struct bpf_map *map,
return -ENOMEM;
}
- f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */
for (cp = 0; cp < max_count; cp++) {
err = -EFAULT;
if (copy_from_user(key, keys + cp * map->key_size,
@@ -1654,7 +1669,7 @@ int generic_map_update_batch(struct bpf_map *map,
copy_from_user(value, values + cp * value_size, value_size))
break;
- err = bpf_map_update_value(map, f, key, value,
+ err = bpf_map_update_value(map, map_file, key, value,
attr->batch.elem_flags);
if (err)
@@ -1667,7 +1682,6 @@ int generic_map_update_batch(struct bpf_map *map,
kvfree(value);
kvfree(key);
- fdput(f);
return err;
}
@@ -1689,7 +1703,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK))
return -EINVAL;
value_size = bpf_map_value_size(map);
@@ -1811,7 +1825,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1882,8 +1896,7 @@ static int map_freeze(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
- map_value_has_timer(map) || map_value_has_kptrs(map)) {
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
fdput(f);
return -ENOTSUPP;
}
@@ -2094,6 +2107,17 @@ struct bpf_prog_kstats {
u64 misses;
};
+void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
+{
+ struct bpf_prog_stats *stats;
+ unsigned int flags;
+
+ stats = this_cpu_ptr(prog->stats);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->misses);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
+}
+
static void bpf_prog_get_stats(const struct bpf_prog *prog,
struct bpf_prog_kstats *stats)
{
@@ -2107,11 +2131,11 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog,
st = per_cpu_ptr(prog->stats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&st->syncp);
+ start = u64_stats_fetch_begin(&st->syncp);
tnsecs = u64_stats_read(&st->nsecs);
tcnt = u64_stats_read(&st->cnt);
tmisses = u64_stats_read(&st->misses);
- } while (u64_stats_fetch_retry_irq(&st->syncp, start));
+ } while (u64_stats_fetch_retry(&st->syncp, start));
nsecs += tnsecs;
cnt += tcnt;
misses += tmisses;
@@ -3494,9 +3518,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_LSM:
if (ptype == BPF_PROG_TYPE_LSM &&
prog->expected_attach_type != BPF_LSM_CGROUP)
- return -EINVAL;
-
- ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+ ret = -EINVAL;
+ else
+ ret = cgroup_bpf_prog_attach(attr, ptype, prog);
break;
default:
ret = -EINVAL;
@@ -4395,7 +4419,9 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (attr->task_fd_query.flags != 0)
return -EINVAL;
+ rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ rcu_read_unlock();
if (!task)
return -ENOENT;
@@ -4448,13 +4474,13 @@ put_file:
#define BPF_MAP_BATCH_LAST_FIELD batch.flags
-#define BPF_DO_BATCH(fn) \
+#define BPF_DO_BATCH(fn, ...) \
do { \
if (!fn) { \
err = -ENOTSUPP; \
goto err_put; \
} \
- err = fn(map, attr, uattr); \
+ err = fn(__VA_ARGS__); \
} while (0)
static int bpf_map_do_batch(const union bpf_attr *attr,
@@ -4488,13 +4514,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
}
if (cmd == BPF_MAP_LOOKUP_BATCH)
- BPF_DO_BATCH(map->ops->map_lookup_batch);
+ BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
- BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
else if (cmd == BPF_MAP_UPDATE_BATCH)
- BPF_DO_BATCH(map->ops->map_update_batch);
+ BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
else
- BPF_DO_BATCH(map->ops->map_delete_batch);
+ BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
err_put:
if (has_write)
bpf_map_write_active_dec(map);
@@ -4941,7 +4967,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = map_update_elem(&attr, uattr);
break;
case BPF_MAP_DELETE_ELEM:
- err = map_delete_elem(&attr);
+ err = map_delete_elem(&attr, uattr);
break;
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
@@ -5073,8 +5099,10 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
switch (cmd) {
case BPF_MAP_CREATE:
+ case BPF_MAP_DELETE_ELEM:
case BPF_MAP_UPDATE_ELEM:
case BPF_MAP_FREEZE:
+ case BPF_MAP_GET_FD_BY_ID:
case BPF_PROG_LOAD:
case BPF_BTF_LOAD:
case BPF_LINK_CREATE:
@@ -5119,13 +5147,14 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
run_ctx.bpf_cookie = 0;
run_ctx.saved_run_ctx = NULL;
- if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) {
+ if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
/* recursion detected */
bpf_prog_put(prog);
return -EBUSY;
}
attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
- __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx);
+ __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
+ &run_ctx);
bpf_prog_put(prog);
return 0;
#endif
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 8c921799def4..c2a2182ce570 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -10,8 +10,17 @@
#include <linux/btf_ids.h>
#include "mmap_unlock_work.h"
+static const char * const iter_task_type_names[] = {
+ "ALL",
+ "TID",
+ "PID",
+};
+
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
+ enum bpf_iter_task_type type;
+ u32 pid;
+ u32 pid_visiting;
};
struct bpf_iter_seq_task_info {
@@ -22,18 +31,115 @@ struct bpf_iter_seq_task_info {
u32 tid;
};
-static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
+ u32 *tid,
+ bool skip_if_dup_files)
+{
+ struct task_struct *task, *next_task;
+ struct pid *pid;
+ u32 saved_tid;
+
+ if (!*tid) {
+ /* The first time, the iterator calls this function. */
+ pid = find_pid_ns(common->pid, common->ns);
+ if (!pid)
+ return NULL;
+
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ if (!task)
+ return NULL;
+
+ *tid = common->pid;
+ common->pid_visiting = common->pid;
+
+ return task;
+ }
+
+ /* If the control returns to user space and comes back to the
+ * kernel again, *tid and common->pid_visiting should be the
+ * same for task_seq_start() to pick up the correct task.
+ */
+ if (*tid == common->pid_visiting) {
+ pid = find_pid_ns(common->pid_visiting, common->ns);
+ task = get_pid_task(pid, PIDTYPE_PID);
+
+ return task;
+ }
+
+ pid = find_pid_ns(common->pid_visiting, common->ns);
+ if (!pid)
+ return NULL;
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ return NULL;
+
+retry:
+ if (!pid_alive(task)) {
+ put_task_struct(task);
+ return NULL;
+ }
+
+ next_task = next_thread(task);
+ put_task_struct(task);
+ if (!next_task)
+ return NULL;
+
+ saved_tid = *tid;
+ *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
+ if (!*tid || *tid == common->pid) {
+ /* Run out of tasks of a process. The tasks of a
+ * thread_group are linked as circular linked list.
+ */
+ *tid = saved_tid;
+ return NULL;
+ }
+
+ get_task_struct(next_task);
+ common->pid_visiting = *tid;
+
+ if (skip_if_dup_files && task->files == task->group_leader->files) {
+ task = next_task;
+ goto retry;
+ }
+
+ return next_task;
+}
+
+static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
u32 *tid,
bool skip_if_dup_files)
{
struct task_struct *task = NULL;
struct pid *pid;
+ if (common->type == BPF_TASK_ITER_TID) {
+ if (*tid && *tid != common->pid)
+ return NULL;
+ rcu_read_lock();
+ pid = find_pid_ns(common->pid, common->ns);
+ if (pid) {
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ *tid = common->pid;
+ }
+ rcu_read_unlock();
+
+ return task;
+ }
+
+ if (common->type == BPF_TASK_ITER_TGID) {
+ rcu_read_lock();
+ task = task_group_seq_get_next(common, tid, skip_if_dup_files);
+ rcu_read_unlock();
+
+ return task;
+ }
+
rcu_read_lock();
retry:
- pid = find_ge_pid(*tid, ns);
+ pid = find_ge_pid(*tid, common->ns);
if (pid) {
- *tid = pid_nr_ns(pid, ns);
+ *tid = pid_nr_ns(pid, common->ns);
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
++*tid;
@@ -56,7 +162,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
struct bpf_iter_seq_task_info *info = seq->private;
struct task_struct *task;
- task = task_seq_get_next(info->common.ns, &info->tid, false);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
@@ -73,7 +179,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
++info->tid;
put_task_struct((struct task_struct *)v);
- task = task_seq_get_next(info->common.ns, &info->tid, false);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
@@ -117,6 +223,41 @@ static void task_seq_stop(struct seq_file *seq, void *v)
put_task_struct((struct task_struct *)v);
}
+static int bpf_iter_attach_task(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ unsigned int flags;
+ struct pid *pid;
+ pid_t tgid;
+
+ if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
+ return -EINVAL;
+
+ aux->task.type = BPF_TASK_ITER_ALL;
+ if (linfo->task.tid != 0) {
+ aux->task.type = BPF_TASK_ITER_TID;
+ aux->task.pid = linfo->task.tid;
+ }
+ if (linfo->task.pid != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+ aux->task.pid = linfo->task.pid;
+ }
+ if (linfo->task.pid_fd != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+
+ pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ tgid = pid_nr_ns(pid, task_active_pid_ns(current));
+ aux->task.pid = tgid;
+ put_pid(pid);
+ }
+
+ return 0;
+}
+
static const struct seq_operations task_seq_ops = {
.start = task_seq_start,
.next = task_seq_next,
@@ -137,8 +278,7 @@ struct bpf_iter_seq_task_file_info {
static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
- struct pid_namespace *ns = info->common.ns;
- u32 curr_tid = info->tid;
+ u32 saved_tid = info->tid;
struct task_struct *curr_task;
unsigned int curr_fd = info->fd;
@@ -151,21 +291,18 @@ again:
curr_task = info->task;
curr_fd = info->fd;
} else {
- curr_task = task_seq_get_next(ns, &curr_tid, true);
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
if (!curr_task) {
info->task = NULL;
- info->tid = curr_tid;
return NULL;
}
- /* set info->task and info->tid */
+ /* set info->task */
info->task = curr_task;
- if (curr_tid == info->tid) {
+ if (saved_tid == info->tid)
curr_fd = info->fd;
- } else {
- info->tid = curr_tid;
+ else
curr_fd = 0;
- }
}
rcu_read_lock();
@@ -186,9 +323,15 @@ again:
/* the current task is done, go to the next task */
rcu_read_unlock();
put_task_struct(curr_task);
+
+ if (info->common.type == BPF_TASK_ITER_TID) {
+ info->task = NULL;
+ return NULL;
+ }
+
info->task = NULL;
info->fd = 0;
- curr_tid = ++(info->tid);
+ saved_tid = ++(info->tid);
goto again;
}
@@ -269,6 +412,9 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
struct bpf_iter_seq_task_common *common = priv_data;
common->ns = get_pid_ns(task_active_pid_ns(current));
+ common->type = aux->task.type;
+ common->pid = aux->task.pid;
+
return 0;
}
@@ -299,19 +445,18 @@ struct bpf_iter_seq_task_vma_info {
};
enum bpf_task_vma_iter_find_op {
- task_vma_iter_first_vma, /* use mm->mmap */
- task_vma_iter_next_vma, /* use curr_vma->vm_next */
+ task_vma_iter_first_vma, /* use find_vma() with addr 0 */
+ task_vma_iter_next_vma, /* use vma_next() with curr_vma */
task_vma_iter_find_vma, /* use find_vma() to find next vma */
};
static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
- struct pid_namespace *ns = info->common.ns;
enum bpf_task_vma_iter_find_op op;
struct vm_area_struct *curr_vma;
struct task_struct *curr_task;
- u32 curr_tid = info->tid;
+ u32 saved_tid = info->tid;
/* If this function returns a non-NULL vma, it holds a reference to
* the task_struct, and holds read lock on vma->mm->mmap_lock.
@@ -371,14 +516,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
}
} else {
again:
- curr_task = task_seq_get_next(ns, &curr_tid, true);
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
if (!curr_task) {
- info->tid = curr_tid + 1;
+ info->tid++;
goto finish;
}
- if (curr_tid != info->tid) {
- info->tid = curr_tid;
+ if (saved_tid != info->tid) {
/* new task, process the first vma */
op = task_vma_iter_first_vma;
} else {
@@ -400,10 +544,10 @@ again:
switch (op) {
case task_vma_iter_first_vma:
- curr_vma = curr_task->mm->mmap;
+ curr_vma = find_vma(curr_task->mm, 0);
break;
case task_vma_iter_next_vma:
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
break;
case task_vma_iter_find_vma:
/* We dropped mmap_lock so it is necessary to use find_vma
@@ -417,7 +561,7 @@ again:
if (curr_vma &&
curr_vma->vm_start == info->prev_vm_start &&
curr_vma->vm_end == info->prev_vm_end)
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
break;
}
if (!curr_vma) {
@@ -430,9 +574,12 @@ again:
return curr_vma;
next_task:
+ if (info->common.type == BPF_TASK_ITER_TID)
+ goto finish;
+
put_task_struct(curr_task);
info->task = NULL;
- curr_tid++;
+ info->tid++;
goto again;
finish:
@@ -531,8 +678,33 @@ static const struct bpf_iter_seq_info task_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
};
+static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
+{
+ switch (aux->task.type) {
+ case BPF_TASK_ITER_TID:
+ info->iter.task.tid = aux->task.pid;
+ break;
+ case BPF_TASK_ITER_TGID:
+ info->iter.task.pid = aux->task.pid;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
+{
+ seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
+ if (aux->task.type == BPF_TASK_ITER_TID)
+ seq_printf(seq, "tid:\t%u\n", aux->task.pid);
+ else if (aux->task.type == BPF_TASK_ITER_TGID)
+ seq_printf(seq, "pid:\t%u\n", aux->task.pid);
+}
+
static struct bpf_iter_reg task_reg_info = {
.target = "task",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 1,
.ctx_arg_info = {
@@ -540,6 +712,8 @@ static struct bpf_iter_reg task_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
static const struct bpf_iter_seq_info task_file_seq_info = {
@@ -551,6 +725,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = {
static struct bpf_iter_reg task_file_reg_info = {
.target = "task_file",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
@@ -560,6 +735,8 @@ static struct bpf_iter_reg task_file_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_file_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
static const struct bpf_iter_seq_info task_vma_seq_info = {
@@ -571,6 +748,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = {
static struct bpf_iter_reg task_vma_reg_info = {
.target = "task_vma",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
@@ -580,6 +758,8 @@ static struct bpf_iter_reg task_vma_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_vma_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index ff87e38af8a7..11f5ec0b8016 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -116,22 +116,6 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
-void *bpf_jit_alloc_exec_page(void)
-{
- void *image;
-
- image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!image)
- return NULL;
-
- set_vm_flush_reset_perms(image);
- /* Keep image as writeable. The alternative is to keep flipping ro/rw
- * every time new program is attached or detached.
- */
- set_memory_x((long)image, 1);
- return image;
-}
-
void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
{
ksym->start = (unsigned long) data;
@@ -404,9 +388,10 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
goto out_free_im;
err = -ENOMEM;
- im->image = image = bpf_jit_alloc_exec_page();
+ im->image = image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!image)
goto out_uncharge;
+ set_vm_flush_reset_perms(image);
err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
if (err)
@@ -483,6 +468,8 @@ again:
if (err < 0)
goto out;
+ set_memory_rox((long)im->image, 1);
+
WARN_ON(tr->cur_image && tr->selector == 0);
WARN_ON(!tr->cur_image && tr->selector);
if (tr->cur_image)
@@ -863,17 +850,6 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
return start;
}
-static void notrace inc_misses_counter(struct bpf_prog *prog)
-{
- struct bpf_prog_stats *stats;
- unsigned int flags;
-
- stats = this_cpu_ptr(prog->stats);
- flags = u64_stats_update_begin_irqsave(&stats->syncp);
- u64_stats_inc(&stats->misses);
- u64_stats_update_end_irqrestore(&stats->syncp, flags);
-}
-
/* The logic is similar to bpf_prog_run(), but with an explicit
* rcu_read_lock() and migrate_disable() which are required
* for the trampoline. The macro is split into
@@ -887,7 +863,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
* [2..MAX_U64] - execute bpf prog and record execution time.
* This is start time.
*/
-u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
+static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
rcu_read_lock();
@@ -895,8 +871,8 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *ru
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
- if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
- inc_misses_counter(prog);
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
return 0;
}
return bpf_prog_start_time();
@@ -924,19 +900,20 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
}
}
-void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx)
+static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- __this_cpu_dec(*(prog->active));
+ this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock();
}
-u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
- struct bpf_tramp_run_ctx *run_ctx)
+static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
/* Runtime stats are exported via actual BPF_LSM_CGROUP
@@ -950,8 +927,8 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
return NO_START_TIME;
}
-void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
- struct bpf_tramp_run_ctx *run_ctx)
+static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
@@ -960,14 +937,15 @@ void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
rcu_read_unlock();
}
-u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
+u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
{
rcu_read_lock_trace();
migrate_disable();
might_fault();
- if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
- inc_misses_counter(prog);
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
return 0;
}
@@ -976,17 +954,62 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r
return bpf_prog_start_time();
}
-void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
- struct bpf_tramp_run_ctx *run_ctx)
+void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ this_cpu_dec(*(prog->active));
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- __this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock_trace();
}
+static u64 notrace __bpf_prog_enter(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ migrate_disable();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ migrate_enable();
+ rcu_read_unlock();
+}
+
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
{
percpu_ref_get(&tr->pcref);
@@ -997,6 +1020,36 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
percpu_ref_put(&tr->pcref);
}
+bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->aux->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_enter_sleepable_recur :
+ __bpf_prog_enter_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_enter_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter;
+}
+
+bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->aux->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_exit_sleepable_recur :
+ __bpf_prog_exit_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_exit_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit;
+}
+
int __weak
arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3eadb14e090b..a5255a0dcbb6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23,6 +23,7 @@
#include <linux/error-injection.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
+#include <linux/poison.h>
#include "disasm.h"
@@ -261,7 +262,7 @@ struct bpf_call_arg_meta {
struct btf *ret_btf;
u32 ret_btf_id;
u32 subprogno;
- struct bpf_map_value_off_desc *kptr_off_desc;
+ struct btf_field *kptr_field;
u8 uninit_dynptr_regno;
};
@@ -370,6 +371,7 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
bpf_verifier_vlog(log, fmt, args);
va_end(args);
}
+EXPORT_SYMBOL_GPL(bpf_log);
static const char *ltrim(const char *s)
{
@@ -427,6 +429,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
+ type = base_type(type);
return type == PTR_TO_PACKET ||
type == PTR_TO_PACKET_META;
}
@@ -448,28 +451,34 @@ static bool reg_type_not_null(enum bpf_reg_type type)
type == PTR_TO_SOCK_COMMON;
}
-static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+static bool type_is_ptr_alloc_obj(u32 type)
{
- return reg->type == PTR_TO_MAP_VALUE &&
- map_value_has_spin_lock(reg->map_ptr);
+ return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
}
-static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
+static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
{
- return base_type(type) == PTR_TO_SOCKET ||
- base_type(type) == PTR_TO_TCP_SOCK ||
- base_type(type) == PTR_TO_MEM ||
- base_type(type) == PTR_TO_BTF_ID;
+ struct btf_record *rec = NULL;
+ struct btf_struct_meta *meta;
+
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ rec = reg->map_ptr->record;
+ } else if (type_is_ptr_alloc_obj(reg->type)) {
+ meta = btf_find_struct_meta(reg->btf, reg->btf_id);
+ if (meta)
+ rec = meta->record;
+ }
+ return rec;
}
-static bool type_is_rdonly_mem(u32 type)
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
- return type & MEM_RDONLY;
+ return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
}
-static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
+static bool type_is_rdonly_mem(u32 type)
{
- return type == ARG_PTR_TO_SOCK_COMMON;
+ return type & MEM_RDONLY;
}
static bool type_may_be_null(u32 type)
@@ -477,15 +486,6 @@ static bool type_may_be_null(u32 type)
return type & PTR_MAYBE_NULL;
}
-static bool may_be_acquire_function(enum bpf_func_id func_id)
-{
- return func_id == BPF_FUNC_sk_lookup_tcp ||
- func_id == BPF_FUNC_sk_lookup_udp ||
- func_id == BPF_FUNC_skc_lookup_tcp ||
- func_id == BPF_FUNC_map_lookup_elem ||
- func_id == BPF_FUNC_ringbuf_reserve;
-}
-
static bool is_acquire_function(enum bpf_func_id func_id,
const struct bpf_map *map)
{
@@ -518,6 +518,43 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
func_id == BPF_FUNC_skc_to_tcp_request_sock;
}
+static bool is_dynptr_ref_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_dynptr_data;
+}
+
+static bool is_callback_calling_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_for_each_map_elem ||
+ func_id == BPF_FUNC_timer_set_callback ||
+ func_id == BPF_FUNC_find_vma ||
+ func_id == BPF_FUNC_loop ||
+ func_id == BPF_FUNC_user_ringbuf_drain;
+}
+
+static bool is_storage_get_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_sk_storage_get ||
+ func_id == BPF_FUNC_inode_storage_get ||
+ func_id == BPF_FUNC_task_storage_get ||
+ func_id == BPF_FUNC_cgrp_storage_get;
+}
+
+static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
+ const struct bpf_map *map)
+{
+ int ref_obj_uses = 0;
+
+ if (is_ptr_cast_function(func_id))
+ ref_obj_uses++;
+ if (is_acquire_function(func_id, map))
+ ref_obj_uses++;
+ if (is_dynptr_ref_function(func_id))
+ ref_obj_uses++;
+
+ return ref_obj_uses > 1;
+}
+
static bool is_cmpxchg_insn(const struct bpf_insn *insn)
{
return BPF_CLASS(insn->code) == BPF_STX &&
@@ -533,7 +570,7 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
static const char *reg_type_str(struct bpf_verifier_env *env,
enum bpf_reg_type type)
{
- char postfix[16] = {0}, prefix[32] = {0};
+ char postfix[16] = {0}, prefix[64] = {0};
static const char * const str[] = {
[NOT_INIT] = "?",
[SCALAR_VALUE] = "scalar",
@@ -555,6 +592,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
[PTR_TO_BUF] = "buf",
[PTR_TO_FUNC] = "func",
[PTR_TO_MAP_KEY] = "map_key",
+ [CONST_PTR_TO_DYNPTR] = "dynptr_ptr",
};
if (type & PTR_MAYBE_NULL) {
@@ -564,16 +602,15 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
strncpy(postfix, "_or_null", 16);
}
- if (type & MEM_RDONLY)
- strncpy(prefix, "rdonly_", 32);
- if (type & MEM_ALLOC)
- strncpy(prefix, "alloc_", 32);
- if (type & MEM_USER)
- strncpy(prefix, "user_", 32);
- if (type & MEM_PERCPU)
- strncpy(prefix, "percpu_", 32);
- if (type & PTR_UNTRUSTED)
- strncpy(prefix, "untrusted_", 32);
+ snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
+ type & MEM_RDONLY ? "rdonly_" : "",
+ type & MEM_RINGBUF ? "ringbuf_" : "",
+ type & MEM_USER ? "user_" : "",
+ type & MEM_PERCPU ? "percpu_" : "",
+ type & MEM_RCU ? "rcu_" : "",
+ type & PTR_UNTRUSTED ? "untrusted_" : "",
+ type & PTR_TRUSTED ? "trusted_" : ""
+ );
snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
prefix, str[base_type(type)], postfix);
@@ -688,6 +725,28 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
return type == BPF_DYNPTR_TYPE_RINGBUF;
}
+static void __mark_dynptr_reg(struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type,
+ bool first_slot);
+
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1,
+ struct bpf_reg_state *sreg2,
+ enum bpf_dynptr_type type)
+{
+ __mark_dynptr_reg(sreg1, type, true);
+ __mark_dynptr_reg(sreg2, type, false);
+}
+
+static void mark_dynptr_cb_reg(struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type)
+{
+ __mark_dynptr_reg(reg, type, true);
+}
+
+
static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
enum bpf_arg_type arg_type, int insn_idx)
{
@@ -709,9 +768,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (type == BPF_DYNPTR_TYPE_INVALID)
return -EINVAL;
- state->stack[spi].spilled_ptr.dynptr.first_slot = true;
- state->stack[spi].spilled_ptr.dynptr.type = type;
- state->stack[spi - 1].spilled_ptr.dynptr.type = type;
+ mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr,
+ &state->stack[spi - 1].spilled_ptr, type);
if (dynptr_type_refcounted(type)) {
/* The id is used to track proper releasing */
@@ -719,8 +777,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (id < 0)
return id;
- state->stack[spi].spilled_ptr.id = id;
- state->stack[spi - 1].spilled_ptr.id = id;
+ state->stack[spi].spilled_ptr.ref_obj_id = id;
+ state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
}
return 0;
@@ -742,25 +800,23 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
}
/* Invalidate any slices associated with this dynptr */
- if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
- release_reference(env, state->stack[spi].spilled_ptr.id);
- state->stack[spi].spilled_ptr.id = 0;
- state->stack[spi - 1].spilled_ptr.id = 0;
- }
-
- state->stack[spi].spilled_ptr.dynptr.first_slot = false;
- state->stack[spi].spilled_ptr.dynptr.type = 0;
- state->stack[spi - 1].spilled_ptr.dynptr.type = 0;
+ if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type))
+ WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id));
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
return 0;
}
static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
- int spi = get_spi(reg->off);
- int i;
+ int spi, i;
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return false;
+
+ spi = get_spi(reg->off);
if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
return true;
@@ -773,13 +829,17 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_
return true;
}
-static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- enum bpf_arg_type arg_type)
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
- int spi = get_spi(reg->off);
+ int spi;
int i;
+ /* This already represents first slot of initialized bpf_dynptr */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return true;
+
+ spi = get_spi(reg->off);
if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
!state->stack[spi].spilled_ptr.dynptr.first_slot)
return false;
@@ -790,11 +850,27 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_re
return false;
}
+ return true;
+}
+
+static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type)
+{
+ struct bpf_func_state *state = func(env, reg);
+ enum bpf_dynptr_type dynptr_type;
+ int spi;
+
/* ARG_PTR_TO_DYNPTR takes any type of dynptr */
if (arg_type == ARG_PTR_TO_DYNPTR)
return true;
- return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type);
+ dynptr_type = arg_to_dynptr_type(arg_type);
+ if (reg->type == CONST_PTR_TO_DYNPTR) {
+ return reg->dynptr.type == dynptr_type;
+ } else {
+ spi = get_spi(reg->off);
+ return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
+ }
}
/* The reg state of a pointer or a bounded scalar was saved when
@@ -853,7 +929,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (reg->id)
verbose_a("id=%d", reg->id);
- if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id)
+ if (reg->ref_obj_id)
verbose_a("ref_obj_id=%d", reg->ref_obj_id);
if (t != SCALAR_VALUE)
verbose_a("off=%d", reg->off);
@@ -986,9 +1062,9 @@ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
- if (ksize(dst) < bytes) {
+ if (ksize(dst) < ksize(src)) {
kfree(dst);
- dst = kmalloc_track_caller(bytes, flags);
+ dst = kmalloc_track_caller(kmalloc_size_roundup(bytes), flags);
if (!dst)
return NULL;
}
@@ -1005,12 +1081,19 @@ out:
*/
static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)
{
+ size_t alloc_size;
+ void *new_arr;
+
if (!new_n || old_n == new_n)
goto out;
- arr = krealloc_array(arr, new_n, size, GFP_KERNEL);
- if (!arr)
+ alloc_size = kmalloc_size_roundup(size_mul(new_n, size));
+ new_arr = krealloc(arr, alloc_size, GFP_KERNEL);
+ if (!new_arr) {
+ kfree(arr);
return NULL;
+ }
+ arr = new_arr;
if (new_n > old_n)
memset(arr + old_n * size, 0, (new_n - old_n) * size);
@@ -1086,6 +1169,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
id = ++env->id_gen;
state->refs[new_ofs].id = id;
state->refs[new_ofs].insn_idx = insn_idx;
+ state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0;
return id;
}
@@ -1098,6 +1182,9 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
last_idx = state->acquired_refs - 1;
for (i = 0; i < state->acquired_refs; i++) {
if (state->refs[i].id == ptr_id) {
+ /* Cannot release caller references in callbacks */
+ if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ return -EINVAL;
if (last_idx && i != last_idx)
memcpy(&state->refs[i], &state->refs[last_idx],
sizeof(*state->refs));
@@ -1173,8 +1260,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->frame[i] = NULL;
}
dst_state->speculative = src->speculative;
+ dst_state->active_rcu_lock = src->active_rcu_lock;
dst_state->curframe = src->curframe;
- dst_state->active_spin_lock = src->active_spin_lock;
+ dst_state->active_lock.ptr = src->active_lock.ptr;
+ dst_state->active_lock.id = src->active_lock.id;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
@@ -1293,9 +1382,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void __mark_reg_not_init(const struct bpf_verifier_env *env,
- struct bpf_reg_state *reg);
-
/* This helper doesn't clear reg->id */
static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
@@ -1358,6 +1444,19 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
__mark_reg_known_zero(regs + regno);
}
+static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
+ bool first_slot)
+{
+ /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
+ * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
+ * set it unconditionally as it is ignored for STACK_DYNPTR anyway.
+ */
+ __mark_reg_known_zero(reg);
+ reg->type = CONST_PTR_TO_DYNPTR;
+ reg->dynptr.type = type;
+ reg->dynptr.first_slot = first_slot;
+}
+
static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
{
if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
@@ -1369,7 +1468,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
/* transfer reg's id which is unique for every map_lookup_elem
* as UID of the inner map.
*/
- if (map_value_has_timer(map->inner_map_meta))
+ if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
@@ -1658,7 +1757,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
reg->type = SCALAR_VALUE;
reg->var_off = tnum_unknown;
reg->frameno = 0;
- reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
+ reg->precise = !env->bpf_capable;
__mark_reg_unbounded(reg);
}
@@ -1739,6 +1838,7 @@ static void init_func_state(struct bpf_verifier_env *env,
state->callsite = callsite;
state->frameno = frameno;
state->subprogno = subprogno;
+ state->callback_ret_range = tnum_range(0, 0);
init_reg_state(env, state);
mark_verifier_state_scratched(env);
}
@@ -2466,15 +2566,30 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].jmp_point = true;
+}
+
+static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].jmp_point;
+}
+
/* for any branch, call, exit record the history of jmps in the given state */
static int push_jmp_history(struct bpf_verifier_env *env,
struct bpf_verifier_state *cur)
{
u32 cnt = cur->jmp_history_cnt;
struct bpf_idx_pair *p;
+ size_t alloc_size;
+
+ if (!is_jmp_point(env, env->insn_idx))
+ return 0;
cnt++;
- p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
+ alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
+ p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
if (!p)
return -ENOMEM;
p[cnt - 1].idx = env->insn_idx;
@@ -2626,6 +2741,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (opcode == BPF_CALL) {
if (insn->src_reg == BPF_PSEUDO_CALL)
return -ENOTSUPP;
+ /* BPF helpers that invoke callback subprogs are
+ * equivalent to BPF_PSEUDO_CALL above
+ */
+ if (insn->src_reg == 0 && is_callback_calling_function(insn->imm))
+ return -ENOTSUPP;
/* regular helper call sets R0 */
*reg_mask &= ~1;
if (*reg_mask & 0x3f) {
@@ -2715,8 +2835,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
/* big hammer: mark all scalars precise in this path.
* pop_stack may still get !precise scalars.
+ * We also skip current state and go straight to first parent state,
+ * because precision markings in current non-checkpointed state are
+ * not needed. See why in the comment in __mark_chain_precision below.
*/
- for (; st; st = st->parent)
+ for (st = st->parent; st; st = st->parent) {
for (i = 0; i <= st->curframe; i++) {
func = st->frame[i];
for (j = 0; j < BPF_REG_FP; j++) {
@@ -2734,9 +2857,122 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
reg->precise = true;
}
}
+ }
}
-static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
+static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ }
+}
+
+/*
+ * __mark_chain_precision() backtracks BPF program instruction sequence and
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
+ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
+ * SCALARS, as well as any other registers and slots that contribute to
+ * a tracked state of given registers/stack slots, depending on specific BPF
+ * assembly instructions (see backtrack_insns() for exact instruction handling
+ * logic). This backtracking relies on recorded jmp_history and is able to
+ * traverse entire chain of parent states. This process ends only when all the
+ * necessary registers/slots and their transitive dependencies are marked as
+ * precise.
+ *
+ * One important and subtle aspect is that precise marks *do not matter* in
+ * the currently verified state (current state). It is important to understand
+ * why this is the case.
+ *
+ * First, note that current state is the state that is not yet "checkpointed",
+ * i.e., it is not yet put into env->explored_states, and it has no children
+ * states as well. It's ephemeral, and can end up either a) being discarded if
+ * compatible explored state is found at some point or BPF_EXIT instruction is
+ * reached or b) checkpointed and put into env->explored_states, branching out
+ * into one or more children states.
+ *
+ * In the former case, precise markings in current state are completely
+ * ignored by state comparison code (see regsafe() for details). Only
+ * checkpointed ("old") state precise markings are important, and if old
+ * state's register/slot is precise, regsafe() assumes current state's
+ * register/slot as precise and checks value ranges exactly and precisely. If
+ * states turn out to be compatible, current state's necessary precise
+ * markings and any required parent states' precise markings are enforced
+ * after the fact with propagate_precision() logic, after the fact. But it's
+ * important to realize that in this case, even after marking current state
+ * registers/slots as precise, we immediately discard current state. So what
+ * actually matters is any of the precise markings propagated into current
+ * state's parent states, which are always checkpointed (due to b) case above).
+ * As such, for scenario a) it doesn't matter if current state has precise
+ * markings set or not.
+ *
+ * Now, for the scenario b), checkpointing and forking into child(ren)
+ * state(s). Note that before current state gets to checkpointing step, any
+ * processed instruction always assumes precise SCALAR register/slot
+ * knowledge: if precise value or range is useful to prune jump branch, BPF
+ * verifier takes this opportunity enthusiastically. Similarly, when
+ * register's value is used to calculate offset or memory address, exact
+ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
+ * what we mentioned above about state comparison ignoring precise markings
+ * during state comparison, BPF verifier ignores and also assumes precise
+ * markings *at will* during instruction verification process. But as verifier
+ * assumes precision, it also propagates any precision dependencies across
+ * parent states, which are not yet finalized, so can be further restricted
+ * based on new knowledge gained from restrictions enforced by their children
+ * states. This is so that once those parent states are finalized, i.e., when
+ * they have no more active children state, state comparison logic in
+ * is_state_visited() would enforce strict and precise SCALAR ranges, if
+ * required for correctness.
+ *
+ * To build a bit more intuition, note also that once a state is checkpointed,
+ * the path we took to get to that state is not important. This is crucial
+ * property for state pruning. When state is checkpointed and finalized at
+ * some instruction index, it can be correctly and safely used to "short
+ * circuit" any *compatible* state that reaches exactly the same instruction
+ * index. I.e., if we jumped to that instruction from a completely different
+ * code path than original finalized state was derived from, it doesn't
+ * matter, current state can be discarded because from that instruction
+ * forward having a compatible state will ensure we will safely reach the
+ * exit. States describe preconditions for further exploration, but completely
+ * forget the history of how we got here.
+ *
+ * This also means that even if we needed precise SCALAR range to get to
+ * finalized state, but from that point forward *that same* SCALAR register is
+ * never used in a precise context (i.e., it's precise value is not needed for
+ * correctness), it's correct and safe to mark such register as "imprecise"
+ * (i.e., precise marking set to false). This is what we rely on when we do
+ * not set precise marking in current state. If no child state requires
+ * precision for any given SCALAR register, it's safe to dictate that it can
+ * be imprecise. If any child state does require this register to be precise,
+ * we'll mark it precise later retroactively during precise markings
+ * propagation from child state to parent states.
+ *
+ * Skipping precise marking setting in current state is a mild version of
+ * relying on the above observation. But we can utilize this property even
+ * more aggressively by proactively forgetting any precise marking in the
+ * current state (which we inherited from the parent state), right before we
+ * checkpoint it and branch off into new child state. This is done by
+ * mark_all_scalars_imprecise() to hopefully get more permissive and generic
+ * finalized states which help in short circuiting more future states.
+ */
+static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
int spi)
{
struct bpf_verifier_state *st = env->cur_state;
@@ -2753,18 +2989,18 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
if (!env->bpf_capable)
return 0;
- func = st->frame[st->curframe];
+ /* Do sanity checks against current state of register and/or stack
+ * slot, but don't set precise flag in current state, as precision
+ * tracking in the current state is unnecessary.
+ */
+ func = st->frame[frame];
if (regno >= 0) {
reg = &func->regs[regno];
if (reg->type != SCALAR_VALUE) {
WARN_ONCE(1, "backtracing misuse");
return -EFAULT;
}
- if (!reg->precise)
- new_marks = true;
- else
- reg_mask = 0;
- reg->precise = true;
+ new_marks = true;
}
while (spi >= 0) {
@@ -2777,11 +3013,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
stack_mask = 0;
break;
}
- if (!reg->precise)
- new_marks = true;
- else
- stack_mask = 0;
- reg->precise = true;
+ new_marks = true;
break;
}
@@ -2789,12 +3021,42 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
return 0;
if (!reg_mask && !stack_mask)
return 0;
+
for (;;) {
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;
if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
+
+ if (last_idx < 0) {
+ /* we are at the entry into subprog, which
+ * is expected for global funcs, but only if
+ * requested precise registers are R1-R5
+ * (which are global func's input arguments)
+ */
+ if (st->curframe == 0 &&
+ st->frame[0]->subprogno > 0 &&
+ st->frame[0]->callsite == BPF_MAIN_FUNC &&
+ stack_mask == 0 && (reg_mask & ~0x3e) == 0) {
+ bitmap_from_u64(mask, reg_mask);
+ for_each_set_bit(i, mask, 32) {
+ reg = &st->frame[0]->regs[i];
+ if (reg->type != SCALAR_VALUE) {
+ reg_mask &= ~(1u << i);
+ continue;
+ }
+ reg->precise = true;
+ }
+ return 0;
+ }
+
+ verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n",
+ st->frame[0]->subprogno, reg_mask, stack_mask);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+
for (i = last_idx;;) {
if (skip_first) {
err = 0;
@@ -2834,7 +3096,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
break;
new_marks = false;
- func = st->frame[st->curframe];
+ func = st->frame[frame];
bitmap_from_u64(mask, reg_mask);
for_each_set_bit(i, mask, 32) {
reg = &func->regs[i];
@@ -2898,14 +3160,19 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
return 0;
}
-static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
- return __mark_chain_precision(env, regno, -1);
+ return __mark_chain_precision(env, env->cur_state->curframe, regno, -1);
}
-static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
+static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno)
{
- return __mark_chain_precision(env, -1, spi);
+ return __mark_chain_precision(env, frame, regno, -1);
+}
+
+static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi)
+{
+ return __mark_chain_precision(env, frame, -1, spi);
}
static bool is_spillable_regtype(enum bpf_reg_type type)
@@ -3154,14 +3421,17 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
mark_stack_slot_scratched(env, spi);
- if (!env->allow_ptr_leaks
- && *stype != NOT_INIT
- && *stype != SCALAR_VALUE) {
- /* Reject the write if there's are spilled pointers in
- * range. If we didn't reject here, the ptr status
- * would be erased below (even though not all slots are
- * actually overwritten), possibly opening the door to
- * leaks.
+ if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) {
+ /* Reject the write if range we may write to has not
+ * been initialized beforehand. If we didn't reject
+ * here, the ptr status would be erased below (even
+ * though not all slots are actually overwritten),
+ * possibly opening the door to leaks.
+ *
+ * We do however catch STACK_INVALID case below, and
+ * only allow reading possibly uninitialized memory
+ * later for CAP_PERFMON, as the write may not happen to
+ * that slot.
*/
verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
insn_idx, i);
@@ -3651,15 +3921,15 @@ int check_ptr_off_reg(struct bpf_verifier_env *env,
}
static int map_kptr_match_type(struct bpf_verifier_env *env,
- struct bpf_map_value_off_desc *off_desc,
+ struct btf_field *kptr_field,
struct bpf_reg_state *reg, u32 regno)
{
- const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id);
- int perm_flags = PTR_MAYBE_NULL;
+ const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
+ int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED;
const char *reg_name = "";
/* Only unreferenced case accepts untrusted pointers */
- if (off_desc->type == BPF_KPTR_UNREF)
+ if (kptr_field->type == BPF_KPTR_UNREF)
perm_flags |= PTR_UNTRUSTED;
if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
@@ -3706,15 +3976,15 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
* strict mode to true for type match.
*/
if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
- off_desc->kptr.btf, off_desc->kptr.btf_id,
- off_desc->type == BPF_KPTR_REF))
+ kptr_field->kptr.btf, kptr_field->kptr.btf_id,
+ kptr_field->type == BPF_KPTR_REF))
goto bad_type;
return 0;
bad_type:
verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
reg_type_str(env, reg->type), reg_name);
verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
- if (off_desc->type == BPF_KPTR_UNREF)
+ if (kptr_field->type == BPF_KPTR_UNREF)
verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED),
targ_name);
else
@@ -3724,7 +3994,7 @@ bad_type:
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
int value_regno, int insn_idx,
- struct bpf_map_value_off_desc *off_desc)
+ struct btf_field *kptr_field)
{
struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
int class = BPF_CLASS(insn->code);
@@ -3734,7 +4004,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
* - Reject cases where variable offset may touch kptr
* - size of access (must be BPF_DW)
* - tnum_is_const(reg->var_off)
- * - off_desc->offset == off + reg->var_off.value
+ * - kptr_field->offset == off + reg->var_off.value
*/
/* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
if (BPF_MODE(insn->code) != BPF_MEM) {
@@ -3745,7 +4015,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
/* We only allow loading referenced kptr, since it will be marked as
* untrusted, similar to unreferenced kptr.
*/
- if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) {
+ if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) {
verbose(env, "store to referenced kptr disallowed\n");
return -EACCES;
}
@@ -3755,19 +4025,19 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
/* We can simply mark the value_regno receiving the pointer
* value from map as PTR_TO_BTF_ID, with the correct type.
*/
- mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf,
- off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
+ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
+ kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
/* For mark_ptr_or_null_reg */
val_reg->id = ++env->id_gen;
} else if (class == BPF_STX) {
val_reg = reg_state(env, value_regno);
if (!register_is_null(val_reg) &&
- map_kptr_match_type(env, off_desc, val_reg, value_regno))
+ map_kptr_match_type(env, kptr_field, val_reg, value_regno))
return -EACCES;
} else if (class == BPF_ST) {
if (insn->imm) {
verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n",
- off_desc->offset);
+ kptr_field->offset);
return -EACCES;
}
} else {
@@ -3786,45 +4056,30 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *reg = &state->regs[regno];
struct bpf_map *map = reg->map_ptr;
- int err;
+ struct btf_record *rec;
+ int err, i;
err = check_mem_region_access(env, regno, off, size, map->value_size,
zero_size_allowed);
if (err)
return err;
- if (map_value_has_spin_lock(map)) {
- u32 lock = map->spin_lock_off;
+ if (IS_ERR_OR_NULL(map->record))
+ return 0;
+ rec = map->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 p = field->offset;
- /* if any part of struct bpf_spin_lock can be touched by
- * load/store reject this program.
- * To check that [x1, x2) overlaps with [y1, y2)
+ /* If any part of a field can be touched by load/store, reject
+ * this program. To check that [x1, x2) overlaps with [y1, y2),
* it is sufficient to check x1 < y2 && y1 < x2.
*/
- if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
- lock < reg->umax_value + off + size) {
- verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
- return -EACCES;
- }
- }
- if (map_value_has_timer(map)) {
- u32 t = map->timer_off;
-
- if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
- t < reg->umax_value + off + size) {
- verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
- return -EACCES;
- }
- }
- if (map_value_has_kptrs(map)) {
- struct bpf_map_value_off *tab = map->kptr_off_tab;
- int i;
-
- for (i = 0; i < tab->nr_off; i++) {
- u32 p = tab->off[i].offset;
-
- if (reg->smin_value + off < p + sizeof(u64) &&
- p < reg->umax_value + off + size) {
+ if (reg->smin_value + off < p + btf_field_type_size(field->type) &&
+ p < reg->umax_value + off + size) {
+ switch (field->type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
if (src != ACCESS_DIRECT) {
verbose(env, "kptr cannot be accessed indirectly by helper\n");
return -EACCES;
@@ -3843,10 +4098,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
break;
+ default:
+ verbose(env, "%s cannot be accessed directly by load/store\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
}
}
}
- return err;
+ return 0;
}
#define MAX_PACKET_OFF 0xffff
@@ -4063,6 +4322,30 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
return reg->type == PTR_TO_FLOW_KEYS;
}
+static bool is_trusted_reg(const struct bpf_reg_state *reg)
+{
+ /* A referenced register is always trusted. */
+ if (reg->ref_obj_id)
+ return true;
+
+ /* If a register is not referenced, it is trusted if it has the
+ * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the
+ * other type modifiers may be safe, but we elect to take an opt-in
+ * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are
+ * not.
+ *
+ * Eventually, we should make PTR_TRUSTED the single source of truth
+ * for whether a register is trusted.
+ */
+ return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS &&
+ !bpf_type_has_unsafe_modifiers(reg->type);
+}
+
+static bool is_rcu_reg(const struct bpf_reg_state *reg)
+{
+ return reg->type & MEM_RCU;
+}
+
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size, bool strict)
@@ -4479,6 +4762,18 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
u32 btf_id;
int ret;
+ if (!env->allow_ptr_leaks) {
+ verbose(env,
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ tname);
+ return -EPERM;
+ }
+ if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) {
+ verbose(env,
+ "Cannot access kernel 'struct %s' from non-GPL compatible program\n",
+ tname);
+ return -EINVAL;
+ }
if (off < 0) {
verbose(env,
"R%d is ptr_%s invalid negative access: off=%d\n",
@@ -4509,17 +4804,28 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
- if (env->ops->btf_struct_access) {
- ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
- off, size, atype, &btf_id, &flag);
+ if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) {
+ if (!btf_is_kernel(reg->btf)) {
+ verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
+ return -EFAULT;
+ }
+ ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
} else {
- if (atype != BPF_READ) {
+ /* Writes are permitted with default btf_struct_access for
+ * program allocated objects (which always have ref_obj_id > 0),
+ * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
+ */
+ if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
verbose(env, "only read is supported\n");
return -EACCES;
}
- ret = btf_struct_access(&env->log, reg->btf, t, off, size,
- atype, &btf_id, &flag);
+ if (type_is_alloc(reg->type) && !reg->ref_obj_id) {
+ verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
+ return -EFAULT;
+ }
+
+ ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
}
if (ret < 0)
@@ -4531,6 +4837,30 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
if (type_flag(reg->type) & PTR_UNTRUSTED)
flag |= PTR_UNTRUSTED;
+ /* By default any pointer obtained from walking a trusted pointer is
+ * no longer trusted except the rcu case below.
+ */
+ flag &= ~PTR_TRUSTED;
+
+ if (flag & MEM_RCU) {
+ /* Mark value register as MEM_RCU only if it is protected by
+ * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU
+ * itself can already indicate trustedness inside the rcu
+ * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since
+ * it could be null in some cases.
+ */
+ if (!env->cur_state->active_rcu_lock ||
+ !(is_trusted_reg(reg) || is_rcu_reg(reg)))
+ flag &= ~MEM_RCU;
+ else
+ flag |= PTR_MAYBE_NULL;
+ } else if (reg->type & MEM_RCU) {
+ /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged
+ * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively.
+ */
+ flag |= PTR_UNTRUSTED;
+ }
+
if (atype == BPF_READ && value_regno >= 0)
mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
@@ -4545,6 +4875,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
{
struct bpf_reg_state *reg = regs + regno;
struct bpf_map *map = reg->map_ptr;
+ struct bpf_reg_state map_reg;
enum bpf_type_flag flag = 0;
const struct btf_type *t;
const char *tname;
@@ -4565,9 +4896,9 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
tname = btf_name_by_offset(btf_vmlinux, t->name_off);
- if (!env->allow_ptr_to_map_access) {
+ if (!env->allow_ptr_leaks) {
verbose(env,
- "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
tname);
return -EPERM;
}
@@ -4583,7 +4914,10 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
return -EACCES;
}
- ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag);
+ /* Simulate access to a PTR_TO_BTF_ID */
+ memset(&map_reg, 0, sizeof(map_reg));
+ mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
+ ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag);
if (ret < 0)
return ret;
@@ -4719,7 +5053,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_MAP_VALUE) {
- struct bpf_map_value_off_desc *kptr_off_desc = NULL;
+ struct btf_field *kptr_field = NULL;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
@@ -4733,11 +5067,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err)
return err;
if (tnum_is_const(reg->var_off))
- kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr,
- off + reg->var_off.value);
- if (kptr_off_desc) {
- err = check_map_kptr_access(env, regno, value_regno, insn_idx,
- kptr_off_desc);
+ kptr_field = btf_record_find(reg->map_ptr->record,
+ off + reg->var_off.value, BPF_KPTR);
+ if (kptr_field) {
+ err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
} else if (t == BPF_READ && value_regno >= 0) {
struct bpf_map *map = reg->map_ptr;
@@ -5128,10 +5461,6 @@ static int check_stack_range_initialized(
}
if (is_spilled_reg(&state->stack[spi]) &&
- base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID)
- goto mark;
-
- if (is_spilled_reg(&state->stack[spi]) &&
(state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
env->allow_ptr_leaks)) {
if (clobber) {
@@ -5161,6 +5490,11 @@ mark:
mark_reg_read(env, &state->stack[spi].spilled_ptr,
state->stack[spi].spilled_ptr.parent,
REG_LIVE_READ64);
+ /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not
+ * be sure that whether stack slot is written to or not. Hence,
+ * we must still conservatively propagate reads upwards even if
+ * helper may write to the entire memory range.
+ */
}
return update_stack_depth(env, state, min_off);
}
@@ -5223,6 +5557,25 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
env,
regno, reg->off, access_size,
zero_size_allowed, ACCESS_HELPER, meta);
+ case PTR_TO_CTX:
+ /* in case the function doesn't know how to access the context,
+ * (because we are in a program of type SYSCALL for example), we
+ * can not statically check its size.
+ * Dynamically check it now.
+ */
+ if (!env->ops->convert_ctx_access) {
+ enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ;
+ int offset = access_size - 1;
+
+ /* Allow zero-byte read from PTR_TO_CTX */
+ if (access_size == 0)
+ return zero_size_allowed ? 0 : -EACCES;
+
+ return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
+ atype, -1, false);
+ }
+
+ fallthrough;
default: /* scalar_value or invalid ptr */
/* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
@@ -5323,8 +5676,8 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
return err;
}
-int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- u32 regno)
+static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno)
{
struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
bool may_be_null = type_may_be_null(mem_reg->type);
@@ -5352,23 +5705,26 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state
}
/* Implementation details:
- * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL.
+ * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL.
* Two bpf_map_lookups (even with the same key) will have different reg->id.
- * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
- * value_or_null->value transition, since the verifier only cares about
- * the range of access to valid map value pointer and doesn't care about actual
- * address of the map element.
+ * Two separate bpf_obj_new will also have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier
+ * clears reg->id after value_or_null->value transition, since the verifier only
+ * cares about the range of access to valid map value pointer and doesn't care
+ * about actual address of the map element.
* For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
* reg->id > 0 after value_or_null->value transition. By doing so
* two bpf_map_lookups will be considered two different pointers that
- * point to different bpf_spin_locks.
+ * point to different bpf_spin_locks. Likewise for pointers to allocated objects
+ * returned from bpf_obj_new.
* The verifier allows taking only one bpf_spin_lock at a time to avoid
* dead-locks.
* Since only one bpf_spin_lock is allowed the checks are simpler than
* reg_is_refcounted() logic. The verifier needs to remember only
* one spin_lock instead of array of acquired_refs.
- * cur_state->active_spin_lock remembers which map value element got locked
- * and clears it after bpf_spin_unlock.
+ * cur_state->active_lock remembers which map value element or allocated
+ * object got locked and clears it after bpf_spin_unlock.
*/
static int process_spin_lock(struct bpf_verifier_env *env, int regno,
bool is_lock)
@@ -5376,8 +5732,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
struct bpf_verifier_state *cur = env->cur_state;
bool is_const = tnum_is_const(reg->var_off);
- struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
+ struct bpf_map *map = NULL;
+ struct btf *btf = NULL;
+ struct btf_record *rec;
if (!is_const) {
verbose(env,
@@ -5385,49 +5743,78 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
regno);
return -EINVAL;
}
- if (!map->btf) {
- verbose(env,
- "map '%s' has to have BTF in order to use bpf_spin_lock\n",
- map->name);
- return -EINVAL;
- }
- if (!map_value_has_spin_lock(map)) {
- if (map->spin_lock_off == -E2BIG)
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ map = reg->map_ptr;
+ if (!map->btf) {
verbose(env,
- "map '%s' has more than one 'struct bpf_spin_lock'\n",
- map->name);
- else if (map->spin_lock_off == -ENOENT)
- verbose(env,
- "map '%s' doesn't have 'struct bpf_spin_lock'\n",
- map->name);
- else
- verbose(env,
- "map '%s' is not a struct type or bpf_spin_lock is mangled\n",
+ "map '%s' has to have BTF in order to use bpf_spin_lock\n",
map->name);
+ return -EINVAL;
+ }
+ } else {
+ btf = reg->btf;
+ }
+
+ rec = reg_btf_record(reg);
+ if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) {
+ verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local",
+ map ? map->name : "kptr");
return -EINVAL;
}
- if (map->spin_lock_off != val + reg->off) {
- verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
- val + reg->off);
+ if (rec->spin_lock_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n",
+ val + reg->off, rec->spin_lock_off);
return -EINVAL;
}
if (is_lock) {
- if (cur->active_spin_lock) {
+ if (cur->active_lock.ptr) {
verbose(env,
"Locking two bpf_spin_locks are not allowed\n");
return -EINVAL;
}
- cur->active_spin_lock = reg->id;
+ if (map)
+ cur->active_lock.ptr = map;
+ else
+ cur->active_lock.ptr = btf;
+ cur->active_lock.id = reg->id;
} else {
- if (!cur->active_spin_lock) {
+ struct bpf_func_state *fstate = cur_func(env);
+ void *ptr;
+ int i;
+
+ if (map)
+ ptr = map;
+ else
+ ptr = btf;
+
+ if (!cur->active_lock.ptr) {
verbose(env, "bpf_spin_unlock without taking a lock\n");
return -EINVAL;
}
- if (cur->active_spin_lock != reg->id) {
+ if (cur->active_lock.ptr != ptr ||
+ cur->active_lock.id != reg->id) {
verbose(env, "bpf_spin_unlock of different lock\n");
return -EINVAL;
}
- cur->active_spin_lock = 0;
+ cur->active_lock.ptr = NULL;
+ cur->active_lock.id = 0;
+
+ for (i = fstate->acquired_refs - 1; i >= 0; i--) {
+ int err;
+
+ /* Complain on error because this reference state cannot
+ * be freed before this point, as bpf_spin_lock critical
+ * section does not allow functions that release the
+ * allocated object immediately.
+ */
+ if (!fstate->refs[i].release_on_unlock)
+ continue;
+ err = release_reference(env, fstate->refs[i].id);
+ if (err) {
+ verbose(env, "failed to release release_on_unlock reference");
+ return err;
+ }
+ }
}
return 0;
}
@@ -5451,24 +5838,13 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
map->name);
return -EINVAL;
}
- if (!map_value_has_timer(map)) {
- if (map->timer_off == -E2BIG)
- verbose(env,
- "map '%s' has more than one 'struct bpf_timer'\n",
- map->name);
- else if (map->timer_off == -ENOENT)
- verbose(env,
- "map '%s' doesn't have 'struct bpf_timer'\n",
- map->name);
- else
- verbose(env,
- "map '%s' is not a struct type or bpf_timer is mangled\n",
- map->name);
+ if (!btf_record_has_field(map->record, BPF_TIMER)) {
+ verbose(env, "map '%s' has no valid bpf_timer\n", map->name);
return -EINVAL;
}
- if (map->timer_off != val + reg->off) {
+ if (map->record->timer_off != val + reg->off) {
verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
- val + reg->off, map->timer_off);
+ val + reg->off, map->record->timer_off);
return -EINVAL;
}
if (meta->map_ptr) {
@@ -5484,10 +5860,9 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- struct bpf_map_value_off_desc *off_desc;
struct bpf_map *map_ptr = reg->map_ptr;
+ struct btf_field *kptr_field;
u32 kptr_off;
- int ret;
if (!tnum_is_const(reg->var_off)) {
verbose(env,
@@ -5500,30 +5875,136 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
map_ptr->name);
return -EINVAL;
}
- if (!map_value_has_kptrs(map_ptr)) {
- ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab);
- if (ret == -E2BIG)
- verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name,
- BPF_MAP_VALUE_OFF_MAX);
- else if (ret == -EEXIST)
- verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name);
- else
- verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
+ if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) {
+ verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
return -EINVAL;
}
meta->map_ptr = map_ptr;
kptr_off = reg->off + reg->var_off.value;
- off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off);
- if (!off_desc) {
+ kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR);
+ if (!kptr_field) {
verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
return -EACCES;
}
- if (off_desc->type != BPF_KPTR_REF) {
+ if (kptr_field->type != BPF_KPTR_REF) {
verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
return -EACCES;
}
- meta->kptr_off_desc = off_desc;
+ meta->kptr_field = kptr_field;
+ return 0;
+}
+
+/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
+ * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR.
+ *
+ * In both cases we deal with the first 8 bytes, but need to mark the next 8
+ * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of
+ * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object.
+ *
+ * Mutability of bpf_dynptr is at two levels, one is at the level of struct
+ * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct
+ * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can
+ * mutate the view of the dynptr and also possibly destroy it. In the latter
+ * case, it cannot mutate the bpf_dynptr itself but it can still mutate the
+ * memory that dynptr points to.
+ *
+ * The verifier will keep track both levels of mutation (bpf_dynptr's in
+ * reg->type and the memory's in reg->dynptr.type), but there is no support for
+ * readonly dynptr view yet, hence only the first case is tracked and checked.
+ *
+ * This is consistent with how C applies the const modifier to a struct object,
+ * where the pointer itself inside bpf_dynptr becomes const but not what it
+ * points to.
+ *
+ * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
+ * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
+ */
+int process_dynptr_func(struct bpf_verifier_env *env, int regno,
+ enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+
+ /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
+ * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
+ */
+ if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
+ verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
+ return -EFAULT;
+ }
+ /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
+ * check_func_arg_reg_off's logic. We only need to check offset
+ * alignment for PTR_TO_STACK.
+ */
+ if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) {
+ verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off);
+ return -EINVAL;
+ }
+ /* MEM_UNINIT - Points to memory that is an appropriate candidate for
+ * constructing a mutable bpf_dynptr object.
+ *
+ * Currently, this is only possible with PTR_TO_STACK
+ * pointing to a region of at least 16 bytes which doesn't
+ * contain an existing bpf_dynptr.
+ *
+ * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be
+ * mutated or destroyed. However, the memory it points to
+ * may be mutated.
+ *
+ * None - Points to a initialized dynptr that can be mutated and
+ * destroyed, including mutation of the memory it points
+ * to.
+ */
+ if (arg_type & MEM_UNINIT) {
+ if (!is_dynptr_reg_valid_uninit(env, reg)) {
+ verbose(env, "Dynptr has to be an uninitialized dynptr\n");
+ return -EINVAL;
+ }
+
+ /* We only support one dynptr being uninitialized at the moment,
+ * which is sufficient for the helper functions we have right now.
+ */
+ if (meta->uninit_dynptr_regno) {
+ verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
+ return -EFAULT;
+ }
+
+ meta->uninit_dynptr_regno = regno;
+ } else /* MEM_RDONLY and None case from above */ {
+ /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
+ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
+ verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
+ return -EINVAL;
+ }
+
+ if (!is_dynptr_reg_valid_init(env, reg)) {
+ verbose(env,
+ "Expected an initialized dynptr as arg #%d\n",
+ regno);
+ return -EINVAL;
+ }
+
+ /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
+ if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
+ const char *err_extra = "";
+
+ switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+ case DYNPTR_TYPE_LOCAL:
+ err_extra = "local";
+ break;
+ case DYNPTR_TYPE_RINGBUF:
+ err_extra = "ringbuf";
+ break;
+ default:
+ err_extra = "<unknown>";
+ break;
+ }
+ verbose(env,
+ "Expected a dynptr of type %s as arg #%d\n",
+ err_extra, regno);
+ return -EINVAL;
+ }
+ }
return 0;
}
@@ -5588,16 +6069,6 @@ struct bpf_reg_types {
u32 *btf_id;
};
-static const struct bpf_reg_types map_key_value_types = {
- .types = {
- PTR_TO_STACK,
- PTR_TO_PACKET,
- PTR_TO_PACKET_META,
- PTR_TO_MAP_KEY,
- PTR_TO_MAP_VALUE,
- },
-};
-
static const struct bpf_reg_types sock_types = {
.types = {
PTR_TO_SOCK_COMMON,
@@ -5615,6 +6086,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = {
PTR_TO_TCP_SOCK,
PTR_TO_XDP_SOCK,
PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
},
.btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
};
@@ -5628,7 +6100,7 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
PTR_TO_MEM,
- PTR_TO_MEM | MEM_ALLOC,
+ PTR_TO_MEM | MEM_RINGBUF,
PTR_TO_BUF,
},
};
@@ -5643,23 +6115,46 @@ static const struct bpf_reg_types int_ptr_types = {
},
};
+static const struct bpf_reg_types spin_lock_types = {
+ .types = {
+ PTR_TO_MAP_VALUE,
+ PTR_TO_BTF_ID | MEM_ALLOC,
+ }
+};
+
static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };
static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };
static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } };
-static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } };
+static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } };
static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
-static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
-static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
-static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } };
+static const struct bpf_reg_types btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
+ PTR_TO_BTF_ID | MEM_RCU,
+ },
+};
+static const struct bpf_reg_types percpu_btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID | MEM_PERCPU,
+ PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
+ }
+};
static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types dynptr_types = {
+ .types = {
+ PTR_TO_STACK,
+ CONST_PTR_TO_DYNPTR,
+ }
+};
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
- [ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
- [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
+ [ARG_PTR_TO_MAP_KEY] = &mem_types,
+ [ARG_PTR_TO_MAP_VALUE] = &mem_types,
[ARG_CONST_SIZE] = &scalar_types,
[ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
[ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
@@ -5673,7 +6168,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
[ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
[ARG_PTR_TO_MEM] = &mem_types,
- [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
+ [ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types,
[ARG_PTR_TO_INT] = &int_ptr_types,
[ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
@@ -5682,7 +6177,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
[ARG_PTR_TO_TIMER] = &timer_types,
[ARG_PTR_TO_KPTR] = &kptr_types,
- [ARG_PTR_TO_DYNPTR] = &stack_ptr_types,
+ [ARG_PTR_TO_DYNPTR] = &dynptr_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -5732,7 +6227,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
found:
- if (reg->type == PTR_TO_BTF_ID) {
+ if (reg->type == PTR_TO_BTF_ID || reg->type & PTR_TRUSTED) {
/* For bpf_sk_release, it needs to match against first member
* 'struct sock_common', hence make an exception for it. This
* allows bpf_sk_release to work for multiple socket types.
@@ -5749,15 +6244,29 @@ found:
}
if (meta->func_id == BPF_FUNC_kptr_xchg) {
- if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno))
+ if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
return -EACCES;
- } else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
- btf_vmlinux, *arg_btf_id,
- strict_type_match)) {
- verbose(env, "R%d is of type %s but %s is expected\n",
- regno, kernel_type_name(reg->btf, reg->btf_id),
- kernel_type_name(btf_vmlinux, *arg_btf_id));
- return -EACCES;
+ } else {
+ if (arg_btf_id == BPF_PTR_POISON) {
+ verbose(env, "verifier internal error:");
+ verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ btf_vmlinux, *arg_btf_id,
+ strict_type_match)) {
+ verbose(env, "R%d is of type %s but %s is expected\n",
+ regno, kernel_type_name(reg->btf, reg->btf_id),
+ kernel_type_name(btf_vmlinux, *arg_btf_id));
+ return -EACCES;
+ }
+ }
+ } else if (type_is_alloc(reg->type)) {
+ if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) {
+ verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
+ return -EFAULT;
}
}
@@ -5768,64 +6277,80 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg, int regno,
enum bpf_arg_type arg_type)
{
- enum bpf_reg_type type = reg->type;
- bool fixed_off_ok = false;
+ u32 type = reg->type;
- switch ((u32)type) {
- /* Pointer types where reg offset is explicitly allowed: */
- case PTR_TO_STACK:
- if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) {
- verbose(env, "cannot pass in dynptr at an offset\n");
+ /* When referenced register is passed to release function, its fixed
+ * offset must be 0.
+ *
+ * We will check arg_type_is_release reg has ref_obj_id when storing
+ * meta->release_regno.
+ */
+ if (arg_type_is_release(arg_type)) {
+ /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it
+ * may not directly point to the object being released, but to
+ * dynptr pointing to such object, which might be at some offset
+ * on the stack. In that case, we simply to fallback to the
+ * default handling.
+ */
+ if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
+ return 0;
+ /* Doing check_ptr_off_reg check for the offset will catch this
+ * because fixed_off_ok is false, but checking here allows us
+ * to give the user a better error message.
+ */
+ if (reg->off) {
+ verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
+ regno);
return -EINVAL;
}
- fallthrough;
+ return __check_ptr_off_reg(env, reg, regno, false);
+ }
+
+ switch (type) {
+ /* Pointer types where both fixed and variable offset is explicitly allowed: */
+ case PTR_TO_STACK:
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
case PTR_TO_MEM:
case PTR_TO_MEM | MEM_RDONLY:
- case PTR_TO_MEM | MEM_ALLOC:
+ case PTR_TO_MEM | MEM_RINGBUF:
case PTR_TO_BUF:
case PTR_TO_BUF | MEM_RDONLY:
case SCALAR_VALUE:
- /* Some of the argument types nevertheless require a
- * zero register offset.
- */
- if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM)
- return 0;
- break;
+ return 0;
/* All the rest must be rejected, except PTR_TO_BTF_ID which allows
* fixed offset.
*/
case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
/* When referenced PTR_TO_BTF_ID is passed to release function,
- * it's fixed offset must be 0. In the other cases, fixed offset
- * can be non-zero.
- */
- if (arg_type_is_release(arg_type) && reg->off) {
- verbose(env, "R%d must have zero offset when passed to release func\n",
- regno);
- return -EINVAL;
- }
- /* For arg is release pointer, fixed_off_ok must be false, but
- * we already checked and rejected reg->off != 0 above, so set
- * to true to allow fixed offset for all other cases.
+ * its fixed offset must be 0. In the other cases, fixed offset
+ * can be non-zero. This was already checked above. So pass
+ * fixed_off_ok as true to allow fixed offset for all other
+ * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we
+ * still need to do checks instead of returning.
*/
- fixed_off_ok = true;
- break;
+ return __check_ptr_off_reg(env, reg, regno, true);
default:
- break;
+ return __check_ptr_off_reg(env, reg, regno, false);
}
- return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
}
-static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
- int spi = get_spi(reg->off);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->ref_obj_id;
- return state->stack[spi].spilled_ptr.id;
+ spi = get_spi(reg->off);
+ return state->stack[spi].spilled_ptr.ref_obj_id;
}
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
@@ -5874,7 +6399,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
goto skip_type_check;
/* arg_btf_id and arg_size are in a union. */
- if (base_type(arg_type) == ARG_PTR_TO_BTF_ID)
+ if (base_type(arg_type) == ARG_PTR_TO_BTF_ID ||
+ base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
arg_btf_id = fn->arg_btf_id[arg];
err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
@@ -5889,11 +6415,22 @@ skip_type_check:
if (arg_type_is_release(arg_type)) {
if (arg_type_is_dynptr(arg_type)) {
struct bpf_func_state *state = func(env, reg);
- int spi = get_spi(reg->off);
+ int spi;
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
- !state->stack[spi].spilled_ptr.id) {
- verbose(env, "arg %d is an unacquired reference\n", regno);
+ /* Only dynptr created on stack can be released, thus
+ * the get_spi and stack state checks for spilled_ptr
+ * should only be done before process_dynptr_func for
+ * PTR_TO_STACK.
+ */
+ if (reg->type == PTR_TO_STACK) {
+ spi = get_spi(reg->off);
+ if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
+ !state->stack[spi].spilled_ptr.ref_obj_id) {
+ verbose(env, "arg %d is an unacquired reference\n", regno);
+ return -EINVAL;
+ }
+ } else {
+ verbose(env, "cannot release unowned const bpf_dynptr\n");
return -EINVAL;
}
} else if (!reg->ref_obj_id && !register_is_null(reg)) {
@@ -5990,19 +6527,22 @@ skip_type_check:
break;
case ARG_PTR_TO_SPIN_LOCK:
if (meta->func_id == BPF_FUNC_spin_lock) {
- if (process_spin_lock(env, regno, true))
- return -EACCES;
+ err = process_spin_lock(env, regno, true);
+ if (err)
+ return err;
} else if (meta->func_id == BPF_FUNC_spin_unlock) {
- if (process_spin_lock(env, regno, false))
- return -EACCES;
+ err = process_spin_lock(env, regno, false);
+ if (err)
+ return err;
} else {
verbose(env, "verifier internal error\n");
return -EFAULT;
}
break;
case ARG_PTR_TO_TIMER:
- if (process_timer_func(env, regno, meta))
- return -EACCES;
+ err = process_timer_func(env, regno, meta);
+ if (err)
+ return err;
break;
case ARG_PTR_TO_FUNC:
meta->subprogno = reg->subprogno;
@@ -6025,39 +6565,9 @@ skip_type_check:
err = check_mem_size_reg(env, reg, regno, true, meta);
break;
case ARG_PTR_TO_DYNPTR:
- if (arg_type & MEM_UNINIT) {
- if (!is_dynptr_reg_valid_uninit(env, reg)) {
- verbose(env, "Dynptr has to be an uninitialized dynptr\n");
- return -EINVAL;
- }
-
- /* We only support one dynptr being uninitialized at the moment,
- * which is sufficient for the helper functions we have right now.
- */
- if (meta->uninit_dynptr_regno) {
- verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
- return -EFAULT;
- }
-
- meta->uninit_dynptr_regno = regno;
- } else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) {
- const char *err_extra = "";
-
- switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
- case DYNPTR_TYPE_LOCAL:
- err_extra = "local ";
- break;
- case DYNPTR_TYPE_RINGBUF:
- err_extra = "ringbuf ";
- break;
- default:
- break;
- }
-
- verbose(env, "Expected an initialized %sdynptr as arg #%d\n",
- err_extra, arg + 1);
- return -EINVAL;
- }
+ err = process_dynptr_func(env, regno, arg_type, meta);
+ if (err)
+ return err;
break;
case ARG_CONST_ALLOC_SIZE_OR_ZERO:
if (!tnum_is_const(reg->var_off)) {
@@ -6124,8 +6634,9 @@ skip_type_check:
break;
}
case ARG_PTR_TO_KPTR:
- if (process_kptr_func(env, regno, meta))
- return -EACCES;
+ err = process_kptr_func(env, regno, meta);
+ if (err)
+ return err;
break;
}
@@ -6199,6 +6710,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_ringbuf_discard_dynptr)
goto error;
break;
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ if (func_id != BPF_FUNC_user_ringbuf_drain)
+ goto error;
+ break;
case BPF_MAP_TYPE_STACK_TRACE:
if (func_id != BPF_FUNC_get_stackid)
goto error;
@@ -6282,6 +6797,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_task_storage_delete)
goto error;
break;
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ if (func_id != BPF_FUNC_cgrp_storage_get &&
+ func_id != BPF_FUNC_cgrp_storage_delete)
+ goto error;
+ break;
case BPF_MAP_TYPE_BLOOM_FILTER:
if (func_id != BPF_FUNC_map_peek_elem &&
func_id != BPF_FUNC_map_push_elem)
@@ -6318,6 +6838,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_RINGBUF)
goto error;
break;
+ case BPF_FUNC_user_ringbuf_drain:
+ if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
+ goto error;
+ break;
case BPF_FUNC_get_stackid:
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
@@ -6390,6 +6914,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
goto error;
break;
+ case BPF_FUNC_cgrp_storage_get:
+ case BPF_FUNC_cgrp_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE)
+ goto error;
+ break;
default:
break;
}
@@ -6456,41 +6985,15 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
return true;
}
-static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
-{
- int count = 0;
-
- if (arg_type_may_be_refcounted(fn->arg1_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg2_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg3_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg4_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg5_type))
- count++;
-
- /* A reference acquiring function cannot acquire
- * another refcounted ptr.
- */
- if (may_be_acquire_function(func_id) && count)
- return false;
-
- /* We only support one arg being unreferenced at the moment,
- * which is sufficient for the helper functions we have right now.
- */
- return count <= 1;
-}
-
static bool check_btf_id_ok(const struct bpf_func_proto *fn)
{
int i;
for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
- if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
- return false;
-
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID)
+ return !!fn->arg_btf_id[i];
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK)
+ return fn->arg_btf_id[i] == BPF_PTR_POISON;
if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] &&
/* arg_btf_id and arg_size are in a union. */
(base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM ||
@@ -6501,43 +7004,25 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn)
return true;
}
-static int check_func_proto(const struct bpf_func_proto *fn, int func_id,
- struct bpf_call_arg_meta *meta)
+static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
{
return check_raw_mode_ok(fn) &&
check_arg_pair_ok(fn) &&
- check_btf_id_ok(fn) &&
- check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
+ check_btf_id_ok(fn) ? 0 : -EINVAL;
}
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
* are now invalid, so turn them into unknown SCALAR_VALUE.
*/
-static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
- struct bpf_func_state *state)
+static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct bpf_reg_state *regs = state->regs, *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- if (reg_is_pkt_pointer_any(&regs[i]))
- mark_reg_unknown(env, regs, i);
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
if (reg_is_pkt_pointer_any(reg))
__mark_reg_unknown(env, reg);
- }
-}
-
-static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
-{
- struct bpf_verifier_state *vstate = env->cur_state;
- int i;
-
- for (i = 0; i <= vstate->curframe; i++)
- __clear_all_pkt_pointers(env, vstate->frame[i]);
+ }));
}
enum {
@@ -6566,41 +7051,28 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range
reg->range = AT_PKT_END;
}
-static void release_reg_references(struct bpf_verifier_env *env,
- struct bpf_func_state *state,
- int ref_obj_id)
-{
- struct bpf_reg_state *regs = state->regs, *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].ref_obj_id == ref_obj_id)
- mark_reg_unknown(env, regs, i);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->ref_obj_id == ref_obj_id)
- __mark_reg_unknown(env, reg);
- }
-}
-
/* The pointer with the specified id has released its reference to kernel
* resources. Identify all copies of the same pointer and clear the reference.
*/
static int release_reference(struct bpf_verifier_env *env,
int ref_obj_id)
{
- struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
int err;
- int i;
err = release_reference_state(cur_func(env), ref_obj_id);
if (err)
return err;
- for (i = 0; i <= vstate->curframe; i++)
- release_reg_references(env, vstate->frame[i], ref_obj_id);
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id) {
+ if (!env->allow_ptr_leaks)
+ __mark_reg_not_init(env, reg);
+ else
+ __mark_reg_unknown(env, reg);
+ }
+ }));
return 0;
}
@@ -6622,6 +7094,10 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
struct bpf_func_state *callee,
int insn_idx);
+static int set_callee_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee, int insn_idx);
+
static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx, int subprog,
set_callee_state_fn set_callee_state_cb)
@@ -6648,7 +7124,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
func_info_aux = env->prog->aux->func_info_aux;
if (func_info_aux)
is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_subprog_arg_match(env, subprog, caller->regs);
+ err = btf_check_subprog_call(env, subprog, caller->regs);
if (err == -EFAULT)
return err;
if (is_global) {
@@ -6672,6 +7148,16 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
}
+ /* set_callee_state is used for direct subprog calls, but we are
+ * interested in validating only BPF helpers that can call subprogs as
+ * callbacks
+ */
+ if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) {
+ verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ }
+
if (insn->code == (BPF_JMP | BPF_CALL) &&
insn->src_reg == 0 &&
insn->imm == BPF_FUNC_timer_set_callback) {
@@ -6716,11 +7202,11 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* Transfer references to the callee */
err = copy_reference_state(callee, caller);
if (err)
- return err;
+ goto err_out;
err = set_callee_state_cb(env, caller, callee, *insn_idx);
if (err)
- return err;
+ goto err_out;
clear_caller_saved_regs(env, caller->regs);
@@ -6737,6 +7223,11 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
print_verifier_state(env, callee, true);
}
return 0;
+
+err_out:
+ free_func_state(callee);
+ state->frame[state->curframe + 1] = NULL;
+ return err;
}
int map_set_for_each_callback_args(struct bpf_verifier_env *env,
@@ -6822,6 +7313,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return err;
callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
return 0;
}
@@ -6843,6 +7335,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
return 0;
}
@@ -6872,6 +7365,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_async_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
return 0;
}
@@ -6899,6 +7393,30 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
+ return 0;
+}
+
+static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void
+ * callback_ctx, u64 flags);
+ * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
+ */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
+ mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
+ callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
return 0;
}
@@ -6922,11 +7440,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return -EINVAL;
}
- state->curframe--;
- caller = state->frame[state->curframe];
+ caller = state->frame[state->curframe - 1];
if (callee->in_callback_fn) {
/* enforce R0 return value range [0, 1]. */
- struct tnum range = tnum_range(0, 1);
+ struct tnum range = callee->callback_ret_range;
if (r0->type != SCALAR_VALUE) {
verbose(env, "R0 not a scalar value\n");
@@ -6941,10 +7458,17 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
caller->regs[BPF_REG_0] = *r0;
}
- /* Transfer references to the caller */
- err = copy_reference_state(caller, callee);
- if (err)
- return err;
+ /* callback_fn frame should have released its own additions to parent's
+ * reference state at this point, or check_reference_leak would
+ * complain, hence it must be the same as the caller. There is no need
+ * to copy it back.
+ */
+ if (!callee->in_callback_fn) {
+ /* Transfer references to the caller */
+ err = copy_reference_state(caller, callee);
+ if (err)
+ return err;
+ }
*insn_idx = callee->callsite + 1;
if (env->log.level & BPF_LOG_LEVEL) {
@@ -6955,7 +7479,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
}
/* clear everything in the callee */
free_func_state(callee);
- state->frame[state->curframe + 1] = NULL;
+ state->frame[state->curframe--] = NULL;
return 0;
}
@@ -7066,13 +7590,20 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
static int check_reference_leak(struct bpf_verifier_env *env)
{
struct bpf_func_state *state = cur_func(env);
+ bool refs_lingering = false;
int i;
+ if (state->frameno && !state->in_callback_fn)
+ return 0;
+
for (i = 0; i < state->acquired_refs; i++) {
+ if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
+ refs_lingering = true;
}
- return state->acquired_refs ? -EINVAL : 0;
+ return refs_lingering ? -EINVAL : 0;
}
static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
@@ -7208,6 +7739,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
+ if (!env->prog->aux->sleepable && fn->might_sleep) {
+ verbose(env, "helper call might sleep in a non-sleepable prog\n");
+ return -EINVAL;
+ }
+
/* With LD_ABS/IND some JITs save/restore skb from r1. */
changes_data = bpf_helper_changes_pkt_data(fn->func);
if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
@@ -7219,13 +7755,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
memset(&meta, 0, sizeof(meta));
meta.pkt_access = fn->pkt_access;
- err = check_func_proto(fn, func_id, &meta);
+ err = check_func_proto(fn, func_id);
if (err) {
verbose(env, "kernel subsystem misconfigured func %s#%d\n",
func_id_name(func_id), func_id);
return err;
}
+ if (env->cur_state->active_rcu_lock) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+
+ if (env->prog->aux->sleepable && is_storage_get_function(func_id))
+ env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+ }
+
meta.func_id = func_id;
/* check args */
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
@@ -7254,7 +7801,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs = cur_regs(env);
+ /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+ * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr
+ * is safe to do directly.
+ */
if (meta.uninit_dynptr_regno) {
+ if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) {
+ verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n");
+ return -EFAULT;
+ }
/* we write BPF_DW bits (8 bytes) at a time */
for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno,
@@ -7272,15 +7827,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (meta.release_regno) {
err = -EINVAL;
- if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1]))
+ /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+ * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
+ * is safe to do directly.
+ */
+ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
+ if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) {
+ verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n");
+ return -EFAULT;
+ }
err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
- else if (meta.ref_obj_id)
+ } else if (meta.ref_obj_id) {
err = release_reference(env, meta.ref_obj_id);
- /* meta.ref_obj_id can only be 0 if register that is meant to be
- * released is NULL, which must be > R0.
- */
- else if (register_is_null(&regs[meta.release_regno]))
+ } else if (register_is_null(&regs[meta.release_regno])) {
+ /* meta.ref_obj_id can only be 0 if register that is meant to be
+ * released is NULL, which must be > R0.
+ */
err = 0;
+ }
if (err) {
verbose(env, "func %s#%d reference has not been acquired before\n",
func_id_name(func_id), func_id);
@@ -7344,6 +7908,29 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
}
break;
+ case BPF_FUNC_dynptr_data:
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
+ if (arg_type_is_dynptr(fn->arg_type[i])) {
+ struct bpf_reg_state *reg = &regs[BPF_REG_1 + i];
+
+ if (meta.ref_obj_id) {
+ verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
+ return -EFAULT;
+ }
+
+ meta.ref_obj_id = dynptr_ref_obj_id(env, reg);
+ break;
+ }
+ }
+ if (i == MAX_BPF_FUNC_REG_ARGS) {
+ verbose(env, "verifier internal error: no dynptr in bpf_dynptr_data()\n");
+ return -EFAULT;
+ }
+ break;
+ case BPF_FUNC_user_ringbuf_drain:
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_user_ringbuf_callback_state);
+ break;
}
if (err)
@@ -7360,13 +7947,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* update return register (already marked as written above) */
ret_type = fn->ret_type;
- ret_flag = type_flag(fn->ret_type);
- if (ret_type == RET_INTEGER) {
+ ret_flag = type_flag(ret_type);
+
+ switch (base_type(ret_type)) {
+ case RET_INTEGER:
/* sets type to SCALAR_VALUE */
mark_reg_unknown(env, regs, BPF_REG_0);
- } else if (ret_type == RET_VOID) {
+ break;
+ case RET_VOID:
regs[BPF_REG_0].type = NOT_INIT;
- } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) {
+ break;
+ case RET_PTR_TO_MAP_VALUE:
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
/* remember map_ptr, so that check_map_access()
@@ -7382,23 +7973,29 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].map_uid = meta.map_uid;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
if (!type_may_be_null(ret_type) &&
- map_value_has_spin_lock(meta.map_ptr)) {
+ btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
regs[BPF_REG_0].id = ++env->id_gen;
}
- } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) {
+ break;
+ case RET_PTR_TO_SOCKET:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
- } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) {
+ break;
+ case RET_PTR_TO_SOCK_COMMON:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
- } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) {
+ break;
+ case RET_PTR_TO_TCP_SOCK:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
- } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
+ break;
+ case RET_PTR_TO_MEM:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = meta.mem_size;
- } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
+ break;
+ case RET_PTR_TO_MEM_OR_BTF_ID:
+ {
const struct btf_type *t;
mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -7430,16 +8027,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].btf = meta.ret_btf;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
}
- } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) {
+ break;
+ }
+ case RET_PTR_TO_BTF_ID:
+ {
struct btf *ret_btf;
int ret_btf_id;
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
if (func_id == BPF_FUNC_kptr_xchg) {
- ret_btf = meta.kptr_off_desc->kptr.btf;
- ret_btf_id = meta.kptr_off_desc->kptr.btf_id;
+ ret_btf = meta.kptr_field->kptr.btf;
+ ret_btf_id = meta.kptr_field->kptr.btf_id;
} else {
+ if (fn->ret_btf_id == BPF_PTR_POISON) {
+ verbose(env, "verifier internal error:");
+ verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n",
+ func_id_name(func_id));
+ return -EINVAL;
+ }
ret_btf = btf_vmlinux;
ret_btf_id = *fn->ret_btf_id;
}
@@ -7451,7 +8057,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
regs[BPF_REG_0].btf = ret_btf;
regs[BPF_REG_0].btf_id = ret_btf_id;
- } else {
+ break;
+ }
+ default:
verbose(env, "unknown return type %u of func %s#%d\n",
base_type(ret_type), func_id_name(func_id), func_id);
return -EINVAL;
@@ -7460,7 +8068,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (type_may_be_null(regs[BPF_REG_0].type))
regs[BPF_REG_0].id = ++env->id_gen;
- if (is_ptr_cast_function(func_id)) {
+ if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) {
+ verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n",
+ func_id_name(func_id), func_id);
+ return -EFAULT;
+ }
+
+ if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
} else if (is_acquire_function(func_id, meta.map_ptr)) {
@@ -7472,21 +8086,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].id = id;
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = id;
- } else if (func_id == BPF_FUNC_dynptr_data) {
- int dynptr_id = 0, i;
-
- /* Find the id of the dynptr we're acquiring a reference to */
- for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
- if (arg_type_is_dynptr(fn->arg_type[i])) {
- if (dynptr_id) {
- verbose(env, "verifier internal error: multiple dynptr args in func\n");
- return -EFAULT;
- }
- dynptr_id = stack_slot_get_id(env, &regs[BPF_REG_1 + i]);
- }
- }
- /* For release_reference() */
- regs[BPF_REG_0].ref_obj_id = dynptr_id;
}
do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
@@ -7553,18 +8152,926 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
}
}
+struct bpf_kfunc_call_arg_meta {
+ /* In parameters */
+ struct btf *btf;
+ u32 func_id;
+ u32 kfunc_flags;
+ const struct btf_type *func_proto;
+ const char *func_name;
+ /* Out parameters */
+ u32 ref_obj_id;
+ u8 release_regno;
+ bool r0_rdonly;
+ u32 ret_btf_id;
+ u64 r0_size;
+ struct {
+ u64 value;
+ bool found;
+ } arg_constant;
+ struct {
+ struct btf *btf;
+ u32 btf_id;
+ } arg_obj_drop;
+ struct {
+ struct btf_field *field;
+ } arg_list_head;
+};
+
+static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ACQUIRE;
+}
+
+static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RET_NULL;
+}
+
+static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RELEASE;
+}
+
+static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_TRUSTED_ARGS;
+}
+
+static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_SLEEPABLE;
+}
+
+static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_DESTRUCTIVE;
+}
+
+static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RCU;
+}
+
+static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg)
+{
+ return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET);
+}
+
+static bool __kfunc_param_match_suffix(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *suffix)
+{
+ int suffix_len = strlen(suffix), len;
+ const char *param_name;
+
+ /* In the future, this can be ported to use BTF tagging */
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len < suffix_len)
+ return false;
+ param_name += len - suffix_len;
+ return !strncmp(param_name, suffix, suffix_len);
+}
+
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ return __kfunc_param_match_suffix(btf, arg, "__sz");
+}
+
+static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__k");
+}
+
+static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__ign");
+}
+
+static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__alloc");
+}
+
+static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *name)
+{
+ int len, target_len = strlen(name);
+ const char *param_name;
+
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len != target_len)
+ return false;
+ if (strcmp(param_name, name))
+ return false;
+
+ return true;
+}
+
+enum {
+ KF_ARG_DYNPTR_ID,
+ KF_ARG_LIST_HEAD_ID,
+ KF_ARG_LIST_NODE_ID,
+};
+
+BTF_ID_LIST(kf_arg_btf_ids)
+BTF_ID(struct, bpf_dynptr_kern)
+BTF_ID(struct, bpf_list_head)
+BTF_ID(struct, bpf_list_node)
+
+static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
+ const struct btf_param *arg, int type)
+{
+ const struct btf_type *t;
+ u32 res_id;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t)
+ return false;
+ if (!btf_type_is_ptr(t))
+ return false;
+ t = btf_type_skip_modifiers(btf, t->type, &res_id);
+ if (!t)
+ return false;
+ return btf_types_are_same(btf, res_id, btf_vmlinux, kf_arg_btf_ids[type]);
+}
+
+static bool is_kfunc_arg_dynptr(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_DYNPTR_ID);
+}
+
+static bool is_kfunc_arg_list_head(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_HEAD_ID);
+}
+
+static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID);
+}
+
+/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
+static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
+ const struct btf *btf,
+ const struct btf_type *t, int rec)
+{
+ const struct btf_type *member_type;
+ const struct btf_member *member;
+ u32 i;
+
+ if (!btf_type_is_struct(t))
+ return false;
+
+ for_each_member(i, t, member) {
+ const struct btf_array *array;
+
+ member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+ if (btf_type_is_struct(member_type)) {
+ if (rec >= 3) {
+ verbose(env, "max struct nesting depth exceeded\n");
+ return false;
+ }
+ if (!__btf_type_is_scalar_struct(env, btf, member_type, rec + 1))
+ return false;
+ continue;
+ }
+ if (btf_type_is_array(member_type)) {
+ array = btf_array(member_type);
+ if (!array->nelems)
+ return false;
+ member_type = btf_type_skip_modifiers(btf, array->type, NULL);
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ continue;
+ }
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ }
+ return true;
+}
+
+
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+ [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+ [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+};
+
+enum kfunc_ptr_arg_type {
+ KF_ARG_PTR_TO_CTX,
+ KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
+ KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */
+ KF_ARG_PTR_TO_DYNPTR,
+ KF_ARG_PTR_TO_LIST_HEAD,
+ KF_ARG_PTR_TO_LIST_NODE,
+ KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
+ KF_ARG_PTR_TO_MEM,
+ KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */
+};
+
+enum special_kfunc_type {
+ KF_bpf_obj_new_impl,
+ KF_bpf_obj_drop_impl,
+ KF_bpf_list_push_front,
+ KF_bpf_list_push_back,
+ KF_bpf_list_pop_front,
+ KF_bpf_list_pop_back,
+ KF_bpf_cast_to_kern_ctx,
+ KF_bpf_rdonly_cast,
+ KF_bpf_rcu_read_lock,
+ KF_bpf_rcu_read_unlock,
+};
+
+BTF_SET_START(special_kfunc_set)
+BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_list_push_front)
+BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_list_pop_front)
+BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_cast_to_kern_ctx)
+BTF_ID(func, bpf_rdonly_cast)
+BTF_SET_END(special_kfunc_set)
+
+BTF_ID_LIST(special_kfunc_list)
+BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_list_push_front)
+BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_list_pop_front)
+BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_cast_to_kern_ctx)
+BTF_ID(func, bpf_rdonly_cast)
+BTF_ID(func, bpf_rcu_read_lock)
+BTF_ID(func, bpf_rcu_read_unlock)
+
+static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock];
+}
+
+static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
+}
+
+static enum kfunc_ptr_arg_type
+get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const struct btf_type *t, const struct btf_type *ref_t,
+ const char *ref_tname, const struct btf_param *args,
+ int argno, int nargs)
+{
+ u32 regno = argno + 1;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[regno];
+ bool arg_mem_size = false;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
+ return KF_ARG_PTR_TO_CTX;
+
+ /* In this function, we verify the kfunc's BTF as per the argument type,
+ * leaving the rest of the verification with respect to the register
+ * type to our caller. When a set of conditions hold in the BTF type of
+ * arguments, we resolve it to a known kfunc_ptr_arg_type.
+ */
+ if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
+ return KF_ARG_PTR_TO_CTX;
+
+ if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_ALLOC_BTF_ID;
+
+ if (is_kfunc_arg_kptr_get(meta, argno)) {
+ if (!btf_type_is_ptr(ref_t)) {
+ verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n");
+ return -EINVAL;
+ }
+ ref_t = btf_type_by_id(meta->btf, ref_t->type);
+ ref_tname = btf_name_by_offset(meta->btf, ref_t->name_off);
+ if (!btf_type_is_struct(ref_t)) {
+ verbose(env, "kernel function %s args#0 pointer type %s %s is not supported\n",
+ meta->func_name, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ return KF_ARG_PTR_TO_KPTR;
+ }
+
+ if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_DYNPTR;
+
+ if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_HEAD;
+
+ if (is_kfunc_arg_list_node(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_NODE;
+
+ if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
+ if (!btf_type_is_struct(ref_t)) {
+ verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ return KF_ARG_PTR_TO_BTF_ID;
+ }
+
+ if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))
+ arg_mem_size = true;
+
+ /* This is the catch all argument type of register types supported by
+ * check_helper_mem_access. However, we only allow when argument type is
+ * pointer to scalar, or struct composed (recursively) of scalars. When
+ * arg_mem_size is true, the pointer can be void *.
+ */
+ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) &&
+ (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
+ verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+ argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
+ return -EINVAL;
+ }
+ return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM;
+}
+
+static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const struct btf_type *ref_t,
+ const char *ref_tname, u32 ref_id,
+ struct bpf_kfunc_call_arg_meta *meta,
+ int argno)
+{
+ const struct btf_type *reg_ref_t;
+ bool strict_type_match = false;
+ const struct btf *reg_btf;
+ const char *reg_ref_tname;
+ u32 reg_ref_id;
+
+ if (base_type(reg->type) == PTR_TO_BTF_ID) {
+ reg_btf = reg->btf;
+ reg_ref_id = reg->btf_id;
+ } else {
+ reg_btf = btf_vmlinux;
+ reg_ref_id = *reg2btf_ids[base_type(reg->type)];
+ }
+
+ if (is_kfunc_trusted_args(meta) || (is_kfunc_release(meta) && reg->ref_obj_id))
+ strict_type_match = true;
+
+ reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
+ reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
+ if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) {
+ verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
+ btf_type_str(reg_ref_t), reg_ref_tname);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const struct btf_type *ref_t,
+ const char *ref_tname,
+ struct bpf_kfunc_call_arg_meta *meta,
+ int argno)
+{
+ struct btf_field *kptr_field;
+
+ /* check_func_arg_reg_off allows var_off for
+ * PTR_TO_MAP_VALUE, but we need fixed offset to find
+ * off_desc.
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "arg#0 must have constant offset\n");
+ return -EINVAL;
+ }
+
+ kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR);
+ if (!kptr_field || kptr_field->type != BPF_KPTR_REF) {
+ verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n",
+ reg->off + reg->var_off.value);
+ return -EINVAL;
+ }
+
+ if (!btf_struct_ids_match(&env->log, meta->btf, ref_t->type, 0, kptr_field->kptr.btf,
+ kptr_field->kptr.btf_id, true)) {
+ verbose(env, "kernel function %s args#%d expected pointer to %s %s\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int ref_set_release_on_unlock(struct bpf_verifier_env *env, u32 ref_obj_id)
+{
+ struct bpf_func_state *state = cur_func(env);
+ struct bpf_reg_state *reg;
+ int i;
+
+ /* bpf_spin_lock only allows calling list_push and list_pop, no BPF
+ * subprogs, no global functions. This means that the references would
+ * not be released inside the critical section but they may be added to
+ * the reference state, and the acquired_refs are never copied out for a
+ * different frame as BPF to BPF calls don't work in bpf_spin_lock
+ * critical sections.
+ */
+ if (!ref_obj_id) {
+ verbose(env, "verifier internal error: ref_obj_id is zero for release_on_unlock\n");
+ return -EFAULT;
+ }
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].id == ref_obj_id) {
+ if (state->refs[i].release_on_unlock) {
+ verbose(env, "verifier internal error: expected false release_on_unlock");
+ return -EFAULT;
+ }
+ state->refs[i].release_on_unlock = true;
+ /* Now mark everyone sharing same ref_obj_id as untrusted */
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id)
+ reg->type |= PTR_UNTRUSTED;
+ }));
+ return 0;
+ }
+ }
+ verbose(env, "verifier internal error: ref state missing for ref_obj_id\n");
+ return -EFAULT;
+}
+
+/* Implementation details:
+ *
+ * Each register points to some region of memory, which we define as an
+ * allocation. Each allocation may embed a bpf_spin_lock which protects any
+ * special BPF objects (bpf_list_head, bpf_rb_root, etc.) part of the same
+ * allocation. The lock and the data it protects are colocated in the same
+ * memory region.
+ *
+ * Hence, everytime a register holds a pointer value pointing to such
+ * allocation, the verifier preserves a unique reg->id for it.
+ *
+ * The verifier remembers the lock 'ptr' and the lock 'id' whenever
+ * bpf_spin_lock is called.
+ *
+ * To enable this, lock state in the verifier captures two values:
+ * active_lock.ptr = Register's type specific pointer
+ * active_lock.id = A unique ID for each register pointer value
+ *
+ * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two
+ * supported register types.
+ *
+ * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of
+ * allocated objects is the reg->btf pointer.
+ *
+ * The active_lock.id is non-unique for maps supporting direct_value_addr, as we
+ * can establish the provenance of the map value statically for each distinct
+ * lookup into such maps. They always contain a single map value hence unique
+ * IDs for each pseudo load pessimizes the algorithm and rejects valid programs.
+ *
+ * So, in case of global variables, they use array maps with max_entries = 1,
+ * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point
+ * into the same map value as max_entries is 1, as described above).
+ *
+ * In case of inner map lookups, the inner map pointer has same map_ptr as the
+ * outer map pointer (in verifier context), but each lookup into an inner map
+ * assigns a fresh reg->id to the lookup, so while lookups into distinct inner
+ * maps from the same outer map share the same map_ptr as active_lock.ptr, they
+ * will get different reg->id assigned to each lookup, hence different
+ * active_lock.id.
+ *
+ * In case of allocated objects, active_lock.ptr is the reg->btf, and the
+ * reg->id is a unique ID preserved after the NULL pointer check on the pointer
+ * returned from bpf_obj_new. Each allocation receives a new reg->id.
+ */
+static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ void *ptr;
+ u32 id;
+
+ switch ((int)reg->type) {
+ case PTR_TO_MAP_VALUE:
+ ptr = reg->map_ptr;
+ break;
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
+ ptr = reg->btf;
+ break;
+ default:
+ verbose(env, "verifier internal error: unknown reg type for lock check\n");
+ return -EFAULT;
+ }
+ id = reg->id;
+
+ if (!env->cur_state->active_lock.ptr)
+ return -EINVAL;
+ if (env->cur_state->active_lock.ptr != ptr ||
+ env->cur_state->active_lock.id != id) {
+ verbose(env, "held lock and object are not in the same allocation\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static bool is_bpf_list_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_list_push_front] ||
+ btf_id == special_kfunc_list[KF_bpf_list_push_back] ||
+ btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
+ btf_id == special_kfunc_list[KF_bpf_list_pop_back];
+}
+
+static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct btf_field *field;
+ struct btf_record *rec;
+ u32 list_head_off;
+
+ if (meta->btf != btf_vmlinux || !is_bpf_list_api_kfunc(meta->func_id)) {
+ verbose(env, "verifier internal error: bpf_list_head argument for unknown kfunc\n");
+ return -EFAULT;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_list_head has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+
+ rec = reg_btf_record(reg);
+ list_head_off = reg->off + reg->var_off.value;
+ field = btf_record_find(rec, list_head_off, BPF_LIST_HEAD);
+ if (!field) {
+ verbose(env, "bpf_list_head not found at offset=%u\n", list_head_off);
+ return -EINVAL;
+ }
+
+ /* All functions require bpf_list_head to be protected using a bpf_spin_lock */
+ if (check_reg_allocation_locked(env, reg)) {
+ verbose(env, "bpf_spin_lock at off=%d must be held for bpf_list_head\n",
+ rec->spin_lock_off);
+ return -EINVAL;
+ }
+
+ if (meta->arg_list_head.field) {
+ verbose(env, "verifier internal error: repeating bpf_list_head arg\n");
+ return -EFAULT;
+ }
+ meta->arg_list_head.field = field;
+ return 0;
+}
+
+static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ const struct btf_type *et, *t;
+ struct btf_field *field;
+ struct btf_record *rec;
+ u32 list_node_off;
+
+ if (meta->btf != btf_vmlinux ||
+ (meta->func_id != special_kfunc_list[KF_bpf_list_push_front] &&
+ meta->func_id != special_kfunc_list[KF_bpf_list_push_back])) {
+ verbose(env, "verifier internal error: bpf_list_node argument for unknown kfunc\n");
+ return -EFAULT;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_list_node has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+
+ rec = reg_btf_record(reg);
+ list_node_off = reg->off + reg->var_off.value;
+ field = btf_record_find(rec, list_node_off, BPF_LIST_NODE);
+ if (!field || field->offset != list_node_off) {
+ verbose(env, "bpf_list_node not found at offset=%u\n", list_node_off);
+ return -EINVAL;
+ }
+
+ field = meta->arg_list_head.field;
+
+ et = btf_type_by_id(field->list_head.btf, field->list_head.value_btf_id);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->list_head.btf,
+ field->list_head.value_btf_id, true)) {
+ verbose(env, "operation on bpf_list_head expects arg#1 bpf_list_node at offset=%d "
+ "in struct %s, but arg is at offset=%d in struct %s\n",
+ field->list_head.node_offset, btf_name_by_offset(field->list_head.btf, et->name_off),
+ list_node_off, btf_name_by_offset(reg->btf, t->name_off));
+ return -EINVAL;
+ }
+
+ if (list_node_off != field->list_head.node_offset) {
+ verbose(env, "arg#1 offset=%d, but expected bpf_list_node at offset=%d in struct %s\n",
+ list_node_off, field->list_head.node_offset,
+ btf_name_by_offset(field->list_head.btf, et->name_off));
+ return -EINVAL;
+ }
+ /* Set arg#1 for expiration after unlock */
+ return ref_set_release_on_unlock(env, reg->ref_obj_id);
+}
+
+static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta)
+{
+ const char *func_name = meta->func_name, *ref_tname;
+ const struct btf *btf = meta->btf;
+ const struct btf_param *args;
+ u32 i, nargs;
+ int ret;
+
+ args = (const struct btf_param *)(meta->func_proto + 1);
+ nargs = btf_type_vlen(meta->func_proto);
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ verbose(env, "Function %s has %d > %d args\n", func_name, nargs,
+ MAX_BPF_FUNC_REG_ARGS);
+ return -EINVAL;
+ }
+
+ /* Check that BTF function arguments match actual types that the
+ * verifier sees.
+ */
+ for (i = 0; i < nargs; i++) {
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1];
+ const struct btf_type *t, *ref_t, *resolve_ret;
+ enum bpf_arg_type arg_type = ARG_DONTCARE;
+ u32 regno = i + 1, ref_id, type_size;
+ bool is_ret_buf_sz = false;
+ int kf_arg_type;
+
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+
+ if (is_kfunc_arg_ignore(btf, &args[i]))
+ continue;
+
+ if (btf_type_is_scalar(t)) {
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "R%d is not a scalar\n", regno);
+ return -EINVAL;
+ }
+
+ if (is_kfunc_arg_constant(meta->btf, &args[i])) {
+ if (meta->arg_constant.found) {
+ verbose(env, "verifier internal error: only one constant argument permitted\n");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno);
+ return -EINVAL;
+ }
+ ret = mark_chain_precision(env, regno);
+ if (ret < 0)
+ return ret;
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = reg->var_off.value;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) {
+ meta->r0_rdonly = true;
+ is_ret_buf_sz = true;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) {
+ is_ret_buf_sz = true;
+ }
+
+ if (is_ret_buf_sz) {
+ if (meta->r0_size) {
+ verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
+ return -EINVAL;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d is not a const\n", regno);
+ return -EINVAL;
+ }
+
+ meta->r0_size = reg->var_off.value;
+ ret = mark_chain_precision(env, regno);
+ if (ret)
+ return ret;
+ }
+ continue;
+ }
+
+ if (!btf_type_is_ptr(t)) {
+ verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ if (reg->ref_obj_id) {
+ if (is_kfunc_release(meta) && meta->ref_obj_id) {
+ verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id,
+ meta->ref_obj_id);
+ return -EFAULT;
+ }
+ meta->ref_obj_id = reg->ref_obj_id;
+ if (is_kfunc_release(meta))
+ meta->release_regno = regno;
+ }
+
+ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+ kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs);
+ if (kf_arg_type < 0)
+ return kf_arg_type;
+
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ case KF_ARG_PTR_TO_BTF_ID:
+ if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
+ break;
+
+ if (!is_trusted_reg(reg)) {
+ if (!is_kfunc_rcu(meta)) {
+ verbose(env, "R%d must be referenced or trusted\n", regno);
+ return -EINVAL;
+ }
+ if (!is_rcu_reg(reg)) {
+ verbose(env, "R%d must be a rcu pointer\n", regno);
+ return -EINVAL;
+ }
+ }
+
+ fallthrough;
+ case KF_ARG_PTR_TO_CTX:
+ /* Trusted arguments have the same offset checks as release arguments */
+ arg_type |= OBJ_RELEASE;
+ break;
+ case KF_ARG_PTR_TO_KPTR:
+ case KF_ARG_PTR_TO_DYNPTR:
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ case KF_ARG_PTR_TO_LIST_NODE:
+ case KF_ARG_PTR_TO_MEM:
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ /* Trusted by default */
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EFAULT;
+ }
+
+ if (is_kfunc_release(meta) && reg->ref_obj_id)
+ arg_type |= OBJ_RELEASE;
+ ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+ if (ret < 0)
+ return ret;
+
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_CTX:
+ if (reg->type != PTR_TO_CTX) {
+ verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog));
+ if (ret < 0)
+ return -EINVAL;
+ meta->ret_btf_id = ret;
+ }
+ break;
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ if (meta->btf == btf_vmlinux &&
+ meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ meta->arg_obj_drop.btf = reg->btf;
+ meta->arg_obj_drop.btf_id = reg->btf_id;
+ }
+ break;
+ case KF_ARG_PTR_TO_KPTR:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#0 expected pointer to map value\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_kptr(env, reg, ref_t, ref_tname, meta, i);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_DYNPTR:
+ if (reg->type != PTR_TO_STACK &&
+ reg->type != CONST_PTR_TO_DYNPTR) {
+ verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
+ return -EINVAL;
+ }
+
+ ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ if (reg->type != PTR_TO_MAP_VALUE &&
+ reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_NODE:
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_BTF_ID:
+ /* Only base_type is checked, further checks are done here */
+ if ((base_type(reg->type) != PTR_TO_BTF_ID ||
+ (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
+ !reg2btf_ids[base_type(reg->type)]) {
+ verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type));
+ verbose(env, "expected %s or socket\n",
+ reg_type_str(env, base_type(reg->type) |
+ (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM:
+ resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+ if (IS_ERR(resolve_ret)) {
+ verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret));
+ return -EINVAL;
+ }
+ ret = check_mem_reg(env, reg, regno, type_size);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ ret = check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1);
+ if (ret < 0) {
+ verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
+ return ret;
+ }
+ /* Skip next '__sz' argument */
+ i++;
+ break;
+ }
+ }
+
+ if (is_kfunc_release(meta) && !meta->release_regno) {
+ verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
+ func_name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
const struct btf_type *t, *func, *func_proto, *ptr_type;
struct bpf_reg_state *regs = cur_regs(env);
const char *func_name, *ptr_type_name;
+ bool sleepable, rcu_lock, rcu_unlock;
+ struct bpf_kfunc_call_arg_meta meta;
u32 i, nargs, func_id, ptr_type_id;
int err, insn_idx = *insn_idx_p;
const struct btf_param *args;
+ const struct btf_type *ret_t;
struct btf *desc_btf;
u32 *kfunc_flags;
- bool acq;
/* skip for now, but return error when we find this in fixup_kfunc_call */
if (!insn->imm)
@@ -7585,17 +9092,68 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
func_name);
return -EACCES;
}
- acq = *kfunc_flags & KF_ACQUIRE;
+
+ /* Prepare kfunc call metadata */
+ memset(&meta, 0, sizeof(meta));
+ meta.btf = desc_btf;
+ meta.func_id = func_id;
+ meta.kfunc_flags = *kfunc_flags;
+ meta.func_proto = func_proto;
+ meta.func_name = func_name;
+
+ if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
+ verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
+ return -EACCES;
+ }
+
+ sleepable = is_kfunc_sleepable(&meta);
+ if (sleepable && !env->prog->aux->sleepable) {
+ verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
+ return -EACCES;
+ }
+
+ rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
+ rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
+ if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) {
+ verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name);
+ return -EACCES;
+ }
+
+ if (env->cur_state->active_rcu_lock) {
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+
+ if (rcu_lock) {
+ verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
+ return -EINVAL;
+ } else if (rcu_unlock) {
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->type & MEM_RCU) {
+ reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
+ reg->type |= PTR_UNTRUSTED;
+ }
+ }));
+ env->cur_state->active_rcu_lock = false;
+ } else if (sleepable) {
+ verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
+ return -EACCES;
+ }
+ } else if (rcu_lock) {
+ env->cur_state->active_rcu_lock = true;
+ } else if (rcu_unlock) {
+ verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
/* Check the arguments */
- err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, *kfunc_flags);
+ err = check_kfunc_args(env, &meta);
if (err < 0)
return err;
/* In case of release function, we get register number of refcounted
- * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now
+ * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
- if (err) {
- err = release_reference(env, regs[err].ref_obj_id);
+ if (meta.release_regno) {
+ err = release_reference(env, regs[meta.release_regno].ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
func_name, func_id);
@@ -7609,43 +9167,137 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* Check return type */
t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
- if (acq && !btf_type_is_ptr(t)) {
- verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
- return -EINVAL;
+ if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
+ /* Only exception is bpf_obj_new_impl */
+ if (meta.btf != btf_vmlinux || meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl]) {
+ verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
+ return -EINVAL;
+ }
}
if (btf_type_is_scalar(t)) {
mark_reg_unknown(env, regs, BPF_REG_0);
mark_btf_func_reg_size(env, BPF_REG_0, t->size);
} else if (btf_type_is_ptr(t)) {
- ptr_type = btf_type_skip_modifiers(desc_btf, t->type,
- &ptr_type_id);
- if (!btf_type_is_struct(ptr_type)) {
- ptr_type_name = btf_name_by_offset(desc_btf,
- ptr_type->name_off);
- verbose(env, "kernel function %s returns pointer type %s %s is not supported\n",
- func_name, btf_type_str(ptr_type),
- ptr_type_name);
- return -EINVAL;
+ ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
+
+ if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+ struct btf *ret_btf;
+ u32 ret_btf_id;
+
+ if (unlikely(!bpf_global_ma_set))
+ return -ENOMEM;
+
+ if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
+ verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
+ return -EINVAL;
+ }
+
+ ret_btf = env->prog->aux->btf;
+ ret_btf_id = meta.arg_constant.value;
+
+ /* This may be NULL due to user not supplying a BTF */
+ if (!ret_btf) {
+ verbose(env, "bpf_obj_new requires prog BTF\n");
+ return -EINVAL;
+ }
+
+ ret_t = btf_type_by_id(ret_btf, ret_btf_id);
+ if (!ret_t || !__btf_type_is_struct(ret_t)) {
+ verbose(env, "bpf_obj_new type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = ret_btf;
+ regs[BPF_REG_0].btf_id = ret_btf_id;
+
+ env->insn_aux_data[insn_idx].obj_new_size = ret_t->size;
+ env->insn_aux_data[insn_idx].kptr_struct_meta =
+ btf_find_struct_meta(ret_btf, ret_btf_id);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ env->insn_aux_data[insn_idx].kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_obj_drop.btf,
+ meta.arg_obj_drop.btf_id);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
+ struct btf_field *field = meta.arg_list_head.field;
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = field->list_head.btf;
+ regs[BPF_REG_0].btf_id = field->list_head.value_btf_id;
+ regs[BPF_REG_0].off = field->list_head.node_offset;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta.ret_btf_id;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
+ if (!ret_t || !btf_type_is_struct(ret_t)) {
+ verbose(env,
+ "kfunc bpf_rdonly_cast type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta.arg_constant.value;
+ } else {
+ verbose(env, "kernel function %s unhandled dynamic return type\n",
+ meta.func_name);
+ return -EFAULT;
+ }
+ } else if (!__btf_type_is_struct(ptr_type)) {
+ if (!meta.r0_size) {
+ ptr_type_name = btf_name_by_offset(desc_btf,
+ ptr_type->name_off);
+ verbose(env,
+ "kernel function %s returns pointer type %s %s is not supported\n",
+ func_name,
+ btf_type_str(ptr_type),
+ ptr_type_name);
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM;
+ regs[BPF_REG_0].mem_size = meta.r0_size;
+
+ if (meta.r0_rdonly)
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+
+ /* Ensures we don't access the memory after a release_reference() */
+ if (meta.ref_obj_id)
+ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+ } else {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+ regs[BPF_REG_0].btf_id = ptr_type_id;
}
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].btf = desc_btf;
- regs[BPF_REG_0].type = PTR_TO_BTF_ID;
- regs[BPF_REG_0].btf_id = ptr_type_id;
- if (*kfunc_flags & KF_RET_NULL) {
+
+ if (is_kfunc_ret_null(&meta)) {
regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
/* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
regs[BPF_REG_0].id = ++env->id_gen;
}
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
- if (acq) {
+ if (is_kfunc_acquire(&meta)) {
int id = acquire_reference_state(env, insn_idx);
if (id < 0)
return id;
- regs[BPF_REG_0].id = id;
+ if (is_kfunc_ret_null(&meta))
+ regs[BPF_REG_0].id = id;
regs[BPF_REG_0].ref_obj_id = id;
}
+ if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
+ regs[BPF_REG_0].id = ++env->id_gen;
} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
nargs = btf_type_vlen(func_proto);
@@ -9073,6 +10725,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
return err;
return adjust_ptr_min_max_vals(env, insn,
dst_reg, src_reg);
+ } else if (dst_reg->precise) {
+ /* if dst_reg is precise, src_reg should be precise as well */
+ err = mark_chain_precision(env, insn->src_reg);
+ if (err)
+ return err;
}
} else {
/* Pretend the src is a reg with a known value, since we only
@@ -9274,34 +10931,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
-static void __find_good_pkt_pointers(struct bpf_func_state *state,
- struct bpf_reg_state *dst_reg,
- enum bpf_reg_type type, int new_range)
-{
- struct bpf_reg_state *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++) {
- reg = &state->regs[i];
- if (reg->type == type && reg->id == dst_reg->id)
- /* keep the maximum range already checked */
- reg->range = max(reg->range, new_range);
- }
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->type == type && reg->id == dst_reg->id)
- reg->range = max(reg->range, new_range);
- }
-}
-
static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
struct bpf_reg_state *dst_reg,
enum bpf_reg_type type,
bool range_right_open)
{
- int new_range, i;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ int new_range;
if (dst_reg->off < 0 ||
(dst_reg->off == 0 && range_right_open))
@@ -9366,9 +11003,11 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
* the range won't allow anything.
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
- for (i = 0; i <= vstate->curframe; i++)
- __find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
- new_range);
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->type == type && reg->id == dst_reg->id)
+ /* keep the maximum range already checked */
+ reg->range = max(reg->range, new_range);
+ }));
}
static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
@@ -9830,17 +11469,20 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
bool is_null)
{
if (type_may_be_null(reg->type) && reg->id == id &&
- !WARN_ON_ONCE(!reg->id)) {
- if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
- !tnum_equals_const(reg->var_off, 0) ||
- reg->off)) {
- /* Old offset (both fixed and variable parts) should
- * have been known-zero, because we don't allow pointer
- * arithmetic on pointers that might be NULL. If we
- * see this happening, don't convert the register.
- */
+ (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) {
+ /* Old offset (both fixed and variable parts) should have been
+ * known-zero, because we don't allow pointer arithmetic on
+ * pointers that might be NULL. If we see this happening, don't
+ * convert the register.
+ *
+ * But in some cases, some helpers that return local kptrs
+ * advance offset for the returned pointer. In those cases, it
+ * is fine to expect to see reg->off.
+ */
+ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))
+ return;
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL) && WARN_ON_ONCE(reg->off))
return;
- }
if (is_null) {
reg->type = SCALAR_VALUE;
/* We don't need id and ref_obj_id from this point
@@ -9857,7 +11499,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
if (!reg_may_point_to_spin_lock(reg)) {
/* For not-NULL ptr, reg->ref_obj_id will be reset
- * in release_reg_references().
+ * in release_reference().
*
* reg->id is still used by spin_lock ptr. Other
* than spin_lock ptr type, reg->id can be reset.
@@ -9867,22 +11509,6 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
}
}
-static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
- bool is_null)
-{
- struct bpf_reg_state *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- mark_ptr_or_null_reg(state, reg, id, is_null);
- }
-}
-
/* The logic is similar to find_good_pkt_pointers(), both could eventually
* be folded together at some point.
*/
@@ -9890,10 +11516,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
bool is_null)
{
struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs = state->regs, *reg;
u32 ref_obj_id = regs[regno].ref_obj_id;
u32 id = regs[regno].id;
- int i;
if (ref_obj_id && ref_obj_id == id && is_null)
/* regs[regno] is in the " == NULL" branch.
@@ -9902,8 +11527,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
*/
WARN_ON_ONCE(release_reference_state(state, id));
- for (i = 0; i <= vstate->curframe; i++)
- __mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ mark_ptr_or_null_reg(state, reg, id, is_null);
+ }));
}
static bool try_match_pkt_pointers(const struct bpf_insn *insn,
@@ -10016,23 +11642,11 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
{
struct bpf_func_state *state;
struct bpf_reg_state *reg;
- int i, j;
- for (i = 0; i <= vstate->curframe; i++) {
- state = vstate->frame[i];
- for (j = 0; j < MAX_BPF_REG; j++) {
- reg = &state->regs[j];
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
- }
-
- bpf_for_each_spilled_reg(j, state, reg) {
- if (!reg)
- continue;
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
- }
- }
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
+ *reg = *known_reg;
+ }));
}
static int check_cond_jmp_op(struct bpf_verifier_env *env,
@@ -10042,6 +11656,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_verifier_state *other_branch;
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
+ struct bpf_reg_state *eq_branch_regs;
u8 opcode = BPF_OP(insn->code);
bool is_jmp32;
int pred = -1;
@@ -10151,8 +11766,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if we are comparing against a constant value so we can adjust
* our min/max values for our dst register.
* this is only legit if both are scalars (or pointers to the same
- * object, I suppose, but we don't support that right now), because
- * otherwise the different base pointers mean the offsets aren't
+ * object, I suppose, see the PTR_MAYBE_NULL related if block below),
+ * because otherwise the different base pointers mean the offsets aren't
* comparable.
*/
if (BPF_SRC(insn->code) == BPF_X) {
@@ -10201,6 +11816,36 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
}
+ /* if one pointer register is compared to another pointer
+ * register check if PTR_MAYBE_NULL could be lifted.
+ * E.g. register A - maybe null
+ * register B - not null
+ * for JNE A, B, ... - A is not null in the false branch;
+ * for JEQ A, B, ... - A is not null in the true branch.
+ */
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X &&
+ __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) &&
+ type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type)) {
+ eq_branch_regs = NULL;
+ switch (opcode) {
+ case BPF_JEQ:
+ eq_branch_regs = other_branch_regs;
+ break;
+ case BPF_JNE:
+ eq_branch_regs = regs;
+ break;
+ default:
+ /* do nothing */
+ break;
+ }
+ if (eq_branch_regs) {
+ if (type_may_be_null(src_reg->type))
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]);
+ else
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]);
+ }
+ }
+
/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
* NOTE: these optimizations below are related with pointer comparison
* which will never be JMP32.
@@ -10307,8 +11952,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
- if (map_value_has_spin_lock(map))
- dst_reg->id = ++env->id_gen;
+ WARN_ON_ONCE(map->max_entries != 1);
+ /* We want reg->id to be same (0) as map_value is not distinct */
} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
insn->src_reg == BPF_PSEUDO_MAP_IDX) {
dst_reg->type = CONST_PTR_TO_MAP;
@@ -10386,11 +12031,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
- if (env->cur_state->active_spin_lock) {
+ if (env->cur_state->active_lock.ptr) {
verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
return -EINVAL;
}
+ if (env->cur_state->active_rcu_lock) {
+ verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
+ return -EINVAL;
+ }
+
if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -10592,7 +12242,7 @@ static int check_return_code(struct bpf_verifier_env *env)
* 3 let S be a stack
* 4 S.push(v)
* 5 while S is not empty
- * 6 t <- S.pop()
+ * 6 t <- S.peek()
* 7 if t is what we're looking for:
* 8 return t
* 9 for all edges e in G.adjacentEdges(t) do
@@ -10641,11 +12291,16 @@ static struct bpf_verifier_state_list **explored_state(
return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
}
-static void init_explored_state(struct bpf_verifier_env *env, int idx)
+static void mark_prune_point(struct bpf_verifier_env *env, int idx)
{
env->insn_aux_data[idx].prune_point = true;
}
+static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].prune_point;
+}
+
enum {
DONE_EXPLORING = 0,
KEEP_EXPLORING = 1,
@@ -10674,9 +12329,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
return -EINVAL;
}
- if (e == BRANCH)
+ if (e == BRANCH) {
/* mark branch target for state pruning */
- init_explored_state(env, w);
+ mark_prune_point(env, w);
+ mark_jmp_point(env, w);
+ }
if (insn_state[w] == 0) {
/* tree-edge */
@@ -10703,8 +12360,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
return DONE_EXPLORING;
}
-static int visit_func_call_insn(int t, int insn_cnt,
- struct bpf_insn *insns,
+static int visit_func_call_insn(int t, struct bpf_insn *insns,
struct bpf_verifier_env *env,
bool visit_callee)
{
@@ -10714,10 +12370,12 @@ static int visit_func_call_insn(int t, int insn_cnt,
if (ret)
return ret;
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
+ mark_prune_point(env, t + 1);
+ /* when we exit from subprog, we need to record non-linear history */
+ mark_jmp_point(env, t + 1);
+
if (visit_callee) {
- init_explored_state(env, t);
+ mark_prune_point(env, t);
ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
/* It's ok to allow recursion from CFG point of
* view. __check_func_call() will do the actual
@@ -10733,13 +12391,13 @@ static int visit_func_call_insn(int t, int insn_cnt,
* DONE_EXPLORING - the instruction was fully explored
* KEEP_EXPLORING - there is still work to be done before it is fully explored
*/
-static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
+static int visit_insn(int t, struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi;
int ret;
if (bpf_pseudo_func(insns + t))
- return visit_func_call_insn(t, insn_cnt, insns, env, true);
+ return visit_func_call_insn(t, insns, env, true);
/* All non-branch instructions have a single fall-through edge. */
if (BPF_CLASS(insns[t].code) != BPF_JMP &&
@@ -10752,13 +12410,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
case BPF_CALL:
if (insns[t].imm == BPF_FUNC_timer_set_callback)
- /* Mark this call insn to trigger is_state_visited() check
- * before call itself is processed by __check_func_call().
- * Otherwise new async state will be pushed for further
- * exploration.
+ /* Mark this call insn as a prune point to trigger
+ * is_state_visited() check before call itself is
+ * processed by __check_func_call(). Otherwise new
+ * async state will be pushed for further exploration.
*/
- init_explored_state(env, t);
- return visit_func_call_insn(t, insn_cnt, insns, env,
+ mark_prune_point(env, t);
+ return visit_func_call_insn(t, insns, env,
insns[t].src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
@@ -10771,22 +12429,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
if (ret)
return ret;
- /* unconditional jmp is not a good pruning point,
- * but it's marked, since backtracking needs
- * to record jmp history in is_state_visited().
- */
- init_explored_state(env, t + insns[t].off + 1);
- /* tell verifier to check for equivalent states
- * after every call and jump
- */
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
+ mark_prune_point(env, t + insns[t].off + 1);
+ mark_jmp_point(env, t + insns[t].off + 1);
return ret;
default:
/* conditional jump with two edges */
- init_explored_state(env, t);
+ mark_prune_point(env, t);
+
ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
if (ret)
return ret;
@@ -10822,7 +12473,7 @@ static int check_cfg(struct bpf_verifier_env *env)
while (env->cfg.cur_stack > 0) {
int t = insn_stack[env->cfg.cur_stack - 1];
- ret = visit_insn(t, insn_cnt, env);
+ ret = visit_insn(t, env);
switch (ret) {
case DONE_EXPLORING:
insn_state[t] = EXPLORED;
@@ -11413,15 +13064,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
- if (rold->type == PTR_TO_STACK)
- /* two stack pointers are equal only if they're pointing to
- * the same stack frame, since fp-8 in foo != fp-8 in bar
- */
- return equal && rold->frameno == rcur->frameno;
-
- if (equal)
- return true;
-
if (rold->type == NOT_INIT)
/* explored state can't have used this */
return true;
@@ -11429,10 +13071,12 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
return false;
switch (base_type(rold->type)) {
case SCALAR_VALUE:
+ if (equal)
+ return true;
if (env->explore_alu_limits)
return false;
if (rcur->type == SCALAR_VALUE) {
- if (!rold->precise && !rcur->precise)
+ if (!rold->precise)
return true;
/* new val must satisfy old val knowledge */
return range_within(rold, rcur) &&
@@ -11475,7 +13119,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
*/
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_ids(rold->id, rcur->id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
if (rcur->type != rold->type)
@@ -11499,20 +13144,14 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
/* new val must satisfy old val knowledge */
return range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_CTX:
- case CONST_PTR_TO_MAP:
- case PTR_TO_PACKET_END:
- case PTR_TO_FLOW_KEYS:
- case PTR_TO_SOCKET:
- case PTR_TO_SOCK_COMMON:
- case PTR_TO_TCP_SOCK:
- case PTR_TO_XDP_SOCK:
- /* Only valid matches are exact, which memcmp() above
- * would have accepted
+ case PTR_TO_STACK:
+ /* two stack pointers are equal only if they're pointing to
+ * the same stack frame, since fp-8 in foo != fp-8 in bar
*/
+ return equal && rold->frameno == rcur->frameno;
default:
- /* Don't know what's going on, just say it's not safe */
- return false;
+ /* Only valid matches are exact, which memcmp() */
+ return equal;
}
/* Shouldn't get here; if we do, say it's not safe */
@@ -11622,7 +13261,6 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
{
int i;
- memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
for (i = 0; i < MAX_BPF_REG; i++)
if (!regsafe(env, &old->regs[i], &cur->regs[i],
env->idmap_scratch))
@@ -11646,13 +13284,28 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->curframe != cur->curframe)
return false;
+ memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
+
/* Verification state from speculative execution simulation
* must never prune a non-speculative execution one.
*/
if (old->speculative && !cur->speculative)
return false;
- if (old->active_spin_lock != cur->active_spin_lock)
+ if (old->active_lock.ptr != cur->active_lock.ptr)
+ return false;
+
+ /* Old and cur active_lock's have to be either both present
+ * or both absent.
+ */
+ if (!!old->active_lock.id != !!cur->active_lock.id)
+ return false;
+
+ if (old->active_lock.id &&
+ !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch))
+ return false;
+
+ if (old->active_rcu_lock != cur->active_rcu_lock)
return false;
/* for states to be equal callsites have to be the same
@@ -11755,34 +13408,36 @@ static int propagate_precision(struct bpf_verifier_env *env,
{
struct bpf_reg_state *state_reg;
struct bpf_func_state *state;
- int i, err = 0;
+ int i, err = 0, fr;
- state = old->frame[old->curframe];
- state_reg = state->regs;
- for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "propagating r%d\n", i);
- err = mark_chain_precision(env, i);
- if (err < 0)
- return err;
- }
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ state_reg = state->regs;
+ for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame %d: propagating r%d\n", i, fr);
+ err = mark_chain_precision_frame(env, fr, i);
+ if (err < 0)
+ return err;
+ }
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (!is_spilled_reg(&state->stack[i]))
- continue;
- state_reg = &state->stack[i].spilled_ptr;
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "propagating fp%d\n",
- (-i - 1) * BPF_REG_SIZE);
- err = mark_chain_precision_stack(env, i);
- if (err < 0)
- return err;
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (!is_spilled_reg(&state->stack[i]))
+ continue;
+ state_reg = &state->stack[i].spilled_ptr;
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame %d: propagating fp%d\n",
+ (-i - 1) * BPF_REG_SIZE, fr);
+ err = mark_chain_precision_stack_frame(env, fr, i);
+ if (err < 0)
+ return err;
+ }
}
return 0;
}
@@ -11814,13 +13469,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
int i, j, err, states_cnt = 0;
bool add_new_state = env->test_state_freq ? true : false;
- cur->last_insn_idx = env->prev_insn_idx;
- if (!env->insn_aux_data[insn_idx].prune_point)
- /* this 'insn_idx' instruction wasn't marked, so we will not
- * be doing state search here
- */
- return 0;
-
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
* Do not add new state for future pruning if the verifier hasn't seen
@@ -11955,10 +13603,10 @@ next:
env->max_states_per_insn = states_cnt;
if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
- return push_jmp_history(env, cur);
+ return 0;
if (!add_new_state)
- return push_jmp_history(env, cur);
+ return 0;
/* There were no equivalent states, remember the current one.
* Technically the current state is not proven to be safe yet,
@@ -11977,6 +13625,10 @@ next:
env->prev_jmps_processed = env->jmps_processed;
env->prev_insn_processed = env->insn_processed;
+ /* forget precise markings we inherited, see __mark_chain_precision */
+ if (env->bpf_capable)
+ mark_all_scalars_imprecise(env, cur);
+
/* add new state to the head of linked list */
new = &new_sl->state;
err = copy_verifier_state(new, cur);
@@ -12094,21 +13746,31 @@ static int do_check(struct bpf_verifier_env *env)
return -E2BIG;
}
- err = is_state_visited(env, env->insn_idx);
- if (err < 0)
- return err;
- if (err == 1) {
- /* found equivalent state, can prune the search */
- if (env->log.level & BPF_LOG_LEVEL) {
- if (do_print_state)
- verbose(env, "\nfrom %d to %d%s: safe\n",
- env->prev_insn_idx, env->insn_idx,
- env->cur_state->speculative ?
- " (speculative execution)" : "");
- else
- verbose(env, "%d: safe\n", env->insn_idx);
+ state->last_insn_idx = env->prev_insn_idx;
+
+ if (is_prune_point(env, env->insn_idx)) {
+ err = is_state_visited(env, env->insn_idx);
+ if (err < 0)
+ return err;
+ if (err == 1) {
+ /* found equivalent state, can prune the search */
+ if (env->log.level & BPF_LOG_LEVEL) {
+ if (do_print_state)
+ verbose(env, "\nfrom %d to %d%s: safe\n",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ else
+ verbose(env, "%d: safe\n", env->insn_idx);
+ }
+ goto process_bpf_exit;
}
- goto process_bpf_exit;
+ }
+
+ if (is_jmp_point(env, env->insn_idx)) {
+ err = push_jmp_history(env, state);
+ if (err)
+ return err;
}
if (signal_pending(current))
@@ -12291,11 +13953,14 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_spin_lock &&
- (insn->src_reg == BPF_PSEUDO_CALL ||
- insn->imm != BPF_FUNC_spin_unlock)) {
- verbose(env, "function calls are not allowed while holding a lock\n");
- return -EINVAL;
+ if (env->cur_state->active_lock.ptr) {
+ if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
+ (insn->src_reg == BPF_PSEUDO_CALL) ||
+ (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ (insn->off != 0 || !is_bpf_list_api_kfunc(insn->imm)))) {
+ verbose(env, "function calls are not allowed while holding a lock\n");
+ return -EINVAL;
+ }
}
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
@@ -12328,11 +13993,26 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_spin_lock) {
+ if (env->cur_state->active_lock.ptr) {
verbose(env, "bpf_spin_unlock is missing\n");
return -EINVAL;
}
+ if (env->cur_state->active_rcu_lock) {
+ verbose(env, "bpf_rcu_read_unlock is missing\n");
+ return -EINVAL;
+ }
+
+ /* We must do check_reference_leak here before
+ * prepare_func_exit to handle the case when
+ * state->curframe > 0, it may be a callback
+ * function, for which reference_state must
+ * match caller reference state when it exits.
+ */
+ err = check_reference_leak(env);
+ if (err)
+ return err;
+
if (state->curframe) {
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
@@ -12342,10 +14022,6 @@ static int do_check(struct bpf_verifier_env *env)
continue;
}
- err = check_reference_leak(env);
- if (err)
- return err;
-
err = check_return_code(env);
if (err)
return err;
@@ -12558,14 +14234,6 @@ err_put:
return err;
}
-static int check_map_prealloc(struct bpf_map *map)
-{
- return (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
- map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) ||
- !(map->map_flags & BPF_F_NO_PREALLOC);
-}
-
static bool is_tracing_prog_type(enum bpf_prog_type type)
{
switch (type) {
@@ -12580,52 +14248,21 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
}
}
-static bool is_preallocated_map(struct bpf_map *map)
-{
- if (!check_map_prealloc(map))
- return false;
- if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta))
- return false;
- return true;
-}
-
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
- /*
- * Validate that trace type programs use preallocated hash maps.
- *
- * For programs attached to PERF events this is mandatory as the
- * perf NMI can hit any arbitrary code sequence.
- *
- * All other trace types using preallocated hash maps are unsafe as
- * well because tracepoint or kprobes can be inside locked regions
- * of the memory allocator or at a place where a recursion into the
- * memory allocator would see inconsistent state.
- *
- * On RT enabled kernels run-time allocation of all trace type
- * programs is strictly prohibited due to lock type constraints. On
- * !RT kernels it is allowed for backwards compatibility reasons for
- * now, but warnings are emitted so developers are made aware of
- * the unsafety and can fix their programs before this is enforced.
- */
- if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) {
- if (prog_type == BPF_PROG_TYPE_PERF_EVENT) {
- verbose(env, "perf_event programs can only use preallocated hash map\n");
- return -EINVAL;
- }
- if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- verbose(env, "trace type programs can only use preallocated hash map\n");
+
+ if (btf_record_has_field(map->record, BPF_LIST_HEAD)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_list_head yet\n");
return -EINVAL;
}
- WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
- verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
}
- if (map_value_has_spin_lock(map)) {
+ if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
return -EINVAL;
@@ -12642,7 +14279,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
- if (map_value_has_timer(map)) {
+ if (btf_record_has_field(map->record, BPF_TIMER)) {
if (is_tracing_prog_type(prog_type)) {
verbose(env, "tracing progs cannot use bpf_timer yet\n");
return -EINVAL;
@@ -12670,20 +14307,16 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_LRU_PERCPU_HASH:
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
- if (!is_preallocated_map(map)) {
- verbose(env,
- "Sleepable programs can only use preallocated maps\n");
- return -EINVAL;
- }
- break;
case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
case BPF_MAP_TYPE_INODE_STORAGE:
case BPF_MAP_TYPE_SK_STORAGE:
case BPF_MAP_TYPE_TASK_STORAGE:
+ case BPF_MAP_TYPE_CGRP_STORAGE:
break;
default:
verbose(env,
- "Sleepable programs can only use array, hash, and ringbuf maps\n");
+ "Sleepable programs can only use array, hash, ringbuf and local storage maps\n");
return -EINVAL;
}
@@ -13317,7 +14950,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
aux[adj_idx].ptr_type == PTR_TO_CTX)
continue;
- imm_rnd = get_random_int();
+ imm_rnd = get_random_u32();
rnd_hi32_patch[0] = insn;
rnd_hi32_patch[1].imm = imm_rnd;
rnd_hi32_patch[3].dst_reg = load_reg;
@@ -13339,6 +14972,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
continue;
+ /* Zero-extension is done by the caller. */
+ if (bpf_pseudo_kfunc_call(&insn))
+ continue;
+
if (WARN_ON(load_reg == -1)) {
verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
return -EFAULT;
@@ -13466,13 +15103,17 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
break;
case PTR_TO_BTF_ID:
case PTR_TO_BTF_ID | PTR_UNTRUSTED:
+ /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
+ * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
+ * be said once it is marked PTR_UNTRUSTED, hence we must handle
+ * any faults for loads into such types. BPF_WRITE is disallowed
+ * for this case.
+ */
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
if (type == BPF_READ) {
insn->code = BPF_LDX | BPF_PROBE_MEM |
BPF_SIZE((insn)->code);
env->prog->aux->num_exentries++;
- } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) {
- verbose(env, "Writes through BTF pointers are not allowed\n");
- return -EINVAL;
}
continue;
default:
@@ -13834,8 +15475,8 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
-static int fixup_kfunc_call(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
const struct bpf_kfunc_desc *desc;
@@ -13845,7 +15486,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
}
/* insn->imm has the btf func_id. Replace it with
- * an address (relative to __bpf_base_call).
+ * an address (relative to __bpf_call_base).
*/
desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
if (!desc) {
@@ -13854,8 +15495,33 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
return -EFAULT;
}
+ *cnt = 0;
insn->imm = desc->imm;
-
+ if (insn->off)
+ return 0;
+ if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
+
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
+ insn_buf[1] = addr[0];
+ insn_buf[2] = addr[1];
+ insn_buf[3] = *insn;
+ *cnt = 4;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = *insn;
+ *cnt = 3;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *cnt = 1;
+ }
return 0;
}
@@ -13997,9 +15663,19 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
if (insn->src_reg == BPF_PSEUDO_CALL)
continue;
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
- ret = fixup_kfunc_call(env, insn);
+ ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
if (ret)
return ret;
+ if (cnt == 0)
+ continue;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
continue;
}
@@ -14117,13 +15793,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto patch_call_imm;
}
- if (insn->imm == BPF_FUNC_task_storage_get ||
- insn->imm == BPF_FUNC_sk_storage_get ||
- insn->imm == BPF_FUNC_inode_storage_get) {
- if (env->prog->aux->sleepable)
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
- else
+ if (is_storage_get_function(insn->imm)) {
+ if (!env->prog->aux->sleepable ||
+ env->insn_aux_data[i + delta].storage_get_func_atomic)
insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
+ else
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
insn_buf[1] = *insn;
cnt = 2;
@@ -14193,7 +15868,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
(int (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_redirect,
- (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
+ (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
(int (*)(struct bpf_map *map,
bpf_callback_t callback_fn,
@@ -14572,6 +16247,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
BPF_MAIN_FUNC /* callsite */,
0 /* frameno */,
subprog);
+ state->first_insn_idx = env->subprog_info[subprog].start;
+ state->last_insn_idx = -1;
regs = state->frame[state->curframe]->regs;
if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
@@ -14981,12 +16658,22 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
ret = -EINVAL;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
- /* fentry/fexit/fmod_ret progs can be sleepable only if they are
+
+ /* fentry/fexit/fmod_ret progs can be sleepable if they are
* attached to ALLOW_ERROR_INJECTION and are not in denylist.
*/
if (!check_non_sleepable_error_inject(btf_id) &&
within_error_injection_list(addr))
ret = 0;
+ /* fentry/fexit/fmod_ret progs can also be sleepable if they are
+ * in the fmodret id set with the KF_SLEEPABLE flag.
+ */
+ else {
+ u32 *flags = btf_kfunc_is_modify_return(btf, btf_id);
+
+ if (flags && (*flags & KF_SLEEPABLE))
+ ret = 0;
+ }
break;
case BPF_PROG_TYPE_LSM:
/* LSM progs check that they are attached to bpf_lsm_*() funcs.
@@ -15007,7 +16694,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
bpf_log(log, "can't modify return codes of BPF programs\n");
return -EINVAL;
}
- ret = check_attach_modify_return(addr, tname);
+ ret = -EINVAL;
+ if (btf_kfunc_is_modify_return(btf, btf_id) ||
+ !check_attach_modify_return(addr, tname))
+ ret = 0;
if (ret) {
bpf_log(log, "%s() is not modifiable\n", tname);
return ret;
@@ -15196,10 +16886,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
env->allow_ptr_leaks = bpf_allow_ptr_leaks();
env->allow_uninit_stack = bpf_allow_uninit_stack();
- env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
env->bypass_spec_v1 = bpf_bypass_spec_v1();
env->bypass_spec_v4 = bpf_bypass_spec_v4();
env->bpf_capable = bpf_capable();
+ env->rcu_tag_supported = btf_vmlinux &&
+ btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0;
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
diff --git a/kernel/capability.c b/kernel/capability.c
index 765194f5d678..860fd22117c1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -489,8 +489,8 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
struct user_namespace *mnt_userns,
const struct inode *inode)
{
- return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) &&
- kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode));
+ return vfsuid_has_mapping(ns, i_uid_into_vfsuid(mnt_userns, inode)) &&
+ vfsgid_has_mapping(ns, i_gid_into_vfsgid(mnt_userns, inode));
}
/**
diff --git a/kernel/cfi.c b/kernel/cfi.c
index 2046276ee234..08caad776717 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -1,339 +1,101 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Clang Control Flow Integrity (CFI) error and slowpath handling.
+ * Clang Control Flow Integrity (CFI) error handling.
*
- * Copyright (C) 2021 Google LLC
+ * Copyright (C) 2022 Google LLC
*/
-#include <linux/hardirq.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/printk.h>
-#include <linux/ratelimit.h>
-#include <linux/rcupdate.h>
-#include <linux/vmalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/set_memory.h>
+#include <linux/cfi.h>
-/* Compiler-defined handler names */
-#ifdef CONFIG_CFI_PERMISSIVE
-#define cfi_failure_handler __ubsan_handle_cfi_check_fail
-#else
-#define cfi_failure_handler __ubsan_handle_cfi_check_fail_abort
-#endif
-
-static inline void handle_cfi_failure(void *ptr)
+enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
+ unsigned long *target, u32 type)
{
- if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
- WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr);
+ if (target)
+ pr_err("CFI failure at %pS (target: %pS; expected type: 0x%08x)\n",
+ (void *)addr, (void *)*target, type);
else
- panic("CFI failure (target: %pS)\n", ptr);
-}
-
-#ifdef CONFIG_MODULES
-#ifdef CONFIG_CFI_CLANG_SHADOW
-/*
- * Index type. A 16-bit index can address at most (2^16)-2 pages (taking
- * into account SHADOW_INVALID), i.e. ~256M with 4k pages.
- */
-typedef u16 shadow_t;
-#define SHADOW_INVALID ((shadow_t)~0UL)
-
-struct cfi_shadow {
- /* Page index for the beginning of the shadow */
- unsigned long base;
- /* An array of __cfi_check locations (as indices to the shadow) */
- shadow_t shadow[1];
-} __packed;
-
-/*
- * The shadow covers ~128M from the beginning of the module region. If
- * the region is larger, we fall back to __module_address for the rest.
- */
-#define __SHADOW_RANGE (_UL(SZ_128M) >> PAGE_SHIFT)
-
-/* The in-memory size of struct cfi_shadow, always at least one page */
-#define __SHADOW_PAGES ((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT)
-#define SHADOW_PAGES max(1UL, __SHADOW_PAGES)
-#define SHADOW_SIZE (SHADOW_PAGES << PAGE_SHIFT)
-
-/* The actual size of the shadow array, minus metadata */
-#define SHADOW_ARR_SIZE (SHADOW_SIZE - offsetof(struct cfi_shadow, shadow))
-#define SHADOW_ARR_SLOTS (SHADOW_ARR_SIZE / sizeof(shadow_t))
-
-static DEFINE_MUTEX(shadow_update_lock);
-static struct cfi_shadow __rcu *cfi_shadow __read_mostly;
+ pr_err("CFI failure at %pS (no target information)\n",
+ (void *)addr);
-/* Returns the index in the shadow for the given address */
-static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr)
-{
- unsigned long index;
- unsigned long page = ptr >> PAGE_SHIFT;
-
- if (unlikely(page < s->base))
- return -1; /* Outside of module area */
-
- index = page - s->base;
-
- if (index >= SHADOW_ARR_SLOTS)
- return -1; /* Cannot be addressed with shadow */
-
- return (int)index;
-}
-
-/* Returns the page address for an index in the shadow */
-static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s,
- int index)
-{
- if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
- return 0;
-
- return (s->base + index) << PAGE_SHIFT;
-}
-
-/* Returns the __cfi_check function address for the given shadow location */
-static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s,
- int index)
-{
- if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
- return 0;
-
- if (unlikely(s->shadow[index] == SHADOW_INVALID))
- return 0;
-
- /* __cfi_check is always page aligned */
- return (s->base + s->shadow[index]) << PAGE_SHIFT;
-}
-
-static void prepare_next_shadow(const struct cfi_shadow __rcu *prev,
- struct cfi_shadow *next)
-{
- int i, index, check;
-
- /* Mark everything invalid */
- memset(next->shadow, 0xFF, SHADOW_ARR_SIZE);
-
- if (!prev)
- return; /* No previous shadow */
-
- /* If the base address didn't change, an update is not needed */
- if (prev->base == next->base) {
- memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE);
- return;
- }
-
- /* Convert the previous shadow to the new address range */
- for (i = 0; i < SHADOW_ARR_SLOTS; ++i) {
- if (prev->shadow[i] == SHADOW_INVALID)
- continue;
-
- index = ptr_to_shadow(next, shadow_to_ptr(prev, i));
- if (index < 0)
- continue;
-
- check = ptr_to_shadow(next,
- shadow_to_check_fn(prev, prev->shadow[i]));
- if (check < 0)
- continue;
-
- next->shadow[index] = (shadow_t)check;
- }
-}
-
-static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod,
- unsigned long min_addr, unsigned long max_addr)
-{
- int check_index;
- unsigned long check = (unsigned long)mod->cfi_check;
- unsigned long ptr;
-
- if (unlikely(!PAGE_ALIGNED(check))) {
- pr_warn("cfi: not using shadow for module %s\n", mod->name);
- return;
+ if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) {
+ __warn(NULL, 0, (void *)addr, 0, regs, NULL);
+ return BUG_TRAP_TYPE_WARN;
}
- check_index = ptr_to_shadow(s, check);
- if (check_index < 0)
- return; /* Module not addressable with shadow */
-
- /* For each page, store the check function index in the shadow */
- for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
- int index = ptr_to_shadow(s, ptr);
-
- if (index >= 0) {
- /* Each page must only contain one module */
- WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID);
- s->shadow[index] = (shadow_t)check_index;
- }
- }
+ return BUG_TRAP_TYPE_BUG;
}
-static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod,
- unsigned long min_addr, unsigned long max_addr)
+#ifdef CONFIG_ARCH_USES_CFI_TRAPS
+static inline unsigned long trap_address(s32 *p)
{
- unsigned long ptr;
-
- for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
- int index = ptr_to_shadow(s, ptr);
-
- if (index >= 0)
- s->shadow[index] = SHADOW_INVALID;
- }
+ return (unsigned long)((long)p + (long)*p);
}
-typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *,
- unsigned long min_addr, unsigned long max_addr);
-
-static void update_shadow(struct module *mod, unsigned long base_addr,
- update_shadow_fn fn)
+static bool is_trap(unsigned long addr, s32 *start, s32 *end)
{
- struct cfi_shadow *prev;
- struct cfi_shadow *next;
- unsigned long min_addr, max_addr;
-
- next = vmalloc(SHADOW_SIZE);
-
- mutex_lock(&shadow_update_lock);
- prev = rcu_dereference_protected(cfi_shadow,
- mutex_is_locked(&shadow_update_lock));
-
- if (next) {
- next->base = base_addr >> PAGE_SHIFT;
- prepare_next_shadow(prev, next);
+ s32 *p;
- min_addr = (unsigned long)mod->core_layout.base;
- max_addr = min_addr + mod->core_layout.text_size;
- fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK);
-
- set_memory_ro((unsigned long)next, SHADOW_PAGES);
- }
-
- rcu_assign_pointer(cfi_shadow, next);
- mutex_unlock(&shadow_update_lock);
- synchronize_rcu();
-
- if (prev) {
- set_memory_rw((unsigned long)prev, SHADOW_PAGES);
- vfree(prev);
+ for (p = start; p < end; ++p) {
+ if (trap_address(p) == addr)
+ return true;
}
-}
-void cfi_module_add(struct module *mod, unsigned long base_addr)
-{
- update_shadow(mod, base_addr, add_module_to_shadow);
+ return false;
}
-void cfi_module_remove(struct module *mod, unsigned long base_addr)
-{
- update_shadow(mod, base_addr, remove_module_from_shadow);
-}
-
-static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s,
- unsigned long ptr)
+#ifdef CONFIG_MODULES
+/* Populates `kcfi_trap(_end)?` fields in `struct module`. */
+void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ struct module *mod)
{
- int index;
-
- if (unlikely(!s))
- return NULL; /* No shadow available */
-
- index = ptr_to_shadow(s, ptr);
- if (index < 0)
- return NULL; /* Cannot be addressed with shadow */
+ char *secstrings;
+ unsigned int i;
- return (cfi_check_fn)shadow_to_check_fn(s, index);
-}
-
-static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
-{
- cfi_check_fn fn;
+ mod->kcfi_traps = NULL;
+ mod->kcfi_traps_end = NULL;
- rcu_read_lock_sched_notrace();
- fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr);
- rcu_read_unlock_sched_notrace();
+ secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
- return fn;
-}
-
-#else /* !CONFIG_CFI_CLANG_SHADOW */
+ for (i = 1; i < hdr->e_shnum; i++) {
+ if (strcmp(secstrings + sechdrs[i].sh_name, "__kcfi_traps"))
+ continue;
-static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
-{
- return NULL;
+ mod->kcfi_traps = (s32 *)sechdrs[i].sh_addr;
+ mod->kcfi_traps_end = (s32 *)(sechdrs[i].sh_addr + sechdrs[i].sh_size);
+ break;
+ }
}
-#endif /* CONFIG_CFI_CLANG_SHADOW */
-
-static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
+static bool is_module_cfi_trap(unsigned long addr)
{
- cfi_check_fn fn = NULL;
struct module *mod;
+ bool found = false;
rcu_read_lock_sched_notrace();
- mod = __module_address(ptr);
- if (mod)
- fn = mod->cfi_check;
- rcu_read_unlock_sched_notrace();
-
- return fn;
-}
-
-static inline cfi_check_fn find_check_fn(unsigned long ptr)
-{
- cfi_check_fn fn = NULL;
- unsigned long flags;
- bool rcu_idle;
-
- if (is_kernel_text(ptr))
- return __cfi_check;
- /*
- * Indirect call checks can happen when RCU is not watching. Both
- * the shadow and __module_address use RCU, so we need to wake it
- * up if necessary.
- */
- rcu_idle = !rcu_is_watching();
- if (rcu_idle) {
- local_irq_save(flags);
- ct_irq_enter();
- }
-
- if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
- fn = find_shadow_check_fn(ptr);
- if (!fn)
- fn = find_module_check_fn(ptr);
+ mod = __module_address(addr);
+ if (mod)
+ found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end);
- if (rcu_idle) {
- ct_irq_exit();
- local_irq_restore(flags);
- }
+ rcu_read_unlock_sched_notrace();
- return fn;
+ return found;
}
-
-void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+#else /* CONFIG_MODULES */
+static inline bool is_module_cfi_trap(unsigned long addr)
{
- cfi_check_fn fn = find_check_fn((unsigned long)ptr);
-
- if (likely(fn))
- fn(id, ptr, diag);
- else /* Don't allow unchecked modules */
- handle_cfi_failure(ptr);
+ return false;
}
-EXPORT_SYMBOL(__cfi_slowpath_diag);
+#endif /* CONFIG_MODULES */
-#else /* !CONFIG_MODULES */
+extern s32 __start___kcfi_traps[];
+extern s32 __stop___kcfi_traps[];
-void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+bool is_cfi_trap(unsigned long addr)
{
- handle_cfi_failure(ptr); /* No modules */
-}
-EXPORT_SYMBOL(__cfi_slowpath_diag);
+ if (is_trap(addr, __start___kcfi_traps, __stop___kcfi_traps))
+ return true;
-#endif /* CONFIG_MODULES */
-
-void cfi_failure_handler(void *data, void *ptr, void *vtable)
-{
- handle_cfi_failure(ptr);
+ return is_module_cfi_trap(addr);
}
-EXPORT_SYMBOL(cfi_failure_handler);
+#endif /* CONFIG_ARCH_USES_CFI_TRAPS */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 36b740cb3d59..367b0a42ada9 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -164,11 +164,9 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
-extern struct file_system_type cgroup_fs_type;
/* iterate across the hierarchies */
#define for_each_root(root) \
@@ -250,6 +248,8 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
+void cgroup_attach_lock(bool lock_threadgroup);
+void cgroup_attach_unlock(bool lock_threadgroup);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
bool *locked)
__acquires(&cgroup_threadgroup_rwsem);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index ff6a8099eb2a..52bb5a74a23b 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -59,8 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
mutex_lock(&cgroup_mutex);
- cpus_read_lock();
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_lock(true);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -72,8 +71,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
- percpu_up_write(&cgroup_threadgroup_rwsem);
- cpus_read_unlock();
+ cgroup_attach_unlock(true);
mutex_unlock(&cgroup_mutex);
return retval;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e4bb5d57f4d1..c099cf3fa02d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -217,6 +217,7 @@ struct cgroup_namespace init_cgroup_ns = {
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
+static struct cftype cgroup_psi_files[];
/* cgroup optional features */
enum cgroup_opt_features {
@@ -247,6 +248,12 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+#ifdef CONFIG_DEBUG_CGROUP_REF
+#define CGROUP_REF_FN_ATTRS noinline
+#define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn);
+#include <linux/cgroup_refcnt.h>
+#endif
+
/**
* cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
* @ssid: subsys ID of interest
@@ -1391,6 +1398,9 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_free_root(root);
}
+/*
+ * Returned cgroup is without refcount but it's valid as long as cset pins it.
+ */
static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
struct cgroup_root *root)
{
@@ -1402,6 +1412,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
res_cgroup = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
+ lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -1413,6 +1424,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
}
}
+ BUG_ON(!res_cgroup);
return res_cgroup;
}
@@ -1435,23 +1447,36 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
rcu_read_unlock();
- BUG_ON(!res);
return res;
}
+/*
+ * Look up cgroup associated with current task's cgroup namespace on the default
+ * hierarchy.
+ *
+ * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
+ * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
+ * pointers.
+ * - css_set_lock is not needed because we just read cset->dfl_cgrp.
+ * - As a bonus returned cgrp is pinned with the current because it cannot
+ * switch cgroup_ns asynchronously.
+ */
+static struct cgroup *current_cgns_cgroup_dfl(void)
+{
+ struct css_set *cset;
+
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+}
+
/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
struct cgroup_root *root)
{
- struct cgroup *res = NULL;
-
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
- res = __cset_cgroup_from_root(cset, root);
-
- BUG_ON(!res);
- return res;
+ return __cset_cgroup_from_root(cset, root);
}
/*
@@ -1689,12 +1714,16 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
css->flags &= ~CSS_VISIBLE;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- cgroup_addrm_files(css, cgrp, cfts, false);
+ if (cgroup_on_dfl(cgrp)) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
+ if (cgroup_psi_enabled())
+ cgroup_addrm_files(css, cgrp,
+ cgroup_psi_files, false);
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, false);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false);
@@ -1717,14 +1746,22 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
return 0;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
- if (ret < 0)
- return ret;
+ if (cgroup_on_dfl(cgrp)) {
+ ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ cgroup_base_files, true);
+ if (ret < 0)
+ return ret;
+
+ if (cgroup_psi_enabled()) {
+ ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ cgroup_psi_files, true);
+ if (ret < 0)
+ return ret;
+ }
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, true);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node) {
ret = cgroup_addrm_files(css, cgrp, cfts, true);
@@ -2050,7 +2087,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
}
root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
- root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+ root_cgrp->ancestors[0] = root_cgrp;
ret = css_populate_dir(&root_cgrp->self);
if (ret)
@@ -2173,7 +2210,7 @@ static int cgroup_get_tree(struct fs_context *fc)
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
- cgrp_dfl_visible = true;
+ WRITE_ONCE(cgrp_dfl_visible, true);
cgroup_get_live(&cgrp_dfl_root.cgrp);
ctx->root = &cgrp_dfl_root;
@@ -2361,7 +2398,7 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
- ret = strlcpy(buf, "/", buflen);
+ ret = strscpy(buf, "/", buflen);
}
spin_unlock_irq(&css_set_lock);
@@ -2393,7 +2430,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
* write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
* CPU hotplug is disabled on entry.
*/
-static void cgroup_attach_lock(bool lock_threadgroup)
+void cgroup_attach_lock(bool lock_threadgroup)
{
cpus_read_lock();
if (lock_threadgroup)
@@ -2404,7 +2441,7 @@ static void cgroup_attach_lock(bool lock_threadgroup)
* cgroup_attach_unlock - Undo cgroup_attach_lock()
* @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
*/
-static void cgroup_attach_unlock(bool lock_threadgroup)
+void cgroup_attach_unlock(bool lock_threadgroup)
{
if (lock_threadgroup)
percpu_up_write(&cgroup_threadgroup_rwsem);
@@ -2829,14 +2866,12 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
* take an rcu_read_lock.
*/
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_task(task, mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
return cgroup_migrate_execute(mgctx);
@@ -3292,11 +3327,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
- ret = cgroup_update_dfl_csses(cgrp);
- if (ret)
- return ret;
-
- return 0;
+ return cgroup_update_dfl_csses(cgrp);
}
/**
@@ -3689,27 +3720,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_CPU);
}
-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
{
struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
@@ -3729,7 +3760,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return -EBUSY;
}
- psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+ psi = cgroup_psi(cgrp);
new = psi_trigger_create(psi, buf, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
@@ -3746,21 +3777,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+ return pressure_write(of, buf, nbytes, PSI_IO);
}
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+ return pressure_write(of, buf, nbytes, PSI_MEM);
}
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+ return pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ seq_printf(seq, "%d\n", psi->enabled);
+
+ return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ ssize_t ret;
+ int enable;
+ struct cgroup *cgrp;
+ struct psi_group *psi;
+
+ ret = kstrtoint(strstrip(buf), 0, &enable);
+ if (ret)
+ return ret;
+
+ if (enable < 0 || enable > 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ psi = cgroup_psi(cgrp);
+ if (psi->enabled != enable) {
+ int i;
+
+ /* show or hide {cpu,memory,io,irq}.pressure files */
+ for (i = 0; i < NR_PSI_RESOURCES; i++)
+ cgroup_file_show(&cgrp->psi_files[i], enable);
+
+ psi->enabled = enable;
+ if (enable)
+ psi_cgroup_restart(psi);
+ }
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
@@ -3780,6 +3876,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
bool cgroup_psi_enabled(void)
{
+ if (static_branch_likely(&psi_disabled))
+ return false;
+
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
}
@@ -4132,8 +4231,6 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
restart:
for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -4198,21 +4295,25 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
cft->ss = NULL;
/* revert flags set by cgroup core while adding @cfts */
- cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
+ __CFTYPE_ADDED);
}
}
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
+ int ret = 0;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
struct kernfs_ops *kf_ops;
WARN_ON(cft->ss || cft->kf_ops);
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
+ if (cft->flags & __CFTYPE_ADDED) {
+ ret = -EBUSY;
+ break;
+ }
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
@@ -4226,26 +4327,26 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
if (!kf_ops) {
- cgroup_exit_cftypes(cfts);
- return -ENOMEM;
+ ret = -ENOMEM;
+ break;
}
kf_ops->atomic_write_len = cft->max_write_len;
}
cft->kf_ops = kf_ops;
cft->ss = ss;
+ cft->flags |= __CFTYPE_ADDED;
}
- return 0;
+ if (ret)
+ cgroup_exit_cftypes(cfts);
+ return ret;
}
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
- if (!cfts || !cfts[0].ss)
- return -ENOENT;
-
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
@@ -4267,6 +4368,12 @@ int cgroup_rm_cftypes(struct cftype *cfts)
{
int ret;
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
+
+ if (!(cfts[0].flags & __CFTYPE_ADDED))
+ return -ENOENT;
+
mutex_lock(&cgroup_mutex);
ret = cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_mutex);
@@ -4372,6 +4479,26 @@ void cgroup_file_notify(struct cgroup_file *cfile)
}
/**
+ * cgroup_file_show - show or hide a hidden cgroup file
+ * @cfile: target cgroup_file obtained by setting cftype->file_offset
+ * @show: whether to show or hide
+ */
+void cgroup_file_show(struct cgroup_file *cfile, bool show)
+{
+ struct kernfs_node *kn;
+
+ spin_lock_irq(&cgroup_file_kn_lock);
+ kn = cfile->kn;
+ kernfs_get(kn);
+ spin_unlock_irq(&cgroup_file_kn_lock);
+
+ if (kn)
+ kernfs_show(kn, show);
+
+ kernfs_put(kn);
+}
+
+/**
* css_next_child - find the next child of a given css
* @pos: the current position (%NULL to initiate traversal)
* @parent: css whose children to walk
@@ -5131,10 +5258,14 @@ static struct cftype cgroup_base_files[] = {
.name = "cpu.stat",
.seq_show = cpu_stat_show,
},
+ { } /* terminate */
+};
+
+static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
- .flags = CFTYPE_PRESSURE,
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5142,7 +5273,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "memory.pressure",
- .flags = CFTYPE_PRESSURE,
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5150,12 +5281,27 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cpu.pressure",
- .flags = CFTYPE_PRESSURE,
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ {
+ .name = "irq.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+ .seq_show = cgroup_irq_pressure_show,
+ .write = cgroup_irq_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+#endif
+ {
+ .name = "cgroup.pressure",
+ .seq_show = cgroup_pressure_show,
+ .write = cgroup_pressure_write,
+ },
#endif /* CONFIG_PSI */
{ } /* terminate */
};
@@ -5207,6 +5353,7 @@ static void css_free_rwork_fn(struct work_struct *work)
atomic_dec(&cgrp->root->nr_cgrps);
cgroup1_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
+ bpf_cgrp_storage_free(cgrp);
if (cgroup_parent(cgrp)) {
/*
@@ -5432,8 +5579,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
- GFP_KERNEL);
+ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -5485,7 +5631,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
+ cgrp->ancestors[tcgrp->level] = tcgrp;
if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
@@ -5918,6 +6064,7 @@ int __init cgroup_init(void)
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
cgroup_rstat_boot();
@@ -6038,16 +6185,22 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
/*
* cgroup_get_from_id : get the cgroup associated with cgroup id
* @id: cgroup id
- * On success return the cgrp, on failure return NULL
+ * On success return the cgrp or ERR_PTR on failure
+ * Only cgroups within current task's cgroup NS are valid.
*/
struct cgroup *cgroup_get_from_id(u64 id)
{
struct kernfs_node *kn;
- struct cgroup *cgrp = NULL;
+ struct cgroup *cgrp, *root_cgrp;
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
- goto out;
+ return ERR_PTR(-ENOENT);
+
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ kernfs_put(kn);
+ return ERR_PTR(-ENOENT);
+ }
rcu_read_lock();
@@ -6056,9 +6209,17 @@ struct cgroup *cgroup_get_from_id(u64 id)
cgrp = NULL;
rcu_read_unlock();
-
kernfs_put(kn);
-out:
+
+ if (!cgrp)
+ return ERR_PTR(-ENOENT);
+
+ root_cgrp = current_cgns_cgroup_dfl();
+ if (!cgroup_is_descendant(cgrp, root_cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-ENOENT);
+ }
+
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_id);
@@ -6088,7 +6249,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct cgroup *cgrp;
int ssid, count = 0;
- if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
+ if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
continue;
seq_printf(m, "%d:", root->hierarchy_id);
@@ -6154,16 +6315,37 @@ void cgroup_fork(struct task_struct *child)
INIT_LIST_HEAD(&child->cg_list);
}
-static struct cgroup *cgroup_get_from_file(struct file *f)
+/**
+ * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer
+ * @f: file corresponding to cgroup_dir
+ *
+ * Find the cgroup from a file pointer associated with a cgroup directory.
+ * Returns a pointer to the cgroup on success. ERR_PTR is returned if the
+ * cgroup cannot be found.
+ */
+static struct cgroup *cgroup_v1v2_get_from_file(struct file *f)
{
struct cgroup_subsys_state *css;
- struct cgroup *cgrp;
css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
if (IS_ERR(css))
return ERR_CAST(css);
- cgrp = css->cgroup;
+ return css->cgroup;
+}
+
+/**
+ * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports
+ * cgroup2.
+ * @f: file corresponding to cgroup2_dir
+ */
+static struct cgroup *cgroup_get_from_file(struct file *f)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
if (!cgroup_on_dfl(cgrp)) {
cgroup_put(cgrp);
return ERR_PTR(-EBADF);
@@ -6635,8 +6817,10 @@ struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup *root_cgrp;
- kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
+ root_cgrp = current_cgns_cgroup_dfl();
+ kn = kernfs_walk_and_get(root_cgrp->kn, path);
if (!kn)
goto out;
@@ -6661,15 +6845,15 @@ out:
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
/**
- * cgroup_get_from_fd - get a cgroup pointer from a fd
- * @fd: fd obtained by open(cgroup2_dir)
+ * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup_dir)
*
* Find the cgroup from a fd which should be obtained
* by opening a cgroup directory. Returns a pointer to the
* cgroup on success. ERR_PTR is returned if the cgroup
* cannot be found.
*/
-struct cgroup *cgroup_get_from_fd(int fd)
+struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
struct cgroup *cgrp;
struct file *f;
@@ -6678,10 +6862,29 @@ struct cgroup *cgroup_get_from_fd(int fd)
if (!f)
return ERR_PTR(-EBADF);
- cgrp = cgroup_get_from_file(f);
+ cgrp = cgroup_v1v2_get_from_file(f);
fput(f);
return cgrp;
}
+
+/**
+ * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports
+ * cgroup2.
+ * @fd: fd obtained by open(cgroup2_dir)
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+ return cgrp;
+}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
static u64 power_of_ten(int power)
@@ -6794,9 +6997,6 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
continue;
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
-
if (prefix)
ret += snprintf(buf + ret, size - ret, "%s.", prefix);
@@ -6816,8 +7016,11 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
int ssid;
ssize_t ret = 0;
- ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
- NULL);
+ ret = show_delegatable_files(cgroup_base_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
+ if (cgroup_psi_enabled())
+ ret += show_delegatable_files(cgroup_psi_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
for_each_subsys(ss, ssid)
ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 1f3a55297f39..a29c0b13706b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -33,6 +33,7 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
+#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
@@ -85,6 +86,30 @@ struct fmeter {
spinlock_t lock; /* guards read or write of above */
};
+/*
+ * Invalid partition error code
+ */
+enum prs_errcode {
+ PERR_NONE = 0,
+ PERR_INVCPUS,
+ PERR_INVPARENT,
+ PERR_NOTPART,
+ PERR_NOTEXCL,
+ PERR_NOCPUS,
+ PERR_HOTPLUG,
+ PERR_CPUSEMPTY,
+};
+
+static const char * const perr_strings[] = {
+ [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus",
+ [PERR_INVPARENT] = "Parent is an invalid partition root",
+ [PERR_NOTPART] = "Parent is not a partition root",
+ [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
+ [PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
+ [PERR_HOTPLUG] = "No cpu available due to hotplug",
+ [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+};
+
struct cpuset {
struct cgroup_subsys_state css;
@@ -168,6 +193,9 @@ struct cpuset {
int use_parent_ecpus;
int child_ecpus_count;
+ /* Invalid partition error code, not lock protected */
+ enum prs_errcode prs_err;
+
/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
};
@@ -175,20 +203,22 @@ struct cpuset {
/*
* Partition root states:
*
- * 0 - not a partition root
- *
+ * 0 - member (not a partition root)
* 1 - partition root
- *
+ * 2 - partition root without load balancing (isolated)
* -1 - invalid partition root
- * None of the cpus in cpus_allowed can be put into the parent's
- * subparts_cpus. In this case, the cpuset is not a real partition
- * root anymore. However, the CPU_EXCLUSIVE bit will still be set
- * and the cpuset can be restored back to a partition root if the
- * parent cpuset can give more CPUs back to this child cpuset.
+ * -2 - invalid isolated partition root
*/
-#define PRS_DISABLED 0
-#define PRS_ENABLED 1
-#define PRS_ERROR -1
+#define PRS_MEMBER 0
+#define PRS_ROOT 1
+#define PRS_ISOLATED 2
+#define PRS_INVALID_ROOT -1
+#define PRS_INVALID_ISOLATED -2
+
+static inline bool is_prs_invalid(int prs_state)
+{
+ return prs_state < 0;
+}
/*
* Temporary cpumasks for working with partitions that are passed among
@@ -268,25 +298,43 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
-static inline int is_partition_root(const struct cpuset *cs)
+static inline int is_partition_valid(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
}
+static inline int is_partition_invalid(const struct cpuset *cs)
+{
+ return cs->partition_root_state < 0;
+}
+
+/*
+ * Callers should hold callback_lock to modify partition_root_state.
+ */
+static inline void make_partition_invalid(struct cpuset *cs)
+{
+ if (is_partition_valid(cs))
+ cs->partition_root_state = -cs->partition_root_state;
+}
+
/*
* Send notification event of whenever partition_root_state changes.
*/
-static inline void notify_partition_change(struct cpuset *cs,
- int old_prs, int new_prs)
+static inline void notify_partition_change(struct cpuset *cs, int old_prs)
{
- if (old_prs != new_prs)
- cgroup_file_notify(&cs->partition_file);
+ if (old_prs == cs->partition_root_state)
+ return;
+ cgroup_file_notify(&cs->partition_file);
+
+ /* Reset prs_err if not invalid */
+ if (is_partition_valid(cs))
+ WRITE_ONCE(cs->prs_err, PERR_NONE);
}
static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
- .partition_root_state = PRS_ENABLED,
+ .partition_root_state = PRS_ROOT,
};
/**
@@ -404,6 +452,41 @@ static inline bool is_in_v2_mode(void)
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
+/**
+ * partition_is_populated - check if partition has tasks
+ * @cs: partition root to be checked
+ * @excluded_child: a child cpuset to be excluded in task checking
+ * Return: true if there are tasks, false otherwise
+ *
+ * It is assumed that @cs is a valid partition root. @excluded_child should
+ * be non-NULL when this cpuset is going to become a partition itself.
+ */
+static inline bool partition_is_populated(struct cpuset *cs,
+ struct cpuset *excluded_child)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+
+ if (cs->css.cgroup->nr_populated_csets)
+ return true;
+ if (!excluded_child && !cs->nr_subparts_cpus)
+ return cgroup_is_populated(cs->css.cgroup);
+
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, cs) {
+ if (child == excluded_child)
+ continue;
+ if (is_partition_valid(child))
+ continue;
+ if (cgroup_is_populated(child->css.cgroup)) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+ return false;
+}
+
/*
* Return in pmask the portion of a task's cpusets's cpus_allowed that
* are online and are capable of running the task. If none are found,
@@ -467,11 +550,15 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Call with callback_lock or cpuset_rwsem held.
+ * Call with callback_lock or cpuset_rwsem held. The check can be skipped
+ * if on default hierarchy.
*/
-static void cpuset_update_task_spread_flag(struct cpuset *cs,
+static void cpuset_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk)
{
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ return;
+
if (is_spread_page(cs))
task_set_spread_page(tsk);
else
@@ -659,22 +746,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
par = parent_cs(cur);
/*
- * If either I or some sibling (!= me) is exclusive, we can't
- * overlap
- */
- ret = -EINVAL;
- cpuset_for_each_child(c, css, par) {
- if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
- c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- goto out;
- if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- c != cur &&
- nodes_intersects(trial->mems_allowed, c->mems_allowed))
- goto out;
- }
-
- /*
* Cpusets with tasks - existing or newly being attached - can't
* be changed to have empty cpus_allowed or mems_allowed.
*/
@@ -698,6 +769,22 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
trial->cpus_allowed))
goto out;
+ /*
+ * If either I or some sibling (!= me) is exclusive, we can't
+ * overlap
+ */
+ ret = -EINVAL;
+ cpuset_for_each_child(c, css, par) {
+ if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
+ c != cur &&
+ cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+ goto out;
+ if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
+ c != cur &&
+ nodes_intersects(trial->mems_allowed, c->mems_allowed))
+ goto out;
+ }
+
ret = 0;
out:
rcu_read_unlock();
@@ -875,7 +962,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
csa[csn++] = cp;
/* skip @cp's subtree if not a partition root */
- if (!is_partition_root(cp))
+ if (!is_partition_valid(cp))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
@@ -1081,7 +1168,7 @@ static void rebuild_sched_domains_locked(void)
if (top_cpuset.nr_subparts_cpus) {
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
- if (!is_partition_root(cs)) {
+ if (!is_partition_valid(cs)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
@@ -1127,10 +1214,18 @@ static void update_tasks_cpumask(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
+ bool top_cs = cs == &top_cpuset;
css_task_iter_start(&cs->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
+ while ((task = css_task_iter_next(&it))) {
+ /*
+ * Percpu kthreads in top_cpuset are ignored
+ */
+ if (top_cs && (task->flags & PF_KTHREAD) &&
+ kthread_is_per_cpu(task))
+ continue;
set_cpus_allowed_ptr(task, cs->effective_cpus);
+ }
css_task_iter_end(&it);
}
@@ -1165,15 +1260,18 @@ enum subparts_cmd {
partcmd_enable, /* Enable partition root */
partcmd_disable, /* Disable partition root */
partcmd_update, /* Update parent's subparts_cpus */
+ partcmd_invalidate, /* Make partition invalid */
};
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+ int turning_on);
/**
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
* @cpuset: The cpuset that requests change in partition root state
* @cmd: Partition root state change command
* @newmask: Optional new cpumask for partcmd_update
* @tmp: Temporary addmask and delmask
- * Return: 0, 1 or an error code
+ * Return: 0 or a partition root state error code
*
* For partcmd_enable, the cpuset is being transformed from a non-partition
* root to a partition root. The cpus_allowed mask of the given cpuset will
@@ -1184,38 +1282,36 @@ enum subparts_cmd {
* For partcmd_disable, the cpuset is being transformed from a partition
* root back to a non-partition root. Any CPUs in cpus_allowed that are in
* parent's subparts_cpus will be taken away from that cpumask and put back
- * into parent's effective_cpus. 0 should always be returned.
+ * into parent's effective_cpus. 0 will always be returned.
*
- * For partcmd_update, if the optional newmask is specified, the cpu
- * list is to be changed from cpus_allowed to newmask. Otherwise,
- * cpus_allowed is assumed to remain the same. The cpuset should either
- * be a partition root or an invalid partition root. The partition root
- * state may change if newmask is NULL and none of the requested CPUs can
- * be granted by the parent. The function will return 1 if changes to
- * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
- * Error code should only be returned when newmask is non-NULL.
+ * For partcmd_update, if the optional newmask is specified, the cpu list is
+ * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is
+ * assumed to remain the same. The cpuset should either be a valid or invalid
+ * partition root. The partition root state may change from valid to invalid
+ * or vice versa. An error code will only be returned if transitioning from
+ * invalid to valid violates the exclusivity rule.
*
- * The partcmd_enable and partcmd_disable commands are used by
- * update_prstate(). The partcmd_update command is used by
- * update_cpumasks_hier() with newmask NULL and update_cpumask() with
- * newmask set.
+ * For partcmd_invalidate, the current partition will be made invalid.
*
- * The checking is more strict when enabling partition root than the
- * other two commands.
- *
- * Because of the implicit cpu exclusive nature of a partition root,
- * cpumask changes that violates the cpu exclusivity rule will not be
- * permitted when checked by validate_change().
+ * The partcmd_enable and partcmd_disable commands are used by
+ * update_prstate(). An error code may be returned and the caller will check
+ * for error.
+ *
+ * The partcmd_update command is used by update_cpumasks_hier() with newmask
+ * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
+ * by update_cpumask() with NULL newmask. In both cases, the callers won't
+ * check for error and so partition_root_state and prs_error will be updated
+ * directly.
*/
-static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
struct cpumask *newmask,
struct tmpmasks *tmp)
{
- struct cpuset *parent = parent_cs(cpuset);
+ struct cpuset *parent = parent_cs(cs);
int adding; /* Moving cpus from effective_cpus to subparts_cpus */
int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
int old_prs, new_prs;
- bool part_error = false; /* Partition error? */
+ int part_error = PERR_NONE; /* Partition error? */
percpu_rwsem_assert_held(&cpuset_rwsem);
@@ -1224,126 +1320,165 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* The new cpumask, if present, or the current cpus_allowed must
* not be empty.
*/
- if (!is_partition_root(parent) ||
- (newmask && cpumask_empty(newmask)) ||
- (!newmask && cpumask_empty(cpuset->cpus_allowed)))
- return -EINVAL;
-
- /*
- * Enabling/disabling partition root is not allowed if there are
- * online children.
- */
- if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
- return -EBUSY;
-
- /*
- * Enabling partition root is not allowed if not all the CPUs
- * can be granted from parent's effective_cpus or at least one
- * CPU will be left after that.
- */
- if ((cmd == partcmd_enable) &&
- (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
- cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
- return -EINVAL;
+ if (!is_partition_valid(parent)) {
+ return is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART;
+ }
+ if ((newmask && cpumask_empty(newmask)) ||
+ (!newmask && cpumask_empty(cs->cpus_allowed)))
+ return PERR_CPUSEMPTY;
/*
- * A cpumask update cannot make parent's effective_cpus become empty.
+ * new_prs will only be changed for the partcmd_update and
+ * partcmd_invalidate commands.
*/
adding = deleting = false;
- old_prs = new_prs = cpuset->partition_root_state;
+ old_prs = new_prs = cs->partition_root_state;
if (cmd == partcmd_enable) {
- cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+ /*
+ * Enabling partition root is not allowed if cpus_allowed
+ * doesn't overlap parent's cpus_allowed.
+ */
+ if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed))
+ return PERR_INVCPUS;
+
+ /*
+ * A parent can be left with no CPU as long as there is no
+ * task directly associated with the parent partition.
+ */
+ if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) &&
+ partition_is_populated(parent, cs))
+ return PERR_NOCPUS;
+
+ cpumask_copy(tmp->addmask, cs->cpus_allowed);
adding = true;
} else if (cmd == partcmd_disable) {
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ /*
+ * Need to remove cpus from parent's subparts_cpus for valid
+ * partition root.
+ */
+ deleting = !is_prs_invalid(old_prs) &&
+ cpumask_and(tmp->delmask, cs->cpus_allowed,
parent->subparts_cpus);
+ } else if (cmd == partcmd_invalidate) {
+ if (is_prs_invalid(old_prs))
+ return 0;
+
+ /*
+ * Make the current partition invalid. It is assumed that
+ * invalidation is caused by violating cpu exclusivity rule.
+ */
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
+ if (old_prs > 0) {
+ new_prs = -old_prs;
+ part_error = PERR_NOTEXCL;
+ }
} else if (newmask) {
/*
* partcmd_update with newmask:
*
+ * Compute add/delete mask to/from subparts_cpus
+ *
* delmask = cpus_allowed & ~newmask & parent->subparts_cpus
- * addmask = newmask & parent->effective_cpus
+ * addmask = newmask & parent->cpus_allowed
* & ~parent->subparts_cpus
*/
- cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+ cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask);
deleting = cpumask_and(tmp->delmask, tmp->delmask,
parent->subparts_cpus);
- cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
+ cpumask_and(tmp->addmask, newmask, parent->cpus_allowed);
adding = cpumask_andnot(tmp->addmask, tmp->addmask,
parent->subparts_cpus);
/*
- * Return error if the new effective_cpus could become empty.
+ * Make partition invalid if parent's effective_cpus could
+ * become empty and there are tasks in the parent.
*/
if (adding &&
- cpumask_equal(parent->effective_cpus, tmp->addmask)) {
- if (!deleting)
- return -EINVAL;
- /*
- * As some of the CPUs in subparts_cpus might have
- * been offlined, we need to compute the real delmask
- * to confirm that.
- */
- if (!cpumask_and(tmp->addmask, tmp->delmask,
- cpu_active_mask))
- return -EINVAL;
- cpumask_copy(tmp->addmask, parent->effective_cpus);
+ cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+ !cpumask_intersects(tmp->delmask, cpu_active_mask) &&
+ partition_is_populated(parent, cs)) {
+ part_error = PERR_NOCPUS;
+ adding = false;
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
}
} else {
/*
* partcmd_update w/o newmask:
*
- * addmask = cpus_allowed & parent->effective_cpus
+ * delmask = cpus_allowed & parent->subparts_cpus
+ * addmask = cpus_allowed & parent->cpus_allowed
+ * & ~parent->subparts_cpus
*
- * Note that parent's subparts_cpus may have been
- * pre-shrunk in case there is a change in the cpu list.
- * So no deletion is needed.
+ * This gets invoked either due to a hotplug event or from
+ * update_cpumasks_hier(). This can cause the state of a
+ * partition root to transition from valid to invalid or vice
+ * versa. So we still need to compute the addmask and delmask.
+
+ * A partition error happens when:
+ * 1) Cpuset is valid partition, but parent does not distribute
+ * out any CPUs.
+ * 2) Parent has tasks and all its effective CPUs will have
+ * to be distributed out.
*/
- adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
- parent->effective_cpus);
- part_error = cpumask_equal(tmp->addmask,
- parent->effective_cpus);
+ cpumask_and(tmp->addmask, cs->cpus_allowed,
+ parent->cpus_allowed);
+ adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+ parent->subparts_cpus);
+
+ if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) ||
+ (adding &&
+ cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+ partition_is_populated(parent, cs))) {
+ part_error = PERR_NOCPUS;
+ adding = false;
+ }
+
+ if (part_error && is_partition_valid(cs) &&
+ parent->nr_subparts_cpus)
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
}
+ if (part_error)
+ WRITE_ONCE(cs->prs_err, part_error);
if (cmd == partcmd_update) {
- int prev_prs = cpuset->partition_root_state;
-
/*
- * Check for possible transition between PRS_ENABLED
- * and PRS_ERROR.
+ * Check for possible transition between valid and invalid
+ * partition root.
*/
- switch (cpuset->partition_root_state) {
- case PRS_ENABLED:
+ switch (cs->partition_root_state) {
+ case PRS_ROOT:
+ case PRS_ISOLATED:
if (part_error)
- new_prs = PRS_ERROR;
+ new_prs = -old_prs;
break;
- case PRS_ERROR:
+ case PRS_INVALID_ROOT:
+ case PRS_INVALID_ISOLATED:
if (!part_error)
- new_prs = PRS_ENABLED;
+ new_prs = -old_prs;
break;
}
- /*
- * Set part_error if previously in invalid state.
- */
- part_error = (prev_prs == PRS_ERROR);
- }
-
- if (!part_error && (new_prs == PRS_ERROR))
- return 0; /* Nothing need to be done */
-
- if (new_prs == PRS_ERROR) {
- /*
- * Remove all its cpus from parent's subparts_cpus.
- */
- adding = false;
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
- parent->subparts_cpus);
}
if (!adding && !deleting && (new_prs == old_prs))
return 0;
/*
+ * Transitioning between invalid to valid or vice versa may require
+ * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE.
+ */
+ if (old_prs != new_prs) {
+ if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) &&
+ (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0))
+ return PERR_NOTEXCL;
+ if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs))
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ }
+
+ /*
* Change the parent's subparts_cpus.
* Newly added CPUs will be removed from effective_cpus and
* newly deleted ones will be added back to effective_cpus.
@@ -1369,18 +1504,32 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
if (old_prs != new_prs)
- cpuset->partition_root_state = new_prs;
+ cs->partition_root_state = new_prs;
spin_unlock_irq(&callback_lock);
- notify_partition_change(cpuset, old_prs, new_prs);
- return cmd == partcmd_update;
+ if (adding || deleting)
+ update_tasks_cpumask(parent);
+
+ /*
+ * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
+ * rebuild_sched_domains_locked() may be called.
+ */
+ if (old_prs != new_prs) {
+ if (old_prs == PRS_ISOLATED)
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
+ else if (new_prs == PRS_ISOLATED)
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ }
+ notify_partition_change(cs, old_prs);
+ return 0;
}
/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
+ * @force: don't skip any descendant cpusets if set
*
* When configured cpumask is changed, the effective cpumasks of this cpuset
* and all its descendants need to be updated.
@@ -1389,7 +1538,8 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
*
* Called with cpuset_rwsem held
*/
-static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+ bool force)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@@ -1399,14 +1549,21 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
+ bool update_parent = false;
compute_effective_cpumask(tmp->new_cpus, cp, parent);
/*
* If it becomes empty, inherit the effective mask of the
- * parent, which is guaranteed to have some CPUs.
+ * parent, which is guaranteed to have some CPUs unless
+ * it is a partition root that has explicitly distributed
+ * out all its CPUs.
*/
if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
+ if (is_partition_valid(cp) &&
+ cpumask_equal(cp->cpus_allowed, cp->subparts_cpus))
+ goto update_parent_subparts;
+
cpumask_copy(tmp->new_cpus, parent->effective_cpus);
if (!cp->use_parent_ecpus) {
cp->use_parent_ecpus = true;
@@ -1420,14 +1577,15 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
/*
* Skip the whole subtree if the cpumask remains the same
- * and has no partition root state.
+ * and has no partition root state and force flag not set.
*/
- if (!cp->partition_root_state &&
+ if (!cp->partition_root_state && !force &&
cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
+update_parent_subparts:
/*
* update_parent_subparts_cpumask() should have been called
* for cs already in update_cpumask(). We should also call
@@ -1437,36 +1595,22 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
old_prs = new_prs = cp->partition_root_state;
if ((cp != cs) && old_prs) {
switch (parent->partition_root_state) {
- case PRS_DISABLED:
- /*
- * If parent is not a partition root or an
- * invalid partition root, clear its state
- * and its CS_CPU_EXCLUSIVE flag.
- */
- WARN_ON_ONCE(cp->partition_root_state
- != PRS_ERROR);
- new_prs = PRS_DISABLED;
-
- /*
- * clear_bit() is an atomic operation and
- * readers aren't interested in the state
- * of CS_CPU_EXCLUSIVE anyway. So we can
- * just update the flag without holding
- * the callback_lock.
- */
- clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
- break;
-
- case PRS_ENABLED:
- if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
- update_tasks_cpumask(parent);
+ case PRS_ROOT:
+ case PRS_ISOLATED:
+ update_parent = true;
break;
- case PRS_ERROR:
+ default:
/*
- * When parent is invalid, it has to be too.
+ * When parent is not a partition root or is
+ * invalid, child partition roots become
+ * invalid too.
*/
- new_prs = PRS_ERROR;
+ if (is_partition_valid(cp))
+ new_prs = -cp->partition_root_state;
+ WRITE_ONCE(cp->prs_err,
+ is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART);
break;
}
}
@@ -1475,42 +1619,44 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
continue;
rcu_read_unlock();
+ if (update_parent) {
+ update_parent_subparts_cpumask(cp, partcmd_update, NULL,
+ tmp);
+ /*
+ * The cpuset partition_root_state may become
+ * invalid. Capture it.
+ */
+ new_prs = cp->partition_root_state;
+ }
+
spin_lock_irq(&callback_lock);
- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
+ if (cp->nr_subparts_cpus && !is_partition_valid(cp)) {
+ /*
+ * Put all active subparts_cpus back to effective_cpus.
+ */
+ cpumask_or(tmp->new_cpus, tmp->new_cpus,
+ cp->subparts_cpus);
+ cpumask_and(tmp->new_cpus, tmp->new_cpus,
+ cpu_active_mask);
cp->nr_subparts_cpus = 0;
cpumask_clear(cp->subparts_cpus);
- } else if (cp->nr_subparts_cpus) {
+ }
+
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ if (cp->nr_subparts_cpus) {
/*
* Make sure that effective_cpus & subparts_cpus
* are mutually exclusive.
- *
- * In the unlikely event that effective_cpus
- * becomes empty. we clear cp->nr_subparts_cpus and
- * let its child partition roots to compete for
- * CPUs again.
*/
cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
cp->subparts_cpus);
- if (cpumask_empty(cp->effective_cpus)) {
- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- cpumask_clear(cp->subparts_cpus);
- cp->nr_subparts_cpus = 0;
- } else if (!cpumask_subset(cp->subparts_cpus,
- tmp->new_cpus)) {
- cpumask_andnot(cp->subparts_cpus,
- cp->subparts_cpus, tmp->new_cpus);
- cp->nr_subparts_cpus
- = cpumask_weight(cp->subparts_cpus);
- }
}
- if (new_prs != old_prs)
- cp->partition_root_state = new_prs;
-
+ cp->partition_root_state = new_prs;
spin_unlock_irq(&callback_lock);
- notify_partition_change(cp, old_prs, new_prs);
+
+ notify_partition_change(cp, old_prs);
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -1526,7 +1672,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
if (!cpumask_empty(cp->cpus_allowed) &&
is_sched_load_balance(cp) &&
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
- is_partition_root(cp)))
+ is_partition_valid(cp)))
need_rebuild_sched_domains = true;
rcu_read_lock();
@@ -1570,7 +1716,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
rcu_read_unlock();
- update_cpumasks_hier(sibling, tmp);
+ update_cpumasks_hier(sibling, tmp, false);
rcu_read_lock();
css_put(&sibling->css);
}
@@ -1588,6 +1734,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
{
int retval;
struct tmpmasks tmp;
+ bool invalidate = false;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -1615,10 +1762,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- retval = validate_change(cs, trialcs);
- if (retval < 0)
- return retval;
-
#ifdef CONFIG_CPUMASK_OFFSTACK
/*
* Use the cpumasks in trialcs for tmpmasks when they are pointers
@@ -1629,28 +1772,70 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
tmp.new_cpus = trialcs->cpus_allowed;
#endif
+ retval = validate_change(cs, trialcs);
+
+ if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ struct cpuset *cp, *parent;
+ struct cgroup_subsys_state *css;
+
+ /*
+ * The -EINVAL error code indicates that partition sibling
+ * CPU exclusivity rule has been violated. We still allow
+ * the cpumask change to proceed while invalidating the
+ * partition. However, any conflicting sibling partitions
+ * have to be marked as invalid too.
+ */
+ invalidate = true;
+ rcu_read_lock();
+ parent = parent_cs(cs);
+ cpuset_for_each_child(cp, css, parent)
+ if (is_partition_valid(cp) &&
+ cpumask_intersects(trialcs->cpus_allowed, cp->cpus_allowed)) {
+ rcu_read_unlock();
+ update_parent_subparts_cpumask(cp, partcmd_invalidate, NULL, &tmp);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+ retval = 0;
+ }
+ if (retval < 0)
+ return retval;
+
if (cs->partition_root_state) {
- /* Cpumask of a partition root cannot be empty */
- if (cpumask_empty(trialcs->cpus_allowed))
- return -EINVAL;
- if (update_parent_subparts_cpumask(cs, partcmd_update,
- trialcs->cpus_allowed, &tmp) < 0)
- return -EINVAL;
+ if (invalidate)
+ update_parent_subparts_cpumask(cs, partcmd_invalidate,
+ NULL, &tmp);
+ else
+ update_parent_subparts_cpumask(cs, partcmd_update,
+ trialcs->cpus_allowed, &tmp);
}
+ compute_effective_cpumask(trialcs->effective_cpus, trialcs,
+ parent_cs(cs));
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
/*
- * Make sure that subparts_cpus is a subset of cpus_allowed.
+ * Make sure that subparts_cpus, if not empty, is a subset of
+ * cpus_allowed. Clear subparts_cpus if partition not valid or
+ * empty effective cpus with tasks.
*/
if (cs->nr_subparts_cpus) {
- cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
- cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ if (!is_partition_valid(cs) ||
+ (cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus) &&
+ partition_is_populated(cs, NULL))) {
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ } else {
+ cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
+ cs->cpus_allowed);
+ cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ }
}
spin_unlock_irq(&callback_lock);
- update_cpumasks_hier(cs, &tmp);
+ /* effective_cpus will be updated here */
+ update_cpumasks_hier(cs, &tmp, false);
if (cs->partition_root_state) {
struct cpuset *parent = parent_cs(cs);
@@ -1972,7 +2157,7 @@ static void update_tasks_flags(struct cpuset *cs)
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
- cpuset_update_task_spread_flag(cs, task);
+ cpuset_update_task_spread_flags(cs, task);
css_task_iter_end(&it);
}
@@ -2026,16 +2211,18 @@ out:
return err;
}
-/*
+/**
* update_prstate - update partition_root_state
- * cs: the cpuset to update
- * new_prs: new partition root state
+ * @cs: the cpuset to update
+ * @new_prs: new partition root state
+ * Return: 0 if successful, != 0 if error
*
* Call with cpuset_rwsem held.
*/
static int update_prstate(struct cpuset *cs, int new_prs)
{
- int err, old_prs = cs->partition_root_state;
+ int err = PERR_NONE, old_prs = cs->partition_root_state;
+ bool sched_domain_rebuilt = false;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
@@ -2043,28 +2230,33 @@ static int update_prstate(struct cpuset *cs, int new_prs)
return 0;
/*
- * Cannot force a partial or invalid partition root to a full
- * partition root.
+ * For a previously invalid partition root, leave it at being
+ * invalid if new_prs is not "member".
*/
- if (new_prs && (old_prs == PRS_ERROR))
- return -EINVAL;
+ if (new_prs && is_prs_invalid(old_prs)) {
+ cs->partition_root_state = -new_prs;
+ return 0;
+ }
if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
- err = -EINVAL;
if (!old_prs) {
/*
* Turning on partition root requires setting the
* CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
- * cannot be NULL.
+ * cannot be empty.
*/
- if (cpumask_empty(cs->cpus_allowed))
+ if (cpumask_empty(cs->cpus_allowed)) {
+ err = PERR_CPUSEMPTY;
goto out;
+ }
err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
- if (err)
+ if (err) {
+ err = PERR_NOTEXCL;
goto out;
+ }
err = update_parent_subparts_cpumask(cs, partcmd_enable,
NULL, &tmpmask);
@@ -2072,47 +2264,77 @@ static int update_prstate(struct cpuset *cs, int new_prs)
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
goto out;
}
+
+ if (new_prs == PRS_ISOLATED) {
+ /*
+ * Disable the load balance flag should not return an
+ * error unless the system is running out of memory.
+ */
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ sched_domain_rebuilt = true;
+ }
+ } else if (old_prs && new_prs) {
+ /*
+ * A change in load balance state only, no change in cpumasks.
+ */
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED));
+ sched_domain_rebuilt = true;
+ goto out; /* Sched domain is rebuilt in update_flag() */
} else {
/*
- * Turning off partition root will clear the
- * CS_CPU_EXCLUSIVE bit.
+ * Switching back to member is always allowed even if it
+ * disables child partitions.
*/
- if (old_prs == PRS_ERROR) {
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
- err = 0;
- goto out;
- }
+ update_parent_subparts_cpumask(cs, partcmd_disable, NULL,
+ &tmpmask);
- err = update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, &tmpmask);
- if (err)
- goto out;
+ /*
+ * If there are child partitions, they will all become invalid.
+ */
+ if (unlikely(cs->nr_subparts_cpus)) {
+ spin_lock_irq(&callback_lock);
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ compute_effective_cpumask(cs->effective_cpus, cs, parent);
+ spin_unlock_irq(&callback_lock);
+ }
/* Turning off CS_CPU_EXCLUSIVE will not return error */
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+
+ if (!is_sched_load_balance(cs)) {
+ /* Make sure load balance is on */
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
+ sched_domain_rebuilt = true;
+ }
}
- /*
- * Update cpumask of parent's tasks except when it is the top
- * cpuset as some system daemons cannot be mapped to other CPUs.
- */
- if (parent != &top_cpuset)
- update_tasks_cpumask(parent);
+ update_tasks_cpumask(parent);
if (parent->child_ecpus_count)
update_sibling_cpumasks(parent, cs, &tmpmask);
- rebuild_sched_domains_locked();
+ if (!sched_domain_rebuilt)
+ rebuild_sched_domains_locked();
out:
- if (!err) {
- spin_lock_irq(&callback_lock);
- cs->partition_root_state = new_prs;
- spin_unlock_irq(&callback_lock);
- notify_partition_change(cs, old_prs, new_prs);
- }
+ /*
+ * Make partition invalid if an error happen
+ */
+ if (err)
+ new_prs = -new_prs;
+ spin_lock_irq(&callback_lock);
+ cs->partition_root_state = new_prs;
+ spin_unlock_irq(&callback_lock);
+ /*
+ * Update child cpusets, if present.
+ * Force update if switching back to member.
+ */
+ if (!list_empty(&cs->css.children))
+ update_cpumasks_hier(cs, &tmpmask, !new_prs);
+ notify_partition_change(cs, old_prs);
free_cpumasks(NULL, &tmpmask);
- return err;
+ return 0;
}
/*
@@ -2238,6 +2460,12 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
+ /*
+ * Task cannot be moved to a cpuset with empty effective cpus.
+ */
+ if (cpumask_empty(cs->effective_cpus))
+ goto out_unlock;
+
cgroup_taskset_for_each(task, css, tset) {
ret = task_can_attach(task, cs->effective_cpus);
if (ret)
@@ -2285,12 +2513,28 @@ static void cpuset_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
+ bool cpus_updated, mems_updated;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
percpu_down_write(&cpuset_rwsem);
+ cpus_updated = !cpumask_equal(cs->effective_cpus,
+ oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
+ /*
+ * In the default hierarchy, enabling cpuset in the child cgroups
+ * will trigger a number of cpuset_attach() calls with no change
+ * in effective cpus and mems. In that case, we can optimize out
+ * by skipping the task iteration and update.
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !cpus_updated && !mems_updated) {
+ cpuset_attach_nodemask_to = cs->effective_mems;
+ goto out;
+ }
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2306,14 +2550,19 @@ static void cpuset_attach(struct cgroup_taskset *tset)
WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flag(cs, task);
+ cpuset_update_task_spread_flags(cs, task);
}
/*
* Change mm for all threadgroup leaders. This is expensive and may
- * sleep and should be moved outside migration path proper.
+ * sleep and should be moved outside migration path proper. Skip it
+ * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
+ * not set.
*/
cpuset_attach_nodemask_to = cs->effective_mems;
+ if (!is_memory_migrate(cs) && !mems_updated)
+ goto out;
+
cgroup_taskset_for_each_leader(leader, css, tset) {
struct mm_struct *mm = get_task_mm(leader);
@@ -2336,6 +2585,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
}
}
+out:
cs->old_mems_allowed = cpuset_attach_nodemask_to;
cs->attach_in_progress--;
@@ -2598,16 +2848,29 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
static int sched_partition_show(struct seq_file *seq, void *v)
{
struct cpuset *cs = css_cs(seq_css(seq));
+ const char *err, *type = NULL;
switch (cs->partition_root_state) {
- case PRS_ENABLED:
+ case PRS_ROOT:
seq_puts(seq, "root\n");
break;
- case PRS_DISABLED:
+ case PRS_ISOLATED:
+ seq_puts(seq, "isolated\n");
+ break;
+ case PRS_MEMBER:
seq_puts(seq, "member\n");
break;
- case PRS_ERROR:
- seq_puts(seq, "root invalid\n");
+ case PRS_INVALID_ROOT:
+ type = "root";
+ fallthrough;
+ case PRS_INVALID_ISOLATED:
+ if (!type)
+ type = "isolated";
+ err = perr_strings[READ_ONCE(cs->prs_err)];
+ if (err)
+ seq_printf(seq, "%s invalid (%s)\n", type, err);
+ else
+ seq_printf(seq, "%s invalid\n", type);
break;
}
return 0;
@@ -2626,9 +2889,11 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
* Convert "root" to ENABLED, and convert "member" to DISABLED.
*/
if (!strcmp(buf, "root"))
- val = PRS_ENABLED;
+ val = PRS_ROOT;
else if (!strcmp(buf, "member"))
- val = PRS_DISABLED;
+ val = PRS_MEMBER;
+ else if (!strcmp(buf, "isolated"))
+ val = PRS_ISOLATED;
else
return -EINVAL;
@@ -2807,11 +3072,15 @@ static struct cftype dfl_files[] = {
};
-/*
- * cpuset_css_alloc - allocate a cpuset css
- * cgrp: control group that the new cpuset will be part of
+/**
+ * cpuset_css_alloc - Allocate a cpuset css
+ * @parent_css: Parent css of the control group that the new cpuset will be
+ * part of
+ * Return: cpuset css on success, -ENOMEM on failure.
+ *
+ * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return
+ * top cpuset css otherwise.
*/
-
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -2927,7 +3196,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
- if (is_partition_root(cs))
+ if (is_partition_valid(cs))
update_prstate(cs, 0);
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
@@ -3103,7 +3372,8 @@ hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated)
{
- if (cpumask_empty(new_cpus))
+ /* A partition root is allowed to have empty effective cpus */
+ if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
@@ -3172,11 +3442,31 @@ retry:
/*
* In the unlikely event that a partition root has empty
- * effective_cpus or its parent becomes erroneous, we have to
- * transition it to the erroneous state.
+ * effective_cpus with tasks, we will have to invalidate child
+ * partitions, if present, by setting nr_subparts_cpus to 0 to
+ * reclaim their cpus.
*/
- if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
- (parent->partition_root_state == PRS_ERROR))) {
+ if (cs->nr_subparts_cpus && is_partition_valid(cs) &&
+ cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) {
+ spin_lock_irq(&callback_lock);
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ spin_unlock_irq(&callback_lock);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ }
+
+ /*
+ * Force the partition to become invalid if either one of
+ * the following conditions hold:
+ * 1) empty effective cpus but not valid empty partition.
+ * 2) parent is invalid or doesn't grant any cpus to child
+ * partitions.
+ */
+ if (is_partition_valid(cs) && (!parent->nr_subparts_cpus ||
+ (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) {
+ int old_prs, parent_prs;
+
+ update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp);
if (cs->nr_subparts_cpus) {
spin_lock_irq(&callback_lock);
cs->nr_subparts_cpus = 0;
@@ -3185,39 +3475,32 @@ retry:
compute_effective_cpumask(&new_cpus, cs, parent);
}
- /*
- * If the effective_cpus is empty because the child
- * partitions take away all the CPUs, we can keep
- * the current partition and let the child partitions
- * fight for available CPUs.
- */
- if ((parent->partition_root_state == PRS_ERROR) ||
- cpumask_empty(&new_cpus)) {
- int old_prs;
-
- update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, tmp);
- old_prs = cs->partition_root_state;
- if (old_prs != PRS_ERROR) {
- spin_lock_irq(&callback_lock);
- cs->partition_root_state = PRS_ERROR;
- spin_unlock_irq(&callback_lock);
- notify_partition_change(cs, old_prs, PRS_ERROR);
- }
+ old_prs = cs->partition_root_state;
+ parent_prs = parent->partition_root_state;
+ if (is_partition_valid(cs)) {
+ spin_lock_irq(&callback_lock);
+ make_partition_invalid(cs);
+ spin_unlock_irq(&callback_lock);
+ if (is_prs_invalid(parent_prs))
+ WRITE_ONCE(cs->prs_err, PERR_INVPARENT);
+ else if (!parent_prs)
+ WRITE_ONCE(cs->prs_err, PERR_NOTPART);
+ else
+ WRITE_ONCE(cs->prs_err, PERR_HOTPLUG);
+ notify_partition_change(cs, old_prs);
}
cpuset_force_rebuild();
}
/*
- * On the other hand, an erroneous partition root may be transitioned
- * back to a regular one or a partition root with no CPU allocated
- * from the parent may change to erroneous.
+ * On the other hand, an invalid partition root may be transitioned
+ * back to a regular one.
*/
- if (is_partition_root(parent) &&
- ((cs->partition_root_state == PRS_ERROR) ||
- !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
- update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
- cpuset_force_rebuild();
+ else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
+ update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp);
+ if (is_partition_valid(cs))
+ cpuset_force_rebuild();
+ }
update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
@@ -3377,11 +3660,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block cpuset_track_online_nodes_nb = {
- .notifier_call = cpuset_track_online_nodes,
- .priority = 10, /* ??! */
-};
-
/**
* cpuset_init_smp - initialize cpus_allowed
*
@@ -3399,7 +3677,7 @@ void __init cpuset_init_smp(void)
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+ hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 08236798d173..1b6b21851e9d 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -113,7 +113,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
if (parent && (parent->state & CGROUP_FREEZING)) {
freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
}
mutex_unlock(&freezer_mutex);
@@ -134,7 +134,7 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec(&freezer_active);
freezer->state = 0;
@@ -179,6 +179,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
__thaw_task(task);
} else {
freeze_task(task);
+
/* clear FROZEN and propagate upwards */
while (freezer && (freezer->state & CGROUP_FROZEN)) {
freezer->state &= ~CGROUP_FROZEN;
@@ -271,16 +272,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
- if (freezing(task)) {
- /*
- * freezer_should_skip() indicates that the task
- * should be skipped when determining freezing
- * completion. Consider it frozen in addition to
- * the usual frozen condition.
- */
- if (!frozen(task) && !freezer_should_skip(task))
- goto out_iter_end;
- }
+ if (freezing(task) && !frozen(task))
+ goto out_iter_end;
}
freezer->state |= CGROUP_FROZEN;
@@ -357,7 +350,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
if (freeze) {
if (!(freezer->state & CGROUP_FREEZING))
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
freezer->state |= state;
freeze_cgroup(freezer);
} else {
@@ -366,9 +359,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
freezer->state &= ~state;
if (!(freezer->state & CGROUP_FREEZING)) {
- if (was_freezing)
- atomic_dec(&system_freezing_cnt);
freezer->state &= ~CGROUP_FROZEN;
+ if (was_freezing)
+ static_branch_dec(&freezer_active);
unfreeze_cgroup(freezer);
}
}
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 511af87f685e..7695e60bcb40 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -47,6 +47,7 @@ struct pids_cgroup {
*/
atomic64_t counter;
atomic64_t limit;
+ int64_t watermark;
/* Handle for "pids.events" */
struct cgroup_file events_file;
@@ -85,6 +86,16 @@ static void pids_css_free(struct cgroup_subsys_state *css)
kfree(css_pids(css));
}
+static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
+{
+ /*
+ * This is racy, but we don't need perfectly accurate tallying of
+ * the watermark, and this lets us avoid extra atomic overhead.
+ */
+ if (nr_pids > READ_ONCE(p->watermark))
+ WRITE_ONCE(p->watermark, nr_pids);
+}
+
/**
* pids_cancel - uncharge the local pid count
* @pids: the pid cgroup state
@@ -128,8 +139,11 @@ static void pids_charge(struct pids_cgroup *pids, int num)
{
struct pids_cgroup *p;
- for (p = pids; parent_pids(p); p = parent_pids(p))
- atomic64_add(num, &p->counter);
+ for (p = pids; parent_pids(p); p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ pids_update_watermark(p, new);
+ }
}
/**
@@ -156,6 +170,12 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
*/
if (new > limit)
goto revert;
+
+ /*
+ * Not technically accurate if we go over limit somewhere up
+ * the hierarchy, but that's tolerable for the watermark.
+ */
+ pids_update_watermark(p, new);
}
return 0;
@@ -311,6 +331,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
return atomic64_read(&pids->counter);
}
+static s64 pids_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return READ_ONCE(pids->watermark);
+}
+
static int pids_events_show(struct seq_file *sf, void *v)
{
struct pids_cgroup *pids = css_pids(seq_css(sf));
@@ -332,6 +360,11 @@ static struct cftype pids_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
},
{
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = pids_peak_read,
+ },
+ {
.name = "events",
.seq_show = pids_events_show,
.file_offset = offsetof(struct pids_cgroup, events_file),
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index feb59380c896..793ecff29038 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -3,6 +3,10 @@
#include <linux/sched/cputime.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
@@ -141,6 +145,31 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
return pos;
}
+/*
+ * A hook for bpf stat collectors to attach to and flush their stats.
+ * Together with providing bpf kfuncs for cgroup_rstat_updated() and
+ * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
+ * collect cgroup stats can integrate with rstat for efficient flushing.
+ *
+ * A static noinline declaration here could cause the compiler to optimize away
+ * the function. A global noinline declaration will keep the definition, but may
+ * optimize away the callsite. Therefore, __weak is needed to ensure that the
+ * call is still emitted, by telling the compiler that we don't know what the
+ * function might eventually be.
+ *
+ * __diag_* below are needed to dismiss the missing prototype warning.
+ */
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "kfuncs which will be used in BPF programs");
+
+__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
+ struct cgroup *parent, int cpu)
+{
+}
+
+__diag_pop();
+
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -168,6 +197,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
+ bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
rcu_read_lock();
list_for_each_entry_rcu(css, &pos->rstat_css_list,
@@ -501,3 +531,21 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}
+
+/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
+BTF_SET8_START(bpf_rstat_kfunc_ids)
+BTF_ID_FLAGS(func, cgroup_rstat_updated)
+BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
+BTF_SET8_END(bpf_rstat_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_rstat_kfunc_ids,
+};
+
+static int __init bpf_rstat_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_rstat_kfunc_set);
+}
+late_initcall(bpf_rstat_kfunc_init);
diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config
new file mode 100644
index 000000000000..38a7c5362c9c
--- /dev/null
+++ b/kernel/configs/rust.config
@@ -0,0 +1 @@
+CONFIG_RUST=y
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index 8a44b93da0f3..c2f9c912df1c 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -7,5 +7,6 @@ CONFIG_KERNEL_XZ=y
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
# CONFIG_SLAB is not set
-# CONFIG_SLUB is not set
-CONFIG_SLOB=y
+# CONFIG_SLOB_DEPRECATED is not set
+CONFIG_SLUB=y
+CONFIG_SLUB_TINY=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index bbad5e375d3b..6c0a92ca6bb5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -663,21 +663,51 @@ static bool cpuhp_next_state(bool bringup,
return true;
}
-static int cpuhp_invoke_callback_range(bool bringup,
- unsigned int cpu,
- struct cpuhp_cpu_state *st,
- enum cpuhp_state target)
+static int __cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target,
+ bool nofail)
{
enum cpuhp_state state;
- int err = 0;
+ int ret = 0;
while (cpuhp_next_state(bringup, &state, st, target)) {
+ int err;
+
err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
- if (err)
+ if (!err)
+ continue;
+
+ if (nofail) {
+ pr_warn("CPU %u %s state %s (%d) failed (%d)\n",
+ cpu, bringup ? "UP" : "DOWN",
+ cpuhp_get_step(st->state)->name,
+ st->state, err);
+ ret = -1;
+ } else {
+ ret = err;
break;
+ }
}
- return err;
+ return ret;
+}
+
+static inline int cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false);
+}
+
+static inline void cpuhp_invoke_callback_range_nofail(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ __cpuhp_invoke_callback_range(bringup, cpu, st, target, true);
}
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
@@ -999,7 +1029,6 @@ static int take_cpu_down(void *_param)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
- int ret;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
@@ -1012,13 +1041,10 @@ static int take_cpu_down(void *_param)
*/
WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
- /* Invoke the former CPU_DYING callbacks */
- ret = cpuhp_invoke_callback_range(false, cpu, st, target);
-
/*
- * DYING must not fail!
+ * Invoke the former CPU_DYING callbacks. DYING must not fail!
*/
- WARN_ON_ONCE(ret);
+ cpuhp_invoke_callback_range_nofail(false, cpu, st, target);
/* Give up timekeeping duties */
tick_handover_do_timer();
@@ -1296,16 +1322,14 @@ void notify_cpu_starting(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
- int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
cpumask_set_cpu(cpu, &cpus_booted_once_mask);
- ret = cpuhp_invoke_callback_range(true, cpu, st, target);
/*
* STARTING must not fail!
*/
- WARN_ON_ONCE(ret);
+ cpuhp_invoke_callback_range_nofail(true, cpu, st, target);
}
/*
@@ -2326,8 +2350,10 @@ static ssize_t target_store(struct device *dev, struct device_attribute *attr,
if (st->state < target)
ret = cpu_up(dev->id, target);
- else
+ else if (st->state > target)
ret = cpu_down(dev->id, target);
+ else if (WARN_ON(st->target != target))
+ st->target = target;
out:
unlock_device_hotplug();
return ret ? ret : count;
@@ -2688,6 +2714,7 @@ void __init boot_cpu_hotplug_init(void)
cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
+ this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
}
/*
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index a0eb4d5cf557..87ef6096823f 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -383,6 +383,9 @@ void vmcoreinfo_append_str(const char *fmt, ...)
memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
vmcoreinfo_size += r;
+
+ WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES,
+ "vmcoreinfo data exceeds allocated size, truncating");
}
/*
diff --git a/kernel/cred.c b/kernel/cred.c
index e10c15f51c1f..811ad654abd1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -701,9 +701,9 @@ void __init cred_init(void)
* override a task's own credentials so that work can be done on behalf of that
* task that requires a different subjective context.
*
- * @daemon is used to provide a base for the security record, but can be NULL.
- * If @daemon is supplied, then the security data will be derived from that;
- * otherwise they'll be set to 0 and no groups, full capabilities and no keys.
+ * @daemon is used to provide a base cred, with the security data derived from
+ * that; if this is "&init_task", they'll be set to 0, no groups, full
+ * capabilities, and no keys.
*
* The caller may change these controls afterwards if desired.
*
@@ -714,17 +714,16 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
const struct cred *old;
struct cred *new;
+ if (WARN_ON_ONCE(!daemon))
+ return NULL;
+
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
kdebug("prepare_kernel_cred() alloc %p", new);
- if (daemon)
- old = get_task_cred(daemon);
- else
- old = get_cred(&init_cred);
-
+ old = get_task_cred(daemon);
validate_creds(old);
*new = *old;
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 7beceb447211..d5e9ccde3ab8 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -50,7 +50,6 @@
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <linux/irq.h>
#include <linux/security.h>
@@ -283,17 +282,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
if (!CACHE_FLUSH_IS_SAFE)
return;
- if (current->mm) {
- int i;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- if (!current->vmacache.vmas[i])
- continue;
- flush_cache_range(current->vmacache.vmas[i],
- addr, addr + BREAK_INSTR_SIZE);
- }
- }
-
/* Force flush instruction cache if it was outside the mm */
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 67d3c48a1522..5c7e9ba7cd6b 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -545,6 +545,7 @@ static void kdb_msg_write(const char *msg, int msg_len)
{
struct console *c;
const char *cp;
+ int cookie;
int len;
if (msg_len == 0)
@@ -558,8 +559,20 @@ static void kdb_msg_write(const char *msg, int msg_len)
cp++;
}
- for_each_console(c) {
- if (!(c->flags & CON_ENABLED))
+ /*
+ * The console_srcu_read_lock() only provides safe console list
+ * traversal. The use of the ->write() callback relies on all other
+ * CPUs being stopped at the moment and console drivers being able to
+ * handle reentrance when @oops_in_progress is set.
+ *
+ * There is no guarantee that every console driver can handle
+ * reentrance in this way; the developer deploying the debugger
+ * is responsible for ensuring that the console drivers they
+ * have selected handle reentrance appropriately.
+ */
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if (!(console_srcu_read_flags(c) & CON_ENABLED))
continue;
if (c == dbg_io_ops->cons)
continue;
@@ -577,6 +590,7 @@ static void kdb_msg_write(const char *msg, int msg_len)
--oops_in_progress;
touch_nmi_watchdog();
}
+ console_srcu_read_unlock(cookie);
}
int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 164ed9ef77a3..e39cb696cfbd 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -214,13 +214,22 @@ void __delayacct_freepages_end(void)
&current->delays->freepages_count);
}
-void __delayacct_thrashing_start(void)
+void __delayacct_thrashing_start(bool *in_thrashing)
{
+ *in_thrashing = !!current->in_thrashing;
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 1;
current->delays->thrashing_start = local_clock();
}
-void __delayacct_thrashing_end(void)
+void __delayacct_thrashing_end(bool *in_thrashing)
{
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 0;
delayacct_end(&current->delays->lock,
&current->delays->thrashing_start,
&current->delays->thrashing_delay,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 27f272381cf2..68106e3791f6 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -10,6 +10,7 @@
#include <linux/dma-map-ops.h>
#include <linux/export.h>
#include <linux/gfp.h>
+#include <linux/kmsan.h>
#include <linux/of_device.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -156,6 +157,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
else
addr = ops->map_page(dev, page, offset, size, dir, attrs);
+ kmsan_handle_dma(page, offset, size, dir);
debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
return addr;
@@ -194,11 +196,13 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
- if (ents > 0)
+ if (ents > 0) {
+ kmsan_handle_dma_sg(sg, nents, dir);
debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
- else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
- ents != -EIO && ents != -EREMOTEIO))
+ } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
+ ents != -EIO && ents != -EREMOTEIO)) {
return -EIO;
+ }
return ents;
}
@@ -494,6 +498,14 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
WARN_ON_ONCE(!dev->coherent_dma_mask);
+ /*
+ * DMA allocations can never be turned back into a page pointer, so
+ * requesting compound pages doesn't make sense (and can't even be
+ * supported at all by various backends).
+ */
+ if (WARN_ON_ONCE(flag & __GFP_COMP))
+ return NULL;
+
if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
return cpu_addr;
@@ -548,6 +560,8 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size,
return NULL;
if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM)))
return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
size = PAGE_ALIGN(size);
if (dma_alloc_direct(dev, ops))
@@ -633,6 +647,8 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES))
return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
if (ops && ops->alloc_noncontiguous)
sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 0ef6b12f961d..a34c38bbe28f 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -300,6 +300,37 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
return;
}
+static void *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs))
+{
+ size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
+ void *tlb;
+
+ /*
+ * By default allocate the bounce buffer memory from low memory, but
+ * allow to pick a location everywhere for hypervisors with guest
+ * memory encryption.
+ */
+ if (flags & SWIOTLB_ANY)
+ tlb = memblock_alloc(bytes, PAGE_SIZE);
+ else
+ tlb = memblock_alloc_low(bytes, PAGE_SIZE);
+
+ if (!tlb) {
+ pr_warn("%s: Failed to allocate %zu bytes tlb structure\n",
+ __func__, bytes);
+ return NULL;
+ }
+
+ if (remap && remap(tlb, nslabs) < 0) {
+ memblock_free(tlb, PAGE_ALIGN(bytes));
+ pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes);
+ return NULL;
+ }
+
+ return tlb;
+}
+
/*
* Statically reserve bounce buffer space and initialize bounce buffer data
* structures for the software IO TLB used to implement the DMA API.
@@ -310,7 +341,6 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
struct io_tlb_mem *mem = &io_tlb_default_mem;
unsigned long nslabs;
size_t alloc_size;
- size_t bytes;
void *tlb;
if (!addressing_limit && !swiotlb_force_bounce)
@@ -326,42 +356,32 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
swiotlb_adjust_nareas(num_possible_cpus());
nslabs = default_nslabs;
- /*
- * By default allocate the bounce buffer memory from low memory, but
- * allow to pick a location everywhere for hypervisors with guest
- * memory encryption.
- */
-retry:
- bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
- if (flags & SWIOTLB_ANY)
- tlb = memblock_alloc(bytes, PAGE_SIZE);
- else
- tlb = memblock_alloc_low(bytes, PAGE_SIZE);
- if (!tlb) {
- pr_warn("%s: failed to allocate tlb structure\n", __func__);
- return;
+ while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) {
+ if (nslabs <= IO_TLB_MIN_SLABS)
+ return;
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
}
- if (remap && remap(tlb, nslabs) < 0) {
- memblock_free(tlb, PAGE_ALIGN(bytes));
-
- nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
- if (nslabs < IO_TLB_MIN_SLABS)
- panic("%s: Failed to remap %zu bytes\n",
- __func__, bytes);
- goto retry;
+ if (default_nslabs != nslabs) {
+ pr_info("SWIOTLB bounce buffer size adjusted %lu -> %lu slabs",
+ default_nslabs, nslabs);
+ default_nslabs = nslabs;
}
alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
- if (!mem->slots)
- panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
- __func__, alloc_size, PAGE_SIZE);
+ if (!mem->slots) {
+ pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+ return;
+ }
mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
default_nareas), SMP_CACHE_BYTES);
- if (!mem->areas)
- panic("%s: Failed to allocate mem->areas.\n", __func__);
+ if (!mem->areas) {
+ pr_warn("%s: Failed to allocate mem->areas.\n", __func__);
+ return;
+ }
swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false,
default_nareas);
@@ -545,9 +565,8 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
}
if (PageHighMem(pfn_to_page(pfn))) {
- /* The buffer does not have a mapping. Map it in and copy */
unsigned int offset = orig_addr & ~PAGE_MASK;
- char *buffer;
+ struct page *page;
unsigned int sz = 0;
unsigned long flags;
@@ -555,12 +574,11 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
sz = min_t(size_t, PAGE_SIZE - offset, size);
local_irq_save(flags);
- buffer = kmap_atomic(pfn_to_page(pfn));
+ page = pfn_to_page(pfn);
if (dir == DMA_TO_DEVICE)
- memcpy(vaddr, buffer + offset, sz);
+ memcpy_from_page(vaddr, page, offset, sz);
else
- memcpy(buffer + offset, vaddr, sz);
- kunmap_atomic(buffer);
+ memcpy_to_page(page, offset, vaddr, sz);
local_irq_restore(flags);
size -= sz;
@@ -731,8 +749,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
int index;
phys_addr_t tlb_addr;
- if (!mem || !mem->nslabs)
- panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+ if (!mem || !mem->nslabs) {
+ dev_warn_ratelimited(dev,
+ "Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 063068a9ea9b..846add8394c4 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -5,6 +5,7 @@
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
+#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>
@@ -24,6 +25,7 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
user_exit_irqoff();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
}
@@ -352,6 +354,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
lockdep_hardirqs_off(CALLER_ADDR0);
ct_irq_enter();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
@@ -367,6 +370,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
*/
lockdep_hardirqs_off(CALLER_ADDR0);
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
rcu_irq_enter_check_tick();
trace_hardirqs_off_finish();
instrumentation_end();
@@ -452,6 +456,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
ct_nmi_enter();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
ftrace_nmi_enter();
instrumentation_end();
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 8591c180b52b..91a62f566743 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,4 +2,5 @@
obj-y := core.o ring_buffer.o callchain.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o
obj-$(CONFIG_UPROBES) += uprobes.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2621fd24ad26..eacc3702654d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -54,6 +54,7 @@
#include <linux/highmem.h>
#include <linux/pgtable.h>
#include <linux/buildid.h>
+#include <linux/task_work.h>
#include "internal.h"
@@ -154,12 +155,6 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
@@ -183,6 +178,14 @@ static bool is_kernel_event(struct perf_event *event)
return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}
+static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+struct perf_event_context *perf_cpu_task_ctx(void)
+{
+ lockdep_assert_irqs_disabled();
+ return this_cpu_ptr(&perf_cpu_context)->task_ctx;
+}
+
/*
* On task ctx scheduling...
*
@@ -216,7 +219,7 @@ static int event_function(void *info)
struct event_function_struct *efs = info;
struct perf_event *event = efs->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
int ret = 0;
@@ -313,7 +316,7 @@ again:
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct task_struct *task = READ_ONCE(ctx->task);
struct perf_event_context *task_ctx = NULL;
@@ -387,7 +390,6 @@ static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -447,7 +449,7 @@ static void update_perf_cpu_limits(void)
WRITE_ONCE(perf_sample_allowed_ns, tmp);
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
int perf_proc_update_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
@@ -570,12 +572,6 @@ void perf_sample_event_took(u64 sample_len_ns)
static atomic64_t perf_event_id;
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
@@ -690,13 +686,31 @@ do { \
___p; \
})
+static void perf_ctx_disable(struct perf_event_context *ctx)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ perf_pmu_disable(pmu_ctx->pmu);
+}
+
+static void perf_ctx_enable(struct perf_event_context *ctx)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ perf_pmu_enable(pmu_ctx->pmu);
+}
+
+static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
perf_cgroup_match(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
/* @event doesn't care about cgroup */
if (!event->cgrp)
@@ -822,54 +836,39 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
}
}
-static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-
/*
* reschedule events based on the cgroup constraint of task.
*/
static void perf_cgroup_switch(struct task_struct *task)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_cgroup *cgrp;
- struct perf_cpu_context *cpuctx, *tmp;
- struct list_head *list;
- unsigned long flags;
-
- /*
- * Disable interrupts and preemption to avoid this CPU's
- * cgrp_cpuctx_entry to change under us.
- */
- local_irq_save(flags);
cgrp = perf_cgroup_from_task(task, NULL);
- list = this_cpu_ptr(&cgrp_cpuctx_list);
- list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
- WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
- if (READ_ONCE(cpuctx->cgrp) == cgrp)
- continue;
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+ if (READ_ONCE(cpuctx->cgrp) == cgrp)
+ return;
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to update_cgrp_time_from_cpuctx() in
- * ctx_sched_out()
- */
- cpuctx->cgrp = cgrp;
- /*
- * set cgrp before ctxsw in to allow
- * perf_cgroup_set_timestamp() in ctx_sched_in()
- * to not have to pass task around
- */
- cpu_ctx_sched_in(cpuctx, EVENT_ALL);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_ctx_disable(&cpuctx->ctx);
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
+ ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to update_cgrp_time_from_cpuctx() in
+ * ctx_sched_out()
+ */
+ cpuctx->cgrp = cgrp;
+ /*
+ * set cgrp before ctxsw in to allow
+ * perf_cgroup_set_timestamp() in ctx_sched_in()
+ * to not have to pass task around
+ */
+ ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
- local_irq_restore(flags);
+ perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -887,7 +886,7 @@ static int perf_cgroup_ensure_storage(struct perf_event *event,
heap_size++;
for_each_possible_cpu(cpu) {
- cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
if (heap_size <= cpuctx->heap_size)
continue;
@@ -971,8 +970,6 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
return;
cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
- list_add(&cpuctx->cgrp_cpuctx_entry,
- per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
}
static inline void
@@ -993,7 +990,6 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
return;
cpuctx->cgrp = NULL;
- list_del(&cpuctx->cgrp_cpuctx_entry);
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1068,34 +1064,30 @@ static void perf_cgroup_switch(struct task_struct *task)
*/
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;
bool rotations;
lockdep_assert_irqs_disabled();
- cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
- rotations = perf_rotate_context(cpuctx);
+ cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
+ rotations = perf_rotate_context(cpc);
- raw_spin_lock(&cpuctx->hrtimer_lock);
+ raw_spin_lock(&cpc->hrtimer_lock);
if (rotations)
- hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+ hrtimer_forward_now(hr, cpc->hrtimer_interval);
else
- cpuctx->hrtimer_active = 0;
- raw_spin_unlock(&cpuctx->hrtimer_lock);
+ cpc->hrtimer_active = 0;
+ raw_spin_unlock(&cpc->hrtimer_lock);
return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}
-static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
+ struct pmu *pmu = cpc->epc.pmu;
u64 interval;
- /* no multiplexing needed for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return;
-
/*
* check default is sane, if not set then force to
* default interval (1/tick)
@@ -1104,34 +1096,34 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
if (interval < 1)
interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
- raw_spin_lock_init(&cpuctx->hrtimer_lock);
+ raw_spin_lock_init(&cpc->hrtimer_lock);
hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
timer->function = perf_mux_hrtimer_handler;
}
-static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
unsigned long flags;
- /* not for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return 0;
-
- raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
- if (!cpuctx->hrtimer_active) {
- cpuctx->hrtimer_active = 1;
- hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+ raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
+ if (!cpc->hrtimer_active) {
+ cpc->hrtimer_active = 1;
+ hrtimer_forward_now(timer, cpc->hrtimer_interval);
hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
- raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
+ raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
return 0;
}
+static int perf_mux_hrtimer_restart_ipi(void *arg)
+{
+ return perf_mux_hrtimer_restart(arg);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1146,32 +1138,9 @@ void perf_pmu_enable(struct pmu *pmu)
pmu->pmu_enable(pmu);
}
-static DEFINE_PER_CPU(struct list_head, active_ctx_list);
-
-/*
- * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
- * perf_event_task_tick() are fully serialized because they're strictly cpu
- * affine and perf_event_ctx{activate,deactivate} are called with IRQs
- * disabled, while perf_event_task_tick is called from IRQ context.
- */
-static void perf_event_ctx_activate(struct perf_event_context *ctx)
-{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
-
- lockdep_assert_irqs_disabled();
-
- WARN_ON(!list_empty(&ctx->active_ctx_list));
-
- list_add(&ctx->active_ctx_list, head);
-}
-
-static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+static void perf_assert_pmu_disabled(struct pmu *pmu)
{
- lockdep_assert_irqs_disabled();
-
- WARN_ON(list_empty(&ctx->active_ctx_list));
-
- list_del_init(&ctx->active_ctx_list);
+ WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
}
static void get_ctx(struct perf_event_context *ctx)
@@ -1198,7 +1167,6 @@ static void free_ctx(struct rcu_head *head)
struct perf_event_context *ctx;
ctx = container_of(head, struct perf_event_context, rcu_head);
- free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
kfree(ctx);
}
@@ -1383,7 +1351,7 @@ static u64 primary_event_id(struct perf_event *event)
* the context could get moved to another task.
*/
static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
struct perf_event_context *ctx;
@@ -1399,7 +1367,7 @@ retry:
*/
local_irq_save(*flags);
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (ctx) {
/*
* If this context is a clone of another, it might
@@ -1412,7 +1380,7 @@ retry:
* can't get swapped on us any more.
*/
raw_spin_lock(&ctx->lock);
- if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
+ if (ctx != rcu_dereference(task->perf_event_ctxp)) {
raw_spin_unlock(&ctx->lock);
rcu_read_unlock();
local_irq_restore(*flags);
@@ -1439,12 +1407,12 @@ retry:
* reference count so that the context can't get freed.
*/
static struct perf_event_context *
-perf_pin_task_context(struct task_struct *task, int ctxn)
+perf_pin_task_context(struct task_struct *task)
{
struct perf_event_context *ctx;
unsigned long flags;
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -1468,6 +1436,8 @@ static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
u64 now = perf_clock();
+ lockdep_assert_held(&ctx->lock);
+
if (adv)
ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
@@ -1590,14 +1560,22 @@ static inline struct cgroup *event_cgroup(const struct perf_event *event)
* which provides ordering when rotating groups for the same CPU.
*/
static __always_inline int
-perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
- const u64 left_group_index, const struct perf_event *right)
+perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
+ const struct cgroup *left_cgroup, const u64 left_group_index,
+ const struct perf_event *right)
{
if (left_cpu < right->cpu)
return -1;
if (left_cpu > right->cpu)
return 1;
+ if (left_pmu) {
+ if (left_pmu < right->pmu_ctx->pmu)
+ return -1;
+ if (left_pmu > right->pmu_ctx->pmu)
+ return 1;
+ }
+
#ifdef CONFIG_CGROUP_PERF
{
const struct cgroup *right_cgroup = event_cgroup(right);
@@ -1640,12 +1618,13 @@ perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
{
struct perf_event *e = __node_2_pe(a);
- return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
- __node_2_pe(b)) < 0;
+ return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
+ e->group_index, __node_2_pe(b)) < 0;
}
struct __group_key {
int cpu;
+ struct pmu *pmu;
struct cgroup *cgroup;
};
@@ -1654,14 +1633,25 @@ static inline int __group_cmp(const void *key, const struct rb_node *node)
const struct __group_key *a = key;
const struct perf_event *b = __node_2_pe(node);
- /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
- return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
+ /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
+}
+
+static inline int
+__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
+{
+ const struct __group_key *a = key;
+ const struct perf_event *b = __node_2_pe(node);
+
+ /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
+ b->group_index, b);
}
/*
- * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
- * key (see perf_event_groups_less). This places it last inside the CPU
- * subtree.
+ * Insert @event into @groups' tree; using
+ * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
+ * as key. This places it last inside the {cpu,pmu,cgroup} subtree.
*/
static void
perf_event_groups_insert(struct perf_event_groups *groups,
@@ -1711,14 +1701,15 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
}
/*
- * Get the leftmost event in the cpu/cgroup subtree.
+ * Get the leftmost event in the {cpu,pmu,cgroup} subtree.
*/
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
- struct cgroup *cgrp)
+ struct pmu *pmu, struct cgroup *cgrp)
{
struct __group_key key = {
.cpu = cpu,
+ .pmu = pmu,
.cgroup = cgrp,
};
struct rb_node *node;
@@ -1730,14 +1721,12 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu,
return NULL;
}
-/*
- * Like rb_entry_next_safe() for the @cpu subtree.
- */
static struct perf_event *
-perf_event_groups_next(struct perf_event *event)
+perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
{
struct __group_key key = {
.cpu = event->cpu,
+ .pmu = pmu,
.cgroup = event_cgroup(event),
};
struct rb_node *next;
@@ -1749,6 +1738,10 @@ perf_event_groups_next(struct perf_event *event)
return NULL;
}
+#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \
+ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \
+ event; event = perf_event_groups_next(event, pmu))
+
/*
* Iterate through the whole groups tree.
*/
@@ -1793,6 +1786,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
perf_cgroup_event_enable(event, ctx);
ctx->generation++;
+ event->pmu_ctx->nr_events++;
}
/*
@@ -1938,7 +1932,8 @@ static void perf_group_attach(struct perf_event *event)
lockdep_assert_held(&event->ctx->lock);
/*
- * We can have double attach due to group movement in perf_event_open.
+ * We can have double attach due to group movement (move_group) in
+ * perf_event_open().
*/
if (event->attach_state & PERF_ATTACH_GROUP)
return;
@@ -2003,6 +1998,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
}
ctx->generation++;
+ event->pmu_ctx->nr_events--;
}
static int
@@ -2019,13 +2015,11 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx);
static void perf_put_aux_event(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *iter;
/*
@@ -2054,7 +2048,7 @@ static void perf_put_aux_event(struct perf_event *event)
* state so that we don't try to schedule it again. Note
* that perf_event_enable() will clear the ERROR status.
*/
- event_sched_out(iter, cpuctx, ctx);
+ event_sched_out(iter, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
}
@@ -2105,8 +2099,8 @@ static int perf_get_aux_event(struct perf_event *event,
static inline struct list_head *get_event_list(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
+ return event->attr.pinned ? &event->pmu_ctx->pinned_active :
+ &event->pmu_ctx->flexible_active;
}
/*
@@ -2117,10 +2111,7 @@ static inline struct list_head *get_event_list(struct perf_event *event)
*/
static inline void perf_remove_sibling_event(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, event->ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
@@ -2209,47 +2200,22 @@ static bool is_orphaned_event(struct perf_event *event)
return event->state == PERF_EVENT_STATE_DEAD;
}
-static inline int __pmu_filter_match(struct perf_event *event)
-{
- struct pmu *pmu = event->pmu;
- return pmu->filter_match ? pmu->filter_match(event) : 1;
-}
-
-/*
- * Check whether we should attempt to schedule an event group based on
- * PMU-specific filtering. An event group can consist of HW and SW events,
- * potentially with a SW leader, so we must check all the filters, to
- * determine whether a group is schedulable:
- */
-static inline int pmu_filter_match(struct perf_event *event)
-{
- struct perf_event *sibling;
-
- if (!__pmu_filter_match(event))
- return 0;
-
- for_each_sibling_event(sibling, event) {
- if (!__pmu_filter_match(sibling))
- return 0;
- }
-
- return 1;
-}
-
static inline int
event_filter_match(struct perf_event *event)
{
return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
- perf_cgroup_match(event) && pmu_filter_match(event);
+ perf_cgroup_match(event);
}
static void
-event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
+ // XXX cpc serialization, probably per-cpu IRQ disabled
+
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
@@ -2268,50 +2234,61 @@ event_sched_out(struct perf_event *event,
event->pmu->del(event, 0);
event->oncpu = -1;
- if (READ_ONCE(event->pending_disable) >= 0) {
- WRITE_ONCE(event->pending_disable, -1);
+ if (event->pending_disable) {
+ event->pending_disable = 0;
perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
+
+ if (event->pending_sigtrap) {
+ bool dec = true;
+
+ event->pending_sigtrap = 0;
+ if (state != PERF_EVENT_STATE_OFF &&
+ !event->pending_work) {
+ event->pending_work = 1;
+ dec = false;
+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
+ task_work_add(current, &event->pending_task, TWA_RESUME);
+ }
+ if (dec)
+ local_dec(&event->ctx->nr_pending);
+ }
+
perf_event_set_state(event, state);
if (!is_software_event(event))
- cpuctx->active_oncpu--;
- if (!--ctx->nr_active)
- perf_event_ctx_deactivate(ctx);
+ cpc->active_oncpu--;
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq--;
- if (event->attr.exclusive || !cpuctx->active_oncpu)
- cpuctx->exclusive = 0;
+ if (event->attr.exclusive || !cpc->active_oncpu)
+ cpc->exclusive = 0;
perf_pmu_enable(event->pmu);
}
static void
-group_sched_out(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event;
if (group_event->state != PERF_EVENT_STATE_ACTIVE)
return;
- perf_pmu_disable(ctx->pmu);
+ perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
/*
* Schedule out siblings (if any):
*/
for_each_sibling_event(event, group_event)
- event_sched_out(event, cpuctx, ctx);
-
- perf_pmu_enable(ctx->pmu);
+ event_sched_out(event, ctx);
}
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
+#define DETACH_DEAD 0x04UL
/*
* Cross CPU call to remove a performance event
@@ -2325,6 +2302,7 @@ __perf_remove_from_context(struct perf_event *event,
struct perf_event_context *ctx,
void *info)
{
+ struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
if (ctx->is_active & EVENT_TIME) {
@@ -2332,19 +2310,38 @@ __perf_remove_from_context(struct perf_event *event,
update_cgrp_time_from_cpuctx(cpuctx, false);
}
- event_sched_out(event, cpuctx, ctx);
+ /*
+ * Ensure event_sched_out() switches to OFF, at the very least
+ * this avoids raising perf_pending_task() at this time.
+ */
+ if (flags & DETACH_DEAD)
+ event->pending_disable = 1;
+ event_sched_out(event, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
perf_child_detach(event);
list_del_event(event, ctx);
+ if (flags & DETACH_DEAD)
+ event->state = PERF_EVENT_STATE_DEAD;
+
+ if (!pmu_ctx->nr_events) {
+ pmu_ctx->rotate_necessary = 0;
+
+ if (ctx->task && ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+ }
if (!ctx->nr_events && ctx->is_active) {
if (ctx == &cpuctx->ctx)
update_cgrp_time_from_cpuctx(cpuctx, true);
ctx->is_active = 0;
- ctx->rotate_necessary = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
cpuctx->task_ctx = NULL;
@@ -2374,12 +2371,8 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
* event_function_call() user.
*/
raw_spin_lock_irq(&ctx->lock);
- /*
- * Cgroup events are per-cpu events, and must IPI because of
- * cgrp_cpuctx_list.
- */
- if (!ctx->is_active && !is_cgroup_event(event)) {
- __perf_remove_from_context(event, __get_cpu_context(ctx),
+ if (!ctx->is_active) {
+ __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
return;
@@ -2405,13 +2398,17 @@ static void __perf_event_disable(struct perf_event *event,
update_cgrp_time_from_event(event);
}
+ perf_pmu_disable(event->pmu_ctx->pmu);
+
if (event == event->group_leader)
- group_sched_out(event, cpuctx, ctx);
+ group_sched_out(event, ctx);
else
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
perf_cgroup_event_disable(event, ctx);
+
+ perf_pmu_enable(event->pmu_ctx->pmu);
}
/*
@@ -2424,7 +2421,7 @@ static void __perf_event_disable(struct perf_event *event,
* hold the top-level event's child_mutex, so any descendant that
* goes to exit will block in perf_event_exit_event().
*
- * When called from perf_pending_event it's OK because event->ctx
+ * When called from perf_pending_irq it's OK because event->ctx
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
*/
@@ -2463,9 +2460,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
void perf_event_disable_inatomic(struct perf_event *event)
{
- WRITE_ONCE(event->pending_disable, smp_processor_id());
- /* can fail, see perf_pending_event_disable() */
- irq_work_queue(&event->pending);
+ event->pending_disable = 1;
+ irq_work_queue(&event->pending_irq);
}
#define MAX_INTERRUPTS (~0ULL)
@@ -2474,10 +2470,10 @@ static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
static int
-event_sched_in(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
int ret = 0;
WARN_ON_ONCE(event->ctx != ctx);
@@ -2518,14 +2514,12 @@ event_sched_in(struct perf_event *event,
}
if (!is_software_event(event))
- cpuctx->active_oncpu++;
- if (!ctx->nr_active++)
- perf_event_ctx_activate(ctx);
+ cpc->active_oncpu++;
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq++;
if (event->attr.exclusive)
- cpuctx->exclusive = 1;
+ cpc->exclusive = 1;
out:
perf_pmu_enable(event->pmu);
@@ -2534,26 +2528,24 @@ out:
}
static int
-group_sched_in(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- struct pmu *pmu = ctx->pmu;
+ struct pmu *pmu = group_event->pmu_ctx->pmu;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
- if (event_sched_in(group_event, cpuctx, ctx))
+ if (event_sched_in(group_event, ctx))
goto error;
/*
* Schedule in siblings as one group (if any):
*/
for_each_sibling_event(event, group_event) {
- if (event_sched_in(event, cpuctx, ctx)) {
+ if (event_sched_in(event, ctx)) {
partial_group = event;
goto group_error;
}
@@ -2572,9 +2564,9 @@ group_error:
if (event == partial_group)
break;
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
}
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
error:
pmu->cancel_txn(pmu);
@@ -2584,10 +2576,11 @@ error:
/*
* Work out whether we can put this event group on the CPU now.
*/
-static int group_can_go_on(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- int can_add_hw)
+static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+
/*
* Groups consisting entirely of software events can always go on.
*/
@@ -2597,7 +2590,7 @@ static int group_can_go_on(struct perf_event *event,
* If an exclusive group is already on, no other hardware
* events can go on.
*/
- if (cpuctx->exclusive)
+ if (cpc->exclusive)
return 0;
/*
* If this group is exclusive and there are already
@@ -2619,36 +2612,29 @@ static void add_event_to_ctx(struct perf_event *event,
perf_group_attach(event);
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx,
- enum event_type_t event_type)
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
if (!cpuctx->task_ctx)
return;
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, cpuctx, event_type);
+ ctx_sched_out(ctx, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, EVENT_FLEXIBLE);
}
/*
@@ -2666,11 +2652,15 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
+/*
+ * XXX: ctx_resched() reschedule entire perf_event_context while adding new
+ * event to the context or enabling existing event in the context. We can
+ * probably optimize it by rescheduling only affected pmu_ctx.
+ */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
enum event_type_t event_type)
{
- enum event_type_t ctx_event_type;
bool cpu_event = !!(event_type & EVENT_CPU);
/*
@@ -2680,11 +2670,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
if (event_type & EVENT_PINNED)
event_type |= EVENT_FLEXIBLE;
- ctx_event_type = event_type & EVENT_ALL;
+ event_type &= EVENT_ALL;
- perf_pmu_disable(cpuctx->ctx.pmu);
- if (task_ctx)
- task_ctx_sched_out(cpuctx, task_ctx, event_type);
+ perf_ctx_disable(&cpuctx->ctx);
+ if (task_ctx) {
+ perf_ctx_disable(task_ctx);
+ task_ctx_sched_out(task_ctx, event_type);
+ }
/*
* Decide which cpu ctx groups to schedule out based on the types
@@ -2694,17 +2686,20 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
- cpu_ctx_sched_out(cpuctx, ctx_event_type);
- else if (ctx_event_type & EVENT_PINNED)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, event_type);
+ else if (event_type & EVENT_PINNED)
+ ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, task_ctx);
- perf_pmu_enable(cpuctx->ctx.pmu);
+
+ perf_ctx_enable(&cpuctx->ctx);
+ if (task_ctx)
+ perf_ctx_enable(task_ctx);
}
void perf_pmu_resched(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
@@ -2722,7 +2717,7 @@ static int __perf_install_in_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
bool reprogram = true;
int ret = 0;
@@ -2764,7 +2759,7 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx, get_event_type(event));
} else {
@@ -2797,7 +2792,7 @@ perf_install_in_context(struct perf_event_context *ctx,
WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
if (event->cpu != -1)
- event->cpu = cpu;
+ WARN_ON_ONCE(event->cpu != cpu);
/*
* Ensures that if we can observe event->ctx, both the event and ctx
@@ -2809,8 +2804,6 @@ perf_install_in_context(struct perf_event_context *ctx,
* perf_event_attr::disabled events will not run and can be initialized
* without IPI. Except when this is the first event for the context, in
* that case we need the magic of the IPI to set ctx->is_active.
- * Similarly, cgroup events for the context also needs the IPI to
- * manipulate the cgrp_cpuctx_list.
*
* The IOC_ENABLE that is sure to follow the creation of a disabled
* event will issue the IPI and reprogram the hardware.
@@ -2912,7 +2905,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (ctx->is_active)
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2921,7 +2914,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (!event_filter_match(event)) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_in(ctx, EVENT_TIME);
return;
}
@@ -2930,7 +2923,7 @@ static void __perf_event_enable(struct perf_event *event,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_in(ctx, EVENT_TIME);
return;
}
@@ -3199,11 +3192,52 @@ out:
return err;
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
+ struct perf_event_context *ctx = pmu_ctx->ctx;
struct perf_event *event, *tmp;
+ struct pmu *pmu = pmu_ctx->pmu;
+
+ if (ctx->task && !ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+
+ if (!event_type)
+ return;
+
+ perf_pmu_disable(pmu);
+ if (event_type & EVENT_PINNED) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->pinned_active,
+ active_list)
+ group_sched_out(event, ctx);
+ }
+
+ if (event_type & EVENT_FLEXIBLE) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->flexible_active,
+ active_list)
+ group_sched_out(event, ctx);
+ /*
+ * Since we cleared EVENT_FLEXIBLE, also clear
+ * rotate_necessary, is will be reset by
+ * ctx_flexible_sched_in() when needed.
+ */
+ pmu_ctx->rotate_necessary = 0;
+ }
+ perf_pmu_enable(pmu);
+}
+
+static void
+ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
lockdep_assert_held(&ctx->lock);
@@ -3251,27 +3285,8 @@ static void ctx_sched_out(struct perf_event_context *ctx,
is_active ^= ctx->is_active; /* changed bits */
- if (!ctx->nr_active || !(is_active & EVENT_ALL))
- return;
-
- perf_pmu_disable(ctx->pmu);
- if (is_active & EVENT_PINNED) {
- list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
- group_sched_out(event, cpuctx, ctx);
- }
-
- if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
- group_sched_out(event, cpuctx, ctx);
-
- /*
- * Since we cleared EVENT_FLEXIBLE, also clear
- * rotate_necessary, is will be reset by
- * ctx_flexible_sched_in() when needed.
- */
- ctx->rotate_necessary = 0;
- }
- perf_pmu_enable(ctx->pmu);
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ __pmu_ctx_sched_out(pmu_ctx, is_active);
}
/*
@@ -3376,26 +3391,68 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
- struct task_struct *next)
+#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \
+ for (pos1 = list_first_entry(head1, typeof(*pos1), member), \
+ pos2 = list_first_entry(head2, typeof(*pos2), member); \
+ !list_entry_is_head(pos1, head1, member) && \
+ !list_entry_is_head(pos2, head2, member); \
+ pos1 = list_next_entry(pos1, member), \
+ pos2 = list_next_entry(pos2, member))
+
+static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
+ struct perf_event_context *next_ctx)
+{
+ struct perf_event_pmu_context *prev_epc, *next_epc;
+
+ if (!prev_ctx->nr_task_data)
+ return;
+
+ double_list_for_each_entry(prev_epc, next_epc,
+ &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
+ pmu_ctx_entry) {
+
+ if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
+ continue;
+
+ /*
+ * PMU specific parts of task perf context can require
+ * additional synchronization. As an example of such
+ * synchronization see implementation details of Intel
+ * LBR call stack data profiling;
+ */
+ if (prev_epc->pmu->swap_task_ctx)
+ prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
+ else
+ swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
+ }
+}
+
+static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
{
- struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+ struct perf_event_pmu_context *pmu_ctx;
+ struct perf_cpu_pmu_context *cpc;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+
+ if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
+ pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
+ }
+}
+
+static void
+perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
+{
+ struct perf_event_context *ctx = task->perf_event_ctxp;
struct perf_event_context *next_ctx;
struct perf_event_context *parent, *next_parent;
- struct perf_cpu_context *cpuctx;
int do_switch = 1;
- struct pmu *pmu;
if (likely(!ctx))
return;
- pmu = ctx->pmu;
- cpuctx = __get_cpu_context(ctx);
- if (!cpuctx->task_ctx)
- return;
-
rcu_read_lock();
- next_ctx = next->perf_event_ctxp[ctxn];
+ next_ctx = rcu_dereference(next->perf_event_ctxp);
if (!next_ctx)
goto unlock;
@@ -3420,26 +3477,27 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- WRITE_ONCE(ctx->task, next);
- WRITE_ONCE(next_ctx->task, task);
+ perf_ctx_disable(ctx);
- perf_pmu_disable(pmu);
+ /* PMIs are disabled; ctx->nr_pending is stable. */
+ if (local_read(&ctx->nr_pending) ||
+ local_read(&next_ctx->nr_pending)) {
+ /*
+ * Must not swap out ctx when there's pending
+ * events that rely on the ctx->task relation.
+ */
+ raw_spin_unlock(&next_ctx->lock);
+ rcu_read_unlock();
+ goto inside_switch;
+ }
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(ctx, false);
+ WRITE_ONCE(ctx->task, next);
+ WRITE_ONCE(next_ctx->task, task);
- /*
- * PMU specific parts of task perf context can require
- * additional synchronization. As an example of such
- * synchronization see implementation details of Intel
- * LBR call stack data profiling;
- */
- if (pmu->swap_task_ctx)
- pmu->swap_task_ctx(ctx, next_ctx);
- else
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ perf_ctx_sched_task_cb(ctx, false);
+ perf_event_swap_task_ctx_data(ctx, next_ctx);
- perf_pmu_enable(pmu);
+ perf_ctx_enable(ctx);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3448,8 +3506,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
* since those values are always verified under
* ctx->lock which we're now holding.
*/
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
- RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
+ RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
do_switch = 0;
@@ -3463,37 +3521,40 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(ctx, false);
- task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+inside_switch:
+ perf_ctx_sched_task_cb(ctx, false);
+ task_ctx_sched_out(ctx, EVENT_ALL);
- perf_pmu_enable(pmu);
+ perf_ctx_enable(ctx);
raw_spin_unlock(&ctx->lock);
}
}
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
void perf_sched_cb_dec(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
this_cpu_dec(perf_sched_cb_usages);
+ barrier();
- if (!--cpuctx->sched_cb_usage)
- list_del(&cpuctx->sched_cb_entry);
+ if (!--cpc->sched_cb_usage)
+ list_del(&cpc->sched_cb_entry);
}
void perf_sched_cb_inc(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
- if (!cpuctx->sched_cb_usage++)
- list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ if (!cpc->sched_cb_usage++)
+ list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ barrier();
this_cpu_inc(perf_sched_cb_usages);
}
@@ -3505,19 +3566,21 @@ void perf_sched_cb_inc(struct pmu *pmu)
* PEBS requires this to provide PID/TID information. This requires we flush
* all queued PEBS records before we context switch to a new task.
*/
-static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu;
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+ pmu = cpc->epc.pmu;
+ /* software PMUs will not have sched_task */
if (WARN_ON_ONCE(!pmu->sched_task))
return;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(pmu);
- pmu->sched_task(cpuctx->task_ctx, sched_in);
+ pmu->sched_task(cpc->task_epc, sched_in);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -3527,26 +3590,20 @@ static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,
bool sched_in)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_cpu_pmu_context *cpc;
- if (prev == next)
+ /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
+ if (prev == next || cpuctx->task_ctx)
return;
- list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
- /* will be handled in perf_event_context_sched_in/out */
- if (cpuctx->task_ctx)
- continue;
-
- __perf_pmu_sched_task(cpuctx, sched_in);
- }
+ list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
+ __perf_pmu_sched_task(cpc, sched_in);
}
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
-#define for_each_task_context_nr(ctxn) \
- for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
-
/*
* Called from scheduler to remove the events of the current task,
* with interrupts disabled.
@@ -3561,16 +3618,13 @@ static void perf_event_switch(struct task_struct *task,
void __perf_event_task_sched_out(struct task_struct *task,
struct task_struct *next)
{
- int ctxn;
-
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
- for_each_task_context_nr(ctxn)
- perf_event_context_sched_out(task, ctxn, next);
+ perf_event_context_sched_out(task, next);
/*
* if cgroup events exist on this CPU, then we need
@@ -3581,15 +3635,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
perf_cgroup_switch(next);
}
-/*
- * Called with IRQs disabled
- */
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
-{
- ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
-}
-
static bool perf_less_group_idx(const void *l, const void *r)
{
const struct perf_event *le = *(const struct perf_event **)l;
@@ -3621,21 +3666,39 @@ static void __heap_add(struct min_heap *heap, struct perf_event *event)
}
}
-static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
+static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
+{
+ struct perf_cpu_pmu_context *cpc;
+
+ if (!pmu_ctx->ctx->task)
+ return;
+
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = pmu_ctx;
+}
+
+static noinline int visit_groups_merge(struct perf_event_context *ctx,
struct perf_event_groups *groups, int cpu,
+ struct pmu *pmu,
int (*func)(struct perf_event *, void *),
void *data)
{
#ifdef CONFIG_CGROUP_PERF
struct cgroup_subsys_state *css = NULL;
#endif
+ struct perf_cpu_context *cpuctx = NULL;
/* Space for per CPU and/or any CPU event iterators. */
struct perf_event *itrs[2];
struct min_heap event_heap;
struct perf_event **evt;
int ret;
- if (cpuctx) {
+ if (pmu->filter && pmu->filter(pmu, cpu))
+ return 0;
+
+ if (!ctx->task) {
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
event_heap = (struct min_heap){
.data = cpuctx->heap,
.nr = 0,
@@ -3655,17 +3718,22 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
.size = ARRAY_SIZE(itrs),
};
/* Events not within a CPU context may be on any CPU. */
- __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
}
evt = event_heap.data;
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
#ifdef CONFIG_CGROUP_PERF
for (; css; css = css->parent)
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
#endif
+ if (event_heap.nr) {
+ __link_epc((*evt)->pmu_ctx);
+ perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
+ }
+
min_heapify_all(&event_heap, &perf_min_heap);
while (event_heap.nr) {
@@ -3673,7 +3741,7 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
if (ret)
return ret;
- *evt = perf_event_groups_next(*evt);
+ *evt = perf_event_groups_next(*evt, pmu);
if (*evt)
min_heapify(&event_heap, 0, &perf_min_heap);
else
@@ -3715,7 +3783,6 @@ static inline void group_update_userpage(struct perf_event *group_event)
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int *can_add_hw = data;
if (event->state <= PERF_EVENT_STATE_OFF)
@@ -3724,8 +3791,8 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (!event_filter_match(event))
return 0;
- if (group_can_go_on(event, cpuctx, *can_add_hw)) {
- if (!group_sched_in(event, cpuctx, ctx))
+ if (group_can_go_on(event, *can_add_hw)) {
+ if (!group_sched_in(event, ctx))
list_add_tail(&event->active_list, get_event_list(event));
}
@@ -3735,8 +3802,11 @@ static int merge_sched_in(struct perf_event *event, void *data)
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
} else {
- ctx->rotate_necessary = 1;
- perf_mux_hrtimer_restart(cpuctx);
+ struct perf_cpu_pmu_context *cpc;
+
+ event->pmu_ctx->rotate_necessary = 1;
+ cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
+ perf_mux_hrtimer_restart(cpc);
group_update_userpage(event);
}
}
@@ -3744,39 +3814,53 @@ static int merge_sched_in(struct perf_event *event, void *data)
return 0;
}
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
{
+ struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
-
- visit_groups_merge(cpuctx, &ctx->pinned_groups,
- smp_processor_id(),
- merge_sched_in, &can_add_hw);
+ if (pmu) {
+ visit_groups_merge(ctx, &ctx->pinned_groups,
+ smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
+ } else {
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ can_add_hw = 1;
+ visit_groups_merge(ctx, &ctx->pinned_groups,
+ smp_processor_id(), pmu_ctx->pmu,
+ merge_sched_in, &can_add_hw);
+ }
+ }
}
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
{
+ struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
+ if (pmu) {
+ visit_groups_merge(ctx, &ctx->flexible_groups,
+ smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
+ } else {
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ can_add_hw = 1;
+ visit_groups_merge(ctx, &ctx->flexible_groups,
+ smp_processor_id(), pmu_ctx->pmu,
+ merge_sched_in, &can_add_hw);
+ }
+ }
+}
- visit_groups_merge(cpuctx, &ctx->flexible_groups,
- smp_processor_id(),
- merge_sched_in, &can_add_hw);
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+{
+ ctx_flexible_sched_in(ctx, pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
int is_active = ctx->is_active;
lockdep_assert_held(&ctx->lock);
@@ -3810,39 +3894,32 @@ ctx_sched_in(struct perf_event_context *ctx,
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, cpuctx);
+ ctx_pinned_sched_in(ctx, NULL);
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, cpuctx);
+ ctx_flexible_sched_in(ctx, NULL);
}
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static void perf_event_context_sched_in(struct task_struct *task)
{
- struct perf_event_context *ctx = &cpuctx->ctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
- ctx_sched_in(ctx, cpuctx, event_type);
-}
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp);
+ if (!ctx)
+ goto rcu_unlock;
-static void perf_event_context_sched_in(struct perf_event_context *ctx,
- struct task_struct *task)
-{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
+ if (cpuctx->task_ctx == ctx) {
+ perf_ctx_lock(cpuctx, ctx);
+ perf_ctx_disable(ctx);
- cpuctx = __get_cpu_context(ctx);
+ perf_ctx_sched_task_cb(ctx, true);
- /*
- * HACK: for HETEROGENEOUS the task context might have switched to a
- * different PMU, force (re)set the context,
- */
- pmu = ctx->pmu = cpuctx->ctx.pmu;
-
- if (cpuctx->task_ctx == ctx) {
- if (cpuctx->sched_cb_usage)
- __perf_pmu_sched_task(cpuctx, true);
- return;
+ perf_ctx_enable(ctx);
+ perf_ctx_unlock(cpuctx, ctx);
+ goto rcu_unlock;
}
perf_ctx_lock(cpuctx, ctx);
@@ -3853,7 +3930,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
if (!ctx->nr_events)
goto unlock;
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3862,17 +3939,24 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* However, if task's ctx is not carrying any pinned
* events, no need to flip the cpuctx's events around.
*/
- if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
+ perf_ctx_disable(&cpuctx->ctx);
+ ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ }
+
perf_event_sched_in(cpuctx, ctx);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(cpuctx->task_ctx, true);
+ perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
- perf_pmu_enable(pmu);
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
+ perf_ctx_enable(&cpuctx->ctx);
+
+ perf_ctx_enable(ctx);
unlock:
perf_ctx_unlock(cpuctx, ctx);
+rcu_unlock:
+ rcu_read_unlock();
}
/*
@@ -3889,16 +3973,7 @@ unlock:
void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task)
{
- struct perf_event_context *ctx;
- int ctxn;
-
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (likely(!ctx))
- continue;
-
- perf_event_context_sched_in(ctx, task);
- }
+ perf_event_context_sched_in(task);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
@@ -4017,8 +4092,8 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
* events. At the same time, make sure, having freq events does not change
* the rate of unthrottling as that would introduce bias.
*/
-static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
- int needs_unthr)
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
{
struct perf_event *event;
struct hw_perf_event *hwc;
@@ -4030,16 +4105,16 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
* - context have events in frequency mode (needs freq adjust)
* - there are events to unthrottle on this cpu
*/
- if (!(ctx->nr_freq || needs_unthr))
+ if (!(ctx->nr_freq || unthrottle))
return;
raw_spin_lock(&ctx->lock);
- perf_pmu_disable(ctx->pmu);
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
+ // XXX use visit thingy to avoid the -1,cpu match
if (!event_filter_match(event))
continue;
@@ -4080,7 +4155,6 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
perf_pmu_enable(event->pmu);
}
- perf_pmu_enable(ctx->pmu);
raw_spin_unlock(&ctx->lock);
}
@@ -4102,72 +4176,109 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_event_to_rotate(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
{
struct perf_event *event;
+ struct rb_node *node;
+ struct rb_root *tree;
+ struct __group_key key = {
+ .pmu = pmu_ctx->pmu,
+ };
/* pick the first active flexible event */
- event = list_first_entry_or_null(&ctx->flexible_active,
+ event = list_first_entry_or_null(&pmu_ctx->flexible_active,
struct perf_event, active_list);
+ if (event)
+ goto out;
/* if no active flexible event, pick the first event */
- if (!event) {
- event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
- typeof(*event), group_node);
+ tree = &pmu_ctx->ctx->flexible_groups.tree;
+
+ if (!pmu_ctx->ctx->task) {
+ key.cpu = smp_processor_id();
+
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+ goto out;
}
+ key.cpu = -1;
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node) {
+ event = __node_2_pe(node);
+ goto out;
+ }
+
+ key.cpu = smp_processor_id();
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+
+out:
/*
* Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
* finds there are unschedulable events, it will set it again.
*/
- ctx->rotate_necessary = 0;
+ pmu_ctx->rotate_necessary = 0;
return event;
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
struct perf_event *cpu_event = NULL, *task_event = NULL;
- struct perf_event_context *task_ctx = NULL;
int cpu_rotate, task_rotate;
+ struct pmu *pmu;
/*
* Since we run this from IRQ context, nobody can install new
* events, thus the event count values are stable.
*/
- cpu_rotate = cpuctx->ctx.rotate_necessary;
- task_ctx = cpuctx->task_ctx;
- task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
+ cpu_epc = &cpc->epc;
+ pmu = cpu_epc->pmu;
+ task_epc = cpc->task_epc;
+
+ cpu_rotate = cpu_epc->rotate_necessary;
+ task_rotate = task_epc ? task_epc->rotate_necessary : 0;
if (!(cpu_rotate || task_rotate))
return false;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ perf_pmu_disable(pmu);
if (task_rotate)
- task_event = ctx_event_to_rotate(task_ctx);
+ task_event = ctx_event_to_rotate(task_epc);
if (cpu_rotate)
- cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(cpu_epc);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
* and then, if needed CPU flexible.
*/
- if (task_event || (task_ctx && cpu_event))
- ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
- if (cpu_event)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (task_event || (task_epc && cpu_event)) {
+ update_context_time(task_epc->ctx);
+ __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
+ }
- if (task_event)
- rotate_ctx(task_ctx, task_event);
- if (cpu_event)
+ if (cpu_event) {
+ update_context_time(&cpuctx->ctx);
+ __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
+ __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ }
- perf_event_sched_in(cpuctx, task_ctx);
+ if (task_event)
+ rotate_ctx(task_epc->ctx, task_event);
+
+ if (task_event || (task_epc && cpu_event))
+ __pmu_ctx_sched_in(task_epc->ctx, pmu);
- perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
return true;
@@ -4175,8 +4286,8 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
void perf_event_task_tick(void)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
- struct perf_event_context *ctx, *tmp;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
int throttled;
lockdep_assert_irqs_disabled();
@@ -4185,8 +4296,13 @@ void perf_event_task_tick(void)
throttled = __this_cpu_xchg(perf_throttled_count, 0);
tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
- perf_adjust_freq_unthr_context(ctx, throttled);
+ perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
+
+ rcu_read_lock();
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_adjust_freq_unthr_context(ctx, !!throttled);
+ rcu_read_unlock();
}
static int event_enable_on_exec(struct perf_event *event,
@@ -4208,9 +4324,9 @@ static int event_enable_on_exec(struct perf_event *event,
* Enable all of a task's events that have been marked enable-on-exec.
* This expects task == current.
*/
-static void perf_event_enable_on_exec(int ctxn)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event_context *clone_ctx = NULL;
enum event_type_t event_type = 0;
struct perf_cpu_context *cpuctx;
struct perf_event *event;
@@ -4218,13 +4334,16 @@ static void perf_event_enable_on_exec(int ctxn)
int enabled = 0;
local_irq_save(flags);
- ctx = current->perf_event_ctxp[ctxn];
- if (!ctx || !ctx->nr_events)
+ if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
goto out;
- cpuctx = __get_cpu_context(ctx);
+ if (!ctx->nr_events)
+ goto out;
+
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
+
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
event_type |= get_event_type(event);
@@ -4237,7 +4356,7 @@ static void perf_event_enable_on_exec(int ctxn)
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
} else {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_in(ctx, EVENT_TIME);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -4256,17 +4375,13 @@ static void perf_event_exit_event(struct perf_event *event,
* Removes all events from the current task that have been marked
* remove-on-exec, and feeds their values back to parent events.
*/
-static void perf_event_remove_on_exec(int ctxn)
+static void perf_event_remove_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event_context *clone_ctx = NULL;
struct perf_event *event, *next;
unsigned long flags;
bool modified = false;
- ctx = perf_pin_task_context(current, ctxn);
- if (!ctx)
- return;
-
mutex_lock(&ctx->mutex);
if (WARN_ON_ONCE(ctx->task != current))
@@ -4287,13 +4402,11 @@ static void perf_event_remove_on_exec(int ctxn)
raw_spin_lock_irqsave(&ctx->lock, flags);
if (modified)
clone_ctx = unclone_ctx(ctx);
- --ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
unlock:
mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
if (clone_ctx)
put_ctx(clone_ctx);
}
@@ -4329,7 +4442,7 @@ static void __perf_event_read(void *info)
struct perf_read_data *data = info;
struct perf_event *sub, *event = data->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu = event->pmu;
/*
@@ -4555,17 +4668,25 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
{
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
- INIT_LIST_HEAD(&ctx->active_ctx_list);
+ INIT_LIST_HEAD(&ctx->pmu_ctx_list);
perf_event_groups_init(&ctx->pinned_groups);
perf_event_groups_init(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
- INIT_LIST_HEAD(&ctx->pinned_active);
- INIT_LIST_HEAD(&ctx->flexible_active);
refcount_set(&ctx->refcount, 1);
}
+static void
+__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
+{
+ epc->pmu = pmu;
+ INIT_LIST_HEAD(&epc->pmu_ctx_entry);
+ INIT_LIST_HEAD(&epc->pinned_active);
+ INIT_LIST_HEAD(&epc->flexible_active);
+ atomic_set(&epc->refcount, 1);
+}
+
static struct perf_event_context *
-alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+alloc_perf_context(struct task_struct *task)
{
struct perf_event_context *ctx;
@@ -4576,7 +4697,6 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
__perf_event_init_context(ctx);
if (task)
ctx->task = get_task_struct(task);
- ctx->pmu = pmu;
return ctx;
}
@@ -4605,15 +4725,12 @@ find_lively_task_by_vpid(pid_t vpid)
* Returns a matching context with refcount and pincount.
*/
static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task,
- struct perf_event *event)
+find_get_context(struct task_struct *task, struct perf_event *event)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
- void *task_ctx_data = NULL;
unsigned long flags;
- int ctxn, err;
- int cpu = event->cpu;
+ int err;
if (!task) {
/* Must be root to operate on a CPU event: */
@@ -4621,7 +4738,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (err)
return ERR_PTR(err);
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -4632,43 +4749,22 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
}
err = -EINVAL;
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto errout;
-
- if (event->attach_state & PERF_ATTACH_TASK_DATA) {
- task_ctx_data = alloc_task_ctx_data(pmu);
- if (!task_ctx_data) {
- err = -ENOMEM;
- goto errout;
- }
- }
-
retry:
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
clone_ctx = unclone_ctx(ctx);
++ctx->pin_count;
- if (task_ctx_data && !ctx->task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
raw_spin_unlock_irqrestore(&ctx->lock, flags);
if (clone_ctx)
put_ctx(clone_ctx);
} else {
- ctx = alloc_perf_context(pmu, task);
+ ctx = alloc_perf_context(task);
err = -ENOMEM;
if (!ctx)
goto errout;
- if (task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
-
err = 0;
mutex_lock(&task->perf_event_mutex);
/*
@@ -4677,12 +4773,12 @@ retry:
*/
if (task->flags & PF_EXITING)
err = -ESRCH;
- else if (task->perf_event_ctxp[ctxn])
+ else if (task->perf_event_ctxp)
err = -EAGAIN;
else {
get_ctx(ctx);
++ctx->pin_count;
- rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ rcu_assign_pointer(task->perf_event_ctxp, ctx);
}
mutex_unlock(&task->perf_event_mutex);
@@ -4695,21 +4791,146 @@ retry:
}
}
- free_task_ctx_data(pmu, task_ctx_data);
return ctx;
errout:
- free_task_ctx_data(pmu, task_ctx_data);
return ERR_PTR(err);
}
+static struct perf_event_pmu_context *
+find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
+ struct perf_event *event)
+{
+ struct perf_event_pmu_context *new = NULL, *epc;
+ void *task_ctx_data = NULL;
+
+ if (!ctx->task) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+ epc = &cpc->epc;
+
+ if (!epc->ctx) {
+ atomic_set(&epc->refcount, 1);
+ epc->embedded = 1;
+ raw_spin_lock_irq(&ctx->lock);
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+ raw_spin_unlock_irq(&ctx->lock);
+ } else {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ }
+
+ return epc;
+ }
+
+ new = kzalloc(sizeof(*epc), GFP_KERNEL);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ task_ctx_data = alloc_task_ctx_data(pmu);
+ if (!task_ctx_data) {
+ kfree(new);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ __perf_init_event_pmu_context(new, pmu);
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because perf_event_init_task() doesn't actually hold the
+ * child_ctx->mutex.
+ */
+
+ raw_spin_lock_irq(&ctx->lock);
+ list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (epc->pmu == pmu) {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ goto found_epc;
+ }
+ }
+
+ epc = new;
+ new = NULL;
+
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+
+found_epc:
+ if (task_ctx_data && !epc->task_ctx_data) {
+ epc->task_ctx_data = task_ctx_data;
+ task_ctx_data = NULL;
+ ctx->nr_task_data++;
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+
+ free_task_ctx_data(pmu, task_ctx_data);
+ kfree(new);
+
+ return epc;
+}
+
+static void get_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
+}
+
+static void free_epc_rcu(struct rcu_head *head)
+{
+ struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
+
+ kfree(epc->task_ctx_data);
+ kfree(epc);
+}
+
+static void put_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ unsigned long flags;
+
+ if (!atomic_dec_and_test(&epc->refcount))
+ return;
+
+ if (epc->ctx) {
+ struct perf_event_context *ctx = epc->ctx;
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because of the call-site in _free_event()/put_event()
+ * which isn't always called under ctx->mutex.
+ */
+
+ WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+ list_del_init(&epc->pmu_ctx_entry);
+ epc->ctx = NULL;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+
+ WARN_ON_ONCE(!list_empty(&epc->pinned_active));
+ WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+
+ if (epc->embedded)
+ return;
+
+ call_rcu(&epc->rcu_head, free_epc_rcu);
+}
+
static void perf_event_free_filter(struct perf_event *event);
static void free_event_rcu(struct rcu_head *head)
{
- struct perf_event *event;
+ struct perf_event *event = container_of(head, typeof(*event), rcu_head);
- event = container_of(head, struct perf_event, rcu_head);
if (event->ns)
put_pid_ns(event->ns);
perf_event_free_filter(event);
@@ -4847,7 +5068,7 @@ static void perf_sched_delayed(struct work_struct *work)
*
* 1) cpu-wide events in the presence of per-task events,
* 2) per-task events in the presence of cpu-wide events,
- * 3) two matching events on the same context.
+ * 3) two matching events on the same perf_event_context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
* _free_event()), the latter -- before the first perf_install_in_context().
@@ -4931,7 +5152,7 @@ static void perf_addr_filters_splice(struct perf_event *event,
static void _free_event(struct perf_event *event)
{
- irq_work_sync(&event->pending);
+ irq_work_sync(&event->pending_irq);
unaccount_event(event);
@@ -4971,6 +5192,9 @@ static void _free_event(struct perf_event *event)
if (event->hw.target)
put_task_struct(event->hw.target);
+ if (event->pmu_ctx)
+ put_pmu_ctx(event->pmu_ctx);
+
/*
* perf_event_free_task() relies on put_ctx() being 'last', in particular
* all task references must be cleaned up.
@@ -5071,8 +5295,8 @@ int perf_event_release_kernel(struct perf_event *event)
LIST_HEAD(free_list);
/*
- * If we got here through err_file: fput(event_file); we will not have
- * attached to a context yet.
+ * If we got here through err_alloc: free_event(event); we will not
+ * have attached to a context yet.
*/
if (!ctx) {
WARN_ON_ONCE(event->attach_state &
@@ -5085,9 +5309,7 @@ int perf_event_release_kernel(struct perf_event *event)
ctx = perf_event_ctx_lock(event);
WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, DETACH_GROUP);
- raw_spin_lock_irq(&ctx->lock);
/*
* Mark this event as STATE_DEAD, there is no external reference to it
* anymore.
@@ -5099,8 +5321,7 @@ int perf_event_release_kernel(struct perf_event *event)
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- event->state = PERF_EVENT_STATE_DEAD;
- raw_spin_unlock_irq(&ctx->lock);
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
perf_event_ctx_unlock(event, ctx);
@@ -5507,7 +5728,7 @@ static void __perf_event_period(struct perf_event *event,
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
- perf_pmu_disable(ctx->pmu);
+ perf_pmu_disable(event->pmu);
/*
* We could be throttled; unthrottle now to avoid the tick
* trying to unthrottle while we already re-started the event.
@@ -5523,7 +5744,7 @@ static void __perf_event_period(struct perf_event *event,
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
- perf_pmu_enable(ctx->pmu);
+ perf_pmu_enable(event->pmu);
}
}
@@ -6431,7 +6652,8 @@ static void perf_sigtrap(struct perf_event *event)
return;
/*
- * perf_pending_event() can race with the task exiting.
+ * Both perf_pending_task() and perf_pending_irq() can race with the
+ * task exiting.
*/
if (current->flags & PF_EXITING)
return;
@@ -6440,23 +6662,33 @@ static void perf_sigtrap(struct perf_event *event)
event->attr.type, event->attr.sig_data);
}
-static void perf_pending_event_disable(struct perf_event *event)
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_irq(struct perf_event *event)
{
- int cpu = READ_ONCE(event->pending_disable);
+ int cpu = READ_ONCE(event->oncpu);
+ /*
+ * If the event isn't running; we done. event_sched_out() will have
+ * taken care of things.
+ */
if (cpu < 0)
return;
+ /*
+ * Yay, we hit home and are in the context of the event.
+ */
if (cpu == smp_processor_id()) {
- WRITE_ONCE(event->pending_disable, -1);
-
- if (event->attr.sigtrap) {
+ if (event->pending_sigtrap) {
+ event->pending_sigtrap = 0;
perf_sigtrap(event);
- atomic_set_release(&event->event_limit, 1); /* rearm event */
- return;
+ local_dec(&event->ctx->nr_pending);
+ }
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ perf_event_disable_local(event);
}
-
- perf_event_disable_local(event);
return;
}
@@ -6476,35 +6708,64 @@ static void perf_pending_event_disable(struct perf_event *event)
* irq_work_queue(); // FAILS
*
* irq_work_run()
- * perf_pending_event()
+ * perf_pending_irq()
*
* But the event runs on CPU-B and wants disabling there.
*/
- irq_work_queue_on(&event->pending, cpu);
+ irq_work_queue_on(&event->pending_irq, cpu);
}
-static void perf_pending_event(struct irq_work *entry)
+static void perf_pending_irq(struct irq_work *entry)
{
- struct perf_event *event = container_of(entry, struct perf_event, pending);
+ struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
int rctx;
- rctx = perf_swevent_get_recursion_context();
/*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
+ rctx = perf_swevent_get_recursion_context();
- perf_pending_event_disable(event);
-
+ /*
+ * The wakeup isn't bound to the context of the event -- it can happen
+ * irrespective of where the event is.
+ */
if (event->pending_wakeup) {
event->pending_wakeup = 0;
perf_event_wakeup(event);
}
+ __perf_pending_irq(event);
+
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
}
+static void perf_pending_task(struct callback_head *head)
+{
+ struct perf_event *event = container_of(head, struct perf_event, pending_task);
+ int rctx;
+
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
+ preempt_disable_notrace();
+ rctx = perf_swevent_get_recursion_context();
+
+ if (event->pending_work) {
+ event->pending_work = 0;
+ perf_sigtrap(event);
+ local_dec(&event->ctx->nr_pending);
+ }
+
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
+ preempt_enable_notrace();
+
+ put_event(event);
+}
+
#ifdef CONFIG_GUEST_PERF_EVENTS
struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
@@ -6794,11 +7055,10 @@ out_put:
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
- struct perf_event *event)
+ struct perf_event *event,
+ u64 sample_type)
{
- u64 sample_type = event->attr.sample_type;
-
- data->type = sample_type;
+ data->type = event->attr.sample_type;
header->size += event->id_header_size;
if (sample_type & PERF_SAMPLE_TID) {
@@ -6827,7 +7087,7 @@ void perf_event_header__init_id(struct perf_event_header *header,
struct perf_event *event)
{
if (event->attr.sample_id_all)
- __perf_event_header__init_id(header, data, event);
+ __perf_event_header__init_id(header, data, event, event->attr.sample_type);
}
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
@@ -6893,9 +7153,16 @@ static void perf_output_read_group(struct perf_output_handle *handle,
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
+ unsigned long flags;
u64 values[6];
int n = 0;
+ /*
+ * Disabling interrupts avoids all counter scheduling
+ * (context switches, timer based rotation and IPIs).
+ */
+ local_irq_save(flags);
+
values[n++] = 1 + leader->nr_siblings;
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
@@ -6931,6 +7198,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
__output_copy(handle, values, n * sizeof(u64));
}
+
+ local_irq_restore(flags);
}
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
@@ -6967,11 +7236,6 @@ static void perf_output_read(struct perf_output_handle *handle,
perf_output_read_one(handle, event, enabled, running);
}
-static inline bool perf_sample_save_hw_index(struct perf_event *event)
-{
- return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
-}
-
void perf_output_sample(struct perf_output_handle *handle,
struct perf_event_header *header,
struct perf_sample_data *data,
@@ -7053,14 +7317,14 @@ void perf_output_sample(struct perf_output_handle *handle,
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
- if (data->br_stack) {
+ if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) {
size_t size;
size = data->br_stack->nr
* sizeof(struct perf_branch_entry);
perf_output_put(handle, data->br_stack->nr);
- if (perf_sample_save_hw_index(event))
+ if (branch_sample_hw_index(event))
perf_output_put(handle, data->br_stack->hw_idx);
perf_output_copy(handle, data->br_stack->entries, size);
} else {
@@ -7229,7 +7493,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
return pud_leaf_size(pud);
pmdp = pmd_offset_lockless(pudp, pud, addr);
- pmd = READ_ONCE(*pmdp);
+ pmd = pmdp_get_lockless(pmdp);
if (!pmd_present(pmd))
return 0;
@@ -7303,6 +7567,7 @@ void perf_prepare_sample(struct perf_event_header *header,
struct pt_regs *regs)
{
u64 sample_type = event->attr.sample_type;
+ u64 filtered_sample_type;
header->type = PERF_RECORD_SAMPLE;
header->size = sizeof(*header) + event->header_size;
@@ -7310,7 +7575,12 @@ void perf_prepare_sample(struct perf_event_header *header,
header->misc = 0;
header->misc |= perf_misc_flags(regs);
- __perf_event_header__init_id(header, data, event);
+ /*
+ * Clear the sample flags that have already been done by the
+ * PMU driver.
+ */
+ filtered_sample_type = sample_type & ~data->sample_flags;
+ __perf_event_header__init_id(header, data, event, filtered_sample_type);
if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
data->ip = perf_instruction_pointer(regs);
@@ -7318,7 +7588,7 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
- if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
data->callchain = perf_callchain(event, regs);
size += data->callchain->nr;
@@ -7330,7 +7600,7 @@ void perf_prepare_sample(struct perf_event_header *header,
struct perf_raw_record *raw = data->raw;
int size;
- if (raw) {
+ if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) {
struct perf_raw_frag *frag = &raw->frag;
u32 sum = 0;
@@ -7346,6 +7616,7 @@ void perf_prepare_sample(struct perf_event_header *header,
frag->pad = raw->size - sum;
} else {
size = sizeof(u64);
+ data->raw = NULL;
}
header->size += size;
@@ -7353,8 +7624,8 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
int size = sizeof(u64); /* nr */
- if (data->br_stack) {
- if (perf_sample_save_hw_index(event))
+ if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) {
+ if (branch_sample_hw_index(event))
size += sizeof(u64);
size += data->br_stack->nr
@@ -7403,6 +7674,20 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
+ if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ data->weight.full = 0;
+
+ if (filtered_sample_type & PERF_SAMPLE_DATA_SRC)
+ data->data_src.val = PERF_MEM_NA;
+
+ if (filtered_sample_type & PERF_SAMPLE_TRANSACTION)
+ data->txn = 0;
+
+ if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) {
+ if (filtered_sample_type & PERF_SAMPLE_ADDR)
+ data->addr = 0;
+ }
+
if (sample_type & PERF_SAMPLE_REGS_INTR) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7418,7 +7703,8 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
- if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR &&
+ filtered_sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
#ifdef CONFIG_CGROUP_PERF
@@ -7621,7 +7907,6 @@ perf_iterate_sb(perf_iterate_f output, void *data,
struct perf_event_context *task_ctx)
{
struct perf_event_context *ctx;
- int ctxn;
rcu_read_lock();
preempt_disable();
@@ -7638,11 +7923,9 @@ perf_iterate_sb(perf_iterate_f output, void *data,
perf_iterate_sb_cpu(output, data);
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx)
- perf_iterate_ctx(ctx, output, data, false);
- }
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_iterate_ctx(ctx, output, data, false);
done:
preempt_enable();
rcu_read_unlock();
@@ -7684,20 +7967,17 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
void perf_event_exec(void)
{
struct perf_event_context *ctx;
- int ctxn;
- for_each_task_context_nr(ctxn) {
- perf_event_enable_on_exec(ctxn);
- perf_event_remove_on_exec(ctxn);
+ ctx = perf_pin_task_context(current);
+ if (!ctx)
+ return;
- rcu_read_lock();
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx) {
- perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
- NULL, true);
- }
- rcu_read_unlock();
- }
+ perf_event_enable_on_exec(ctx);
+ perf_event_remove_on_exec(ctx);
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
+
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
}
struct remote_output {
@@ -7737,8 +8017,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
static int __perf_pmu_output_stop(void *info)
{
struct perf_event *event = info;
- struct pmu *pmu = event->ctx->pmu;
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct remote_output ro = {
.rb = event->rb,
};
@@ -8527,7 +8806,6 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
{
struct perf_event_context *ctx;
- int ctxn;
/*
* Data tracing isn't supported yet and as such there is no need
@@ -8537,13 +8815,9 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
return;
rcu_read_lock();
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (!ctx)
- continue;
-
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
- }
rcu_read_unlock();
}
@@ -8931,7 +9205,7 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
PERF_RECORD_KSYMBOL_TYPE_BPF,
(u64)(unsigned long)subprog->bpf_func,
subprog->jited_len, unregister,
- prog->aux->ksym.name);
+ subprog->aux->ksym.name);
}
}
}
@@ -9174,13 +9448,26 @@ int perf_event_account_interrupt(struct perf_event *event)
return __perf_event_account_interrupt(event, 1);
}
+static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
+{
+ /*
+ * Due to interrupt latency (AKA "skid"), we may enter the
+ * kernel before taking an overflow, even if the PMU is only
+ * counting user events.
+ */
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return false;
+
+ return true;
+}
+
/*
* Generic event overflow handling, sampling.
*/
static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
{
int events = atomic_read(&event->event_limit);
int ret = 0;
@@ -9203,24 +9490,59 @@ static int __perf_event_overflow(struct perf_event *event,
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
- event->pending_addr = data->addr;
-
perf_event_disable_inatomic(event);
}
+ if (event->attr.sigtrap) {
+ /*
+ * The desired behaviour of sigtrap vs invalid samples is a bit
+ * tricky; on the one hand, one should not loose the SIGTRAP if
+ * it is the first event, on the other hand, we should also not
+ * trigger the WARN or override the data address.
+ */
+ bool valid_sample = sample_is_allowed(event, regs);
+ unsigned int pending_id = 1;
+
+ if (regs)
+ pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
+ if (!event->pending_sigtrap) {
+ event->pending_sigtrap = pending_id;
+ local_inc(&event->ctx->nr_pending);
+ } else if (event->attr.exclude_kernel && valid_sample) {
+ /*
+ * Should not be able to return to user space without
+ * consuming pending_sigtrap; with exceptions:
+ *
+ * 1. Where !exclude_kernel, events can overflow again
+ * in the kernel without returning to user space.
+ *
+ * 2. Events that can overflow again before the IRQ-
+ * work without user space progress (e.g. hrtimer).
+ * To approximate progress (with false negatives),
+ * check 32-bit hash of the current IP.
+ */
+ WARN_ON_ONCE(event->pending_sigtrap != pending_id);
+ }
+
+ event->pending_addr = 0;
+ if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
+ event->pending_addr = data->addr;
+ irq_work_queue(&event->pending_irq);
+ }
+
READ_ONCE(event->overflow_handler)(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
- irq_work_queue(&event->pending);
+ irq_work_queue(&event->pending_irq);
}
return ret;
}
int perf_event_overflow(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
return __perf_event_overflow(event, 1, data, regs);
}
@@ -9670,6 +9992,44 @@ static struct pmu perf_swevent = {
#ifdef CONFIG_EVENT_TRACING
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+ perf_trace_destroy(event);
+}
+
+static int perf_tp_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ err = perf_trace_init(event);
+ if (err)
+ return err;
+
+ event->destroy = tp_perf_event_destroy;
+
+ return 0;
+}
+
+static struct pmu perf_tracepoint = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_tp_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
@@ -9719,6 +10079,44 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
+static void __perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ struct trace_entry *entry = record;
+
+ if (event->attr.config != entry->type)
+ return;
+ /* Cannot deliver synchronous signal to other task. */
+ if (event->attr.sigtrap)
+ return;
+ if (perf_tp_event_match(event, data, regs))
+ perf_swevent_event(event, count, data, regs);
+}
+
+static void perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_event_context *ctx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct pmu *pmu = &perf_tracepoint;
+ struct perf_event *event, *sibling;
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, sibling);
+ }
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, sibling);
+ }
+}
+
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
struct task_struct *task)
@@ -9735,6 +10133,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
perf_sample_data_init(&data, 0, 0);
data.raw = &raw;
+ data.sample_flags |= PERF_SAMPLE_RAW;
perf_trace_buf_update(record, event_type);
@@ -9749,26 +10148,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
*/
if (task && task != current) {
struct perf_event_context *ctx;
- struct trace_entry *entry = record;
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (!ctx)
goto unlock;
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->cpu != smp_processor_id())
- continue;
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- continue;
- if (event->attr.config != entry->type)
- continue;
- /* Cannot deliver synchronous signal to other task. */
- if (event->attr.sigtrap)
- continue;
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
+ raw_spin_lock(&ctx->lock);
+ perf_tp_event_target_task(count, record, regs, &data, ctx);
+ raw_spin_unlock(&ctx->lock);
unlock:
rcu_read_unlock();
}
@@ -9777,44 +10165,6 @@ unlock:
}
EXPORT_SYMBOL_GPL(perf_tp_event);
-static void tp_perf_event_destroy(struct perf_event *event)
-{
- perf_trace_destroy(event);
-}
-
-static int perf_tp_event_init(struct perf_event *event)
-{
- int err;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -ENOENT;
-
- /*
- * no branch sampling for tracepoint events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- err = perf_trace_init(event);
- if (err)
- return err;
-
- event->destroy = tp_perf_event_destroy;
-
- return 0;
-}
-
-static struct pmu perf_tracepoint = {
- .task_ctx_nr = perf_sw_context,
-
- .event_init = perf_tp_event_init,
- .add = perf_trace_add,
- .del = perf_trace_del,
- .start = perf_swevent_start,
- .stop = perf_swevent_stop,
- .read = perf_swevent_read,
-};
-
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
* Flags in config, used by dynamic PMU kprobe and uprobe
@@ -9989,8 +10339,16 @@ static void bpf_overflow_handler(struct perf_event *event,
goto out;
rcu_read_lock();
prog = READ_ONCE(event->prog);
- if (prog)
+ if (prog) {
+ if (prog->call_get_stack &&
+ (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
+ !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) {
+ data->callchain = perf_callchain(event, regs);
+ data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
+ }
+
ret = bpf_prog_run(prog, &ctx);
+ }
rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
@@ -10016,7 +10374,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event,
if (event->attr.precise_ip &&
prog->call_get_stack &&
- (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
+ (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
event->attr.exclude_callchain_kernel ||
event->attr.exclude_callchain_user)) {
/*
@@ -10229,8 +10587,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter,
struct perf_addr_filter_range *fr)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
@@ -10892,36 +11251,9 @@ static int perf_event_idx_default(struct perf_event *event)
return 0;
}
-/*
- * Ensures all contexts with the same task_ctx_nr have the same
- * pmu_cpu_context too.
- */
-static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
-{
- struct pmu *pmu;
-
- if (ctxn < 0)
- return NULL;
-
- list_for_each_entry(pmu, &pmus, entry) {
- if (pmu->task_ctx_nr == ctxn)
- return pmu->pmu_cpu_context;
- }
-
- return NULL;
-}
-
static void free_pmu_context(struct pmu *pmu)
{
- /*
- * Static contexts such as perf_sw_context have a global lifetime
- * and may be shared between different PMUs. Avoid freeing them
- * when a single PMU is going away.
- */
- if (pmu->task_ctx_nr > perf_invalid_context)
- return;
-
- free_percpu(pmu->pmu_cpu_context);
+ free_percpu(pmu->cpu_pmu_context);
}
/*
@@ -10933,7 +11265,7 @@ static ssize_t nr_addr_filters_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
}
DEVICE_ATTR_RO(nr_addr_filters);
@@ -10944,7 +11276,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);
@@ -10955,7 +11287,7 @@ perf_event_mux_interval_ms_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms);
}
static DEFINE_MUTEX(mux_interval_mutex);
@@ -10985,12 +11317,11 @@ perf_event_mux_interval_ms_store(struct device *dev,
/* update all cpuctx for this PMU */
cpus_read_lock();
for_each_online_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+ struct perf_cpu_pmu_context *cpc;
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- cpu_function_call(cpu,
- (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
+ cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
}
cpus_read_unlock();
mutex_unlock(&mux_interval_mutex);
@@ -11027,13 +11358,15 @@ static int pmu_dev_alloc(struct pmu *pmu)
pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
- if (ret)
- goto free_dev;
dev_set_drvdata(pmu->dev, pmu);
pmu->dev->bus = &pmu_bus;
pmu->dev->release = pmu_dev_release;
+
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
+
ret = device_add(pmu->dev);
if (ret)
goto free_dev;
@@ -11101,47 +11434,19 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
}
skip_type:
- if (pmu->task_ctx_nr == perf_hw_context) {
- static int hw_context_taken = 0;
-
- /*
- * Other than systems with heterogeneous CPUs, it never makes
- * sense for two PMUs to share perf_hw_context. PMUs which are
- * uncore must use perf_invalid_context.
- */
- if (WARN_ON_ONCE(hw_context_taken &&
- !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
- pmu->task_ctx_nr = perf_invalid_context;
-
- hw_context_taken = 1;
- }
-
- pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
- if (pmu->pmu_cpu_context)
- goto got_cpu_context;
-
ret = -ENOMEM;
- pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
- if (!pmu->pmu_cpu_context)
+ pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
+ if (!pmu->cpu_pmu_context)
goto free_dev;
for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- __perf_event_init_context(&cpuctx->ctx);
- lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
- lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
- cpuctx->ctx.pmu = pmu;
- cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
-
- __perf_mux_hrtimer_init(cpuctx, cpu);
-
- cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
- cpuctx->heap = cpuctx->heap_default;
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ __perf_init_event_pmu_context(&cpc->epc, pmu);
+ __perf_mux_hrtimer_init(cpc, cpu);
}
-got_cpu_context:
if (!pmu->start_txn) {
if (pmu->pmu_enable) {
/*
@@ -11528,8 +11833,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_waitqueue_head(&event->waitq);
- event->pending_disable = -1;
- init_irq_work(&event->pending, perf_pending_event);
+ init_irq_work(&event->pending_irq, perf_pending_irq);
+ init_task_work(&event->pending_task, perf_pending_task);
mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock);
@@ -11551,9 +11856,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (parent_event)
event->event_caps = parent_event->event_caps;
- if (event->attr.sigtrap)
- atomic_set(&event->event_limit, 1);
-
if (task) {
event->attach_state = PERF_ATTACH_TASK;
/*
@@ -11623,10 +11925,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
/*
- * Disallow uncore-cgroup events, they don't make sense as the cgroup will
- * be different on other CPUs in the uncore mask.
+ * Disallow uncore-task events. Similarly, disallow uncore-cgroup
+ * events (they don't make sense as the cgroup will be different
+ * on other CPUs in the uncore mask).
*/
- if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
+ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
err = -EINVAL;
goto err_pmu;
}
@@ -11709,11 +12012,9 @@ err_pmu:
event->destroy(event);
module_put(pmu->module);
err_ns:
- if (event->ns)
- put_pid_ns(event->ns);
if (event->hw.target)
put_task_struct(event->hw.target);
- kmem_cache_free(perf_event_cache, event);
+ call_rcu(&event->rcu_head, free_event_rcu);
return ERR_PTR(err);
}
@@ -11975,37 +12276,6 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}
-/*
- * Variation on perf_event_ctx_lock_nested(), except we take two context
- * mutexes.
- */
-static struct perf_event_context *
-__perf_event_ctx_lock_double(struct perf_event *group_leader,
- struct perf_event_context *ctx)
-{
- struct perf_event_context *gctx;
-
-again:
- rcu_read_lock();
- gctx = READ_ONCE(group_leader->ctx);
- if (!refcount_inc_not_zero(&gctx->refcount)) {
- rcu_read_unlock();
- goto again;
- }
- rcu_read_unlock();
-
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
- if (group_leader->ctx != gctx) {
- mutex_unlock(&ctx->mutex);
- mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
- goto again;
- }
-
- return gctx;
-}
-
static bool
perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
{
@@ -12051,9 +12321,10 @@ SYSCALL_DEFINE5(perf_event_open,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
struct perf_event *group_leader = NULL, *output_event = NULL;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx, *gctx;
+ struct perf_event_context *ctx;
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -12183,42 +12454,53 @@ SYSCALL_DEFINE5(perf_event_open,
if (pmu->task_ctx_nr == perf_sw_context)
event->event_caps |= PERF_EV_CAP_SOFTWARE;
- if (group_leader) {
- if (is_software_event(event) &&
- !in_software_context(group_leader)) {
- /*
- * If the event is a sw event, but the group_leader
- * is on hw context.
- *
- * Allow the addition of software events to hw
- * groups, this is safe because software events
- * never fail to schedule.
- */
- pmu = group_leader->ctx->pmu;
- } else if (!is_software_event(event) &&
- is_software_event(group_leader) &&
- (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
- /*
- * In case the group is a pure software group, and we
- * try to add a hardware event, move the whole group to
- * the hardware context.
- */
- move_group = 1;
- }
+ if (task) {
+ err = down_read_interruptible(&task->signal->exec_update_lock);
+ if (err)
+ goto err_alloc;
+
+ /*
+ * We must hold exec_update_lock across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!perf_check_permission(&attr, task))
+ goto err_cred;
}
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_alloc;
+ goto err_cred;
+ }
+
+ mutex_lock(&ctx->mutex);
+
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
+
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
+
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_locked;
+ }
}
- /*
- * Look up the group leader (we will attach this event to it):
- */
if (group_leader) {
err = -EINVAL;
@@ -12227,11 +12509,11 @@ SYSCALL_DEFINE5(perf_event_open,
* becoming part of another group-sibling):
*/
if (group_leader->group_leader != group_leader)
- goto err_context;
+ goto err_locked;
/* All events in a group should have the same clock */
if (group_leader->clock != event->clock)
- goto err_context;
+ goto err_locked;
/*
* Make sure we're both events for the same CPU;
@@ -12239,145 +12521,76 @@ SYSCALL_DEFINE5(perf_event_open,
* you can never concurrently schedule them anyhow.
*/
if (group_leader->cpu != event->cpu)
- goto err_context;
-
- /*
- * Make sure we're both on the same task, or both
- * per-CPU events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ goto err_locked;
/*
- * Do not allow to attach to a group in a different task
- * or CPU context. If we're moving SW events, we'll fix
- * this up later, so allow that.
- *
- * Racy, not holding group_leader->ctx->mutex, see comment with
- * perf_event_ctx_lock().
+ * Make sure we're both on the same context; either task or cpu.
*/
- if (!move_group && group_leader->ctx != ctx)
- goto err_context;
+ if (group_leader->ctx != ctx)
+ goto err_locked;
/*
* Only a group leader can be exclusive or pinned
*/
if (attr.exclusive || attr.pinned)
- goto err_context;
- }
-
- if (output_event) {
- err = perf_event_set_output(event, output_event);
- if (err)
- goto err_context;
- }
-
- event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
- f_flags);
- if (IS_ERR(event_file)) {
- err = PTR_ERR(event_file);
- event_file = NULL;
- goto err_context;
- }
-
- if (task) {
- err = down_read_interruptible(&task->signal->exec_update_lock);
- if (err)
- goto err_file;
-
- /*
- * We must hold exec_update_lock across this and any potential
- * perf_install_in_context() call for this new event to
- * serialize against exec() altering our credentials (and the
- * perf_event_exit_task() that could imply).
- */
- err = -EACCES;
- if (!perf_check_permission(&attr, task))
- goto err_cred;
- }
-
- if (move_group) {
- gctx = __perf_event_ctx_lock_double(group_leader, ctx);
-
- if (gctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
goto err_locked;
- }
- /*
- * Check if we raced against another sys_perf_event_open() call
- * moving the software group underneath us.
- */
- if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ if (is_software_event(event) &&
+ !in_software_context(group_leader)) {
/*
- * If someone moved the group out from under us, check
- * if this new event wound up on the same ctx, if so
- * its the regular !move_group case, otherwise fail.
+ * If the event is a sw event, but the group_leader
+ * is on hw context.
+ *
+ * Allow the addition of software events to hw
+ * groups, this is safe because software events
+ * never fail to schedule.
+ *
+ * Note the comment that goes with struct
+ * perf_event_pmu_context.
*/
- if (gctx != ctx) {
- err = -EINVAL;
- goto err_locked;
- } else {
- perf_event_ctx_unlock(group_leader, gctx);
- move_group = 0;
- goto not_move_group;
+ pmu = group_leader->pmu_ctx->pmu;
+ } else if (!is_software_event(event)) {
+ if (is_software_event(group_leader) &&
+ (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ /*
+ * In case the group is a pure software group, and we
+ * try to add a hardware event, move the whole group to
+ * the hardware context.
+ */
+ move_group = 1;
}
- }
-
- /*
- * Failure to create exclusive events returns -EBUSY.
- */
- err = -EBUSY;
- if (!exclusive_event_installable(group_leader, ctx))
- goto err_locked;
- for_each_sibling_event(sibling, group_leader) {
- if (!exclusive_event_installable(sibling, ctx))
+ /* Don't allow group of multiple hw events from different pmus */
+ if (!in_software_context(group_leader) &&
+ group_leader->pmu_ctx->pmu != pmu)
goto err_locked;
}
- } else {
- mutex_lock(&ctx->mutex);
-
- /*
- * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
- * see the group_leader && !move_group test earlier.
- */
- if (group_leader && group_leader->ctx != ctx) {
- err = -EINVAL;
- goto err_locked;
- }
}
-not_move_group:
- if (ctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
+ /*
+ * Now that we're certain of the pmu; find the pmu_ctx.
+ */
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
goto err_locked;
}
+ event->pmu_ctx = pmu_ctx;
- if (!perf_event_validate_size(event)) {
- err = -E2BIG;
- goto err_locked;
+ if (output_event) {
+ err = perf_event_set_output(event, output_event);
+ if (err)
+ goto err_context;
}
- if (!task) {
- /*
- * Check if the @cpu we're creating an event for is online.
- *
- * We use the perf_cpu_context::ctx::mutex to serialize against
- * the hotplug notifiers. See perf_event_{init,exit}_cpu().
- */
- struct perf_cpu_context *cpuctx =
- container_of(ctx, struct perf_cpu_context, ctx);
-
- if (!cpuctx->online) {
- err = -ENODEV;
- goto err_locked;
- }
+ if (!perf_event_validate_size(event)) {
+ err = -E2BIG;
+ goto err_context;
}
if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
err = -EINVAL;
- goto err_locked;
+ goto err_context;
}
/*
@@ -12386,36 +12599,33 @@ not_move_group:
*/
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_locked;
+ goto err_context;
}
WARN_ON_ONCE(ctx->parent_ctx);
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
+ if (IS_ERR(event_file)) {
+ err = PTR_ERR(event_file);
+ event_file = NULL;
+ goto err_context;
+ }
+
/*
* This is the point on no return; we cannot fail hereafter. This is
* where we start modifying current state.
*/
if (move_group) {
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
perf_remove_from_context(group_leader, 0);
- put_ctx(gctx);
+ put_pmu_ctx(group_leader->pmu_ctx);
for_each_sibling_event(sibling, group_leader) {
perf_remove_from_context(sibling, 0);
- put_ctx(gctx);
+ put_pmu_ctx(sibling->pmu_ctx);
}
/*
- * Wait for everybody to stop referencing the events through
- * the old lists, before installing it on new lists.
- */
- synchronize_rcu();
-
- /*
* Install the group siblings before the group leader.
*
* Because a group leader will try and install the entire group
@@ -12426,9 +12636,10 @@ not_move_group:
* reachable through the group lists.
*/
for_each_sibling_event(sibling, group_leader) {
+ sibling->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(sibling);
perf_install_in_context(ctx, sibling, sibling->cpu);
- get_ctx(ctx);
}
/*
@@ -12436,9 +12647,10 @@ not_move_group:
* event. What we want here is event in the initial
* startup state, ready to be add into new context.
*/
+ group_leader->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
- get_ctx(ctx);
}
/*
@@ -12455,8 +12667,6 @@ not_move_group:
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
if (task) {
@@ -12478,25 +12688,17 @@ not_move_group:
fd_install(event_fd, event_file);
return event_fd;
+err_context:
+ /* event->pmu_ctx freed by free_event() */
err_locked:
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
err_cred:
if (task)
up_read(&task->signal->exec_update_lock);
-err_file:
- fput(event_file);
-err_context:
- perf_unpin_context(ctx);
- put_ctx(ctx);
err_alloc:
- /*
- * If event_file is set, the fput() above will have called ->release()
- * and that will take care of freeing the event.
- */
- if (!event_file)
- free_event(event);
+ free_event(event);
err_task:
if (task)
put_task_struct(task);
@@ -12522,8 +12724,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
perf_overflow_handler_t overflow_handler,
void *context)
{
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event_context *ctx;
struct perf_event *event;
+ struct pmu *pmu;
int err;
/*
@@ -12542,14 +12746,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
/* Mark owner so we could distinguish it from user events. */
event->owner = TASK_TOMBSTONE;
+ pmu = event->pmu;
+
+ if (pmu->task_ctx_nr == perf_sw_context)
+ event->event_caps |= PERF_EV_CAP_SOFTWARE;
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(event->pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_free;
+ goto err_alloc;
}
WARN_ON_ONCE(ctx->parent_ctx);
@@ -12559,6 +12767,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
+ goto err_unlock;
+ }
+ event->pmu_ctx = pmu_ctx;
+
if (!task) {
/*
* Check if the @cpu we're creating an event for is online.
@@ -12570,13 +12785,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
container_of(ctx, struct perf_cpu_context, ctx);
if (!cpuctx->online) {
err = -ENODEV;
- goto err_unlock;
+ goto err_pmu_ctx;
}
}
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_unlock;
+ goto err_pmu_ctx;
}
perf_install_in_context(ctx, event, event->cpu);
@@ -12585,44 +12800,61 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
return event;
+err_pmu_ctx:
+ put_pmu_ctx(pmu_ctx);
err_unlock:
mutex_unlock(&ctx->mutex);
perf_unpin_context(ctx);
put_ctx(ctx);
-err_free:
+err_alloc:
free_event(event);
err:
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
-void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+static void __perf_pmu_remove(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu,
+ struct perf_event_groups *groups,
+ struct list_head *events)
{
- struct perf_event_context *src_ctx;
- struct perf_event_context *dst_ctx;
- struct perf_event *event, *tmp;
- LIST_HEAD(events);
-
- src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
- dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+ struct perf_event *event, *sibling;
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
- mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
- list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
- event_entry) {
+ perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
perf_remove_from_context(event, 0);
- unaccount_event_cpu(event, src_cpu);
- put_ctx(src_ctx);
- list_add(&event->migrate_entry, &events);
+ unaccount_event_cpu(event, cpu);
+ put_pmu_ctx(event->pmu_ctx);
+ list_add(&event->migrate_entry, events);
+
+ for_each_sibling_event(sibling, event) {
+ perf_remove_from_context(sibling, 0);
+ unaccount_event_cpu(sibling, cpu);
+ put_pmu_ctx(sibling->pmu_ctx);
+ list_add(&sibling->migrate_entry, events);
+ }
}
+}
- /*
- * Wait for the events to quiesce before re-instating them.
- */
- synchronize_rcu();
+static void __perf_pmu_install_event(struct pmu *pmu,
+ struct perf_event_context *ctx,
+ int cpu, struct perf_event *event)
+{
+ struct perf_event_pmu_context *epc;
+
+ event->cpu = cpu;
+ epc = find_get_pmu_context(pmu, ctx, event);
+ event->pmu_ctx = epc;
+
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ account_event_cpu(event, cpu);
+ perf_install_in_context(ctx, event, cpu);
+}
+
+static void __perf_pmu_install(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu, struct list_head *events)
+{
+ struct perf_event *event, *tmp;
/*
* Re-instate events in 2 passes.
@@ -12632,30 +12864,48 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
* leader will enable its siblings, even if those are still on the old
* context.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
if (event->group_leader == event)
continue;
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
}
/*
* Once all the siblings are setup properly, install the group leaders
* to make it go.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
}
+}
+
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+ struct perf_event_context *src_ctx, *dst_ctx;
+ LIST_HEAD(events);
+
+ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
+ dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
+
+ /*
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
+ */
+ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
+
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
+
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
+ synchronize_rcu();
+
+ __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+
mutex_unlock(&dst_ctx->mutex);
mutex_unlock(&src_ctx->mutex);
}
@@ -12735,14 +12985,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
perf_event_wakeup(event);
}
-static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
+static void perf_event_exit_task_context(struct task_struct *child)
{
struct perf_event_context *child_ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
WARN_ON_ONCE(child != current);
- child_ctx = perf_pin_task_context(child, ctxn);
+ child_ctx = perf_pin_task_context(child);
if (!child_ctx)
return;
@@ -12764,13 +13014,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
+ task_ctx_sched_out(child_ctx, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
* and mark the context dead.
*/
- RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+ RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
put_ctx(child_ctx); /* cannot be last */
WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
put_task_struct(current); /* cannot be last */
@@ -12805,7 +13055,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
void perf_event_exit_task(struct task_struct *child)
{
struct perf_event *event, *tmp;
- int ctxn;
mutex_lock(&child->perf_event_mutex);
list_for_each_entry_safe(event, tmp, &child->perf_event_list,
@@ -12821,8 +13070,7 @@ void perf_event_exit_task(struct task_struct *child)
}
mutex_unlock(&child->perf_event_mutex);
- for_each_task_context_nr(ctxn)
- perf_event_exit_task_context(child, ctxn);
+ perf_event_exit_task_context(child);
/*
* The perf_event_exit_task_context calls perf_event_task
@@ -12865,56 +13113,51 @@ void perf_event_free_task(struct task_struct *task)
{
struct perf_event_context *ctx;
struct perf_event *event, *tmp;
- int ctxn;
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
+ ctx = rcu_access_pointer(task->perf_event_ctxp);
+ if (!ctx)
+ return;
- mutex_lock(&ctx->mutex);
- raw_spin_lock_irq(&ctx->lock);
- /*
- * Destroy the task <-> ctx relation and mark the context dead.
- *
- * This is important because even though the task hasn't been
- * exposed yet the context has been (through child_list).
- */
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
- WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
- put_task_struct(task); /* cannot be last */
- raw_spin_unlock_irq(&ctx->lock);
+ mutex_lock(&ctx->mutex);
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Destroy the task <-> ctx relation and mark the context dead.
+ *
+ * This is important because even though the task hasn't been
+ * exposed yet the context has been (through child_list).
+ */
+ RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
+ raw_spin_unlock_irq(&ctx->lock);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
- perf_free_event(event, ctx);
- mutex_unlock(&ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
+ perf_free_event(event, ctx);
- /*
- * perf_event_release_kernel() could've stolen some of our
- * child events and still have them on its free_list. In that
- * case we must wait for these events to have been freed (in
- * particular all their references to this task must've been
- * dropped).
- *
- * Without this copy_process() will unconditionally free this
- * task (irrespective of its reference count) and
- * _free_event()'s put_task_struct(event->hw.target) will be a
- * use-after-free.
- *
- * Wait for all events to drop their context reference.
- */
- wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
- put_ctx(ctx); /* must be last */
- }
+ mutex_unlock(&ctx->mutex);
+
+ /*
+ * perf_event_release_kernel() could've stolen some of our
+ * child events and still have them on its free_list. In that
+ * case we must wait for these events to have been freed (in
+ * particular all their references to this task must've been
+ * dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+ put_ctx(ctx); /* must be last */
}
void perf_event_delayed_put(struct task_struct *task)
{
- int ctxn;
-
- for_each_task_context_nr(ctxn)
- WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+ WARN_ON_ONCE(task->perf_event_ctxp);
}
struct file *perf_event_get(unsigned int fd)
@@ -12964,6 +13207,7 @@ inherit_event(struct perf_event *parent_event,
struct perf_event_context *child_ctx)
{
enum perf_event_state parent_state = parent_event->state;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *child_event;
unsigned long flags;
@@ -12984,17 +13228,12 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
-
- if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
- !child_ctx->task_ctx_data) {
- struct pmu *pmu = child_event->pmu;
-
- child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
- if (!child_ctx->task_ctx_data) {
- free_event(child_event);
- return ERR_PTR(-ENOMEM);
- }
+ pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
+ if (IS_ERR(pmu_ctx)) {
+ free_event(child_event);
+ return NULL;
}
+ child_event->pmu_ctx = pmu_ctx;
/*
* is_orphaned_event() and list_add_tail(&parent_event->child_list)
@@ -13117,11 +13356,11 @@ static int inherit_group(struct perf_event *parent_event,
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
- struct task_struct *child, int ctxn,
+ struct task_struct *child,
u64 clone_flags, int *inherited_all)
{
- int ret;
struct perf_event_context *child_ctx;
+ int ret;
if (!event->attr.inherit ||
(event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
@@ -13131,7 +13370,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
return 0;
}
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (!child_ctx) {
/*
* This is executed from the parent task context, so
@@ -13139,16 +13378,14 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
- child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+ child_ctx = alloc_perf_context(child);
if (!child_ctx)
return -ENOMEM;
- child->perf_event_ctxp[ctxn] = child_ctx;
+ child->perf_event_ctxp = child_ctx;
}
- ret = inherit_group(event, parent, parent_ctx,
- child, child_ctx);
-
+ ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
if (ret)
*inherited_all = 0;
@@ -13158,8 +13395,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
/*
* Initialize the perf_event context in task_struct
*/
-static int perf_event_init_context(struct task_struct *child, int ctxn,
- u64 clone_flags)
+static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
@@ -13169,14 +13405,14 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
unsigned long flags;
int ret = 0;
- if (likely(!parent->perf_event_ctxp[ctxn]))
+ if (likely(!parent->perf_event_ctxp))
return 0;
/*
* If the parent's context is a clone, pin it so it won't get
* swapped under us.
*/
- parent_ctx = perf_pin_task_context(parent, ctxn);
+ parent_ctx = perf_pin_task_context(parent);
if (!parent_ctx)
return 0;
@@ -13199,8 +13435,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
*/
perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, clone_flags,
- &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -13216,8 +13451,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, clone_flags,
- &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -13225,7 +13459,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (child_ctx && inherited_all) {
/*
@@ -13261,18 +13495,16 @@ out_unlock:
*/
int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
- int ctxn, ret;
+ int ret;
- memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ child->perf_event_ctxp = NULL;
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);
- for_each_task_context_nr(ctxn) {
- ret = perf_event_init_context(child, ctxn, clone_flags);
- if (ret) {
- perf_event_free_task(child);
- return ret;
- }
+ ret = perf_event_init_context(child, clone_flags);
+ if (ret) {
+ perf_event_free_task(child);
+ return ret;
}
return 0;
@@ -13281,6 +13513,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags)
static void __init perf_event_init_all_cpus(void)
{
struct swevent_htable *swhash;
+ struct perf_cpu_context *cpuctx;
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
@@ -13288,15 +13521,19 @@ static void __init perf_event_init_all_cpus(void)
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
- INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
-#ifdef CONFIG_CGROUP_PERF
- INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
-#endif
INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
+
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ __perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+ lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
+ cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+ cpuctx->heap = cpuctx->heap_default;
}
}
@@ -13318,12 +13555,12 @@ static void perf_swevent_init_cpu(unsigned int cpu)
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *ctx = __info;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
@@ -13333,18 +13570,16 @@ static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
+ // XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
- cpuctx->online = 0;
- mutex_unlock(&ctx->mutex);
- }
+ mutex_lock(&ctx->mutex);
+ smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ cpuctx->online = 0;
+ mutex_unlock(&ctx->mutex);
cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
@@ -13358,20 +13593,17 @@ int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
cpumask_set_cpu(cpu, perf_online_mask);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- cpuctx->online = 1;
- mutex_unlock(&ctx->mutex);
- }
+ mutex_lock(&ctx->mutex);
+ cpuctx->online = 1;
+ mutex_unlock(&ctx->mutex);
mutex_unlock(&pmus_lock);
return 0;
@@ -13508,9 +13740,12 @@ static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
- rcu_read_lock();
- perf_cgroup_switch(task);
- rcu_read_unlock();
+
+ preempt_disable();
+ if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
+ perf_cgroup_switch(task);
+ preempt_enable();
+
return 0;
}
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index f32320ac02fd..c3797701339c 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -17,61 +17,276 @@
* This file contains the arch-independent routines.
*/
+#include <linux/hw_breakpoint.h>
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/init.h>
#include <linux/irqflags.h>
-#include <linux/kallsyms.h>
-#include <linux/notifier.h>
-#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/percpu-rwsem.h>
#include <linux/percpu.h>
+#include <linux/rhashtable.h>
#include <linux/sched.h>
-#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/bug.h>
-#include <linux/hw_breakpoint.h>
/*
- * Constraints data
+ * Datastructure to track the total uses of N slots across tasks or CPUs;
+ * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots.
+ */
+struct bp_slots_histogram {
+#ifdef hw_breakpoint_slots
+ atomic_t count[hw_breakpoint_slots(0)];
+#else
+ atomic_t *count;
+#endif
+};
+
+/*
+ * Per-CPU constraints data.
*/
struct bp_cpuinfo {
- /* Number of pinned cpu breakpoints in a cpu */
- unsigned int cpu_pinned;
- /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
- unsigned int *tsk_pinned;
- /* Number of non-pinned cpu/task breakpoints in a cpu */
- unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
+ /* Number of pinned CPU breakpoints in a CPU. */
+ unsigned int cpu_pinned;
+ /* Histogram of pinned task breakpoints in a CPU. */
+ struct bp_slots_histogram tsk_pinned;
};
static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
-static int nr_slots[TYPE_MAX];
static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
{
return per_cpu_ptr(bp_cpuinfo + type, cpu);
}
+/* Number of pinned CPU breakpoints globally. */
+static struct bp_slots_histogram cpu_pinned[TYPE_MAX];
+/* Number of pinned CPU-independent task breakpoints. */
+static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX];
+
/* Keep track of the breakpoints attached to tasks */
-static LIST_HEAD(bp_task_head);
+static struct rhltable task_bps_ht;
+static const struct rhashtable_params task_bps_ht_params = {
+ .head_offset = offsetof(struct hw_perf_event, bp_list),
+ .key_offset = offsetof(struct hw_perf_event, target),
+ .key_len = sizeof_field(struct hw_perf_event, target),
+ .automatic_shrinking = true,
+};
-static int constraints_initialized;
+static bool constraints_initialized __ro_after_init;
-/* Gather the number of total pinned and un-pinned bp in a cpuset */
-struct bp_busy_slots {
- unsigned int pinned;
- unsigned int flexible;
-};
+/*
+ * Synchronizes accesses to the per-CPU constraints; the locking rules are:
+ *
+ * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock
+ * (due to bp_slots_histogram::count being atomic, no update are lost).
+ *
+ * 2. Holding a write-lock is required for computations that require a
+ * stable snapshot of all bp_cpuinfo::tsk_pinned.
+ *
+ * 3. In all other cases, non-atomic accesses require the appropriately held
+ * lock (read-lock for read-only accesses; write-lock for reads/writes).
+ */
+DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem);
-/* Serialize accesses to the above constraints */
-static DEFINE_MUTEX(nr_bp_mutex);
+/*
+ * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since
+ * rhltable synchronizes concurrent insertions/deletions, independent tasks may
+ * insert/delete concurrently; therefore, a mutex per task is sufficient.
+ *
+ * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a
+ * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is
+ * that hw_breakpoint may contend with per-task perf event list management. The
+ * assumption is that perf usecases involving hw_breakpoints are very unlikely
+ * to result in unnecessary contention.
+ */
+static inline struct mutex *get_task_bps_mutex(struct perf_event *bp)
+{
+ struct task_struct *tsk = bp->hw.target;
-__weak int hw_breakpoint_weight(struct perf_event *bp)
+ return tsk ? &tsk->perf_event_mutex : NULL;
+}
+
+static struct mutex *bp_constraints_lock(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx) {
+ /*
+ * Fully analogous to the perf_try_init_event() nesting
+ * argument in the comment near perf_event_ctx_lock_nested();
+ * this child->perf_event_mutex cannot ever deadlock against
+ * the parent->perf_event_mutex usage from
+ * perf_event_task_{en,dis}able().
+ *
+ * Specifically, inherited events will never occur on
+ * ->perf_event_list.
+ */
+ mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING);
+ percpu_down_read(&bp_cpuinfo_sem);
+ } else {
+ percpu_down_write(&bp_cpuinfo_sem);
+ }
+
+ return tsk_mtx;
+}
+
+static void bp_constraints_unlock(struct mutex *tsk_mtx)
+{
+ if (tsk_mtx) {
+ percpu_up_read(&bp_cpuinfo_sem);
+ mutex_unlock(tsk_mtx);
+ } else {
+ percpu_up_write(&bp_cpuinfo_sem);
+ }
+}
+
+static bool bp_constraints_is_locked(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ return percpu_is_write_locked(&bp_cpuinfo_sem) ||
+ (tsk_mtx ? mutex_is_locked(tsk_mtx) :
+ percpu_is_read_locked(&bp_cpuinfo_sem));
+}
+
+static inline void assert_bp_constraints_lock_held(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx)
+ lockdep_assert_held(tsk_mtx);
+ lockdep_assert_held(&bp_cpuinfo_sem);
+}
+
+#ifdef hw_breakpoint_slots
+/*
+ * Number of breakpoint slots is constant, and the same for all types.
+ */
+static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA));
+static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); }
+static inline int init_breakpoint_slots(void) { return 0; }
+#else
+/*
+ * Dynamic number of breakpoint slots.
+ */
+static int __nr_bp_slots[TYPE_MAX] __ro_after_init;
+
+static inline int hw_breakpoint_slots_cached(int type)
+{
+ return __nr_bp_slots[type];
+}
+
+static __init bool
+bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL);
+ return hist->count;
+}
+
+static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist)
+{
+ kfree(hist->count);
+}
+
+static __init int init_breakpoint_slots(void)
+{
+ int i, cpu, err_cpu;
+
+ for (i = 0; i < TYPE_MAX; i++)
+ __nr_bp_slots[i] = hw_breakpoint_slots(i);
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < TYPE_MAX; i++) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, i);
+
+ if (!bp_slots_histogram_alloc(&info->tsk_pinned, i))
+ goto err;
+ }
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ if (!bp_slots_histogram_alloc(&cpu_pinned[i], i))
+ goto err;
+ if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i))
+ goto err;
+ }
+
+ return 0;
+err:
+ for_each_possible_cpu(err_cpu) {
+ for (i = 0; i < TYPE_MAX; i++)
+ bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned);
+ if (err_cpu == cpu)
+ break;
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ bp_slots_histogram_free(&cpu_pinned[i]);
+ bp_slots_histogram_free(&tsk_pinned_all[i]);
+ }
+
+ return -ENOMEM;
+}
+#endif
+
+static inline void
+bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val)
+{
+ const int old_idx = old - 1;
+ const int new_idx = old_idx + val;
+
+ if (old_idx >= 0)
+ WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0);
+ if (new_idx >= 0)
+ WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0);
+}
+
+static int
+bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count = atomic_read(&hist->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist->count[i]);
+ if (count > 0)
+ return i + 1;
+ WARN(count < 0, "inconsistent breakpoint slots histogram");
+ }
+
+ return 0;
+}
+
+static int
+bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2,
+ enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count1 = atomic_read(&hist1->count[i]);
+ const int count2 = atomic_read(&hist2->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist1->count[i]);
+ ASSERT_EXCLUSIVE_WRITER(hist2->count[i]);
+ if (count1 + count2 > 0)
+ return i + 1;
+ WARN(count1 < 0, "inconsistent breakpoint slots histogram");
+ WARN(count2 < 0, "inconsistent breakpoint slots histogram");
+ }
+
+ return 0;
+}
+
+#ifndef hw_breakpoint_weight
+static inline int hw_breakpoint_weight(struct perf_event *bp)
{
return 1;
}
+#endif
static inline enum bp_type_idx find_slot_idx(u64 bp_type)
{
@@ -82,39 +297,61 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type)
}
/*
- * Report the maximum number of pinned breakpoints a task
- * have in this cpu
+ * Return the maximum number of pinned breakpoints a task has in this CPU.
*/
static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int i;
+ struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned;
- for (i = nr_slots[type] - 1; i >= 0; i--) {
- if (tsk_pinned[i] > 0)
- return i + 1;
- }
-
- return 0;
+ /*
+ * At this point we want to have acquired the bp_cpuinfo_sem as a
+ * writer to ensure that there are no concurrent writers in
+ * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot.
+ */
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type);
}
/*
* Count the number of breakpoints of the same type and same task.
* The given event must be not on the list.
+ *
+ * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent,
+ * returns a negative value.
*/
static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
{
- struct task_struct *tsk = bp->hw.target;
+ struct rhlist_head *head, *pos;
struct perf_event *iter;
int count = 0;
- list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
- if (iter->hw.target == tsk &&
- find_slot_idx(iter->attr.bp_type) == type &&
- (iter->cpu < 0 || cpu == iter->cpu))
- count += hw_breakpoint_weight(iter);
+ /*
+ * We need a stable snapshot of the per-task breakpoint list.
+ */
+ assert_bp_constraints_lock_held(bp);
+
+ rcu_read_lock();
+ head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
+ if (!head)
+ goto out;
+
+ rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) {
+ if (find_slot_idx(iter->attr.bp_type) != type)
+ continue;
+
+ if (iter->cpu >= 0) {
+ if (cpu == -1) {
+ count = -1;
+ goto out;
+ } else if (cpu != iter->cpu)
+ continue;
+ }
+
+ count += hw_breakpoint_weight(iter);
}
+out:
+ rcu_read_unlock();
return count;
}
@@ -126,16 +363,29 @@ static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
}
/*
- * Report the number of pinned/un-pinned breakpoints we have in
- * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ * Returns the max pinned breakpoint slots in a given
+ * CPU (cpu > -1) or across all of them (cpu = -1).
*/
-static void
-fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
- enum bp_type_idx type)
+static int
+max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type)
{
const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int pinned_slots = 0;
int cpu;
+ if (bp->hw.target && bp->cpu < 0) {
+ int max_pinned = task_bp_pinned(-1, bp, type);
+
+ if (max_pinned >= 0) {
+ /*
+ * Fast path: task_bp_pinned() is CPU-independent and
+ * returns the same value for any CPU.
+ */
+ max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type);
+ return max_pinned;
+ }
+ }
+
for_each_cpu(cpu, cpumask) {
struct bp_cpuinfo *info = get_bp_info(cpu, type);
int nr;
@@ -146,71 +396,131 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
else
nr += task_bp_pinned(cpu, bp, type);
- if (nr > slots->pinned)
- slots->pinned = nr;
-
- nr = info->flexible;
- if (nr > slots->flexible)
- slots->flexible = nr;
+ pinned_slots = max(nr, pinned_slots);
}
-}
-/*
- * For now, continue to consider flexible as pinned, until we can
- * ensure no flexible event can ever be scheduled before a pinned event
- * in a same cpu.
- */
-static void
-fetch_this_slot(struct bp_busy_slots *slots, int weight)
-{
- slots->pinned += weight;
-}
-
-/*
- * Add a pinned breakpoint for the given task in our constraint table
- */
-static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
- enum bp_type_idx type, int weight)
-{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int old_idx, new_idx;
-
- old_idx = task_bp_pinned(cpu, bp, type) - 1;
- new_idx = old_idx + weight;
-
- if (old_idx >= 0)
- tsk_pinned[old_idx]--;
- if (new_idx >= 0)
- tsk_pinned[new_idx]++;
+ return pinned_slots;
}
/*
* Add/remove the given breakpoint in our constraint table
*/
-static void
-toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
- int weight)
+static int
+toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight)
{
- const struct cpumask *cpumask = cpumask_of_bp(bp);
- int cpu;
+ int cpu, next_tsk_pinned;
if (!enable)
weight = -weight;
- /* Pinned counter cpu profiling */
if (!bp->hw.target) {
- get_bp_info(bp->cpu, type)->cpu_pinned += weight;
- return;
+ /*
+ * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the
+ * global histogram.
+ */
+ struct bp_cpuinfo *info = get_bp_info(bp->cpu, type);
+
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight);
+ info->cpu_pinned += weight;
+ return 0;
+ }
+
+ /*
+ * If bp->hw.target, tsk_pinned is only modified, but not used
+ * otherwise. We can permit concurrent updates as long as there are no
+ * other uses: having acquired bp_cpuinfo_sem as a reader allows
+ * concurrent updates here. Uses of tsk_pinned will require acquiring
+ * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value.
+ */
+ lockdep_assert_held_read(&bp_cpuinfo_sem);
+
+ /*
+ * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global
+ * histogram. We need to take care of 4 cases:
+ *
+ * 1. This breakpoint targets all CPUs (cpu < 0), and there may only
+ * exist other task breakpoints targeting all CPUs. In this case we
+ * can simply update the global slots histogram.
+ *
+ * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may
+ * only exist other task breakpoints targeting all CPUs.
+ *
+ * a. On enable: remove the existing breakpoints from the global
+ * slots histogram and use the per-CPU histogram.
+ *
+ * b. On disable: re-insert the existing breakpoints into the global
+ * slots histogram and remove from per-CPU histogram.
+ *
+ * 3. Some other existing task breakpoints target specific CPUs. Only
+ * update the per-CPU slots histogram.
+ */
+
+ if (!enable) {
+ /*
+ * Remove before updating histograms so we can determine if this
+ * was the last task breakpoint for a specific CPU.
+ */
+ int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
+
+ if (ret)
+ return ret;
+ }
+ /*
+ * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint.
+ */
+ next_tsk_pinned = task_bp_pinned(-1, bp, type);
+
+ if (next_tsk_pinned >= 0) {
+ if (bp->cpu < 0) { /* Case 1: fast path */
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight);
+ } else if (enable) { /* Case 2.a: slow path */
+ /* Add existing to per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ 0, next_tsk_pinned);
+ }
+ /* Add this first CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned,
+ -next_tsk_pinned);
+ } else { /* Case 2.b: slow path */
+ /* Remove this last CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned + hw_breakpoint_weight(bp), weight);
+ /* Remove all from per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, -next_tsk_pinned);
+ }
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned);
+ }
+ } else { /* Case 3: slow path */
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+
+ for_each_cpu(cpu, cpumask) {
+ next_tsk_pinned = task_bp_pinned(cpu, bp, type);
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ }
}
- /* Pinned counter task profiling */
- for_each_cpu(cpu, cpumask)
- toggle_bp_task_slot(bp, cpu, type, weight);
+ /*
+ * Readers want a stable snapshot of the per-task breakpoint list.
+ */
+ assert_bp_constraints_lock_held(bp);
if (enable)
- list_add_tail(&bp->hw.bp_list, &bp_task_head);
- else
- list_del(&bp->hw.bp_list);
+ return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
+
+ return 0;
}
__weak int arch_reserve_bp_slot(struct perf_event *bp)
@@ -234,7 +544,12 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
}
/*
- * Constraints to check before allowing this new breakpoint counter:
+ * Constraints to check before allowing this new breakpoint counter.
+ *
+ * Note: Flexible breakpoints are currently unimplemented, but outlined in the
+ * below algorithm for completeness. The implementation treats flexible as
+ * pinned due to no guarantee that we currently always schedule flexible events
+ * before a pinned event in a same CPU.
*
* == Non-pinned counter == (Considered as pinned for now)
*
@@ -276,8 +591,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*/
static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
{
- struct bp_busy_slots slots = {0};
enum bp_type_idx type;
+ int max_pinned_slots;
int weight;
int ret;
@@ -293,36 +608,24 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- fetch_bp_busy_slots(&slots, bp, type);
- /*
- * Simulate the addition of this breakpoint to the constraints
- * and see the result.
- */
- fetch_this_slot(&slots, weight);
-
- /* Flexible counters need to keep at least one slot */
- if (slots.pinned + (!!slots.flexible) > nr_slots[type])
+ /* Check if this new breakpoint can be satisfied across all CPUs. */
+ max_pinned_slots = max_bp_pinned_slots(bp, type) + weight;
+ if (max_pinned_slots > hw_breakpoint_slots_cached(type))
return -ENOSPC;
ret = arch_reserve_bp_slot(bp);
if (ret)
return ret;
- toggle_bp_slot(bp, true, type, weight);
-
- return 0;
+ return toggle_bp_slot(bp, true, type, weight);
}
int reserve_bp_slot(struct perf_event *bp)
{
- int ret;
-
- mutex_lock(&nr_bp_mutex);
-
- ret = __reserve_bp_slot(bp, bp->attr.bp_type);
-
- mutex_unlock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ bp_constraints_unlock(mtx);
return ret;
}
@@ -335,17 +638,16 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- toggle_bp_slot(bp, false, type, weight);
+ WARN_ON(toggle_bp_slot(bp, false, type, weight));
}
void release_bp_slot(struct perf_event *bp)
{
- mutex_lock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
arch_unregister_hw_breakpoint(bp);
__release_bp_slot(bp, bp->attr.bp_type);
-
- mutex_unlock(&nr_bp_mutex);
+ bp_constraints_unlock(mtx);
}
static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
@@ -372,11 +674,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
{
- int ret;
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __modify_bp_slot(bp, old_type, new_type);
- mutex_lock(&nr_bp_mutex);
- ret = __modify_bp_slot(bp, old_type, new_type);
- mutex_unlock(&nr_bp_mutex);
+ bp_constraints_unlock(mtx);
return ret;
}
@@ -387,18 +688,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
*/
int dbg_reserve_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ int ret;
+
+ if (bp_constraints_is_locked(bp))
return -1;
- return __reserve_bp_slot(bp, bp->attr.bp_type);
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
+ ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
+
+ return ret;
}
int dbg_release_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ if (bp_constraints_is_locked(bp))
return -1;
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
__release_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
return 0;
}
@@ -604,6 +915,50 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
}
EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+/**
+ * hw_breakpoint_is_used - check if breakpoints are currently used
+ *
+ * Returns: true if breakpoints are used, false otherwise.
+ */
+bool hw_breakpoint_is_used(void)
+{
+ int cpu;
+
+ if (!constraints_initialized)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, type);
+
+ if (info->cpu_pinned)
+ return true;
+
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ if (atomic_read(&info->tsk_pinned.count[slot]))
+ return true;
+ }
+ }
+ }
+
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ /*
+ * Warn, because if there are CPU pinned counters,
+ * should never get here; bp_cpuinfo::cpu_pinned should
+ * be consistent with the global cpu_pinned histogram.
+ */
+ if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot])))
+ return true;
+
+ if (atomic_read(&tsk_pinned_all[type].count[slot]))
+ return true;
+ }
+ }
+
+ return false;
+}
+
static struct notifier_block hw_breakpoint_exceptions_nb = {
.notifier_call = hw_breakpoint_exceptions_notify,
/* we need to be notified first */
@@ -678,38 +1033,19 @@ static struct pmu perf_breakpoint = {
int __init init_hw_breakpoint(void)
{
- int cpu, err_cpu;
- int i;
-
- for (i = 0; i < TYPE_MAX; i++)
- nr_slots[i] = hw_breakpoint_slots(i);
+ int ret;
- for_each_possible_cpu(cpu) {
- for (i = 0; i < TYPE_MAX; i++) {
- struct bp_cpuinfo *info = get_bp_info(cpu, i);
+ ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
+ if (ret)
+ return ret;
- info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
- GFP_KERNEL);
- if (!info->tsk_pinned)
- goto err_alloc;
- }
- }
+ ret = init_breakpoint_slots();
+ if (ret)
+ return ret;
- constraints_initialized = 1;
+ constraints_initialized = true;
perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
return register_die_notifier(&hw_breakpoint_exceptions_nb);
-
- err_alloc:
- for_each_possible_cpu(err_cpu) {
- for (i = 0; i < TYPE_MAX; i++)
- kfree(get_bp_info(err_cpu, i)->tsk_pinned);
- if (err_cpu == cpu)
- break;
- }
-
- return -ENOMEM;
}
-
-
diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c
new file mode 100644
index 000000000000..c57610f52bb4
--- /dev/null
+++ b/kernel/events/hw_breakpoint_test.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test for hw_breakpoint constraints accounting logic.
+ *
+ * Copyright (C) 2022, Google LLC.
+ */
+
+#include <kunit/test.h>
+#include <linux/cpumask.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/kthread.h>
+#include <linux/perf_event.h>
+#include <asm/hw_breakpoint.h>
+
+#define TEST_REQUIRES_BP_SLOTS(test, slots) \
+ do { \
+ if ((slots) > get_test_bp_slots()) { \
+ kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \
+ get_test_bp_slots()); \
+ } \
+ } while (0)
+
+#define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr))
+
+#define MAX_TEST_BREAKPOINTS 512
+
+static char break_vars[MAX_TEST_BREAKPOINTS];
+static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS];
+static struct task_struct *__other_task;
+
+static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx)
+{
+ struct perf_event_attr attr = {};
+
+ if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS))
+ return NULL;
+
+ hw_breakpoint_init(&attr);
+ attr.bp_addr = (unsigned long)&break_vars[idx];
+ attr.bp_len = HW_BREAKPOINT_LEN_1;
+ attr.bp_type = HW_BREAKPOINT_RW;
+ return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL);
+}
+
+static void unregister_test_bp(struct perf_event **bp)
+{
+ if (WARN_ON(IS_ERR(*bp)))
+ return;
+ if (WARN_ON(!*bp))
+ return;
+ unregister_hw_breakpoint(*bp);
+ *bp = NULL;
+}
+
+static int get_test_bp_slots(void)
+{
+ static int slots;
+
+ if (!slots)
+ slots = hw_breakpoint_slots(TYPE_DATA);
+
+ return slots;
+}
+
+static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk)
+{
+ struct perf_event *bp = register_test_bp(cpu, tsk, *id);
+
+ KUNIT_ASSERT_NOT_NULL(test, bp);
+ KUNIT_ASSERT_FALSE(test, IS_ERR(bp));
+ KUNIT_ASSERT_NULL(test, test_bps[*id]);
+ test_bps[(*id)++] = bp;
+}
+
+/*
+ * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free.
+ *
+ * Returns true if this can be called again, continuing at @id.
+ */
+static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip)
+{
+ for (int i = 0; i < get_test_bp_slots() - skip; ++i)
+ fill_one_bp_slot(test, id, cpu, tsk);
+
+ return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS;
+}
+
+static int dummy_kthread(void *arg)
+{
+ return 0;
+}
+
+static struct task_struct *get_other_task(struct kunit *test)
+{
+ struct task_struct *tsk;
+
+ if (__other_task)
+ return __other_task;
+
+ tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task");
+ KUNIT_ASSERT_FALSE(test, IS_ERR(tsk));
+ __other_task = tsk;
+ return __other_task;
+}
+
+static int get_test_cpu(int num)
+{
+ int cpu;
+
+ WARN_ON(num < 0);
+
+ for_each_online_cpu(cpu) {
+ if (num-- <= 0)
+ break;
+ }
+
+ return cpu;
+}
+
+/* ===== Test cases ===== */
+
+static void test_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_many_cpus(struct kunit *test)
+{
+ int idx = 0;
+ int cpu;
+
+ /* Test that CPUs are independent. */
+ for_each_online_cpu(cpu) {
+ bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx));
+ if (!do_continue)
+ break;
+ }
+}
+
+static void test_one_task_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, -1, current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one and adding back CPU-target should work. */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_two_tasks_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ /* Test that tasks are independent. */
+ fill_bp_slots(test, &idx, -1, current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one from first task and adding back CPU-target should not work. */
+ unregister_test_bp(&test_bps[0]);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_one_task_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /*
+ * Remove one and adding back CPU-target should work; this case is
+ * special vs. above because the task's constraints are CPU-dependent.
+ */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_one_task_mixed(struct kunit *test)
+{
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_bp_slots(test, &idx, -1, current, 1);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* Transition from CPU-dependent pinned count to CPU-independent. */
+ unregister_test_bp(&test_bps[0]);
+ unregister_test_bp(&test_bps[1]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_two_tasks_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Can still create breakpoints on some other CPU. */
+ fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0);
+}
+
+static void test_two_tasks_on_one_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Cannot create breakpoints on some other CPU either. */
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static void test_task_on_all_and_one_cpu(struct kunit *test)
+{
+ int tsk_on_cpu_idx, cpu_idx;
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_bp_slots(test, &idx, -1, current, 2);
+ /* Transitioning from only all CPU breakpoints to mixed. */
+ tsk_on_cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* We should still be able to use up another CPU's slots. */
+ cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+
+ /* Transitioning back to task target on all CPUs. */
+ unregister_test_bp(&test_bps[tsk_on_cpu_idx]);
+ /* Still have a CPU target breakpoint in get_test_cpu(1). */
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ /* Remove it and try again. */
+ unregister_test_bp(&test_bps[cpu_idx]);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static struct kunit_case hw_breakpoint_test_cases[] = {
+ KUNIT_CASE(test_one_cpu),
+ KUNIT_CASE(test_many_cpus),
+ KUNIT_CASE(test_one_task_on_all_cpus),
+ KUNIT_CASE(test_two_tasks_on_all_cpus),
+ KUNIT_CASE(test_one_task_on_one_cpu),
+ KUNIT_CASE(test_one_task_mixed),
+ KUNIT_CASE(test_two_tasks_on_one_cpu),
+ KUNIT_CASE(test_two_tasks_on_one_all_cpus),
+ KUNIT_CASE(test_task_on_all_and_one_cpu),
+ {},
+};
+
+static int test_init(struct kunit *test)
+{
+ /* Most test cases want 2 distinct CPUs. */
+ if (num_online_cpus() < 2)
+ kunit_skip(test, "not enough cpus");
+
+ /* Want the system to not use breakpoints elsewhere. */
+ if (hw_breakpoint_is_used())
+ kunit_skip(test, "hw breakpoint already in use");
+
+ return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+ for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) {
+ if (test_bps[i])
+ unregister_test_bp(&test_bps[i]);
+ }
+
+ if (__other_task) {
+ kthread_stop(__other_task);
+ __other_task = NULL;
+ }
+
+ /* Verify that internal state agrees that no breakpoints are in use. */
+ KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used());
+}
+
+static struct kunit_suite hw_breakpoint_test_suite = {
+ .name = "hw_breakpoint",
+ .test_cases = hw_breakpoint_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+};
+
+kunit_test_suites(&hw_breakpoint_test_suite);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 726132039c38..273a0fe7910a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
atomic_set(&handle->rb->poll, EPOLLIN);
handle->event->pending_wakeup = 1;
- irq_work_queue(&handle->event->pending);
+ irq_work_queue(&handle->event->pending_irq);
}
/*
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2eaa327f8158..d9e357b7e17c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -19,7 +19,7 @@
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
-#include <linux/swap.h> /* try_to_free_swap */
+#include <linux/swap.h> /* folio_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
#include "../../mm/internal.h" /* munlock_vma_page */
@@ -154,8 +154,10 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
+ struct folio *old_folio = page_folio(old_page);
+ struct folio *new_folio;
struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0);
+ DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
int err;
struct mmu_notifier_range range;
@@ -163,14 +165,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
addr + PAGE_SIZE);
if (new_page) {
- err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm,
- GFP_KERNEL);
+ new_folio = page_folio(new_page);
+ err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
if (err)
return err;
}
- /* For try_to_free_swap() below */
- lock_page(old_page);
+ /* For folio_free_swap() below */
+ folio_lock(old_folio);
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
@@ -179,14 +181,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
if (new_page) {
- get_page(new_page);
+ folio_get(new_folio);
page_add_new_anon_rmap(new_page, vma, addr);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ folio_add_lru_vma(new_folio, vma);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
- if (!PageAnon(old_page)) {
+ if (!folio_test_anon(old_folio)) {
dec_mm_counter(mm, mm_counter_file(old_page));
inc_mm_counter(mm, MM_ANONPAGES);
}
@@ -198,15 +200,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mk_pte(new_page, vma->vm_page_prot));
page_remove_rmap(old_page, vma, false);
- if (!page_mapped(old_page))
- try_to_free_swap(old_page);
+ if (!folio_mapped(old_folio))
+ folio_free_swap(old_folio);
page_vma_mapped_walk_done(&pvmw);
- put_page(old_page);
+ folio_put(old_folio);
err = 0;
unlock:
mmu_notifier_invalidate_range_end(&range);
- unlock_page(old_page);
+ folio_unlock(old_folio);
return err;
}
@@ -349,9 +351,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe,
static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *tmp;
- for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
+ for_each_vma(vmi, tmp)
if (valid_ref_ctr_vma(uprobe, tmp))
return tmp;
@@ -552,7 +555,7 @@ put_old:
/* try collapse pmd for compound page */
if (!ret && orig_page_huge)
- collapse_pte_mapped_thp(mm, vaddr);
+ collapse_pte_mapped_thp(mm, vaddr, false);
return ret;
}
@@ -1231,11 +1234,12 @@ int uprobe_apply(struct inode *inode, loff_t offset,
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
int err = 0;
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -1983,9 +1987,10 @@ bool uprobe_deny_signal(void)
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!valid_vma(vma, false))
continue;
/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 84021b24f79e..15dc2ec80c46 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -60,17 +60,65 @@
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
+#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
#include <linux/rethook.h>
+#include <linux/sysfs.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/mmu_context.h>
+/*
+ * The default value should be high enough to not crash a system that randomly
+ * crashes its kernel from time to time, but low enough to at least not permit
+ * overflowing 32-bit refcounts or the ldsem writer count.
+ */
+static unsigned int oops_limit = 10000;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_exit_table[] = {
+ {
+ .procname = "oops_limit",
+ .data = &oops_limit,
+ .maxlen = sizeof(oops_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ { }
+};
+
+static __init int kernel_exit_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_exit_table);
+ return 0;
+}
+late_initcall(kernel_exit_sysctls_init);
+#endif
+
+static atomic_t oops_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
+}
+
+static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);
+
+static __init int kernel_exit_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_exit_sysfs_init);
+#endif
+
static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
@@ -183,6 +231,10 @@ void put_task_struct_rcu_user(struct task_struct *task)
call_rcu(&task->rcu, delayed_put_task_struct);
}
+void __weak release_thread(struct task_struct *dead_task)
+{
+}
+
void release_task(struct task_struct *p)
{
struct task_struct *leader;
@@ -374,10 +426,10 @@ static void coredump_task_exit(struct task_struct *tsk)
complete(&core_state->startup);
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
if (!self.task) /* see coredump_finish() */
break;
- freezable_schedule();
+ schedule();
}
__set_current_state(TASK_RUNNING);
}
@@ -466,6 +518,7 @@ assign_new_owner:
goto retry;
}
WRITE_ONCE(mm->owner, c);
+ lru_gen_migrate_mm(mm);
task_unlock(c);
put_task_struct(c);
}
@@ -733,14 +786,33 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
+static void synchronize_group_exit(struct task_struct *tsk, long code)
+{
+ struct sighand_struct *sighand = tsk->sighand;
+ struct signal_struct *signal = tsk->signal;
+
+ spin_lock_irq(&sighand->siglock);
+ signal->quick_threads--;
+ if ((signal->quick_threads == 0) &&
+ !(signal->flags & SIGNAL_GROUP_EXIT)) {
+ signal->flags = SIGNAL_GROUP_EXIT;
+ signal->group_exit_code = code;
+ signal->group_stop_count = 0;
+ }
+ spin_unlock_irq(&sighand->siglock);
+}
+
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
+ synchronize_group_exit(tsk, code);
+
WARN_ON(tsk->plug);
kcov_task_exit(tsk);
+ kmsan_task_exit(tsk);
coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
@@ -859,6 +931,7 @@ void __noreturn make_task_dead(int signr)
* Then do everything else.
*/
struct task_struct *tsk = current;
+ unsigned int limit;
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
@@ -873,6 +946,20 @@ void __noreturn make_task_dead(int signr)
}
/*
+ * Every time the system oopses, if the oops happens while a reference
+ * to an object was held, the reference leaks.
+ * If the oops doesn't also leak memory, repeated oopsing can cause
+ * reference counters to wrap around (if they're not using refcount_t).
+ * This means that repeated oopsing can make unexploitable-looking bugs
+ * exploitable through repeated oopsing.
+ * To make sure this can't happen, place an upper bound on how often the
+ * kernel may oops without panic().
+ */
+ limit = READ_ONCE(oops_limit);
+ if (atomic_inc_return(&oops_count) >= limit && limit)
+ panic("Oopsed too often (kernel.oops_limit is %d)", limit);
+
+ /*
* We're taking recursive faults here in make_task_dead. Safest is to just
* leave this task alone and wait for reboot.
*/
@@ -905,7 +992,7 @@ do_group_exit(int exit_code)
exit_code = sig->group_exit_code;
else if (sig->group_exec_task)
exit_code = 0;
- else if (!thread_group_empty(current)) {
+ else {
struct sighand_struct *const sighand = current->sighand;
spin_lock_irq(&sighand->siglock);
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 60dc825ecc2b..a7ccd2930c5f 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -247,15 +247,11 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
/* cut off if it is too long */
if (count > KSYM_NAME_LEN)
count = KSYM_NAME_LEN;
- buf = kmalloc(count + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- if (copy_from_user(buf, buffer, count)) {
- ret = -EFAULT;
- goto out_free;
- }
- buf[count] = '\0';
+ buf = memdup_user_nul(buffer, count);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
sym = strstrip(buf);
mutex_lock(&fei_lock);
@@ -298,17 +294,15 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
}
ret = register_kprobe(&attr->kp);
- if (!ret)
- fei_debugfs_add_attr(attr);
- if (ret < 0)
- fei_attr_remove(attr);
- else {
- list_add_tail(&attr->list, &fei_attr_list);
- ret = count;
+ if (ret) {
+ fei_attr_free(attr);
+ goto out;
}
+ fei_debugfs_add_attr(attr);
+ list_add_tail(&attr->list, &fei_attr_list);
+ ret = count;
out:
mutex_unlock(&fei_lock);
-out_free:
kfree(buf);
return ret;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 8a9e92068b15..9f7fe3541897 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,13 +37,13 @@
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
+#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
-#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -75,7 +75,6 @@
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
-#include <linux/random.h>
#include <linux/tty.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
@@ -97,7 +96,7 @@
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
-#include <linux/sched/mm.h>
+#include <linux/stackprotector.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -475,7 +474,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*/
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
- new->vm_next = new->vm_prev = NULL;
dup_anon_vma_name(orig, new);
}
return new;
@@ -537,6 +535,9 @@ void put_task_stack(struct task_struct *tsk)
void free_task(struct task_struct *tsk)
{
+#ifdef CONFIG_SECCOMP
+ WARN_ON_ONCE(tsk->seccomp.filter);
+#endif
release_user_cpus_ptr(tsk);
scs_release(tsk);
@@ -580,11 +581,12 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
{
- struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
- struct rb_node **rb_link, *rb_parent;
+ struct vm_area_struct *mpnt, *tmp;
int retval;
- unsigned long charge;
+ unsigned long charge = 0;
LIST_HEAD(uf);
+ MA_STATE(old_mas, &oldmm->mm_mt, 0, 0);
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
uprobe_start_dup_mmap();
if (mmap_write_lock_killable(oldmm)) {
@@ -606,16 +608,16 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
- rb_link = &mm->mm_rb.rb_node;
- rb_parent = NULL;
- pprev = &mm->mmap;
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
khugepaged_fork(mm, oldmm);
- prev = NULL;
- for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+ retval = mas_expected_entries(&mas, oldmm->map_count);
+ if (retval)
+ goto out;
+
+ mas_for_each(&old_mas, mpnt, ULONG_MAX) {
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -629,7 +631,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
*/
if (fatal_signal_pending(current)) {
retval = -EINTR;
- goto out;
+ goto loop_out;
}
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned long len = vma_pages(mpnt);
@@ -675,24 +677,17 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
}
/*
- * Clear hugetlb-related page reserves for children. This only
- * affects MAP_PRIVATE mappings. Faults generated by the child
- * are not guaranteed to succeed, even if read-only
+ * Copy/update hugetlb private vma information.
*/
if (is_vm_hugetlb_page(tmp))
- reset_vma_resv_huge_pages(tmp);
+ hugetlb_dup_vma_private(tmp);
- /*
- * Link in the new vma and copy the page table entries.
- */
- *pprev = tmp;
- pprev = &tmp->vm_next;
- tmp->vm_prev = prev;
- prev = tmp;
-
- __vma_link_rb(mm, tmp, rb_link, rb_parent);
- rb_link = &tmp->vm_rb.rb_right;
- rb_parent = &tmp->vm_rb;
+ /* Link the vma into the MT */
+ mas.index = tmp->vm_start;
+ mas.last = tmp->vm_end - 1;
+ mas_store(&mas, tmp);
+ if (mas_is_err(&mas))
+ goto fail_nomem_mas_store;
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -702,10 +697,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->vm_ops->open(tmp);
if (retval)
- goto out;
+ goto loop_out;
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+ mas_destroy(&mas);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -714,6 +711,9 @@ out:
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
+
+fail_nomem_mas_store:
+ unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
@@ -721,7 +721,7 @@ fail_nomem_policy:
fail_nomem:
retval = -ENOMEM;
vm_unacct_memory(charge);
- goto out;
+ goto loop_out;
}
static inline int mm_alloc_pgd(struct mm_struct *mm)
@@ -756,8 +756,13 @@ static void check_mm(struct mm_struct *mm)
"Please make sure 'struct resident_page_types[]' is updated as well");
for (i = 0; i < NR_MM_COUNTERS; i++) {
- long x = atomic_long_read(&mm->rss_stat.count[i]);
+ long x = percpu_counter_sum(&mm->rss_stat[i]);
+ if (likely(!x))
+ continue;
+
+ /* Making sure this is not due to race with CPU offlining. */
+ x = percpu_counter_sum_all(&mm->rss_stat[i]);
if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
mm, resident_page_types[i], x);
@@ -782,6 +787,8 @@ static void check_mm(struct mm_struct *mm)
*/
void __mmdrop(struct mm_struct *mm)
{
+ int i;
+
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
WARN_ON_ONCE(mm == current->active_mm);
@@ -791,6 +798,9 @@ void __mmdrop(struct mm_struct *mm)
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
+
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ percpu_counter_destroy(&mm->rss_stat[i]);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -925,13 +935,13 @@ void __init fork_init(void)
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
- for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
+ for (i = 0; i < UCOUNT_COUNTS; i++)
init_user_ns.ucount_max[i] = max_threads/2;
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
#ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
@@ -1026,6 +1036,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->worker_private = NULL;
kcov_task_init(tsk);
+ kmsan_task_create(tsk);
kmap_local_fork(tsk);
#ifdef CONFIG_FAULT_INJECTION
@@ -1109,9 +1120,10 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
- mm->mmap = NULL;
- mm->mm_rb = RB_ROOT;
- mm->vmacache_seqnum = 0;
+ int i;
+
+ mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
+ mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
seqcount_init(&mm->write_protect_seq);
@@ -1151,9 +1163,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
+ goto fail_pcpu;
+
mm->user_ns = get_user_ns(user_ns);
+ lru_gen_init_mm(mm);
return mm;
+fail_pcpu:
+ while (i > 0)
+ percpu_counter_destroy(&mm->rss_stat[--i]);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
@@ -1194,6 +1214,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ lru_gen_del_mm(mm);
mmdrop(mm);
}
@@ -1285,13 +1306,16 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
/* Forbid mm->exe_file change if old file still mapped. */
old_exe_file = get_mm_exe_file(mm);
if (old_exe_file) {
+ VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
- for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
if (path_equal(&vma->vm_file->f_path,
- &old_exe_file->f_path))
+ &old_exe_file->f_path)) {
ret = -EBUSY;
+ break;
+ }
}
mmap_read_unlock(mm);
fput(old_exe_file);
@@ -1421,13 +1445,12 @@ static void complete_vfork_done(struct task_struct *tsk)
static int wait_for_vfork_done(struct task_struct *child,
struct completion *vfork)
{
+ unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
int killed;
- freezer_do_not_count();
cgroup_enter_frozen();
- killed = wait_for_completion_killable(vfork);
+ killed = wait_for_completion_state(vfork, state);
cgroup_leave_frozen(false);
- freezer_count();
if (killed) {
task_lock(child);
@@ -1567,9 +1590,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;
- /* initialize the new vmacache entries */
- vmacache_flush(tsk);
-
if (clone_flags & CLONE_VM) {
mmget(oldmm);
mm = oldmm;
@@ -1693,6 +1713,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
return -ENOMEM;
sig->nr_threads = 1;
+ sig->quick_threads = 1;
atomic_set(&sig->live, 1);
refcount_set(&sig->sigcnt, 1);
@@ -2044,18 +2065,6 @@ static __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
- /*
- * If the new process will be in a different time namespace
- * do not allow it to share VM or a thread group with the forking task.
- *
- * On vfork, the child process enters the target time namespace only
- * after exec.
- */
- if ((clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) {
- if (nsp->time_ns != nsp->time_ns_for_children)
- return ERR_PTR(-EINVAL);
- }
-
if (clone_flags & CLONE_PIDFD) {
/*
* - CLONE_DETACHED is blocked so that we can potentially
@@ -2119,7 +2128,7 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_free;
retval = -EAGAIN;
- if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
+ if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_cleanup_count;
@@ -2410,12 +2419,6 @@ static __latent_entropy struct task_struct *copy_process(
spin_lock(&current->sighand->siglock);
- /*
- * Copy seccomp details explicitly here, in case they were changed
- * before holding sighand lock.
- */
- copy_seccomp(p);
-
rv_task_fork(p);
rseq_fork(p, clone_flags);
@@ -2432,6 +2435,14 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cancel_cgroup;
}
+ /* No more failure paths after this point. */
+
+ /*
+ * Copy seccomp details explicitly here, in case they were changed
+ * before holding sighand lock.
+ */
+ copy_seccomp(p);
+
init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -2463,6 +2474,7 @@ static __latent_entropy struct task_struct *copy_process(
__this_cpu_inc(process_counts);
} else {
current->signal->nr_threads++;
+ current->signal->quick_threads++;
atomic_inc(&current->signal->live);
refcount_inc(&current->signal->sigcnt);
task_join_group_stop(p);
@@ -2595,11 +2607,6 @@ struct task_struct * __init fork_idle(int cpu)
return task;
}
-struct mm_struct *copy_init_mm(void)
-{
- return dup_mm(NULL, &init_mm);
-}
-
/*
* This is like kernel_clone(), but shaved down and tailored to just
* creating io_uring workers. It returns a created task, or an error pointer.
@@ -2695,6 +2702,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
get_task_struct(p);
}
+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ /* lock the task to synchronize with memcg migration */
+ task_lock(p);
+ lru_gen_add_mm(p->mm);
+ task_unlock(p);
+ }
+
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
@@ -3011,10 +3025,27 @@ static void sighand_ctor(void *data)
init_waitqueue_head(&sighand->signalfd_wqh);
}
-void __init proc_caches_init(void)
+void __init mm_cache_init(void)
{
unsigned int mm_size;
+ /*
+ * The mm_cpumask is located at the end of mm_struct, and is
+ * dynamically sized based on the maximum CPU number this system
+ * can have, taking hotplug into account (nr_cpu_ids).
+ */
+ mm_size = sizeof(struct mm_struct) + cpumask_size();
+
+ mm_cachep = kmem_cache_create_usercopy("mm_struct",
+ mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ offsetof(struct mm_struct, saved_auxv),
+ sizeof_field(struct mm_struct, saved_auxv),
+ NULL);
+}
+
+void __init proc_caches_init(void)
+{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -3032,19 +3063,6 @@ void __init proc_caches_init(void)
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
- /*
- * The mm_cpumask is located at the end of mm_struct, and is
- * dynamically sized based on the maximum CPU number this system
- * can have, taking hotplug into account (nr_cpu_ids).
- */
- mm_size = sizeof(struct mm_struct) + cpumask_size();
-
- mm_cachep = kmem_cache_create_usercopy("mm_struct",
- mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
- offsetof(struct mm_struct, saved_auxv),
- sizeof_field(struct mm_struct, saved_auxv),
- NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 45ab36ffd0e7..4fad0e6fca64 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -13,10 +13,11 @@
#include <linux/kthread.h>
/* total number of freezing conditions in effect */
-atomic_t system_freezing_cnt = ATOMIC_INIT(0);
-EXPORT_SYMBOL(system_freezing_cnt);
+DEFINE_STATIC_KEY_FALSE(freezer_active);
+EXPORT_SYMBOL(freezer_active);
-/* indicate whether PM freezing is in effect, protected by
+/*
+ * indicate whether PM freezing is in effect, protected by
* system_transition_mutex
*/
bool pm_freezing;
@@ -29,7 +30,7 @@ static DEFINE_SPINLOCK(freezer_lock);
* freezing_slow_path - slow path for testing whether a task needs to be frozen
* @p: task to be tested
*
- * This function is called by freezing() if system_freezing_cnt isn't zero
+ * This function is called by freezing() if freezer_active isn't zero
* and tests whether @p needs to enter and stay in frozen state. Can be
* called under any context. The freezers are responsible for ensuring the
* target tasks see the updated state.
@@ -52,41 +53,40 @@ bool freezing_slow_path(struct task_struct *p)
}
EXPORT_SYMBOL(freezing_slow_path);
+bool frozen(struct task_struct *p)
+{
+ return READ_ONCE(p->__state) & TASK_FROZEN;
+}
+
/* Refrigerator is place where frozen processes are stored :-). */
bool __refrigerator(bool check_kthr_stop)
{
- /* Hmm, should we be allowed to suspend when there are realtime
- processes around? */
+ unsigned int state = get_current_state();
bool was_frozen = false;
- unsigned int save = get_current_state();
pr_debug("%s entered refrigerator\n", current->comm);
+ WARN_ON_ONCE(state && !(state & TASK_NORMAL));
+
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ bool freeze;
+
+ set_current_state(TASK_FROZEN);
spin_lock_irq(&freezer_lock);
- current->flags |= PF_FROZEN;
- if (!freezing(current) ||
- (check_kthr_stop && kthread_should_stop()))
- current->flags &= ~PF_FROZEN;
+ freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop());
spin_unlock_irq(&freezer_lock);
- if (!(current->flags & PF_FROZEN))
+ if (!freeze)
break;
+
was_frozen = true;
schedule();
}
+ __set_current_state(TASK_RUNNING);
pr_debug("%s left refrigerator\n", current->comm);
- /*
- * Restore saved task state before returning. The mb'd version
- * needs to be used; otherwise, it might silently break
- * synchronization which depends on ordered task state change.
- */
- set_current_state(save);
-
return was_frozen;
}
EXPORT_SYMBOL(__refrigerator);
@@ -101,6 +101,44 @@ static void fake_signal_wake_up(struct task_struct *p)
}
}
+static int __set_task_frozen(struct task_struct *p, void *arg)
+{
+ unsigned int state = READ_ONCE(p->__state);
+
+ if (p->on_rq)
+ return 0;
+
+ if (p != current && task_curr(p))
+ return 0;
+
+ if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED)))
+ return 0;
+
+ /*
+ * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they
+ * can suffer spurious wakeups.
+ */
+ if (state & TASK_FREEZABLE)
+ WARN_ON_ONCE(!(state & TASK_NORMAL));
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * It's dangerous to freeze with locks held; there be dragons there.
+ */
+ if (!(state & __TASK_FREEZABLE_UNSAFE))
+ WARN_ON_ONCE(debug_locks && p->lockdep_depth);
+#endif
+
+ WRITE_ONCE(p->__state, TASK_FROZEN);
+ return TASK_FROZEN;
+}
+
+static bool __freeze_task(struct task_struct *p)
+{
+ /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */
+ return task_call_func(p, __set_task_frozen, NULL);
+}
+
/**
* freeze_task - send a freeze request to given task
* @p: task to send the request to
@@ -116,20 +154,8 @@ bool freeze_task(struct task_struct *p)
{
unsigned long flags;
- /*
- * This check can race with freezer_do_not_count, but worst case that
- * will result in an extra wakeup being sent to the task. It does not
- * race with freezer_count(), the barriers in freezer_count() and
- * freezer_should_skip() ensure that either freezer_count() sees
- * freezing == true in try_to_freeze() and freezes, or
- * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
- * normally.
- */
- if (freezer_should_skip(p))
- return false;
-
spin_lock_irqsave(&freezer_lock, flags);
- if (!freezing(p) || frozen(p)) {
+ if (!freezing(p) || frozen(p) || __freeze_task(p)) {
spin_unlock_irqrestore(&freezer_lock, flags);
return false;
}
@@ -137,19 +163,52 @@ bool freeze_task(struct task_struct *p)
if (!(p->flags & PF_KTHREAD))
fake_signal_wake_up(p);
else
- wake_up_state(p, TASK_INTERRUPTIBLE);
+ wake_up_state(p, TASK_NORMAL);
spin_unlock_irqrestore(&freezer_lock, flags);
return true;
}
+/*
+ * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical
+ * state in p->jobctl. If either of them got a wakeup that was missed because
+ * TASK_FROZEN, then their canonical state reflects that and the below will
+ * refuse to restore the special state and instead issue the wakeup.
+ */
+static int __set_task_special(struct task_struct *p, void *arg)
+{
+ unsigned int state = 0;
+
+ if (p->jobctl & JOBCTL_TRACED)
+ state = TASK_TRACED;
+
+ else if (p->jobctl & JOBCTL_STOPPED)
+ state = TASK_STOPPED;
+
+ if (state)
+ WRITE_ONCE(p->__state, state);
+
+ return state;
+}
+
void __thaw_task(struct task_struct *p)
{
- unsigned long flags;
+ unsigned long flags, flags2;
spin_lock_irqsave(&freezer_lock, flags);
- if (frozen(p))
- wake_up_process(p);
+ if (WARN_ON_ONCE(freezing(p)))
+ goto unlock;
+
+ if (lock_task_sighand(p, &flags2)) {
+ /* TASK_FROZEN -> TASK_{STOPPED,TRACED} */
+ bool ret = task_call_func(p, __set_task_special, NULL);
+ unlock_task_sighand(p, &flags2);
+ if (ret)
+ goto unlock;
+ }
+
+ wake_up_state(p, TASK_FROZEN);
+unlock:
spin_unlock_irqrestore(&freezer_lock, flags);
}
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index b22ef1efe751..514e4582b863 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -638,6 +638,7 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
bool pi, bool pending_op)
{
u32 uval, nval, mval;
+ pid_t owner;
int err;
/* Futex address must be 32bit aligned */
@@ -659,6 +660,10 @@ retry:
* 2. A woken up waiter is killed before it can acquire the
* futex in user space.
*
+ * In the second case, the wake up notification could be generated
+ * by the unlock path in user space after setting the futex value
+ * to zero or by the kernel after setting the OWNER_DIED bit below.
+ *
* In both cases the TID validation below prevents a wakeup of
* potential waiters which can cause these waiters to block
* forever.
@@ -667,24 +672,27 @@ retry:
*
* 1) task->robust_list->list_op_pending != NULL
* @pending_op == true
- * 2) User space futex value == 0
+ * 2) The owner part of user space futex value == 0
* 3) Regular futex: @pi == false
*
* If these conditions are met, it is safe to attempt waking up a
* potential waiter without touching the user space futex value and
- * trying to set the OWNER_DIED bit. The user space futex value is
- * uncontended and the rest of the user space mutex state is
- * consistent, so a woken waiter will just take over the
- * uncontended futex. Setting the OWNER_DIED bit would create
- * inconsistent state and malfunction of the user space owner died
- * handling.
+ * trying to set the OWNER_DIED bit. If the futex value is zero,
+ * the rest of the user space mutex state is consistent, so a woken
+ * waiter will just take over the uncontended futex. Setting the
+ * OWNER_DIED bit would create inconsistent state and malfunction
+ * of the user space owner died handling. Otherwise, the OWNER_DIED
+ * bit is already set, and the woken waiter is expected to deal with
+ * this.
*/
- if (pending_op && !pi && !uval) {
+ owner = uval & FUTEX_TID_MASK;
+
+ if (pending_op && !pi && !owner) {
futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
return 0;
}
- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+ if (owner != task_pid_vnr(curr))
return 0;
/*
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 4ce0923f1ce3..ba01b9408203 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -334,7 +334,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
* futex_queue() calls spin_unlock() upon completion, both serializing
* access to the hash list and forcing another memory barrier.
*/
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
futex_queue(q, hb);
/* Arm the timer */
@@ -352,7 +352,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
* is no timeout, or if it has yet to expire.
*/
if (!timeout || timeout->task)
- freezable_schedule();
+ schedule();
}
__set_current_state(TASK_RUNNING);
}
@@ -430,7 +430,7 @@ retry:
return ret;
}
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
for (i = 0; i < count; i++) {
u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
@@ -504,7 +504,7 @@ static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
return;
}
- freezable_schedule();
+ schedule();
}
/**
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
index cbb0bed958ab..7670a811a565 100644
--- a/kernel/gcov/clang.c
+++ b/kernel/gcov/clang.c
@@ -280,6 +280,8 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
for (i = 0; i < sfn_ptr->num_counters; i++)
dfn_ptr->counters[i] += sfn_ptr->counters[i];
+
+ sfn_ptr = list_next_entry(sfn_ptr, head);
}
}
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 460c12b7dfea..74a4ef1da9ad 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -30,6 +30,13 @@
#define GCOV_TAG_FUNCTION_LENGTH 3
+/* Since GCC 12.1 sizes are in BYTES and not in WORDS (4B). */
+#if (__GNUC__ >= 12)
+#define GCOV_UNIT_SIZE 4
+#else
+#define GCOV_UNIT_SIZE 1
+#endif
+
static struct gcov_info *gcov_info_head;
/**
@@ -75,6 +82,7 @@ struct gcov_fn_info {
* @version: gcov version magic indicating the gcc version used for compilation
* @next: list head for a singly-linked list
* @stamp: uniquifying time stamp
+ * @checksum: unique object checksum
* @filename: name of the associated gcov data file
* @merge: merge functions (null for unused counter type)
* @n_functions: number of instrumented functions
@@ -87,6 +95,10 @@ struct gcov_info {
unsigned int version;
struct gcov_info *next;
unsigned int stamp;
+ /* Since GCC 12.1 a checksum field is added. */
+#if (__GNUC__ >= 12)
+ unsigned int checksum;
+#endif
const char *filename;
void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
unsigned int n_functions;
@@ -383,12 +395,18 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info)
pos += store_gcov_u32(buffer, pos, info->version);
pos += store_gcov_u32(buffer, pos, info->stamp);
+#if (__GNUC__ >= 12)
+ /* Use zero as checksum of the compilation unit. */
+ pos += store_gcov_u32(buffer, pos, 0);
+#endif
+
for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
fi_ptr = info->functions[fi_idx];
/* Function record. */
pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
- pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
+ pos += store_gcov_u32(buffer, pos,
+ GCOV_TAG_FUNCTION_LENGTH * GCOV_UNIT_SIZE);
pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
@@ -402,7 +420,8 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info)
/* Counter record. */
pos += store_gcov_u32(buffer, pos,
GCOV_TAG_FOR_COUNTER(ct_idx));
- pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
+ pos += store_gcov_u32(buffer, pos,
+ ci_ptr->num * 2 * GCOV_UNIT_SIZE);
for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
pos += store_gcov_u64(buffer, pos,
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 0c78e64f747d..473036b43c83 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -31,8 +31,8 @@ if [ "$building_out_of_srctree" ]; then
fi
all_dirs="$all_dirs $dir_list"
-# include/generated/compile.h is ignored because it is touched even when none
-# of the source files changed.
+# include/generated/utsversion.h is ignored because it is generated after this
+# script is executed. (utsversion.h is unneeded for kheaders)
#
# When Kconfig regenerates include/generated/autoconf.h, its timestamp is
# updated, but the contents might be still the same. When any CONFIG option is
@@ -42,7 +42,7 @@ all_dirs="$all_dirs $dir_list"
#
# Ignore them for md5 calculation to avoid pointless regeneration.
headers_md5="$(find $all_dirs -name "*.h" |
- grep -v "include/generated/compile.h" |
+ grep -v "include/generated/utsversion.h" |
grep -v "include/generated/autoconf.h" |
xargs ls -l | md5sum | cut -d ' ' -f1)"
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index bb2354f73ded..c71889f3f3fc 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -95,8 +95,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* Ensure the task is not frozen.
* Also, skip vfork and any other user process that freezer should skip.
*/
- if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
- return;
+ if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN))
+ return;
/*
* When a freshly created task is scheduled once, changes its state to
@@ -191,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
hung_task_show_lock = false;
rcu_read_lock();
for_each_process_thread(g, t) {
+ unsigned int state;
+
if (!max_count--)
goto unlock;
if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
@@ -198,8 +200,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
last_break = jiffies;
}
- /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
- if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE)
+ /*
+ * skip the TASK_KILLABLE tasks -- these can be killed
+ * skip the TASK_IDLE tasks -- those are genuinely idle
+ */
+ state = READ_ONCE(t->__state);
+ if ((state & TASK_UNINTERRUPTIBLE) &&
+ !(state & TASK_WAKEKILL) &&
+ !(state & TASK_NOLOAD))
check_hung_task(t, timeout);
}
unlock:
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index db3d174c53d4..b64c44ae4c25 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -86,15 +86,10 @@ config GENERIC_IRQ_IPI
depends on SMP
select IRQ_DOMAIN_HIERARCHY
-# Generic MSI interrupt support
-config GENERIC_MSI_IRQ
- bool
-
# Generic MSI hierarchical interrupt domain support
-config GENERIC_MSI_IRQ_DOMAIN
+config GENERIC_MSI_IRQ
bool
select IRQ_DOMAIN_HIERARCHY
- select GENERIC_MSI_IRQ
config IRQ_MSI_IOMMU
bool
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 8ac37e8e738a..49e7bc871fec 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1561,10 +1561,10 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return 0;
}
-static struct device *irq_get_parent_device(struct irq_data *data)
+static struct device *irq_get_pm_device(struct irq_data *data)
{
if (data->domain)
- return data->domain->dev;
+ return data->domain->pm_dev;
return NULL;
}
@@ -1578,7 +1578,7 @@ static struct device *irq_get_parent_device(struct irq_data *data)
*/
int irq_chip_pm_get(struct irq_data *data)
{
- struct device *dev = irq_get_parent_device(data);
+ struct device *dev = irq_get_pm_device(data);
int retval = 0;
if (IS_ENABLED(CONFIG_PM) && dev)
@@ -1597,7 +1597,7 @@ int irq_chip_pm_get(struct irq_data *data)
*/
int irq_chip_pm_put(struct irq_data *data)
{
- struct device *dev = irq_get_parent_device(data);
+ struct device *dev = irq_get_pm_device(data);
int retval = 0;
if (IS_ENABLED(CONFIG_PM) && dev)
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index f09c60393e55..5fdc0b557579 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -52,6 +52,7 @@ enum {
* IRQS_PENDING - irq is pending and replayed later
* IRQS_SUSPENDED - irq is suspended
* IRQS_NMI - irq line is used to deliver NMIs
+ * IRQS_SYSFS - descriptor has been added to sysfs
*/
enum {
IRQS_AUTODETECT = 0x00000001,
@@ -64,6 +65,7 @@ enum {
IRQS_SUSPENDED = 0x00000800,
IRQS_TIMINGS = 0x00001000,
IRQS_NMI = 0x00002000,
+ IRQS_SYSFS = 0x00004000,
};
#include "debug.h"
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 5db0230aa6b5..fd0996274401 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -288,22 +288,25 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc)
if (irq_kobj_base) {
/*
* Continue even in case of failure as this is nothing
- * crucial.
+ * crucial and failures in the late irq_sysfs_init()
+ * cannot be rolled back.
*/
if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq))
pr_warn("Failed to add kobject for irq %d\n", irq);
+ else
+ desc->istate |= IRQS_SYSFS;
}
}
static void irq_sysfs_del(struct irq_desc *desc)
{
/*
- * If irq_sysfs_init() has not yet been invoked (early boot), then
- * irq_kobj_base is NULL and the descriptor was never added.
- * kobject_del() complains about a object with no parent, so make
- * it conditional.
+ * Only invoke kobject_del() when kobject_add() was successfully
+ * invoked for the descriptor. This covers both early boot, where
+ * sysfs is not initialized yet, and the case of a failed
+ * kobject_add() invocation.
*/
- if (irq_kobj_base)
+ if (desc->istate & IRQS_SYSFS)
kobject_del(&desc->kobj);
}
@@ -705,6 +708,30 @@ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
+ /**
+ * generic_handle_irq_safe - Invoke the handler for a HW irq belonging
+ * to a domain from any context.
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ *
+ * Returns: 0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process
+ * context). If the interrupt is marked as 'enforce IRQ-context only' then
+ * the function must be invoked from hard interrupt context.
+ */
+int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq));
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe);
+
/**
* generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging
* to a domain.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 40fe7806cc8c..5b7cf28df290 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -321,7 +321,7 @@ static int irq_try_set_affinity(struct irq_data *data,
}
static bool irq_set_affinity_deactivated(struct irq_data *data,
- const struct cpumask *mask, bool force)
+ const struct cpumask *mask)
{
struct irq_desc *desc = irq_data_to_desc(data);
@@ -354,7 +354,7 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
- if (irq_set_affinity_deactivated(data, mask, force))
+ if (irq_set_affinity_deactivated(data, mask))
return 0;
if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) {
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index a9ee535293eb..955267bbc2be 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -19,8 +19,31 @@
#include "internals.h"
+/**
+ * struct msi_ctrl - MSI internal management control structure
+ * @domid: ID of the domain on which management operations should be done
+ * @first: First (hardware) slot index to operate on
+ * @last: Last (hardware) slot index to operate on
+ * @nirqs: The number of Linux interrupts to allocate. Can be larger
+ * than the range due to PCI/multi-MSI.
+ */
+struct msi_ctrl {
+ unsigned int domid;
+ unsigned int first;
+ unsigned int last;
+ unsigned int nirqs;
+};
+
+/* Invalid Xarray index which is outside of any searchable range */
+#define MSI_XA_MAX_INDEX (ULONG_MAX - 1)
+/* The maximum domain size */
+#define MSI_XA_DOMAIN_SIZE (MSI_MAX_INDEX + 1)
+
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
static inline int msi_sysfs_create_group(struct device *dev);
+
/**
* msi_alloc_desc - Allocate an initialized msi_desc
* @dev: Pointer to the device for which this is allocated
@@ -33,7 +56,7 @@ static inline int msi_sysfs_create_group(struct device *dev);
* Return: pointer to allocated &msi_desc on success or %NULL on failure
*/
static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
- const struct irq_affinity_desc *affinity)
+ const struct irq_affinity_desc *affinity)
{
struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
@@ -58,25 +81,56 @@ static void msi_free_desc(struct msi_desc *desc)
kfree(desc);
}
-static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index)
+static int msi_insert_desc(struct device *dev, struct msi_desc *desc,
+ unsigned int domid, unsigned int index)
{
+ struct msi_device_data *md = dev->msi.data;
+ struct xarray *xa = &md->__domains[domid].store;
+ unsigned int hwsize;
int ret;
- desc->msi_index = index;
- ret = xa_insert(&md->__store, index, desc, GFP_KERNEL);
- if (ret)
- msi_free_desc(desc);
+ hwsize = msi_domain_get_hwsize(dev, domid);
+
+ if (index == MSI_ANY_INDEX) {
+ struct xa_limit limit = { .min = 0, .max = hwsize - 1 };
+ unsigned int index;
+
+ /* Let the xarray allocate a free index within the limit */
+ ret = xa_alloc(xa, &index, desc, limit, GFP_KERNEL);
+ if (ret)
+ goto fail;
+
+ desc->msi_index = index;
+ return 0;
+ } else {
+ if (index >= hwsize) {
+ ret = -ERANGE;
+ goto fail;
+ }
+
+ desc->msi_index = index;
+ ret = xa_insert(xa, index, desc, GFP_KERNEL);
+ if (ret)
+ goto fail;
+ return 0;
+ }
+fail:
+ msi_free_desc(desc);
return ret;
}
/**
- * msi_add_msi_desc - Allocate and initialize a MSI descriptor
+ * msi_domain_insert_msi_desc - Allocate and initialize a MSI descriptor and
+ * insert it at @init_desc->msi_index
+ *
* @dev: Pointer to the device for which the descriptor is allocated
+ * @domid: The id of the interrupt domain to which the desriptor is added
* @init_desc: Pointer to an MSI descriptor to initialize the new descriptor
*
* Return: 0 on success or an appropriate failure code.
*/
-int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc)
+int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid,
+ struct msi_desc *init_desc)
{
struct msi_desc *desc;
@@ -88,40 +142,8 @@ int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc)
/* Copy type specific data to the new descriptor. */
desc->pci = init_desc->pci;
- return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index);
-}
-/**
- * msi_add_simple_msi_descs - Allocate and initialize MSI descriptors
- * @dev: Pointer to the device for which the descriptors are allocated
- * @index: Index for the first MSI descriptor
- * @ndesc: Number of descriptors to allocate
- *
- * Return: 0 on success or an appropriate failure code.
- */
-static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc)
-{
- unsigned int idx, last = index + ndesc - 1;
- struct msi_desc *desc;
- int ret;
-
- lockdep_assert_held(&dev->msi.data->mutex);
-
- for (idx = index; idx <= last; idx++) {
- desc = msi_alloc_desc(dev, 1, NULL);
- if (!desc)
- goto fail_mem;
- ret = msi_insert_desc(dev->msi.data, desc, idx);
- if (ret)
- goto fail;
- }
- return 0;
-
-fail_mem:
- ret = -ENOMEM;
-fail:
- msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last);
- return ret;
+ return msi_insert_desc(dev, desc, domid, init_desc->msi_index);
}
static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
@@ -138,28 +160,97 @@ static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
return false;
}
+static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
+{
+ unsigned int hwsize;
+
+ if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
+ (dev->msi.domain &&
+ !dev->msi.data->__domains[ctrl->domid].domain)))
+ return false;
+
+ hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
+ if (WARN_ON_ONCE(ctrl->first > ctrl->last ||
+ ctrl->first >= hwsize ||
+ ctrl->last >= hwsize))
+ return false;
+ return true;
+}
+
+static void msi_domain_free_descs(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_desc *desc;
+ struct xarray *xa;
+ unsigned long idx;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ xa = &dev->msi.data->__domains[ctrl->domid].store;
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ xa_erase(xa, idx);
+
+ /* Leak the descriptor when it is still referenced */
+ if (WARN_ON_ONCE(msi_desc_match(desc, MSI_DESC_ASSOCIATED)))
+ continue;
+ msi_free_desc(desc);
+ }
+}
+
/**
- * msi_free_msi_descs_range - Free MSI descriptors of a device
- * @dev: Device to free the descriptors
- * @filter: Descriptor state filter
- * @first_index: Index to start freeing from
- * @last_index: Last index to be freed
+ * msi_domain_free_msi_descs_range - Free a range of MSI descriptors of a device in an irqdomain
+ * @dev: Device for which to free the descriptors
+ * @domid: Id of the domain to operate on
+ * @first: Index to start freeing from (inclusive)
+ * @last: Last index to be freed (inclusive)
*/
-void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
- unsigned int first_index, unsigned int last_index)
+void msi_domain_free_msi_descs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+
+ msi_domain_free_descs(dev, &ctrl);
+}
+
+/**
+ * msi_domain_add_simple_msi_descs - Allocate and initialize MSI descriptors
+ * @dev: Pointer to the device for which the descriptors are allocated
+ * @ctrl: Allocation control struct
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+static int msi_domain_add_simple_msi_descs(struct device *dev, struct msi_ctrl *ctrl)
{
- struct xarray *xa = &dev->msi.data->__store;
struct msi_desc *desc;
- unsigned long idx;
+ unsigned int idx;
+ int ret;
lockdep_assert_held(&dev->msi.data->mutex);
- xa_for_each_range(xa, idx, desc, first_index, last_index) {
- if (msi_desc_match(desc, filter)) {
- xa_erase(xa, idx);
- msi_free_desc(desc);
- }
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
+
+ for (idx = ctrl->first; idx <= ctrl->last; idx++) {
+ desc = msi_alloc_desc(dev, 1, NULL);
+ if (!desc)
+ goto fail_mem;
+ ret = msi_insert_desc(dev, desc, ctrl->domid, idx);
+ if (ret)
+ goto fail;
}
+ return 0;
+
+fail_mem:
+ ret = -ENOMEM;
+fail:
+ msi_domain_free_descs(dev, ctrl);
+ return ret;
}
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
@@ -178,9 +269,13 @@ EXPORT_SYMBOL_GPL(get_cached_msi_msg);
static void msi_device_data_release(struct device *dev, void *res)
{
struct msi_device_data *md = res;
+ int i;
- WARN_ON_ONCE(!xa_empty(&md->__store));
- xa_destroy(&md->__store);
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++) {
+ msi_remove_device_irq_domain(dev, i);
+ WARN_ON_ONCE(!xa_empty(&md->__domains[i].store));
+ xa_destroy(&md->__domains[i].store);
+ }
dev->msi.data = NULL;
}
@@ -197,7 +292,7 @@ static void msi_device_data_release(struct device *dev, void *res)
int msi_setup_device_data(struct device *dev)
{
struct msi_device_data *md;
- int ret;
+ int ret, i;
if (dev->msi.data)
return 0;
@@ -212,7 +307,18 @@ int msi_setup_device_data(struct device *dev)
return ret;
}
- xa_init(&md->__store);
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++)
+ xa_init_flags(&md->__domains[i].store, XA_FLAGS_ALLOC);
+
+ /*
+ * If @dev::msi::domain is set and is a global MSI domain, copy the
+ * pointer into the domain array so all code can operate on domain
+ * ids. The NULL pointer check is required to keep the legacy
+ * architecture specific PCI/MSI support working.
+ */
+ if (dev->msi.domain && !irq_domain_is_msi_parent(dev->msi.domain))
+ md->__domains[MSI_DEFAULT_DOMAIN].domain = dev->msi.domain;
+
mutex_init(&md->mutex);
dev->msi.data = md;
devres_add(dev, md);
@@ -235,27 +341,30 @@ EXPORT_SYMBOL_GPL(msi_lock_descs);
*/
void msi_unlock_descs(struct device *dev)
{
- /* Invalidate the index wich was cached by the iterator */
- dev->msi.data->__iter_idx = MSI_MAX_INDEX;
+ /* Invalidate the index which was cached by the iterator */
+ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX;
mutex_unlock(&dev->msi.data->mutex);
}
EXPORT_SYMBOL_GPL(msi_unlock_descs);
-static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter)
+static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid,
+ enum msi_desc_filter filter)
{
+ struct xarray *xa = &md->__domains[domid].store;
struct msi_desc *desc;
- xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) {
+ xa_for_each_start(xa, md->__iter_idx, desc, md->__iter_idx) {
if (msi_desc_match(desc, filter))
return desc;
}
- md->__iter_idx = MSI_MAX_INDEX;
+ md->__iter_idx = MSI_XA_MAX_INDEX;
return NULL;
}
/**
- * msi_first_desc - Get the first MSI descriptor of a device
+ * msi_domain_first_desc - Get the first MSI descriptor of an irqdomain associated to a device
* @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
* @filter: Descriptor state filter
*
* Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs()
@@ -264,23 +373,26 @@ static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_
* Return: Pointer to the first MSI descriptor matching the search
* criteria, NULL if none found.
*/
-struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter)
+struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
{
struct msi_device_data *md = dev->msi.data;
- if (WARN_ON_ONCE(!md))
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
return NULL;
lockdep_assert_held(&md->mutex);
md->__iter_idx = 0;
- return msi_find_desc(md, filter);
+ return msi_find_desc(md, domid, filter);
}
-EXPORT_SYMBOL_GPL(msi_first_desc);
+EXPORT_SYMBOL_GPL(msi_domain_first_desc);
/**
* msi_next_desc - Get the next MSI descriptor of a device
* @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
+ * @filter: Descriptor state filter
*
* The first invocation of msi_next_desc() has to be preceeded by a
* successful invocation of __msi_first_desc(). Consecutive invocations are
@@ -290,11 +402,12 @@ EXPORT_SYMBOL_GPL(msi_first_desc);
* Return: Pointer to the next MSI descriptor matching the search
* criteria, NULL if none found.
*/
-struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
+struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
{
struct msi_device_data *md = dev->msi.data;
- if (WARN_ON_ONCE(!md))
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
return NULL;
lockdep_assert_held(&md->mutex);
@@ -303,30 +416,38 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
return NULL;
md->__iter_idx++;
- return msi_find_desc(md, filter);
+ return msi_find_desc(md, domid, filter);
}
EXPORT_SYMBOL_GPL(msi_next_desc);
/**
- * msi_get_virq - Return Linux interrupt number of a MSI interrupt
+ * msi_domain_get_virq - Lookup the Linux interrupt number for a MSI index on a interrupt domain
* @dev: Device to operate on
+ * @domid: Domain ID of the interrupt domain associated to the device
* @index: MSI interrupt index to look for (0-based)
*
* Return: The Linux interrupt number on success (> 0), 0 if not found
*/
-unsigned int msi_get_virq(struct device *dev, unsigned int index)
+unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index)
{
struct msi_desc *desc;
unsigned int ret = 0;
- bool pcimsi;
+ bool pcimsi = false;
+ struct xarray *xa;
if (!dev->msi.data)
return 0;
- pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false;
+ if (WARN_ON_ONCE(index > MSI_MAX_INDEX || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return 0;
+
+ /* This check is only valid for the PCI default MSI domain */
+ if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN)
+ pcimsi = to_pci_dev(dev)->msi_enabled;
msi_lock_descs(dev);
- desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index);
+ xa = &dev->msi.data->__domains[domid].store;
+ desc = xa_load(xa, pcimsi ? 0 : index);
if (desc && desc->irq) {
/*
* PCI-MSI has only one descriptor for multiple interrupts.
@@ -340,10 +461,11 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index)
ret = desc->irq;
}
}
+
msi_unlock_descs(dev);
return ret;
}
-EXPORT_SYMBOL_GPL(msi_get_virq);
+EXPORT_SYMBOL_GPL(msi_domain_get_virq);
#ifdef CONFIG_SYSFS
static struct attribute *msi_dev_attrs[] = {
@@ -459,7 +581,39 @@ static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *d
static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { }
#endif /* !CONFIG_SYSFS */
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+static struct irq_domain *msi_get_device_domain(struct device *dev, unsigned int domid)
+{
+ struct irq_domain *domain;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (WARN_ON_ONCE(domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ domain = dev->msi.data->__domains[domid].domain;
+ if (!domain)
+ return NULL;
+
+ if (WARN_ON_ONCE(irq_domain_is_msi_parent(domain)))
+ return NULL;
+
+ return domain;
+}
+
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+
+ domain = msi_get_device_domain(dev, domid);
+ if (domain) {
+ info = domain->host_data;
+ return info->hwsize;
+ }
+ /* No domain, default to MSI_XA_DOMAIN_SIZE */
+ return MSI_XA_DOMAIN_SIZE;
+}
+
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
{
@@ -613,21 +767,11 @@ static int msi_domain_ops_init(struct irq_domain *domain,
return 0;
}
-static int msi_domain_ops_check(struct irq_domain *domain,
- struct msi_domain_info *info,
- struct device *dev)
-{
- return 0;
-}
-
static struct msi_domain_ops msi_domain_ops_default = {
.get_hwirq = msi_domain_ops_get_hwirq,
.msi_init = msi_domain_ops_init,
- .msi_check = msi_domain_ops_check,
.msi_prepare = msi_domain_ops_prepare,
.set_desc = msi_domain_ops_set_desc,
- .domain_alloc_irqs = __msi_domain_alloc_irqs,
- .domain_free_irqs = __msi_domain_free_irqs,
};
static void msi_domain_update_dom_ops(struct msi_domain_info *info)
@@ -639,11 +783,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
return;
}
- if (ops->domain_alloc_irqs == NULL)
- ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs;
- if (ops->domain_free_irqs == NULL)
- ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs;
-
if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS))
return;
@@ -651,8 +790,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
ops->get_hwirq = msi_domain_ops_default.get_hwirq;
if (ops->msi_init == NULL)
ops->msi_init = msi_domain_ops_default.msi_init;
- if (ops->msi_check == NULL)
- ops->msi_check = msi_domain_ops_default.msi_check;
if (ops->msi_prepare == NULL)
ops->msi_prepare = msi_domain_ops_default.msi_prepare;
if (ops->set_desc == NULL)
@@ -668,6 +805,40 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
chip->irq_set_affinity = msi_domain_set_affinity;
}
+static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,
+ struct msi_domain_info *info,
+ unsigned int flags,
+ struct irq_domain *parent)
+{
+ struct irq_domain *domain;
+
+ if (info->hwsize > MSI_XA_DOMAIN_SIZE)
+ return NULL;
+
+ /*
+ * Hardware size 0 is valid for backwards compatibility and for
+ * domains which are not backed by a hardware table. Grant the
+ * maximum index space.
+ */
+ if (!info->hwsize)
+ info->hwsize = MSI_XA_DOMAIN_SIZE;
+
+ msi_domain_update_dom_ops(info);
+ if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
+ msi_domain_update_chip_ops(info);
+
+ domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0,
+ fwnode, &msi_domain_ops, info);
+
+ if (domain) {
+ if (!domain->name && info->chip)
+ domain->name = info->chip->name;
+ irq_domain_update_bus_token(domain, info->bus_token);
+ }
+
+ return domain;
+}
+
/**
* msi_create_irq_domain - Create an MSI interrupt domain
* @fwnode: Optional fwnode of the interrupt controller
@@ -680,19 +851,210 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
struct msi_domain_info *info,
struct irq_domain *parent)
{
+ return __msi_create_irq_domain(fwnode, info, 0, parent);
+}
+
+/**
+ * msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down
+ * in the domain hierarchy
+ * @dev: The device for which the domain should be created
+ * @domain: The domain in the hierarchy this op is being called on
+ * @msi_parent_domain: The IRQ_DOMAIN_FLAG_MSI_PARENT domain for the child to
+ * be created
+ * @msi_child_info: The MSI domain info of the IRQ_DOMAIN_FLAG_MSI_DEVICE
+ * domain to be created
+ *
+ * Return: true on success, false otherwise
+ *
+ * This is the most complex problem of per device MSI domains and the
+ * underlying interrupt domain hierarchy:
+ *
+ * The device domain to be initialized requests the broadest feature set
+ * possible and the underlying domain hierarchy puts restrictions on it.
+ *
+ * That's trivial for a simple parent->child relationship, but it gets
+ * interesting with an intermediate domain: root->parent->child. The
+ * intermediate 'parent' can expand the capabilities which the 'root'
+ * domain is providing. So that creates a classic hen and egg problem:
+ * Which entity is doing the restrictions/expansions?
+ *
+ * One solution is to let the root domain handle the initialization that's
+ * why there is the @domain and the @msi_parent_domain pointer.
+ */
+bool msi_parent_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *msi_parent_domain,
+ struct msi_domain_info *msi_child_info)
+{
+ struct irq_domain *parent = domain->parent;
+
+ if (WARN_ON_ONCE(!parent || !parent->msi_parent_ops ||
+ !parent->msi_parent_ops->init_dev_msi_info))
+ return false;
+
+ return parent->msi_parent_ops->init_dev_msi_info(dev, parent, msi_parent_domain,
+ msi_child_info);
+}
+
+/**
+ * msi_create_device_irq_domain - Create a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @template: MSI domain info bundle used as template
+ * @hwsize: Maximum number of MSI table entries (0 if unknown or unlimited)
+ * @domain_data: Optional pointer to domain specific data which is set in
+ * msi_domain_info::data
+ * @chip_data: Optional pointer to chip specific data which is set in
+ * msi_domain_info::chip_data
+ *
+ * Return: True on success, false otherwise
+ *
+ * There is no firmware node required for this interface because the per
+ * device domains are software constructs which are actually closer to the
+ * hardware reality than any firmware can describe them.
+ *
+ * The domain name and the irq chip name for a MSI device domain are
+ * composed by: "$(PREFIX)$(CHIPNAME)-$(DEVNAME)"
+ *
+ * $PREFIX: Optional prefix provided by the underlying MSI parent domain
+ * via msi_parent_ops::prefix. If that pointer is NULL the prefix
+ * is empty.
+ * $CHIPNAME: The name of the irq_chip in @template
+ * $DEVNAME: The name of the device
+ *
+ * This results in understandable chip names and hardware interrupt numbers
+ * in e.g. /proc/interrupts
+ *
+ * PCI-MSI-0000:00:1c.0 0-edge Parent domain has no prefix
+ * IR-PCI-MSI-0000:00:1c.4 0-edge Same with interrupt remapping prefix 'IR-'
+ *
+ * IR-PCI-MSIX-0000:3d:00.0 0-edge Hardware interrupt numbers reflect
+ * IR-PCI-MSIX-0000:3d:00.0 1-edge the real MSI-X index on that device
+ * IR-PCI-MSIX-0000:3d:00.0 2-edge
+ *
+ * On IMS domains the hardware interrupt number is either a table entry
+ * index or a purely software managed index but it is guaranteed to be
+ * unique.
+ *
+ * The domain pointer is stored in @dev::msi::data::__irqdomains[]. All
+ * subsequent operations on the domain depend on the domain id.
+ *
+ * The domain is automatically freed when the device is removed via devres
+ * in the context of @dev::msi::data freeing, but it can also be
+ * independently removed via @msi_remove_device_irq_domain().
+ */
+bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
+ const struct msi_domain_template *template,
+ unsigned int hwsize, void *domain_data,
+ void *chip_data)
+{
+ struct irq_domain *domain, *parent = dev->msi.domain;
+ const struct msi_parent_ops *pops;
+ struct msi_domain_template *bundle;
+ struct fwnode_handle *fwnode;
+
+ if (!irq_domain_is_msi_parent(parent))
+ return false;
+
+ if (domid >= MSI_MAX_DEVICE_IRQDOMAINS)
+ return false;
+
+ bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL);
+ if (!bundle)
+ return false;
+
+ bundle->info.hwsize = hwsize;
+ bundle->info.chip = &bundle->chip;
+ bundle->info.ops = &bundle->ops;
+ bundle->info.data = domain_data;
+ bundle->info.chip_data = chip_data;
+
+ pops = parent->msi_parent_ops;
+ snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
+ pops->prefix ? : "", bundle->chip.name, dev_name(dev));
+ bundle->chip.name = bundle->name;
+
+ fwnode = irq_domain_alloc_named_fwnode(bundle->name);
+ if (!fwnode)
+ goto free_bundle;
+
+ if (msi_setup_device_data(dev))
+ goto free_fwnode;
+
+ msi_lock_descs(dev);
+
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, domid)))
+ goto fail;
+
+ if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info))
+ goto fail;
+
+ domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent);
+ if (!domain)
+ goto fail;
+
+ domain->dev = dev;
+ dev->msi.data->__domains[domid].domain = domain;
+ msi_unlock_descs(dev);
+ return true;
+
+fail:
+ msi_unlock_descs(dev);
+free_fwnode:
+ kfree(fwnode);
+free_bundle:
+ kfree(bundle);
+ return false;
+}
+
+/**
+ * msi_remove_device_irq_domain - Free a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ */
+void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
+{
+ struct msi_domain_info *info;
struct irq_domain *domain;
- msi_domain_update_dom_ops(info);
- if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
- msi_domain_update_chip_ops(info);
+ msi_lock_descs(dev);
- domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
- fwnode, &msi_domain_ops, info);
+ domain = msi_get_device_domain(dev, domid);
- if (domain && !domain->name && info->chip)
- domain->name = info->chip->name;
+ if (!domain || !irq_domain_is_msi_device(domain))
+ goto unlock;
- return domain;
+ dev->msi.data->__domains[domid].domain = NULL;
+ info = domain->host_data;
+ irq_domain_remove(domain);
+ kfree(container_of(info, struct msi_domain_template, info));
+
+unlock:
+ msi_unlock_descs(dev);
+}
+
+/**
+ * msi_match_device_irq_domain - Match a device irq domain against a bus token
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @bus_token: Bus token to match against the domain bus token
+ *
+ * Return: True if device domain exists and bus tokens match.
+ */
+bool msi_match_device_irq_domain(struct device *dev, unsigned int domid,
+ enum irq_domain_bus_token bus_token)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+ bool ret = false;
+
+ msi_lock_descs(dev);
+ domain = msi_get_device_domain(dev, domid);
+ if (domain && irq_domain_is_msi_device(domain)) {
+ info = domain->host_data;
+ ret = info->bus_token == bus_token;
+ }
+ msi_unlock_descs(dev);
+ return ret;
}
int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
@@ -700,13 +1062,8 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- int ret;
-
- ret = ops->msi_check(domain, info, dev);
- if (ret == 0)
- ret = ops->msi_prepare(domain, dev, nvec, arg);
- return ret;
+ return ops->msi_prepare(domain, dev, nvec, arg);
}
int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
@@ -714,16 +1071,27 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
+ struct msi_ctrl ctrl = {
+ .domid = MSI_DEFAULT_DOMAIN,
+ .first = virq_base,
+ .last = virq_base + nvec - 1,
+ };
struct msi_desc *desc;
+ struct xarray *xa;
int ret, virq;
+ if (!msi_ctrl_valid(dev, &ctrl))
+ return -EINVAL;
+
msi_lock_descs(dev);
- ret = msi_add_simple_msi_descs(dev, virq_base, nvec);
+ ret = msi_domain_add_simple_msi_descs(dev, &ctrl);
if (ret)
goto unlock;
+ xa = &dev->msi.data->__domains[ctrl.domid].store;
+
for (virq = virq_base; virq < virq_base + nvec; virq++) {
- desc = xa_load(&dev->msi.data->__store, virq);
+ desc = xa_load(xa, virq);
desc->irq = virq;
ops->set_desc(arg, desc);
@@ -739,7 +1107,7 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
fail:
for (--virq; virq >= virq_base; virq--)
irq_domain_free_irqs_common(domain, virq, 1);
- msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1);
+ msi_domain_free_descs(dev, &ctrl);
unlock:
msi_unlock_descs(dev);
return ret;
@@ -764,6 +1132,8 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
switch(domain->bus_token) {
case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
case DOMAIN_BUS_VMD_MSI:
break;
default:
@@ -789,6 +1159,8 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
{
switch(domain->bus_token) {
case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
case DOMAIN_BUS_VMD_MSI:
if (IS_ENABLED(CONFIG_PCI_MSI))
break;
@@ -850,18 +1222,19 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
return 0;
}
-int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec)
+static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
+ unsigned int vflags = 0, allocated = 0;
msi_alloc_info_t arg = { };
- unsigned int vflags = 0;
struct msi_desc *desc;
- int allocated = 0;
+ unsigned long idx;
int i, ret, virq;
- ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
+ ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg);
if (ret)
return ret;
@@ -883,11 +1256,21 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
* MSI affinity setting requires a special quirk (X86) when
* reservation mode is active.
*/
- if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ if (info->flags & MSI_FLAG_NOMASK_QUIRK)
vflags |= VIRQ_NOMASK_QUIRK;
}
- msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) {
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED))
+ continue;
+
+ /* This should return -ECONFUSED... */
+ if (WARN_ON_ONCE(allocated >= ctrl->nirqs))
+ return -EINVAL;
+
+ if (ops->prepare_desc)
+ ops->prepare_desc(domain, &arg, desc);
+
ops->set_desc(&arg, desc);
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
@@ -913,76 +1296,213 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
return 0;
}
-static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info,
- struct device *dev,
- unsigned int num_descs)
+static int msi_domain_alloc_simple_msi_descs(struct device *dev,
+ struct msi_domain_info *info,
+ struct msi_ctrl *ctrl)
{
if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS))
return 0;
- return msi_add_simple_msi_descs(dev, 0, num_descs);
+ return msi_domain_add_simple_msi_descs(dev, ctrl);
+}
+
+static int __msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+ int ret;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return -ENODEV;
+
+ info = domain->host_data;
+
+ ret = msi_domain_alloc_simple_msi_descs(dev, info, ctrl);
+ if (ret)
+ return ret;
+
+ ops = info->ops;
+ if (ops->domain_alloc_irqs)
+ return ops->domain_alloc_irqs(domain, dev, ctrl->nirqs);
+
+ return __msi_domain_alloc_irqs(dev, domain, ctrl);
+}
+
+static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ int ret = __msi_domain_alloc_locked(dev, ctrl);
+
+ if (ret)
+ msi_domain_free_locked(dev, ctrl);
+ return ret;
}
/**
- * msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain
- * @domain: The domain to allocate from
+ * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain
* @dev: Pointer to device struct of the device for which the interrupts
* are allocated
- * @nvec: The number of interrupts to allocate
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
*
* Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
- * pair. Use this for MSI irqdomains which implement their own vector
+ * pair. Use this for MSI irqdomains which implement their own descriptor
* allocation/free.
*
* Return: %0 on success or an error code.
*/
-int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev,
- int nvec)
+int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
- int ret;
-
- lockdep_assert_held(&dev->msi.data->mutex);
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ .nirqs = last + 1 - first,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
- ret = msi_domain_add_simple_msi_descs(info, dev, nvec);
- if (ret)
- return ret;
+/**
+ * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ int ret;
- ret = ops->domain_alloc_irqs(domain, dev, nvec);
- if (ret)
- msi_domain_free_irqs_descs_locked(domain, dev);
+ msi_lock_descs(dev);
+ ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last);
+ msi_unlock_descs(dev);
return ret;
}
/**
- * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
- * @domain: The domain to allocate from
+ * msi_domain_alloc_irqs_all_locked - Allocate all interrupts from a MSI interrupt domain
+ *
* @dev: Pointer to device struct of the device for which the interrupts
* are allocated
- * @nvec: The number of interrupts to allocate
+ * @domid: Id of the interrupt domain to operate on
+ * @nirqs: The number of interrupts to allocate
+ *
+ * This function scans all MSI descriptors of the MSI domain and allocates interrupts
+ * for all unassigned ones. That function is to be used for MSI domain usage where
+ * the descriptor allocation is handled at the call site, e.g. PCI/MSI[X].
*
* Return: %0 on success or an error code.
*/
-int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec)
+int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs)
{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = 0,
+ .last = msi_domain_get_hwsize(dev, domid) - 1,
+ .nirqs = nirqs,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_alloc_irq_at - Allocate an interrupt from a MSI interrupt domain at
+ * a given index - or at the next free index
+ *
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @index: Index for allocation. If @index == %MSI_ANY_INDEX the allocation
+ * uses the next free index.
+ * @affdesc: Optional pointer to an interrupt affinity descriptor structure
+ * @icookie: Optional pointer to a domain specific per instance cookie. If
+ * non-NULL the content of the cookie is stored in msi_desc::data.
+ * Must be NULL for MSI-X allocations
+ *
+ * This requires a MSI interrupt domain which lets the core code manage the
+ * MSI descriptors.
+ *
+ * Return: struct msi_map
+ *
+ * On success msi_map::index contains the allocated index number and
+ * msi_map::virq the corresponding Linux interrupt number
+ *
+ * On failure msi_map::index contains the error code and msi_map::virq
+ * is %0.
+ */
+struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index,
+ const struct irq_affinity_desc *affdesc,
+ union msi_instance_cookie *icookie)
+{
+ struct msi_ctrl ctrl = { .domid = domid, .nirqs = 1, };
+ struct irq_domain *domain;
+ struct msi_map map = { };
+ struct msi_desc *desc;
int ret;
msi_lock_descs(dev);
- ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec);
+ domain = msi_get_device_domain(dev, domid);
+ if (!domain) {
+ map.index = -ENODEV;
+ goto unlock;
+ }
+
+ desc = msi_alloc_desc(dev, 1, affdesc);
+ if (!desc) {
+ map.index = -ENOMEM;
+ goto unlock;
+ }
+
+ if (icookie)
+ desc->data.icookie = *icookie;
+
+ ret = msi_insert_desc(dev, desc, domid, index);
+ if (ret) {
+ map.index = ret;
+ goto unlock;
+ }
+
+ ctrl.first = ctrl.last = desc->msi_index;
+
+ ret = __msi_domain_alloc_irqs(dev, domain, &ctrl);
+ if (ret) {
+ map.index = ret;
+ msi_domain_free_locked(dev, &ctrl);
+ } else {
+ map.index = desc->msi_index;
+ map.virq = desc->irq;
+ }
+unlock:
msi_unlock_descs(dev);
- return ret;
+ return map;
}
-void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+static void __msi_domain_free_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
struct msi_domain_info *info = domain->host_data;
struct irq_data *irqd;
struct msi_desc *desc;
+ unsigned long idx;
int i;
- /* Only handle MSI entries which have an interrupt associated */
- msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ /* Only handle MSI entries which have an interrupt associated */
+ if (!msi_desc_match(desc, MSI_DESC_ASSOCIATED))
+ continue;
+
/* Make sure all interrupts are deactivated */
for (i = 0; i < desc->nvec_used; i++) {
irqd = irq_domain_get_irq_data(domain, desc->irq + i);
@@ -997,44 +1517,99 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
}
}
-static void msi_domain_free_msi_descs(struct msi_domain_info *info,
- struct device *dev)
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return;
+
+ info = domain->host_data;
+ ops = info->ops;
+
+ if (ops->domain_free_irqs)
+ ops->domain_free_irqs(domain, dev);
+ else
+ __msi_domain_free_irqs(dev, domain, ctrl);
+
+ if (ops->msi_post_free)
+ ops->msi_post_free(domain, dev);
+
if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
- msi_free_msi_descs(dev);
+ msi_domain_free_descs(dev, ctrl);
}
/**
- * msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev
- * @domain: The domain to managing the interrupts
+ * msi_domain_free_irqs_range_locked - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev with msi_lock held
* @dev: Pointer to device struct of the device for which the interrupts
- * are free
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+ msi_domain_free_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_free_irqs_range - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_range_locked(dev, domid, first, last);
+ msi_unlock_descs(dev);
+}
+
+/**
+ * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
*
* Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
* pair. Use this for MSI irqdomains which implement their own vector
* allocation.
*/
-void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev)
+void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
-
- lockdep_assert_held(&dev->msi.data->mutex);
-
- ops->domain_free_irqs(domain, dev);
- msi_domain_free_msi_descs(info, dev);
+ msi_domain_free_irqs_range_locked(dev, domid, 0,
+ msi_domain_get_hwsize(dev, domid) - 1);
}
/**
- * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
- * @domain: The domain to managing the interrupts
+ * msi_domain_free_irqs_all - Free all interrupts from a MSI interrupt domain
+ * associated to a device
* @dev: Pointer to device struct of the device for which the interrupts
- * are free
+ * are freed
+ * @domid: The id of the domain to operate on
*/
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+void msi_domain_free_irqs_all(struct device *dev, unsigned int domid)
{
msi_lock_descs(dev);
- msi_domain_free_irqs_descs_locked(domain, dev);
+ msi_domain_free_irqs_all_locked(dev, domid);
msi_unlock_descs(dev);
}
@@ -1048,5 +1623,3 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
{
return (struct msi_domain_info *)domain->host_data;
}
-
-#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 714ac4c3b556..d9c822bbffb8 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -113,11 +113,40 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);
-void static_key_slow_inc_cpuslocked(struct static_key *key)
+/*
+ * static_key_fast_inc_not_disabled - adds a user for a static key
+ * @key: static key that must be already enabled
+ *
+ * The caller must make sure that the static key can't get disabled while
+ * in this function. It doesn't patch jump labels, only adds a user to
+ * an already enabled static key.
+ *
+ * Returns true if the increment was done. Unlike refcount_t the ref counter
+ * is not saturated, but will fail to increment on overflow.
+ */
+bool static_key_fast_inc_not_disabled(struct static_key *key)
{
- int v, v1;
+ int v;
STATIC_KEY_CHECK_USE(key);
+ /*
+ * Negative key->enabled has a special meaning: it sends
+ * static_key_slow_inc() down the slow path, and it is non-zero
+ * so it counts as "enabled" in jump_label_update(). Note that
+ * atomic_inc_unless_negative() checks >= 0, so roll our own.
+ */
+ v = atomic_read(&key->enabled);
+ do {
+ if (v <= 0 || (v + 1) < 0)
+ return false;
+ } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1)));
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled);
+
+bool static_key_slow_inc_cpuslocked(struct static_key *key)
+{
lockdep_assert_cpus_held();
/*
@@ -126,17 +155,9 @@ void static_key_slow_inc_cpuslocked(struct static_key *key)
* jump_label_update() process. At the same time, however,
* the jump_label_update() call below wants to see
* static_key_enabled(&key) for jumps to be updated properly.
- *
- * So give a special meaning to negative key->enabled: it sends
- * static_key_slow_inc() down the slow path, and it is non-zero
- * so it counts as "enabled" in jump_label_update(). Note that
- * atomic_inc_unless_negative() checks >= 0, so roll our own.
*/
- for (v = atomic_read(&key->enabled); v > 0; v = v1) {
- v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
- if (likely(v1 == v))
- return;
- }
+ if (static_key_fast_inc_not_disabled(key))
+ return true;
jump_label_lock();
if (atomic_read(&key->enabled) == 0) {
@@ -148,16 +169,23 @@ void static_key_slow_inc_cpuslocked(struct static_key *key)
*/
atomic_set_release(&key->enabled, 1);
} else {
- atomic_inc(&key->enabled);
+ if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) {
+ jump_label_unlock();
+ return false;
+ }
}
jump_label_unlock();
+ return true;
}
-void static_key_slow_inc(struct static_key *key)
+bool static_key_slow_inc(struct static_key *key)
{
+ bool ret;
+
cpus_read_lock();
- static_key_slow_inc_cpuslocked(key);
+ ret = static_key_slow_inc_cpuslocked(key);
cpus_read_unlock();
+ return ret;
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3e7e2c2ad2f7..83f499182c9a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -50,12 +50,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off,
data = &kallsyms_names[off];
len = *data;
data++;
+ off++;
+
+ /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */
+ if ((len & 0x80) != 0) {
+ len = (len & 0x7F) | (*data << 7);
+ data++;
+ off++;
+ }
/*
* Update the offset to return the offset for the next symbol on
* the compressed stream.
*/
- off += len + 1;
+ off += len;
/*
* For every byte on the compressed symbol data, copy the table
@@ -108,7 +116,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
static unsigned int get_symbol_offset(unsigned long pos)
{
const u8 *name;
- int i;
+ int i, len;
/*
* Use the closest marker we have. We have markers every 256 positions,
@@ -122,13 +130,23 @@ static unsigned int get_symbol_offset(unsigned long pos)
* so we just need to add the len to the current pointer for every
* symbol we wish to skip.
*/
- for (i = 0; i < (pos & 0xFF); i++)
- name = name + (*name) + 1;
+ for (i = 0; i < (pos & 0xFF); i++) {
+ len = *name;
+
+ /*
+ * If MSB is 1, it is a "big" symbol, so we need to look into
+ * the next byte (and skip it, too).
+ */
+ if ((len & 0x80) != 0)
+ len = ((len & 0x7F) | (name[1] << 7)) + 1;
+
+ name = name + len + 1;
+ }
return name - kallsyms_names;
}
-static unsigned long kallsyms_sym_address(int idx)
+unsigned long kallsyms_sym_address(int idx)
{
if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
return kallsyms_addresses[idx];
@@ -159,7 +177,6 @@ static bool cleanup_symbol_name(char *s)
* character in an identifier in C. Suffixes observed:
* - foo.llvm.[0-9a-f]+
* - foo.[0-9a-f]+
- * - foo.[0-9a-f]+.cfi_jt
*/
res = strchr(s, '.');
if (res) {
@@ -167,45 +184,103 @@ static bool cleanup_symbol_name(char *s)
return true;
}
- if (!IS_ENABLED(CONFIG_CFI_CLANG) ||
- !IS_ENABLED(CONFIG_LTO_CLANG_THIN) ||
- CONFIG_CLANG_VERSION >= 130000)
- return false;
+ return false;
+}
- /*
- * Prior to LLVM 13, the following suffixes were observed when thinLTO
- * and CFI are both enabled:
- * - foo$[0-9]+
- */
- res = strrchr(s, '$');
- if (res) {
- *res = '\0';
- return true;
+static int compare_symbol_name(const char *name, char *namebuf)
+{
+ int ret;
+
+ ret = strcmp(name, namebuf);
+ if (!ret)
+ return ret;
+
+ if (cleanup_symbol_name(namebuf) && !strcmp(name, namebuf))
+ return 0;
+
+ return ret;
+}
+
+static unsigned int get_symbol_seq(int index)
+{
+ unsigned int i, seq = 0;
+
+ for (i = 0; i < 3; i++)
+ seq = (seq << 8) | kallsyms_seqs_of_names[3 * index + i];
+
+ return seq;
+}
+
+static int kallsyms_lookup_names(const char *name,
+ unsigned int *start,
+ unsigned int *end)
+{
+ int ret;
+ int low, mid, high;
+ unsigned int seq, off;
+ char namebuf[KSYM_NAME_LEN];
+
+ low = 0;
+ high = kallsyms_num_syms - 1;
+
+ while (low <= high) {
+ mid = low + (high - low) / 2;
+ seq = get_symbol_seq(mid);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ ret = compare_symbol_name(name, namebuf);
+ if (ret > 0)
+ low = mid + 1;
+ else if (ret < 0)
+ high = mid - 1;
+ else
+ break;
}
- return false;
+ if (low > high)
+ return -ESRCH;
+
+ low = mid;
+ while (low) {
+ seq = get_symbol_seq(low - 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (compare_symbol_name(name, namebuf))
+ break;
+ low--;
+ }
+ *start = low;
+
+ if (end) {
+ high = mid;
+ while (high < kallsyms_num_syms - 1) {
+ seq = get_symbol_seq(high + 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (compare_symbol_name(name, namebuf))
+ break;
+ high++;
+ }
+ *end = high;
+ }
+
+ return 0;
}
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
{
- char namebuf[KSYM_NAME_LEN];
- unsigned long i;
- unsigned int off;
+ int ret;
+ unsigned int i;
/* Skip the search for empty string. */
if (!*name)
return 0;
- for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
-
- if (strcmp(namebuf, name) == 0)
- return kallsyms_sym_address(i);
+ ret = kallsyms_lookup_names(name, &i, NULL);
+ if (!ret)
+ return kallsyms_sym_address(get_symbol_seq(i));
- if (cleanup_symbol_name(namebuf) && strcmp(namebuf, name) == 0)
- return kallsyms_sym_address(i);
- }
return module_kallsyms_lookup_name(name);
}
@@ -232,6 +307,24 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
return 0;
}
+int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long),
+ const char *name, void *data)
+{
+ int ret;
+ unsigned int i, start, end;
+
+ ret = kallsyms_lookup_names(name, &start, &end);
+ if (ret)
+ return 0;
+
+ for (i = start; !ret && i <= end; i++) {
+ ret = fn(data, kallsyms_sym_address(get_symbol_seq(i)));
+ cond_resched();
+ }
+
+ return ret;
+}
+
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
unsigned long *offset)
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
index 2d0c6f2f0243..27fabdcc40f5 100644
--- a/kernel/kallsyms_internal.h
+++ b/kernel/kallsyms_internal.h
@@ -26,5 +26,6 @@ extern const char kallsyms_token_table[] __weak;
extern const u16 kallsyms_token_index[] __weak;
extern const unsigned int kallsyms_markers[] __weak;
+extern const u8 kallsyms_seqs_of_names[] __weak;
#endif // LINUX_KALLSYMS_INTERNAL_H_
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
new file mode 100644
index 000000000000..f35d9cc1aab1
--- /dev/null
+++ b/kernel/kallsyms_selftest.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test the function and performance of kallsyms
+ *
+ * Copyright (C) Huawei Technologies Co., Ltd., 2022
+ *
+ * Authors: Zhen Lei <thunder.leizhen@huawei.com> Huawei
+ */
+
+#define pr_fmt(fmt) "kallsyms_selftest: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/random.h>
+#include <linux/sched/clock.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+
+#include "kallsyms_internal.h"
+#include "kallsyms_selftest.h"
+
+
+#define MAX_NUM_OF_RECORDS 64
+
+struct test_stat {
+ int min;
+ int max;
+ int save_cnt;
+ int real_cnt;
+ int perf;
+ u64 sum;
+ char *name;
+ unsigned long addr;
+ unsigned long addrs[MAX_NUM_OF_RECORDS];
+};
+
+struct test_item {
+ char *name;
+ unsigned long addr;
+};
+
+#define ITEM_FUNC(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)s, \
+ }
+
+#define ITEM_DATA(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)&s, \
+ }
+
+
+static int kallsyms_test_var_bss_static;
+static int kallsyms_test_var_data_static = 1;
+int kallsyms_test_var_bss;
+int kallsyms_test_var_data = 1;
+
+static int kallsyms_test_func_static(void)
+{
+ kallsyms_test_var_bss_static++;
+ kallsyms_test_var_data_static++;
+
+ return 0;
+}
+
+int kallsyms_test_func(void)
+{
+ return kallsyms_test_func_static();
+}
+
+__weak int kallsyms_test_func_weak(void)
+{
+ kallsyms_test_var_bss++;
+ kallsyms_test_var_data++;
+ return 0;
+}
+
+static struct test_item test_items[] = {
+ ITEM_FUNC(kallsyms_test_func_static),
+ ITEM_FUNC(kallsyms_test_func),
+ ITEM_FUNC(kallsyms_test_func_weak),
+ ITEM_FUNC(vmalloc),
+ ITEM_FUNC(vfree),
+#ifdef CONFIG_KALLSYMS_ALL
+ ITEM_DATA(kallsyms_test_var_bss_static),
+ ITEM_DATA(kallsyms_test_var_data_static),
+ ITEM_DATA(kallsyms_test_var_bss),
+ ITEM_DATA(kallsyms_test_var_data),
+ ITEM_DATA(vmap_area_list),
+#endif
+};
+
+static char stub_name[KSYM_NAME_LEN];
+
+static int stat_symbol_len(void *data, const char *name, struct module *mod, unsigned long addr)
+{
+ *(u32 *)data += strlen(name);
+
+ return 0;
+}
+
+static void test_kallsyms_compression_ratio(void)
+{
+ u32 pos, off, len, num;
+ u32 ratio, total_size, total_len = 0;
+
+ kallsyms_on_each_symbol(stat_symbol_len, &total_len);
+
+ /*
+ * A symbol name cannot start with a number. This stub name helps us
+ * traverse the entire symbol table without finding a match. It's used
+ * for subsequent performance tests, and its length is the average
+ * length of all symbol names.
+ */
+ memset(stub_name, '4', sizeof(stub_name));
+ pos = total_len / kallsyms_num_syms;
+ stub_name[pos] = 0;
+
+ pos = 0;
+ num = 0;
+ off = 0;
+ while (pos < kallsyms_num_syms) {
+ len = kallsyms_names[off];
+ num++;
+ off++;
+ pos++;
+ if ((len & 0x80) != 0) {
+ len = (len & 0x7f) | (kallsyms_names[off] << 7);
+ num++;
+ off++;
+ }
+ off += len;
+ }
+
+ /*
+ * 1. The length fields is not counted
+ * 2. The memory occupied by array kallsyms_token_table[] and
+ * kallsyms_token_index[] needs to be counted.
+ */
+ total_size = off - num;
+ pos = kallsyms_token_index[0xff];
+ total_size += pos + strlen(&kallsyms_token_table[pos]) + 1;
+ total_size += 0x100 * sizeof(u16);
+
+ pr_info(" ---------------------------------------------------------\n");
+ pr_info("| nr_symbols | compressed size | original size | ratio(%%) |\n");
+ pr_info("|---------------------------------------------------------|\n");
+ ratio = (u32)div_u64(10000ULL * total_size, total_len);
+ pr_info("| %10d | %10d | %10d | %2d.%-2d |\n",
+ kallsyms_num_syms, total_size, total_len, ratio / 100, ratio % 100);
+ pr_info(" ---------------------------------------------------------\n");
+}
+
+static int lookup_name(void *data, const char *name, struct module *mod, unsigned long addr)
+{
+ u64 t0, t1, t;
+ unsigned long flags;
+ struct test_stat *stat = (struct test_stat *)data;
+
+ local_irq_save(flags);
+ t0 = sched_clock();
+ (void)kallsyms_lookup_name(name);
+ t1 = sched_clock();
+ local_irq_restore(flags);
+
+ t = t1 - t0;
+ if (t < stat->min)
+ stat->min = t;
+
+ if (t > stat->max)
+ stat->max = t;
+
+ stat->real_cnt++;
+ stat->sum += t;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_lookup_name(void)
+{
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.min = INT_MAX;
+ kallsyms_on_each_symbol(lookup_name, &stat);
+ pr_info("kallsyms_lookup_name() looked up %d symbols\n", stat.real_cnt);
+ pr_info("The time spent on each symbol is (ns): min=%d, max=%d, avg=%lld\n",
+ stat.min, stat.max, div_u64(stat.sum, stat.real_cnt));
+}
+
+static bool match_cleanup_name(const char *s, const char *name)
+{
+ char *p;
+ int len;
+
+ if (!IS_ENABLED(CONFIG_LTO_CLANG))
+ return false;
+
+ p = strchr(s, '.');
+ if (!p)
+ return false;
+
+ len = strlen(name);
+ if (p - s != len)
+ return false;
+
+ return !strncmp(s, name, len);
+}
+
+static int find_symbol(void *data, const char *name, struct module *mod, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ if (strcmp(name, stat->name) == 0 ||
+ (!stat->perf && match_cleanup_name(name, stat->name))) {
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_symbol(void)
+{
+ u64 t0, t1;
+ unsigned long flags;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ stat.perf = 1;
+ local_irq_save(flags);
+ t0 = sched_clock();
+ kallsyms_on_each_symbol(find_symbol, &stat);
+ t1 = sched_clock();
+ local_irq_restore(flags);
+ pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int match_symbol(void *data, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_match_symbol(void)
+{
+ u64 t0, t1;
+ unsigned long flags;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ local_irq_save(flags);
+ t0 = sched_clock();
+ kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat);
+ t1 = sched_clock();
+ local_irq_restore(flags);
+ pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int test_kallsyms_basic_function(void)
+{
+ int i, j, ret;
+ int next = 0, nr_failed = 0;
+ char *prefix;
+ unsigned short rand;
+ unsigned long addr, lookup_addr;
+ char namebuf[KSYM_NAME_LEN];
+ struct test_stat *stat, *stat2;
+
+ stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL);
+ if (!stat)
+ return -ENOMEM;
+ stat2 = stat + 1;
+
+ prefix = "kallsyms_lookup_name() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ addr = kallsyms_lookup_name(test_items[i].name);
+ if (addr != test_items[i].addr) {
+ nr_failed++;
+ pr_info("%s %s failed: addr=%lx, expect %lx\n",
+ prefix, test_items[i].name, addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_symbol(find_symbol, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_match_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_match_symbol(match_symbol, test_items[i].name, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ if (nr_failed) {
+ kfree(stat);
+ return -ESRCH;
+ }
+
+ for (i = 0; i < kallsyms_num_syms; i++) {
+ addr = kallsyms_sym_address(i);
+ if (!is_ksym_addr(addr))
+ continue;
+
+ ret = lookup_symbol_name(addr, namebuf);
+ if (unlikely(ret)) {
+ namebuf[0] = 0;
+ goto failed;
+ }
+
+ /*
+ * The first '.' may be the initial letter, in which case the
+ * entire symbol name will be truncated to an empty string in
+ * cleanup_symbol_name(). Do not test these symbols.
+ *
+ * For example:
+ * cat /proc/kallsyms | awk '{print $3}' | grep -E "^\." | head
+ * .E_read_words
+ * .E_leading_bytes
+ * .E_trailing_bytes
+ * .E_write_words
+ * .E_copy
+ * .str.292.llvm.12122243386960820698
+ * .str.24.llvm.12122243386960820698
+ * .str.29.llvm.12122243386960820698
+ * .str.75.llvm.12122243386960820698
+ * .str.99.llvm.12122243386960820698
+ */
+ if (IS_ENABLED(CONFIG_LTO_CLANG) && !namebuf[0])
+ continue;
+
+ lookup_addr = kallsyms_lookup_name(namebuf);
+
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ kallsyms_on_each_match_symbol(match_symbol, namebuf, stat);
+
+ /*
+ * kallsyms_on_each_symbol() is too slow, randomly select some
+ * symbols for test.
+ */
+ if (i >= next) {
+ memset(stat2, 0, sizeof(*stat2));
+ stat2->max = INT_MAX;
+ stat2->name = namebuf;
+ kallsyms_on_each_symbol(find_symbol, stat2);
+
+ /*
+ * kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol()
+ * need to get the same traversal result.
+ */
+ if (stat->addr != stat2->addr ||
+ stat->real_cnt != stat2->real_cnt ||
+ memcmp(stat->addrs, stat2->addrs,
+ stat->save_cnt * sizeof(stat->addrs[0])))
+ goto failed;
+
+ /*
+ * The average of random increments is 128, that is, one of
+ * them is tested every 128 symbols.
+ */
+ get_random_bytes(&rand, sizeof(rand));
+ next = i + (rand & 0xff) + 1;
+ }
+
+ /* Need to be found at least once */
+ if (!stat->real_cnt)
+ goto failed;
+
+ /*
+ * kallsyms_lookup_name() returns the address of the first
+ * symbol found and cannot be NULL.
+ */
+ if (!lookup_addr || lookup_addr != stat->addrs[0])
+ goto failed;
+
+ /*
+ * If the addresses of all matching symbols are recorded, the
+ * target address needs to be exist.
+ */
+ if (stat->real_cnt <= MAX_NUM_OF_RECORDS) {
+ for (j = 0; j < stat->save_cnt; j++) {
+ if (stat->addrs[j] == addr)
+ break;
+ }
+
+ if (j == stat->save_cnt)
+ goto failed;
+ }
+ }
+
+ kfree(stat);
+
+ return 0;
+
+failed:
+ pr_info("Test for %dth symbol failed: (%s) addr=%lx", i, namebuf, addr);
+ kfree(stat);
+ return -ESRCH;
+}
+
+static int test_entry(void *p)
+{
+ int ret;
+
+ do {
+ schedule_timeout(5 * HZ);
+ } while (system_state != SYSTEM_RUNNING);
+
+ pr_info("start\n");
+ ret = test_kallsyms_basic_function();
+ if (ret) {
+ pr_info("abort\n");
+ return 0;
+ }
+
+ test_kallsyms_compression_ratio();
+ test_perf_kallsyms_lookup_name();
+ test_perf_kallsyms_on_each_symbol();
+ test_perf_kallsyms_on_each_match_symbol();
+ pr_info("finish\n");
+
+ return 0;
+}
+
+static int __init kallsyms_test_init(void)
+{
+ struct task_struct *t;
+
+ t = kthread_create(test_entry, NULL, "kallsyms_test");
+ if (IS_ERR(t)) {
+ pr_info("Create kallsyms selftest task failed\n");
+ return PTR_ERR(t);
+ }
+ kthread_bind(t, 0);
+ wake_up_process(t);
+
+ return 0;
+}
+late_initcall(kallsyms_test_init);
diff --git a/kernel/kallsyms_selftest.h b/kernel/kallsyms_selftest.h
new file mode 100644
index 000000000000..c0ca548e2a22
--- /dev/null
+++ b/kernel/kallsyms_selftest.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef LINUX_KALLSYMS_SELFTEST_H_
+#define LINUX_KALLSYMS_SELFTEST_H_
+
+#include <linux/types.h>
+
+extern int kallsyms_test_var_bss;
+extern int kallsyms_test_var_data;
+
+extern int kallsyms_test_func(void);
+extern int kallsyms_test_func_weak(void);
+
+#endif // LINUX_KALLSYMS_SELFTEST_H_
diff --git a/kernel/kcov.c b/kernel/kcov.c
index e19c84b02452..e5cd09fd8a05 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/hashtable.h>
#include <linux/init.h>
+#include <linux/kmsan-checks.h>
#include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/printk.h>
@@ -152,6 +153,12 @@ static void kcov_remote_area_put(struct kcov_remote_area *area,
INIT_LIST_HEAD(&area->list);
area->size = size;
list_add(&area->list, &kcov_remote_areas);
+ /*
+ * KMSAN doesn't instrument this file, so it may not know area->list
+ * is initialized. Unpoison it explicitly to avoid reports in
+ * kcov_remote_area_get().
+ */
+ kmsan_unpoison_memory(&area->list, sizeof(area->list));
}
static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index 4f35d1bced6a..8cf70f068d92 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -17,4 +17,5 @@ KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
+CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN)
obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index fe12dfe254ec..54d077e1a2dc 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -14,10 +14,12 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/sched.h>
+#include <linux/string.h>
#include <linux/uaccess.h>
#include "encoding.h"
@@ -1308,3 +1310,51 @@ noinline void __tsan_atomic_signal_fence(int memorder)
}
}
EXPORT_SYMBOL(__tsan_atomic_signal_fence);
+
+#ifdef __HAVE_ARCH_MEMSET
+void *__tsan_memset(void *s, int c, size_t count);
+noinline void *__tsan_memset(void *s, int c, size_t count)
+{
+ /*
+ * Instead of not setting up watchpoints where accessed size is greater
+ * than MAX_ENCODABLE_SIZE, truncate checked size to MAX_ENCODABLE_SIZE.
+ */
+ size_t check_len = min_t(size_t, count, MAX_ENCODABLE_SIZE);
+
+ check_access(s, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ return memset(s, c, count);
+}
+#else
+void *__tsan_memset(void *s, int c, size_t count) __alias(memset);
+#endif
+EXPORT_SYMBOL(__tsan_memset);
+
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__tsan_memmove(void *dst, const void *src, size_t len);
+noinline void *__tsan_memmove(void *dst, const void *src, size_t len)
+{
+ size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE);
+
+ check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ check_access(src, check_len, 0, _RET_IP_);
+ return memmove(dst, src, len);
+}
+#else
+void *__tsan_memmove(void *dst, const void *src, size_t len) __alias(memmove);
+#endif
+EXPORT_SYMBOL(__tsan_memmove);
+
+#ifdef __HAVE_ARCH_MEMCPY
+void *__tsan_memcpy(void *dst, const void *src, size_t len);
+noinline void *__tsan_memcpy(void *dst, const void *src, size_t len)
+{
+ size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE);
+
+ check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ check_access(src, check_len, 0, _RET_IP_);
+ return memcpy(dst, src, len);
+}
+#else
+void *__tsan_memcpy(void *dst, const void *src, size_t len) __alias(memcpy);
+#endif
+EXPORT_SYMBOL(__tsan_memcpy);
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 67794404042a..e95ce7d7a76e 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -492,8 +492,7 @@ static void print_report(enum kcsan_value_change value_change,
dump_stack_print_info(KERN_DEFAULT);
pr_err("==================================================================\n");
- if (panic_on_warn)
- panic("panic_on_warn set ...\n");
+ check_panic_on_warn("KCSAN");
}
static void release_report(unsigned long *flags, struct other_info *other_info)
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 75712959c84e..8679322450f2 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -22,13 +22,6 @@
#define ITERS_PER_TEST 2000
-/* Test requirements. */
-static bool __init test_requires(void)
-{
- /* random should be initialized for the below tests */
- return prandom_u32() + prandom_u32() != 0;
-}
-
/*
* Test watchpoint encode and decode: check that encoding some access's info,
* and then subsequent decode preserves the access's info.
@@ -38,15 +31,15 @@ static bool __init test_encode_decode(void)
int i;
for (i = 0; i < ITERS_PER_TEST; ++i) {
- size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
- bool is_write = !!prandom_u32_max(2);
+ size_t size = get_random_u32_inclusive(1, MAX_ENCODABLE_SIZE);
+ bool is_write = !!get_random_u32_below(2);
unsigned long verif_masked_addr;
long encoded_watchpoint;
bool verif_is_write;
unsigned long addr;
size_t verif_size;
- prandom_bytes(&addr, sizeof(addr));
+ get_random_bytes(&addr, sizeof(addr));
if (addr < PAGE_SIZE)
addr = PAGE_SIZE;
@@ -259,7 +252,6 @@ static int __init kcsan_selftest(void)
pr_err("selftest: " #do_test " failed"); \
} while (0)
- RUN_TEST(test_requires);
RUN_TEST(test_encode_decode);
RUN_TEST(test_matching_access);
RUN_TEST(test_barrier);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b5e40f069768..cb8e6e6f983c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -93,13 +93,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
/*
* Because we write directly to the reserved memory region when loading
- * crash kernels we need a mutex here to prevent multiple crash kernels
- * from attempting to load simultaneously, and to prevent a crash kernel
- * from loading over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
+ * crash kernels we need a serialization here to prevent multiple crash
+ * kernels from attempting to load simultaneously.
*/
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
if (flags & KEXEC_ON_CRASH) {
@@ -165,7 +162,7 @@ out:
kimage_free(image);
out_unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return ret;
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index acd029b307e4..969e8f52f7da 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -46,7 +46,7 @@
#include <crypto/hash.h>
#include "kexec_internal.h"
-DEFINE_MUTEX(kexec_mutex);
+atomic_t __kexec_lock = ATOMIC_INIT(0);
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
@@ -561,23 +561,17 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
static int kimage_set_destination(struct kimage *image,
unsigned long destination)
{
- int result;
-
destination &= PAGE_MASK;
- result = kimage_add_entry(image, destination | IND_DESTINATION);
- return result;
+ return kimage_add_entry(image, destination | IND_DESTINATION);
}
static int kimage_add_page(struct kimage *image, unsigned long page)
{
- int result;
-
page &= PAGE_MASK;
- result = kimage_add_entry(image, page | IND_SOURCE);
- return result;
+ return kimage_add_entry(image, page | IND_SOURCE);
}
@@ -809,7 +803,7 @@ static int kimage_load_normal_segment(struct kimage *image,
if (result < 0)
goto out;
- ptr = kmap(page);
+ ptr = kmap_local_page(page);
/* Start with a clear page */
clear_page(ptr);
ptr += maddr & ~PAGE_MASK;
@@ -822,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image,
memcpy(ptr, kbuf, uchunk);
else
result = copy_from_user(ptr, buf, uchunk);
- kunmap(page);
+ kunmap_local(ptr);
if (result) {
result = -EFAULT;
goto out;
@@ -873,7 +867,7 @@ static int kimage_load_crash_segment(struct kimage *image,
goto out;
}
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
- ptr = kmap(page);
+ ptr = kmap_local_page(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
PAGE_SIZE - (maddr & ~PAGE_MASK));
@@ -889,7 +883,7 @@ static int kimage_load_crash_segment(struct kimage *image,
else
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
- kunmap(page);
+ kunmap_local(ptr);
arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
@@ -959,7 +953,7 @@ late_initcall(kexec_core_sysctl_init);
*/
void __noclone __crash_kexec(struct pt_regs *regs)
{
- /* Take the kexec_mutex here to prevent sys_kexec_load
+ /* Take the kexec_lock here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu.
*
@@ -967,7 +961,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
* of memory the xchg(&kexec_crash_image) would be
* sufficient. But since I reuse the memory...
*/
- if (mutex_trylock(&kexec_mutex)) {
+ if (kexec_trylock()) {
if (kexec_crash_image) {
struct pt_regs fixed_regs;
@@ -976,7 +970,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
machine_crash_shutdown(&fixed_regs);
machine_kexec(kexec_crash_image);
}
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
}
}
STACK_FRAME_NON_STANDARD(__crash_kexec);
@@ -1004,14 +998,17 @@ void crash_kexec(struct pt_regs *regs)
}
}
-size_t crash_get_memory_size(void)
+ssize_t crash_get_memory_size(void)
{
- size_t size = 0;
+ ssize_t size = 0;
+
+ if (!kexec_trylock())
+ return -EBUSY;
- mutex_lock(&kexec_mutex);
if (crashk_res.end != crashk_res.start)
size = resource_size(&crashk_res);
- mutex_unlock(&kexec_mutex);
+
+ kexec_unlock();
return size;
}
@@ -1022,7 +1019,8 @@ int crash_shrink_memory(unsigned long new_size)
unsigned long old_size;
struct resource *ram_res;
- mutex_lock(&kexec_mutex);
+ if (!kexec_trylock())
+ return -EBUSY;
if (kexec_crash_image) {
ret = -ENOENT;
@@ -1060,7 +1058,7 @@ int crash_shrink_memory(unsigned long new_size)
insert_resource(&iomem_resource, ram_res);
unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return ret;
}
@@ -1132,7 +1130,7 @@ int kernel_kexec(void)
{
int error = 0;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
if (!kexec_image) {
error = -EINVAL;
@@ -1208,6 +1206,6 @@ int kernel_kexec(void)
#endif
Unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return error;
}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 1d546dc97c50..dd5983010b7b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -339,7 +339,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
image = NULL;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
dest_image = &kexec_image;
@@ -411,7 +411,7 @@ out:
if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
arch_kexec_protect_crashkres();
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
kimage_free(image);
return ret;
}
@@ -1141,7 +1141,7 @@ int crash_exclude_mem_range(struct crash_mem *mem,
{
int i, j;
unsigned long long start, end, p_start, p_end;
- struct crash_mem_range temp_range = {0, 0};
+ struct range temp_range = {0, 0};
for (i = 0; i < mem->nr_ranges; i++) {
start = mem->ranges[i].start;
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 48aaf2ac0d0d..74da1409cd14 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -13,7 +13,20 @@ void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
-extern struct mutex kexec_mutex;
+/*
+ * Whatever is used to serialize accesses to the kexec_crash_image needs to be
+ * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a
+ * "simple" atomic variable that is acquired with a cmpxchg().
+ */
+extern atomic_t __kexec_lock;
+static inline bool kexec_trylock(void)
+{
+ return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0;
+}
+static inline void kexec_unlock(void)
+{
+ atomic_set_release(&__kexec_lock, 0);
+}
#ifdef CONFIG_KEXEC_FILE
#include <linux/purgatory.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ca9d834d0b84..1c18ecf9f98b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1607,9 +1607,10 @@ int register_kprobe(struct kprobe *p)
struct kprobe *old_p;
struct module *probed_mod;
kprobe_opcode_t *addr;
+ bool on_func_entry;
/* Adjust probe address from symbol */
- addr = kprobe_addr(p);
+ addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
if (IS_ERR(addr))
return PTR_ERR(addr);
p->addr = addr;
@@ -1629,6 +1630,9 @@ int register_kprobe(struct kprobe *p)
mutex_lock(&kprobe_mutex);
+ if (on_func_entry)
+ p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY;
+
old_p = get_kprobe(p->addr);
if (old_p) {
/* Since this may unoptimize 'old_p', locking 'text_mutex'. */
@@ -1762,7 +1766,13 @@ static int __unregister_kprobe_top(struct kprobe *p)
if ((list_p != p) && (list_p->post_handler))
goto noclean;
}
- ap->post_handler = NULL;
+ /*
+ * For the kprobe-on-ftrace case, we keep the
+ * post_handler setting to identify this aggrprobe
+ * armed with kprobe_ipmodify_ops.
+ */
+ if (!kprobe_ftrace(ap))
+ ap->post_handler = NULL;
}
noclean:
/*
@@ -2203,13 +2213,9 @@ int register_kretprobe(struct kretprobe *rp)
rp->kp.post_handler = NULL;
/* Pre-allocate memory for max kretprobe instances */
- if (rp->maxactive <= 0) {
-#ifdef CONFIG_PREEMPTION
+ if (rp->maxactive <= 0)
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
-#else
- rp->maxactive = num_possible_cpus();
-#endif
- }
+
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler);
if (!rp->rh)
@@ -2354,6 +2360,14 @@ static void kill_kprobe(struct kprobe *p)
lockdep_assert_held(&kprobe_mutex);
+ /*
+ * The module is going away. We should disarm the kprobe which
+ * is using ftrace, because ftrace framework is still available at
+ * 'MODULE_STATE_GOING' notification.
+ */
+ if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
+ disarm_kprobe_ftrace(p);
+
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
@@ -2370,14 +2384,6 @@ static void kill_kprobe(struct kprobe *p)
* the original probed function (which will be freed soon) any more.
*/
arch_remove_kprobe(p);
-
- /*
- * The module is going away. We should disarm the kprobe which
- * is using ftrace, because ftrace framework is still available at
- * 'MODULE_STATE_GOING' notification.
- */
- if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
- disarm_kprobe_ftrace(p);
}
/* Disable one kprobe */
@@ -2425,8 +2431,11 @@ int enable_kprobe(struct kprobe *kp)
if (!kprobes_all_disarmed && kprobe_disabled(p)) {
p->flags &= ~KPROBE_FLAG_DISABLED;
ret = arm_kprobe(p);
- if (ret)
+ if (ret) {
p->flags |= KPROBE_FLAG_DISABLED;
+ if (p != kp)
+ kp->flags |= KPROBE_FLAG_DISABLED;
+ }
}
out:
mutex_unlock(&kprobe_mutex);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index b1292a57c2a5..2df00b789b90 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -6,6 +6,7 @@
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
*/
+#include <asm/byteorder.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
@@ -20,6 +21,14 @@
#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */
+#if defined(__LITTLE_ENDIAN)
+#define CPU_BYTEORDER_STRING "little"
+#elif defined(__BIG_ENDIAN)
+#define CPU_BYTEORDER_STRING "big"
+#else
+#error Unknown byteorder
+#endif
+
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -34,6 +43,14 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(uevent_seqnum);
+/* cpu byteorder */
+static ssize_t cpu_byteorder_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING);
+}
+KERNEL_ATTR_RO(cpu_byteorder);
+
#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
@@ -105,7 +122,12 @@ KERNEL_ATTR_RO(kexec_crash_loaded);
static ssize_t kexec_crash_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%zu\n", crash_get_memory_size());
+ ssize_t size = crash_get_memory_size();
+
+ if (size < 0)
+ return size;
+
+ return sprintf(buf, "%zd\n", size);
}
static ssize_t kexec_crash_size_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -210,6 +232,7 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
&uevent_seqnum_attr.attr,
+ &cpu_byteorder_attr.attr,
#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3c677918d8f2..f97fd01a2932 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -704,6 +704,7 @@ int kthread_stop(struct task_struct *k)
kthread = to_kthread(k);
set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
kthread_unpark(k);
+ set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL);
wake_up_process(k);
wait_for_completion(&kthread->exited);
ret = kthread->result;
@@ -1050,8 +1051,7 @@ static void __kthread_queue_delayed_work(struct kthread_worker *worker,
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
- WARN_ON_FUNCTION_MISMATCH(timer->function,
- kthread_delayed_work_timer_fn);
+ WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
/*
* If @delay is 0, queue @dwork->work immediately. This is for
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 76166df011a4..781249098cb6 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -112,7 +112,7 @@ static void __sched
account_global_scheduler_latency(struct task_struct *tsk,
struct latency_record *lat)
{
- int firstnonnull = MAXLR + 1;
+ int firstnonnull = MAXLR;
int i;
/* skip kernel threads for now */
@@ -150,7 +150,7 @@ account_global_scheduler_latency(struct task_struct *tsk,
}
i = firstnonnull;
- if (i >= MAXLR - 1)
+ if (i >= MAXLR)
return;
/* Allocted a new one: */
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index bc475e62279d..201f0c0482fb 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -125,20 +125,10 @@ struct klp_find_arg {
unsigned long pos;
};
-static int klp_find_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
+static int klp_match_callback(void *data, unsigned long addr)
{
struct klp_find_arg *args = data;
- if ((mod && !args->objname) || (!mod && args->objname))
- return 0;
-
- if (strcmp(args->name, name))
- return 0;
-
- if (args->objname && strcmp(args->objname, mod->name))
- return 0;
-
args->addr = addr;
args->count++;
@@ -153,6 +143,23 @@ static int klp_find_callback(void *data, const char *name,
return 0;
}
+static int klp_find_callback(void *data, const char *name,
+ struct module *mod, unsigned long addr)
+{
+ struct klp_find_arg *args = data;
+
+ if ((mod && !args->objname) || (!mod && args->objname))
+ return 0;
+
+ if (strcmp(args->name, name))
+ return 0;
+
+ if (args->objname && strcmp(args->objname, mod->name))
+ return 0;
+
+ return klp_match_callback(data, addr);
+}
+
static int klp_find_object_symbol(const char *objname, const char *name,
unsigned long sympos, unsigned long *addr)
{
@@ -167,7 +174,7 @@ static int klp_find_object_symbol(const char *objname, const char *name,
if (objname)
module_kallsyms_on_each_symbol(klp_find_callback, &args);
else
- kallsyms_on_each_symbol(klp_find_callback, &args);
+ kallsyms_on_each_match_symbol(klp_match_callback, name, &args);
/*
* Ensure an address was found. If sympos is 0, ensure symbol is unique;
@@ -213,7 +220,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
* we use the smallest/strictest upper bound possible (56, based on
* the current definition of MODULE_NAME_LEN) to prevent overflows.
*/
- BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+ BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 512);
relas = (Elf_Rela *) relasec->sh_addr;
/* For each rela in this klp relocation section */
@@ -227,7 +234,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
/* Format: .klp.sym.sym_objname.sym_name,sympos */
cnt = sscanf(strtab + sym->st_name,
- ".klp.sym.%55[^.].%127[^,],%lu",
+ ".klp.sym.%55[^.].%511[^,],%lu",
sym_objname, sym_name, &sympos);
if (cnt != 3) {
pr_err("symbol %s has an incorrectly formatted name\n",
@@ -325,6 +332,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
* /sys/kernel/livepatch/<patch>/transition
* /sys/kernel/livepatch/<patch>/force
* /sys/kernel/livepatch/<patch>/<object>
+ * /sys/kernel/livepatch/<patch>/<object>/patched
* /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
static int __klp_disable_patch(struct klp_patch *patch);
@@ -431,6 +439,22 @@ static struct attribute *klp_patch_attrs[] = {
};
ATTRIBUTE_GROUPS(klp_patch);
+static ssize_t patched_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_object *obj;
+
+ obj = container_of(kobj, struct klp_object, kobj);
+ return sysfs_emit(buf, "%d\n", obj->patched);
+}
+
+static struct kobj_attribute patched_kobj_attr = __ATTR_RO(patched);
+static struct attribute *klp_object_attrs[] = {
+ &patched_kobj_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(klp_object);
+
static void klp_free_object_dynamic(struct klp_object *obj)
{
kfree(obj->name);
@@ -576,6 +600,7 @@ static void klp_kobj_release_object(struct kobject *kobj)
static struct kobj_type klp_ktype_object = {
.release = klp_kobj_release_object,
.sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = klp_object_groups,
};
static void klp_kobj_release_func(struct kobject *kobj)
@@ -1171,7 +1196,7 @@ int klp_module_coming(struct module *mod)
return -EINVAL;
if (!strcmp(mod->name, "vmlinux")) {
- pr_err("vmlinux.ko: invalid module name");
+ pr_err("vmlinux.ko: invalid module name\n");
return -EINVAL;
}
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 4c4f5a776d80..4152c71507e2 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -118,7 +118,7 @@ static void notrace klp_ftrace_handler(unsigned long ip,
if (func->nop)
goto unlock;
- ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func);
+ ftrace_regs_set_instruction_pointer(fregs, (unsigned long)func->new_func);
unlock:
ftrace_test_recursion_unlock(bit);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 5d03a2ad1066..f1b25ec581e0 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -196,36 +196,36 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
struct klp_ops *ops;
int i;
- for (i = 0; i < nr_entries; i++) {
- address = entries[i];
+ if (klp_target_state == KLP_UNPATCHED) {
+ /*
+ * Check for the to-be-unpatched function
+ * (the func itself).
+ */
+ func_addr = (unsigned long)func->new_func;
+ func_size = func->new_size;
+ } else {
+ /*
+ * Check for the to-be-patched function
+ * (the previous func).
+ */
+ ops = klp_find_ops(func->old_func);
- if (klp_target_state == KLP_UNPATCHED) {
- /*
- * Check for the to-be-unpatched function
- * (the func itself).
- */
- func_addr = (unsigned long)func->new_func;
- func_size = func->new_size;
+ if (list_is_singular(&ops->func_stack)) {
+ /* original function */
+ func_addr = (unsigned long)func->old_func;
+ func_size = func->old_size;
} else {
- /*
- * Check for the to-be-patched function
- * (the previous func).
- */
- ops = klp_find_ops(func->old_func);
-
- if (list_is_singular(&ops->func_stack)) {
- /* original function */
- func_addr = (unsigned long)func->old_func;
- func_size = func->old_size;
- } else {
- /* previously patched function */
- struct klp_func *prev;
-
- prev = list_next_entry(func, stack_node);
- func_addr = (unsigned long)prev->new_func;
- func_size = prev->new_size;
- }
+ /* previously patched function */
+ struct klp_func *prev;
+
+ prev = list_next_entry(func, stack_node);
+ func_addr = (unsigned long)prev->new_func;
+ func_size = prev->new_size;
}
+ }
+
+ for (i = 0; i < nr_entries; i++) {
+ address = entries[i];
if (address >= func_addr && address < func_addr + func_size)
return -EAGAIN;
@@ -610,9 +610,23 @@ void klp_reverse_transition(void)
/* Called from copy_process() during fork */
void klp_copy_process(struct task_struct *child)
{
- child->patch_state = current->patch_state;
- /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
+ /*
+ * The parent process may have gone through a KLP transition since
+ * the thread flag was copied in setup_thread_stack earlier. Bring
+ * the task flag up to date with the parent here.
+ *
+ * The operation is serialized against all klp_*_transition()
+ * operations by the tasklist_lock. The only exception is
+ * klp_update_patch_state(current), but we cannot race with
+ * that because we are current.
+ */
+ if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
+ set_tsk_thread_flag(child, TIF_PATCH_PENDING);
+ else
+ clear_tsk_thread_flag(child, TIF_PATCH_PENDING);
+
+ child->patch_state = current->patch_state;
}
/*
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index d51cabf28f38..0db4093d17b8 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -5,7 +5,7 @@ KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
-# Avoid recursion lockdep -> KCSAN -> ... -> lockdep.
+# Avoid recursion lockdep -> sanitizer -> ... -> lockdep.
KCSAN_SANITIZE_lockdep.o := n
ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 64a13eb56078..e3375bc40dad 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -934,8 +934,10 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
* Huh! same key, different name? Did someone trample
* on some memory? We're most confused.
*/
- WARN_ON_ONCE(class->name != lock->name &&
- lock->key != &__lockdep_no_validate__);
+ WARN_ONCE(class->name != lock->name &&
+ lock->key != &__lockdep_no_validate__,
+ "Looking for class \"%s\" with key %ps, but found a different class \"%s\" with the same key\n",
+ lock->name, lock->key, class->name);
return class;
}
}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 5fe4c5495ba3..185bd1c906b0 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -192,6 +192,12 @@ EXPORT_SYMBOL_GPL(__percpu_down_read);
__sum; \
})
+bool percpu_is_read_locked(struct percpu_rw_semaphore *sem)
+{
+ return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block);
+}
+EXPORT_SYMBOL_GPL(percpu_is_read_locked);
+
/*
* Return true if the modular sum of the sem->read_count per-CPU variable is
* zero. If this sum is zero, then it is stable due to the fact that if any
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 2e1600906c9f..d2ef312a8611 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -18,7 +18,7 @@
* queued_read_lock_slowpath - acquire read lock of a queued rwlock
* @lock: Pointer to queued rwlock structure
*/
-void queued_read_lock_slowpath(struct qrwlock *lock)
+void __lockfunc queued_read_lock_slowpath(struct qrwlock *lock)
{
/*
* Readers come here when they cannot get the lock without waiting
@@ -63,7 +63,7 @@ EXPORT_SYMBOL(queued_read_lock_slowpath);
* queued_write_lock_slowpath - acquire write lock of a queued rwlock
* @lock : Pointer to queued rwlock structure
*/
-void queued_write_lock_slowpath(struct qrwlock *lock)
+void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
{
int cnts;
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 65a9a10caa6f..2b23378775fe 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -313,7 +313,7 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
* contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
* queue : ^--' :
*/
-void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
struct mcs_spinlock *prev, *next, *node;
u32 old, tail;
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e84d21aa0722..6afc249ce697 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -489,7 +489,7 @@ gotlock:
* PV versions of the unlock fastpath and slowpath functions to be used
* instead of queued_spin_unlock().
*/
-__visible void
+__visible __lockfunc void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
struct pv_node *node;
@@ -544,7 +544,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
#include <asm/qspinlock_paravirt.h>
#ifndef __pv_queued_spin_unlock
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock)
{
u8 locked;
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 65f0262f635e..44873594de03 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -133,14 +133,19 @@
* the owner value concurrently without lock. Read from owner, however,
* may not need READ_ONCE() as long as the pointer value is only used
* for comparison and isn't being dereferenced.
+ *
+ * Both rwsem_{set,clear}_owner() functions should be in the same
+ * preempt disable section as the atomic op that changes sem->count.
*/
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
+ lockdep_assert_preemption_disabled();
atomic_long_set(&sem->owner, (long)current);
}
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
+ lockdep_assert_preemption_disabled();
atomic_long_set(&sem->owner, 0);
}
@@ -251,13 +256,16 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
{
long tmp = RWSEM_UNLOCKED_VALUE;
+ bool ret = false;
+ preempt_disable();
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
rwsem_set_owner(sem);
- return true;
+ ret = true;
}
- return false;
+ preempt_enable();
+ return ret;
}
/*
@@ -1352,8 +1360,10 @@ static inline void __up_write(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
!rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
+ preempt_disable();
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
+ preempt_enable();
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
rwsem_wake(sem);
}
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index f2654d2fe43a..34bfae72f295 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -51,7 +51,7 @@ static noinline void __up(struct semaphore *sem);
* Use of this function is deprecated, please use down_interruptible() or
* down_killable() instead.
*/
-void down(struct semaphore *sem)
+void __sched down(struct semaphore *sem)
{
unsigned long flags;
@@ -74,7 +74,7 @@ EXPORT_SYMBOL(down);
* If the sleep is interrupted by a signal, this function will return -EINTR.
* If the semaphore is successfully acquired, this function returns 0.
*/
-int down_interruptible(struct semaphore *sem)
+int __sched down_interruptible(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
@@ -101,7 +101,7 @@ EXPORT_SYMBOL(down_interruptible);
* -EINTR. If the semaphore is successfully acquired, this function returns
* 0.
*/
-int down_killable(struct semaphore *sem)
+int __sched down_killable(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
@@ -131,7 +131,7 @@ EXPORT_SYMBOL(down_killable);
* Unlike mutex_trylock, this function can be used from interrupt context,
* and the semaphore can be released by any task or interrupt.
*/
-int down_trylock(struct semaphore *sem)
+int __sched down_trylock(struct semaphore *sem)
{
unsigned long flags;
int count;
@@ -156,7 +156,7 @@ EXPORT_SYMBOL(down_trylock);
* If the semaphore is not released within the specified number of jiffies,
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
*/
-int down_timeout(struct semaphore *sem, long timeout)
+int __sched down_timeout(struct semaphore *sem, long timeout)
{
unsigned long flags;
int result = 0;
@@ -180,7 +180,7 @@ EXPORT_SYMBOL(down_timeout);
* Release the semaphore. Unlike mutexes, up() may be called from any
* context and even by tasks which have never called down().
*/
-void up(struct semaphore *sem)
+void __sched up(struct semaphore *sem)
{
unsigned long flags;
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 7f49baaa4979..8475a0794f8c 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -133,7 +133,7 @@ BUILD_LOCK_OPS(write, rwlock);
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK
-int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
{
return __raw_spin_trylock(lock);
}
@@ -141,7 +141,7 @@ EXPORT_SYMBOL(_raw_spin_trylock);
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
-int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
{
return __raw_spin_trylock_bh(lock);
}
@@ -149,7 +149,7 @@ EXPORT_SYMBOL(_raw_spin_trylock_bh);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK
-void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
{
__raw_spin_lock(lock);
}
@@ -157,7 +157,7 @@ EXPORT_SYMBOL(_raw_spin_lock);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
+noinline unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
{
return __raw_spin_lock_irqsave(lock);
}
@@ -165,7 +165,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
-void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
{
__raw_spin_lock_irq(lock);
}
@@ -173,7 +173,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_BH
-void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
{
__raw_spin_lock_bh(lock);
}
@@ -181,7 +181,7 @@ EXPORT_SYMBOL(_raw_spin_lock_bh);
#endif
#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
-void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
{
__raw_spin_unlock(lock);
}
@@ -189,7 +189,7 @@ EXPORT_SYMBOL(_raw_spin_unlock);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
-void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
{
__raw_spin_unlock_irqrestore(lock, flags);
}
@@ -197,7 +197,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
-void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
{
__raw_spin_unlock_irq(lock);
}
@@ -205,7 +205,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
-void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
{
__raw_spin_unlock_bh(lock);
}
@@ -215,7 +215,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_bh);
#ifndef CONFIG_PREEMPT_RT
#ifndef CONFIG_INLINE_READ_TRYLOCK
-int __lockfunc _raw_read_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_read_trylock(rwlock_t *lock)
{
return __raw_read_trylock(lock);
}
@@ -223,7 +223,7 @@ EXPORT_SYMBOL(_raw_read_trylock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK
-void __lockfunc _raw_read_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock(rwlock_t *lock)
{
__raw_read_lock(lock);
}
@@ -231,7 +231,7 @@ EXPORT_SYMBOL(_raw_read_lock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
{
return __raw_read_lock_irqsave(lock);
}
@@ -239,7 +239,7 @@ EXPORT_SYMBOL(_raw_read_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQ
-void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
{
__raw_read_lock_irq(lock);
}
@@ -247,7 +247,7 @@ EXPORT_SYMBOL(_raw_read_lock_irq);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_BH
-void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
{
__raw_read_lock_bh(lock);
}
@@ -255,7 +255,7 @@ EXPORT_SYMBOL(_raw_read_lock_bh);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK
-void __lockfunc _raw_read_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock(rwlock_t *lock)
{
__raw_read_unlock(lock);
}
@@ -263,7 +263,7 @@ EXPORT_SYMBOL(_raw_read_unlock);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
-void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_read_unlock_irqrestore(lock, flags);
}
@@ -271,7 +271,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
-void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
{
__raw_read_unlock_irq(lock);
}
@@ -279,7 +279,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irq);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_BH
-void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
{
__raw_read_unlock_bh(lock);
}
@@ -287,7 +287,7 @@ EXPORT_SYMBOL(_raw_read_unlock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_TRYLOCK
-int __lockfunc _raw_write_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_write_trylock(rwlock_t *lock)
{
return __raw_write_trylock(lock);
}
@@ -295,7 +295,7 @@ EXPORT_SYMBOL(_raw_write_trylock);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK
-void __lockfunc _raw_write_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock(rwlock_t *lock)
{
__raw_write_lock(lock);
}
@@ -313,7 +313,7 @@ EXPORT_SYMBOL(_raw_write_lock_nested);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
{
return __raw_write_lock_irqsave(lock);
}
@@ -321,7 +321,7 @@ EXPORT_SYMBOL(_raw_write_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
-void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
{
__raw_write_lock_irq(lock);
}
@@ -329,7 +329,7 @@ EXPORT_SYMBOL(_raw_write_lock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_BH
-void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
{
__raw_write_lock_bh(lock);
}
@@ -337,7 +337,7 @@ EXPORT_SYMBOL(_raw_write_lock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK
-void __lockfunc _raw_write_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock(rwlock_t *lock)
{
__raw_write_unlock(lock);
}
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(_raw_write_unlock);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
-void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_write_unlock_irqrestore(lock, flags);
}
@@ -353,7 +353,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
-void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
{
__raw_write_unlock_irq(lock);
}
@@ -361,7 +361,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
-void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
{
__raw_write_unlock_bh(lock);
}
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 353004155d65..29dc253d03af 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -399,7 +399,7 @@ static int *get_random_order(int count)
order[n] = n;
for (n = count - 1; n > 1; n--) {
- r = get_random_int() % (n + 1);
+ r = get_random_u32_below(n + 1);
if (r != n) {
tmp = order[n];
order[n] = order[r];
@@ -538,7 +538,7 @@ static void stress_one_work(struct work_struct *work)
{
struct stress *stress = container_of(work, typeof(*stress), work);
const int nlocks = stress->nlocks;
- struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks);
+ struct ww_mutex *lock = stress->locks + get_random_u32_below(nlocks);
int err;
do {
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 26ea5d04f56c..424b3bc58f3f 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -221,9 +221,10 @@ endchoice
config MODULE_DECOMPRESS
bool "Support in-kernel module decompression"
- depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ
+ depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD
select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
select XZ_DEC if MODULE_COMPRESS_XZ
+ select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD
help
Support for decompressing kernel modules by the kernel itself
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index 4d0bcb3d9e44..bb79ac1a6d8f 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -50,7 +50,7 @@ static struct page *module_get_next_page(struct load_info *info)
return page;
}
-#ifdef CONFIG_MODULE_COMPRESS_GZIP
+#if defined(CONFIG_MODULE_COMPRESS_GZIP)
#include <linux/zlib.h>
#define MODULE_COMPRESSION gzip
#define MODULE_DECOMPRESS_FN module_gzip_decompress
@@ -114,8 +114,8 @@ static ssize_t module_gzip_decompress(struct load_info *info,
do {
struct page *page = module_get_next_page(info);
- if (!page) {
- retval = -ENOMEM;
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
goto out_inflate_end;
}
@@ -141,7 +141,7 @@ out:
kfree(s.workspace);
return retval;
}
-#elif CONFIG_MODULE_COMPRESS_XZ
+#elif defined(CONFIG_MODULE_COMPRESS_XZ)
#include <linux/xz.h>
#define MODULE_COMPRESSION xz
#define MODULE_DECOMPRESS_FN module_xz_decompress
@@ -173,8 +173,8 @@ static ssize_t module_xz_decompress(struct load_info *info,
do {
struct page *page = module_get_next_page(info);
- if (!page) {
- retval = -ENOMEM;
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
goto out;
}
@@ -199,6 +199,94 @@ static ssize_t module_xz_decompress(struct load_info *info,
xz_dec_end(xz_dec);
return retval;
}
+#elif defined(CONFIG_MODULE_COMPRESS_ZSTD)
+#include <linux/zstd.h>
+#define MODULE_COMPRESSION zstd
+#define MODULE_DECOMPRESS_FN module_zstd_decompress
+
+static ssize_t module_zstd_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ static const u8 signature[] = { 0x28, 0xb5, 0x2f, 0xfd };
+ ZSTD_outBuffer zstd_dec;
+ ZSTD_inBuffer zstd_buf;
+ zstd_frame_header header;
+ size_t wksp_size;
+ void *wksp = NULL;
+ ZSTD_DStream *dstream;
+ size_t ret;
+ size_t new_size = 0;
+ int retval;
+
+ if (size < sizeof(signature) ||
+ memcmp(buf, signature, sizeof(signature))) {
+ pr_err("not a zstd compressed module\n");
+ return -EINVAL;
+ }
+
+ zstd_buf.src = buf;
+ zstd_buf.pos = 0;
+ zstd_buf.size = size;
+
+ ret = zstd_get_frame_header(&header, zstd_buf.src, zstd_buf.size);
+ if (ret != 0) {
+ pr_err("ZSTD-compressed data has an incomplete frame header\n");
+ retval = -EINVAL;
+ goto out;
+ }
+ if (header.windowSize > (1 << ZSTD_WINDOWLOG_MAX)) {
+ pr_err("ZSTD-compressed data has too large a window size\n");
+ retval = -EINVAL;
+ goto out;
+ }
+
+ wksp_size = zstd_dstream_workspace_bound(header.windowSize);
+ wksp = kmalloc(wksp_size, GFP_KERNEL);
+ if (!wksp) {
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ dstream = zstd_init_dstream(header.windowSize, wksp, wksp_size);
+ if (!dstream) {
+ pr_err("Can't initialize ZSTD stream\n");
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ do {
+ struct page *page = module_get_next_page(info);
+
+ if (!IS_ERR(page)) {
+ retval = PTR_ERR(page);
+ goto out;
+ }
+
+ zstd_dec.dst = kmap_local_page(page);
+ zstd_dec.pos = 0;
+ zstd_dec.size = PAGE_SIZE;
+
+ ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf);
+ kunmap(page);
+ retval = zstd_get_error_code(ret);
+ if (retval)
+ break;
+
+ new_size += zstd_dec.pos;
+ } while (zstd_dec.pos == PAGE_SIZE && ret != 0);
+
+ if (retval) {
+ pr_err("ZSTD-decompression failed with status %d\n", retval);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ retval = new_size;
+
+ out:
+ kfree(wksp);
+ return retval;
+}
#else
#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS"
#endif
@@ -256,7 +344,7 @@ void module_decompress_cleanup(struct load_info *info)
static ssize_t compression_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION));
+ return sysfs_emit(buf, __stringify(MODULE_COMPRESSION) "\n");
}
static struct kobj_attribute module_compression_attr = __ATTR_RO(compression);
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 680d980a4fb2..2e2bf236f558 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -53,6 +53,7 @@ extern const struct kernel_symbol __stop___ksymtab_gpl[];
extern const s32 __start___kcrctab[];
extern const s32 __start___kcrctab_gpl[];
+#include <linux/dynamic_debug.h>
struct load_info {
const char *name;
/* pointer to module in temporary copy, freed at end of load_module() */
@@ -62,8 +63,7 @@ struct load_info {
Elf_Shdr *sechdrs;
char *secstrings, *strtab;
unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
- struct _ddebug *debug;
- unsigned int num_debug;
+ struct _ddebug_info dyndbg;
bool sig_ok;
#ifdef CONFIG_KALLSYMS
unsigned long mod_kallsyms_init_off;
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index f5c5c9175333..4523f99b0358 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -494,7 +494,6 @@ unsigned long module_kallsyms_lookup_name(const char *name)
return ret;
}
-#ifdef CONFIG_LIVEPATCH
int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
struct module *, unsigned long),
void *data)
@@ -531,4 +530,3 @@ out:
mutex_unlock(&module_mutex);
return ret;
}
-#endif /* CONFIG_LIVEPATCH */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index a4e4d84b6f4e..48568a0f5651 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -53,6 +53,7 @@
#include <linux/bsearch.h>
#include <linux/dynamic_debug.h>
#include <linux/audit.h>
+#include <linux/cfi.h>
#include <uapi/linux/module.h>
#include "internal.h"
@@ -84,9 +85,6 @@ struct mod_tree_root mod_data_tree __cacheline_aligned = {
};
#endif
-#define module_addr_min mod_tree.addr_min
-#define module_addr_max mod_tree.addr_max
-
struct symsearch {
const struct kernel_symbol *start, *stop;
const s32 *crcs;
@@ -1144,8 +1142,6 @@ void __weak module_arch_freeing_init(struct module *mod)
{
}
-static void cfi_cleanup(struct module *mod);
-
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
@@ -1190,9 +1186,6 @@ static void free_module(struct module *mod)
mod->name);
mutex_unlock(&module_mutex);
- /* Clean up CFI for the module. */
- cfi_cleanup(mod);
-
/* This may be empty, but that's OK */
module_arch_freeing_init(mod);
module_memfree(mod->init_layout.base);
@@ -1598,16 +1591,16 @@ static void free_modinfo(struct module *mod)
}
}
-static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
+static void dynamic_debug_setup(struct module *mod, struct _ddebug_info *dyndbg)
{
- if (!debug)
+ if (!dyndbg->num_descs)
return;
- ddebug_add_module(debug, num, mod->name);
+ ddebug_add_module(dyndbg, mod->name);
}
-static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
+static void dynamic_debug_remove(struct module *mod, struct _ddebug_info *dyndbg)
{
- if (debug)
+ if (dyndbg->num_descs)
ddebug_remove_module(mod->name);
}
@@ -1678,6 +1671,11 @@ static int elf_validity_check(struct load_info *info)
info->hdr->e_machine);
goto no_exec;
}
+ if (!module_elf_check_arch(info->hdr)) {
+ pr_err("Invalid module architecture in ELF header: %u\n",
+ info->hdr->e_machine);
+ goto no_exec;
+ }
if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
pr_err("Invalid ELF section header size\n");
goto no_exec;
@@ -2111,8 +2109,10 @@ static int find_module_sections(struct module *mod, struct load_info *info)
if (section_addr(info, "__obsparm"))
pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
- info->debug = section_objs(info, "__dyndbg",
- sizeof(*info->debug), &info->num_debug);
+ info->dyndbg.descs = section_objs(info, "__dyndbg",
+ sizeof(*info->dyndbg.descs), &info->dyndbg.num_descs);
+ info->dyndbg.classes = section_objs(info, "__dyndbg_classes",
+ sizeof(*info->dyndbg.classes), &info->dyndbg.num_classes);
return 0;
}
@@ -2249,6 +2249,11 @@ static void flush_module_icache(const struct module *mod)
(unsigned long)mod->core_layout.base + mod->core_layout.size);
}
+bool __weak module_elf_check_arch(Elf_Ehdr *hdr)
+{
+ return true;
+}
+
int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
char *secstrings,
@@ -2602,8 +2607,9 @@ static int complete_formation(struct module *mod, struct load_info *info)
if (err < 0)
goto out;
- /* This relies on module_mutex for list integrity. */
+ /* These rely on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
+ module_cfi_finalize(info->hdr, info->sechdrs, mod);
if (module_check_misalignment(mod))
goto out_misaligned;
@@ -2665,8 +2671,6 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname,
return 0;
}
-static void cfi_init(struct module *mod);
-
/*
* Allocate and load the module: note that size of section 0 is always
* zero, and we rely on this for optional sections.
@@ -2796,9 +2800,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
flush_module_icache(mod);
- /* Setup CFI for the module. */
- cfi_init(mod);
-
/* Now copy in args */
mod->args = strndup_user(uargs, ~0UL >> 1);
if (IS_ERR(mod->args)) {
@@ -2807,7 +2808,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
}
init_build_id(mod, info);
- dynamic_debug_setup(mod, info->debug, info->num_debug);
+ dynamic_debug_setup(mod, &info->dyndbg);
/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
ftrace_module_init(mod);
@@ -2871,11 +2872,10 @@ static int load_module(struct load_info *info, const char __user *uargs,
ddebug_cleanup:
ftrace_release_mod(mod);
- dynamic_debug_remove(mod, info->debug);
+ dynamic_debug_remove(mod, &info->dyndbg);
synchronize_rcu();
kfree(mod->args);
free_arch_cleanup:
- cfi_cleanup(mod);
module_arch_cleanup(mod);
free_modinfo:
free_modinfo(mod);
@@ -2961,41 +2961,6 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
return ((void *)addr >= start && (void *)addr < start + size);
}
-static void cfi_init(struct module *mod)
-{
-#ifdef CONFIG_CFI_CLANG
- initcall_t *init;
-#ifdef CONFIG_MODULE_UNLOAD
- exitcall_t *exit;
-#endif
-
- rcu_read_lock_sched();
- mod->cfi_check = (cfi_check_fn)
- find_kallsyms_symbol_value(mod, "__cfi_check");
- init = (initcall_t *)
- find_kallsyms_symbol_value(mod, "__cfi_jt_init_module");
- /* Fix init/exit functions to point to the CFI jump table */
- if (init)
- mod->init = *init;
-#ifdef CONFIG_MODULE_UNLOAD
- exit = (exitcall_t *)
- find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module");
- if (exit)
- mod->exit = *exit;
-#endif
- rcu_read_unlock_sched();
-
- cfi_module_add(mod, mod_tree.addr_min);
-#endif
-}
-
-static void cfi_cleanup(struct module *mod)
-{
-#ifdef CONFIG_CFI_CLANG
- cfi_module_remove(mod, mod_tree.addr_min);
-#endif
-}
-
/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
char *module_flags(struct module *mod, char *buf, bool show_state)
{
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
index ce68f821dcd1..c921bf044050 100644
--- a/kernel/module/sysfs.c
+++ b/kernel/module/sysfs.c
@@ -340,7 +340,7 @@ static int mod_sysfs_init(struct module *mod)
int err;
struct kobject *kobj;
- if (!module_sysfs_initialized) {
+ if (!module_kset) {
pr_err("%s: module sysfs not initialized\n", mod->name);
err = -EINVAL;
goto out;
diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c
index 7f8133044d09..26d812e07615 100644
--- a/kernel/module/tracking.c
+++ b/kernel/module/tracking.c
@@ -10,6 +10,7 @@
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/list.h>
+#include <linux/debugfs.h>
#include <linux/rculist.h>
#include "internal.h"
@@ -21,6 +22,9 @@ int try_add_tainted_module(struct module *mod)
module_assert_mutex_or_preempt();
+ if (!mod->taints)
+ goto out;
+
list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list,
lockdep_is_held(&module_mutex)) {
if (!strcmp(mod_taint->name, mod->name) &&
@@ -59,3 +63,70 @@ void print_unloaded_tainted_modules(void)
}
}
}
+
+#ifdef CONFIG_DEBUG_FS
+static void *unloaded_tainted_modules_seq_start(struct seq_file *m, loff_t *pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return seq_list_start_rcu(&unloaded_tainted_modules, *pos);
+}
+
+static void *unloaded_tainted_modules_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next_rcu(p, &unloaded_tainted_modules, pos);
+}
+
+static void unloaded_tainted_modules_seq_stop(struct seq_file *m, void *p)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+static int unloaded_tainted_modules_seq_show(struct seq_file *m, void *p)
+{
+ struct mod_unload_taint *mod_taint;
+ char buf[MODULE_FLAGS_BUF_SIZE];
+ size_t l;
+
+ mod_taint = list_entry(p, struct mod_unload_taint, list);
+ l = module_flags_taint(mod_taint->taints, buf);
+ buf[l++] = '\0';
+
+ seq_printf(m, "%s (%s) %llu", mod_taint->name, buf, mod_taint->count);
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static const struct seq_operations unloaded_tainted_modules_seq_ops = {
+ .start = unloaded_tainted_modules_seq_start,
+ .next = unloaded_tainted_modules_seq_next,
+ .stop = unloaded_tainted_modules_seq_stop,
+ .show = unloaded_tainted_modules_seq_show,
+};
+
+static int unloaded_tainted_modules_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &unloaded_tainted_modules_seq_ops);
+}
+
+static const struct file_operations unloaded_tainted_modules_fops = {
+ .open = unloaded_tainted_modules_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init unloaded_tainted_modules_init(void)
+{
+ struct dentry *dir;
+
+ dir = debugfs_create_dir("modules", NULL);
+ debugfs_create_file("unloaded_tainted", 0444, dir, NULL,
+ &unloaded_tainted_modules_fops);
+
+ return 0;
+}
+module_init(unloaded_tainted_modules_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 0d5bd62c480e..ab75637fd904 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -62,7 +62,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
* value of this parameter is -1.
* @nr_calls: Records the number of notifications sent. Don't care
* value of this field is NULL.
- * @returns: notifier_call_chain returns the value returned by the
+ * Return: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
static int notifier_call_chain(struct notifier_block **nl,
@@ -105,13 +105,13 @@ NOKPROBE_SYMBOL(notifier_call_chain);
* @val_up: Value passed unmodified to the notifier function
* @val_down: Value passed unmodified to the notifier function when recovering
* from an error on @val_up
- * @v Pointer passed unmodified to the notifier function
+ * @v: Pointer passed unmodified to the notifier function
*
* NOTE: It is important the @nl chain doesn't change between the two
* invocations of notifier_call_chain() such that we visit the
* exact same notifier callbacks; this rules out any RCU usage.
*
- * Returns: the return value of the @val_up call.
+ * Return: the return value of the @val_up call.
*/
static int notifier_call_chain_robust(struct notifier_block **nl,
unsigned long val_up, unsigned long val_down,
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index b4cbb406bc28..a487ff24129b 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -157,7 +157,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
- if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
+ if ((flags & CLONE_VM) ||
+ likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
get_nsproxy(old_ns);
return 0;
}
@@ -255,6 +256,23 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
+int exec_task_namespaces(void)
+{
+ struct task_struct *tsk = current;
+ struct nsproxy *new;
+
+ if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
+ return 0;
+
+ new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ timens_on_fork(new, tsk);
+ switch_task_namespaces(tsk, new);
+ return 0;
+}
+
static int check_setns_flags(unsigned long flags)
{
if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
diff --git a/kernel/padata.c b/kernel/padata.c
index e5819bb8bd1d..e007b8a4b738 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -83,8 +83,16 @@ static struct padata_work *padata_work_alloc(void)
return pw;
}
-static void padata_work_init(struct padata_work *pw, work_func_t work_fn,
- void *data, int flags)
+/*
+ * This function is marked __ref because this function may be optimized in such
+ * a way that it directly refers to work_fn's address, which causes modpost to
+ * complain when work_fn is marked __init. This scenario was observed with clang
+ * LTO, where padata_work_init() was optimized to refer directly to
+ * padata_mt_helper() because the calls to padata_work_init() with other work_fn
+ * values were eliminated or inlined.
+ */
+static void __ref padata_work_init(struct padata_work *pw, work_func_t work_fn,
+ void *data, int flags)
{
if (flags & PADATA_WORK_ONSTACK)
INIT_WORK_ONSTACK(&pw->pw_work, work_fn);
@@ -207,14 +215,16 @@ int padata_do_parallel(struct padata_shell *ps,
pw = padata_work_alloc();
spin_unlock(&padata_works_lock);
+ if (!pw) {
+ /* Maximum works limit exceeded, run in the current task. */
+ padata->parallel(padata);
+ }
+
rcu_read_unlock_bh();
if (pw) {
padata_work_init(pw, padata_parallel_worker, padata, 0);
queue_work(pinst->parallel_wq, &pw->pw_work);
- } else {
- /* Maximum works limit exceeded, run in the current task. */
- padata->parallel(padata);
}
return 0;
@@ -388,13 +398,16 @@ void padata_do_serial(struct padata_priv *padata)
int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr);
struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu);
struct padata_priv *cur;
+ struct list_head *pos;
spin_lock(&reorder->lock);
/* Sort in ascending order of sequence number. */
- list_for_each_entry_reverse(cur, &reorder->list, list)
+ list_for_each_prev(pos, &reorder->list) {
+ cur = list_entry(pos, struct padata_priv, list);
if (cur->seq_nr < padata->seq_nr)
break;
- list_add(&padata->list, &cur->list);
+ }
+ list_add(&padata->list, pos);
spin_unlock(&reorder->lock);
/*
diff --git a/kernel/panic.c b/kernel/panic.c
index c6eb8f8db0c0..463c9295bc28 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -25,6 +25,7 @@
#include <linux/kexec.h>
#include <linux/panic_notifier.h>
#include <linux/sched.h>
+#include <linux/string_helpers.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
@@ -32,6 +33,7 @@
#include <linux/bug.h>
#include <linux/ratelimit.h>
#include <linux/debugfs.h>
+#include <linux/sysfs.h>
#include <trace/events/error_report.h>
#include <asm/sections.h>
@@ -58,6 +60,7 @@ bool crash_kexec_post_notifiers;
int panic_on_warn __read_mostly;
unsigned long panic_on_taint;
bool panic_on_taint_nousertaint = false;
+static unsigned int warn_limit __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -75,8 +78,9 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
-#if defined(CONFIG_SMP) && defined(CONFIG_SYSCTL)
+#ifdef CONFIG_SYSCTL
static struct ctl_table kern_panic_table[] = {
+#ifdef CONFIG_SMP
{
.procname = "oops_all_cpu_backtrace",
.data = &sysctl_oops_all_cpu_backtrace,
@@ -86,6 +90,14 @@ static struct ctl_table kern_panic_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+#endif
+ {
+ .procname = "warn_limit",
+ .data = &warn_limit,
+ .maxlen = sizeof(warn_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
{ }
};
@@ -97,6 +109,25 @@ static __init int kernel_panic_sysctls_init(void)
late_initcall(kernel_panic_sysctls_init);
#endif
+static atomic_t warn_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&warn_count));
+}
+
+static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count);
+
+static __init int kernel_panic_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_panic_sysfs_init);
+#endif
+
static long no_blink(int state)
{
return 0;
@@ -199,6 +230,19 @@ static void panic_print_sys_info(bool console_flush)
ftrace_dump(DUMP_ALL);
}
+void check_panic_on_warn(const char *origin)
+{
+ unsigned int limit;
+
+ if (panic_on_warn)
+ panic("%s: panic_on_warn set ...\n", origin);
+
+ limit = READ_ONCE(warn_limit);
+ if (atomic_inc_return(&warn_count) >= limit && limit)
+ panic("%s: system warned too often (kernel.warn_limit is %d)",
+ origin, limit);
+}
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -329,9 +373,6 @@ void panic(const char *fmt, ...)
if (_crash_kexec_post_notifiers)
__crash_kexec(NULL);
-#ifdef CONFIG_VT
- unblank_screen();
-#endif
console_unblank();
/*
@@ -620,8 +661,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
if (regs)
show_regs(regs);
- if (panic_on_warn)
- panic("panic_on_warn set ...\n");
+ check_panic_on_warn("kernel");
if (!regs)
dump_stack();
@@ -747,8 +787,8 @@ static int __init panic_on_taint_setup(char *s)
if (s && !strcmp(s, "nousertaint"))
panic_on_taint_nousertaint = true;
- pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n",
- panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis");
+ pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%s\n",
+ panic_on_taint, str_enabled_disabled(panic_on_taint_nousertaint));
return 0;
}
diff --git a/kernel/params.c b/kernel/params.c
index 5b92310425c5..14d66070757b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -926,7 +926,7 @@ static const struct sysfs_ops module_sysfs_ops = {
.store = module_attr_store,
};
-static int uevent_filter(struct kobject *kobj)
+static int uevent_filter(const struct kobject *kobj)
{
const struct kobj_type *ktype = get_ktype(kobj);
@@ -940,7 +940,6 @@ static const struct kset_uevent_ops module_uevent_ops = {
};
struct kset *module_kset;
-int module_sysfs_initialized;
static void module_kobj_release(struct kobject *kobj)
{
@@ -954,7 +953,11 @@ struct kobj_type module_ktype = {
};
/*
- * param_sysfs_init - wrapper for built-in params support
+ * param_sysfs_init - create "module" kset
+ *
+ * This must be done before the initramfs is unpacked and
+ * request_module() thus becomes possible, because otherwise the
+ * module load would fail in mod_sysfs_init.
*/
static int __init param_sysfs_init(void)
{
@@ -964,13 +967,25 @@ static int __init param_sysfs_init(void)
__FILE__, __LINE__);
return -ENOMEM;
}
- module_sysfs_initialized = 1;
+
+ return 0;
+}
+subsys_initcall(param_sysfs_init);
+
+/*
+ * param_sysfs_builtin_init - add sysfs version and parameter
+ * attributes for built-in modules
+ */
+static int __init param_sysfs_builtin_init(void)
+{
+ if (!module_kset)
+ return -ENOMEM;
version_sysfs_builtin();
param_sysfs_builtin();
return 0;
}
-subsys_initcall(param_sysfs_init);
+late_initcall(param_sysfs_builtin_init);
#endif /* CONFIG_SYSFS */
diff --git a/kernel/pid.c b/kernel/pid.c
index 2fc0a16ec77b..3fbc5e46b721 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -519,6 +519,7 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
return idr_get_next(&ns->idr, &nr);
}
+EXPORT_SYMBOL_GPL(find_ge_pid);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 89c71fce225d..793c55a2becb 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -92,20 +92,24 @@ bool hibernation_available(void)
*/
void hibernation_set_ops(const struct platform_hibernation_ops *ops)
{
+ unsigned int sleep_flags;
+
if (ops && !(ops->begin && ops->end && ops->pre_snapshot
&& ops->prepare && ops->finish && ops->enter && ops->pre_restore
&& ops->restore_cleanup && ops->leave)) {
WARN_ON(1);
return;
}
- lock_system_sleep();
+
+ sleep_flags = lock_system_sleep();
+
hibernation_ops = ops;
if (ops)
hibernation_mode = HIBERNATION_PLATFORM;
else if (hibernation_mode == HIBERNATION_PLATFORM)
hibernation_mode = HIBERNATION_SHUTDOWN;
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
EXPORT_SYMBOL_GPL(hibernation_set_ops);
@@ -641,7 +645,7 @@ static void power_down(void)
int error;
if (hibernation_mode == HIBERNATION_SUSPEND) {
- error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+ error = suspend_devices_and_enter(mem_sleep_current);
if (error) {
hibernation_mode = hibernation_ops ?
HIBERNATION_PLATFORM :
@@ -713,6 +717,7 @@ static int load_image_and_restore(void)
int hibernate(void)
{
bool snapshot_test = false;
+ unsigned int sleep_flags;
int error;
if (!hibernation_available()) {
@@ -720,7 +725,7 @@ int hibernate(void)
return -EPERM;
}
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -794,7 +799,7 @@ int hibernate(void)
pm_restore_console();
hibernate_release();
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
pr_info("hibernation exit\n");
return error;
@@ -809,9 +814,10 @@ int hibernate(void)
*/
int hibernate_quiet_exec(int (*func)(void *data), void *data)
{
+ unsigned int sleep_flags;
int error;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -891,7 +897,7 @@ restore:
hibernate_release();
unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error;
}
@@ -1100,11 +1106,12 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
+ int mode = HIBERNATION_INVALID;
+ unsigned int sleep_flags;
int error = 0;
- int i;
int len;
char *p;
- int mode = HIBERNATION_INVALID;
+ int i;
if (!hibernation_available())
return -EPERM;
@@ -1112,7 +1119,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (len == strlen(hibernation_modes[i])
&& !strncmp(buf, hibernation_modes[i], len)) {
@@ -1142,7 +1149,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
if (!error)
pm_pr_dbg("Hibernation mode set to '%s'\n",
hibernation_modes[mode]);
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error ? error : n;
}
@@ -1158,9 +1165,10 @@ static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
- dev_t res;
+ unsigned int sleep_flags;
int len = n;
char *name;
+ dev_t res;
if (len && buf[len-1] == '\n')
len--;
@@ -1173,9 +1181,10 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
if (!res)
return -EINVAL;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
swsusp_resume_device = res;
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
+
pm_pr_dbg("Configured hibernation resume from disk to %u\n",
swsusp_resume_device);
noresume = 0;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index e3694034b753..31ec4a9b9d70 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -21,14 +21,16 @@
#ifdef CONFIG_PM_SLEEP
-void lock_system_sleep(void)
+unsigned int lock_system_sleep(void)
{
- current->flags |= PF_FREEZER_SKIP;
+ unsigned int flags = current->flags;
+ current->flags |= PF_NOFREEZE;
mutex_lock(&system_transition_mutex);
+ return flags;
}
EXPORT_SYMBOL_GPL(lock_system_sleep);
-void unlock_system_sleep(void)
+void unlock_system_sleep(unsigned int flags)
{
/*
* Don't use freezer_count() because we don't want the call to
@@ -46,7 +48,8 @@ void unlock_system_sleep(void)
* Which means, if we use try_to_freeze() here, it would make them
* enter the refrigerator, thus causing hibernation to lockup.
*/
- current->flags &= ~PF_FREEZER_SKIP;
+ if (!(flags & PF_NOFREEZE))
+ current->flags &= ~PF_NOFREEZE;
mutex_unlock(&system_transition_mutex);
}
EXPORT_SYMBOL_GPL(unlock_system_sleep);
@@ -263,16 +266,17 @@ static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
+ unsigned int sleep_flags;
const char * const *s;
+ int error = -EINVAL;
int level;
char *p;
int len;
- int error = -EINVAL;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
level = TEST_FIRST;
for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -282,7 +286,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
break;
}
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error ? error : n;
}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 3068601e585a..6c1c7e566d35 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -27,6 +27,8 @@ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
static int try_to_freeze_tasks(bool user_only)
{
+ const char *what = user_only ? "user space processes" :
+ "remaining freezable tasks";
struct task_struct *g, *p;
unsigned long end_time;
unsigned int todo;
@@ -36,6 +38,8 @@ static int try_to_freeze_tasks(bool user_only)
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
+ pr_info("Freezing %s\n", what);
+
start = ktime_get_boottime();
end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
@@ -50,8 +54,7 @@ static int try_to_freeze_tasks(bool user_only)
if (p == current || !freeze_task(p))
continue;
- if (!freezer_should_skip(p))
- todo++;
+ todo++;
}
read_unlock(&tasklist_lock);
@@ -83,9 +86,8 @@ static int try_to_freeze_tasks(bool user_only)
elapsed_msecs = ktime_to_ms(elapsed);
if (todo) {
- pr_cont("\n");
- pr_err("Freezing of tasks %s after %d.%03d seconds "
- "(%d tasks refusing to freeze, wq_busy=%d):\n",
+ pr_err("Freezing %s %s after %d.%03d seconds "
+ "(%d tasks refusing to freeze, wq_busy=%d):\n", what,
wakeup ? "aborted" : "failed",
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
@@ -96,15 +98,14 @@ static int try_to_freeze_tasks(bool user_only)
if (!wakeup || pm_debug_messages_on) {
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
- if (p != current && !freezer_should_skip(p)
- && freezing(p) && !frozen(p))
+ if (p != current && freezing(p) && !frozen(p))
sched_show_task(p);
}
read_unlock(&tasklist_lock);
}
} else {
- pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
- elapsed_msecs % 1000);
+ pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n",
+ what, elapsed_msecs / 1000, elapsed_msecs % 1000);
}
return todo ? -EBUSY : 0;
@@ -129,17 +130,14 @@ int freeze_processes(void)
current->flags |= PF_SUSPEND_TASK;
if (!pm_freezing)
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
pm_wakeup_clear(0);
- pr_info("Freezing user space processes ... ");
pm_freezing = true;
error = try_to_freeze_tasks(true);
- if (!error) {
+ if (!error)
__usermodehelper_set_disable_depth(UMH_DISABLED);
- pr_cont("done.");
- }
- pr_cont("\n");
+
BUG_ON(in_atomic());
/*
@@ -168,14 +166,9 @@ int freeze_kernel_threads(void)
{
int error;
- pr_info("Freezing remaining freezable tasks ... ");
-
pm_nosig_freezing = true;
error = try_to_freeze_tasks(false);
- if (!error)
- pr_cont("done.");
- pr_cont("\n");
BUG_ON(in_atomic());
if (error)
@@ -190,7 +183,7 @@ void thaw_processes(void)
trace_suspend_resume(TPS("thaw_processes"), 0, true);
if (pm_freezing)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec(&freezer_active);
pm_freezing = false;
pm_nosig_freezing = false;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 2a406753af90..cd8b7b35f1e8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1723,8 +1723,8 @@ static unsigned long minimum_image_size(unsigned long saveable)
* /sys/power/reserved_size, respectively). To make this happen, we compute the
* total number of available page frames and allocate at least
*
- * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
- * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
+ * ([page frames total] - PAGES_FOR_IO - [metadata pages]) / 2
+ * - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
*
* of them, which corresponds to the maximum size of a hibernation image.
*
@@ -2259,10 +2259,14 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
- if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j]))
+ if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) {
memory_bm_set_bit(bm, buf[j]);
- else
+ } else {
+ if (!pfn_valid(buf[j]))
+ pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n",
+ (unsigned long long)PFN_PHYS(buf[j]));
return -EFAULT;
+ }
}
return 0;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 827075944d28..fa3bf161d13f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -75,9 +75,11 @@ EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle);
void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
- lock_system_sleep();
+ unsigned int sleep_flags;
+
+ sleep_flags = lock_system_sleep();
s2idle_ops = ops;
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
static void s2idle_begin(void)
@@ -136,6 +138,9 @@ static void s2idle_loop(void)
break;
}
+ if (s2idle_ops && s2idle_ops->check)
+ s2idle_ops->check();
+
s2idle_enter();
}
@@ -200,7 +205,9 @@ __setup("mem_sleep_default=", mem_sleep_default_setup);
*/
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
- lock_system_sleep();
+ unsigned int sleep_flags;
+
+ sleep_flags = lock_system_sleep();
suspend_ops = ops;
@@ -216,7 +223,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
mem_sleep_current = PM_SUSPEND_MEM;
}
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
EXPORT_SYMBOL_GPL(suspend_set_ops);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d43c2aa583b2..3a4e70366f35 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -47,12 +47,13 @@ int is_hibernate_resume_dev(dev_t dev)
static int snapshot_open(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
+ unsigned int sleep_flags;
int error;
if (!hibernation_available())
return -EPERM;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -98,7 +99,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->dev = 0;
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error;
}
@@ -106,8 +107,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
static int snapshot_release(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
+ unsigned int sleep_flags;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
swsusp_free();
data = filp->private_data;
@@ -124,7 +126,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
PM_POST_HIBERNATION : PM_POST_RESTORE);
hibernate_release();
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return 0;
}
@@ -132,11 +134,12 @@ static int snapshot_release(struct inode *inode, struct file *filp)
static ssize_t snapshot_read(struct file *filp, char __user *buf,
size_t count, loff_t *offp)
{
+ loff_t pg_offp = *offp & ~PAGE_MASK;
struct snapshot_data *data;
+ unsigned int sleep_flags;
ssize_t res;
- loff_t pg_offp = *offp & ~PAGE_MASK;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
data = filp->private_data;
if (!data->ready) {
@@ -157,7 +160,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
*offp += res;
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return res;
}
@@ -165,16 +168,17 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
static ssize_t snapshot_write(struct file *filp, const char __user *buf,
size_t count, loff_t *offp)
{
+ loff_t pg_offp = *offp & ~PAGE_MASK;
struct snapshot_data *data;
+ unsigned long sleep_flags;
ssize_t res;
- loff_t pg_offp = *offp & ~PAGE_MASK;
if (need_wait) {
wait_for_device_probe();
need_wait = false;
}
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
data = filp->private_data;
@@ -196,7 +200,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
if (res > 0)
*offp += res;
unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return res;
}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a1a81fd9889b..7decf1e9c486 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -79,13 +79,20 @@ int oops_in_progress;
EXPORT_SYMBOL(oops_in_progress);
/*
- * console_sem protects the console_drivers list, and also
- * provides serialisation for access to the entire console
- * driver system.
+ * console_mutex protects console_list updates and console->flags updates.
+ * The flags are synchronized only for consoles that are registered, i.e.
+ * accessible via the console list.
+ */
+static DEFINE_MUTEX(console_mutex);
+
+/*
+ * console_sem protects updates to console->seq and console_suspended,
+ * and also provides serialization for console printing.
*/
static DEFINE_SEMAPHORE(console_sem);
-struct console *console_drivers;
-EXPORT_SYMBOL_GPL(console_drivers);
+HLIST_HEAD(console_list);
+EXPORT_SYMBOL_GPL(console_list);
+DEFINE_STATIC_SRCU(console_srcu);
/*
* System may need to suppress printk message under certain
@@ -103,6 +110,19 @@ static int __read_mostly suppress_panic_printk;
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
};
+
+void lockdep_assert_console_list_lock_held(void)
+{
+ lockdep_assert_held(&console_mutex);
+}
+EXPORT_SYMBOL(lockdep_assert_console_list_lock_held);
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+bool console_srcu_read_lock_is_held(void)
+{
+ return srcu_read_lock_held(&console_srcu);
+}
#endif
enum devkmsg_log_bits {
@@ -220,8 +240,68 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
}
#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */
-/* Number of registered extended console drivers. */
-static int nr_ext_console_drivers;
+/**
+ * console_list_lock - Lock the console list
+ *
+ * For console list or console->flags updates
+ */
+void console_list_lock(void)
+{
+ /*
+ * In unregister_console() and console_force_preferred_locked(),
+ * synchronize_srcu() is called with the console_list_lock held.
+ * Therefore it is not allowed that the console_list_lock is taken
+ * with the srcu_lock held.
+ *
+ * Detecting if this context is really in the read-side critical
+ * section is only possible if the appropriate debug options are
+ * enabled.
+ */
+ WARN_ON_ONCE(debug_lockdep_rcu_enabled() &&
+ srcu_read_lock_held(&console_srcu));
+
+ mutex_lock(&console_mutex);
+}
+EXPORT_SYMBOL(console_list_lock);
+
+/**
+ * console_list_unlock - Unlock the console list
+ *
+ * Counterpart to console_list_lock()
+ */
+void console_list_unlock(void)
+{
+ mutex_unlock(&console_mutex);
+}
+EXPORT_SYMBOL(console_list_unlock);
+
+/**
+ * console_srcu_read_lock - Register a new reader for the
+ * SRCU-protected console list
+ *
+ * Use for_each_console_srcu() to iterate the console list
+ *
+ * Context: Any context.
+ * Return: A cookie to pass to console_srcu_read_unlock().
+ */
+int console_srcu_read_lock(void)
+{
+ return srcu_read_lock_nmisafe(&console_srcu);
+}
+EXPORT_SYMBOL(console_srcu_read_lock);
+
+/**
+ * console_srcu_read_unlock - Unregister an old reader from
+ * the SRCU-protected console list
+ * @cookie: cookie returned from console_srcu_read_lock()
+ *
+ * Counterpart to console_srcu_read_lock()
+ */
+void console_srcu_read_unlock(int cookie)
+{
+ srcu_read_unlock_nmisafe(&console_srcu, cookie);
+}
+EXPORT_SYMBOL(console_srcu_read_unlock);
/*
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
@@ -433,7 +513,7 @@ static struct printk_ringbuffer *prb = &printk_rb_static;
* per_cpu_areas are initialised. This variable is set to true when
* it's safe to access per-CPU data.
*/
-static bool __printk_percpu_data_ready __read_mostly;
+static bool __printk_percpu_data_ready __ro_after_init;
bool printk_percpu_data_ready(void)
{
@@ -1817,13 +1897,13 @@ static void console_lock_spinning_enable(void)
* safe to start busy waiting for the lock. Second, it checks if
* there is a busy waiter and passes the lock rights to her.
*
- * Important: Callers lose the lock if there was a busy waiter.
- * They must not touch items synchronized by console_lock
- * in this case.
+ * Important: Callers lose both the console_lock and the SRCU read lock if
+ * there was a busy waiter. They must not touch items synchronized by
+ * console_lock or SRCU read lock in this case.
*
* Return: 1 if the lock rights were passed, 0 otherwise.
*/
-static int console_lock_spinning_disable_and_check(void)
+static int console_lock_spinning_disable_and_check(int cookie)
{
int waiter;
@@ -1843,6 +1923,12 @@ static int console_lock_spinning_disable_and_check(void)
spin_release(&console_owner_dep_map, _THIS_IP_);
/*
+ * Preserve lockdep lock ordering. Release the SRCU read lock before
+ * releasing the console_lock.
+ */
+ console_srcu_read_unlock(cookie);
+
+ /*
* Hand off console_lock to waiter. The waiter will perform
* the up(). After this, the waiter is the console_lock owner.
*/
@@ -2296,6 +2382,7 @@ asmlinkage __visible int _printk(const char *fmt, ...)
}
EXPORT_SYMBOL(_printk);
+static bool pr_flush(int timeout_ms, bool reset_on_progress);
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
#else /* CONFIG_PRINTK */
@@ -2324,12 +2411,13 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
char *text, size_t text_len,
struct dev_printk_info *dev_info) { return 0; }
static void console_lock_spinning_enable(void) { }
-static int console_lock_spinning_disable_and_check(void) { return 0; }
+static int console_lock_spinning_disable_and_check(int cookie) { return 0; }
static void call_console_driver(struct console *con, const char *text, size_t len,
char *dropped_text)
{
}
static bool suppress_message_printing(int level) { return false; }
+static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
#endif /* CONFIG_PRINTK */
@@ -2392,7 +2480,7 @@ static int __add_preferred_console(char *name, int idx, char *options,
return -E2BIG;
if (!brl_options)
preferred_console = i;
- strlcpy(c->name, name, sizeof(c->name));
+ strscpy(c->name, name, sizeof(c->name));
c->options = options;
set_user_specified(c, user_specified);
braille_set_options(c, brl_options);
@@ -2554,10 +2642,10 @@ static int console_cpu_notify(unsigned int cpu)
}
/**
- * console_lock - lock the console system for exclusive use.
+ * console_lock - block the console subsystem from printing
*
- * Acquires a lock which guarantees that the caller has
- * exclusive access to the console system and the console_drivers list.
+ * Acquires a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* Can sleep, returns nothing.
*/
@@ -2574,10 +2662,10 @@ void console_lock(void)
EXPORT_SYMBOL(console_lock);
/**
- * console_trylock - try to lock the console system for exclusive use.
+ * console_trylock - try to block the console subsystem from printing
*
- * Try to acquire a lock which guarantees that the caller has exclusive
- * access to the console system and the console_drivers list.
+ * Try to acquire a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* returns 1 on success, and 0 on failure to acquire the lock.
*/
@@ -2624,11 +2712,13 @@ static bool abandon_console_lock_in_panic(void)
* Check if the given console is currently capable and allowed to print
* records.
*
- * Requires the console_lock.
+ * Requires the console_srcu_read_lock.
*/
static inline bool console_is_usable(struct console *con)
{
- if (!(con->flags & CON_ENABLED))
+ short flags = console_srcu_read_flags(con);
+
+ if (!(flags & CON_ENABLED))
return false;
if (!con->write)
@@ -2639,8 +2729,7 @@ static inline bool console_is_usable(struct console *con)
* allocated. So unless they're explicitly marked as being able to
* cope (CON_ANYTIME) don't call them until this CPU is officially up.
*/
- if (!cpu_online(raw_smp_processor_id()) &&
- !(con->flags & CON_ANYTIME))
+ if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
return false;
return true;
@@ -2665,16 +2754,18 @@ static void __console_unlock(void)
* DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL.
*
* @handover will be set to true if a printk waiter has taken over the
- * console_lock, in which case the caller is no longer holding the
- * console_lock. Otherwise it is set to false.
+ * console_lock, in which case the caller is no longer holding both the
+ * console_lock and the SRCU read lock. Otherwise it is set to false.
+ *
+ * @cookie is the cookie from the SRCU read lock.
*
* Returns false if the given console has no next record to print, otherwise
* true.
*
- * Requires the console_lock.
+ * Requires the console_lock and the SRCU read lock.
*/
static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
- char *dropped_text, bool *handover)
+ char *dropped_text, bool *handover, int cookie)
{
static int panic_console_dropped;
struct printk_info info;
@@ -2734,7 +2825,7 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_
con->seq++;
- *handover = console_lock_spinning_disable_and_check();
+ *handover = console_lock_spinning_disable_and_check(cookie);
printk_safe_exit_irqrestore(flags);
skip:
return true;
@@ -2771,6 +2862,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
bool any_usable = false;
struct console *con;
bool any_progress;
+ int cookie;
*next_seq = 0;
*handover = false;
@@ -2778,23 +2870,29 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
do {
any_progress = false;
- for_each_console(con) {
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
bool progress;
if (!console_is_usable(con))
continue;
any_usable = true;
- if (con->flags & CON_EXTENDED) {
+ if (console_srcu_read_flags(con) & CON_EXTENDED) {
/* Extended consoles do not print "dropped messages". */
progress = console_emit_next_record(con, &text[0],
&ext_text[0], NULL,
- handover);
+ handover, cookie);
} else {
progress = console_emit_next_record(con, &text[0],
NULL, &dropped_text[0],
- handover);
+ handover, cookie);
}
+
+ /*
+ * If a handover has occurred, the SRCU read lock
+ * is already released.
+ */
if (*handover)
return false;
@@ -2808,21 +2906,26 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
/* Allow panic_cpu to take over the consoles safely. */
if (abandon_console_lock_in_panic())
- return false;
+ goto abandon;
if (do_cond_resched)
cond_resched();
}
+ console_srcu_read_unlock(cookie);
} while (any_progress);
return any_usable;
+
+abandon:
+ console_srcu_read_unlock(cookie);
+ return false;
}
/**
- * console_unlock - unlock the console system
+ * console_unlock - unblock the console subsystem from printing
*
- * Releases the console_lock which the caller holds on the console system
- * and the console driver list.
+ * Releases the console_lock which the caller holds to block printing of
+ * the console subsystem.
*
* While the console_lock was held, console output may have been buffered
* by printk(). If this is the case, console_unlock(); emits
@@ -2900,10 +3003,14 @@ EXPORT_SYMBOL(console_conditional_schedule);
void console_unblank(void)
{
struct console *c;
+ int cookie;
/*
- * console_unblank can no longer be called in interrupt context unless
- * oops_in_progress is set to 1..
+ * Stop console printing because the unblank() callback may
+ * assume the console is not within its write() callback.
+ *
+ * If @oops_in_progress is set, this may be an atomic context.
+ * In that case, attempt a trylock as best-effort.
*/
if (oops_in_progress) {
if (down_trylock_console_sem() != 0)
@@ -2913,9 +3020,14 @@ void console_unblank(void)
console_locked = 1;
console_may_schedule = 0;
- for_each_console(c)
- if ((c->flags & CON_ENABLED) && c->unblank)
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank)
c->unblank();
+ }
+ console_srcu_read_unlock(cookie);
+
console_unlock();
if (!oops_in_progress)
@@ -2942,11 +3054,21 @@ void console_flush_on_panic(enum con_flush_mode mode)
if (mode == CONSOLE_REPLAY_ALL) {
struct console *c;
+ int cookie;
u64 seq;
seq = prb_first_valid_seq(prb);
- for_each_console(c)
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ /*
+ * If the above console_trylock() failed, this is an
+ * unsynchronized assignment. But in that case, the
+ * kernel is in "hope and pray" mode anyway.
+ */
c->seq = seq;
+ }
+ console_srcu_read_unlock(cookie);
}
console_unlock();
}
@@ -2958,15 +3080,25 @@ struct tty_driver *console_device(int *index)
{
struct console *c;
struct tty_driver *driver = NULL;
+ int cookie;
+ /*
+ * Take console_lock to serialize device() callback with
+ * other console operations. For example, fg_console is
+ * modified under console_lock when switching vt.
+ */
console_lock();
- for_each_console(c) {
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
if (!c->device)
continue;
driver = c->device(c, index);
if (driver)
break;
}
+ console_srcu_read_unlock(cookie);
+
console_unlock();
return driver;
}
@@ -2979,17 +3111,25 @@ struct tty_driver *console_device(int *index)
void console_stop(struct console *console)
{
__pr_flush(console, 1000, true);
- console_lock();
- console->flags &= ~CON_ENABLED;
- console_unlock();
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All contexts must
+ * be able to see that this console is disabled so that (for example)
+ * the caller can suspend the port without risk of another context
+ * using the port.
+ */
+ synchronize_srcu(&console_srcu);
}
EXPORT_SYMBOL(console_stop);
void console_start(struct console *console)
{
- console_lock();
- console->flags |= CON_ENABLED;
- console_unlock();
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags | CON_ENABLED);
+ console_list_unlock();
__pr_flush(console, 1000, true);
}
EXPORT_SYMBOL(console_start);
@@ -3082,6 +3222,72 @@ static void try_enable_default_console(struct console *newcon)
(con->flags & CON_BOOT) ? "boot" : "", \
con->name, con->index, ##__VA_ARGS__)
+static void console_init_seq(struct console *newcon, bool bootcon_registered)
+{
+ struct console *con;
+ bool handover;
+
+ if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) {
+ /* Get a consistent copy of @syslog_seq. */
+ mutex_lock(&syslog_lock);
+ newcon->seq = syslog_seq;
+ mutex_unlock(&syslog_lock);
+ } else {
+ /* Begin with next message added to ringbuffer. */
+ newcon->seq = prb_next_seq(prb);
+
+ /*
+ * If any enabled boot consoles are due to be unregistered
+ * shortly, some may not be caught up and may be the same
+ * device as @newcon. Since it is not known which boot console
+ * is the same device, flush all consoles and, if necessary,
+ * start with the message of the enabled boot console that is
+ * the furthest behind.
+ */
+ if (bootcon_registered && !keep_bootcon) {
+ /*
+ * Hold the console_lock to stop console printing and
+ * guarantee safe access to console->seq.
+ */
+ console_lock();
+
+ /*
+ * Flush all consoles and set the console to start at
+ * the next unprinted sequence number.
+ */
+ if (!console_flush_all(true, &newcon->seq, &handover)) {
+ /*
+ * Flushing failed. Just choose the lowest
+ * sequence of the enabled boot consoles.
+ */
+
+ /*
+ * If there was a handover, this context no
+ * longer holds the console_lock.
+ */
+ if (handover)
+ console_lock();
+
+ newcon->seq = prb_next_seq(prb);
+ for_each_console(con) {
+ if ((con->flags & CON_BOOT) &&
+ (con->flags & CON_ENABLED) &&
+ con->seq < newcon->seq) {
+ newcon->seq = con->seq;
+ }
+ }
+ }
+
+ console_unlock();
+ }
+ }
+}
+
+#define console_first() \
+ hlist_entry(console_list.first, struct console, node)
+
+static int unregister_console_locked(struct console *console);
+
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
@@ -3104,28 +3310,29 @@ static void try_enable_default_console(struct console *newcon)
void register_console(struct console *newcon)
{
struct console *con;
- bool bootcon_enabled = false;
- bool realcon_enabled = false;
+ bool bootcon_registered = false;
+ bool realcon_registered = false;
int err;
+ console_list_lock();
+
for_each_console(con) {
if (WARN(con == newcon, "console '%s%d' already registered\n",
- con->name, con->index))
- return;
- }
+ con->name, con->index)) {
+ goto unlock;
+ }
- for_each_console(con) {
if (con->flags & CON_BOOT)
- bootcon_enabled = true;
+ bootcon_registered = true;
else
- realcon_enabled = true;
+ realcon_registered = true;
}
/* Do not register boot consoles when there already is a real one. */
- if (newcon->flags & CON_BOOT && realcon_enabled) {
+ if ((newcon->flags & CON_BOOT) && realcon_registered) {
pr_info("Too late to register bootconsole %s%d\n",
newcon->name, newcon->index);
- return;
+ goto unlock;
}
/*
@@ -3141,8 +3348,8 @@ void register_console(struct console *newcon)
* flag set and will be first in the list.
*/
if (preferred_console < 0) {
- if (!console_drivers || !console_drivers->device ||
- console_drivers->flags & CON_BOOT) {
+ if (hlist_empty(&console_list) || !console_first()->device ||
+ console_first()->flags & CON_BOOT) {
try_enable_default_console(newcon);
}
}
@@ -3156,7 +3363,7 @@ void register_console(struct console *newcon)
/* printk() messages are not printed to the Braille console. */
if (err || newcon->flags & CON_BRL)
- return;
+ goto unlock;
/*
* If we have a bootconsole, and are switching to a real console,
@@ -3164,42 +3371,38 @@ void register_console(struct console *newcon)
* the real console are the same physical device, it's annoying to
* see the beginning boot messages twice
*/
- if (bootcon_enabled &&
+ if (bootcon_registered &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
newcon->flags &= ~CON_PRINTBUFFER;
}
+ newcon->dropped = 0;
+ console_init_seq(newcon, bootcon_registered);
+
/*
- * Put this console in the list - keep the
- * preferred driver at the head of the list.
+ * Put this console in the list - keep the
+ * preferred driver at the head of the list.
*/
- console_lock();
- if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
- newcon->next = console_drivers;
- console_drivers = newcon;
- if (newcon->next)
- newcon->next->flags &= ~CON_CONSDEV;
- /* Ensure this flag is always set for the head of the list */
+ if (hlist_empty(&console_list)) {
+ /* Ensure CON_CONSDEV is always set for the head. */
newcon->flags |= CON_CONSDEV;
- } else {
- newcon->next = console_drivers->next;
- console_drivers->next = newcon;
- }
+ hlist_add_head_rcu(&newcon->node, &console_list);
- if (newcon->flags & CON_EXTENDED)
- nr_ext_console_drivers++;
+ } else if (newcon->flags & CON_CONSDEV) {
+ /* Only the new head can have CON_CONSDEV set. */
+ console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV);
+ hlist_add_head_rcu(&newcon->node, &console_list);
- newcon->dropped = 0;
- if (newcon->flags & CON_PRINTBUFFER) {
- /* Get a consistent copy of @syslog_seq. */
- mutex_lock(&syslog_lock);
- newcon->seq = syslog_seq;
- mutex_unlock(&syslog_lock);
} else {
- /* Begin with next message. */
- newcon->seq = prb_next_seq(prb);
+ hlist_add_behind_rcu(&newcon->node, console_list.first);
}
- console_unlock();
+
+ /*
+ * No need to synchronize SRCU here! The caller does not rely
+ * on all contexts being able to see the new console before
+ * register_console() completes.
+ */
+
console_sysfs_notify();
/*
@@ -3210,24 +3413,28 @@ void register_console(struct console *newcon)
* went to the bootconsole (that they do not see on the real console)
*/
con_printk(KERN_INFO, newcon, "enabled\n");
- if (bootcon_enabled &&
+ if (bootcon_registered &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
!keep_bootcon) {
- /* We need to iterate through all boot consoles, to make
- * sure we print everything out, before we unregister them.
- */
- for_each_console(con)
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
if (con->flags & CON_BOOT)
- unregister_console(con);
+ unregister_console_locked(con);
+ }
}
+unlock:
+ console_list_unlock();
}
EXPORT_SYMBOL(register_console);
-int unregister_console(struct console *console)
+/* Must be called under console_list_lock(). */
+static int unregister_console_locked(struct console *console)
{
- struct console *con;
int res;
+ lockdep_assert_console_list_lock_held();
+
con_printk(KERN_INFO, console, "disabled\n");
res = _braille_unregister_console(console);
@@ -3236,51 +3443,94 @@ int unregister_console(struct console *console)
if (res > 0)
return 0;
- res = -ENODEV;
- console_lock();
- if (console_drivers == console) {
- console_drivers=console->next;
- res = 0;
- } else {
- for_each_console(con) {
- if (con->next == console) {
- con->next = console->next;
- res = 0;
- break;
- }
- }
- }
+ /* Disable it unconditionally */
+ console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
- if (res)
- goto out_disable_unlock;
+ if (!console_is_registered_locked(console))
+ return -ENODEV;
- if (console->flags & CON_EXTENDED)
- nr_ext_console_drivers--;
+ hlist_del_init_rcu(&console->node);
/*
+ * <HISTORICAL>
* If this isn't the last console and it has CON_CONSDEV set, we
* need to set it on the next preferred console.
+ * </HISTORICAL>
+ *
+ * The above makes no sense as there is no guarantee that the next
+ * console has any device attached. Oh well....
*/
- if (console_drivers != NULL && console->flags & CON_CONSDEV)
- console_drivers->flags |= CON_CONSDEV;
+ if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV)
+ console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV);
+
+ /*
+ * Ensure that all SRCU list walks have completed. All contexts
+ * must not be able to see this console in the list so that any
+ * exit/cleanup routines can be performed safely.
+ */
+ synchronize_srcu(&console_srcu);
- console->flags &= ~CON_ENABLED;
- console_unlock();
console_sysfs_notify();
if (console->exit)
res = console->exit(console);
return res;
+}
-out_disable_unlock:
- console->flags &= ~CON_ENABLED;
- console_unlock();
+int unregister_console(struct console *console)
+{
+ int res;
+ console_list_lock();
+ res = unregister_console_locked(console);
+ console_list_unlock();
return res;
}
EXPORT_SYMBOL(unregister_console);
+/**
+ * console_force_preferred_locked - force a registered console preferred
+ * @con: The registered console to force preferred.
+ *
+ * Must be called under console_list_lock().
+ */
+void console_force_preferred_locked(struct console *con)
+{
+ struct console *cur_pref_con;
+
+ if (!console_is_registered_locked(con))
+ return;
+
+ cur_pref_con = console_first();
+
+ /* Already preferred? */
+ if (cur_pref_con == con)
+ return;
+
+ /*
+ * Delete, but do not re-initialize the entry. This allows the console
+ * to continue to appear registered (via any hlist_unhashed_lockless()
+ * checks), even though it was briefly removed from the console list.
+ */
+ hlist_del_rcu(&con->node);
+
+ /*
+ * Ensure that all SRCU list walks have completed so that the console
+ * can be added to the beginning of the console list and its forward
+ * list pointer can be re-initialized.
+ */
+ synchronize_srcu(&console_srcu);
+
+ con->flags |= CON_CONSDEV;
+ WARN_ON(!con->device);
+
+ /* Only the new head can have CON_CONSDEV set. */
+ console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV);
+ hlist_add_head_rcu(&con->node, &console_list);
+}
+EXPORT_SYMBOL(console_force_preferred_locked);
+
/*
* Initialize the console device. This is called *early*, so
* we can't necessarily depend on lots of kernel help here.
@@ -3327,10 +3577,12 @@ void __init console_init(void)
*/
static int __init printk_late_init(void)
{
+ struct hlist_node *tmp;
struct console *con;
int ret;
- for_each_console(con) {
+ console_list_lock();
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
if (!(con->flags & CON_BOOT))
continue;
@@ -3347,9 +3599,11 @@ static int __init printk_late_init(void)
*/
pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
con->name, con->index);
- unregister_console(con);
+ unregister_console_locked(con);
}
}
+ console_list_unlock();
+
ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
console_cpu_notify);
WARN_ON(ret < 0);
@@ -3369,6 +3623,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
struct console *c;
u64 last_diff = 0;
u64 printk_seq;
+ int cookie;
u64 diff;
u64 seq;
@@ -3379,9 +3634,15 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
for (;;) {
diff = 0;
+ /*
+ * Hold the console_lock to guarantee safe access to
+ * console->seq and to prevent changes to @console_suspended
+ * until all consoles have been processed.
+ */
console_lock();
- for_each_console(c) {
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
if (con && con != c)
continue;
if (!console_is_usable(c))
@@ -3390,6 +3651,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
if (printk_seq < seq)
diff += seq - printk_seq;
}
+ console_srcu_read_unlock(cookie);
/*
* If consoles are suspended, it cannot be expected that they
@@ -3438,11 +3700,10 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
* Context: Process context. May sleep while acquiring console lock.
* Return: true if all enabled printers are caught up.
*/
-bool pr_flush(int timeout_ms, bool reset_on_progress)
+static bool pr_flush(int timeout_ms, bool reset_on_progress)
{
return __pr_flush(NULL, timeout_ms, reset_on_progress);
}
-EXPORT_SYMBOL(pr_flush);
/*
* Delayed printk version, for scheduler-internal messages:
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index 2b7b6ddab4f7..2dc4d5a1f1ff 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -203,7 +203,7 @@
* prb_rec_init_wr(&r, 5);
*
* // try to extend, but only if it does not exceed 32 bytes
- * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id()), 32) {
+ * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) {
* snprintf(&r.text_buf[r.info->text_len],
* r.text_buf_size - r.info->text_len, "hello");
*
diff --git a/kernel/profile.c b/kernel/profile.c
index 7ea01ba30e75..8a77769bc4b4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -59,43 +59,39 @@ int profile_setup(char *str)
static const char schedstr[] = "schedule";
static const char sleepstr[] = "sleep";
static const char kvmstr[] = "kvm";
+ const char *select = NULL;
int par;
if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
force_schedstat_enabled();
prof_on = SLEEP_PROFILING;
- if (str[strlen(sleepstr)] == ',')
- str += strlen(sleepstr) + 1;
- if (get_option(&str, &par))
- prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
- pr_info("kernel sleep profiling enabled (shift: %u)\n",
- prof_shift);
+ select = sleepstr;
#else
pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
} else if (!strncmp(str, schedstr, strlen(schedstr))) {
prof_on = SCHED_PROFILING;
- if (str[strlen(schedstr)] == ',')
- str += strlen(schedstr) + 1;
- if (get_option(&str, &par))
- prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
- pr_info("kernel schedule profiling enabled (shift: %u)\n",
- prof_shift);
+ select = schedstr;
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
prof_on = KVM_PROFILING;
- if (str[strlen(kvmstr)] == ',')
- str += strlen(kvmstr) + 1;
- if (get_option(&str, &par))
- prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
- pr_info("kernel KVM profiling enabled (shift: %u)\n",
- prof_shift);
+ select = kvmstr;
} else if (get_option(&str, &par)) {
prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
prof_on = CPU_PROFILING;
pr_info("kernel profiling enabled (shift: %u)\n",
prof_shift);
}
+
+ if (select) {
+ if (str[strlen(select)] == ',')
+ str += strlen(select) + 1;
+ if (get_option(&str, &par))
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel %s profiling enabled (shift: %u)\n",
+ select, prof_shift);
+ }
+
return 1;
}
__setup("profile=", profile_setup);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1893d909e45c..54482193e1ed 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -269,7 +269,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
read_unlock(&tasklist_lock);
if (!ret && !ignore_state &&
- WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED)))
+ WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED|TASK_FROZEN)))
ret = -ESRCH;
return ret;
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index d471d22a5e21..ab62074174c3 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -54,27 +54,25 @@ config RCU_EXPERT
Say N if you are unsure.
config SRCU
- bool
- help
- This option selects the sleepable version of RCU. This version
- permits arbitrary sleeping or blocking within RCU read-side critical
- sections.
+ def_bool y
config TINY_SRCU
bool
- default y if SRCU && TINY_RCU
+ default y if TINY_RCU
help
This option selects the single-CPU non-preemptible version of SRCU.
config TREE_SRCU
bool
- default y if SRCU && !TINY_RCU
+ default y if !TINY_RCU
help
This option selects the full-fledged version of SRCU.
+config NEED_SRCU_NMI_SAFE
+ def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU
+
config TASKS_RCU_GENERIC
def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU
- select SRCU
help
This option enables generic infrastructure code supporting
task-based RCU implementations. Not for manual selection.
@@ -311,4 +309,12 @@ config TASKS_TRACE_RCU_READ_MB
Say N here if you hate read-side memory barriers.
Take the default if you are unsure.
+config RCU_LAZY
+ bool "RCU callback lazy invocation functionality"
+ depends on RCU_NOCB_CPU
+ default n
+ help
+ To save power, batch RCU callbacks and flush after delay, memory
+ pressure, or callback list growing too big.
+
endmenu # "RCU Subsystem"
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 1b0c41d490f0..232e29fe3e5e 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -27,7 +27,6 @@ config RCU_SCALE_TEST
tristate "performance tests for RCU"
depends on DEBUG_KERNEL
select TORTURE_TEST
- select SRCU
default n
help
This option provides a kernel module that runs performance
@@ -43,7 +42,6 @@ config RCU_TORTURE_TEST
tristate "torture tests for RCU"
depends on DEBUG_KERNEL
select TORTURE_TEST
- select SRCU
default n
help
This option provides a kernel module that runs torture tests
@@ -59,7 +57,6 @@ config RCU_REF_SCALE_TEST
tristate "Scalability tests for read-side synchronization (RCU and others)"
depends on DEBUG_KERNEL
select TORTURE_TEST
- select SRCU
default n
help
This option provides a kernel module that runs performance tests
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index be5979da07f5..c5aa934de59b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -286,7 +286,7 @@ void rcu_test_sync_prims(void);
*/
extern void resched_cpu(int cpu);
-#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU)
+#if !defined(CONFIG_TINY_RCU)
#include <linux/rcu_node_tree.h>
@@ -375,6 +375,10 @@ extern void rcu_init_geometry(void);
(cpu) <= rnp->grphi; \
(cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
+#endif /* !defined(CONFIG_TINY_RCU) */
+
+#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
+
/*
* Wrappers for the rcu_node::lock acquire and release.
*
@@ -437,7 +441,7 @@ do { \
#define raw_lockdep_assert_held_rcu_node(p) \
lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
-#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
+#endif // #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
@@ -474,6 +478,14 @@ enum rcutorture_type {
INVALID_RCU_FLAVOR
};
+#if defined(CONFIG_RCU_LAZY)
+unsigned long rcu_lazy_get_jiffies_till_flush(void);
+void rcu_lazy_set_jiffies_till_flush(unsigned long j);
+#else
+static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
+static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
+#endif
+
#if defined(CONFIG_TREE_RCU)
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
unsigned long *gp_seq);
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 3ef02d4a8108..91fb5905a008 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -95,6 +95,7 @@ torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?");
torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate.");
+torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?");
static char *scale_type = "rcu";
module_param(scale_type, charp, 0444);
@@ -175,7 +176,7 @@ static struct rcu_scale_ops rcu_ops = {
.get_gp_seq = rcu_get_gp_seq,
.gp_diff = rcu_seq_diff,
.exp_completed = rcu_exp_batches_completed,
- .async = call_rcu,
+ .async = call_rcu_hurry,
.gp_barrier = rcu_barrier,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
@@ -659,6 +660,14 @@ struct kfree_obj {
struct rcu_head rh;
};
+/* Used if doing RCU-kfree'ing via call_rcu(). */
+static void kfree_call_rcu(struct rcu_head *rh)
+{
+ struct kfree_obj *obj = container_of(rh, struct kfree_obj, rh);
+
+ kfree(obj);
+}
+
static int
kfree_scale_thread(void *arg)
{
@@ -696,6 +705,11 @@ kfree_scale_thread(void *arg)
if (!alloc_ptr)
return -ENOMEM;
+ if (kfree_by_call_rcu) {
+ call_rcu(&(alloc_ptr->rh), kfree_call_rcu);
+ continue;
+ }
+
// By default kfree_rcu_test_single and kfree_rcu_test_double are
// initialized to false. If both have the same value (false or true)
// both are randomly tested, otherwise only the one with value true
@@ -767,11 +781,58 @@ kfree_scale_shutdown(void *arg)
return -EINVAL;
}
+// Used if doing RCU-kfree'ing via call_rcu().
+static unsigned long jiffies_at_lazy_cb;
+static struct rcu_head lazy_test1_rh;
+static int rcu_lazy_test1_cb_called;
+static void call_rcu_lazy_test1(struct rcu_head *rh)
+{
+ jiffies_at_lazy_cb = jiffies;
+ WRITE_ONCE(rcu_lazy_test1_cb_called, 1);
+}
+
static int __init
kfree_scale_init(void)
{
- long i;
int firsterr = 0;
+ long i;
+ unsigned long jif_start;
+ unsigned long orig_jif;
+
+ // Also, do a quick self-test to ensure laziness is as much as
+ // expected.
+ if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) {
+ pr_alert("CONFIG_RCU_LAZY is disabled, falling back to kfree_rcu() for delayed RCU kfree'ing\n");
+ kfree_by_call_rcu = 0;
+ }
+
+ if (kfree_by_call_rcu) {
+ /* do a test to check the timeout. */
+ orig_jif = rcu_lazy_get_jiffies_till_flush();
+
+ rcu_lazy_set_jiffies_till_flush(2 * HZ);
+ rcu_barrier();
+
+ jif_start = jiffies;
+ jiffies_at_lazy_cb = 0;
+ call_rcu(&lazy_test1_rh, call_rcu_lazy_test1);
+
+ smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1);
+
+ rcu_lazy_set_jiffies_till_flush(orig_jif);
+
+ if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) {
+ pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n");
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) {
+ pr_alert("ERROR: call_rcu() CBs are being too lazy!\n");
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+ }
kfree_nrealthreads = compute_real(kfree_nthreads);
/* Start up the kthreads. */
@@ -784,7 +845,9 @@ kfree_scale_init(void)
schedule_timeout_uninterruptible(1);
}
- pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj));
+ pr_alert("kfree object size=%zu, kfree_by_call_rcu=%d\n",
+ kfree_mult * sizeof(struct kfree_obj),
+ kfree_by_call_rcu);
kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
GFP_KERNEL);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d8e1b270a065..634df26a2c27 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -84,10 +84,15 @@ torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress test
torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()");
torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives");
+torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives");
+torture_param(bool, gp_cond_exp_full, false,
+ "Use conditional/async full-stateexpedited GP wait primitives");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
+torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives");
+torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives");
torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
@@ -194,16 +199,24 @@ static int rcu_torture_writer_state;
#define RTWS_DEF_FREE 3
#define RTWS_EXP_SYNC 4
#define RTWS_COND_GET 5
-#define RTWS_COND_GET_EXP 6
-#define RTWS_COND_SYNC 7
-#define RTWS_COND_SYNC_EXP 8
-#define RTWS_POLL_GET 9
-#define RTWS_POLL_GET_EXP 10
-#define RTWS_POLL_WAIT 11
-#define RTWS_POLL_WAIT_EXP 12
-#define RTWS_SYNC 13
-#define RTWS_STUTTER 14
-#define RTWS_STOPPING 15
+#define RTWS_COND_GET_FULL 6
+#define RTWS_COND_GET_EXP 7
+#define RTWS_COND_GET_EXP_FULL 8
+#define RTWS_COND_SYNC 9
+#define RTWS_COND_SYNC_FULL 10
+#define RTWS_COND_SYNC_EXP 11
+#define RTWS_COND_SYNC_EXP_FULL 12
+#define RTWS_POLL_GET 13
+#define RTWS_POLL_GET_FULL 14
+#define RTWS_POLL_GET_EXP 15
+#define RTWS_POLL_GET_EXP_FULL 16
+#define RTWS_POLL_WAIT 17
+#define RTWS_POLL_WAIT_FULL 18
+#define RTWS_POLL_WAIT_EXP 19
+#define RTWS_POLL_WAIT_EXP_FULL 20
+#define RTWS_SYNC 21
+#define RTWS_STUTTER 22
+#define RTWS_STOPPING 23
static const char * const rcu_torture_writer_state_names[] = {
"RTWS_FIXED_DELAY",
"RTWS_DELAY",
@@ -211,13 +224,21 @@ static const char * const rcu_torture_writer_state_names[] = {
"RTWS_DEF_FREE",
"RTWS_EXP_SYNC",
"RTWS_COND_GET",
+ "RTWS_COND_GET_FULL",
"RTWS_COND_GET_EXP",
+ "RTWS_COND_GET_EXP_FULL",
"RTWS_COND_SYNC",
+ "RTWS_COND_SYNC_FULL",
"RTWS_COND_SYNC_EXP",
+ "RTWS_COND_SYNC_EXP_FULL",
"RTWS_POLL_GET",
+ "RTWS_POLL_GET_FULL",
"RTWS_POLL_GET_EXP",
+ "RTWS_POLL_GET_EXP_FULL",
"RTWS_POLL_WAIT",
+ "RTWS_POLL_WAIT_FULL",
"RTWS_POLL_WAIT_EXP",
+ "RTWS_POLL_WAIT_EXP_FULL",
"RTWS_SYNC",
"RTWS_STUTTER",
"RTWS_STOPPING",
@@ -332,13 +353,25 @@ struct rcu_torture_ops {
void (*exp_sync)(void);
unsigned long (*get_gp_state_exp)(void);
unsigned long (*start_gp_poll_exp)(void);
+ void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp);
bool (*poll_gp_state_exp)(unsigned long oldstate);
void (*cond_sync_exp)(unsigned long oldstate);
+ void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp);
+ unsigned long (*get_comp_state)(void);
+ void (*get_comp_state_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*same_gp_state)(unsigned long oldstate1, unsigned long oldstate2);
+ bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2);
unsigned long (*get_gp_state)(void);
+ void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp);
unsigned long (*get_gp_completed)(void);
+ void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp);
unsigned long (*start_gp_poll)(void);
+ void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp);
bool (*poll_gp_state)(unsigned long oldstate);
+ bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*poll_need_2gp)(bool poll, bool poll_full);
void (*cond_sync)(unsigned long oldstate);
+ void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp);
call_rcu_func_t call;
void (*cb_barrier)(void);
void (*fqs)(void);
@@ -481,7 +514,7 @@ static unsigned long rcu_no_completed(void)
static void rcu_torture_deferred_free(struct rcu_torture *p)
{
- call_rcu(&p->rtort_rcu, rcu_torture_cb);
+ call_rcu_hurry(&p->rtort_rcu, rcu_torture_cb);
}
static void rcu_sync_torture_init(void)
@@ -489,6 +522,11 @@ static void rcu_sync_torture_init(void)
INIT_LIST_HEAD(&rcu_torture_removed);
}
+static bool rcu_poll_need_2gp(bool poll, bool poll_full)
+{
+ return poll;
+}
+
static struct rcu_torture_ops rcu_ops = {
.ttype = RCU_FLAVOR,
.init = rcu_sync_torture_init,
@@ -501,16 +539,27 @@ static struct rcu_torture_ops rcu_ops = {
.deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
+ .same_gp_state = same_state_synchronize_rcu,
+ .same_gp_state_full = same_state_synchronize_rcu_full,
+ .get_comp_state = get_completed_synchronize_rcu,
+ .get_comp_state_full = get_completed_synchronize_rcu_full,
.get_gp_state = get_state_synchronize_rcu,
+ .get_gp_state_full = get_state_synchronize_rcu_full,
.get_gp_completed = get_completed_synchronize_rcu,
+ .get_gp_completed_full = get_completed_synchronize_rcu_full,
.start_gp_poll = start_poll_synchronize_rcu,
+ .start_gp_poll_full = start_poll_synchronize_rcu_full,
.poll_gp_state = poll_state_synchronize_rcu,
+ .poll_gp_state_full = poll_state_synchronize_rcu_full,
+ .poll_need_2gp = rcu_poll_need_2gp,
.cond_sync = cond_synchronize_rcu,
+ .cond_sync_full = cond_synchronize_rcu_full,
.get_gp_state_exp = get_state_synchronize_rcu,
.start_gp_poll_exp = start_poll_synchronize_rcu_expedited,
+ .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full,
.poll_gp_state_exp = poll_state_synchronize_rcu,
.cond_sync_exp = cond_synchronize_rcu_expedited,
- .call = call_rcu,
+ .call = call_rcu_hurry,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
.stats = NULL,
@@ -574,10 +623,14 @@ static struct rcu_torture_ops rcu_busted_ops = {
DEFINE_STATIC_SRCU(srcu_ctl);
static struct srcu_struct srcu_ctld;
static struct srcu_struct *srcu_ctlp = &srcu_ctl;
+static struct rcu_torture_ops srcud_ops;
static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
{
- return srcu_read_lock(srcu_ctlp);
+ if (cur_ops == &srcud_ops)
+ return srcu_read_lock_nmisafe(srcu_ctlp);
+ else
+ return srcu_read_lock(srcu_ctlp);
}
static void
@@ -601,7 +654,10 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
{
- srcu_read_unlock(srcu_ctlp, idx);
+ if (cur_ops == &srcud_ops)
+ srcu_read_unlock_nmisafe(srcu_ctlp, idx);
+ else
+ srcu_read_unlock(srcu_ctlp, idx);
}
static int torture_srcu_read_lock_held(void)
@@ -709,6 +765,9 @@ static struct rcu_torture_ops srcud_ops = {
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .get_gp_state = srcu_torture_get_gp_state,
+ .start_gp_poll = srcu_torture_start_gp_poll,
+ .poll_gp_state = srcu_torture_poll_gp_state,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
@@ -804,7 +863,7 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p)
static void synchronize_rcu_mult_test(void)
{
- synchronize_rcu_mult(call_rcu_tasks, call_rcu);
+ synchronize_rcu_mult(call_rcu_tasks, call_rcu_hurry);
}
static struct rcu_torture_ops tasks_ops = {
@@ -1148,15 +1207,35 @@ static int nsynctypes;
*/
static void rcu_torture_write_types(void)
{
- bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp;
- bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
- bool gp_sync1 = gp_sync;
+ bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_cond_full1 = gp_cond_full;
+ bool gp_cond_exp_full1 = gp_cond_exp_full, gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp;
+ bool gp_poll_exp_full1 = gp_poll_exp_full, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
+ bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync;
/* Initialize synctype[] array. If none set, take default. */
- if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp &&
- !gp_normal1 && !gp_poll1 && !gp_sync1)
- gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 =
- gp_normal1 = gp_poll1 = gp_sync1 = true;
+ if (!gp_cond1 &&
+ !gp_cond_exp1 &&
+ !gp_cond_full1 &&
+ !gp_cond_exp_full1 &&
+ !gp_exp1 &&
+ !gp_poll_exp1 &&
+ !gp_poll_exp_full1 &&
+ !gp_normal1 &&
+ !gp_poll1 &&
+ !gp_poll_full1 &&
+ !gp_sync1) {
+ gp_cond1 = true;
+ gp_cond_exp1 = true;
+ gp_cond_full1 = true;
+ gp_cond_exp_full1 = true;
+ gp_exp1 = true;
+ gp_poll_exp1 = true;
+ gp_poll_exp_full1 = true;
+ gp_normal1 = true;
+ gp_poll1 = true;
+ gp_poll_full1 = true;
+ gp_sync1 = true;
+ }
if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
synctype[nsynctypes++] = RTWS_COND_GET;
pr_info("%s: Testing conditional GPs.\n", __func__);
@@ -1169,6 +1248,19 @@ static void rcu_torture_write_types(void)
} else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) {
pr_alert("%s: gp_cond_exp without primitives.\n", __func__);
}
+ if (gp_cond_full1 && cur_ops->get_gp_state && cur_ops->cond_sync_full) {
+ synctype[nsynctypes++] = RTWS_COND_GET_FULL;
+ pr_info("%s: Testing conditional full-state GPs.\n", __func__);
+ } else if (gp_cond_full && (!cur_ops->get_gp_state || !cur_ops->cond_sync_full)) {
+ pr_alert("%s: gp_cond_full without primitives.\n", __func__);
+ }
+ if (gp_cond_exp_full1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp_full) {
+ synctype[nsynctypes++] = RTWS_COND_GET_EXP_FULL;
+ pr_info("%s: Testing conditional full-state expedited GPs.\n", __func__);
+ } else if (gp_cond_exp_full &&
+ (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp_full)) {
+ pr_alert("%s: gp_cond_exp_full without primitives.\n", __func__);
+ }
if (gp_exp1 && cur_ops->exp_sync) {
synctype[nsynctypes++] = RTWS_EXP_SYNC;
pr_info("%s: Testing expedited GPs.\n", __func__);
@@ -1181,18 +1273,33 @@ static void rcu_torture_write_types(void)
} else if (gp_normal && !cur_ops->deferred_free) {
pr_alert("%s: gp_normal without primitives.\n", __func__);
}
- if (gp_poll1 && cur_ops->start_gp_poll && cur_ops->poll_gp_state) {
+ if (gp_poll1 && cur_ops->get_comp_state && cur_ops->same_gp_state &&
+ cur_ops->start_gp_poll && cur_ops->poll_gp_state) {
synctype[nsynctypes++] = RTWS_POLL_GET;
pr_info("%s: Testing polling GPs.\n", __func__);
} else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) {
pr_alert("%s: gp_poll without primitives.\n", __func__);
}
+ if (gp_poll_full1 && cur_ops->get_comp_state_full && cur_ops->same_gp_state_full
+ && cur_ops->start_gp_poll_full && cur_ops->poll_gp_state_full) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_FULL;
+ pr_info("%s: Testing polling full-state GPs.\n", __func__);
+ } else if (gp_poll_full && (!cur_ops->start_gp_poll_full || !cur_ops->poll_gp_state_full)) {
+ pr_alert("%s: gp_poll_full without primitives.\n", __func__);
+ }
if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) {
synctype[nsynctypes++] = RTWS_POLL_GET_EXP;
pr_info("%s: Testing polling expedited GPs.\n", __func__);
} else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) {
pr_alert("%s: gp_poll_exp without primitives.\n", __func__);
}
+ if (gp_poll_exp_full1 && cur_ops->start_gp_poll_exp_full && cur_ops->poll_gp_state_full) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_EXP_FULL;
+ pr_info("%s: Testing polling full-state expedited GPs.\n", __func__);
+ } else if (gp_poll_exp_full &&
+ (!cur_ops->start_gp_poll_exp_full || !cur_ops->poll_gp_state_full)) {
+ pr_alert("%s: gp_poll_exp_full without primitives.\n", __func__);
+ }
if (gp_sync1 && cur_ops->sync) {
synctype[nsynctypes++] = RTWS_SYNC;
pr_info("%s: Testing normal GPs.\n", __func__);
@@ -1202,6 +1309,40 @@ static void rcu_torture_write_types(void)
}
/*
+ * Do the specified rcu_torture_writer() synchronous grace period,
+ * while also testing out the polled APIs. Note well that the single-CPU
+ * grace-period optimizations must be accounted for.
+ */
+static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void))
+{
+ unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
+ bool dopoll;
+ bool dopoll_full;
+ unsigned long r = torture_random(trsp);
+
+ dopoll = cur_ops->get_gp_state && cur_ops->poll_gp_state && !(r & 0x300);
+ dopoll_full = cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full && !(r & 0xc00);
+ if (dopoll || dopoll_full)
+ cpus_read_lock();
+ if (dopoll)
+ cookie = cur_ops->get_gp_state();
+ if (dopoll_full)
+ cur_ops->get_gp_state_full(&cookie_full);
+ if (cur_ops->poll_need_2gp && cur_ops->poll_need_2gp(dopoll, dopoll_full))
+ sync();
+ sync();
+ WARN_ONCE(dopoll && !cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 3 failed %pS() online %*pbl.",
+ __func__, sync, cpumask_pr_args(cpu_online_mask));
+ WARN_ONCE(dopoll_full && !cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 4 failed %pS() online %*pbl",
+ __func__, sync, cpumask_pr_args(cpu_online_mask));
+ if (dopoll || dopoll_full)
+ cpus_read_unlock();
+}
+
+/*
* RCU torture writer kthread. Repeatedly substitutes a new structure
* for that pointed to by rcu_torture_current, freeing the old structure
* after a series of grace periods (the "pipeline").
@@ -1212,15 +1353,21 @@ rcu_torture_writer(void *arg)
bool boot_ended;
bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
int expediting = 0;
unsigned long gp_snap;
+ unsigned long gp_snap1;
+ struct rcu_gp_oldstate gp_snap_full;
+ struct rcu_gp_oldstate gp_snap1_full;
int i;
int idx;
int oldnice = task_nice(current);
+ struct rcu_gp_oldstate rgo[NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE];
struct rcu_torture *rp;
struct rcu_torture *old_rp;
static DEFINE_TORTURE_RANDOM(rand);
bool stutter_waited;
+ unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE];
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
if (!can_expedite)
@@ -1261,11 +1408,12 @@ rcu_torture_writer(void *arg)
atomic_inc(&rcu_torture_wcount[i]);
WRITE_ONCE(old_rp->rtort_pipe_count,
old_rp->rtort_pipe_count + 1);
+
+ // Make sure readers block polled grace periods.
if (cur_ops->get_gp_state && cur_ops->poll_gp_state) {
idx = cur_ops->readlock();
cookie = cur_ops->get_gp_state();
- WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE &&
- cur_ops->poll_gp_state(cookie),
+ WARN_ONCE(cur_ops->poll_gp_state(cookie),
"%s: Cookie check 1 failed %s(%d) %lu->%lu\n",
__func__,
rcu_torture_writer_state_getname(),
@@ -1277,6 +1425,21 @@ rcu_torture_writer(void *arg)
}
cur_ops->readunlock(idx);
}
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) {
+ idx = cur_ops->readlock();
+ cur_ops->get_gp_state_full(&cookie_full);
+ WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 5 failed %s(%d) online %*pbl\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cpumask_pr_args(cpu_online_mask));
+ if (cur_ops->get_gp_completed_full) {
+ cur_ops->get_gp_completed_full(&cookie_full);
+ WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
+ }
+ cur_ops->readunlock(idx);
+ }
switch (synctype[torture_random(&rand) % nsynctypes]) {
case RTWS_DEF_FREE:
rcu_torture_writer_state = RTWS_DEF_FREE;
@@ -1284,12 +1447,7 @@ rcu_torture_writer(void *arg)
break;
case RTWS_EXP_SYNC:
rcu_torture_writer_state = RTWS_EXP_SYNC;
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- cookie = cur_ops->get_gp_state();
- cur_ops->exp_sync();
- cur_ops->exp_sync();
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
+ do_rtws_sync(&rand, cur_ops->exp_sync);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_COND_GET:
@@ -1308,13 +1466,61 @@ rcu_torture_writer(void *arg)
cur_ops->cond_sync_exp(gp_snap);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_COND_GET_FULL:
+ rcu_torture_writer_state = RTWS_COND_GET_FULL;
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_FULL;
+ cur_ops->cond_sync_full(&gp_snap_full);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET_EXP_FULL:
+ rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL;
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL;
+ cur_ops->cond_sync_exp_full(&gp_snap_full);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_POLL_GET:
rcu_torture_writer_state = RTWS_POLL_GET;
+ for (i = 0; i < ARRAY_SIZE(ulo); i++)
+ ulo[i] = cur_ops->get_comp_state();
gp_snap = cur_ops->start_gp_poll();
rcu_torture_writer_state = RTWS_POLL_WAIT;
- while (!cur_ops->poll_gp_state(gp_snap))
+ while (!cur_ops->poll_gp_state(gp_snap)) {
+ gp_snap1 = cur_ops->get_gp_state();
+ for (i = 0; i < ARRAY_SIZE(ulo); i++)
+ if (cur_ops->poll_gp_state(ulo[i]) ||
+ cur_ops->same_gp_state(ulo[i], gp_snap1)) {
+ ulo[i] = gp_snap1;
+ break;
+ }
+ WARN_ON_ONCE(i >= ARRAY_SIZE(ulo));
torture_hrtimeout_jiffies(torture_random(&rand) % 16,
&rand);
+ }
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET_FULL:
+ rcu_torture_writer_state = RTWS_POLL_GET_FULL;
+ for (i = 0; i < ARRAY_SIZE(rgo); i++)
+ cur_ops->get_comp_state_full(&rgo[i]);
+ cur_ops->start_gp_poll_full(&gp_snap_full);
+ rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ cur_ops->get_gp_state_full(&gp_snap1_full);
+ for (i = 0; i < ARRAY_SIZE(rgo); i++)
+ if (cur_ops->poll_gp_state_full(&rgo[i]) ||
+ cur_ops->same_gp_state_full(&rgo[i],
+ &gp_snap1_full)) {
+ rgo[i] = gp_snap1_full;
+ break;
+ }
+ WARN_ON_ONCE(i >= ARRAY_SIZE(rgo));
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
rcu_torture_pipe_update(old_rp);
break;
case RTWS_POLL_GET_EXP:
@@ -1326,14 +1532,18 @@ rcu_torture_writer(void *arg)
&rand);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_POLL_GET_EXP_FULL:
+ rcu_torture_writer_state = RTWS_POLL_GET_EXP_FULL;
+ cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+ rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL;
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full))
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_SYNC:
rcu_torture_writer_state = RTWS_SYNC;
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- cookie = cur_ops->get_gp_state();
- cur_ops->sync();
- cur_ops->sync();
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
+ do_rtws_sync(&rand, cur_ops->sync);
rcu_torture_pipe_update(old_rp);
break;
default:
@@ -1400,6 +1610,7 @@ static int
rcu_torture_fakewriter(void *arg)
{
unsigned long gp_snap;
+ struct rcu_gp_oldstate gp_snap_full;
DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
@@ -1438,6 +1649,16 @@ rcu_torture_fakewriter(void *arg)
torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
cur_ops->cond_sync_exp(gp_snap);
break;
+ case RTWS_COND_GET_FULL:
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_full(&gp_snap_full);
+ break;
+ case RTWS_COND_GET_EXP_FULL:
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_exp_full(&gp_snap_full);
+ break;
case RTWS_POLL_GET:
gp_snap = cur_ops->start_gp_poll();
while (!cur_ops->poll_gp_state(gp_snap)) {
@@ -1445,6 +1666,13 @@ rcu_torture_fakewriter(void *arg)
&rand);
}
break;
+ case RTWS_POLL_GET_FULL:
+ cur_ops->start_gp_poll_full(&gp_snap_full);
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
case RTWS_POLL_GET_EXP:
gp_snap = cur_ops->start_gp_poll_exp();
while (!cur_ops->poll_gp_state_exp(gp_snap)) {
@@ -1452,6 +1680,13 @@ rcu_torture_fakewriter(void *arg)
&rand);
}
break;
+ case RTWS_POLL_GET_EXP_FULL:
+ cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
case RTWS_SYNC:
cur_ops->sync();
break;
@@ -1715,7 +1950,9 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
*/
static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
{
+ bool checkpolling = !(torture_random(trsp) & 0xfff);
unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
int i;
unsigned long started;
unsigned long completed;
@@ -1731,8 +1968,12 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
WARN_ON_ONCE(!rcu_is_watching());
newstate = rcutorture_extend_mask(readstate, trsp);
rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- cookie = cur_ops->get_gp_state();
+ if (checkpolling) {
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ cookie = cur_ops->get_gp_state();
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+ cur_ops->get_gp_state_full(&cookie_full);
+ }
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
@@ -1766,13 +2007,22 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
}
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- WARN_ONCE(cur_ops->poll_gp_state(cookie),
- "%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
- __func__,
- rcu_torture_writer_state_getname(),
- rcu_torture_writer_state,
- cookie, cur_ops->get_gp_state());
+ if (checkpolling) {
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ONCE(cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+ WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 6 failed %s(%d) online %*pbl\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cpumask_pr_args(cpu_online_mask));
+ }
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
WARN_ON_ONCE(readstate);
// This next splat is expected behavior if leakpointer, especially
@@ -2600,12 +2850,12 @@ static int rcutorture_oom_notify(struct notifier_block *self,
for (i = 0; i < fwd_progress; i++)
ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
- rcu_barrier();
+ cur_ops->cb_barrier();
ncbs = 0;
for (i = 0; i < fwd_progress; i++)
ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
- rcu_barrier();
+ cur_ops->cb_barrier();
ncbs = 0;
for (i = 0; i < fwd_progress; i++)
ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
@@ -3182,13 +3432,13 @@ static void rcu_test_debug_objects(void)
/* Try to queue the rh2 pair of callbacks for the same grace period. */
preempt_disable(); /* Prevent preemption from interrupting test. */
rcu_read_lock(); /* Make it impossible to finish a grace period. */
- call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
+ call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */
local_irq_disable(); /* Make it harder to start a new grace period. */
- call_rcu(&rh2, rcu_torture_leak_cb);
- call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ call_rcu_hurry(&rh2, rcu_torture_leak_cb);
+ call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
if (rhp) {
- call_rcu(rhp, rcu_torture_leak_cb);
- call_rcu(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
+ call_rcu_hurry(rhp, rcu_torture_leak_cb);
+ call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
}
local_irq_enable();
rcu_read_unlock();
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 92c002d65482..b12fb0cec44d 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -117,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+ if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
return; /* Already running or nothing to do. */
/* Remove recently arrived callbacks and wait for readers. */
@@ -150,17 +150,17 @@ void srcu_drive_gp(struct work_struct *wp)
* straighten that out.
*/
WRITE_ONCE(ssp->srcu_gp_running, false);
- if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+ if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
schedule_work(&ssp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
{
- unsigned short cookie;
+ unsigned long cookie;
cookie = get_state_synchronize_srcu(ssp);
- if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+ if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
return;
WRITE_ONCE(ssp->srcu_idx_max, cookie);
if (!READ_ONCE(ssp->srcu_gp_running)) {
@@ -197,6 +197,16 @@ void synchronize_srcu(struct srcu_struct *ssp)
{
struct rcu_synchronize rs;
+ RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
+
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return;
+
+ might_sleep();
init_rcu_head_on_stack(&rs.head);
init_completion(&rs.completion);
call_srcu(ssp, &rs.head, wakeme_after_rcu);
@@ -215,7 +225,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
barrier();
ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1;
barrier();
- return ret & USHRT_MAX;
+ return ret;
}
EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
@@ -240,10 +250,10 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
*/
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
- bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie);
+ unsigned long cur_s = READ_ONCE(ssp->srcu_idx);
barrier();
- return ret;
+ return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 1c304fec89c0..ca4b5dcec675 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -417,7 +417,7 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
- sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
+ sum += atomic_long_read(&cpuc->srcu_lock_count[idx]);
}
return sum;
}
@@ -429,13 +429,18 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
{
int cpu;
+ unsigned long mask = 0;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
- sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
+ sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]);
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ mask = mask | READ_ONCE(cpuc->srcu_nmi_safety);
}
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)),
+ "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp);
return sum;
}
@@ -503,10 +508,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
- sum += READ_ONCE(cpuc->srcu_lock_count[0]);
- sum += READ_ONCE(cpuc->srcu_lock_count[1]);
- sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
- sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
+ sum += atomic_long_read(&cpuc->srcu_lock_count[0]);
+ sum += atomic_long_read(&cpuc->srcu_lock_count[1]);
+ sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]);
+ sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]);
}
return sum;
}
@@ -626,6 +631,29 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+#ifdef CONFIG_PROVE_RCU
+/*
+ * Check for consistent NMI safety.
+ */
+void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe)
+{
+ int nmi_safe_mask = 1 << nmi_safe;
+ int old_nmi_safe_mask;
+ struct srcu_data *sdp;
+
+ /* NMI-unsafe use in NMI is a bad sign */
+ WARN_ON_ONCE(!nmi_safe && in_nmi());
+ sdp = raw_cpu_ptr(ssp->sda);
+ old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety);
+ if (!old_nmi_safe_mask) {
+ WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask);
+ return;
+ }
+ WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask);
+}
+EXPORT_SYMBOL_GPL(srcu_check_nmi_safety);
+#endif /* CONFIG_PROVE_RCU */
+
/*
* Counts the new reader in the appropriate per-CPU element of the
* srcu_struct.
@@ -636,7 +664,7 @@ int __srcu_read_lock(struct srcu_struct *ssp)
int idx;
idx = READ_ONCE(ssp->srcu_idx) & 0x1;
- this_cpu_inc(ssp->sda->srcu_lock_count[idx]);
+ this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
return idx;
}
@@ -650,10 +678,45 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_inc(ssp->sda->srcu_unlock_count[idx]);
+ this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
+#ifdef CONFIG_NEED_SRCU_NMI_SAFE
+
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct, but in an NMI-safe manner using RMW atomics.
+ * Returns an index that must be passed to the matching srcu_read_unlock().
+ */
+int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
+{
+ int idx;
+ struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
+
+ idx = READ_ONCE(ssp->srcu_idx) & 0x1;
+ atomic_long_inc(&sdp->srcu_lock_count[idx]);
+ smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */
+ return idx;
+}
+EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
+
+/*
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct. Note that this may well be a different
+ * CPU than that which was incremented by the corresponding srcu_read_lock().
+ */
+void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
+{
+ struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
+
+ smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */
+ atomic_long_inc(&sdp->srcu_unlock_count[idx]);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);
+
+#endif // CONFIG_NEED_SRCU_NMI_SAFE
+
/*
* Start an SRCU grace period.
*/
@@ -1090,7 +1153,12 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
int ss_state;
check_init_srcu_struct(ssp);
- idx = srcu_read_lock(ssp);
+ /*
+ * While starting a new grace period, make sure we are in an
+ * SRCU read-side critical section so that the grace-period
+ * sequence number cannot wrap around in the meantime.
+ */
+ idx = __srcu_read_lock_nmisafe(ssp);
ss_state = smp_load_acquire(&ssp->srcu_size_state);
if (ss_state < SRCU_SIZE_WAIT_CALL)
sdp = per_cpu_ptr(ssp->sda, 0);
@@ -1123,7 +1191,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
srcu_funnel_gp_start(ssp, sdp, s, do_norm);
else if (needexp)
srcu_funnel_exp_start(ssp, sdp_mynode, s);
- srcu_read_unlock(ssp, idx);
+ __srcu_read_unlock_nmisafe(ssp, idx);
return s;
}
@@ -1427,13 +1495,13 @@ void srcu_barrier(struct srcu_struct *ssp)
/* Initial count prevents reaching zero until all CBs are posted. */
atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
- idx = srcu_read_lock(ssp);
+ idx = __srcu_read_lock_nmisafe(ssp);
if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
else
for_each_possible_cpu(cpu)
srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
- srcu_read_unlock(ssp, idx);
+ __srcu_read_unlock_nmisafe(ssp, idx);
/* Remove the initial count, at which point reaching zero can happen. */
if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
@@ -1687,8 +1755,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
struct srcu_data *sdp;
sdp = per_cpu_ptr(ssp->sda, cpu);
- u0 = data_race(sdp->srcu_unlock_count[!idx]);
- u1 = data_race(sdp->srcu_unlock_count[idx]);
+ u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx]));
+ u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx]));
/*
* Make sure that a lock is always counted if the corresponding
@@ -1696,8 +1764,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
*/
smp_rmb();
- l0 = data_race(sdp->srcu_lock_count[!idx]);
- l1 = data_race(sdp->srcu_lock_count[idx]);
+ l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx]));
+ l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx]));
c0 = l0 - u0;
c1 = l1 - u1;
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 5cefc702158f..e550f97779b8 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -44,7 +44,7 @@ static void rcu_sync_func(struct rcu_head *rhp);
static void rcu_sync_call(struct rcu_sync *rsp)
{
- call_rcu(&rsp->cb_head, rcu_sync_func);
+ call_rcu_hurry(&rsp->cb_head, rcu_sync_func);
}
/**
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 83c7e6620d40..fe9840d90e96 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -560,7 +560,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
{
/* Complain if the scheduler has not started. */
- RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+ WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
"synchronize_rcu_tasks called too soon");
// If the grace-period kthread is running, use it.
@@ -728,7 +728,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) {
lastinfo = j;
rtsi = rtsi * rcu_task_stall_info_mult;
- pr_info("%s: %s grace period %lu is %lu jiffies old.\n",
+ pr_info("%s: %s grace period number %lu (since boot) is %lu jiffies old.\n",
__func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start);
}
}
@@ -1500,6 +1500,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop)
if (rcu_tasks_trace_pertask_prep(t, true))
trc_add_holdout(t, hop);
rcu_read_unlock();
+ cond_resched_tasks_rcu_qs();
}
// Only after all running tasks have been accounted for is it
@@ -1520,6 +1521,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop)
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
}
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ cond_resched_tasks_rcu_qs();
}
// Re-enable CPU hotplug now that the holdout list is populated.
@@ -1533,6 +1535,8 @@ static void rcu_tasks_trace_postscan(struct list_head *hop)
{
// Wait for late-stage exiting tasks to finish exiting.
// These might have passed the call to exit_tasks_rcu_finish().
+
+ // If you remove the following line, update rcu_trace_implies_rcu_gp()!!!
synchronize_rcu();
// Any tasks that exit after this point will set
// TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
@@ -1619,6 +1623,7 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
trc_del_holdout(t);
else if (needreport)
show_stalled_task_trace(t, firstreport);
+ cond_resched_tasks_rcu_qs();
}
// Re-enable CPU hotplug now that the holdout list scan has completed.
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index f0561ee16b9c..72913ce21258 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -44,7 +44,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
void rcu_barrier(void)
{
- wait_rcu_gp(call_rcu);
+ wait_rcu_gp(call_rcu_hurry);
}
EXPORT_SYMBOL(rcu_barrier);
@@ -158,6 +158,10 @@ void synchronize_rcu(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
+static void tiny_rcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
/*
* Post an RCU callback to be invoked after the end of an RCU grace
* period. But since we have but one CPU, that would be after any
@@ -165,9 +169,20 @@ EXPORT_SYMBOL_GPL(synchronize_rcu);
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
+ static atomic_t doublefrees;
unsigned long flags;
- debug_rcu_head_queue(head);
+ if (debug_rcu_head_queue(head)) {
+ if (atomic_inc_return(&doublefrees) < 4) {
+ pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
+ mem_dump_obj(head);
+ }
+
+ if (!__is_kvfree_rcu_offset((unsigned long)head->func))
+ WRITE_ONCE(head->func, tiny_rcu_leak_callback);
+ return;
+ }
+
head->func = func;
head->next = NULL;
@@ -184,6 +199,16 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
EXPORT_SYMBOL_GPL(call_rcu);
/*
+ * Store a grace-period-counter "cookie". For more information,
+ * see the Tree RCU header comment.
+ */
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
+
+/*
* Return a grace-period-counter "cookie". For more information,
* see the Tree RCU header comment.
*/
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 79aea7df4345..cf34a961821a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -76,6 +76,7 @@
/* Data structures. */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
+ .gpwrap = true,
#ifdef CONFIG_RCU_NOCB_CPU
.cblist.flags = SEGCBLIST_RCU_CORE,
#endif
@@ -300,12 +301,6 @@ static bool rcu_dynticks_in_eqs(int snap)
return !(snap & RCU_DYNTICKS_IDX);
}
-/* Return true if the specified CPU is currently idle from an RCU viewpoint. */
-bool rcu_is_idle_cpu(int cpu)
-{
- return rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
-}
-
/*
* Return true if the CPU corresponding to the specified rcu_data
* structure has spent some time in an extended quiescent state since
@@ -1367,7 +1362,7 @@ static void rcu_poll_gp_seq_start(unsigned long *snap)
{
struct rcu_node *rnp = rcu_get_root();
- if (rcu_init_invoked())
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
raw_lockdep_assert_held_rcu_node(rnp);
// If RCU was idle, note beginning of GP.
@@ -1383,7 +1378,7 @@ static void rcu_poll_gp_seq_end(unsigned long *snap)
{
struct rcu_node *rnp = rcu_get_root();
- if (rcu_init_invoked())
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
raw_lockdep_assert_held_rcu_node(rnp);
// If the previously noted GP is still in effect, record the
@@ -1402,30 +1397,34 @@ static void rcu_poll_gp_seq_end(unsigned long *snap)
// where caller does not hold the root rcu_node structure's lock.
static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
{
+ unsigned long flags;
struct rcu_node *rnp = rcu_get_root();
if (rcu_init_invoked()) {
- lockdep_assert_irqs_enabled();
- raw_spin_lock_irq_rcu_node(rnp);
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
rcu_poll_gp_seq_start(snap);
if (rcu_init_invoked())
- raw_spin_unlock_irq_rcu_node(rnp);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
// Make the polled API aware of the end of a grace period, but where
// caller does not hold the root rcu_node structure's lock.
static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
{
+ unsigned long flags;
struct rcu_node *rnp = rcu_get_root();
if (rcu_init_invoked()) {
- lockdep_assert_irqs_enabled();
- raw_spin_lock_irq_rcu_node(rnp);
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
rcu_poll_gp_seq_end(snap);
if (rcu_init_invoked())
- raw_spin_unlock_irq_rcu_node(rnp);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/*
@@ -1755,6 +1754,8 @@ static noinline void rcu_gp_cleanup(void)
dump_blkd_tasks(rnp, 10);
WARN_ON_ONCE(rnp->qsmask);
WRITE_ONCE(rnp->gp_seq, new_gp_seq);
+ if (!rnp->parent)
+ smp_mb(); // Order against failing poll_state_synchronize_rcu_full().
rdp = this_cpu_ptr(&rcu_data);
if (rnp == rdp->mynode)
needgp = __note_gp_changes(rnp, rdp) || needgp;
@@ -2103,7 +2104,7 @@ int rcutree_dying_cpu(unsigned int cpu)
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
return 0;
- blkd = !!(rnp->qsmask & rdp->grpmask);
+ blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
return 0;
@@ -2341,8 +2342,8 @@ void rcu_sched_clock_irq(int user)
rcu_flavor_sched_clock_irq(user);
if (rcu_pending(user))
invoke_rcu_core();
- if (user)
- rcu_tasks_classic_qs(current, false);
+ if (user || rcu_is_cpu_rrupt_from_idle())
+ rcu_note_voluntary_context_switch(current);
lockdep_assert_irqs_disabled();
trace_rcu_utilization(TPS("End scheduler-tick"));
@@ -2413,7 +2414,7 @@ void rcu_force_quiescent_state(void)
struct rcu_node *rnp_old = NULL;
/* Funnel through hierarchy to reduce memory contention. */
- rnp = __this_cpu_read(rcu_data.mynode);
+ rnp = raw_cpu_read(rcu_data.mynode);
for (; rnp != NULL; rnp = rnp->parent) {
ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) ||
!raw_spin_trylock(&rnp->fqslock);
@@ -2725,47 +2726,8 @@ static void check_cb_ovld(struct rcu_data *rdp)
raw_spin_unlock_rcu_node(rnp);
}
-/**
- * call_rcu() - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all pre-existing RCU read-side
- * critical sections have completed. However, the callback function
- * might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked.
- *
- * RCU read-side critical sections are delimited by rcu_read_lock()
- * and rcu_read_unlock(), and may be nested. In addition, but only in
- * v5.0 and later, regions of code across which interrupts, preemption,
- * or softirqs have been disabled also serve as RCU read-side critical
- * sections. This includes hardware interrupt handlers, softirq handlers,
- * and NMI handlers.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing RCU read-side critical section. On systems with more
- * than one CPU, this means that when "func()" is invoked, each CPU is
- * guaranteed to have executed a full memory barrier since the end of its
- * last RCU read-side critical section whose beginning preceded the call
- * to call_rcu(). It also means that each CPU executing an RCU read-side
- * critical section that continues beyond the start of "func()" must have
- * executed a memory barrier after the call_rcu() but before the beginning
- * of that RCU read-side critical section. Note that these guarantees
- * include CPUs that are offline, idle, or executing in user mode, as
- * well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting RCU callback function "func()", then both CPU A and CPU B are
- * guaranteed to execute a full memory barrier during the time interval
- * between the call to call_rcu() and the invocation of "func()" -- even
- * if CPU A and CPU B are the same CPU (but again only if the system has
- * more than one CPU).
- *
- * Implementation of these memory-ordering guarantees is described here:
- * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
- */
-void call_rcu(struct rcu_head *head, rcu_callback_t func)
+static void
+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
{
static atomic_t doublefrees;
unsigned long flags;
@@ -2806,7 +2768,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
}
check_cb_ovld(rdp);
- if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
+ if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
return; // Enqueued onto ->nocb_bypass, so just leave.
// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
rcu_segcblist_enqueue(&rdp->cblist, head);
@@ -2828,11 +2790,87 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
local_irq_restore(flags);
}
}
-EXPORT_SYMBOL_GPL(call_rcu);
+#ifdef CONFIG_RCU_LAZY
+/**
+ * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
+ * flush all lazy callbacks (including the new one) to the main ->cblist while
+ * doing so.
+ *
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.
+ *
+ * Use this API instead of call_rcu() if you don't want the callback to be
+ * invoked after very long periods of time, which can happen on systems without
+ * memory pressure and on systems which are lightly loaded or mostly idle.
+ * This function will cause callbacks to be invoked sooner than later at the
+ * expense of extra power. Other than that, this function is identical to, and
+ * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory
+ * ordering and other functionality.
+ */
+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
+{
+ return __call_rcu_common(head, func, false);
+}
+EXPORT_SYMBOL_GPL(call_rcu_hurry);
+#endif
+
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * By default the callbacks are 'lazy' and are kept hidden from the main
+ * ->cblist to prevent starting of grace periods too soon.
+ * If you desire grace periods to start very soon, use call_rcu_hurry().
+ *
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed. However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical section. On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu(). It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section. Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
+ */
+void call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+ return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
+}
+EXPORT_SYMBOL_GPL(call_rcu);
/* Maximum number of jiffies to wait before draining a batch. */
-#define KFREE_DRAIN_JIFFIES (HZ / 50)
+#define KFREE_DRAIN_JIFFIES (5 * HZ)
#define KFREE_N_BATCHES 2
#define FREE_N_CHANNELS 2
@@ -3093,6 +3131,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp)
return !!krcp->head;
}
+static void
+schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+ long delay, delay_left;
+
+ delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
+ if (delayed_work_pending(&krcp->monitor_work)) {
+ delay_left = krcp->monitor_work.timer.expires - jiffies;
+ if (delay < delay_left)
+ mod_delayed_work(system_wq, &krcp->monitor_work, delay);
+ return;
+ }
+ queue_delayed_work(system_wq, &krcp->monitor_work, delay);
+}
+
/*
* This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
*/
@@ -3150,7 +3203,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
// work to repeat an attempt. Because previous batches are
// still in progress.
if (need_offload_krc(krcp))
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+ schedule_delayed_monitor_work(krcp);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
@@ -3183,15 +3236,16 @@ static void fill_page_cache_func(struct work_struct *work)
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
- if (bnode) {
- raw_spin_lock_irqsave(&krcp->lock, flags);
- pushed = put_cached_bnode(krcp, bnode);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ if (!bnode)
+ break;
- if (!pushed) {
- free_page((unsigned long) bnode);
- break;
- }
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ pushed = put_cached_bnode(krcp, bnode);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (!pushed) {
+ free_page((unsigned long) bnode);
+ break;
}
}
@@ -3338,7 +3392,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
// Set timer to drain after KFREE_DRAIN_JIFFIES.
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+ schedule_delayed_monitor_work(krcp);
unlock_return:
krc_this_cpu_unlock(krcp, flags);
@@ -3371,7 +3425,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
atomic_set(&krcp->backoff_page_cache_fill, 1);
}
- return count;
+ return count == 0 ? SHRINK_EMPTY : count;
}
static unsigned long
@@ -3414,49 +3468,27 @@ void __init kfree_rcu_scheduler_running(void)
raw_spin_lock_irqsave(&krcp->lock, flags);
if (need_offload_krc(krcp))
- schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+ schedule_delayed_monitor_work(krcp);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
}
/*
* During early boot, any blocking grace-period wait automatically
- * implies a grace period. Later on, this is never the case for PREEMPTION.
+ * implies a grace period.
*
- * However, because a context switch is a grace period for !PREEMPTION, any
- * blocking grace-period wait automatically implies a grace period if
- * there is only one CPU online at any point time during execution of
- * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
- * occasionally incorrectly indicate that there are multiple CPUs online
- * when there was in fact only one the whole time, as this just adds some
- * overhead: RCU still operates correctly.
+ * Later on, this could in theory be the case for kernels built with
+ * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
+ * is not a common case. Furthermore, this optimization would cause
+ * the rcu_gp_oldstate structure to expand by 50%, so this potential
+ * grace-period optimization is ignored once the scheduler is running.
*/
static int rcu_blocking_is_gp(void)
{
- int ret;
-
- // Invoking preempt_model_*() too early gets a splat.
- if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE ||
- preempt_model_full() || preempt_model_rt())
- return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ return false;
might_sleep(); /* Check for RCU read-side critical section. */
- preempt_disable();
- /*
- * If the rcu_state.n_online_cpus counter is equal to one,
- * there is only one CPU, and that CPU sees all prior accesses
- * made by any CPU that was online at the time of its access.
- * Furthermore, if this counter is equal to one, its value cannot
- * change until after the preempt_enable() below.
- *
- * Furthermore, if rcu_state.n_online_cpus is equal to one here,
- * all later CPUs (both this one and any that come online later
- * on) are guaranteed to see all accesses prior to this point
- * in the code, without the need for additional memory barriers.
- * Those memory barriers are provided by CPU-hotplug code.
- */
- ret = READ_ONCE(rcu_state.n_online_cpus) <= 1;
- preempt_enable();
- return ret;
+ return true;
}
/**
@@ -3499,30 +3531,59 @@ static int rcu_blocking_is_gp(void)
*/
void synchronize_rcu(void)
{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_blocking_is_gp()) {
- // Note well that this code runs with !PREEMPT && !SMP.
- // In addition, all code that advances grace periods runs at
- // process level. Therefore, this normal GP overlaps with
- // other normal GPs only by being fully nested within them,
- // which allows reuse of ->gp_seq_polled_snap.
- rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
- rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
- if (rcu_init_invoked())
- cond_resched_tasks_rcu_qs();
- return; // Context allows vacuous grace periods.
+ if (!rcu_blocking_is_gp()) {
+ if (rcu_gp_is_expedited())
+ synchronize_rcu_expedited();
+ else
+ wait_rcu_gp(call_rcu_hurry);
+ return;
}
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
- wait_rcu_gp(call_rcu);
+
+ // Context allows vacuous grace periods.
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs at
+ // process level. Therefore, this normal GP overlaps with other
+ // normal GPs only by being fully nested within them, which allows
+ // reuse of ->gp_seq_polled_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
+
+ // Update the normal grace-period counters to record
+ // this grace period, but only those used by the boot CPU.
+ // The rcu_scheduler_starting() will take care of the rest of
+ // these counters.
+ local_irq_save(flags);
+ WARN_ON_ONCE(num_online_cpus() > 1);
+ rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT);
+ for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent)
+ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
/**
+ * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie
+ * @rgosp: Place to put state cookie
+ *
+ * Stores into @rgosp a value that will always be treated by functions
+ * like poll_state_synchronize_rcu_full() as a cookie whose grace period
+ * has already completed.
+ */
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+ rgosp->rgos_exp = RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
+
+/**
* get_state_synchronize_rcu - Snapshot current RCU state
*
* Returns a cookie that is used by a later call to cond_synchronize_rcu()
@@ -3541,21 +3602,42 @@ unsigned long get_state_synchronize_rcu(void)
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
/**
- * start_poll_synchronize_rcu - Snapshot and start RCU grace period
+ * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited
+ * @rgosp: location to place combined normal/expedited grace-period state
*
- * Returns a cookie that is used by a later call to cond_synchronize_rcu()
- * or poll_state_synchronize_rcu() to determine whether or not a full
- * grace period has elapsed in the meantime. If the needed grace period
- * is not already slated to start, notifies RCU core of the need for that
- * grace period.
+ * Places the normal and expedited grace-period states in @rgosp. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * The rcu_gp_oldstate structure takes up twice the memory of an unsigned
+ * long, but is guaranteed to see all grace periods. In contrast, the
+ * combined state occupies less memory, but can sometimes fail to take
+ * grace periods into account.
*
- * Interrupts must be enabled for the case where it is necessary to awaken
- * the grace-period kthread.
+ * This does not guarantee that the needed grace period will actually
+ * start.
*/
-unsigned long start_poll_synchronize_rcu(void)
+void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ /*
+ * Any prior manipulation of RCU-protected data must happen
+ * before the loads from ->gp_seq and ->expedited_sequence.
+ */
+ smp_mb(); /* ^^^ */
+ rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq);
+ rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
+
+/*
+ * Helper function for start_poll_synchronize_rcu() and
+ * start_poll_synchronize_rcu_full().
+ */
+static void start_poll_synchronize_rcu_common(void)
{
unsigned long flags;
- unsigned long gp_seq = get_state_synchronize_rcu();
bool needwake;
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -3575,17 +3657,57 @@ unsigned long start_poll_synchronize_rcu(void)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (needwake)
rcu_gp_kthread_wake();
+}
+
+/**
+ * start_poll_synchronize_rcu - Snapshot and start RCU grace period
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime. If the needed grace period
+ * is not already slated to start, notifies RCU core of the need for that
+ * grace period.
+ *
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+unsigned long start_poll_synchronize_rcu(void)
+{
+ unsigned long gp_seq = get_state_synchronize_rcu();
+
+ start_poll_synchronize_rcu_common();
return gp_seq;
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
/**
- * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period
+ * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
*
+ * Places the normal and expedited grace-period states in *@rgos. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed grace period is not already slated to start, notifies
+ * RCU core of the need for that grace period.
+ *
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ get_state_synchronize_rcu_full(rgosp);
+
+ start_poll_synchronize_rcu_common();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
+
+/**
+ * poll_state_synchronize_rcu - Has the specified RCU grace period completed?
* @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
*
* If a full RCU grace period has elapsed since the earlier call from
- * which oldstate was obtained, return @true, otherwise return @false.
+ * which @oldstate was obtained, return @true, otherwise return @false.
* If @false is returned, it is the caller's responsibility to invoke this
* function later on until it does return @true. Alternatively, the caller
* can explicitly wait for a grace period, for example, by passing @oldstate
@@ -3594,10 +3716,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
* Yes, this function does not take counter wrap into account.
* But counter wrap is harmless. If the counter wraps, we have waited for
* more than a billion grace periods (and way more on a 64-bit system!).
- * Those needing to keep oldstate values for very long time periods
- * (many hours even on 32-bit systems) should check them occasionally
- * and either refresh them or set a flag indicating that the grace period
- * has completed.
+ * Those needing to keep old state values for very long time periods
+ * (many hours even on 32-bit systems) should check them occasionally and
+ * either refresh them or set a flag indicating that the grace period has
+ * completed. Alternatively, they can use get_completed_synchronize_rcu()
+ * to get a guaranteed-completed grace-period state.
*
* This function provides the same memory-ordering guarantees that
* would be provided by a synchronize_rcu() that was invoked at the call
@@ -3616,8 +3739,56 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
/**
- * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed?
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
+ *
+ * If a full RCU grace period has elapsed since the earlier call from
+ * which *rgosp was obtained, return @true, otherwise return @false.
+ * If @false is returned, it is the caller's responsibility to invoke this
+ * function later on until it does return @true. Alternatively, the caller
+ * can explicitly wait for a grace period, for example, by passing @rgosp
+ * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited
+ * for more than a billion grace periods (and way more on a 64-bit
+ * system!). Those needing to keep rcu_gp_oldstate values for very
+ * long time periods (many hours even on 32-bit systems) should check
+ * them occasionally and either refresh them or set a flag indicating
+ * that the grace period has completed. Alternatively, they can use
+ * get_completed_synchronize_rcu_full() to get a guaranteed-completed
+ * grace-period state.
*
+ * This function provides the same memory-ordering guarantees that would
+ * be provided by a synchronize_rcu() that was invoked at the call to
+ * the function that provided @rgosp, and that returned at the end of this
+ * function. And this guarantee requires that the root rcu_node structure's
+ * ->gp_seq field be checked instead of that of the rcu_state structure.
+ * The problem is that the just-ending grace-period's callbacks can be
+ * invoked between the time that the root rcu_node structure's ->gp_seq
+ * field is updated and the time that the rcu_state structure's ->gp_seq
+ * field is updated. Therefore, if a single synchronize_rcu() is to
+ * cause a subsequent poll_state_synchronize_rcu_full() to return @true,
+ * then the root rcu_node structure is the one that needs to be polled.
+ */
+bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ smp_mb(); // Order against root rcu_node structure grace-period cleanup.
+ if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) ||
+ rgosp->rgos_exp == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) {
+ smp_mb(); /* Ensure GP ends before subsequent accesses. */
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full);
+
+/**
+ * cond_synchronize_rcu - Conditionally wait for an RCU grace period
* @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
*
* If a full RCU grace period has elapsed since the earlier call to
@@ -3641,6 +3812,33 @@ void cond_synchronize_rcu(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+/**
+ * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return. Otherwise, invoke synchronize_rcu() to wait
+ * for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ if (!poll_state_synchronize_rcu_full(rgosp))
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full);
+
/*
* Check to see if there is any immediate RCU-related work to be done by
* the current CPU, returning 1 if so and zero otherwise. The checks are
@@ -3731,6 +3929,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
{
unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
+ bool wake_nocb = false;
+ bool was_alldone = false;
lockdep_assert_held(&rcu_state.barrier_lock);
if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
@@ -3739,7 +3939,14 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
rcu_nocb_lock(rdp);
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+ /*
+ * Flush bypass and wakeup rcuog if we add callbacks to an empty regular
+ * queue. This way we don't wait for bypass timer that can reach seconds
+ * if it's fully lazy.
+ */
+ was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
atomic_inc(&rcu_state.barrier_cpu_count);
} else {
@@ -3747,6 +3954,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
}
rcu_nocb_unlock(rdp);
+ if (wake_nocb)
+ wake_nocb_gp(rdp, false);
smp_store_release(&rdp->barrier_seq_snap, gseq);
}
@@ -4113,8 +4322,6 @@ void rcu_report_dead(unsigned int cpu)
// Do any dangling deferred wakeups.
do_nocb_deferred_wakeup(rdp);
- /* QS for any half-done expedited grace period. */
- rcu_report_exp_rdp(rdp);
rcu_preempt_deferred_qs(current);
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
@@ -4162,7 +4369,7 @@ void rcutree_migrate_callbacks(int cpu)
my_rdp = this_cpu_ptr(&rcu_data);
my_rnp = my_rdp->mynode;
rcu_nocb_lock(my_rdp); /* irqs already disabled. */
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false));
raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
/* Leverage recent GPs and set GP for new callbacks. */
needwake = rcu_advance_cbs(my_rnp, rdp) ||
@@ -4312,9 +4519,20 @@ early_initcall(rcu_spawn_gp_kthread);
*/
void rcu_scheduler_starting(void)
{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
WARN_ON(num_online_cpus() != 1);
WARN_ON(nr_context_switches() > 0);
rcu_test_sync_prims();
+
+ // Fix up the ->gp_seq counters.
+ local_irq_save(flags);
+ rcu_for_each_node_breadth_first(rnp)
+ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+ local_irq_restore(flags);
+
+ // Switch out of early boot mode.
rcu_scheduler_active = RCU_SCHEDULER_INIT;
rcu_test_sync_prims();
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index d4a97e40ea9c..fcb5d696eb17 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -263,14 +263,16 @@ struct rcu_data {
unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
+ long lazy_len; /* Length of buffered lazy callbacks. */
int cpu;
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
#define RCU_NOCB_WAKE_NOT 0
#define RCU_NOCB_WAKE_BYPASS 1
-#define RCU_NOCB_WAKE 2
-#define RCU_NOCB_WAKE_FORCE 3
+#define RCU_NOCB_WAKE_LAZY 2
+#define RCU_NOCB_WAKE 3
+#define RCU_NOCB_WAKE_FORCE 4
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
@@ -439,10 +441,12 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j);
+ unsigned long j, bool lazy);
static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags);
+ bool *was_alldone, unsigned long flags,
+ bool lazy);
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index be667583a554..ed6c3cce28f2 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -828,11 +828,13 @@ static void rcu_exp_handler(void *unused)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
return;
- if (rcu_is_cpu_rrupt_from_idle()) {
+ if (rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) && preempt_bh_enabled)) {
rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
return;
}
@@ -906,6 +908,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
void synchronize_rcu_expedited(void)
{
bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
+ unsigned long flags;
struct rcu_exp_work rew;
struct rcu_node *rnp;
unsigned long s;
@@ -924,14 +927,17 @@ void synchronize_rcu_expedited(void)
// them, which allows reuse of ->gp_seq_polled_exp_snap.
rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
- if (rcu_init_invoked())
- cond_resched();
+
+ local_irq_save(flags);
+ WARN_ON_ONCE(num_online_cpus() > 1);
+ rcu_state.expedited_sequence += (1 << RCU_SEQ_CTR_SHIFT);
+ local_irq_restore(flags);
return; // Context allows vacuous grace periods.
}
/* If expedited grace periods are prohibited, fall back to normal. */
if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu);
+ wait_rcu_gp(call_rcu_hurry);
return;
}
@@ -1028,6 +1034,24 @@ unsigned long start_poll_synchronize_rcu_expedited(void)
EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited);
/**
+ * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period
+ * @rgosp: Place to put snapshot of grace-period state
+ *
+ * Places the normal and expedited grace-period states in rgosp. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed expedited grace period is not already slated to start,
+ * initiates that grace period.
+ */
+void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+ get_state_synchronize_rcu_full(rgosp);
+ (void)start_poll_synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full);
+
+/**
* cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period
*
* @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
@@ -1053,3 +1077,30 @@ void cond_synchronize_rcu_expedited(unsigned long oldstate)
synchronize_rcu_expedited();
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
+
+/**
+ * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return. Otherwise, invoke synchronize_rcu_expedited()
+ * to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+ if (!poll_state_synchronize_rcu_full(rgosp))
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index a8f574d8850d..9e1c8caec5ce 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -257,6 +257,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
}
/*
+ * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
+ * can elapse before lazy callbacks are flushed. Lazy callbacks
+ * could be flushed much earlier for a number of other reasons
+ * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
+ * left unsubmitted to RCU after those many jiffies.
+ */
+#define LAZY_FLUSH_JIFFIES (10 * HZ)
+static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
+
+#ifdef CONFIG_RCU_LAZY
+// To be called only from test code.
+void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
+{
+ jiffies_till_flush = jif;
+}
+EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
+
+unsigned long rcu_lazy_get_jiffies_till_flush(void)
+{
+ return jiffies_till_flush;
+}
+EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
+#endif
+
+/*
* Arrange to wake the GP kthread for this NOCB group at some future
* time when it is safe to do so.
*/
@@ -269,10 +294,14 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
/*
- * Bypass wakeup overrides previous deferments. In case
- * of callback storm, no need to wake up too early.
+ * Bypass wakeup overrides previous deferments. In case of
+ * callback storms, no need to wake up too early.
*/
- if (waketype == RCU_NOCB_WAKE_BYPASS) {
+ if (waketype == RCU_NOCB_WAKE_LAZY &&
+ rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else if (waketype == RCU_NOCB_WAKE_BYPASS) {
mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
} else {
@@ -293,12 +322,16 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
* proves to be initially empty, just return false because the no-CB GP
* kthread may need to be awakened in this case.
*
+ * Return true if there was something to be flushed and it succeeded, otherwise
+ * false.
+ *
* Note that this function always returns true if rhp is NULL.
*/
-static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
+static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp_in,
+ unsigned long j, bool lazy)
{
struct rcu_cblist rcl;
+ struct rcu_head *rhp = rhp_in;
WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
rcu_lockdep_assert_cblist_protected(rdp);
@@ -310,7 +343,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
/* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
if (rhp)
rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+
+ /*
+ * If the new CB requested was a lazy one, queue it onto the main
+ * ->cblist so that we can take advantage of the grace-period that will
+ * happen regardless. But queue it onto the bypass list first so that
+ * the lazy CB is ordered with the existing CBs in the bypass list.
+ */
+ if (lazy && rhp) {
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+ rhp = NULL;
+ }
rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+ WRITE_ONCE(rdp->lazy_len, 0);
+
rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
WRITE_ONCE(rdp->nocb_bypass_first, j);
rcu_nocb_bypass_unlock(rdp);
@@ -326,13 +372,13 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
* Note that this function always returns true if rhp is NULL.
*/
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
+ unsigned long j, bool lazy)
{
if (!rcu_rdp_is_offloaded(rdp))
return true;
rcu_lockdep_assert_cblist_protected(rdp);
rcu_nocb_bypass_lock(rdp);
- return rcu_nocb_do_flush_bypass(rdp, rhp, j);
+ return rcu_nocb_do_flush_bypass(rdp, rhp, j, lazy);
}
/*
@@ -345,7 +391,7 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
if (!rcu_rdp_is_offloaded(rdp) ||
!rcu_nocb_bypass_trylock(rdp))
return;
- WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false));
}
/*
@@ -367,12 +413,14 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
* there is only one CPU in operation.
*/
static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
+ bool *was_alldone, unsigned long flags,
+ bool lazy)
{
unsigned long c;
unsigned long cur_gp_seq;
unsigned long j = jiffies;
long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len));
lockdep_assert_irqs_disabled();
@@ -417,24 +465,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
// If there hasn't yet been all that many ->cblist enqueues
// this jiffy, tell the caller to enqueue onto ->cblist. But flush
// ->nocb_bypass first.
- if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
+ // Lazy CBs throttle this back and do immediate bypass queuing.
+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) {
rcu_nocb_lock(rdp);
*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
if (*was_alldone)
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("FirstQ"));
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
+
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false));
WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
return false; // Caller must enqueue the callback.
}
// If ->nocb_bypass has been used too long or is too full,
// flush ->nocb_bypass to ->cblist.
- if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+ if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+ (ncbs && bypass_is_lazy &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
ncbs >= qhimark) {
rcu_nocb_lock(rdp);
- if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+
+ if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy)) {
if (*was_alldone)
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("FirstQ"));
@@ -447,7 +500,12 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
rcu_advance_cbs_nowake(rdp->mynode, rdp);
rdp->nocb_gp_adv_time = j;
}
- rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ // The flush succeeded and we moved CBs into the regular list.
+ // Don't wait for the wake up timer as it may be too far ahead.
+ // Wake up the GP thread now instead, if the cblist was empty.
+ __call_rcu_nocb_wake(rdp, *was_alldone, flags);
+
return true; // Callback already enqueued.
}
@@ -457,13 +515,24 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+
+ if (lazy)
+ WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
+
if (!ncbs) {
WRITE_ONCE(rdp->nocb_bypass_first, j);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
}
rcu_nocb_bypass_unlock(rdp);
smp_mb(); /* Order enqueue before wake. */
- if (ncbs) {
+ // A wake up of the grace period kthread or timer adjustment
+ // needs to be done only if:
+ // 1. Bypass list was fully empty before (this is the first
+ // bypass list entry), or:
+ // 2. Both of these conditions are met:
+ // a. The bypass list previously had only lazy CBs, and:
+ // b. The new CB is non-lazy.
+ if (ncbs && (!bypass_is_lazy || lazy)) {
local_irq_restore(flags);
} else {
// No-CBs GP kthread might be indefinitely asleep, if so, wake.
@@ -491,8 +560,10 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
unsigned long flags)
__releases(rdp->nocb_lock)
{
+ long bypass_len;
unsigned long cur_gp_seq;
unsigned long j;
+ long lazy_len;
long len;
struct task_struct *t;
@@ -506,9 +577,16 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
}
// Need to actually to a wakeup.
len = rcu_segcblist_n_cbs(&rdp->cblist);
+ bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_len = READ_ONCE(rdp->lazy_len);
if (was_alldone) {
rdp->qlen_last_fqs_check = len;
- if (!irqs_disabled_flags(flags)) {
+ // Only lazy CBs in bypass list
+ if (lazy_len && bypass_len == lazy_len) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
+ TPS("WakeLazy"));
+ } else if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
rcu_nocb_unlock_irqrestore(rdp, flags);
wake_nocb_gp(rdp, false);
@@ -599,12 +677,12 @@ static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
static void nocb_gp_wait(struct rcu_data *my_rdp)
{
bool bypass = false;
- long bypass_ncbs;
int __maybe_unused cpu = my_rdp->cpu;
unsigned long cur_gp_seq;
unsigned long flags;
bool gotcbs = false;
unsigned long j = jiffies;
+ bool lazy = false;
bool needwait_gp = false; // This prevents actual uninitialized use.
bool needwake;
bool needwake_gp;
@@ -634,24 +712,43 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
* won't be ignored for long.
*/
list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
+ long bypass_ncbs;
+ bool flush_bypass = false;
+ long lazy_ncbs;
+
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
rcu_nocb_lock_irqsave(rdp, flags);
lockdep_assert_held(&rdp->nocb_lock);
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- if (bypass_ncbs &&
+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
+
+ if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
+ bypass_ncbs > 2 * qhimark)) {
+ flush_bypass = true;
+ } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
(time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
bypass_ncbs > 2 * qhimark)) {
- // Bypass full or old, so flush it.
- (void)rcu_nocb_try_flush_bypass(rdp, j);
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ flush_bypass = true;
} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
continue; /* No callbacks here, try next. */
}
+
+ if (flush_bypass) {
+ // Bypass full or old, so flush it.
+ (void)rcu_nocb_try_flush_bypass(rdp, j);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
+ }
+
if (bypass_ncbs) {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("Bypass"));
- bypass = true;
+ bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass"));
+ if (bypass_ncbs == lazy_ncbs)
+ lazy = true;
+ else
+ bypass = true;
}
rnp = rdp->mynode;
@@ -699,12 +796,20 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
my_rdp->nocb_gp_gp = needwait_gp;
my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
- if (bypass && !rcu_nocb_poll) {
- // At least one child with non-empty ->nocb_bypass, so set
- // timer in order to avoid stranding its callbacks.
- wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
- TPS("WakeBypassIsDeferred"));
+ // At least one child with non-empty ->nocb_bypass, so set
+ // timer in order to avoid stranding its callbacks.
+ if (!rcu_nocb_poll) {
+ // If bypass list only has lazy CBs. Add a deferred lazy wake up.
+ if (lazy && !bypass) {
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
+ TPS("WakeLazyIsDeferred"));
+ // Otherwise add a deferred bypass wake up.
+ } else if (bypass) {
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+ TPS("WakeBypassIsDeferred"));
+ }
}
+
if (rcu_nocb_poll) {
/* Polling, so trace if first poll in the series. */
if (gotcbs)
@@ -1030,7 +1135,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
* return false, which means that future calls to rcu_nocb_try_bypass()
* will refuse to put anything into the bypass.
*/
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
/*
* Start with invoking rcu_core() early. This way if the current thread
* happens to preempt an ongoing call to rcu_core() in the middle,
@@ -1111,7 +1216,7 @@ int rcu_nocb_cpu_deoffload(int cpu)
if (!ret)
cpumask_clear_cpu(cpu, rcu_nocb_mask);
} else {
- pr_info("NOCB: Can't CB-deoffload an offline CPU\n");
+ pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu);
ret = -EINVAL;
}
}
@@ -1196,7 +1301,7 @@ int rcu_nocb_cpu_offload(int cpu)
if (!ret)
cpumask_set_cpu(cpu, rcu_nocb_mask);
} else {
- pr_info("NOCB: Can't CB-offload an offline CPU\n");
+ pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu);
ret = -EINVAL;
}
}
@@ -1207,47 +1312,87 @@ int rcu_nocb_cpu_offload(int cpu)
}
EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
-void __init rcu_init_nohz(void)
+static unsigned long
+lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
int cpu;
- bool need_rcu_nocb_mask = false;
- bool offload_all = false;
- struct rcu_data *rdp;
+ unsigned long count = 0;
-#if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL)
- if (!rcu_state.nocb_is_setup) {
- need_rcu_nocb_mask = true;
- offload_all = true;
+ /* Snapshot count of all CPUs */
+ for_each_possible_cpu(cpu) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ count += READ_ONCE(rdp->lazy_len);
}
-#endif /* #if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) */
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) {
- need_rcu_nocb_mask = true;
- offload_all = false; /* NO_HZ_FULL has its own mask. */
+ return count ? count : SHRINK_EMPTY;
+}
+
+static unsigned long
+lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long count = 0;
+
+ /* Snapshot count of all CPUs */
+ for_each_possible_cpu(cpu) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int _count = READ_ONCE(rdp->lazy_len);
+
+ if (_count == 0)
+ continue;
+ rcu_nocb_lock_irqsave(rdp, flags);
+ WRITE_ONCE(rdp->lazy_len, 0);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp(rdp, false);
+ sc->nr_to_scan -= _count;
+ count += _count;
+ if (sc->nr_to_scan <= 0)
+ break;
}
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+ return count ? count : SHRINK_STOP;
+}
- if (need_rcu_nocb_mask) {
+static struct shrinker lazy_rcu_shrinker = {
+ .count_objects = lazy_rcu_shrink_count,
+ .scan_objects = lazy_rcu_shrink_scan,
+ .batch = 0,
+ .seeks = DEFAULT_SEEKS,
+};
+
+void __init rcu_init_nohz(void)
+{
+ int cpu;
+ struct rcu_data *rdp;
+ const struct cpumask *cpumask = NULL;
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
+ cpumask = tick_nohz_full_mask;
+#endif
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) &&
+ !rcu_state.nocb_is_setup && !cpumask)
+ cpumask = cpu_possible_mask;
+
+ if (cpumask) {
if (!cpumask_available(rcu_nocb_mask)) {
if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
return;
}
}
+
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, cpumask);
rcu_state.nocb_is_setup = true;
}
if (!rcu_state.nocb_is_setup)
return;
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running)
- cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (offload_all)
- cpumask_setall(rcu_nocb_mask);
+ if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy"))
+ pr_err("Failed to register lazy_rcu shrinker!\n");
if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
@@ -1284,6 +1429,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
raw_spin_lock_init(&rdp->nocb_gp_lock);
timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
rcu_cblist_init(&rdp->nocb_bypass);
+ WRITE_ONCE(rdp->lazy_len, 0);
mutex_init(&rdp->nocb_gp_kthread_mutex);
}
@@ -1452,8 +1598,8 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
(long)rdp->nocb_gp_seq,
rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
- show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
+ rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread));
}
/* Dump out nocb kthread state for the specified rcu_data structure. */
@@ -1497,7 +1643,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
rcu_segcblist_n_cbs(&rdp->cblist),
rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
/* It is OK for GP kthreads to have GP state. */
@@ -1564,14 +1710,19 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+ return false;
+}
+
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
+ unsigned long j, bool lazy)
{
return true;
}
static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
+ bool *was_alldone, unsigned long flags, bool lazy)
{
return false;
}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 438ecae6bd7e..7b0fe741a088 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -641,7 +641,8 @@ static void rcu_read_unlock_special(struct task_struct *t)
expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
(rdp->grpmask & READ_ONCE(rnp->expmask)) ||
- IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ||
+ (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
(IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
t->rcu_blocked_node);
// Need to defer quiescent state until everything is enabled.
@@ -718,9 +719,6 @@ static void rcu_flavor_sched_clock_irq(int user)
struct task_struct *t = current;
lockdep_assert_irqs_disabled();
- if (user || rcu_is_cpu_rrupt_from_idle()) {
- rcu_note_voluntary_context_switch(current);
- }
if (rcu_preempt_depth() > 0 ||
(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
/* No QS, force context switch if deferred. */
@@ -824,6 +822,7 @@ void rcu_read_unlock_strict(void)
if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
return;
rdp = this_cpu_ptr(&rcu_data);
+ rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
udelay(rcu_unlock_delay);
}
@@ -869,7 +868,7 @@ void rcu_all_qs(void)
if (!raw_cpu_read(rcu_data.rcu_urgent_qs))
return;
- preempt_disable();
+ preempt_disable(); // For CONFIG_PREEMPT_COUNT=y kernels
/* Load rcu_urgent_qs before other flags. */
if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
preempt_enable();
@@ -931,10 +930,13 @@ static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
return false;
}
-// Except that we do need to respond to a request by an expedited grace
-// period for a quiescent state from this CPU. Note that requests from
-// tasks are handled when removing the task from the blocked-tasks list
-// below.
+// Except that we do need to respond to a request by an expedited
+// grace period for a quiescent state from this CPU. Note that in
+// non-preemptible kernels, there can be no context switches within RCU
+// read-side critical sections, which in turn means that the leaf rcu_node
+// structure's blocked-tasks list is always empty. is therefore no need to
+// actually check it. Instead, a quiescent state from this CPU suffices,
+// and this function is only called from such a quiescent state.
notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -972,7 +974,6 @@ static void rcu_flavor_sched_clock_irq(int user)
* neither access nor modify, at least not while the
* corresponding CPU is online.
*/
-
rcu_qs();
}
}
@@ -1220,11 +1221,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
* We don't include outgoingcpu in the affinity set, use -1 if there is
* no outgoing CPU. If there are no CPUs left in the affinity set,
* this function allows the kthread to execute on any CPU.
+ *
+ * Any future concurrent calls are serialized via ->boost_kthread_mutex.
*/
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
struct task_struct *t = rnp->boost_kthread_task;
- unsigned long mask = rcu_rnp_online_cpus(rnp);
+ unsigned long mask;
cpumask_var_t cm;
int cpu;
@@ -1233,13 +1236,17 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
return;
mutex_lock(&rnp->boost_kthread_mutex);
+ mask = rcu_rnp_online_cpus(rnp);
for_each_leaf_node_possible_cpu(rnp, cpu)
if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
- if (cpumask_empty(cm))
+ if (cpumask_empty(cm)) {
cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (outgoingcpu >= 0)
+ cpumask_clear_cpu(outgoingcpu, cm);
+ }
set_cpus_allowed_ptr(t, cm);
mutex_unlock(&rnp->boost_kthread_mutex);
free_cpumask_var(cm);
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index c3fbbcc09327..5653560573e2 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -368,7 +368,7 @@ static void rcu_dump_cpu_stacks(void)
if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
if (cpu_is_offline(cpu))
pr_err("Offline CPU %d blocking current GP.\n", cpu);
- else if (!trigger_single_cpu_backtrace(cpu))
+ else
dump_cpu_task(cpu);
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -511,8 +511,7 @@ static void rcu_check_gp_kthread_starvation(void)
pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
} else {
pr_err("Stack dump where RCU GP kthread last ran:\n");
- if (!trigger_single_cpu_backtrace(cpu))
- dump_cpu_task(cpu);
+ dump_cpu_task(cpu);
}
}
wake_up_process(gpk);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 738842c4886b..f5e6a2f95a2a 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -224,7 +224,7 @@ void rcu_test_sync_prims(void)
synchronize_rcu_expedited();
}
-#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)
+#if !defined(CONFIG_TINY_RCU)
/*
* Switch to run-time mode once RCU has fully initialized.
@@ -239,7 +239,7 @@ static int __init rcu_set_runtime_mode(void)
}
core_initcall(rcu_set_runtime_mode);
-#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
+#endif /* #if !defined(CONFIG_TINY_RCU) */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
@@ -559,10 +559,8 @@ static void early_boot_test_call_rcu(void)
struct early_boot_kfree_rcu *rhp;
call_rcu(&head, test_callback);
- if (IS_ENABLED(CONFIG_SRCU)) {
- early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
- call_srcu(&early_srcu, &shead, test_callback);
- }
+ early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
+ call_srcu(&early_srcu, &shead, test_callback);
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
if (!WARN_ON_ONCE(!rhp))
kfree_rcu(rhp, rh);
@@ -585,11 +583,9 @@ static int rcu_verify_early_boot_tests(void)
if (rcu_self_test) {
early_boot_test_counter++;
rcu_barrier();
- if (IS_ENABLED(CONFIG_SRCU)) {
- early_boot_test_counter++;
- srcu_barrier(&early_srcu);
- WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
- }
+ early_boot_test_counter++;
+ srcu_barrier(&early_srcu);
+ WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
}
if (rcu_self_test_counter != early_boot_test_counter) {
WARN_ON(1);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 3c35445bf5ad..3bba88c7ffc6 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -243,6 +243,17 @@ void migrate_to_reboot_cpu(void)
set_cpus_allowed_ptr(current, cpumask_of(cpu));
}
+/*
+ * Notifier list for kernel code which wants to be called
+ * to prepare system for restart.
+ */
+static BLOCKING_NOTIFIER_HEAD(restart_prep_handler_list);
+
+static void do_kernel_restart_prepare(void)
+{
+ blocking_notifier_call_chain(&restart_prep_handler_list, 0, NULL);
+}
+
/**
* kernel_restart - reboot the system
* @cmd: pointer to buffer containing command to execute for restart
@@ -254,6 +265,7 @@ void migrate_to_reboot_cpu(void)
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
+ do_kernel_restart_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
if (!cmd)
@@ -396,6 +408,11 @@ register_sys_off_handler(enum sys_off_mode mode,
handler->list = &power_off_handler_list;
break;
+ case SYS_OFF_MODE_RESTART_PREPARE:
+ handler->list = &restart_prep_handler_list;
+ handler->blocking = true;
+ break;
+
case SYS_OFF_MODE_RESTART:
handler->list = &restart_handler_list;
break;
diff --git a/kernel/relay.c b/kernel/relay.c
index 6a611e779e95..ef12532168d9 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -60,10 +60,7 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
*/
static struct page **relay_alloc_page_array(unsigned int n_pages)
{
- const size_t pa_size = n_pages * sizeof(struct page *);
- if (pa_size > PAGE_SIZE)
- return vzalloc(pa_size);
- return kzalloc(pa_size, GFP_KERNEL);
+ return kvcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
}
/*
@@ -151,13 +148,13 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan)
{
struct rchan_buf *buf;
- if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t *))
+ if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t))
return NULL;
buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
if (!buf)
return NULL;
- buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t *),
+ buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t),
GFP_KERNEL);
if (!buf->padding)
goto free_buf;
@@ -510,7 +507,7 @@ struct rchan *relay_open(const char *base_filename,
chan->private_data = private_data;
if (base_filename) {
chan->has_base_filename = 1;
- strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ strscpy(chan->base_filename, base_filename, NAME_MAX);
}
chan->cb = cb;
kref_init(&chan->kref);
@@ -581,7 +578,7 @@ int relay_late_setup_files(struct rchan *chan,
if (!chan || !base_filename)
return -EINVAL;
- strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ strscpy(chan->base_filename, base_filename, NAME_MAX);
mutex_lock(&relay_channels_mutex);
/* Is chan already set up? */
diff --git a/kernel/resource.c b/kernel/resource.c
index 4c5e80b92f2f..ddbbacb9fb50 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -888,7 +888,7 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
if (conflict->end > new->end)
new->end = conflict->end;
- printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+ pr_info("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
}
write_unlock(&resource_lock);
}
@@ -1283,9 +1283,7 @@ void __release_region(struct resource *parent, resource_size_t start,
write_unlock(&resource_lock);
- printk(KERN_WARNING "Trying to free nonexistent resource "
- "<%016llx-%016llx>\n", (unsigned long long)start,
- (unsigned long long)end);
+ pr_warn("Trying to free nonexistent resource <%pa-%pa>\n", &start, &end);
}
EXPORT_SYMBOL(__release_region);
@@ -1658,6 +1656,7 @@ __setup("reserve=", reserve_setup);
int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
{
struct resource *p = &iomem_resource;
+ resource_size_t end = addr + size - 1;
int err = 0;
loff_t l;
@@ -1667,12 +1666,12 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
* We can probably skip the resources without
* IORESOURCE_IO attribute?
*/
- if (p->start >= addr + size)
+ if (p->start > end)
continue;
if (p->end < addr)
continue;
if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
- PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
+ PFN_DOWN(p->end) >= PFN_DOWN(end))
continue;
/*
* if a resource is "BUSY", it's not a hardware resource
@@ -1683,10 +1682,8 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
if (p->flags & IORESOURCE_BUSY)
continue;
- printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
- (unsigned long long)addr,
- (unsigned long long)(addr + size - 1),
- p->name, p);
+ pr_warn("resource sanity check: requesting [mem %pa-%pa], which spans more than %s %pR\n",
+ &addr, &end, p->name, p);
err = -1;
break;
}
@@ -1707,18 +1704,15 @@ static int strict_iomem_checks;
*
* Returns true if exclusive to the kernel, otherwise returns false.
*/
-bool iomem_is_exclusive(u64 addr)
+bool resource_is_exclusive(struct resource *root, u64 addr, resource_size_t size)
{
const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM |
IORESOURCE_EXCLUSIVE;
bool skip_children = false, err = false;
- int size = PAGE_SIZE;
struct resource *p;
- addr = addr & PAGE_MASK;
-
read_lock(&resource_lock);
- for_each_resource(&iomem_resource, p, skip_children) {
+ for_each_resource(root, p, skip_children) {
if (p->start >= addr + size)
break;
if (p->end < addr) {
@@ -1757,6 +1751,12 @@ bool iomem_is_exclusive(u64 addr)
return err;
}
+bool iomem_is_exclusive(u64 addr)
+{
+ return resource_is_exclusive(&iomem_resource, addr & PAGE_MASK,
+ PAGE_SIZE);
+}
+
struct resource_entry *resource_list_create_entry(struct resource *res,
size_t extra_size)
{
diff --git a/kernel/rseq.c b/kernel/rseq.c
index bda8175f8f99..d38ab944105d 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -171,12 +171,27 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
return 0;
}
+static bool rseq_warn_flags(const char *str, u32 flags)
+{
+ u32 test_flags;
+
+ if (!flags)
+ return false;
+ test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS;
+ if (test_flags)
+ pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str);
+ test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS;
+ if (test_flags)
+ pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str);
+ return true;
+}
+
static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
{
u32 flags, event_mask;
int ret;
- if (WARN_ON_ONCE(cs_flags & RSEQ_CS_NO_RESTART_FLAGS) || cs_flags)
+ if (rseq_warn_flags("rseq_cs", cs_flags))
return -EINVAL;
/* Get thread flags. */
@@ -184,7 +199,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
if (ret)
return ret;
- if (WARN_ON_ONCE(flags & RSEQ_CS_NO_RESTART_FLAGS) || flags)
+ if (rseq_warn_flags("rseq", flags))
return -EINVAL;
/*
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 4ebaf97f7bd8..991fc9002535 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -161,7 +161,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
struct task_struct *t;
unsigned long flags;
- BUG_ON(!lock_task_sighand(p, &flags));
+ if (WARN_ON_ONCE(!lock_task_sighand(p, &flags)))
+ return;
prev = p->signal->autogroup;
if (prev == ag) {
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 35f15c26ed54..d57a5c1c1cd9 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -204,6 +204,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout);
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+
if (t == -ERESTARTSYS)
return t;
return 0;
@@ -241,12 +242,23 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+
if (t == -ERESTARTSYS)
return t;
return 0;
}
EXPORT_SYMBOL(wait_for_completion_killable);
+int __sched wait_for_completion_state(struct completion *x, unsigned int state)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, state);
+
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_state);
+
/**
* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
* @x: holds the state of this particular completion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ee28253c9ac0..25b582b6ee5f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
#include <uapi/linux/sched/types.h>
+#include <asm/irq_regs.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -142,11 +143,7 @@ __read_mostly int sysctl_resched_latency_warn_once = 1;
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
-#ifdef CONFIG_PREEMPT_RT
-const_debug unsigned int sysctl_sched_nr_migrate = 8;
-#else
-const_debug unsigned int sysctl_sched_nr_migrate = 32;
-#endif
+const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
__read_mostly int scheduler_running;
@@ -360,10 +357,7 @@ static void __sched_core_flip(bool enabled)
/*
* Toggle the offline CPUs.
*/
- cpumask_copy(&sched_core_mask, cpu_possible_mask);
- cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
-
- for_each_cpu(cpu, &sched_core_mask)
+ for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
cpu_rq(cpu)->core_enabled = enabled;
cpus_read_unlock();
@@ -481,8 +475,7 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
* p->se.load, p->rt_priority,
* p->dl.dl_{runtime, deadline, period, flags, bw, density}
* - sched_setnuma(): p->numa_preferred_nid
- * - sched_move_task()/
- * cpu_cgroup_fork(): p->sched_task_group
+ * - sched_move_task(): p->sched_task_group
* - uclamp_update_active() p->uclamp*
*
* p->state <- TASK_*:
@@ -708,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+ psi_account_irqtime(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
@@ -1398,7 +1392,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
return;
- WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
+ uclamp_rq_set(rq, clamp_id, clamp_value);
}
static inline
@@ -1549,8 +1543,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
if (bucket->tasks == 1 || uc_se->value > bucket->value)
bucket->value = uc_se->value;
- if (uc_se->value > READ_ONCE(uc_rq->value))
- WRITE_ONCE(uc_rq->value, uc_se->value);
+ if (uc_se->value > uclamp_rq_get(rq, clamp_id))
+ uclamp_rq_set(rq, clamp_id, uc_se->value);
}
/*
@@ -1616,7 +1610,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
if (likely(bucket->tasks))
return;
- rq_clamp = READ_ONCE(uc_rq->value);
+ rq_clamp = uclamp_rq_get(rq, clamp_id);
/*
* Defensive programming: this should never happen. If it happens,
* e.g. due to future modification, warn and fixup the expected value.
@@ -1624,7 +1618,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
SCHED_WARN_ON(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
- WRITE_ONCE(uc_rq->value, bkt_clamp);
+ uclamp_rq_set(rq, clamp_id, bkt_clamp);
}
}
@@ -2059,7 +2053,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_RESTORE)) {
sched_info_enqueue(rq, p);
- psi_enqueue(p, flags & ENQUEUE_WAKEUP);
+ psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
}
uclamp_rq_inc(rq, p);
@@ -2195,14 +2189,18 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP
static void
-__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags);
+ struct affinity_context *ctx);
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
+ struct affinity_context ac = {
+ .new_mask = cpumask_of(rq->cpu),
+ .flags = SCA_MIGRATE_DISABLE,
+ };
+
if (likely(!p->migration_disabled))
return;
@@ -2212,7 +2210,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
/*
* Violates locking rules! see comment in __do_set_cpus_allowed().
*/
- __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+ __do_set_cpus_allowed(p, &ac);
}
void migrate_disable(void)
@@ -2234,6 +2232,10 @@ EXPORT_SYMBOL_GPL(migrate_disable);
void migrate_enable(void)
{
struct task_struct *p = current;
+ struct affinity_context ac = {
+ .new_mask = &p->cpus_mask,
+ .flags = SCA_MIGRATE_ENABLE,
+ };
if (p->migration_disabled > 1) {
p->migration_disabled--;
@@ -2249,7 +2251,7 @@ void migrate_enable(void)
*/
preempt_disable();
if (p->cpus_ptr != &p->cpus_mask)
- __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+ __set_cpus_allowed_ptr(p, &ac);
/*
* Mustn't clear migration_disabled() until cpus_ptr points back at the
* regular cpus_mask, otherwise things that race (eg.
@@ -2328,7 +2330,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
rq = cpu_rq(new_cpu);
rq_lock(rq, rf);
- BUG_ON(task_cpu(p) != new_cpu);
+ WARN_ON_ONCE(task_cpu(p) != new_cpu);
activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
@@ -2529,19 +2531,25 @@ out_unlock:
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
*/
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx)
{
- if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
- p->cpus_ptr = new_mask;
+ if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+ p->cpus_ptr = ctx->new_mask;
return;
}
- cpumask_copy(&p->cpus_mask, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
+ cpumask_copy(&p->cpus_mask, ctx->new_mask);
+ p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
+
+ /*
+ * Swap in a new user_cpus_ptr if SCA_USER flag set
+ */
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
}
static void
-__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
struct rq *rq = task_rq(p);
bool queued, running;
@@ -2558,7 +2566,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
*
* XXX do further audits, this smells like something putrid.
*/
- if (flags & SCA_MIGRATE_DISABLE)
+ if (ctx->flags & SCA_MIGRATE_DISABLE)
SCHED_WARN_ON(!p->on_cpu);
else
lockdep_assert_held(&p->pi_lock);
@@ -2577,7 +2585,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
if (running)
put_prev_task(rq, p);
- p->sched_class->set_cpus_allowed(p, new_mask, flags);
+ p->sched_class->set_cpus_allowed(p, ctx);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -2585,14 +2593,27 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
set_next_task(rq, p);
}
+/*
+ * Used for kthread_bind() and select_fallback_rq(), in both cases the user
+ * affinity (if any) should be destroyed too.
+ */
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
- __do_set_cpus_allowed(p, new_mask, 0);
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .user_mask = NULL,
+ .flags = SCA_USER, /* clear the user requested mask */
+ };
+
+ __do_set_cpus_allowed(p, &ac);
+ kfree(ac.user_mask);
}
int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
int node)
{
+ unsigned long flags;
+
if (!src->user_cpus_ptr)
return 0;
@@ -2600,7 +2621,10 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
if (!dst->user_cpus_ptr)
return -ENOMEM;
+ /* Use pi_lock to protect content of user_cpus_ptr */
+ raw_spin_lock_irqsave(&src->pi_lock, flags);
cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ raw_spin_unlock_irqrestore(&src->pi_lock, flags);
return 0;
}
@@ -2696,6 +2720,8 @@ void release_user_cpus_ptr(struct task_struct *p)
*/
static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
int dest_cpu, unsigned int flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
{
struct set_affinity_pending my_pending = { }, *pending = NULL;
bool stop_pending, complete = false;
@@ -2778,7 +2804,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
return -EINVAL;
}
- if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
+ if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
/*
* MIGRATE_ENABLE gets here because 'p == current', but for
* anything else we cannot do is_migration_disabled(), punt
@@ -2838,8 +2864,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
* Called with both p->pi_lock and rq->lock held; drops both before returning.
*/
static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags,
+ struct affinity_context *ctx,
struct rq *rq,
struct rq_flags *rf)
__releases(rq->lock)
@@ -2848,7 +2873,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
const struct cpumask *cpu_valid_mask = cpu_active_mask;
bool kthread = p->flags & PF_KTHREAD;
- struct cpumask *user_mask = NULL;
unsigned int dest_cpu;
int ret = 0;
@@ -2868,7 +2892,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
cpu_valid_mask = cpu_online_mask;
}
- if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
+ if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) {
ret = -EINVAL;
goto out;
}
@@ -2877,18 +2901,18 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
* Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag.
*/
- if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
+ if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out;
}
- if (!(flags & SCA_MIGRATE_ENABLE)) {
- if (cpumask_equal(&p->cpus_mask, new_mask))
+ if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
+ if (cpumask_equal(&p->cpus_mask, ctx->new_mask))
goto out;
if (WARN_ON_ONCE(p == current &&
is_migration_disabled(p) &&
- !cpumask_test_cpu(task_cpu(p), new_mask))) {
+ !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) {
ret = -EBUSY;
goto out;
}
@@ -2899,22 +2923,15 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
* for groups of tasks (ie. cpuset), so that load balancing is not
* immediately required to distribute the tasks within their new mask.
*/
- dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
}
- __do_set_cpus_allowed(p, new_mask, flags);
-
- if (flags & SCA_USER)
- user_mask = clear_user_cpus_ptr(p);
+ __do_set_cpus_allowed(p, ctx);
- ret = affine_move_task(rq, p, rf, dest_cpu, flags);
-
- kfree(user_mask);
-
- return ret;
+ return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
out:
task_rq_unlock(rq, p, rf);
@@ -2932,25 +2949,41 @@ out:
* call is not atomic; no spinlocks may be held.
*/
static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, u32 flags)
+ struct affinity_context *ctx)
{
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
- return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+ /*
+ * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
+ * flags are set.
+ */
+ if (p->user_cpus_ptr &&
+ !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
+ cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
+ ctx->new_mask = rq->scratch_mask;
+
+ return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
}
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
- return __set_cpus_allowed_ptr(p, new_mask, 0);
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .flags = 0,
+ };
+
+ return __set_cpus_allowed_ptr(p, &ac);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
/*
* Change a given task's CPU affinity to the intersection of its current
- * affinity mask and @subset_mask, writing the resulting mask to @new_mask
- * and pointing @p->user_cpus_ptr to a copy of the old mask.
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
+ * If user_cpus_ptr is defined, use it as the basis for restricting CPU
+ * affinity or use cpu_online_mask instead.
+ *
* If the resulting mask is empty, leave the affinity unchanged and return
* -EINVAL.
*/
@@ -2958,17 +2991,14 @@ static int restrict_cpus_allowed_ptr(struct task_struct *p,
struct cpumask *new_mask,
const struct cpumask *subset_mask)
{
- struct cpumask *user_mask = NULL;
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .flags = 0,
+ };
struct rq_flags rf;
struct rq *rq;
int err;
- if (!p->user_cpus_ptr) {
- user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
- if (!user_mask)
- return -ENOMEM;
- }
-
rq = task_rq_lock(p, &rf);
/*
@@ -2981,31 +3011,21 @@ static int restrict_cpus_allowed_ptr(struct task_struct *p,
goto err_unlock;
}
- if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+ if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) {
err = -EINVAL;
goto err_unlock;
}
- /*
- * We're about to butcher the task affinity, so keep track of what
- * the user asked for in case we're able to restore it later on.
- */
- if (user_mask) {
- cpumask_copy(user_mask, p->cpus_ptr);
- p->user_cpus_ptr = user_mask;
- }
-
- return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+ return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);
err_unlock:
task_rq_unlock(rq, p, &rf);
- kfree(user_mask);
return err;
}
/*
* Restrict the CPU affinity of task @p so that it is a subset of
- * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
+ * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
* old affinity mask. If the resulting mask is empty, we warn and walk
* up the cpuset hierarchy until we find a suitable mask.
*/
@@ -3049,34 +3069,29 @@ out_free_mask:
}
static int
-__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
/*
* Restore the affinity of a task @p which was previously restricted by a
- * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
- * @p->user_cpus_ptr.
+ * call to force_compatible_cpus_allowed_ptr().
*
* It is the caller's responsibility to serialise this with any calls to
* force_compatible_cpus_allowed_ptr(@p).
*/
void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
{
- struct cpumask *user_mask = p->user_cpus_ptr;
- unsigned long flags;
+ struct affinity_context ac = {
+ .new_mask = task_user_cpus(p),
+ .flags = 0,
+ };
+ int ret;
/*
- * Try to restore the old affinity mask. If this fails, then
- * we free the mask explicitly to avoid it being inherited across
- * a subsequent fork().
+ * Try to restore the old affinity mask with __sched_setaffinity().
+ * Cpuset masking will be done there too.
*/
- if (!user_mask || !__sched_setaffinity(p, user_mask))
- return;
-
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- user_mask = clear_user_cpus_ptr(p);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
- kfree(user_mask);
+ ret = __sched_setaffinity(p, &ac);
+ WARN_ON_ONCE(ret);
}
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -3254,12 +3269,12 @@ out:
/*
* wait_task_inactive - wait for a thread to unschedule.
*
- * If @match_state is nonzero, it's the @p->state value just checked and
- * not expected to change. If it changes, i.e. @p might have woken up,
- * then return zero. When we succeed in waiting for @p to be off its CPU,
- * we return a positive number (its total switch count). If a second call
- * a short while later returns the same number, the caller can be sure that
- * @p has remained unscheduled the whole time.
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero. When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count). If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
*
* The caller must ensure that the task *will* unschedule sometime soon,
* else this function might spin for a *long* time. This function can't
@@ -3290,12 +3305,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
*
* NOTE! Since we don't hold any locks, it's not
* even sure that "rq" stays as the right runqueue!
- * But we don't care, since "task_running()" will
+ * But we don't care, since "task_on_cpu()" will
* return false if the runqueue has changed and p
* is actually now running somewhere else!
*/
- while (task_running(rq, p)) {
- if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
+ while (task_on_cpu(rq, p)) {
+ if (!(READ_ONCE(p->__state) & match_state))
return 0;
cpu_relax();
}
@@ -3307,10 +3322,10 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
*/
rq = task_rq_lock(p, &rf);
trace_sched_wait_task(p);
- running = task_running(rq, p);
+ running = task_on_cpu(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
- if (!match_state || READ_ONCE(p->__state) == match_state)
+ if (READ_ONCE(p->__state) & match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &rf);
@@ -3554,10 +3569,9 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
#else /* CONFIG_SMP */
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags)
+ struct affinity_context *ctx)
{
- return set_cpus_allowed_ptr(p, new_mask);
+ return set_cpus_allowed_ptr(p, ctx->new_mask);
}
static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
@@ -3725,13 +3739,6 @@ void sched_ttwu_pending(void *arg)
if (!llist)
return;
- /*
- * rq::ttwu_pending racy indication of out-standing wakeups.
- * Races such that false-negatives are possible, since they
- * are shorter lived that false-positives would be.
- */
- WRITE_ONCE(rq->ttwu_pending, 0);
-
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -3745,6 +3752,17 @@ void sched_ttwu_pending(void *arg)
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
}
+ /*
+ * Must be after enqueueing at least once task such that
+ * idle_cpu() does not observe a false-negative -- if it does,
+ * it is possible for select_idle_siblings() to stack a number
+ * of tasks on this CPU during that window.
+ *
+ * It is ok to clear ttwu_pending when another task pending.
+ * We will receive IPI after local irq enabled and then enqueue it.
+ * Since now nr_running > 0, idle_cpu() will always get correct result.
+ */
+ WRITE_ONCE(rq->ttwu_pending, 0);
rq_unlock_irqrestore(rq, &rf);
}
@@ -4206,6 +4224,40 @@ out:
return success;
}
+static bool __task_needs_rq_lock(struct task_struct *p)
+{
+ unsigned int state = READ_ONCE(p->__state);
+
+ /*
+ * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
+ * the task is blocked. Make sure to check @state since ttwu() can drop
+ * locks at the end, see ttwu_queue_wakelist().
+ */
+ if (state == TASK_RUNNING || state == TASK_WAKING)
+ return true;
+
+ /*
+ * Ensure we load p->on_rq after p->__state, otherwise it would be
+ * possible to, falsely, observe p->on_rq == 0.
+ *
+ * See try_to_wake_up() for a longer comment.
+ */
+ smp_rmb();
+ if (p->on_rq)
+ return true;
+
+#ifdef CONFIG_SMP
+ /*
+ * Ensure the task has finished __schedule() and will not be referenced
+ * anymore. Again, see try_to_wake_up() for a longer comment.
+ */
+ smp_rmb();
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+#endif
+
+ return false;
+}
+
/**
* task_call_func - Invoke a function on task in fixed state
* @p: Process for which the function is to be invoked, can be @current.
@@ -4223,28 +4275,12 @@ out:
int task_call_func(struct task_struct *p, task_call_f func, void *arg)
{
struct rq *rq = NULL;
- unsigned int state;
struct rq_flags rf;
int ret;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- state = READ_ONCE(p->__state);
-
- /*
- * Ensure we load p->on_rq after p->__state, otherwise it would be
- * possible to, falsely, observe p->on_rq == 0.
- *
- * See try_to_wake_up() for a longer comment.
- */
- smp_rmb();
-
- /*
- * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
- * the task is blocked. Make sure to check @state since ttwu() can drop
- * locks at the end, see ttwu_queue_wakelist().
- */
- if (state == TASK_RUNNING || state == TASK_WAKING || p->on_rq)
+ if (__task_needs_rq_lock(p))
rq = __task_rq_lock(p, &rf);
/*
@@ -4396,7 +4432,18 @@ void set_numabalancing_state(bool enabled)
}
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_numa_balancing(struct ctl_table *table, int write,
+static void reset_memory_tiering(void)
+{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat) {
+ pgdat->nbp_threshold = 0;
+ pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
+ }
+}
+
+static int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -4412,6 +4459,9 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
if (err < 0)
return err;
if (write) {
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ (state & NUMA_BALANCING_MEMORY_TIERING))
+ reset_memory_tiering();
sysctl_numa_balancing_mode = state;
__set_numabalancing_state(state);
}
@@ -4520,6 +4570,17 @@ static struct ctl_table sched_core_sysctls[] = {
.proc_handler = sysctl_sched_uclamp_handler,
},
#endif /* CONFIG_UCLAMP_TASK */
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_numa_balancing,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_FOUR,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
{}
};
static int __init sched_core_sysctl_init(void)
@@ -4815,10 +4876,10 @@ static inline void finish_task(struct task_struct *prev)
#ifdef CONFIG_SMP
-static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
{
void (*func)(struct rq *rq);
- struct callback_head *next;
+ struct balance_callback *next;
lockdep_assert_rq_held(rq);
@@ -4845,15 +4906,15 @@ static void balance_push(struct rq *rq);
* This abuse is tolerated because it places all the unlikely/odd cases behind
* a single test, namely: rq->balance_callback == NULL.
*/
-struct callback_head balance_push_callback = {
+struct balance_callback balance_push_callback = {
.next = NULL,
- .func = (void (*)(struct callback_head *))balance_push,
+ .func = balance_push,
};
-static inline struct callback_head *
+static inline struct balance_callback *
__splice_balance_callbacks(struct rq *rq, bool split)
{
- struct callback_head *head = rq->balance_callback;
+ struct balance_callback *head = rq->balance_callback;
if (likely(!head))
return NULL;
@@ -4875,7 +4936,7 @@ __splice_balance_callbacks(struct rq *rq, bool split)
return head;
}
-static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
{
return __splice_balance_callbacks(rq, true);
}
@@ -4885,7 +4946,7 @@ static void __balance_callbacks(struct rq *rq)
do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
}
-static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
{
unsigned long flags;
@@ -4902,12 +4963,12 @@ static inline void __balance_callbacks(struct rq *rq)
{
}
-static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
{
return NULL;
}
-static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
{
}
@@ -5166,6 +5227,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ lru_gen_use_mm(next->mm);
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
@@ -5720,8 +5782,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_err("Preemption disabled at:");
print_ip_sym(KERN_ERR, preempt_disable_ip);
}
- if (panic_on_warn)
- panic("scheduling while atomic\n");
+ check_panic_on_warn("scheduling while atomic");
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
@@ -6179,7 +6240,7 @@ static void sched_core_balance(struct rq *rq)
preempt_enable();
}
-static DEFINE_PER_CPU(struct callback_head, core_balance_head);
+static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
static void queue_core_balance(struct rq *rq)
{
@@ -6429,7 +6490,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
- !(prev->flags & PF_FROZEN);
+ !(prev_state & TASK_FROZEN);
if (prev->sched_contributes_to_load)
rq->nr_uninterruptible++;
@@ -7410,7 +7471,7 @@ static int __sched_setscheduler(struct task_struct *p,
int oldpolicy = -1, policy = attr->sched_policy;
int retval, oldprio, newprio, queued, running;
const struct sched_class *prev_class;
- struct callback_head *head;
+ struct balance_callback *head;
struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -8079,7 +8140,7 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
#endif
static int
-__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
{
int retval;
cpumask_var_t cpus_allowed, new_mask;
@@ -8093,13 +8154,16 @@ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
}
cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, mask, cpus_allowed);
+ cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+
+ ctx->new_mask = new_mask;
+ ctx->flags |= SCA_CHECK;
retval = dl_task_check_affinity(p, new_mask);
if (retval)
goto out_free_new_mask;
-again:
- retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
+
+ retval = __set_cpus_allowed_ptr(p, ctx);
if (retval)
goto out_free_new_mask;
@@ -8110,7 +8174,24 @@ again:
* Just reset the cpumask to the cpuset's cpus_allowed.
*/
cpumask_copy(new_mask, cpus_allowed);
- goto again;
+
+ /*
+ * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
+ * will restore the previous user_cpus_ptr value.
+ *
+ * In the unlikely event a previous user_cpus_ptr exists,
+ * we need to further restrict the mask to what is allowed
+ * by that old user_cpus_ptr.
+ */
+ if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
+ bool empty = !cpumask_and(new_mask, new_mask,
+ ctx->user_mask);
+
+ if (WARN_ON_ONCE(empty))
+ cpumask_copy(new_mask, cpus_allowed);
+ }
+ __set_cpus_allowed_ptr(p, ctx);
+ retval = -EINVAL;
}
out_free_new_mask:
@@ -8122,6 +8203,8 @@ out_free_cpus_allowed:
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
+ struct affinity_context ac;
+ struct cpumask *user_mask;
struct task_struct *p;
int retval;
@@ -8156,7 +8239,21 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
if (retval)
goto out_put_task;
- retval = __sched_setaffinity(p, in_mask);
+ user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
+ if (!user_mask) {
+ retval = -ENOMEM;
+ goto out_put_task;
+ }
+ cpumask_copy(user_mask, in_mask);
+ ac = (struct affinity_context){
+ .new_mask = in_mask,
+ .user_mask = user_mask,
+ .flags = SCA_USER,
+ };
+
+ retval = __sched_setaffinity(p, &ac);
+ kfree(ac.user_mask);
+
out_put_task:
put_task_struct(p);
return retval;
@@ -8649,7 +8746,7 @@ again:
if (curr->sched_class != p->sched_class)
goto out_unlock;
- if (task_running(p_rq, p) || !task_is_running(p))
+ if (task_on_cpu(p_rq, p) || !task_is_running(p))
goto out_unlock;
yielded = curr->sched_class->yield_to_task(rq, p);
@@ -8861,7 +8958,7 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+ pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
free, task_pid_nr(p), ppid,
read_task_thread_flags(p));
@@ -8889,7 +8986,7 @@ state_filter_match(unsigned long state_filter, struct task_struct *p)
* When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
* TASK_KILLABLE).
*/
- if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
+ if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
return false;
return true;
@@ -8937,6 +9034,12 @@ void show_state_filter(unsigned int state_filter)
*/
void __init init_idle(struct task_struct *idle, int cpu)
{
+#ifdef CONFIG_SMP
+ struct affinity_context ac = (struct affinity_context) {
+ .new_mask = cpumask_of(cpu),
+ .flags = 0,
+ };
+#endif
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -8961,7 +9064,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
*
* And since this is boot we can forgo the serialization.
*/
- set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
+ set_cpus_allowed_common(idle, &ac);
#endif
/*
* We're having a chicken and egg problem, even though we are
@@ -9601,9 +9704,6 @@ LIST_HEAD(task_groups);
static struct kmem_cache *task_group_cache __read_mostly;
#endif
-DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
-DECLARE_PER_CPU(cpumask_var_t, select_rq_mask);
-
void __init sched_init(void)
{
unsigned long ptr = 0;
@@ -9647,14 +9747,6 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
-#ifdef CONFIG_CPUMASK_OFFSTACK
- for_each_possible_cpu(i) {
- per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
- cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node(
- cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
@@ -9759,6 +9851,7 @@ void __init sched_init(void)
rq->core_cookie = 0UL;
#endif
+ zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
}
set_load_weight(&init_task, false);
@@ -10163,7 +10256,7 @@ void sched_release_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static void sched_change_group(struct task_struct *tsk, int type)
+static void sched_change_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -10179,7 +10272,7 @@ static void sched_change_group(struct task_struct *tsk, int type)
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
- tsk->sched_class->task_change_group(tsk, type);
+ tsk->sched_class->task_change_group(tsk);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
@@ -10210,7 +10303,7 @@ void sched_move_task(struct task_struct *tsk)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk, TASK_MOVE_GROUP);
+ sched_change_group(tsk);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -10288,53 +10381,19 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_unregister_group(tg);
}
-/*
- * This is called before wake_up_new_task(), therefore we really only
- * have to set its group bits, all the other stuff does not apply.
- */
-static void cpu_cgroup_fork(struct task_struct *task)
-{
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(task, &rf);
-
- update_rq_clock(rq);
- sched_change_group(task, TASK_SET_GROUP);
-
- task_rq_unlock(rq, task, &rf);
-}
-
+#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
- int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
-#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
-#endif
- /*
- * Serialize against wake_up_new_task() such that if it's
- * running, we're sure to observe its full state.
- */
- raw_spin_lock_irq(&task->pi_lock);
- /*
- * Avoid calling sched_move_task() before wake_up_new_task()
- * has happened. This would lead to problems with PELT, due to
- * move wanting to detach+attach while we're not attached yet.
- */
- if (READ_ONCE(task->__state) == TASK_NEW)
- ret = -EINVAL;
- raw_spin_unlock_irq(&task->pi_lock);
-
- if (ret)
- break;
}
- return ret;
+ return 0;
}
+#endif
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
@@ -11170,8 +11229,9 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
- .fork = cpu_cgroup_fork,
+#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
+#endif
.attach = cpu_cgroup_attach,
.legacy_cftypes = cpu_legacy_files,
.dfl_cftypes = cpu_files,
@@ -11183,6 +11243,19 @@ struct cgroup_subsys cpu_cgrp_subsys = {
void dump_cpu_task(int cpu)
{
+ if (cpu == smp_processor_id() && in_hardirq()) {
+ struct pt_regs *regs;
+
+ regs = get_irq_regs();
+ if (regs) {
+ show_regs(regs);
+ return;
+ }
+ }
+
+ if (trigger_single_cpu_backtrace(cpu))
+ return;
+
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 93878cb2a46d..a57fd8f27498 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -88,7 +88,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
* core has now entered/left forced idle state. Defer accounting to the
* next scheduling edge, rather than always forcing a reschedule here.
*/
- if (task_running(rq, p))
+ if (task_on_cpu(rq, p))
resched_curr(rq);
task_rq_unlock(rq, p, &rf);
@@ -205,7 +205,7 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
default:
err = -EINVAL;
goto out;
- };
+ }
if (type == PIDTYPE_PID) {
__sched_core_set(task, cookie);
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 02d970a879ed..57c92d751bcd 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -123,7 +123,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
unsigned long cap, max_cap = 0;
int cpu, max_cpu = -1;
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ if (!sched_asym_cpucap_active())
return 1;
/* Ensure the capacity of the CPUs fits the task. */
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index fa9ce9d83683..a286e726eb4b 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -147,7 +147,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
int task_pri = convert_prio(p->prio);
int idx, cpu;
- BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+ WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0ab79d819a0d..0d97d54276cc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -124,15 +124,12 @@ static inline int dl_bw_cpus(int i)
return cpus;
}
-static inline unsigned long __dl_bw_capacity(int i)
+static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
{
- struct root_domain *rd = cpu_rq(i)->rd;
unsigned long cap = 0;
+ int i;
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- "sched RCU must be held");
-
- for_each_cpu_and(i, rd->span, cpu_active_mask)
+ for_each_cpu_and(i, mask, cpu_active_mask)
cap += capacity_orig_of(i);
return cap;
@@ -144,11 +141,14 @@ static inline unsigned long __dl_bw_capacity(int i)
*/
static inline unsigned long dl_bw_capacity(int i)
{
- if (!static_branch_unlikely(&sched_asym_cpucapacity) &&
+ if (!sched_asym_cpucap_active() &&
capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
} else {
- return __dl_bw_capacity(i);
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+
+ return __dl_bw_capacity(cpu_rq(i)->rd->span);
}
}
@@ -310,7 +310,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
{
struct rq *rq;
- BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
+ WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
if (task_on_rq_queued(p))
return;
@@ -431,8 +431,8 @@ static void task_non_contending(struct task_struct *p)
sub_rq_bw(&p->dl, &rq->dl);
raw_spin_lock(&dl_b->lock);
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
- __dl_clear_params(p);
raw_spin_unlock(&dl_b->lock);
+ __dl_clear_params(p);
}
return;
@@ -607,7 +607,7 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
struct rb_node *leftmost;
- BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
leftmost = rb_add_cached(&p->pushable_dl_tasks,
&rq->dl.pushable_dl_tasks_root,
@@ -644,8 +644,8 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
return rq->online && dl_task(prev);
}
-static DEFINE_PER_CPU(struct callback_head, dl_push_head);
-static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
+static DEFINE_PER_CPU(struct balance_callback, dl_push_head);
+static DEFINE_PER_CPU(struct balance_callback, dl_pull_head);
static void push_dl_tasks(struct rq *);
static void pull_dl_task(struct rq *);
@@ -684,7 +684,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* Failed to find any suitable CPU.
* The task will never come back!
*/
- BUG_ON(dl_bandwidth_enabled());
+ WARN_ON_ONCE(dl_bandwidth_enabled());
/*
* If admission control is disabled we
@@ -770,6 +770,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
+static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
+ struct rq *rq)
+{
+ /* for non-boosted task, pi_of(dl_se) == dl_se */
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
+}
+
/*
* We are being explicitly informed that a new instance is starting,
* and this means that:
@@ -803,8 +811,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
* future; in fact, we must consider execution overheads (time
* spent on hardirq context, etc.).
*/
- dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
- dl_se->runtime = dl_se->dl_runtime;
+ replenish_dl_new_period(dl_se, rq);
}
/*
@@ -830,16 +837,14 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
+ WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0);
/*
* This could be the case for a !-dl task that is boosted.
* Just go with full inherited parameters.
*/
- if (dl_se->dl_deadline == 0) {
- dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
- dl_se->runtime = pi_of(dl_se)->dl_runtime;
- }
+ if (dl_se->dl_deadline == 0)
+ replenish_dl_new_period(dl_se, rq);
if (dl_se->dl_yielded && dl_se->runtime > 0)
dl_se->runtime = 0;
@@ -866,8 +871,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
*/
if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
printk_deferred_once("sched: DL replenish lagged too much\n");
- dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
- dl_se->runtime = pi_of(dl_se)->dl_runtime;
+ replenish_dl_new_period(dl_se, rq);
}
if (dl_se->dl_yielded)
@@ -1024,8 +1028,7 @@ static void update_dl_entity(struct sched_dl_entity *dl_se)
return;
}
- dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
- dl_se->runtime = pi_of(dl_se)->dl_runtime;
+ replenish_dl_new_period(dl_se, rq);
}
}
@@ -1333,11 +1336,7 @@ static void update_curr_dl(struct rq *rq)
trace_sched_stat_runtime(curr, delta_exec, 0);
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = now;
- cgroup_account_cputime(curr, delta_exec);
+ update_current_exec_runtime(curr, now, delta_exec);
if (dl_entity_is_special(dl_se))
return;
@@ -1616,7 +1615,7 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node));
rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less);
@@ -1640,7 +1639,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
static void
enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
- BUG_ON(on_dl_rq(dl_se));
+ WARN_ON_ONCE(on_dl_rq(dl_se));
update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);
@@ -1814,6 +1813,14 @@ static void yield_task_dl(struct rq *rq)
#ifdef CONFIG_SMP
+static inline bool dl_task_is_earliest_deadline(struct task_struct *p,
+ struct rq *rq)
+{
+ return (!rq->dl.dl_nr_running ||
+ dl_time_before(p->dl.deadline,
+ rq->dl.earliest_dl.curr));
+}
+
static int find_later_rq(struct task_struct *task);
static int
@@ -1849,16 +1856,14 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags)
* Take the capacity of the CPU into account to
* ensure it fits the requirement of the task.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity))
+ if (sched_asym_cpucap_active())
select_rq |= !dl_task_fits_capacity(p, cpu);
if (select_rq) {
int target = find_later_rq(p);
if (target != -1 &&
- (dl_time_before(p->dl.deadline,
- cpu_rq(target)->dl.earliest_dl.curr) ||
- (cpu_rq(target)->dl.dl_nr_running == 0)))
+ dl_task_is_earliest_deadline(p, cpu_rq(target)))
cpu = target;
}
rcu_read_unlock();
@@ -2017,7 +2022,7 @@ static struct task_struct *pick_task_dl(struct rq *rq)
return NULL;
dl_se = pick_next_dl_entity(dl_rq);
- BUG_ON(!dl_se);
+ WARN_ON_ONCE(!dl_se);
p = dl_task_of(dl_se);
return p;
@@ -2087,7 +2092,7 @@ static void task_fork_dl(struct task_struct *p)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
- if (!task_running(rq, p) &&
+ if (!task_on_cpu(rq, p) &&
cpumask_test_cpu(cpu, &p->cpus_mask))
return 1;
return 0;
@@ -2225,9 +2230,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
later_rq = cpu_rq(cpu);
- if (later_rq->dl.dl_nr_running &&
- !dl_time_before(task->dl.deadline,
- later_rq->dl.earliest_dl.curr)) {
+ if (!dl_task_is_earliest_deadline(task, later_rq)) {
/*
* Target rq has tasks of equal or earlier deadline,
* retrying does not release any lock and is unlikely
@@ -2241,7 +2244,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
- task_running(rq, task) ||
+ task_on_cpu(rq, task) ||
!dl_task(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, later_rq);
@@ -2255,9 +2258,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
* its earliest one has a later deadline than our
* task, the rq is a good one.
*/
- if (!later_rq->dl.dl_nr_running ||
- dl_time_before(task->dl.deadline,
- later_rq->dl.earliest_dl.curr))
+ if (dl_task_is_earliest_deadline(task, later_rq))
break;
/* Otherwise we try again. */
@@ -2277,12 +2278,12 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!dl_task(p));
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+ WARN_ON_ONCE(!dl_task(p));
return p;
}
@@ -2428,9 +2429,7 @@ static void pull_dl_task(struct rq *this_rq)
* - it will preempt the last one we pulled (if any).
*/
if (p && dl_time_before(p->dl.deadline, dmin) &&
- (!this_rq->dl.dl_nr_running ||
- dl_time_before(p->dl.deadline,
- this_rq->dl.earliest_dl.curr))) {
+ dl_task_is_earliest_deadline(p, this_rq)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!task_on_rq_queued(p));
@@ -2475,7 +2474,7 @@ skip:
*/
static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
- if (!task_running(rq, p) &&
+ if (!task_on_cpu(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
@@ -2486,13 +2485,12 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
}
static void set_cpus_allowed_dl(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags)
+ struct affinity_context *ctx)
{
struct root_domain *src_rd;
struct rq *rq;
- BUG_ON(!dl_task(p));
+ WARN_ON_ONCE(!dl_task(p));
rq = task_rq(p);
src_rd = rq->rd;
@@ -2502,7 +2500,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* update. We already made space for us in the destination
* domain (see cpuset_can_attach()).
*/
- if (!cpumask_intersects(src_rd->span, new_mask)) {
+ if (!cpumask_intersects(src_rd->span, ctx->new_mask)) {
struct dl_bw *src_dl_b;
src_dl_b = dl_bw_of(cpu_of(rq));
@@ -2516,7 +2514,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
raw_spin_unlock(&src_dl_b->lock);
}
- set_cpus_allowed_common(p, new_mask, flags);
+ set_cpus_allowed_common(p, ctx);
}
/* Assumes rq->lock is held */
@@ -3007,17 +3005,15 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
- int ret = 1, trial_cpus;
+ unsigned long flags, cap;
struct dl_bw *cur_dl_b;
- unsigned long flags;
+ int ret = 1;
rcu_read_lock_sched();
cur_dl_b = dl_bw_of(cpumask_any(cur));
- trial_cpus = cpumask_weight(trial);
-
+ cap = __dl_bw_capacity(trial);
raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
- if (cur_dl_b->bw != -1 &&
- cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+ if (__dl_overflow(cur_dl_b, cap, 0, 0))
ret = 0;
raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
rcu_read_unlock_sched();
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 667876da8382..1637b65ba07a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -333,6 +333,7 @@ static __init int sched_init_debug(void)
debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
+ debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
#endif
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 914096c5b1ae..c36aa54ae071 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -40,6 +40,7 @@
#include <linux/cpuidle.h>
#include <linux/interrupt.h>
+#include <linux/memory-tiers.h>
#include <linux/mempolicy.h>
#include <linux/mutex_api.h>
#include <linux/profile.h>
@@ -177,6 +178,11 @@ int __weak arch_asym_cpu_priority(int cpu)
static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+#endif
+
#ifdef CONFIG_SYSCTL
static struct ctl_table sched_fair_sysctls[] = {
{
@@ -196,6 +202,16 @@ static struct ctl_table sched_fair_sysctls[] = {
.extra1 = SYSCTL_ONE,
},
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing_promote_rate_limit_MBps",
+ .data = &sysctl_numa_balancing_promote_rate_limit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
{}
};
@@ -799,8 +815,6 @@ void init_entity_runnable_average(struct sched_entity *se)
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-static void attach_entity_cfs_rq(struct sched_entity *se);
-
/*
* With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg:
@@ -835,20 +849,6 @@ void post_init_entity_util_avg(struct task_struct *p)
long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
- if (cap > 0) {
- if (cfs_rq->avg.util_avg != 0) {
- sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
- sa->util_avg /= (cfs_rq->avg.load_avg + 1);
-
- if (sa->util_avg > cap)
- sa->util_avg = cap;
- } else {
- sa->util_avg = cap;
- }
- }
-
- sa->runnable_avg = sa->util_avg;
-
if (p->sched_class != &fair_sched_class) {
/*
* For !fair tasks do:
@@ -864,7 +864,19 @@ void post_init_entity_util_avg(struct task_struct *p)
return;
}
- attach_entity_cfs_rq(se);
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ } else {
+ sa->util_avg = cap;
+ }
+ }
+
+ sa->runnable_avg = sa->util_avg;
}
#else /* !CONFIG_SMP */
@@ -1094,6 +1106,9 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+/* The page with hint page fault latency < threshold in ms is considered hot */
+unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+
struct numa_group {
refcount_t refcount;
@@ -1436,6 +1451,120 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
return 1000 * faults / total_faults;
}
+/*
+ * If memory tiering mode is enabled, cpupid of slow memory page is
+ * used to record scan time instead of CPU and PID. When tiering mode
+ * is disabled at run time, the scan time (in cpupid) will be
+ * interpreted as CPU and PID. So CPU needs to be checked to avoid to
+ * access out of array bound.
+ */
+static inline bool cpupid_valid(int cpupid)
+{
+ return cpupid_to_cpu(cpupid) < nr_cpu_ids;
+}
+
+/*
+ * For memory tiering mode, if there are enough free pages (more than
+ * enough watermark defined here) in fast memory node, to take full
+ * advantage of fast memory capacity, all recently accessed slow
+ * memory pages will be migrated to fast memory node without
+ * considering hot threshold.
+ */
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+ int z;
+ unsigned long enough_wmark;
+
+ enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
+ pgdat->node_present_pages >> 4);
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_watermark_ok(zone, 0,
+ wmark_pages(zone, WMARK_PROMO) + enough_wmark,
+ ZONE_MOVABLE, 0))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * For memory tiering mode, when page tables are scanned, the scan
+ * time will be recorded in struct page in addition to make page
+ * PROT_NONE for slow memory page. So when the page is accessed, in
+ * hint page fault handler, the hint page fault latency is calculated
+ * via,
+ *
+ * hint page fault latency = hint page fault time - scan time
+ *
+ * The smaller the hint page fault latency, the higher the possibility
+ * for the page to be hot.
+ */
+static int numa_hint_fault_latency(struct page *page)
+{
+ int last_time, time;
+
+ time = jiffies_to_msecs(jiffies);
+ last_time = xchg_page_access_time(page, time);
+
+ return (time - last_time) & PAGE_ACCESS_TIME_MASK;
+}
+
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency. So we provide a mechanism to rate limit
+ * the number of pages that are tried to be promoted.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+ unsigned long rate_limit, int nr)
+{
+ unsigned long nr_cand;
+ unsigned int now, start;
+
+ now = jiffies_to_msecs(jiffies);
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ start = pgdat->nbp_rl_start;
+ if (now - start > MSEC_PER_SEC &&
+ cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+ pgdat->nbp_rl_nr_cand = nr_cand;
+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+ return true;
+ return false;
+}
+
+#define NUMA_MIGRATION_ADJUST_STEPS 16
+
+static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
+ unsigned long rate_limit,
+ unsigned int ref_th)
+{
+ unsigned int now, start, th_period, unit_th, th;
+ unsigned long nr_cand, ref_cand, diff_cand;
+
+ now = jiffies_to_msecs(jiffies);
+ th_period = sysctl_numa_balancing_scan_period_max;
+ start = pgdat->nbp_th_start;
+ if (now - start > th_period &&
+ cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
+ ref_cand = rate_limit *
+ sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
+ unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
+ th = pgdat->nbp_threshold ? : ref_th;
+ if (diff_cand > ref_cand * 11 / 10)
+ th = max(th - unit_th, unit_th);
+ else if (diff_cand < ref_cand * 9 / 10)
+ th = min(th + unit_th, ref_th * 2);
+ pgdat->nbp_th_nr_cand = nr_cand;
+ pgdat->nbp_threshold = th;
+ }
+}
+
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
@@ -1443,9 +1572,44 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
+ /*
+ * The pages in slow memory node should be migrated according
+ * to hot/cold instead of private/shared.
+ */
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !node_is_toptier(src_nid)) {
+ struct pglist_data *pgdat;
+ unsigned long rate_limit;
+ unsigned int latency, th, def_th;
+
+ pgdat = NODE_DATA(dst_nid);
+ if (pgdat_free_space_enough(pgdat)) {
+ /* workload changed, reset hot threshold */
+ pgdat->nbp_threshold = 0;
+ return true;
+ }
+
+ def_th = sysctl_numa_balancing_hot_threshold;
+ rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+ (20 - PAGE_SHIFT);
+ numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
+
+ th = pgdat->nbp_threshold ? : def_th;
+ latency = numa_hint_fault_latency(page);
+ if (latency >= th)
+ return false;
+
+ return !numa_promotion_rate_limit(pgdat, rate_limit,
+ thp_nr_pages(page));
+ }
+
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
+ return false;
+
/*
* Allow first faults or private faults to migrate immediately early in
* the lifetime of a task. The magic number 4 is based on waiting for
@@ -1592,11 +1756,11 @@ numa_type numa_classify(unsigned int imbalance_pct,
#ifdef CONFIG_SCHED_SMT
/* Forward declarations of select_idle_sibling helpers */
-static inline bool test_idle_cores(int cpu, bool def);
+static inline bool test_idle_cores(int cpu);
static inline int numa_idle_core(int idle_core, int cpu)
{
if (!static_branch_likely(&sched_smt_present) ||
- idle_core >= 0 || !test_idle_cores(cpu, false))
+ idle_core >= 0 || !test_idle_cores(cpu))
return idle_core;
/*
@@ -2600,7 +2764,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!join)
return;
- BUG_ON(irqs_disabled());
+ WARN_ON_ONCE(irqs_disabled());
double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
@@ -2685,6 +2849,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (!p->mm)
return;
+ /*
+ * NUMA faults statistics are unnecessary for the slow memory
+ * node for memory tiering mode.
+ */
+ if (!node_is_toptier(mem_node) &&
+ (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
+ !cpupid_valid(last_cpupid)))
+ return;
+
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults)) {
int size = sizeof(*p->numa_faults) *
@@ -2765,6 +2938,7 @@ static void task_numa_work(struct callback_head *work)
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
u64 runtime = p->se.sum_exec_runtime;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
@@ -2802,7 +2976,7 @@ static void task_numa_work(struct callback_head *work)
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
- if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
return;
/*
@@ -2821,13 +2995,16 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
- vma = find_vma(mm, start);
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma) {
reset_ptenuma_scan(p);
start = 0;
- vma = mm->mmap;
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
}
- for (; vma; vma = vma->vm_next) {
+
+ for (; vma; vma = mas_find(&mas, ULONG_MAX)) {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
continue;
@@ -3838,8 +4015,7 @@ static void migrate_se_pelt_lag(struct sched_entity *se) {}
* @cfs_rq: cfs_rq to update
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
- * avg. The immediate corollary is that all (fair) tasks must be attached, see
- * post_init_entity_util_avg().
+ * avg. The immediate corollary is that all (fair) tasks must be attached.
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
@@ -4003,6 +4179,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
#define UPDATE_TG 0x1
#define SKIP_AGE_LOAD 0x2
#define DO_ATTACH 0x4
+#define DO_DETACH 0x8
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -4032,6 +4209,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq);
+ } else if (flags & DO_DETACH) {
+ /*
+ * DO_DETACH means we're here from dequeue_entity()
+ * and we are migrating task out of the CPU.
+ */
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq);
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
@@ -4064,8 +4248,8 @@ static void remove_entity_load_avg(struct sched_entity *se)
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
- * post_init_entity_util_avg() which will have added things to the
- * cfs_rq, so we can remove unconditionally.
+ * enqueue_task_fair() which will have added things to the cfs_rq,
+ * so we can remove unconditionally.
*/
sync_entity_load_avg(se);
@@ -4108,14 +4292,16 @@ static inline unsigned long task_util_est(struct task_struct *p)
}
#ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long uclamp_task_util(struct task_struct *p,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max)
{
- return clamp(task_util_est(p),
- uclamp_eff_value(p, UCLAMP_MIN),
- uclamp_eff_value(p, UCLAMP_MAX));
+ return clamp(task_util_est(p), uclamp_min, uclamp_max);
}
#else
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long uclamp_task_util(struct task_struct *p,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max)
{
return task_util_est(p);
}
@@ -4254,15 +4440,144 @@ done:
trace_sched_util_est_se_tp(&p->se);
}
-static inline int task_fits_capacity(struct task_struct *p,
- unsigned long capacity)
+static inline int util_fits_cpu(unsigned long util,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max,
+ int cpu)
+{
+ unsigned long capacity_orig, capacity_orig_thermal;
+ unsigned long capacity = capacity_of(cpu);
+ bool fits, uclamp_max_fits;
+
+ /*
+ * Check if the real util fits without any uclamp boost/cap applied.
+ */
+ fits = fits_capacity(util, capacity);
+
+ if (!uclamp_is_used())
+ return fits;
+
+ /*
+ * We must use capacity_orig_of() for comparing against uclamp_min and
+ * uclamp_max. We only care about capacity pressure (by using
+ * capacity_of()) for comparing against the real util.
+ *
+ * If a task is boosted to 1024 for example, we don't want a tiny
+ * pressure to skew the check whether it fits a CPU or not.
+ *
+ * Similarly if a task is capped to capacity_orig_of(little_cpu), it
+ * should fit a little cpu even if there's some pressure.
+ *
+ * Only exception is for thermal pressure since it has a direct impact
+ * on available OPP of the system.
+ *
+ * We honour it for uclamp_min only as a drop in performance level
+ * could result in not getting the requested minimum performance level.
+ *
+ * For uclamp_max, we can tolerate a drop in performance level as the
+ * goal is to cap the task. So it's okay if it's getting less.
+ *
+ * In case of capacity inversion we should honour the inverted capacity
+ * for both uclamp_min and uclamp_max all the time.
+ */
+ capacity_orig = cpu_in_capacity_inversion(cpu);
+ if (capacity_orig) {
+ capacity_orig_thermal = capacity_orig;
+ } else {
+ capacity_orig = capacity_orig_of(cpu);
+ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
+ }
+
+ /*
+ * We want to force a task to fit a cpu as implied by uclamp_max.
+ * But we do have some corner cases to cater for..
+ *
+ *
+ * C=z
+ * | ___
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | |
+ * | | | | | | | (util somewhere in this region)
+ * | | | | | | |
+ * | | | | | | |
+ * +----------------------------------------
+ * cpu0 cpu1 cpu2
+ *
+ * In the above example if a task is capped to a specific performance
+ * point, y, then when:
+ *
+ * * util = 80% of x then it does not fit on cpu0 and should migrate
+ * to cpu1
+ * * util = 80% of y then it is forced to fit on cpu1 to honour
+ * uclamp_max request.
+ *
+ * which is what we're enforcing here. A task always fits if
+ * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
+ * the normal upmigration rules should withhold still.
+ *
+ * Only exception is when we are on max capacity, then we need to be
+ * careful not to block overutilized state. This is so because:
+ *
+ * 1. There's no concept of capping at max_capacity! We can't go
+ * beyond this performance level anyway.
+ * 2. The system is being saturated when we're operating near
+ * max capacity, it doesn't make sense to block overutilized.
+ */
+ uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
+ uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
+ fits = fits || uclamp_max_fits;
+
+ /*
+ *
+ * C=z
+ * | ___ (region a, capped, util >= uclamp_max)
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
+ * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
+ * | | | | | | |
+ * | | | | | | | (region c, boosted, util < uclamp_min)
+ * +----------------------------------------
+ * cpu0 cpu1 cpu2
+ *
+ * a) If util > uclamp_max, then we're capped, we don't care about
+ * actual fitness value here. We only care if uclamp_max fits
+ * capacity without taking margin/pressure into account.
+ * See comment above.
+ *
+ * b) If uclamp_min <= util <= uclamp_max, then the normal
+ * fits_capacity() rules apply. Except we need to ensure that we
+ * enforce we remain within uclamp_max, see comment above.
+ *
+ * c) If util < uclamp_min, then we are boosted. Same as (b) but we
+ * need to take into account the boosted value fits the CPU without
+ * taking margin/pressure into account.
+ *
+ * Cases (a) and (b) are handled in the 'fits' variable already. We
+ * just need to consider an extra check for case (c) after ensuring we
+ * handle the case uclamp_min > uclamp_max.
+ */
+ uclamp_min = min(uclamp_min, uclamp_max);
+ if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
+ fits = fits && (uclamp_min <= capacity_orig_thermal);
+
+ return fits;
+}
+
+static inline int task_fits_cpu(struct task_struct *p, int cpu)
{
- return fits_capacity(uclamp_task_util(p), capacity);
+ unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+ unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+ unsigned long util = task_util_est(p);
+ return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ if (!sched_asym_cpucap_active())
return;
if (!p || p->nr_cpus_allowed == 1) {
@@ -4270,7 +4585,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
return;
}
- if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+ if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
@@ -4292,6 +4607,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
#define DO_ATTACH 0x0
+#define DO_DETACH 0x0
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
@@ -4434,7 +4750,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
- * - Add its load to cfs_rq->runnable_avg
+ * - For group_entity, update its runnable_weight to reflect the new
+ * h_nr_running of its group cfs_rq.
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
@@ -4511,6 +4828,11 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ int action = UPDATE_TG;
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
+ action |= DO_DETACH;
+
/*
* Update run-time statistics of the 'current'.
*/
@@ -4519,12 +4841,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
- * - Subtract its load from the cfs_rq->runnable_avg.
+ * - For group_entity, update its runnable_weight to reflect the new
+ * h_nr_running of its group cfs_rq.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_load_avg(cfs_rq, se, action);
se_update_runnable(se);
update_stats_dequeue_fair(cfs_rq, se, flags);
@@ -5682,7 +6005,10 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static inline bool cpu_overutilized(int cpu)
{
- return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
+ unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
static inline void update_overutilized_status(struct rq *rq)
@@ -5893,8 +6219,8 @@ dequeue_throttle:
#ifdef CONFIG_SMP
/* Working cpumask for: load_balance, load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
+static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
#ifdef CONFIG_NO_HZ_COMMON
@@ -6260,7 +6586,7 @@ static inline void set_idle_cores(int cpu, int val)
WRITE_ONCE(sds->has_idle_cores, val);
}
-static inline bool test_idle_cores(int cpu, bool def)
+static inline bool test_idle_cores(int cpu)
{
struct sched_domain_shared *sds;
@@ -6268,7 +6594,7 @@ static inline bool test_idle_cores(int cpu, bool def)
if (sds)
return READ_ONCE(sds->has_idle_cores);
- return def;
+ return false;
}
/*
@@ -6284,7 +6610,7 @@ void __update_idle_core(struct rq *rq)
int cpu;
rcu_read_lock();
- if (test_idle_cores(core, true))
+ if (test_idle_cores(core))
goto unlock;
for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -6310,9 +6636,6 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
bool idle = true;
int cpu;
- if (!static_branch_likely(&sched_smt_present))
- return __select_idle_cpu(core, p);
-
for_each_cpu(cpu, cpu_smt_mask(core)) {
if (!available_idle_cpu(cpu)) {
idle = false;
@@ -6339,13 +6662,12 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_smt(struct task_struct *p, int target)
{
int cpu;
- for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
- !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
+ if (cpu == target)
continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
@@ -6360,9 +6682,9 @@ static inline void set_idle_cores(int cpu, int val)
{
}
-static inline bool test_idle_cores(int cpu, bool def)
+static inline bool test_idle_cores(int cpu)
{
- return def;
+ return false;
}
static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
@@ -6370,7 +6692,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
return __select_idle_cpu(core, p);
}
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static inline int select_idle_smt(struct task_struct *p, int target)
{
return -1;
}
@@ -6389,19 +6711,19 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
struct sched_domain_shared *sd_share;
struct rq *this_rq = this_rq();
int this = smp_processor_id();
- struct sched_domain *this_sd;
+ struct sched_domain *this_sd = NULL;
u64 time = 0;
- this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
- if (!this_sd)
- return -1;
-
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
if (sched_feat(SIS_PROP) && !has_idle_core) {
u64 avg_cost, avg_idle, span_avg;
unsigned long now = jiffies;
+ this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ if (!this_sd)
+ return -1;
+
/*
* If we're busy, the assumption that the last idle period
* predicts the future is flawed; age away the remaining
@@ -6455,7 +6777,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
if (has_idle_core)
set_idle_cores(target, false);
- if (sched_feat(SIS_PROP) && !has_idle_core) {
+ if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) {
time = cpu_clock(this) - time;
/*
@@ -6478,21 +6800,23 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
- unsigned long task_util, best_cap = 0;
+ unsigned long task_util, util_min, util_max, best_cap = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;
cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
- task_util = uclamp_task_util(p);
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
- if (fits_capacity(task_util, cpu_cap))
+ if (util_fits_cpu(task_util, util_min, util_max, cpu))
return cpu;
if (cpu_cap > best_cap) {
@@ -6504,10 +6828,13 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
return best_cpu;
}
-static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
+static inline bool asym_fits_cpu(unsigned long util,
+ unsigned long util_min,
+ unsigned long util_max,
+ int cpu)
{
- if (static_branch_unlikely(&sched_asym_cpucapacity))
- return fits_capacity(task_util, capacity_of(cpu));
+ if (sched_asym_cpucap_active())
+ return util_fits_cpu(util, util_min, util_max, cpu);
return true;
}
@@ -6519,16 +6846,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
bool has_idle_core = false;
struct sched_domain *sd;
- unsigned long task_util;
+ unsigned long task_util, util_min, util_max;
int i, recent_used_cpu;
/*
* On asymmetric system, update task utilization because we will check
* that the task fits with cpu's capacity.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ if (sched_asym_cpucap_active()) {
sync_entity_load_avg(&p->se);
- task_util = uclamp_task_util(p);
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
}
/*
@@ -6537,7 +6866,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
lockdep_assert_irqs_disabled();
if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
- asym_fits_capacity(task_util, target))
+ asym_fits_cpu(task_util, util_min, util_max, target))
return target;
/*
@@ -6545,7 +6874,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
if (prev != target && cpus_share_cache(prev, target) &&
(available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
- asym_fits_capacity(task_util, prev))
+ asym_fits_cpu(task_util, util_min, util_max, prev))
return prev;
/*
@@ -6560,7 +6889,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
in_task() &&
prev == smp_processor_id() &&
this_rq()->nr_running <= 1 &&
- asym_fits_capacity(task_util, prev)) {
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
return prev;
}
@@ -6572,7 +6901,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
- asym_fits_capacity(task_util, recent_used_cpu)) {
+ asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
return recent_used_cpu;
}
@@ -6580,7 +6909,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* For asymmetric CPU capacity systems, our domain of interest is
* sd_asym_cpucapacity rather than sd_llc.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ if (sched_asym_cpucap_active()) {
sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
/*
* On an asymmetric CPU capacity system where an exclusive
@@ -6601,10 +6930,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
if (sched_smt_active()) {
- has_idle_core = test_idle_cores(target, false);
+ has_idle_core = test_idle_cores(target);
if (!has_idle_core && cpus_share_cache(prev, target)) {
- i = select_idle_smt(p, sd, prev);
+ i = select_idle_smt(p, prev);
if ((unsigned int)i < nr_cpumask_bits)
return i;
}
@@ -6868,6 +7197,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
+ unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
+ unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
struct sched_domain *sd;
@@ -6892,7 +7223,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
target = prev_cpu;
sync_entity_load_avg(&p->se);
- if (!task_util_est(p))
+ if (!uclamp_task_util(p, p_util_min, p_util_max))
goto unlock;
eenv_task_busy_time(&eenv, p, prev_cpu);
@@ -6900,7 +7231,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long cpu_cap, cpu_thermal_cap, util;
unsigned long cur_delta, max_spare_cap = 0;
- bool compute_prev_delta = false;
+ unsigned long rq_util_min, rq_util_max;
+ unsigned long util_min, util_max;
+ unsigned long prev_spare_cap = 0;
int max_spare_cap_cpu = -1;
unsigned long base_energy;
@@ -6936,26 +7269,45 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* much capacity we can get out of the CPU; this is
* aligned with sched_cpu_util().
*/
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
- if (!fits_capacity(util, cpu_cap))
+ if (uclamp_is_used()) {
+ if (uclamp_rq_is_idle(cpu_rq(cpu))) {
+ util_min = p_util_min;
+ util_max = p_util_max;
+ } else {
+ /*
+ * Open code uclamp_rq_util_with() except for
+ * the clamp() part. Ie: apply max aggregation
+ * only. util_fits_cpu() logic requires to
+ * operate on non clamped util but must use the
+ * max-aggregated uclamp_{min, max}.
+ */
+ rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+ util_min = max(rq_util_min, p_util_min);
+ util_max = max(rq_util_max, p_util_max);
+ }
+ }
+ if (!util_fits_cpu(util, util_min, util_max, cpu))
continue;
lsub_positive(&cpu_cap, util);
if (cpu == prev_cpu) {
/* Always use prev_cpu as a candidate. */
- compute_prev_delta = true;
+ prev_spare_cap = cpu_cap;
} else if (cpu_cap > max_spare_cap) {
/*
* Find the CPU with the maximum spare capacity
- * in the performance domain.
+ * among the remaining CPUs in the performance
+ * domain.
*/
max_spare_cap = cpu_cap;
max_spare_cap_cpu = cpu;
}
}
- if (max_spare_cap_cpu < 0 && !compute_prev_delta)
+ if (max_spare_cap_cpu < 0 && prev_spare_cap == 0)
continue;
eenv_pd_busy_time(&eenv, cpus, p);
@@ -6963,7 +7315,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
base_energy = compute_energy(&eenv, pd, cpus, p, -1);
/* Evaluate the energy impact of using prev_cpu. */
- if (compute_prev_delta) {
+ if (prev_spare_cap > 0) {
prev_delta = compute_energy(&eenv, pd, cpus, p,
prev_cpu);
/* CPU utilization has changed */
@@ -6974,7 +7326,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
}
/* Evaluate the energy impact of using max_spare_cap_cpu. */
- if (max_spare_cap_cpu >= 0) {
+ if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
cur_delta = compute_energy(&eenv, pd, cpus, p,
max_spare_cap_cpu);
/* CPU utilization has changed */
@@ -7076,8 +7428,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
return new_cpu;
}
-static void detach_entity_cfs_rq(struct sched_entity *se);
-
/*
* Called immediately before a task is migrated to a new CPU; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
@@ -7099,15 +7449,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
}
- if (p->on_rq == TASK_ON_RQ_MIGRATING) {
- /*
- * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
- * rq->lock and can modify state directly.
- */
- lockdep_assert_rq_held(task_rq(p));
- detach_entity_cfs_rq(se);
-
- } else {
+ if (!task_on_rq_migrating(p)) {
remove_entity_load_avg(se);
/*
@@ -7279,7 +7621,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
find_matching_se(&se, &pse);
- BUG_ON(!pse);
+ WARN_ON_ONCE(!pse);
cse_is_idle = se_is_idle(se);
pse_is_idle = se_is_idle(pse);
@@ -7938,7 +8280,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
- if (task_running(env->src_rq, p)) {
+ if (task_on_cpu(env->src_rq, p)) {
schedstat_inc(p->stats.nr_failed_migrations_running);
return 0;
}
@@ -8012,8 +8354,6 @@ static struct task_struct *detach_one_task(struct lb_env *env)
return NULL;
}
-static const unsigned int sched_nr_migrate_break = 32;
-
/*
* detach_tasks() -- tries to detach up to imbalance load/util/tasks from
* busiest_rq, as part of a balancing operation within domain "sd".
@@ -8049,20 +8389,24 @@ static int detach_tasks(struct lb_env *env)
if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
break;
- p = list_last_entry(tasks, struct task_struct, se.group_node);
-
env->loop++;
- /* We've more or less seen every task there is, call it quits */
- if (env->loop > env->loop_max)
+ /*
+ * We've more or less seen every task there is, call it quits
+ * unless we haven't found any movable task yet.
+ */
+ if (env->loop > env->loop_max &&
+ !(env->flags & LBF_ALL_PINNED))
break;
/* take a breather every nr_migrate tasks */
if (env->loop > env->loop_break) {
- env->loop_break += sched_nr_migrate_break;
+ env->loop_break += SCHED_NR_MIGRATE_BREAK;
env->flags |= LBF_NEED_BREAK;
break;
}
+ p = list_last_entry(tasks, struct task_struct, se.group_node);
+
if (!can_migrate_task(p, env))
goto next;
@@ -8108,7 +8452,7 @@ static int detach_tasks(struct lb_env *env)
case migrate_misfit:
/* This is not a misfit task */
- if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ if (task_fits_cpu(p, env->src_cpu))
goto next;
env->imbalance = 0;
@@ -8159,7 +8503,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
{
lockdep_assert_rq_held(rq);
- BUG_ON(task_rq(p) != rq);
+ WARN_ON_ONCE(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
check_preempt_curr(rq, p, 0);
}
@@ -8497,16 +8841,73 @@ static unsigned long scale_rt_capacity(int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
+ unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);
unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
+ struct rq *rq = cpu_rq(cpu);
- cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
+ rq->cpu_capacity_orig = capacity_orig;
if (!capacity)
capacity = 1;
- cpu_rq(cpu)->cpu_capacity = capacity;
- trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+ rq->cpu_capacity = capacity;
+
+ /*
+ * Detect if the performance domain is in capacity inversion state.
+ *
+ * Capacity inversion happens when another perf domain with equal or
+ * lower capacity_orig_of() ends up having higher capacity than this
+ * domain after subtracting thermal pressure.
+ *
+ * We only take into account thermal pressure in this detection as it's
+ * the only metric that actually results in *real* reduction of
+ * capacity due to performance points (OPPs) being dropped/become
+ * unreachable due to thermal throttling.
+ *
+ * We assume:
+ * * That all cpus in a perf domain have the same capacity_orig
+ * (same uArch).
+ * * Thermal pressure will impact all cpus in this perf domain
+ * equally.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
+ struct perf_domain *pd = rcu_dereference(rq->rd->pd);
+
+ rq->cpu_capacity_inverted = 0;
+
+ for (; pd; pd = pd->next) {
+ struct cpumask *pd_span = perf_domain_span(pd);
+ unsigned long pd_cap_orig, pd_cap;
+
+ cpu = cpumask_any(pd_span);
+ pd_cap_orig = arch_scale_cpu_capacity(cpu);
+
+ if (capacity_orig < pd_cap_orig)
+ continue;
+
+ /*
+ * handle the case of multiple perf domains have the
+ * same capacity_orig but one of them is under higher
+ * thermal pressure. We record it as capacity
+ * inversion.
+ */
+ if (capacity_orig == pd_cap_orig) {
+ pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu));
+
+ if (pd_cap > inv_cap) {
+ rq->cpu_capacity_inverted = inv_cap;
+ break;
+ }
+ } else if (pd_cap_orig > inv_cap) {
+ rq->cpu_capacity_inverted = inv_cap;
+ break;
+ }
+ }
+ }
+
+ trace_sched_cpu_capacity_tp(rq);
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
@@ -9113,6 +9514,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
memset(sgs, 0, sizeof(*sgs));
+ /* Assume that task can't fit any CPU of the group */
+ if (sd->flags & SD_ASYM_CPUCAPACITY)
+ sgs->group_misfit_task_load = 1;
+
for_each_cpu(i, sched_group_span(group)) {
struct rq *rq = cpu_rq(i);
unsigned int local;
@@ -9132,12 +9537,12 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
if (!nr_running && idle_cpu_without(i, p))
sgs->idle_cpus++;
- }
+ /* Check if task fits in the CPU */
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
+ sgs->group_misfit_task_load &&
+ task_fits_cpu(p, i))
+ sgs->group_misfit_task_load = 0;
- /* Check if task fits in the group */
- if (sd->flags & SD_ASYM_CPUCAPACITY &&
- !task_fits_capacity(p, group->sgc->max_capacity)) {
- sgs->group_misfit_task_load = 1;
}
sgs->group_capacity = group->sgc->capacity;
@@ -10099,14 +10504,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct rq *busiest;
struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
-
struct lb_env env = {
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.dst_grpmask = sched_group_span(sd->groups),
.idle = idle,
- .loop_break = sched_nr_migrate_break,
+ .loop_break = SCHED_NR_MIGRATE_BREAK,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
@@ -10134,7 +10538,7 @@ redo:
goto out_balanced;
}
- BUG_ON(busiest == env.dst_rq);
+ WARN_ON_ONCE(busiest == env.dst_rq);
schedstat_add(sd->lb_imbalance[idle], env.imbalance);
@@ -10182,7 +10586,9 @@ more_balance:
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
- goto more_balance;
+ /* Stop if we tried all running tasks */
+ if (env.loop < busiest->nr_running)
+ goto more_balance;
}
/*
@@ -10213,7 +10619,7 @@ more_balance:
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
+ env.loop_break = SCHED_NR_MIGRATE_BREAK;
/*
* Go back to "more_balance" rather than "redo" since we
@@ -10245,7 +10651,7 @@ more_balance:
*/
if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
+ env.loop_break = SCHED_NR_MIGRATE_BREAK;
goto redo;
}
goto out_all_pinned;
@@ -10430,7 +10836,7 @@ static int active_load_balance_cpu_stop(void *data)
* we need to fix it. Originally reported by
* Bjorn Helgaas on a 128-CPU setup.
*/
- BUG_ON(busiest_rq == target_rq);
+ WARN_ON_ONCE(busiest_rq == target_rq);
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
@@ -10916,8 +11322,7 @@ static bool update_nohz_stats(struct rq *rq)
* can be a simple update of blocked load or a complete load balance with
* tasks movement depending of flags.
*/
-static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
- enum cpu_idle_type idle)
+static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
{
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
@@ -11032,7 +11437,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (idle != CPU_IDLE)
return false;
- _nohz_idle_balance(this_rq, flags, idle);
+ _nohz_idle_balance(this_rq, flags);
return true;
}
@@ -11052,7 +11457,7 @@ void nohz_run_idle_balance(int cpu)
* (ie NOHZ_STATS_KICK set) and will do the same.
*/
if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
- _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
+ _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
}
static void nohz_newidle_balance(struct rq *this_rq)
@@ -11552,6 +11957,17 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+#ifdef CONFIG_SMP
+ /*
+ * In case the task sched_avg hasn't been attached:
+ * - A forked task which hasn't been woken up by wake_up_new_task().
+ * - A task which has been woken up by try_to_wake_up() but is
+ * waiting for actually being woken up by sched_ttwu_pending().
+ */
+ if (!se->avg.last_update_time)
+ return;
+#endif
+
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
@@ -11563,14 +11979,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * Since the real-depth could have been changed (only FAIR
- * class maintain depth value), reset depth properly.
- */
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
@@ -11666,39 +12074,25 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_set_group_fair(struct task_struct *p)
+static void task_change_group_fair(struct task_struct *p)
{
- struct sched_entity *se = &p->se;
-
- set_task_rq(p, task_cpu(p));
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-}
+ /*
+ * We couldn't detach or attach a forked task which
+ * hasn't been woken up by wake_up_new_task().
+ */
+ if (READ_ONCE(p->__state) == TASK_NEW)
+ return;
-static void task_move_group_fair(struct task_struct *p)
-{
detach_task_cfs_rq(p);
- set_task_rq(p, task_cpu(p));
#ifdef CONFIG_SMP
/* Tell se's cfs_rq has been changed -- migrated */
p->se.avg.last_update_time = 0;
#endif
+ set_task_rq(p, task_cpu(p));
attach_task_cfs_rq(p);
}
-static void task_change_group_fair(struct task_struct *p, int type)
-{
- switch (type) {
- case TASK_SET_GROUP:
- task_set_group_fair(p);
- break;
-
- case TASK_MOVE_GROUP:
- task_move_group_fair(p);
- break;
- }
-}
-
void free_fair_sched_group(struct task_group *tg)
{
int i;
@@ -12075,6 +12469,13 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
+ int i;
+
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+ }
+
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ecb4b4ff4ce0..8ac8b81bfee6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
{
int cpu;
+ group->enabled = true;
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
group->avg_last_update = sched_clock();
@@ -188,6 +189,7 @@ static void group_init(struct psi_group *group)
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
+ atomic_set(&group->poll_scheduled, 0);
mutex_init(&group->trigger_lock);
INIT_LIST_HEAD(&group->triggers);
group->poll_min_period = U32_MAX;
@@ -201,6 +203,7 @@ void __init psi_init(void)
{
if (!psi_enable) {
static_branch_enable(&psi_disabled);
+ static_branch_disable(&psi_cgroups_enabled);
return;
}
@@ -211,7 +214,7 @@ void __init psi_init(void)
group_init(&psi_system);
}
-static bool test_state(unsigned int *tasks, enum psi_states state)
+static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
{
switch (state) {
case PSI_IO_SOME:
@@ -224,9 +227,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
return unlikely(tasks[NR_MEMSTALL] &&
tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME:
- return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+ return unlikely(tasks[NR_RUNNING] > oncpu);
case PSI_CPU_FULL:
- return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+ return unlikely(tasks[NR_RUNNING] && !oncpu);
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@@ -240,6 +243,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
u32 *pchanged_states)
{
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+ int current_cpu = raw_smp_processor_id();
+ unsigned int tasks[NR_PSI_TASK_COUNTS];
u64 now, state_start;
enum psi_states s;
unsigned int seq;
@@ -254,6 +259,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
+ if (cpu == current_cpu)
+ memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
} while (read_seqcount_retry(&groupc->seq, seq));
/* Calculate state time deltas against the previous snapshot */
@@ -278,6 +285,28 @@ static void get_recent_times(struct psi_group *group, int cpu,
if (delta)
*pchanged_states |= (1 << s);
}
+
+ /*
+ * When collect_percpu_times() from the avgs_work, we don't want to
+ * re-arm avgs_work when all CPUs are IDLE. But the current CPU running
+ * this avgs_work is never IDLE, cause avgs_work can't be shut off.
+ * So for the current CPU, we need to re-arm avgs_work only when
+ * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs
+ * we can just check PSI_NONIDLE delta.
+ */
+ if (current_work() == &group->avgs_work.work) {
+ bool reschedule;
+
+ if (cpu == current_cpu)
+ reschedule = tasks[NR_RUNNING] +
+ tasks[NR_IOWAIT] +
+ tasks[NR_MEMSTALL] > 1;
+ else
+ reschedule = *pchanged_states & (1 << PSI_NONIDLE);
+
+ if (reschedule)
+ *pchanged_states |= PSI_STATE_RESCHEDULE;
+ }
}
static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -413,7 +442,6 @@ static void psi_avgs_work(struct work_struct *work)
struct delayed_work *dwork;
struct psi_group *group;
u32 changed_states;
- bool nonidle;
u64 now;
dwork = to_delayed_work(work);
@@ -424,7 +452,6 @@ static void psi_avgs_work(struct work_struct *work)
now = sched_clock();
collect_percpu_times(group, PSI_AVGS, &changed_states);
- nonidle = changed_states & (1 << PSI_NONIDLE);
/*
* If there is task activity, periodically fold the per-cpu
* times and feed samples into the running averages. If things
@@ -435,7 +462,7 @@ static void psi_avgs_work(struct work_struct *work)
if (now >= group->avg_next_update)
group->avg_next_update = update_averages(group, now);
- if (nonidle) {
+ if (changed_states & PSI_STATE_RESCHEDULE) {
schedule_delayed_work(dwork, nsecs_to_jiffies(
group->avg_next_update - now) + 1);
}
@@ -537,10 +564,12 @@ static u64 update_triggers(struct psi_group *group, u64 now)
/* Calculate growth since last update */
growth = window_update(&t->win, now, total[t->state]);
- if (growth < t->threshold)
- continue;
+ if (!t->pending_event) {
+ if (growth < t->threshold)
+ continue;
- t->pending_event = true;
+ t->pending_event = true;
+ }
}
/* Limit event signaling to once per window */
if (now < t->last_event_time + t->win.size)
@@ -561,18 +590,17 @@ static u64 update_triggers(struct psi_group *group, u64 now)
return now + group->poll_min_period;
}
-/* Schedule polling if it's not already scheduled. */
-static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
+/* Schedule polling if it's not already scheduled or forced. */
+static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
+ bool force)
{
struct task_struct *task;
/*
- * Do not reschedule if already scheduled.
- * Possible race with a timer scheduled after this check but before
- * mod_timer below can be tolerated because group->polling_next_update
- * will keep updates on schedule.
+ * atomic_xchg should be called even when !force to provide a
+ * full memory barrier (see the comment inside psi_poll_work).
*/
- if (timer_pending(&group->poll_timer))
+ if (atomic_xchg(&group->poll_scheduled, 1) && !force)
return;
rcu_read_lock();
@@ -584,12 +612,15 @@ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
*/
if (likely(task))
mod_timer(&group->poll_timer, jiffies + delay);
+ else
+ atomic_set(&group->poll_scheduled, 0);
rcu_read_unlock();
}
static void psi_poll_work(struct psi_group *group)
{
+ bool force_reschedule = false;
u32 changed_states;
u64 now;
@@ -597,6 +628,43 @@ static void psi_poll_work(struct psi_group *group)
now = sched_clock();
+ if (now > group->polling_until) {
+ /*
+ * We are either about to start or might stop polling if no
+ * state change was recorded. Resetting poll_scheduled leaves
+ * a small window for psi_group_change to sneak in and schedule
+ * an immediate poll_work before we get to rescheduling. One
+ * potential extra wakeup at the end of the polling window
+ * should be negligible and polling_next_update still keeps
+ * updates correctly on schedule.
+ */
+ atomic_set(&group->poll_scheduled, 0);
+ /*
+ * A task change can race with the poll worker that is supposed to
+ * report on it. To avoid missing events, ensure ordering between
+ * poll_scheduled and the task state accesses, such that if the poll
+ * worker misses the state update, the task change is guaranteed to
+ * reschedule the poll worker:
+ *
+ * poll worker:
+ * atomic_set(poll_scheduled, 0)
+ * smp_mb()
+ * LOAD states
+ *
+ * task change:
+ * STORE states
+ * if atomic_xchg(poll_scheduled, 1) == 0:
+ * schedule poll worker
+ *
+ * The atomic_xchg() implies a full barrier.
+ */
+ smp_mb();
+ } else {
+ /* Polling window is not over, keep rescheduling */
+ force_reschedule = true;
+ }
+
+
collect_percpu_times(group, PSI_POLL, &changed_states);
if (changed_states & group->poll_states) {
@@ -622,7 +690,8 @@ static void psi_poll_work(struct psi_group *group)
group->polling_next_update = update_triggers(group, now);
psi_schedule_poll_work(group,
- nsecs_to_jiffies(group->polling_next_update - now) + 1);
+ nsecs_to_jiffies(group->polling_next_update - now) + 1,
+ force_reschedule);
out:
mutex_unlock(&group->trigger_lock);
@@ -688,35 +757,53 @@ static void psi_group_change(struct psi_group *group, int cpu,
bool wake_clock)
{
struct psi_group_cpu *groupc;
- u32 state_mask = 0;
unsigned int t, m;
enum psi_states s;
+ u32 state_mask;
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
- * First we assess the aggregate resource states this CPU's
- * tasks have been in since the last change, and account any
- * SOME and FULL time these may have resulted in.
- *
- * Then we update the task counts according to the state
+ * First we update the task counts according to the state
* change requested through the @clear and @set bits.
+ *
+ * Then if the cgroup PSI stats accounting enabled, we
+ * assess the aggregate resource states this CPU's tasks
+ * have been in since the last change, and account any
+ * SOME and FULL time these may have resulted in.
*/
write_seqcount_begin(&groupc->seq);
- record_times(groupc, now);
+ /*
+ * Start with TSK_ONCPU, which doesn't have a corresponding
+ * task count - it's just a boolean flag directly encoded in
+ * the state mask. Clear, set, or carry the current state if
+ * no changes are requested.
+ */
+ if (unlikely(clear & TSK_ONCPU)) {
+ state_mask = 0;
+ clear &= ~TSK_ONCPU;
+ } else if (unlikely(set & TSK_ONCPU)) {
+ state_mask = PSI_ONCPU;
+ set &= ~TSK_ONCPU;
+ } else {
+ state_mask = groupc->state_mask & PSI_ONCPU;
+ }
+ /*
+ * The rest of the state mask is calculated based on the task
+ * counts. Update those first, then construct the mask.
+ */
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
continue;
if (groupc->tasks[t]) {
groupc->tasks[t]--;
} else if (!psi_bug) {
- printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
- groupc->tasks[3], groupc->tasks[4],
- clear, set);
+ groupc->tasks[3], clear, set);
psi_bug = 1;
}
}
@@ -725,9 +812,25 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (set & (1 << t))
groupc->tasks[t]++;
- /* Calculate state mask representing active states */
+ if (!group->enabled) {
+ /*
+ * On the first group change after disabling PSI, conclude
+ * the current state and flush its time. This is unlikely
+ * to matter to the user, but aggregation (get_recent_times)
+ * may have already incorporated the live state into times_prev;
+ * avoid a delta sample underflow when PSI is later re-enabled.
+ */
+ if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+ record_times(groupc, now);
+
+ groupc->state_mask = state_mask;
+
+ write_seqcount_end(&groupc->seq);
+ return;
+ }
+
for (s = 0; s < NR_PSI_STATES; s++) {
- if (test_state(groupc->tasks, s))
+ if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
state_mask |= (1 << s);
}
@@ -739,41 +842,28 @@ static void psi_group_change(struct psi_group *group, int cpu,
* task in a cgroup is in_memstall, the corresponding groupc
* on that cpu is in PSI_MEM_FULL state.
*/
- if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+ if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
state_mask |= (1 << PSI_MEM_FULL);
+ record_times(groupc, now);
+
groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq);
if (state_mask & group->poll_states)
- psi_schedule_poll_work(group, 1);
+ psi_schedule_poll_work(group, 1, false);
if (wake_clock && !delayed_work_pending(&group->avgs_work))
schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}
-static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+static inline struct psi_group *task_psi_group(struct task_struct *task)
{
- if (*iter == &psi_system)
- return NULL;
-
#ifdef CONFIG_CGROUPS
- if (static_branch_likely(&psi_cgroups_enabled)) {
- struct cgroup *cgroup = NULL;
-
- if (!*iter)
- cgroup = task->cgroups->dfl_cgrp;
- else
- cgroup = cgroup_parent(*iter);
-
- if (cgroup && cgroup_parent(cgroup)) {
- *iter = cgroup;
- return cgroup_psi(cgroup);
- }
- }
+ if (static_branch_likely(&psi_cgroups_enabled))
+ return cgroup_psi(task_dfl_cgroup(task));
#endif
- *iter = &psi_system;
return &psi_system;
}
@@ -796,8 +886,6 @@ void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
struct psi_group *group;
- bool wake_clock = true;
- void *iter = NULL;
u64 now;
if (!task->pid)
@@ -806,19 +894,11 @@ void psi_task_change(struct task_struct *task, int clear, int set)
psi_flags_change(task, clear, set);
now = cpu_clock(cpu);
- /*
- * Periodic aggregation shuts off if there is a period of no
- * task changes, so we wake it back up if necessary. However,
- * don't do this if the task change is the aggregation worker
- * itself going to sleep, or we'll ping-pong forever.
- */
- if (unlikely((clear & TSK_RUNNING) &&
- (task->flags & PF_WQ_WORKER) &&
- wq_worker_last_func(task) == psi_avgs_work))
- wake_clock = false;
- while ((group = iterate_groups(task, &iter)))
- psi_group_change(group, cpu, clear, set, now, wake_clock);
+ group = task_psi_group(task);
+ do {
+ psi_group_change(group, cpu, clear, set, now, true);
+ } while ((group = group->parent));
}
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -826,34 +906,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
{
struct psi_group *group, *common = NULL;
int cpu = task_cpu(prev);
- void *iter;
u64 now = cpu_clock(cpu);
if (next->pid) {
- bool identical_state;
-
psi_flags_change(next, 0, TSK_ONCPU);
/*
- * When switching between tasks that have an identical
- * runtime state, the cgroup that contains both tasks
- * we reach the first common ancestor. Iterate @next's
- * ancestors only until we encounter @prev's ONCPU.
+ * Set TSK_ONCPU on @next's cgroups. If @next shares any
+ * ancestors with @prev, those will already have @prev's
+ * TSK_ONCPU bit set, and we can stop the iteration there.
*/
- identical_state = prev->psi_flags == next->psi_flags;
- iter = NULL;
- while ((group = iterate_groups(next, &iter))) {
- if (identical_state &&
- per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+ group = task_psi_group(next);
+ do {
+ if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+ PSI_ONCPU) {
common = group;
break;
}
psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
- }
+ } while ((group = group->parent));
}
if (prev->pid) {
int clear = TSK_ONCPU, set = 0;
+ bool wake_clock = true;
/*
* When we're going to sleep, psi_dequeue() lets us
@@ -867,26 +943,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
clear |= TSK_MEMSTALL_RUNNING;
if (prev->in_iowait)
set |= TSK_IOWAIT;
+
+ /*
+ * Periodic aggregation shuts off if there is a period of no
+ * task changes, so we wake it back up if necessary. However,
+ * don't do this if the task change is the aggregation worker
+ * itself going to sleep, or we'll ping-pong forever.
+ */
+ if (unlikely((prev->flags & PF_WQ_WORKER) &&
+ wq_worker_last_func(prev) == psi_avgs_work))
+ wake_clock = false;
}
psi_flags_change(prev, clear, set);
- iter = NULL;
- while ((group = iterate_groups(prev, &iter)) && group != common)
- psi_group_change(group, cpu, clear, set, now, true);
+ group = task_psi_group(prev);
+ do {
+ if (group == common)
+ break;
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
+ } while ((group = group->parent));
/*
- * TSK_ONCPU is handled up to the common ancestor. If we're tasked
- * with dequeuing too, finish that for the rest of the hierarchy.
+ * TSK_ONCPU is handled up to the common ancestor. If there are
+ * any other differences between the two tasks (e.g. prev goes
+ * to sleep, or only one task is memstall), finish propagating
+ * those differences all the way up to the root.
*/
- if (sleep) {
+ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
clear &= ~TSK_ONCPU;
- for (; group; group = iterate_groups(prev, &iter))
- psi_group_change(group, cpu, clear, set, now, true);
+ for (; group; group = group->parent)
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
}
}
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct task_struct *task, u32 delta)
+{
+ int cpu = task_cpu(task);
+ struct psi_group *group;
+ struct psi_group_cpu *groupc;
+ u64 now;
+
+ if (!task->pid)
+ return;
+
+ now = cpu_clock(cpu);
+
+ group = task_psi_group(task);
+ do {
+ if (!group->enabled)
+ continue;
+
+ groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ write_seqcount_begin(&groupc->seq);
+
+ record_times(groupc, now);
+ groupc->times[PSI_IRQ_FULL] += delta;
+
+ write_seqcount_end(&groupc->seq);
+
+ if (group->poll_states & (1 << PSI_IRQ_FULL))
+ psi_schedule_poll_work(group, 1, false);
+ } while ((group = group->parent));
+}
+#endif
+
/**
* psi_memstall_enter - mark the beginning of a memory stall section
* @flags: flags to handle nested sections
@@ -917,6 +1041,7 @@ void psi_memstall_enter(unsigned long *flags)
rq_unlock_irq(rq, &rf);
}
+EXPORT_SYMBOL_GPL(psi_memstall_enter);
/**
* psi_memstall_leave - mark the end of an memory stall section
@@ -946,11 +1071,12 @@ void psi_memstall_leave(unsigned long *flags)
rq_unlock_irq(rq, &rf);
}
+EXPORT_SYMBOL_GPL(psi_memstall_leave);
#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
- if (static_branch_likely(&psi_disabled))
+ if (!static_branch_likely(&psi_cgroups_enabled))
return 0;
cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
@@ -963,12 +1089,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
return -ENOMEM;
}
group_init(cgroup->psi);
+ cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
return 0;
}
void psi_cgroup_free(struct cgroup *cgroup)
{
- if (static_branch_likely(&psi_disabled))
+ if (!static_branch_likely(&psi_cgroups_enabled))
return;
cancel_delayed_work_sync(&cgroup->psi->avgs_work);
@@ -996,7 +1123,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
struct rq_flags rf;
struct rq *rq;
- if (static_branch_likely(&psi_disabled)) {
+ if (!static_branch_likely(&psi_cgroups_enabled)) {
/*
* Lame to do this here, but the scheduler cannot be locked
* from the outside, so we move cgroups from inside sched/.
@@ -1044,10 +1171,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
task_rq_unlock(rq, task, &rf);
}
+
+void psi_cgroup_restart(struct psi_group *group)
+{
+ int cpu;
+
+ /*
+ * After we disable psi_group->enabled, we don't actually
+ * stop percpu tasks accounting in each psi_group_cpu,
+ * instead only stop test_state() loop, record_times()
+ * and averaging worker, see psi_group_change() for details.
+ *
+ * When disable cgroup PSI, this function has nothing to sync
+ * since cgroup pressure files are hidden and percpu psi_group_cpu
+ * would see !psi_group->enabled and only do task accounting.
+ *
+ * When re-enable cgroup PSI, this function use psi_group_change()
+ * to get correct state mask from test_state() loop on tasks[],
+ * and restart groupc->state_start from now, use .clear = .set = 0
+ * here since no task status really changed.
+ */
+ if (!group->enabled)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ u64 now;
+
+ rq_lock_irq(rq, &rf);
+ now = cpu_clock(cpu);
+ psi_group_change(group, cpu, 0, 0, now, true);
+ rq_unlock_irq(rq, &rf);
+ }
+}
#endif /* CONFIG_CGROUPS */
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
+ bool only_full = false;
int full;
u64 now;
@@ -1062,7 +1224,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
group->avg_next_update = update_averages(group, now);
mutex_unlock(&group->avgs_lock);
- for (full = 0; full < 2; full++) {
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ only_full = res == PSI_IRQ;
+#endif
+
+ for (full = 0; full < 2 - only_full; full++) {
unsigned long avg[3] = { 0, };
u64 total = 0;
int w;
@@ -1076,7 +1242,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
}
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
- full ? "full" : "some",
+ full || only_full ? "full" : "some",
LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1104,6 +1270,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
else
return ERR_PTR(-EINVAL);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+ return ERR_PTR(-EINVAL);
+#endif
+
if (state >= PSI_NONIDLE)
return ERR_PTR(-EINVAL);
@@ -1221,6 +1392,7 @@ void psi_trigger_destroy(struct psi_trigger *t)
* can no longer be found through group->poll_task.
*/
kthread_stop(task_to_destroy);
+ atomic_set(&group->poll_scheduled, 0);
}
kfree(t);
}
@@ -1388,6 +1560,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
.proc_release = psi_fop_release,
};
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+ return psi_open(file, psi_irq_show);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+ .proc_open = psi_irq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_irq_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
+};
+#endif
+
static int __init psi_proc_init(void)
{
if (psi_enable) {
@@ -1395,6 +1594,9 @@ static int __init psi_proc_init(void)
proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+#endif
}
return 0;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 55f39c8f4203..ed2a47e4ddae 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -410,8 +410,8 @@ static inline int has_pushable_tasks(struct rq *rq)
return !plist_head_empty(&rq->rt.pushable_tasks);
}
-static DEFINE_PER_CPU(struct callback_head, rt_push_head);
-static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
+static DEFINE_PER_CPU(struct balance_callback, rt_push_head);
+static DEFINE_PER_CPU(struct balance_callback, rt_pull_head);
static void push_rt_tasks(struct rq *);
static void pull_rt_task(struct rq *);
@@ -509,7 +509,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
unsigned int cpu_cap;
/* Only heterogeneous systems can benefit from this check */
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ if (!sched_asym_cpucap_active())
return true;
min_cap = uclamp_eff_value(p, UCLAMP_MIN);
@@ -843,7 +843,7 @@ static void __disable_runtime(struct rq *rq)
* We cannot be left wanting - that would mean some runtime
* leaked out of the system.
*/
- BUG_ON(want);
+ WARN_ON_ONCE(want);
balanced:
/*
* Disable all the borrow logic by pretending we have inf
@@ -1062,11 +1062,7 @@ static void update_curr_rt(struct rq *rq)
trace_sched_stat_runtime(curr, delta_exec, 0);
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = now;
- cgroup_account_cputime(curr, delta_exec);
+ update_current_exec_runtime(curr, now, delta_exec);
if (!rt_bandwidth_enabled())
return;
@@ -1849,7 +1845,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
- if (!task_running(rq, p) &&
+ if (!task_on_cpu(rq, p) &&
cpumask_test_cpu(cpu, &p->cpus_mask))
return 1;
@@ -1897,7 +1893,7 @@ static int find_lowest_rq(struct task_struct *task)
* If we're on asym system ensure we consider the different capacities
* of the CPUs when searching for the lowest_mask.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ if (sched_asym_cpucap_active()) {
ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
task, lowest_mask,
@@ -2004,7 +2000,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
*/
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
- task_running(rq, task) ||
+ task_on_cpu(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
@@ -2462,7 +2458,7 @@ skip:
*/
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
- bool need_to_push = !task_running(rq, p) &&
+ bool need_to_push = !task_on_cpu(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e26688d387ae..771f8ddb7053 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -321,21 +321,6 @@ struct dl_bw {
u64 total_bw;
};
-/*
- * Verify the fitness of task @p to run on @cpu taking into account the
- * CPU original capacity and the runtime/deadline ratio of the task.
- *
- * The function will return true if the CPU original capacity of the
- * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the
- * task and false otherwise.
- */
-static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu)
-{
- unsigned long cap = arch_scale_cpu_capacity(cpu);
-
- return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime;
-}
-
extern void init_dl_bw(struct dl_bw *dl_b);
extern int sched_dl_global_validate(void);
extern void sched_dl_do_global(void);
@@ -953,6 +938,12 @@ struct uclamp_rq {
DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
#endif /* CONFIG_UCLAMP_TASK */
+struct rq;
+struct balance_callback {
+ struct balance_callback *next;
+ void (*func)(struct rq *rq);
+};
+
/*
* This is the main, per-CPU runqueue data structure.
*
@@ -1050,8 +1041,9 @@ struct rq {
unsigned long cpu_capacity;
unsigned long cpu_capacity_orig;
+ unsigned long cpu_capacity_inverted;
- struct callback_head *balance_callback;
+ struct balance_callback *balance_callback;
unsigned char nohz_idle_balance;
unsigned char idle_balance;
@@ -1159,6 +1151,9 @@ struct rq {
unsigned int core_forceidle_occupation;
u64 core_forceidle_start;
#endif
+
+ /* Scratch cpumask to be temporarily used under rq_lock */
+ cpumask_var_t scratch_mask;
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1197,6 +1192,14 @@ static inline bool is_migration_disabled(struct task_struct *p)
#endif
}
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
+#define this_rq() this_cpu_ptr(&runqueues)
+#define task_rq(p) cpu_rq(task_cpu(p))
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+#define raw_rq() raw_cpu_ptr(&runqueues)
+
struct sched_group;
#ifdef CONFIG_SCHED_CORE
static inline struct cpumask *sched_group_span(struct sched_group *sg);
@@ -1284,7 +1287,7 @@ static inline bool sched_group_cookie_match(struct rq *rq,
return true;
for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) {
- if (sched_core_cookie_match(rq, p))
+ if (sched_core_cookie_match(cpu_rq(cpu), p))
return true;
}
return false;
@@ -1399,14 +1402,6 @@ static inline void update_idle_core(struct rq *rq)
static inline void update_idle_core(struct rq *rq) { }
#endif
-DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-
-#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
-#define this_rq() this_cpu_ptr(&runqueues)
-#define task_rq(p) cpu_rq(task_cpu(p))
-#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-#define raw_rq() raw_cpu_ptr(&runqueues)
-
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline struct task_struct *task_of(struct sched_entity *se)
{
@@ -1559,7 +1554,7 @@ struct rq_flags {
#endif
};
-extern struct callback_head balance_push_callback;
+extern struct balance_callback balance_push_callback;
/*
* Lockdep annotation that avoids accidental unlocks; it's like a
@@ -1739,7 +1734,7 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
static inline void
queue_balance_callback(struct rq *rq,
- struct callback_head *head,
+ struct balance_callback *head,
void (*func)(struct rq *rq))
{
lockdep_assert_rq_held(rq);
@@ -1752,7 +1747,7 @@ queue_balance_callback(struct rq *rq,
if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
return;
- head->func = (void (*)(struct callback_head *))func;
+ head->func = func;
head->next = rq->balance_callback;
rq->balance_callback = head;
}
@@ -1815,6 +1810,11 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
extern struct static_key_false sched_asym_cpucapacity;
+static __always_inline bool sched_asym_cpucap_active(void)
+{
+ return static_branch_unlikely(&sched_asym_cpucapacity);
+}
+
struct sched_group_capacity {
atomic_t ref;
/*
@@ -1881,6 +1881,13 @@ static inline void dirty_sched_domain_sysctl(int cpu)
#endif
extern int sched_update_scaling(void);
+
+static inline const struct cpumask *task_user_cpus(struct task_struct *p)
+{
+ if (!p->user_cpus_ptr)
+ return cpu_possible_mask; /* &init_task.cpus_mask */
+ return p->user_cpus_ptr;
+}
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -1942,6 +1949,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
p->se.cfs_rq = tg->cfs_rq[cpu];
p->se.parent = tg->se[cpu];
+ p->se.depth = tg->se[cpu] ? tg->se[cpu]->depth + 1 : 0;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -2060,7 +2068,7 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
return rq->curr == p;
}
-static inline int task_running(struct rq *rq, struct task_struct *p)
+static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
return p->on_cpu;
@@ -2147,6 +2155,12 @@ extern const u32 sched_prio_to_wmult[40];
#define RETRY_TASK ((void *)-1UL)
+struct affinity_context {
+ const struct cpumask *new_mask;
+ struct cpumask *user_mask;
+ unsigned int flags;
+};
+
struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
@@ -2175,9 +2189,7 @@ struct sched_class {
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
- void (*set_cpus_allowed)(struct task_struct *p,
- const struct cpumask *newmask,
- u32 flags);
+ void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
@@ -2204,11 +2216,8 @@ struct sched_class {
void (*update_curr)(struct rq *rq);
-#define TASK_SET_GROUP 0
-#define TASK_MOVE_GROUP 1
-
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_change_group)(struct task_struct *p, int type);
+ void (*task_change_group)(struct task_struct *p);
#endif
};
@@ -2291,7 +2300,7 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
-extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);
static inline struct task_struct *get_push_task(struct rq *rq)
{
@@ -2435,6 +2444,12 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+#ifdef CONFIG_PREEMPT_RT
+#define SCHED_NR_MIGRATE_BREAK 8
+#else
+#define SCHED_NR_MIGRATE_BREAK 32
+#endif
+
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
@@ -2452,6 +2467,7 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
+extern unsigned int sysctl_numa_balancing_hot_threshold;
#endif
#ifdef CONFIG_SCHED_HRTICK
@@ -2709,8 +2725,8 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
__acquires(rq1->lock)
__acquires(rq2->lock)
{
- BUG_ON(!irqs_disabled());
- BUG_ON(rq1 != rq2);
+ WARN_ON_ONCE(!irqs_disabled());
+ WARN_ON_ONCE(rq1 != rq2);
raw_spin_rq_lock(rq1);
__acquire(rq2->lock); /* Fake it out ;) */
double_rq_clock_clear_update(rq1, rq2);
@@ -2726,7 +2742,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq1->lock)
__releases(rq2->lock)
{
- BUG_ON(rq1 != rq2);
+ WARN_ON_ONCE(rq1 != rq2);
raw_spin_rq_unlock(rq1);
__release(rq2->lock);
}
@@ -2877,6 +2893,24 @@ static inline unsigned long capacity_orig_of(int cpu)
return cpu_rq(cpu)->cpu_capacity_orig;
}
+/*
+ * Returns inverted capacity if the CPU is in capacity inversion state.
+ * 0 otherwise.
+ *
+ * Capacity inversion detection only considers thermal impact where actual
+ * performance points (OPPs) gets dropped.
+ *
+ * Capacity inversion state happens when another performance domain that has
+ * equal or lower capacity_orig_of() becomes effectively larger than the perf
+ * domain this CPU belongs to due to thermal pressure throttling it hard.
+ *
+ * See comment in update_cpu_capacity().
+ */
+static inline unsigned long cpu_in_capacity_inversion(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_inverted;
+}
+
/**
* enum cpu_util_type - CPU utilization type
* @FREQUENCY_UTIL: Utilization used to select frequency
@@ -2896,6 +2930,21 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
enum cpu_util_type type,
struct task_struct *p);
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the
+ * CPU original capacity and the runtime/deadline ratio of the task.
+ *
+ * The function will return true if the original capacity of @cpu is
+ * greater than or equal to task's deadline density right shifted by
+ * (BW_SHIFT - SCHED_CAPACITY_SHIFT) and false otherwise.
+ */
+static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ unsigned long cap = arch_scale_cpu_capacity(cpu);
+
+ return cap >= p->dl.dl_density >> (BW_SHIFT - SCHED_CAPACITY_SHIFT);
+}
+
static inline unsigned long cpu_bw_dl(struct rq *rq)
{
return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
@@ -2963,6 +3012,23 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
#ifdef CONFIG_UCLAMP_TASK
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+static inline unsigned long uclamp_rq_get(struct rq *rq,
+ enum uclamp_id clamp_id)
+{
+ return READ_ONCE(rq->uclamp[clamp_id].value);
+}
+
+static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int value)
+{
+ WRITE_ONCE(rq->uclamp[clamp_id].value, value);
+}
+
+static inline bool uclamp_rq_is_idle(struct rq *rq)
+{
+ return rq->uclamp_flags & UCLAMP_FLAG_IDLE;
+}
+
/**
* uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
* @rq: The rq to clamp against. Must not be NULL.
@@ -2998,12 +3064,12 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
* Ignore last runnable task's max clamp, as this task will
* reset it. Similarly, no need to read the rq's min clamp.
*/
- if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
+ if (uclamp_rq_is_idle(rq))
goto out;
}
- min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
- max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
+ min_util = max_t(unsigned long, min_util, uclamp_rq_get(rq, UCLAMP_MIN));
+ max_util = max_t(unsigned long, max_util, uclamp_rq_get(rq, UCLAMP_MAX));
out:
/*
* Since CPU's {min,max}_util clamps are MAX aggregated considering
@@ -3044,6 +3110,15 @@ static inline bool uclamp_is_used(void)
return static_branch_likely(&sched_uclamp_used);
}
#else /* CONFIG_UCLAMP_TASK */
+static inline unsigned long uclamp_eff_value(struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+
+ return SCHED_CAPACITY_SCALE;
+}
+
static inline
unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
struct task_struct *p)
@@ -3057,6 +3132,25 @@ static inline bool uclamp_is_used(void)
{
return false;
}
+
+static inline unsigned long uclamp_rq_get(struct rq *rq,
+ enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int value)
+{
+}
+
+static inline bool uclamp_rq_is_idle(struct rq *rq)
+{
+ return false;
+}
#endif /* CONFIG_UCLAMP_TASK */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
@@ -3157,4 +3251,14 @@ extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
#endif
+static inline void update_current_exec_runtime(struct task_struct *curr,
+ u64 now, u64 delta_exec)
+{
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = now;
+ cgroup_account_cputime(curr, delta_exec);
+}
+
#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index baa839c1ba96..38f3698f5e5b 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -107,6 +107,11 @@ __schedstats_from_se(struct sched_entity *se)
}
#ifdef CONFIG_PSI
+void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+ bool sleep);
+void psi_account_irqtime(struct task_struct *task, u32 delta);
+
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -123,11 +128,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
if (p->in_memstall)
set |= TSK_MEMSTALL_RUNNING;
- if (!wakeup || p->sched_psi_wake_requeue) {
+ if (!wakeup) {
if (p->in_memstall)
set |= TSK_MEMSTALL;
- if (p->sched_psi_wake_requeue)
- p->sched_psi_wake_requeue = 0;
} else {
if (p->in_iowait)
clear |= TSK_IOWAIT;
@@ -138,8 +141,6 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
static inline void psi_dequeue(struct task_struct *p, bool sleep)
{
- int clear = TSK_RUNNING;
-
if (static_branch_likely(&psi_disabled))
return;
@@ -152,10 +153,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
if (sleep)
return;
- if (p->in_memstall)
- clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
-
- psi_task_change(p, clear, 0);
+ psi_task_change(p, p->psi_flags, 0);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
@@ -167,19 +165,12 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
* deregister its sleep-persistent psi states from the old
* queue, and let psi_enqueue() know it has to requeue.
*/
- if (unlikely(p->in_iowait || p->in_memstall)) {
+ if (unlikely(p->psi_flags)) {
struct rq_flags rf;
struct rq *rq;
- int clear = 0;
-
- if (p->in_iowait)
- clear |= TSK_IOWAIT;
- if (p->in_memstall)
- clear |= TSK_MEMSTALL;
rq = __task_rq_lock(p, &rf);
- psi_task_change(p, clear, 0);
- p->sched_psi_wake_requeue = 1;
+ psi_task_change(p, p->psi_flags, 0);
__task_rq_unlock(rq, &rf);
}
}
@@ -201,6 +192,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
+static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index d04073a93eb4..85590599b4d6 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -71,20 +71,17 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
struct task_struct *curr = rq->curr;
- u64 delta_exec;
+ u64 now, delta_exec;
- delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ now = rq_clock_task(rq);
+ delta_exec = now - curr->se.exec_start;
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
schedstat_set(curr->stats.exec_max,
max(curr->stats.exec_max, delta_exec));
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = rq_clock_task(rq);
- cgroup_account_cputime(curr, delta_exec);
+ update_current_exec_runtime(curr, now, delta_exec);
}
/*
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 9860bb9a847c..133b74730738 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -121,11 +121,12 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
return nr_exclusive;
}
-static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
unsigned long flags;
wait_queue_entry_t bookmark;
+ int remaining = nr_exclusive;
bookmark.flags = 0;
bookmark.private = NULL;
@@ -134,10 +135,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
do {
spin_lock_irqsave(&wq_head->lock, flags);
- nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+ remaining = __wake_up_common(wq_head, mode, remaining,
wake_flags, key, &bookmark);
spin_unlock_irqrestore(&wq_head->lock, flags);
} while (bookmark.flags & WQ_FLAG_BOOKMARK);
+
+ return nr_exclusive - remaining;
}
/**
@@ -147,13 +150,14 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
*
- * If this function wakes up a task, it executes a full memory barrier before
- * accessing the task state.
+ * If this function wakes up a task, it executes a full memory barrier
+ * before accessing the task state. Returns the number of exclusive
+ * tasks that were awaken.
*/
-void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, void *key)
+int __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, void *key)
{
- __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
+ return __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
}
EXPORT_SYMBOL(__wake_up);
diff --git a/kernel/scs.c b/kernel/scs.c
index b7e1b096d906..d7809affe740 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -12,6 +12,10 @@
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
+#ifdef CONFIG_DYNAMIC_SCS
+DEFINE_STATIC_KEY_FALSE(dynamic_scs_enabled);
+#endif
+
static void __scs_account(void *s, int account)
{
struct page *scs_page = vmalloc_to_page(s);
@@ -101,14 +105,20 @@ static int scs_cleanup(unsigned int cpu)
void __init scs_init(void)
{
+ if (!scs_is_enabled())
+ return;
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "scs:scs_cache", NULL,
scs_cleanup);
}
int scs_prepare(struct task_struct *tsk, int node)
{
- void *s = scs_alloc(node);
+ void *s;
+ if (!scs_is_enabled())
+ return 0;
+
+ s = scs_alloc(node);
if (!s)
return -ENOMEM;
@@ -148,7 +158,7 @@ void scs_release(struct task_struct *tsk)
{
void *s = task_scs(tsk);
- if (!s)
+ if (!scs_is_enabled() || !s)
return;
WARN(task_scs_end_corrupted(tsk),
diff --git a/kernel/signal.c b/kernel/signal.c
index 6f86fda5e432..ae26da61c4d9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -913,8 +913,9 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
if (signal->core_state)
return sig == SIGKILL;
/*
- * The process is in the middle of dying, nothing to do.
+ * The process is in the middle of dying, drop the signal.
*/
+ return false;
} else if (sig_kernel_stop(sig)) {
/*
* This is a stop signal. Remove SIGCONT from all queues.
@@ -1254,7 +1255,7 @@ int send_signal_locked(int sig, struct kernel_siginfo *info,
static void print_fatal_signal(int signr)
{
- struct pt_regs *regs = signal_pt_regs();
+ struct pt_regs *regs = task_pt_regs(current);
pr_info("potentially unexpected fatal signal %d.\n", signr);
#if defined(__i386__) && !defined(__arch_um__)
@@ -2304,7 +2305,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
read_unlock(&tasklist_lock);
cgroup_enter_frozen();
preempt_enable_no_resched();
- freezable_schedule();
+ schedule();
cgroup_leave_frozen(true);
/*
@@ -2473,7 +2474,7 @@ static bool do_signal_stop(int signr)
/* Now we don't run again until woken by SIGCONT or SIGKILL */
cgroup_enter_frozen();
- freezable_schedule();
+ schedule();
return true;
} else {
/*
@@ -2548,11 +2549,11 @@ static void do_freezer_trap(void)
* immediately (if there is a non-fatal signal pending), and
* put the task into sleep.
*/
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
clear_thread_flag(TIF_SIGPENDING);
spin_unlock_irq(&current->sighand->siglock);
cgroup_enter_frozen();
- freezable_schedule();
+ schedule();
}
static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
@@ -2692,6 +2693,7 @@ relock:
/* Has this task already been marked for death? */
if ((signal->flags & SIGNAL_GROUP_EXIT) ||
signal->group_exec_task) {
+ clear_siginfo(&ksig->info);
ksig->info.si_signo = signr = SIGKILL;
sigdelset(&current->pending.signal, SIGKILL);
trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
@@ -3600,9 +3602,9 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
recalc_sigpending();
spin_unlock_irq(&tsk->sighand->siglock);
- __set_current_state(TASK_INTERRUPTIBLE);
- ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns,
- HRTIMER_MODE_REL);
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns,
+ HRTIMER_MODE_REL);
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
sigemptyset(&tsk->real_blocked);
diff --git a/kernel/smp.c b/kernel/smp.c
index 650810a6f29b..06a413987a14 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -370,8 +370,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
if (cpu >= 0) {
if (static_branch_unlikely(&csdlock_debug_extended))
csd_lock_print_extended(csd, cpu);
- if (!trigger_single_cpu_backtrace(cpu))
- dump_cpu_task(cpu);
+ dump_cpu_task(cpu);
if (!cpu_cur_csd) {
pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
arch_send_call_function_single_ipi(cpu);
@@ -1070,7 +1069,7 @@ static int __init nrcpus(char *str)
int nr_cpus;
if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids)
- nr_cpu_ids = nr_cpus;
+ set_nr_cpu_ids(nr_cpus);
return 0;
}
@@ -1088,14 +1087,16 @@ static int __init maxcpus(char *str)
early_param("maxcpus", maxcpus);
+#if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS)
/* Setup number of possible processor ids */
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
+#endif
/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
void __init setup_nr_cpu_ids(void)
{
- nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
+ set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1);
}
/* Called by boot processor to activate the rest. */
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index b9f54544e749..2c7396da470c 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -433,7 +433,7 @@ bool cpu_wait_death(unsigned int cpu, int seconds)
/* The outgoing CPU will normally get done quite quickly. */
if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
- goto update_state;
+ goto update_state_early;
udelay(5);
/* But if the outgoing CPU dawdles, wait increasingly long times. */
@@ -444,16 +444,17 @@ bool cpu_wait_death(unsigned int cpu, int seconds)
break;
sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
}
-update_state:
+update_state_early:
oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+update_state:
if (oldstate == CPU_DEAD) {
/* Outgoing CPU died normally, update state. */
smp_mb(); /* atomic_read() before update. */
atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
} else {
/* Outgoing CPU still hasn't died, set state accordingly. */
- if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
- oldstate, CPU_BROKEN) != oldstate)
+ if (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+ &oldstate, CPU_BROKEN))
goto update_state;
ret = false;
}
@@ -475,14 +476,14 @@ bool cpu_report_death(void)
int newstate;
int cpu = smp_processor_id();
+ oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
do {
- oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
if (oldstate != CPU_BROKEN)
newstate = CPU_DEAD;
else
newstate = CPU_DEAD_FROZEN;
- } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
- oldstate, newstate) != oldstate);
+ } while (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+ &oldstate, newstate));
return newstate == CPU_DEAD;
}
diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c
index dc5665b62814..639397b5491c 100644
--- a/kernel/static_call_inline.c
+++ b/kernel/static_call_inline.c
@@ -15,7 +15,18 @@ extern struct static_call_site __start_static_call_sites[],
extern struct static_call_tramp_key __start_static_call_tramp_key[],
__stop_static_call_tramp_key[];
-static bool static_call_initialized;
+static int static_call_initialized;
+
+/*
+ * Must be called before early_initcall() to be effective.
+ */
+void static_call_force_reinit(void)
+{
+ if (WARN_ON_ONCE(!static_call_initialized))
+ return;
+
+ static_call_initialized++;
+}
/* mutex to protect key modules/sites */
static DEFINE_MUTEX(static_call_mutex);
@@ -475,7 +486,8 @@ int __init static_call_init(void)
{
int ret;
- if (static_call_initialized)
+ /* See static_call_force_reinit(). */
+ if (static_call_initialized == 1)
return 0;
cpus_read_lock();
@@ -490,11 +502,12 @@ int __init static_call_init(void)
BUG();
}
- static_call_initialized = true;
-
#ifdef CONFIG_MODULES
- register_module_notifier(&static_call_module_nb);
+ if (!static_call_initialized)
+ register_module_notifier(&static_call_module_nb);
#endif
+
+ static_call_initialized = 1;
return 0;
}
early_initcall(static_call_init);
diff --git a/kernel/sys.c b/kernel/sys.c
index b911fa6d81ab..5fd54bf0e886 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -25,6 +25,7 @@
#include <linux/times.h>
#include <linux/posix-timers.h>
#include <linux/security.h>
+#include <linux/random.h>
#include <linux/suspend.h>
#include <linux/tty.h>
#include <linux/signal.h>
@@ -496,7 +497,7 @@ static void flag_nproc_exceeded(struct cred *new)
* for programs doing set*uid()+execve() by harmlessly deferring the
* failure to the execve() stage.
*/
- if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
+ if (is_rlimit_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
new->user != INIT_USER)
current->flags |= PF_NPROC_EXCEEDED;
else
@@ -1366,6 +1367,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
if (!copy_from_user(tmp, name, len)) {
struct new_utsname *u;
+ add_device_randomness(tmp, len);
down_write(&uts_sem);
u = utsname();
memcpy(u->nodename, tmp, len);
@@ -1419,6 +1421,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
if (!copy_from_user(tmp, name, len)) {
struct new_utsname *u;
+ add_device_randomness(tmp, len);
down_write(&uts_sem);
u = utsname();
memcpy(u->domainname, tmp, len);
diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c
index 664ded05dd7a..6ef887c19c48 100644
--- a/kernel/sysctl-test.c
+++ b/kernel/sysctl-test.c
@@ -9,9 +9,6 @@
#define KUNIT_PROC_READ 0
#define KUNIT_PROC_WRITE 1
-static int i_zero;
-static int i_one_hundred = 100;
-
/*
* Test that proc_dointvec will not try to use a NULL .data field even when the
* length is non-zero.
@@ -29,8 +26,8 @@ static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
/*
* proc_dointvec expects a buffer in user space, so we allocate one. We
@@ -79,8 +76,8 @@ static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test)
.maxlen = 0,
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
GFP_USER);
@@ -122,8 +119,8 @@ static void sysctl_test_api_dointvec_table_len_is_zero(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
GFP_USER);
@@ -156,8 +153,8 @@ static void sysctl_test_api_dointvec_table_read_but_position_set(
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
GFP_USER);
@@ -191,8 +188,8 @@ static void sysctl_test_dointvec_read_happy_single_positive(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
size_t len = 4;
loff_t pos = 0;
@@ -222,8 +219,8 @@ static void sysctl_test_dointvec_read_happy_single_negative(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
size_t len = 5;
loff_t pos = 0;
@@ -251,8 +248,8 @@ static void sysctl_test_dointvec_write_happy_single_positive(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
char input[] = "9";
size_t len = sizeof(input) - 1;
@@ -281,8 +278,8 @@ static void sysctl_test_dointvec_write_happy_single_negative(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
char input[] = "-9";
size_t len = sizeof(input) - 1;
@@ -313,8 +310,8 @@ static void sysctl_test_api_dointvec_write_single_less_int_min(
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
size_t max_len = 32, len = max_len;
loff_t pos = 0;
@@ -351,8 +348,8 @@ static void sysctl_test_api_dointvec_write_single_greater_int_max(
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
size_t max_len = 32, len = max_len;
loff_t pos = 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 205d605cacc5..137d4abe3eda 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -82,9 +82,16 @@
#include <linux/rtmutex.h>
#endif
+/* shared constants to be used in various sysctls */
+const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
+EXPORT_SYMBOL(sysctl_vals);
+
+const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX };
+EXPORT_SYMBOL_GPL(sysctl_long_vals);
+
#if defined(CONFIG_SYSCTL)
-/* Constants used for minimum and maximum */
+/* Constants used for minimum and maximum */
#ifdef CONFIG_PERF_EVENTS
static const int six_hundred_forty_kb = 640 * 1024;
@@ -129,11 +136,6 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
int sysctl_legacy_va_layout;
#endif
-#ifdef CONFIG_COMPACTION
-/* min_extfrag_threshold is SYSCTL_ZERO */;
-static const int max_extfrag_threshold = 1000;
-#endif
-
#endif /* CONFIG_SYSCTL */
/*
@@ -265,13 +267,14 @@ int proc_dostring(struct ctl_table *table, int write,
ppos);
}
-static size_t proc_skip_spaces(char **buf)
+static void proc_skip_spaces(char **buf, size_t *size)
{
- size_t ret;
- char *tmp = skip_spaces(*buf);
- ret = tmp - *buf;
- *buf = tmp;
- return ret;
+ while (*size) {
+ if (!isspace(**buf))
+ break;
+ (*size)--;
+ (*buf)++;
+ }
}
static void proc_skip_char(char **buf, size_t *size, const char v)
@@ -340,13 +343,12 @@ static int proc_get_long(char **buf, size_t *size,
unsigned long *val, bool *neg,
const char *perm_tr, unsigned perm_tr_len, char *tr)
{
- int len;
char *p, tmp[TMPBUFLEN];
+ ssize_t len = *size;
- if (!*size)
+ if (len <= 0)
return -EINVAL;
- len = *size;
if (len > TMPBUFLEN - 1)
len = TMPBUFLEN - 1;
@@ -519,7 +521,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
bool neg;
if (write) {
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (!left)
break;
@@ -546,7 +548,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
if (!write && !first && left && !err)
proc_put_char(&buffer, &left, '\n');
if (write && !err && left)
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (write && first)
return err ? : -EINVAL;
*lenp -= left;
@@ -588,7 +590,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data,
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (!left) {
err = -EINVAL;
goto out_free;
@@ -608,7 +610,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data,
}
if (!err && left)
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
out_free:
if (err)
@@ -1052,9 +1054,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
return 0;
}
- i = (unsigned long *) data;
- min = (unsigned long *) table->extra1;
- max = (unsigned long *) table->extra2;
+ i = data;
+ min = table->extra1;
+ max = table->extra2;
vleft = table->maxlen / sizeof(unsigned long);
left = *lenp;
@@ -1073,7 +1075,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
if (write) {
bool neg;
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (!left)
break;
@@ -1102,7 +1104,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
if (!write && !first && left && !err)
proc_put_char(&buffer, &left, '\n');
if (write && !err)
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (write && first)
return err ? : -EINVAL;
*lenp -= left;
@@ -1631,17 +1633,6 @@ int proc_do_static_key(struct ctl_table *table, int write,
}
static struct ctl_table kern_table[] = {
-#ifdef CONFIG_NUMA_BALANCING
- {
- .procname = "numa_balancing",
- .data = NULL, /* filled in by handler */
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sysctl_numa_balancing,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_FOUR,
- },
-#endif /* CONFIG_NUMA_BALANCING */
{
.procname = "panic",
.data = &panic_timeout,
@@ -2115,6 +2106,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&page_cluster_max,
},
{
.procname = "dirtytime_expire_seconds",
@@ -2216,7 +2208,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = (void *)&max_extfrag_threshold,
+ .extra2 = SYSCTL_ONE_THOUSAND,
},
{
.procname = "compact_unevictable_allowed",
diff --git a/kernel/task_work.c b/kernel/task_work.c
index dff75bcde151..065e1ef8fc8d 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -47,12 +47,12 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
/* record the work call stack in order to print it in KASAN reports */
kasan_record_aux_stack(work);
+ head = READ_ONCE(task->task_works);
do {
- head = READ_ONCE(task->task_works);
if (unlikely(head == &work_exited))
return -ESRCH;
work->next = head;
- } while (cmpxchg(&task->task_works, head, work) != head);
+ } while (!try_cmpxchg(&task->task_works, &head, work));
switch (notify) {
case TWA_NONE:
@@ -100,10 +100,12 @@ task_work_cancel_match(struct task_struct *task,
* we raced with task_work_run(), *pprev == NULL/exited.
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
- while ((work = READ_ONCE(*pprev))) {
- if (!match(work, data))
+ work = READ_ONCE(*pprev);
+ while (work) {
+ if (!match(work, data)) {
pprev = &work->next;
- else if (cmpxchg(pprev, work, work->next) == work)
+ work = READ_ONCE(*pprev);
+ } else if (try_cmpxchg(pprev, &work, work->next))
break;
}
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
@@ -151,16 +153,16 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set
* work_exited unless the list is empty.
*/
+ work = READ_ONCE(task->task_works);
do {
head = NULL;
- work = READ_ONCE(task->task_works);
if (!work) {
if (task->flags & PF_EXITING)
head = &work_exited;
else
break;
}
- } while (cmpxchg(&task->task_works, work, head) != work);
+ } while (!try_cmpxchg(&task->task_works, &work, head));
if (!work)
break;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index f7e246336218..8ce3fa0c19e2 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -688,6 +688,7 @@ static struct genl_family family __ro_after_init = {
.module = THIS_MODULE,
.ops = taskstats_ops,
.n_ops = ARRAY_SIZE(taskstats_ops),
+ .resv_start_op = CGROUPSTATS_CMD_GET + 1,
.netnsok = true,
};
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 5d85014d59b5..960143b183cd 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -76,7 +76,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
}
/**
- * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
+ * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds
* @latch: value to convert
* @evt: pointer to clock event device descriptor
*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index cee5da1e54c4..9cf32ccda715 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void)
* CPUs that are currently online.
*/
for (i = 1; i < n; i++) {
- cpu = prandom_u32() % nr_cpu_ids;
+ cpu = get_random_u32_below(nr_cpu_ids);
cpu = cpumask_next(cpu - 1, cpu_online_mask);
if (cpu >= nr_cpu_ids)
cpu = cpumask_first(cpu_online_mask);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 23af5eca11b1..3ae661ab6260 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2037,11 +2037,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
struct restart_block *restart;
do {
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
hrtimer_sleeper_start_expires(t, mode);
if (likely(t->task))
- freezable_schedule();
+ schedule();
hrtimer_cancel(&t->timer);
mode = HRTIMER_MODE_ABS;
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index aec832801c26..0775b9ec952a 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -192,6 +192,24 @@ static void timens_setup_vdso_data(struct vdso_data *vdata,
offset[CLOCK_BOOTTIME_ALARM] = boottime;
}
+struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_mm == current->mm))
+ return current->nsproxy->time_ns->vvar_page;
+
+ /*
+ * VM_PFNMAP | VM_IO protect .fault() handler from being called
+ * through interfaces like /proc/$pid/mem or
+ * process_vm_{readv,writev}() as long as there's no .access()
+ * in special_mapping_vmops().
+ * For more details check_vma_flags() and __access_remote_vm()
+ */
+
+ WARN(1, "vvar_page accessed remotely");
+
+ return NULL;
+}
+
/*
* Protects possibly multiple offsets writers racing each other
* and tasks entering the namespace.
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 717fcb9fb14a..63a8ce7177dd 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1017,7 +1017,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
unsigned int idx = UINT_MAX;
int ret = 0;
- BUG_ON(!timer->function);
+ debug_assert_init(timer);
/*
* This is a common optimization triggered by the networking code - if
@@ -1044,6 +1044,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
* dequeue/enqueue dance.
*/
base = lock_timer_base(timer, &flags);
+ /*
+ * Has @timer been shutdown? This needs to be evaluated
+ * while holding base lock to prevent a race against the
+ * shutdown code.
+ */
+ if (!timer->function)
+ goto out_unlock;
+
forward_timer_base(base);
if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
@@ -1070,6 +1078,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
}
} else {
base = lock_timer_base(timer, &flags);
+ /*
+ * Has @timer been shutdown? This needs to be evaluated
+ * while holding base lock to prevent a race against the
+ * shutdown code.
+ */
+ if (!timer->function)
+ goto out_unlock;
+
forward_timer_base(base);
}
@@ -1083,7 +1099,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
/*
* We are trying to schedule the timer on the new base.
* However we can't change timer's base while it is running,
- * otherwise del_timer_sync() can't detect that the timer's
+ * otherwise timer_delete_sync() can't detect that the timer's
* handler yet has not finished. This also guarantees that the
* timer is serialized wrt itself.
*/
@@ -1121,14 +1137,20 @@ out_unlock:
}
/**
- * mod_timer_pending - modify a pending timer's timeout
- * @timer: the pending timer to be modified
- * @expires: new timeout in jiffies
+ * mod_timer_pending - Modify a pending timer's timeout
+ * @timer: The pending timer to be modified
+ * @expires: New absolute timeout in jiffies
+ *
+ * mod_timer_pending() is the same for pending timers as mod_timer(), but
+ * will not activate inactive timers.
*
- * mod_timer_pending() is the same for pending timers as mod_timer(),
- * but will not re-activate and modify already deleted timers.
+ * If @timer->function == NULL then the start operation is silently
+ * discarded.
*
- * It is useful for unserialized use of timers.
+ * Return:
+ * * %0 - The timer was inactive and not modified or was in
+ * shutdown state and the operation was discarded
+ * * %1 - The timer was active and requeued to expire at @expires
*/
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
@@ -1137,24 +1159,31 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires)
EXPORT_SYMBOL(mod_timer_pending);
/**
- * mod_timer - modify a timer's timeout
- * @timer: the timer to be modified
- * @expires: new timeout in jiffies
- *
- * mod_timer() is a more efficient way to update the expire field of an
- * active timer (if the timer is inactive it will be activated)
+ * mod_timer - Modify a timer's timeout
+ * @timer: The timer to be modified
+ * @expires: New absolute timeout in jiffies
*
* mod_timer(timer, expires) is equivalent to:
*
* del_timer(timer); timer->expires = expires; add_timer(timer);
*
+ * mod_timer() is more efficient than the above open coded sequence. In
+ * case that the timer is inactive, the del_timer() part is a NOP. The
+ * timer is in any case activated with the new expiry time @expires.
+ *
* Note that if there are multiple unserialized concurrent users of the
* same timer, then mod_timer() is the only safe way to modify the timeout,
* since add_timer() cannot modify an already running timer.
*
- * The function returns whether it has modified a pending timer or not.
- * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
- * active timer returns 1.)
+ * If @timer->function == NULL then the start operation is silently
+ * discarded. In this case the return value is 0 and meaningless.
+ *
+ * Return:
+ * * %0 - The timer was inactive and started or was in shutdown
+ * state and the operation was discarded
+ * * %1 - The timer was active and requeued to expire at @expires or
+ * the timer was active and not modified because @expires did
+ * not change the effective expiry time
*/
int mod_timer(struct timer_list *timer, unsigned long expires)
{
@@ -1165,11 +1194,22 @@ EXPORT_SYMBOL(mod_timer);
/**
* timer_reduce - Modify a timer's timeout if it would reduce the timeout
* @timer: The timer to be modified
- * @expires: New timeout in jiffies
+ * @expires: New absolute timeout in jiffies
*
* timer_reduce() is very similar to mod_timer(), except that it will only
- * modify a running timer if that would reduce the expiration time (it will
- * start a timer that isn't running).
+ * modify an enqueued timer if that would reduce the expiration time. If
+ * @timer is not enqueued it starts the timer.
+ *
+ * If @timer->function == NULL then the start operation is silently
+ * discarded.
+ *
+ * Return:
+ * * %0 - The timer was inactive and started or was in shutdown
+ * state and the operation was discarded
+ * * %1 - The timer was active and requeued to expire at @expires or
+ * the timer was active and not modified because @expires
+ * did not change the effective expiry time such that the
+ * timer would expire earlier than already scheduled
*/
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
@@ -1178,39 +1218,51 @@ int timer_reduce(struct timer_list *timer, unsigned long expires)
EXPORT_SYMBOL(timer_reduce);
/**
- * add_timer - start a timer
- * @timer: the timer to be added
+ * add_timer - Start a timer
+ * @timer: The timer to be started
*
- * The kernel will do a ->function(@timer) callback from the
- * timer interrupt at the ->expires point in the future. The
- * current time is 'jiffies'.
+ * Start @timer to expire at @timer->expires in the future. @timer->expires
+ * is the absolute expiry time measured in 'jiffies'. When the timer expires
+ * timer->function(timer) will be invoked from soft interrupt context.
*
- * The timer's ->expires, ->function fields must be set prior calling this
- * function.
+ * The @timer->expires and @timer->function fields must be set prior
+ * to calling this function.
*
- * Timers with an ->expires field in the past will be executed in the next
- * timer tick.
+ * If @timer->function == NULL then the start operation is silently
+ * discarded.
+ *
+ * If @timer->expires is already in the past @timer will be queued to
+ * expire at the next timer tick.
+ *
+ * This can only operate on an inactive timer. Attempts to invoke this on
+ * an active timer are rejected with a warning.
*/
void add_timer(struct timer_list *timer)
{
- BUG_ON(timer_pending(timer));
+ if (WARN_ON_ONCE(timer_pending(timer)))
+ return;
__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);
/**
- * add_timer_on - start a timer on a particular CPU
- * @timer: the timer to be added
- * @cpu: the CPU to start it on
+ * add_timer_on - Start a timer on a particular CPU
+ * @timer: The timer to be started
+ * @cpu: The CPU to start it on
+ *
+ * Same as add_timer() except that it starts the timer on the given CPU.
*
- * This is not very scalable on SMP. Double adds are not possible.
+ * See add_timer() for further details.
*/
void add_timer_on(struct timer_list *timer, int cpu)
{
struct timer_base *new_base, *base;
unsigned long flags;
- BUG_ON(timer_pending(timer) || !timer->function);
+ debug_assert_init(timer);
+
+ if (WARN_ON_ONCE(timer_pending(timer)))
+ return;
new_base = get_timer_cpu_base(timer->flags, cpu);
@@ -1220,6 +1272,13 @@ void add_timer_on(struct timer_list *timer, int cpu)
* wrong base locked. See lock_timer_base().
*/
base = lock_timer_base(timer, &flags);
+ /*
+ * Has @timer been shutdown? This needs to be evaluated while
+ * holding base lock to prevent a race against the shutdown code.
+ */
+ if (!timer->function)
+ goto out_unlock;
+
if (base != new_base) {
timer->flags |= TIMER_MIGRATING;
@@ -1233,22 +1292,27 @@ void add_timer_on(struct timer_list *timer, int cpu)
debug_timer_activate(timer);
internal_add_timer(base, timer);
+out_unlock:
raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);
/**
- * del_timer - deactivate a timer.
- * @timer: the timer to be deactivated
- *
- * del_timer() deactivates a timer - this works on both active and inactive
- * timers.
- *
- * The function returns whether it has deactivated a pending timer or not.
- * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
- * active timer returns 1.)
+ * __timer_delete - Internal function: Deactivate a timer
+ * @timer: The timer to be deactivated
+ * @shutdown: If true, this indicates that the timer is about to be
+ * shutdown permanently.
+ *
+ * If @shutdown is true then @timer->function is set to NULL under the
+ * timer base lock which prevents further rearming of the time. In that
+ * case any attempt to rearm @timer after this function returns will be
+ * silently ignored.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
*/
-int del_timer(struct timer_list *timer)
+static int __timer_delete(struct timer_list *timer, bool shutdown)
{
struct timer_base *base;
unsigned long flags;
@@ -1256,24 +1320,90 @@ int del_timer(struct timer_list *timer)
debug_assert_init(timer);
- if (timer_pending(timer)) {
+ /*
+ * If @shutdown is set then the lock has to be taken whether the
+ * timer is pending or not to protect against a concurrent rearm
+ * which might hit between the lockless pending check and the lock
+ * aquisition. By taking the lock it is ensured that such a newly
+ * enqueued timer is dequeued and cannot end up with
+ * timer->function == NULL in the expiry code.
+ *
+ * If timer->function is currently executed, then this makes sure
+ * that the callback cannot requeue the timer.
+ */
+ if (timer_pending(timer) || shutdown) {
base = lock_timer_base(timer, &flags);
ret = detach_if_pending(timer, base, true);
+ if (shutdown)
+ timer->function = NULL;
raw_spin_unlock_irqrestore(&base->lock, flags);
}
return ret;
}
-EXPORT_SYMBOL(del_timer);
/**
- * try_to_del_timer_sync - Try to deactivate a timer
- * @timer: timer to delete
+ * timer_delete - Deactivate a timer
+ * @timer: The timer to be deactivated
+ *
+ * The function only deactivates a pending timer, but contrary to
+ * timer_delete_sync() it does not take into account whether the timer's
+ * callback function is concurrently executed on a different CPU or not.
+ * It neither prevents rearming of the timer. If @timer can be rearmed
+ * concurrently then the return value of this function is meaningless.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
+ */
+int timer_delete(struct timer_list *timer)
+{
+ return __timer_delete(timer, false);
+}
+EXPORT_SYMBOL(timer_delete);
+
+/**
+ * timer_shutdown - Deactivate a timer and prevent rearming
+ * @timer: The timer to be deactivated
*
- * This function tries to deactivate a timer. Upon successful (ret >= 0)
- * exit the timer is not queued and the handler is not running on any CPU.
+ * The function does not wait for an eventually running timer callback on a
+ * different CPU but it prevents rearming of the timer. Any attempt to arm
+ * @timer after this function returns will be silently ignored.
+ *
+ * This function is useful for teardown code and should only be used when
+ * timer_shutdown_sync() cannot be invoked due to locking or context constraints.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending
*/
-int try_to_del_timer_sync(struct timer_list *timer)
+int timer_shutdown(struct timer_list *timer)
+{
+ return __timer_delete(timer, true);
+}
+EXPORT_SYMBOL_GPL(timer_shutdown);
+
+/**
+ * __try_to_del_timer_sync - Internal function: Try to deactivate a timer
+ * @timer: Timer to deactivate
+ * @shutdown: If true, this indicates that the timer is about to be
+ * shutdown permanently.
+ *
+ * If @shutdown is true then @timer->function is set to NULL under the
+ * timer base lock which prevents further rearming of the timer. Any
+ * attempt to rearm @timer after this function returns will be silently
+ * ignored.
+ *
+ * This function cannot guarantee that the timer cannot be rearmed
+ * right after dropping the base lock if @shutdown is false. That
+ * needs to be prevented by the calling code if necessary.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
+ * * %-1 - The timer callback function is running on a different CPU
+ */
+static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
{
struct timer_base *base;
unsigned long flags;
@@ -1285,11 +1415,34 @@ int try_to_del_timer_sync(struct timer_list *timer)
if (base->running_timer != timer)
ret = detach_if_pending(timer, base, true);
+ if (shutdown)
+ timer->function = NULL;
raw_spin_unlock_irqrestore(&base->lock, flags);
return ret;
}
+
+/**
+ * try_to_del_timer_sync - Try to deactivate a timer
+ * @timer: Timer to deactivate
+ *
+ * This function tries to deactivate a timer. On success the timer is not
+ * queued and the timer callback function is not running on any CPU.
+ *
+ * This function does not guarantee that the timer cannot be rearmed right
+ * after dropping the base lock. That needs to be prevented by the calling
+ * code if necessary.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
+ * * %-1 - The timer callback function is running on a different CPU
+ */
+int try_to_del_timer_sync(struct timer_list *timer)
+{
+ return __try_to_del_timer_sync(timer, false);
+}
EXPORT_SYMBOL(try_to_del_timer_sync);
#ifdef CONFIG_PREEMPT_RT
@@ -1365,44 +1518,29 @@ static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/**
- * del_timer_sync - deactivate a timer and wait for the handler to finish.
- * @timer: the timer to be deactivated
- *
- * This function only differs from del_timer() on SMP: besides deactivating
- * the timer it also makes sure the handler has finished executing on other
- * CPUs.
- *
- * Synchronization rules: Callers must prevent restarting of the timer,
- * otherwise this function is meaningless. It must not be called from
- * interrupt contexts unless the timer is an irqsafe one. The caller must
- * not hold locks which would prevent completion of the timer's
- * handler. The timer's handler must not call add_timer_on(). Upon exit the
- * timer is not queued and the handler is not running on any CPU.
- *
- * Note: For !irqsafe timers, you must not hold locks that are held in
- * interrupt context while calling this function. Even if the lock has
- * nothing to do with the timer in question. Here's why::
- *
- * CPU0 CPU1
- * ---- ----
- * <SOFTIRQ>
- * call_timer_fn();
- * base->running_timer = mytimer;
- * spin_lock_irq(somelock);
- * <IRQ>
- * spin_lock(somelock);
- * del_timer_sync(mytimer);
- * while (base->running_timer == mytimer);
- *
- * Now del_timer_sync() will never return and never release somelock.
- * The interrupt on the other CPU is waiting to grab somelock but
- * it has interrupted the softirq that CPU0 is waiting to finish.
- *
- * The function returns whether it has deactivated a pending timer or not.
+ * __timer_delete_sync - Internal function: Deactivate a timer and wait
+ * for the handler to finish.
+ * @timer: The timer to be deactivated
+ * @shutdown: If true, @timer->function will be set to NULL under the
+ * timer base lock which prevents rearming of @timer
+ *
+ * If @shutdown is not set the timer can be rearmed later. If the timer can
+ * be rearmed concurrently, i.e. after dropping the base lock then the
+ * return value is meaningless.
+ *
+ * If @shutdown is set then @timer->function is set to NULL under timer
+ * base lock which prevents rearming of the timer. Any attempt to rearm
+ * a shutdown timer is silently ignored.
+ *
+ * If the timer should be reused after shutdown it has to be initialized
+ * again.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
*/
-int del_timer_sync(struct timer_list *timer)
+static int __timer_delete_sync(struct timer_list *timer, bool shutdown)
{
int ret;
@@ -1422,7 +1560,7 @@ int del_timer_sync(struct timer_list *timer)
* don't use it in hardirq context, because it
* could lead to deadlock.
*/
- WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
+ WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE));
/*
* Must be able to sleep on PREEMPT_RT because of the slowpath in
@@ -1432,7 +1570,7 @@ int del_timer_sync(struct timer_list *timer)
lockdep_assert_preemption_enabled();
do {
- ret = try_to_del_timer_sync(timer);
+ ret = __try_to_del_timer_sync(timer, shutdown);
if (unlikely(ret < 0)) {
del_timer_wait_running(timer);
@@ -1442,8 +1580,96 @@ int del_timer_sync(struct timer_list *timer)
return ret;
}
-EXPORT_SYMBOL(del_timer_sync);
-#endif
+
+/**
+ * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
+ * @timer: The timer to be deactivated
+ *
+ * Synchronization rules: Callers must prevent restarting of the timer,
+ * otherwise this function is meaningless. It must not be called from
+ * interrupt contexts unless the timer is an irqsafe one. The caller must
+ * not hold locks which would prevent completion of the timer's callback
+ * function. The timer's handler must not call add_timer_on(). Upon exit
+ * the timer is not queued and the handler is not running on any CPU.
+ *
+ * For !irqsafe timers, the caller must not hold locks that are held in
+ * interrupt context. Even if the lock has nothing to do with the timer in
+ * question. Here's why::
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * <SOFTIRQ>
+ * call_timer_fn();
+ * base->running_timer = mytimer;
+ * spin_lock_irq(somelock);
+ * <IRQ>
+ * spin_lock(somelock);
+ * timer_delete_sync(mytimer);
+ * while (base->running_timer == mytimer);
+ *
+ * Now timer_delete_sync() will never return and never release somelock.
+ * The interrupt on the other CPU is waiting to grab somelock but it has
+ * interrupted the softirq that CPU0 is waiting to finish.
+ *
+ * This function cannot guarantee that the timer is not rearmed again by
+ * some concurrent or preempting code, right after it dropped the base
+ * lock. If there is the possibility of a concurrent rearm then the return
+ * value of the function is meaningless.
+ *
+ * If such a guarantee is needed, e.g. for teardown situations then use
+ * timer_shutdown_sync() instead.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
+ */
+int timer_delete_sync(struct timer_list *timer)
+{
+ return __timer_delete_sync(timer, false);
+}
+EXPORT_SYMBOL(timer_delete_sync);
+
+/**
+ * timer_shutdown_sync - Shutdown a timer and prevent rearming
+ * @timer: The timer to be shutdown
+ *
+ * When the function returns it is guaranteed that:
+ * - @timer is not queued
+ * - The callback function of @timer is not running
+ * - @timer cannot be enqueued again. Any attempt to rearm
+ * @timer is silently ignored.
+ *
+ * See timer_delete_sync() for synchronization rules.
+ *
+ * This function is useful for final teardown of an infrastructure where
+ * the timer is subject to a circular dependency problem.
+ *
+ * A common pattern for this is a timer and a workqueue where the timer can
+ * schedule work and work can arm the timer. On shutdown the workqueue must
+ * be destroyed and the timer must be prevented from rearming. Unless the
+ * code has conditionals like 'if (mything->in_shutdown)' to prevent that
+ * there is no way to get this correct with timer_delete_sync().
+ *
+ * timer_shutdown_sync() is solving the problem. The correct ordering of
+ * calls in this case is:
+ *
+ * timer_shutdown_sync(&mything->timer);
+ * workqueue_destroy(&mything->workqueue);
+ *
+ * After this 'mything' can be safely freed.
+ *
+ * This obviously implies that the timer is not required to be functional
+ * for the rest of the shutdown operation.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending
+ */
+int timer_shutdown_sync(struct timer_list *timer)
+{
+ return __timer_delete_sync(timer, true);
+}
+EXPORT_SYMBOL_GPL(timer_shutdown_sync);
static void call_timer_fn(struct timer_list *timer,
void (*fn)(struct timer_list *),
@@ -1465,8 +1691,8 @@ static void call_timer_fn(struct timer_list *timer,
#endif
/*
* Couple the lock chain with the lock chain at
- * del_timer_sync() by acquiring the lock_map around the fn()
- * call here and in del_timer_sync().
+ * timer_delete_sync() by acquiring the lock_map around the fn()
+ * call here and in timer_delete_sync().
*/
lock_map_acquire(&lockdep_map);
@@ -1509,6 +1735,12 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
fn = timer->function;
+ if (WARN_ON_ONCE(!fn)) {
+ /* Should never happen. Emphasis on should! */
+ base->running_timer = NULL;
+ continue;
+ }
+
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn, baseclk);
@@ -1933,7 +2165,7 @@ signed long __sched schedule_timeout(signed long timeout)
timer_setup_on_stack(&timer.timer, process_timeout, 0);
__mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
schedule();
- del_singleshot_timer_sync(&timer.timer);
+ del_timer_sync(&timer.timer);
/* Remove the timer from the object tracker */
destroy_timer_on_stack(&timer.timer);
@@ -2017,8 +2249,6 @@ int timers_dead_cpu(unsigned int cpu)
struct timer_base *new_base;
int b, i;
- BUG_ON(cpu_online(cpu));
-
for (b = 0; b < NR_BASES; b++) {
old_base = per_cpu_ptr(&timer_bases[b], cpu);
new_base = get_cpu_ptr(&timer_bases[b]);
@@ -2035,7 +2265,8 @@ int timers_dead_cpu(unsigned int cpu)
*/
forward_timer_base(new_base);
- BUG_ON(old_base->running_timer);
+ WARN_ON_ONCE(old_base->running_timer);
+ old_base->running_timer = NULL;
for (i = 0; i < WHEEL_SIZE; i++)
migrate_timer_list(new_base, old_base->vectors + i);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1052126bdca2..197545241ab8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -46,10 +46,16 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS
bool
help
If this is set, then arguments and stack can be found from
- the pt_regs passed into the function callback regs parameter
+ the ftrace_regs passed into the function callback regs parameter
by default, even without setting the REGS flag in the ftrace_ops.
- This allows for use of regs_get_kernel_argument() and
- kernel_stack_pointer().
+ This allows for use of ftrace_regs_get_argument() and
+ ftrace_regs_get_stack_pointer().
+
+config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
+ bool
+ help
+ If the architecture generates __patchable_function_entries sections
+ but does not want them included in the ftrace locations.
config HAVE_FTRACE_MCOUNT_RECORD
bool
@@ -76,6 +82,13 @@ config HAVE_OBJTOOL_MCOUNT
help
Arch supports objtool --mcount
+config HAVE_OBJTOOL_NOP_MCOUNT
+ bool
+ help
+ Arch supports the objtool options --mcount with --mnop.
+ An architecture can select this if it wants to enable nop'ing
+ of ftrace locations.
+
config HAVE_C_RECORDMCOUNT
bool
help
@@ -369,6 +382,7 @@ config SCHED_TRACER
config HWLAT_TRACER
bool "Tracer to detect hardware latencies (like SMIs)"
select GENERIC_TRACER
+ select TRACER_MAX_TRACE
help
This tracer, when enabled will create one or more kernel threads,
depending on what the cpumask file is set to, which each thread
@@ -404,6 +418,7 @@ config HWLAT_TRACER
config OSNOISE_TRACER
bool "OS Noise tracer"
select GENERIC_TRACER
+ select TRACER_MAX_TRACE
help
In the context of high-performance computing (HPC), the Operating
System Noise (osnoise) refers to the interference experienced by an
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7f5eb295fe19..918a7d12df8f 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -346,8 +346,40 @@ static void put_probe_ref(void)
mutex_unlock(&blk_probe_mutex);
}
+static int blk_trace_start(struct blk_trace *bt)
+{
+ if (bt->trace_state != Blktrace_setup &&
+ bt->trace_state != Blktrace_stopped)
+ return -EINVAL;
+
+ blktrace_seq++;
+ smp_mb();
+ bt->trace_state = Blktrace_running;
+ raw_spin_lock_irq(&running_trace_lock);
+ list_add(&bt->running_list, &running_trace_list);
+ raw_spin_unlock_irq(&running_trace_lock);
+ trace_note_time(bt);
+
+ return 0;
+}
+
+static int blk_trace_stop(struct blk_trace *bt)
+{
+ if (bt->trace_state != Blktrace_running)
+ return -EINVAL;
+
+ bt->trace_state = Blktrace_stopped;
+ raw_spin_lock_irq(&running_trace_lock);
+ list_del_init(&bt->running_list);
+ raw_spin_unlock_irq(&running_trace_lock);
+ relay_flush(bt->rchan);
+
+ return 0;
+}
+
static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt)
{
+ blk_trace_stop(bt);
synchronize_rcu();
blk_trace_free(q, bt);
put_probe_ref();
@@ -362,8 +394,7 @@ static int __blk_trace_remove(struct request_queue *q)
if (!bt)
return -EINVAL;
- if (bt->trace_state != Blktrace_running)
- blk_trace_cleanup(q, bt);
+ blk_trace_cleanup(q, bt);
return 0;
}
@@ -658,7 +689,6 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
static int __blk_trace_startstop(struct request_queue *q, int start)
{
- int ret;
struct blk_trace *bt;
bt = rcu_dereference_protected(q->blk_trace,
@@ -666,36 +696,10 @@ static int __blk_trace_startstop(struct request_queue *q, int start)
if (bt == NULL)
return -EINVAL;
- /*
- * For starting a trace, we can transition from a setup or stopped
- * trace. For stopping a trace, the state must be running
- */
- ret = -EINVAL;
- if (start) {
- if (bt->trace_state == Blktrace_setup ||
- bt->trace_state == Blktrace_stopped) {
- blktrace_seq++;
- smp_mb();
- bt->trace_state = Blktrace_running;
- raw_spin_lock_irq(&running_trace_lock);
- list_add(&bt->running_list, &running_trace_list);
- raw_spin_unlock_irq(&running_trace_lock);
-
- trace_note_time(bt);
- ret = 0;
- }
- } else {
- if (bt->trace_state == Blktrace_running) {
- bt->trace_state = Blktrace_stopped;
- raw_spin_lock_irq(&running_trace_lock);
- list_del_init(&bt->running_list);
- raw_spin_unlock_irq(&running_trace_lock);
- relay_flush(bt->rchan);
- ret = 0;
- }
- }
-
- return ret;
+ if (start)
+ return blk_trace_start(bt);
+ else
+ return blk_trace_stop(bt);
}
int blk_trace_startstop(struct request_queue *q, int start)
@@ -717,7 +721,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop);
*/
/**
- * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * blk_trace_ioctl - handle the ioctls associated with tracing
* @bdev: the block device
* @cmd: the ioctl cmd
* @arg: the argument data, if any
@@ -765,17 +769,15 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
}
/**
- * blk_trace_shutdown: - stop and cleanup trace structures
+ * blk_trace_shutdown - stop and cleanup trace structures
* @q: the request queue associated with the device
*
**/
void blk_trace_shutdown(struct request_queue *q)
{
if (rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->debugfs_mutex))) {
- __blk_trace_startstop(q, 0);
+ lockdep_is_held(&q->debugfs_mutex)))
__blk_trace_remove(q);
- }
}
#ifdef CONFIG_BLK_CGROUP
@@ -1546,7 +1548,8 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
{
- if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+ if ((iter->ent->type != TRACE_BLK) ||
+ !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
return TRACE_TYPE_UNHANDLED;
return print_one_line(iter, true);
@@ -1614,13 +1617,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (bt == NULL)
return -EINVAL;
- if (bt->trace_state == Blktrace_running) {
- bt->trace_state = Blktrace_stopped;
- raw_spin_lock_irq(&running_trace_lock);
- list_del_init(&bt->running_list);
- raw_spin_unlock_irq(&running_trace_lock);
- relay_flush(bt->rchan);
- }
+ blk_trace_stop(bt);
put_probe_ref();
synchronize_rcu();
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 68e5cdd24cef..3bbd3f0c810c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -6,6 +6,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
#include <linux/bpf_perf_event.h>
#include <linux/btf.h>
#include <linux/filter.h>
@@ -20,6 +21,8 @@
#include <linux/fprobe.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
+#include <linux/key.h>
+#include <linux/verification.h>
#include <net/bpf_sk_storage.h>
@@ -685,6 +688,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
perf_sample_data_init(sd, 0, 0);
sd->raw = &raw;
+ sd->sample_flags |= PERF_SAMPLE_RAW;
err = __bpf_perf_event_output(regs, map, flags, sd);
@@ -743,6 +747,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0);
sd->raw = &raw;
+ sd->sample_flags |= PERF_SAMPLE_RAW;
ret = __bpf_perf_event_output(regs, map, flags, sd);
out:
@@ -769,7 +774,7 @@ BPF_CALL_0(bpf_get_current_task_btf)
const struct bpf_func_proto bpf_get_current_task_btf_proto = {
.func = bpf_get_current_task_btf,
.gpl_only = true,
- .ret_type = RET_PTR_TO_BTF_ID,
+ .ret_type = RET_PTR_TO_BTF_ID_TRUSTED,
.ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
@@ -1026,11 +1031,30 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
.arg1_type = ARG_PTR_TO_CTX,
};
+#ifdef CONFIG_X86_KERNEL_IBT
+static unsigned long get_entry_ip(unsigned long fentry_ip)
+{
+ u32 instr;
+
+ /* Being extra safe in here in case entry ip is on the page-edge. */
+ if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1))
+ return fentry_ip;
+ if (is_endbr(instr))
+ fentry_ip -= ENDBR_INSN_SIZE;
+ return fentry_ip;
+}
+#else
+#define get_entry_ip(fentry_ip) fentry_ip
+#endif
+
BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
{
struct kprobe *kp = kprobe_running();
- return kp ? (uintptr_t)kp->addr : 0;
+ if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY))
+ return 0;
+
+ return get_entry_ip((uintptr_t)kp->addr);
}
static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
@@ -1181,6 +1205,184 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+#ifdef CONFIG_KEYS
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "kfuncs which will be used in BPF programs");
+
+/**
+ * bpf_lookup_user_key - lookup a key by its serial
+ * @serial: key handle serial number
+ * @flags: lookup-specific flags
+ *
+ * Search a key with a given *serial* and the provided *flags*.
+ * If found, increment the reference count of the key by one, and
+ * return it in the bpf_key structure.
+ *
+ * The bpf_key structure must be passed to bpf_key_put() when done
+ * with it, so that the key reference count is decremented and the
+ * bpf_key structure is freed.
+ *
+ * Permission checks are deferred to the time the key is used by
+ * one of the available key-specific kfuncs.
+ *
+ * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
+ * special keyring (e.g. session keyring), if it doesn't yet exist.
+ * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
+ * for the key construction, and to retrieve uninstantiated keys (keys
+ * without data attached to them).
+ *
+ * Return: a bpf_key pointer with a valid key pointer if the key is found, a
+ * NULL pointer otherwise.
+ */
+struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
+{
+ key_ref_t key_ref;
+ struct bpf_key *bkey;
+
+ if (flags & ~KEY_LOOKUP_ALL)
+ return NULL;
+
+ /*
+ * Permission check is deferred until the key is used, as the
+ * intent of the caller is unknown here.
+ */
+ key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
+ if (IS_ERR(key_ref))
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
+ if (!bkey) {
+ key_put(key_ref_to_ptr(key_ref));
+ return NULL;
+ }
+
+ bkey->key = key_ref_to_ptr(key_ref);
+ bkey->has_ref = true;
+
+ return bkey;
+}
+
+/**
+ * bpf_lookup_system_key - lookup a key by a system-defined ID
+ * @id: key ID
+ *
+ * Obtain a bpf_key structure with a key pointer set to the passed key ID.
+ * The key pointer is marked as invalid, to prevent bpf_key_put() from
+ * attempting to decrement the key reference count on that pointer. The key
+ * pointer set in such way is currently understood only by
+ * verify_pkcs7_signature().
+ *
+ * Set *id* to one of the values defined in include/linux/verification.h:
+ * 0 for the primary keyring (immutable keyring of system keys);
+ * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
+ * (where keys can be added only if they are vouched for by existing keys
+ * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
+ * keyring (primarily used by the integrity subsystem to verify a kexec'ed
+ * kerned image and, possibly, the initramfs signature).
+ *
+ * Return: a bpf_key pointer with an invalid key pointer set from the
+ * pre-determined ID on success, a NULL pointer otherwise
+ */
+struct bpf_key *bpf_lookup_system_key(u64 id)
+{
+ struct bpf_key *bkey;
+
+ if (system_keyring_id_check(id) < 0)
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
+ if (!bkey)
+ return NULL;
+
+ bkey->key = (struct key *)(unsigned long)id;
+ bkey->has_ref = false;
+
+ return bkey;
+}
+
+/**
+ * bpf_key_put - decrement key reference count if key is valid and free bpf_key
+ * @bkey: bpf_key structure
+ *
+ * Decrement the reference count of the key inside *bkey*, if the pointer
+ * is valid, and free *bkey*.
+ */
+void bpf_key_put(struct bpf_key *bkey)
+{
+ if (bkey->has_ref)
+ key_put(bkey->key);
+
+ kfree(bkey);
+}
+
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+/**
+ * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
+ * @data_ptr: data to verify
+ * @sig_ptr: signature of the data
+ * @trusted_keyring: keyring with keys trusted for signature verification
+ *
+ * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
+ * with keys in a keyring referenced by *trusted_keyring*.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
+ struct bpf_dynptr_kern *sig_ptr,
+ struct bpf_key *trusted_keyring)
+{
+ int ret;
+
+ if (trusted_keyring->has_ref) {
+ /*
+ * Do the permission check deferred in bpf_lookup_user_key().
+ * See bpf_lookup_user_key() for more details.
+ *
+ * A call to key_task_permission() here would be redundant, as
+ * it is already done by keyring_search() called by
+ * find_asymmetric_key().
+ */
+ ret = key_validate(trusted_keyring->key);
+ if (ret < 0)
+ return ret;
+ }
+
+ return verify_pkcs7_signature(data_ptr->data,
+ bpf_dynptr_get_size(data_ptr),
+ sig_ptr->data,
+ bpf_dynptr_get_size(sig_ptr),
+ trusted_keyring->key,
+ VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
+ NULL);
+}
+#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+
+__diag_pop();
+
+BTF_SET8_START(key_sig_kfunc_set)
+BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
+#endif
+BTF_SET8_END(key_sig_kfunc_set)
+
+static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &key_sig_kfunc_set,
+};
+
+static int __init bpf_key_sig_kfuncs_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_key_sig_kfunc_set);
+}
+
+late_initcall(bpf_key_sig_kfuncs_init);
+#endif /* CONFIG_KEYS */
+
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1255,6 +1457,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_get_current_ancestor_cgroup_id:
return &bpf_get_current_ancestor_cgroup_id_proto;
+ case BPF_FUNC_cgrp_storage_get:
+ return &bpf_cgrp_storage_get_proto;
+ case BPF_FUNC_cgrp_storage_delete:
+ return &bpf_cgrp_storage_delete_proto;
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
@@ -1279,9 +1485,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_task_stack:
return &bpf_get_task_stack_proto;
case BPF_FUNC_copy_from_user:
- return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
+ return &bpf_copy_from_user_proto;
case BPF_FUNC_copy_from_user_task:
- return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL;
+ return &bpf_copy_from_user_task_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_per_cpu_ptr:
@@ -1289,8 +1495,12 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
case BPF_FUNC_task_storage_get:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_get_recur_proto;
return &bpf_task_storage_get_proto;
case BPF_FUNC_task_storage_delete:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_delete_recur_proto;
return &bpf_task_storage_delete_proto;
case BPF_FUNC_for_each_map_elem:
return &bpf_for_each_map_elem_proto;
@@ -1507,6 +1717,9 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE))
return -EINVAL;
+ if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK)))
+ return -ENOENT;
+
if (unlikely(!br_stack))
return -ENOENT;
@@ -2042,9 +2255,15 @@ static __always_inline
void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{
cant_sleep();
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
+ goto out;
+ }
rcu_read_lock();
(void) bpf_prog_run(prog, args);
rcu_read_unlock();
+out:
+ this_cpu_dec(*(prog->active));
}
#define UNPACK(...) __VA_ARGS__
@@ -2242,6 +2461,8 @@ struct bpf_kprobe_multi_link {
unsigned long *addrs;
u64 *cookies;
u32 cnt;
+ u32 mods_cnt;
+ struct module **mods;
};
struct bpf_kprobe_multi_run_ctx {
@@ -2297,6 +2518,14 @@ error:
return err;
}
+static void kprobe_multi_put_modules(struct module **mods, u32 cnt)
+{
+ u32 i;
+
+ for (i = 0; i < cnt; i++)
+ module_put(mods[i]);
+}
+
static void free_user_syms(struct user_syms *us)
{
kvfree(us->syms);
@@ -2309,6 +2538,7 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link)
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
unregister_fprobe(&kmulti_link->fp);
+ kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt);
}
static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
@@ -2318,6 +2548,7 @@ static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
kvfree(kmulti_link->addrs);
kvfree(kmulti_link->cookies);
+ kfree(kmulti_link->mods);
kfree(kmulti_link);
}
@@ -2340,7 +2571,7 @@ static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void
swap(*cookie_a, *cookie_b);
}
-static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b)
+static int bpf_kprobe_multi_addrs_cmp(const void *a, const void *b)
{
const unsigned long *addr_a = a, *addr_b = b;
@@ -2351,7 +2582,7 @@ static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b)
static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv)
{
- return __bpf_kprobe_multi_cookie_cmp(a, b);
+ return bpf_kprobe_multi_addrs_cmp(a, b);
}
static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
@@ -2369,7 +2600,7 @@ static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
return 0;
entry_ip = run_ctx->entry_ip;
addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip),
- __bpf_kprobe_multi_cookie_cmp);
+ bpf_kprobe_multi_addrs_cmp);
if (!addr)
return 0;
cookie = link->cookies + (addr - link->addrs);
@@ -2414,13 +2645,13 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
}
static void
-kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
+kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
struct pt_regs *regs)
{
struct bpf_kprobe_multi_link *link;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- kprobe_multi_link_prog_run(link, entry_ip, regs);
+ kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
}
static int symbols_cmp_r(const void *a, const void *b, const void *priv)
@@ -2453,6 +2684,71 @@ static void symbols_swap_r(void *a, void *b, int size, const void *priv)
}
}
+struct module_addr_args {
+ unsigned long *addrs;
+ u32 addrs_cnt;
+ struct module **mods;
+ int mods_cnt;
+ int mods_cap;
+};
+
+static int module_callback(void *data, const char *name,
+ struct module *mod, unsigned long addr)
+{
+ struct module_addr_args *args = data;
+ struct module **mods;
+
+ /* We iterate all modules symbols and for each we:
+ * - search for it in provided addresses array
+ * - if found we check if we already have the module pointer stored
+ * (we iterate modules sequentially, so we can check just the last
+ * module pointer)
+ * - take module reference and store it
+ */
+ if (!bsearch(&addr, args->addrs, args->addrs_cnt, sizeof(addr),
+ bpf_kprobe_multi_addrs_cmp))
+ return 0;
+
+ if (args->mods && args->mods[args->mods_cnt - 1] == mod)
+ return 0;
+
+ if (args->mods_cnt == args->mods_cap) {
+ args->mods_cap = max(16, args->mods_cap * 3 / 2);
+ mods = krealloc_array(args->mods, args->mods_cap, sizeof(*mods), GFP_KERNEL);
+ if (!mods)
+ return -ENOMEM;
+ args->mods = mods;
+ }
+
+ if (!try_module_get(mod))
+ return -EINVAL;
+
+ args->mods[args->mods_cnt] = mod;
+ args->mods_cnt++;
+ return 0;
+}
+
+static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt)
+{
+ struct module_addr_args args = {
+ .addrs = addrs,
+ .addrs_cnt = addrs_cnt,
+ };
+ int err;
+
+ /* We return either err < 0 in case of error, ... */
+ err = module_kallsyms_on_each_symbol(module_callback, &args);
+ if (err) {
+ kprobe_multi_put_modules(args.mods, args.mods_cnt);
+ kfree(args.mods);
+ return err;
+ }
+
+ /* or number of modules found if everything is ok. */
+ *mods = args.mods;
+ return args.mods_cnt;
+}
+
int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_kprobe_multi_link *link = NULL;
@@ -2563,10 +2859,25 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
bpf_kprobe_multi_cookie_cmp,
bpf_kprobe_multi_cookie_swap,
link);
+ } else {
+ /*
+ * We need to sort addrs array even if there are no cookies
+ * provided, to allow bsearch in get_modules_for_addrs.
+ */
+ sort(addrs, cnt, sizeof(*addrs),
+ bpf_kprobe_multi_addrs_cmp, NULL);
+ }
+
+ err = get_modules_for_addrs(&link->mods, addrs, cnt);
+ if (err < 0) {
+ bpf_link_cleanup(&link_primer);
+ return err;
}
+ link->mods_cnt = err;
err = register_fprobe_ips(&link->fp, addrs, cnt);
if (err) {
+ kprobe_multi_put_modules(link->mods, link->mods_cnt);
bpf_link_cleanup(&link_primer);
return err;
}
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index aac63ca9c3d1..e8143e368074 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -141,6 +141,8 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
return -E2BIG;
fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
+ if (!fp->rethook)
+ return -ENOMEM;
for (i = 0; i < size; i++) {
struct fprobe_rethook_node *node;
@@ -301,7 +303,8 @@ int unregister_fprobe(struct fprobe *fp)
{
int ret;
- if (!fp || fp->ops.func != fprobe_handler)
+ if (!fp || (fp->ops.saved_func != fprobe_handler &&
+ fp->ops.saved_func != fprobe_kprobe_handler))
return -EINVAL;
/*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 439e2ab6905e..442438b93fe9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -163,7 +163,7 @@ static void ftrace_sync_ipi(void *data)
static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
{
/*
- * If this is a dynamic, RCU, or per CPU ops, or we force list func,
+ * If this is a dynamic or RCU ops, or we force list func,
* then it needs to call the list anyway.
*/
if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) ||
@@ -1289,6 +1289,7 @@ static int ftrace_add_mod(struct trace_array *tr,
if (!ftrace_mod)
return -ENOMEM;
+ INIT_LIST_HEAD(&ftrace_mod->list);
ftrace_mod->func = kstrdup(func, GFP_KERNEL);
ftrace_mod->module = kstrdup(module, GFP_KERNEL);
ftrace_mod->enable = enable;
@@ -1644,6 +1645,18 @@ ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_ex
static struct ftrace_ops *
ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
+static bool skip_record(struct dyn_ftrace *rec)
+{
+ /*
+ * At boot up, weak functions are set to disable. Function tracing
+ * can be enabled before they are, and they still need to be disabled now.
+ * If the record is disabled, still continue if it is marked as already
+ * enabled (this is needed to keep the accounting working).
+ */
+ return rec->flags & FTRACE_FL_DISABLED &&
+ !(rec->flags & FTRACE_FL_ENABLED);
+}
+
static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1693,7 +1706,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
int in_hash = 0;
int match = 0;
- if (rec->flags & FTRACE_FL_DISABLED)
+ if (skip_record(rec))
continue;
if (all) {
@@ -2016,7 +2029,6 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
static void print_ip_ins(const char *fmt, const unsigned char *p)
{
char ins[MCOUNT_INSN_SIZE];
- int i;
if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) {
printk(KERN_CONT "%s[FAULT] %px\n", fmt, p);
@@ -2024,9 +2036,7 @@ static void print_ip_ins(const char *fmt, const unsigned char *p)
}
printk(KERN_CONT "%s", fmt);
-
- for (i = 0; i < MCOUNT_INSN_SIZE; i++)
- printk(KERN_CONT "%s%02x", i ? ":" : "", ins[i]);
+ pr_cont("%*phC", MCOUNT_INSN_SIZE, ins);
}
enum ftrace_bug_type ftrace_bug_type;
@@ -2126,7 +2136,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
ftrace_bug_type = FTRACE_BUG_UNKNOWN;
- if (rec->flags & FTRACE_FL_DISABLED)
+ if (skip_record(rec))
return FTRACE_UPDATE_IGNORE;
/*
@@ -2241,7 +2251,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
if (update) {
/* If there's no more users, clear all flags */
if (!ftrace_rec_count(rec))
- rec->flags = 0;
+ rec->flags &= FTRACE_FL_DISABLED;
else
/*
* Just disable the record, but keep the ops TRAMP
@@ -2478,14 +2488,13 @@ ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
- struct pt_regs *regs = ftrace_get_regs(fregs);
unsigned long addr;
addr = ftrace_find_rec_direct(ip);
if (!addr)
return;
- arch_ftrace_set_direct_caller(regs, addr);
+ arch_ftrace_set_direct_caller(fregs, addr);
}
struct ftrace_ops direct_ops = {
@@ -2634,7 +2643,7 @@ void __weak ftrace_replace_code(int mod_flags)
do_for_each_ftrace_rec(pg, rec) {
- if (rec->flags & FTRACE_FL_DISABLED)
+ if (skip_record(rec))
continue;
failed = __ftrace_replace_code(rec, enable);
@@ -2753,6 +2762,19 @@ void __weak ftrace_arch_code_modify_post_process(void)
{
}
+static int update_ftrace_func(ftrace_func_t func)
+{
+ static ftrace_func_t save_func;
+
+ /* Avoid updating if it hasn't changed */
+ if (func == save_func)
+ return 0;
+
+ save_func = func;
+
+ return ftrace_update_ftrace_func(func);
+}
+
void ftrace_modify_all_code(int command)
{
int update = command & FTRACE_UPDATE_TRACE_FUNC;
@@ -2773,7 +2795,7 @@ void ftrace_modify_all_code(int command)
* traced.
*/
if (update) {
- err = ftrace_update_ftrace_func(ftrace_ops_list_func);
+ err = update_ftrace_func(ftrace_ops_list_func);
if (FTRACE_WARN_ON(err))
return;
}
@@ -2789,7 +2811,7 @@ void ftrace_modify_all_code(int command)
/* If irqs are disabled, we are in stop machine */
if (!irqs_disabled())
smp_call_function(ftrace_sync_ipi, NULL, 1);
- err = ftrace_update_ftrace_func(ftrace_trace_function);
+ err = update_ftrace_func(ftrace_trace_function);
if (FTRACE_WARN_ON(err))
return;
}
@@ -3019,18 +3041,8 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
command |= FTRACE_UPDATE_TRACE_FUNC;
}
- if (!command || !ftrace_enabled) {
- /*
- * If these are dynamic or per_cpu ops, they still
- * need their data freed. Since, function tracing is
- * not currently active, we can just free them
- * without synchronizing all CPUs.
- */
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
- goto free_ops;
-
- return 0;
- }
+ if (!command || !ftrace_enabled)
+ goto out;
/*
* If the ops uses a trampoline, then it needs to be
@@ -3067,11 +3079,10 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
removed_ops = NULL;
ops->flags &= ~FTRACE_OPS_FL_REMOVING;
+out:
/*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
- * The same goes for freeing the per_cpu data of the per_cpu
- * ops.
*/
if (ops->flags & FTRACE_OPS_FL_DYNAMIC) {
/*
@@ -3094,7 +3105,6 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (IS_ENABLED(CONFIG_PREEMPTION))
synchronize_rcu_tasks();
- free_ops:
ftrace_trampoline_free(ops);
}
@@ -3191,7 +3201,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
/* if we can't allocate this size, try something smaller */
if (!order)
return -ENOMEM;
- order >>= 1;
+ order--;
goto again;
}
@@ -4193,6 +4203,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
}
found = 1;
}
+ cond_resched();
} while_for_each_ftrace_rec();
out_unlock:
mutex_unlock(&ftrace_lock);
@@ -5427,6 +5438,8 @@ static struct ftrace_ops stub_ops = {
* it is safe to modify the ftrace record, where it should be
* currently calling @old_addr directly, to call @new_addr.
*
+ * This is called with direct_mutex locked.
+ *
* Safety checks should be made to make sure that the code at
* @rec->ip is currently calling @old_addr. And this must
* also update entry->direct to @new_addr.
@@ -5439,6 +5452,8 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
unsigned long ip = rec->ip;
int ret;
+ lockdep_assert_held(&direct_mutex);
+
/*
* The ftrace_lock was used to determine if the record
* had more than one registered user to it. If it did,
@@ -5461,7 +5476,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
if (ret)
goto out_lock;
- ret = register_ftrace_function(&stub_ops);
+ ret = register_ftrace_function_nolock(&stub_ops);
if (ret) {
ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
goto out_lock;
@@ -6081,8 +6096,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
if (filter_hash) {
orig_hash = &iter->ops->func_hash->filter_hash;
- if (iter->tr && !list_empty(&iter->tr->mod_trace))
- iter->hash->flags |= FTRACE_HASH_FL_MOD;
+ if (iter->tr) {
+ if (list_empty(&iter->tr->mod_trace))
+ iter->hash->flags &= ~FTRACE_HASH_FL_MOD;
+ else
+ iter->hash->flags |= FTRACE_HASH_FL_MOD;
+ }
} else
orig_hash = &iter->ops->func_hash->notrace_hash;
@@ -7384,7 +7403,7 @@ void __init ftrace_init(void)
}
pr_info("ftrace: allocating %ld entries in %ld pages\n",
- count, count / ENTRIES_PER_PAGE + 1);
+ count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
ret = ftrace_process_locs(NULL,
__start_mcount_loc,
@@ -7511,8 +7530,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
/*
* Check the following for each ops before calling their func:
* if RCU flag is set, then rcu_is_watching() must be true
- * if PER_CPU is set, then ftrace_function_local_disable()
- * must be false
* Otherwise test if the ip matches the ops filter
*
* If any of the above fails then the op->func() is not executed.
@@ -7562,8 +7579,8 @@ NOKPROBE_SYMBOL(arch_ftrace_ops_list_func);
/*
* If there's only one function registered but it does not support
- * recursion, needs RCU protection and/or requires per cpu handling, then
- * this function will be called by the mcount trampoline.
+ * recursion, needs RCU protection, then this function will be called
+ * by the mcount trampoline.
*/
static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
@@ -8250,6 +8267,10 @@ struct kallsyms_data {
size_t found;
};
+/* This function gets called for all kernel and module symbols
+ * and returns 1 in case we resolved all the requested symbols,
+ * 0 otherwise.
+ */
static int kallsyms_callback(void *data, const char *name,
struct module *mod, unsigned long addr)
{
@@ -8265,8 +8286,7 @@ static int kallsyms_callback(void *data, const char *name,
if (args->addrs[idx])
return 0;
- addr = ftrace_location(addr);
- if (!addr)
+ if (!ftrace_location(addr))
return 0;
args->addrs[idx] = addr;
@@ -8293,17 +8313,19 @@ static int kallsyms_callback(void *data, const char *name,
int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs)
{
struct kallsyms_data args;
- int err;
+ int found_all;
memset(addrs, 0, sizeof(*addrs) * cnt);
args.addrs = addrs;
args.syms = sorted_syms;
args.cnt = cnt;
args.found = 0;
- err = kallsyms_on_each_symbol(kallsyms_callback, &args);
- if (err < 0)
- return err;
- return args.found == args.cnt ? 0 : -ESRCH;
+
+ found_all = kallsyms_on_each_symbol(kallsyms_callback, &args);
+ if (found_all)
+ return 0;
+ found_all = module_kallsyms_on_each_symbol(kallsyms_callback, &args);
+ return found_all ? 0 : -ESRCH;
}
#ifdef CONFIG_SYSCTL
diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c
index 18b0f1cbb947..c736487fc0e4 100644
--- a/kernel/trace/kprobe_event_gen_test.c
+++ b/kernel/trace/kprobe_event_gen_test.c
@@ -35,6 +35,49 @@
static struct trace_event_file *gen_kprobe_test;
static struct trace_event_file *gen_kretprobe_test;
+#define KPROBE_GEN_TEST_FUNC "do_sys_open"
+
+/* X86 */
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_32)
+#define KPROBE_GEN_TEST_ARG0 "dfd=%ax"
+#define KPROBE_GEN_TEST_ARG1 "filename=%dx"
+#define KPROBE_GEN_TEST_ARG2 "flags=%cx"
+#define KPROBE_GEN_TEST_ARG3 "mode=+4($stack)"
+
+/* ARM64 */
+#elif defined(CONFIG_ARM64)
+#define KPROBE_GEN_TEST_ARG0 "dfd=%x0"
+#define KPROBE_GEN_TEST_ARG1 "filename=%x1"
+#define KPROBE_GEN_TEST_ARG2 "flags=%x2"
+#define KPROBE_GEN_TEST_ARG3 "mode=%x3"
+
+/* ARM */
+#elif defined(CONFIG_ARM)
+#define KPROBE_GEN_TEST_ARG0 "dfd=%r0"
+#define KPROBE_GEN_TEST_ARG1 "filename=%r1"
+#define KPROBE_GEN_TEST_ARG2 "flags=%r2"
+#define KPROBE_GEN_TEST_ARG3 "mode=%r3"
+
+/* RISCV */
+#elif defined(CONFIG_RISCV)
+#define KPROBE_GEN_TEST_ARG0 "dfd=%a0"
+#define KPROBE_GEN_TEST_ARG1 "filename=%a1"
+#define KPROBE_GEN_TEST_ARG2 "flags=%a2"
+#define KPROBE_GEN_TEST_ARG3 "mode=%a3"
+
+/* others */
+#else
+#define KPROBE_GEN_TEST_ARG0 NULL
+#define KPROBE_GEN_TEST_ARG1 NULL
+#define KPROBE_GEN_TEST_ARG2 NULL
+#define KPROBE_GEN_TEST_ARG3 NULL
+#endif
+
+static bool trace_event_file_is_valid(struct trace_event_file *input)
+{
+ return input && !IS_ERR(input);
+}
+
/*
* Test to make sure we can create a kprobe event, then add more
* fields.
@@ -58,23 +101,23 @@ static int __init test_gen_kprobe_cmd(void)
* fields.
*/
ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test",
- "do_sys_open",
- "dfd=%ax", "filename=%dx");
+ KPROBE_GEN_TEST_FUNC,
+ KPROBE_GEN_TEST_ARG0, KPROBE_GEN_TEST_ARG1);
if (ret)
- goto free;
+ goto out;
/* Use kprobe_event_add_fields to add the rest of the fields */
- ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)");
+ ret = kprobe_event_add_fields(&cmd, KPROBE_GEN_TEST_ARG2, KPROBE_GEN_TEST_ARG3);
if (ret)
- goto free;
+ goto out;
/*
* This actually creates the event.
*/
ret = kprobe_event_gen_cmd_end(&cmd);
if (ret)
- goto free;
+ goto out;
/*
* Now get the gen_kprobe_test event file. We need to prevent
@@ -97,13 +140,13 @@ static int __init test_gen_kprobe_cmd(void)
goto delete;
}
out:
+ kfree(buf);
return ret;
delete:
+ if (trace_event_file_is_valid(gen_kprobe_test))
+ gen_kprobe_test = NULL;
/* We got an error after creating the event, delete it */
ret = kprobe_event_delete("gen_kprobe_test");
- free:
- kfree(buf);
-
goto out;
}
@@ -128,17 +171,17 @@ static int __init test_gen_kretprobe_cmd(void)
* Define the kretprobe event.
*/
ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test",
- "do_sys_open",
+ KPROBE_GEN_TEST_FUNC,
"$retval");
if (ret)
- goto free;
+ goto out;
/*
* This actually creates the event.
*/
ret = kretprobe_event_gen_cmd_end(&cmd);
if (ret)
- goto free;
+ goto out;
/*
* Now get the gen_kretprobe_test event file. We need to
@@ -162,13 +205,13 @@ static int __init test_gen_kretprobe_cmd(void)
goto delete;
}
out:
+ kfree(buf);
return ret;
delete:
+ if (trace_event_file_is_valid(gen_kretprobe_test))
+ gen_kretprobe_test = NULL;
/* We got an error after creating the event, delete it */
ret = kprobe_event_delete("gen_kretprobe_test");
- free:
- kfree(buf);
-
goto out;
}
@@ -182,10 +225,12 @@ static int __init kprobe_event_gen_test_init(void)
ret = test_gen_kretprobe_cmd();
if (ret) {
- WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
- "kprobes",
- "gen_kretprobe_test", false));
- trace_put_event_file(gen_kretprobe_test);
+ if (trace_event_file_is_valid(gen_kretprobe_test)) {
+ WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
+ "kprobes",
+ "gen_kretprobe_test", false));
+ trace_put_event_file(gen_kretprobe_test);
+ }
WARN_ON(kprobe_event_delete("gen_kretprobe_test"));
}
@@ -194,24 +239,30 @@ static int __init kprobe_event_gen_test_init(void)
static void __exit kprobe_event_gen_test_exit(void)
{
- /* Disable the event or you can't remove it */
- WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
- "kprobes",
- "gen_kprobe_test", false));
+ if (trace_event_file_is_valid(gen_kprobe_test)) {
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
+ "kprobes",
+ "gen_kprobe_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(gen_kprobe_test);
+ }
- /* Now give the file and instance back */
- trace_put_event_file(gen_kprobe_test);
/* Now unregister and free the event */
WARN_ON(kprobe_event_delete("gen_kprobe_test"));
- /* Disable the event or you can't remove it */
- WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
- "kprobes",
- "gen_kretprobe_test", false));
+ if (trace_event_file_is_valid(gen_kretprobe_test)) {
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
+ "kprobes",
+ "gen_kretprobe_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(gen_kretprobe_test);
+ }
- /* Now give the file and instance back */
- trace_put_event_file(gen_kretprobe_test);
/* Now unregister and free the event */
WARN_ON(kprobe_event_delete("gen_kretprobe_test"));
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index c69d82273ce7..32c3dfdb4d6a 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -83,8 +83,10 @@ struct rethook *rethook_alloc(void *data, rethook_handler_t handler)
{
struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
- if (!rh || !handler)
+ if (!rh || !handler) {
+ kfree(rh);
return NULL;
+ }
rh->data = data;
rh->handler = handler;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d59b6a328b7f..c366a0a9ddba 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -413,6 +413,7 @@ struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
+ long wait_index;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -518,6 +519,7 @@ struct ring_buffer_per_cpu {
local_t committing;
local_t commits;
local_t pages_touched;
+ local_t pages_lost;
local_t pages_read;
long last_pages_touch;
size_t shortest_full;
@@ -884,7 +886,7 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
}
/**
- * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer
+ * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
* @cpu: The cpu of the ring_buffer to get the number of pages from
*
@@ -893,10 +895,18 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
{
size_t read;
+ size_t lost;
size_t cnt;
read = local_read(&buffer->buffers[cpu]->pages_read);
+ lost = local_read(&buffer->buffers[cpu]->pages_lost);
cnt = local_read(&buffer->buffers[cpu]->pages_touched);
+
+ if (WARN_ON_ONCE(cnt < lost))
+ return 0;
+
+ cnt -= lost;
+
/* The reader can read an empty page, but not more than that */
if (cnt < read) {
WARN_ON_ONCE(read > cnt + 1);
@@ -906,6 +916,21 @@ size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
return cnt - read;
}
+static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ size_t nr_pages;
+ size_t dirty;
+
+ nr_pages = cpu_buffer->nr_pages;
+ if (!nr_pages || !full)
+ return true;
+
+ dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+
+ return (dirty * 100) > (full * nr_pages);
+}
+
/*
* rb_wake_up_waiters - wake up tasks waiting for ring buffer input
*
@@ -917,13 +942,56 @@ static void rb_wake_up_waiters(struct irq_work *work)
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
wake_up_all(&rbwork->waiters);
- if (rbwork->wakeup_full) {
+ if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
rbwork->wakeup_full = false;
+ rbwork->full_waiters_pending = false;
wake_up_all(&rbwork->full_waiters);
}
}
/**
+ * ring_buffer_wake_waiters - wake up any waiters on this ring buffer
+ * @buffer: The ring buffer to wake waiters on
+ *
+ * In the case of a file that represents a ring buffer is closing,
+ * it is prudent to wake up any waiters that are on this.
+ */
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct rb_irq_work *rbwork;
+
+ if (!buffer)
+ return;
+
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+
+ /* Wake up individual ones too. One level recursion */
+ for_each_buffer_cpu(buffer, cpu)
+ ring_buffer_wake_waiters(buffer, cpu);
+
+ rbwork = &buffer->irq_work;
+ } else {
+ if (WARN_ON_ONCE(!buffer->buffers))
+ return;
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ /* The CPU buffer may not have been initialized yet */
+ if (!cpu_buffer)
+ return;
+ rbwork = &cpu_buffer->irq_work;
+ }
+
+ rbwork->wait_index++;
+ /* make sure the waiters see the new index */
+ smp_wmb();
+
+ rb_wake_up_waiters(&rbwork->work);
+}
+
+/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
* @cpu: the cpu buffer to wait on
@@ -938,6 +1006,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
+ long wait_index;
int ret = 0;
/*
@@ -956,6 +1025,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
work = &cpu_buffer->irq_work;
}
+ wait_index = READ_ONCE(work->wait_index);
while (true) {
if (full)
@@ -1000,26 +1070,29 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
!ring_buffer_empty_cpu(buffer, cpu)) {
unsigned long flags;
bool pagebusy;
- size_t nr_pages;
- size_t dirty;
+ bool done;
if (!full)
break;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
- nr_pages = cpu_buffer->nr_pages;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+ done = !pagebusy && full_hit(buffer, cpu, full);
+
if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full < full)
+ cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (!pagebusy &&
- (!nr_pages || (dirty * 100) > full * nr_pages))
+ if (done)
break;
}
schedule();
+
+ /* Make sure to see the new wait index */
+ smp_rmb();
+ if (wait_index != work->wait_index)
+ break;
}
if (full)
@@ -1036,6 +1109,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* @cpu: the cpu buffer to wait on
* @filp: the file descriptor
* @poll_table: The poll descriptor
+ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
@@ -1045,14 +1119,15 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* zero otherwise.
*/
__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
- struct file *filp, poll_table *poll_table)
+ struct file *filp, poll_table *poll_table, int full)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct rb_irq_work *work;
- if (cpu == RING_BUFFER_ALL_CPUS)
+ if (cpu == RING_BUFFER_ALL_CPUS) {
work = &buffer->irq_work;
- else {
+ full = 0;
+ } else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -EINVAL;
@@ -1060,8 +1135,14 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
work = &cpu_buffer->irq_work;
}
- poll_wait(filp, &work->waiters, poll_table);
- work->waiters_pending = true;
+ if (full) {
+ poll_wait(filp, &work->full_waiters, poll_table);
+ work->full_waiters_pending = true;
+ } else {
+ poll_wait(filp, &work->waiters, poll_table);
+ work->waiters_pending = true;
+ }
+
/*
* There's a tight race between setting the waiters_pending and
* checking if the ring buffer is empty. Once the waiters_pending bit
@@ -1077,6 +1158,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
*/
smp_mb();
+ if (full)
+ return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
+
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return EPOLLIN | EPOLLRDNORM;
@@ -1718,9 +1802,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
free_buffer_page(cpu_buffer->reader_page);
- rb_head_page_deactivate(cpu_buffer);
-
if (head) {
+ rb_head_page_deactivate(cpu_buffer);
+
list_for_each_entry_safe(bpage, tmp, head, list) {
list_del_init(&bpage->list);
free_buffer_page(bpage);
@@ -1956,6 +2040,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
*/
local_add(page_entries, &cpu_buffer->overrun);
local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+ local_inc(&cpu_buffer->pages_lost);
}
/*
@@ -1977,8 +2062,10 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
struct list_head *pages = &cpu_buffer->new_pages;
int retries, success;
+ unsigned long flags;
- raw_spin_lock_irq(&cpu_buffer->reader_lock);
+ /* Can be called at early boot up, where interrupts must not been enabled */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
/*
* We are holding the reader lock, so the reader page won't be swapped
* in the ring buffer. Now we are racing with the writer trying to
@@ -2035,7 +2122,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
* tracing
*/
RB_WARN_ON(cpu_buffer, !success);
- raw_spin_unlock_irq(&cpu_buffer->reader_lock);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
/* free pages if they weren't inserted */
if (!success) {
@@ -2163,8 +2250,16 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
rb_update_pages(cpu_buffer);
cpu_buffer->nr_pages_to_update = 0;
} else {
- schedule_work_on(cpu,
- &cpu_buffer->update_pages_work);
+ /* Run directly if possible. */
+ migrate_disable();
+ if (cpu != smp_processor_id()) {
+ migrate_enable();
+ schedule_work_on(cpu,
+ &cpu_buffer->update_pages_work);
+ } else {
+ update_pages_handler(&cpu_buffer->update_pages_work);
+ migrate_enable();
+ }
}
}
@@ -2213,9 +2308,17 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
else {
- schedule_work_on(cpu_id,
- &cpu_buffer->update_pages_work);
- wait_for_completion(&cpu_buffer->update_done);
+ /* Run directly if possible. */
+ migrate_disable();
+ if (cpu_id == smp_processor_id()) {
+ rb_update_pages(cpu_buffer);
+ migrate_enable();
+ } else {
+ migrate_enable();
+ schedule_work_on(cpu_id,
+ &cpu_buffer->update_pages_work);
+ wait_for_completion(&cpu_buffer->update_done);
+ }
}
cpu_buffer->nr_pages_to_update = 0;
@@ -2440,6 +2543,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
*/
local_add(entries, &cpu_buffer->overrun);
local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+ local_inc(&cpu_buffer->pages_lost);
/*
* The entries will be zeroed out when we move the
@@ -2608,6 +2712,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* Mark the rest of the page with padding */
rb_event_set_padding(event);
+ /* Make sure the padding is visible before the write update */
+ smp_wmb();
+
/* Set the write back to the previous setting */
local_sub(length, &tail_page->write);
return;
@@ -2619,6 +2726,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* time delta must be non zero */
event->time_delta = 1;
+ /* Make sure the padding is visible before the tail_page->write update */
+ smp_wmb();
+
/* Set write to end of buffer */
length = (tail + length) - BUF_PAGE_SIZE;
local_sub(length, &tail_page->write);
@@ -3088,8 +3198,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
event->time_delta = 1;
}
-static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
local_inc(&cpu_buffer->entries);
rb_end_commit(cpu_buffer);
@@ -3098,10 +3207,6 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
static __always_inline void
rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
- size_t nr_pages;
- size_t dirty;
- size_t full;
-
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
@@ -3125,10 +3230,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched);
- full = cpu_buffer->shortest_full;
- nr_pages = cpu_buffer->nr_pages;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu);
- if (full && nr_pages && (dirty * 100) <= full * nr_pages)
+ if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full))
return;
cpu_buffer->irq_work.wakeup_full = true;
@@ -3298,15 +3400,14 @@ void ring_buffer_nest_end(struct trace_buffer *buffer)
*
* Must be paired with ring_buffer_lock_reserve.
*/
-int ring_buffer_unlock_commit(struct trace_buffer *buffer,
- struct ring_buffer_event *event)
+int ring_buffer_unlock_commit(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu = raw_smp_processor_id();
cpu_buffer = buffer->buffers[cpu];
- rb_commit(cpu_buffer, event);
+ rb_commit(cpu_buffer);
rb_wakeups(buffer, cpu_buffer);
@@ -3892,7 +3993,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
memcpy(body, data, length);
- rb_commit(cpu_buffer, event);
+ rb_commit(cpu_buffer);
rb_wakeups(buffer, cpu_buffer);
@@ -4587,6 +4688,33 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
+ /*
+ * The writer has preempt disable, wait for it. But not forever
+ * Although, 1 second is pretty much "forever"
+ */
+#define USECS_WAIT 1000000
+ for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
+ /* If the write is past the end of page, a writer is still updating it */
+ if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
+ break;
+
+ udelay(1);
+
+ /* Get the latest version of the reader write value */
+ smp_rmb();
+ }
+
+ /* The writer is not moving forward? Something is wrong */
+ if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT))
+ reader = NULL;
+
+ /*
+ * Make sure we see any padding after the write update
+ * (see rb_reset_tail())
+ */
+ smp_rmb();
+
+
return reader;
}
@@ -5164,6 +5292,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->committing, 0);
local_set(&cpu_buffer->commits, 0);
local_set(&cpu_buffer->pages_touched, 0);
+ local_set(&cpu_buffer->pages_lost, 0);
local_set(&cpu_buffer->pages_read, 0);
cpu_buffer->last_pages_touch = 0;
cpu_buffer->shortest_full = 0;
@@ -5232,7 +5361,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
/**
- * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
* @cpu: The CPU buffer to be reset
*/
@@ -5302,7 +5431,7 @@ void ring_buffer_reset(struct trace_buffer *buffer)
EXPORT_SYMBOL_GPL(ring_buffer_reset);
/**
- * rind_buffer_empty - is the ring buffer empty?
+ * ring_buffer_empty - is the ring buffer empty?
* @buffer: The ring buffer to test
*/
bool ring_buffer_empty(struct trace_buffer *buffer)
@@ -5616,7 +5745,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
unsigned int pos = 0;
unsigned int size;
- if (full)
+ /*
+ * If a full page is expected, this can still be returned
+ * if there's been a previous partial read and the
+ * rest of the page can be read and the commit page is off
+ * the reader page.
+ */
+ if (full &&
+ (!read || (len < (commit - read)) ||
+ cpu_buffer->reader_page == cpu_buffer->commit_page))
goto out_unlock;
if (len > (commit - read))
@@ -5877,7 +6014,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested)
}
out:
- ring_buffer_unlock_commit(data->buffer, event);
+ ring_buffer_unlock_commit(data->buffer);
return 0;
}
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 78e576575b79..aef34673d79d 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -258,7 +258,7 @@ static void ring_buffer_producer(void)
hit++;
entry = ring_buffer_event_data(event);
*entry = smp_processor_id();
- ring_buffer_unlock_commit(buffer, event);
+ ring_buffer_unlock_commit(buffer);
}
}
end_time = ktime_get();
diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
index 83cace53b9fa..b2b49a27e886 100644
--- a/kernel/trace/rv/monitors/wip/wip.c
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -16,7 +16,7 @@
#include "wip.h"
-struct rv_monitor rv_wip;
+static struct rv_monitor rv_wip;
DECLARE_DA_MON_PER_CPU(wip, unsigned char);
static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
@@ -60,7 +60,7 @@ static void disable_wip(void)
da_monitor_destroy_wip();
}
-struct rv_monitor rv_wip = {
+static struct rv_monitor rv_wip = {
.name = "wip",
.description = "wakeup in preemptive per-cpu testing monitor.",
.enable = enable_wip,
@@ -69,13 +69,13 @@ struct rv_monitor rv_wip = {
.enabled = 0,
};
-static int register_wip(void)
+static int __init register_wip(void)
{
rv_register_monitor(&rv_wip);
return 0;
}
-static void unregister_wip(void)
+static void __exit unregister_wip(void)
{
rv_unregister_monitor(&rv_wip);
}
diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h
index dacc37b62a2c..2e373f2c65ed 100644
--- a/kernel/trace/rv/monitors/wip/wip.h
+++ b/kernel/trace/rv/monitors/wip/wip.h
@@ -27,7 +27,7 @@ struct automaton_wip {
bool final_states[state_max_wip];
};
-static struct automaton_wip automaton_wip = {
+static const struct automaton_wip automaton_wip = {
.state_names = {
"preemptive",
"non_preemptive"
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
index 599225d9cf38..0e43dd2db685 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.c
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -15,7 +15,7 @@
#include "wwnr.h"
-struct rv_monitor rv_wwnr;
+static struct rv_monitor rv_wwnr;
DECLARE_DA_MON_PER_TASK(wwnr, unsigned char);
static void handle_switch(void *data, bool preempt, struct task_struct *p,
@@ -59,7 +59,7 @@ static void disable_wwnr(void)
da_monitor_destroy_wwnr();
}
-struct rv_monitor rv_wwnr = {
+static struct rv_monitor rv_wwnr = {
.name = "wwnr",
.description = "wakeup while not running per-task testing model.",
.enable = enable_wwnr,
@@ -68,13 +68,13 @@ struct rv_monitor rv_wwnr = {
.enabled = 0,
};
-static int register_wwnr(void)
+static int __init register_wwnr(void)
{
rv_register_monitor(&rv_wwnr);
return 0;
}
-static void unregister_wwnr(void)
+static void __exit unregister_wwnr(void)
{
rv_unregister_monitor(&rv_wwnr);
}
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h
index 118e576b91b4..d0d9c4b8121b 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.h
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.h
@@ -27,7 +27,7 @@ struct automaton_wwnr {
bool final_states[state_max_wwnr];
};
-static struct automaton_wwnr automaton_wwnr = {
+static const struct automaton_wwnr automaton_wwnr = {
.state_names = {
"not_running",
"running"
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index 0b15e975d2c2..8d77526892f4 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -120,15 +120,13 @@ static int __init test_gen_synth_cmd(void)
/* Now generate a gen_synth_test event */
ret = synth_event_trace_array(gen_synth_test, vals, ARRAY_SIZE(vals));
- out:
+ free:
+ kfree(buf);
return ret;
delete:
/* We got an error after creating the event, delete it */
synth_event_delete("gen_synth_test");
- free:
- kfree(buf);
-
- goto out;
+ goto free;
}
/*
@@ -227,15 +225,13 @@ static int __init test_empty_synth_event(void)
/* Now trace an empty_synth_test event */
ret = synth_event_trace_array(empty_synth_test, vals, ARRAY_SIZE(vals));
- out:
+ free:
+ kfree(buf);
return ret;
delete:
/* We got an error after creating the event, delete it */
synth_event_delete("empty_synth_test");
- free:
- kfree(buf);
-
- goto out;
+ goto free;
}
static struct synth_field_desc create_synth_test_fields[] = {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d3005279165d..a555a861b978 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -19,7 +19,6 @@
#include <linux/kallsyms.h>
#include <linux/security.h>
#include <linux/seq_file.h>
-#include <linux/notifier.h>
#include <linux/irqflags.h>
#include <linux/debugfs.h>
#include <linux/tracefs.h>
@@ -85,7 +84,7 @@ void __init disable_tracing_selftest(const char *reason)
#endif
/* Pipe tracepoints to printk */
-struct trace_iterator *tracepoint_print_iter;
+static struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
static bool tracepoint_printk_stop_on_boot __initdata;
static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
@@ -999,7 +998,7 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev
/* ring_buffer_unlock_commit() enables preemption */
preempt_enable_notrace();
} else
- ring_buffer_unlock_commit(buffer, event);
+ ring_buffer_unlock_commit(buffer);
}
/**
@@ -1193,12 +1192,14 @@ void *tracing_cond_snapshot_data(struct trace_array *tr)
{
void *cond_data = NULL;
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (tr->cond_snapshot)
cond_data = tr->cond_snapshot->cond_data;
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
return cond_data;
}
@@ -1334,9 +1335,11 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
goto fail_unlock;
}
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
tr->cond_snapshot = cond_snapshot;
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
mutex_unlock(&trace_types_lock);
@@ -1363,6 +1366,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
{
int ret = 0;
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (!tr->cond_snapshot)
@@ -1373,6 +1377,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
}
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
return ret;
}
@@ -1415,6 +1420,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
return false;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
+#define free_snapshot(tr) do { } while (0)
#endif /* CONFIG_TRACER_SNAPSHOT */
void tracer_tracing_off(struct trace_array *tr)
@@ -1686,6 +1692,8 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
}
unsigned long __read_mostly tracing_thresh;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
static const struct file_operations tracing_max_lat_fops;
#ifdef LATENCY_FS_NOTIFY
@@ -1742,18 +1750,14 @@ void latency_fsnotify(struct trace_array *tr)
irq_work_queue(&tr->fsnotify_irqwork);
}
-#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
- || defined(CONFIG_OSNOISE_TRACER)
+#else /* !LATENCY_FS_NOTIFY */
#define trace_create_maxlat_file(tr, d_tracer) \
trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \
d_tracer, &tr->max_latency, &tracing_max_lat_fops)
-#else
-#define trace_create_maxlat_file(tr, d_tracer) do { } while (0)
#endif
-#ifdef CONFIG_TRACER_MAX_TRACE
/*
* Copy the new maximum trace into the separate maximum-trace
* structure. (this way the maximum trace is permanently saved,
@@ -1828,14 +1832,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
ring_buffer_record_off(tr->max_buffer.buffer);
#ifdef CONFIG_TRACER_SNAPSHOT
- if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data))
- goto out_unlock;
+ if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) {
+ arch_spin_unlock(&tr->max_lock);
+ return;
+ }
#endif
swap(tr->array_buffer.buffer, tr->max_buffer.buffer);
__update_max_tr(tr, tsk, cpu);
- out_unlock:
arch_spin_unlock(&tr->max_lock);
}
@@ -1882,6 +1887,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
__update_max_tr(tr, tsk, cpu);
arch_spin_unlock(&tr->max_lock);
}
+
#endif /* CONFIG_TRACER_MAX_TRACE */
static int wait_on_pipe(struct trace_iterator *iter, int full)
@@ -2174,10 +2180,12 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
}
/* Must have trace_types_lock held */
-void tracing_reset_all_online_cpus(void)
+void tracing_reset_all_online_cpus_unlocked(void)
{
struct trace_array *tr;
+ lockdep_assert_held(&trace_types_lock);
+
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (!tr->clear_trace)
continue;
@@ -2189,6 +2197,13 @@ void tracing_reset_all_online_cpus(void)
}
}
+void tracing_reset_all_online_cpus(void)
+{
+ mutex_lock(&trace_types_lock);
+ tracing_reset_all_online_cpus_unlocked();
+ mutex_unlock(&trace_types_lock);
+}
+
/*
* The tgid_map array maps from pid to tgid; i.e. the value stored at index i
* is the tgid last observed corresponding to pid=i.
@@ -2200,6 +2215,11 @@ static size_t tgid_map_max;
#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
+/*
+ * Preemption must be disabled before acquiring trace_cmdline_lock.
+ * The various trace_arrays' max_lock must be acquired in a context
+ * where interrupt is disabled.
+ */
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
struct saved_cmdlines_buffer {
unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -2412,7 +2432,11 @@ static int trace_save_cmdline(struct task_struct *tsk)
* the lock, but we also don't want to spin
* nor do we want to disable interrupts,
* so if we miss here, then better luck next time.
+ *
+ * This is called within the scheduler and wake up, so interrupts
+ * had better been disabled and run queue lock been held.
*/
+ lockdep_assert_preemption_disabled();
if (!arch_spin_trylock(&trace_cmdline_lock))
return 0;
@@ -5593,7 +5617,7 @@ static const char readme_msg[] =
"\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
"\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
- "\t <type>\\[<array-size>\\]\n"
+ "\t symstr, <type>\\[<array-size>\\]\n"
#ifdef CONFIG_HIST_TRIGGERS
"\t field: <stype> <name>;\n"
"\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
@@ -5654,6 +5678,7 @@ static const char readme_msg[] =
"\t [:size=#entries]\n"
"\t [:pause][:continue][:clear]\n"
"\t [:name=histname1]\n"
+ "\t [:nohitcount]\n"
"\t [:<handler>.<action>]\n"
"\t [if <filter>]\n\n"
"\t Note, special fields can be used as well:\n"
@@ -5700,7 +5725,9 @@ static const char readme_msg[] =
"\t .syscall display a syscall id as a syscall name\n"
"\t .log2 display log2 value rather than raw number\n"
"\t .buckets=size display values in groups of size rather than raw number\n"
- "\t .usecs display a common_timestamp in microseconds\n\n"
+ "\t .usecs display a common_timestamp in microseconds\n"
+ "\t .percent display a number of percentage value\n"
+ "\t .graph display a bar-graph of a value\n\n"
"\t The 'pause' parameter can be used to pause an existing hist\n"
"\t trigger or to start a hist trigger but not log any events\n"
"\t until told to do so. 'continue' can be used to start or\n"
@@ -5708,6 +5735,8 @@ static const char readme_msg[] =
"\t The 'clear' parameter will clear the contents of a running\n"
"\t hist trigger and leave its current paused/active state\n"
"\t unchanged.\n\n"
+ "\t The 'nohitcount' (or NOHC) parameter will suppress display of\n"
+ "\t raw hitcount in the histogram.\n\n"
"\t The enable_hist and disable_hist triggers can be used to\n"
"\t have one event conditionally start and stop another event's\n"
"\t already-attached hist trigger. The syntax is analogous to\n"
@@ -5890,9 +5919,11 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
char buf[64];
int r;
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
@@ -5917,10 +5948,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val)
return -ENOMEM;
}
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
savedcmd_temp = savedcmd;
savedcmd = s;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
free_saved_cmdlines_buffer(savedcmd_temp);
return 0;
@@ -6373,10 +6406,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
#ifdef CONFIG_TRACER_SNAPSHOT
if (t->use_max_tr) {
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (tr->cond_snapshot)
ret = -EBUSY;
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
if (ret)
goto out;
}
@@ -6407,12 +6442,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (tr->current_trace->reset)
tr->current_trace->reset(tr);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ had_max_tr = tr->current_trace->use_max_tr;
+
/* Current trace needs to be nop_trace before synchronize_rcu */
tr->current_trace = &nop_trace;
-#ifdef CONFIG_TRACER_MAX_TRACE
- had_max_tr = tr->allocated_snapshot;
-
if (had_max_tr && !t->use_max_tr) {
/*
* We need to make sure that the update_max_tr sees that
@@ -6425,11 +6460,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
free_snapshot(tr);
}
- if (t->use_max_tr && !had_max_tr) {
+ if (t->use_max_tr && !tr->allocated_snapshot) {
ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
goto out;
}
+#else
+ tr->current_trace = &nop_trace;
#endif
if (t->init) {
@@ -6540,7 +6577,7 @@ out:
return ret;
}
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
+#ifdef CONFIG_TRACER_MAX_TRACE
static ssize_t
tracing_max_lat_read(struct file *filp, char __user *ubuf,
@@ -6634,6 +6671,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
mutex_unlock(&trace_types_lock);
free_cpumask_var(iter->started);
+ kfree(iter->fmt);
mutex_destroy(&iter->mutex);
kfree(iter);
@@ -6658,7 +6696,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
return EPOLLIN | EPOLLRDNORM;
else
return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file,
- filp, poll_table);
+ filp, poll_table, iter->tr->buffer_percent);
}
static __poll_t
@@ -6763,7 +6801,20 @@ waitagain:
ret = print_trace_line(iter);
if (ret == TRACE_TYPE_PARTIAL_LINE) {
- /* don't print partial lines */
+ /*
+ * If one print_trace_line() fills entire trace_seq in one shot,
+ * trace_seq_to_user() will returns -EBUSY because save_len == 0,
+ * In this case, we need to consume it, otherwise, loop will peek
+ * this event next time, resulting in an infinite loop.
+ */
+ if (save_len == 0) {
+ iter->seq.full = 0;
+ trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
+ trace_consume(iter);
+ break;
+ }
+
+ /* In other cases, don't print partial lines */
iter->seq.seq.len = save_len;
break;
}
@@ -7436,10 +7487,12 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
goto out;
}
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (tr->cond_snapshot)
ret = -EBUSY;
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
if (ret)
goto out;
@@ -7552,7 +7605,7 @@ static const struct file_operations tracing_thresh_fops = {
.llseek = generic_file_llseek,
};
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
+#ifdef CONFIG_TRACER_MAX_TRACE
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
@@ -7777,6 +7830,7 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr,
int len)
{
struct tracing_log_err *err;
+ char *cmd;
if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) {
err = alloc_tracing_log_err(len);
@@ -7785,12 +7839,12 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr,
return err;
}
-
+ cmd = kzalloc(len, GFP_KERNEL);
+ if (!cmd)
+ return ERR_PTR(-ENOMEM);
err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
kfree(err->cmd);
- err->cmd = kzalloc(len, GFP_KERNEL);
- if (!err->cmd)
- return ERR_PTR(-ENOMEM);
+ err->cmd = cmd;
list_del(&err->list);
return err;
@@ -8137,6 +8191,12 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
__trace_array_put(iter->tr);
+ iter->wait_index++;
+ /* Make sure the waiters see the new wait_index */
+ smp_wmb();
+
+ ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
if (info->spare)
ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
@@ -8290,6 +8350,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
+ long wait_index;
+
if (ret)
goto out;
@@ -8297,10 +8359,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
goto out;
+ wait_index = READ_ONCE(iter->wait_index);
+
ret = wait_on_pipe(iter, iter->tr->buffer_percent);
if (ret)
goto out;
+ /* No need to wait after waking up when tracing is off */
+ if (!tracer_tracing_is_on(iter->tr))
+ goto out;
+
+ /* Make sure we see the new wait_index */
+ smp_rmb();
+ if (wait_index != iter->wait_index)
+ goto out;
+
goto again;
}
@@ -8311,12 +8384,34 @@ out:
return ret;
}
+/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
+static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ if (cmd)
+ return -ENOIOCTLCMD;
+
+ mutex_lock(&trace_types_lock);
+
+ iter->wait_index++;
+ /* Make sure the waiters see the new wait_index */
+ smp_wmb();
+
+ ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
+ mutex_unlock(&trace_types_lock);
+ return 0;
+}
+
static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open,
.read = tracing_buffers_read,
.poll = tracing_buffers_poll,
.release = tracing_buffers_release,
.splice_read = tracing_buffers_splice_read,
+ .unlocked_ioctl = tracing_buffers_ioctl,
.llseek = no_llseek,
};
@@ -9005,6 +9100,8 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
tracer_tracing_off(tr);
if (tr->current_trace->stop)
tr->current_trace->stop(tr);
+ /* Wake up any waiters */
+ ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS);
}
mutex_unlock(&trace_types_lock);
}
@@ -9522,7 +9619,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
create_trace_options_dir(tr);
+#ifdef CONFIG_TRACER_MAX_TRACE
trace_create_maxlat_file(tr, d_tracer);
+#endif
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
@@ -9776,41 +9875,41 @@ static __init int tracer_init_tracefs(void)
fs_initcall(tracer_init_tracefs);
-static int trace_panic_handler(struct notifier_block *this,
- unsigned long event, void *unused)
-{
- if (ftrace_dump_on_oops)
- ftrace_dump(ftrace_dump_on_oops);
- return NOTIFY_OK;
-}
+static int trace_die_panic_handler(struct notifier_block *self,
+ unsigned long ev, void *unused);
static struct notifier_block trace_panic_notifier = {
- .notifier_call = trace_panic_handler,
- .next = NULL,
- .priority = 150 /* priority: INT_MAX >= x >= 0 */
+ .notifier_call = trace_die_panic_handler,
+ .priority = INT_MAX - 1,
};
-static int trace_die_handler(struct notifier_block *self,
- unsigned long val,
- void *data)
-{
- switch (val) {
- case DIE_OOPS:
- if (ftrace_dump_on_oops)
- ftrace_dump(ftrace_dump_on_oops);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
static struct notifier_block trace_die_notifier = {
- .notifier_call = trace_die_handler,
- .priority = 200
+ .notifier_call = trace_die_panic_handler,
+ .priority = INT_MAX - 1,
};
/*
+ * The idea is to execute the following die/panic callback early, in order
+ * to avoid showing irrelevant information in the trace (like other panic
+ * notifier functions); we are the 2nd to run, after hung_task/rcu_stall
+ * warnings get disabled (to prevent potential log flooding).
+ */
+static int trace_die_panic_handler(struct notifier_block *self,
+ unsigned long ev, void *unused)
+{
+ if (!ftrace_dump_on_oops)
+ return NOTIFY_DONE;
+
+ /* The die notifier requires DIE_OOPS to trigger */
+ if (self == &trace_die_notifier && ev != DIE_OOPS)
+ return NOTIFY_DONE;
+
+ ftrace_dump(ftrace_dump_on_oops);
+
+ return NOTIFY_DONE;
+}
+
+/*
* printk is set to max of 1024, we really don't need it that big.
* Nothing should be printing 1000 characters anyway.
*/
@@ -10091,7 +10190,7 @@ __init static int tracer_alloc_buffers(void)
* buffer. The memory will be removed once the "instance" is removed.
*/
ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE,
- "trace/RB:preapre", trace_rb_cpu_prepare,
+ "trace/RB:prepare", trace_rb_cpu_prepare,
NULL);
if (ret < 0)
goto out_free_cpumask;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 900e75d96c84..e46a49269be2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -308,8 +308,7 @@ struct trace_array {
struct array_buffer max_buffer;
bool allocated_snapshot;
#endif
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
- || defined(CONFIG_OSNOISE_TRACER)
+#ifdef CONFIG_TRACER_MAX_TRACE
unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
@@ -580,6 +579,7 @@ int tracing_is_enabled(void);
void tracing_reset_online_cpus(struct array_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
+void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
@@ -614,7 +614,7 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
bool trace_is_tracepoint_string(const char *str);
const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
- va_list ap);
+ va_list ap) __printf(2, 0);
int trace_empty(struct trace_iterator *iter);
@@ -687,12 +687,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
void *cond_data);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
-#endif /* CONFIG_TRACER_MAX_TRACE */
-#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
- || defined(CONFIG_OSNOISE_TRACER)) && defined(CONFIG_FSNOTIFY)
+#ifdef CONFIG_FSNOTIFY
#define LATENCY_FS_NOTIFY
#endif
+#endif /* CONFIG_TRACER_MAX_TRACE */
#ifdef LATENCY_FS_NOTIFY
void latency_fsnotify(struct trace_array *tr);
@@ -1435,8 +1434,6 @@ event_trigger_unlock_commit(struct trace_event_file *file,
struct filter_pred;
struct regex;
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
-
typedef int (*regex_match_func)(char *str, struct regex *r, int len);
enum regex_type {
@@ -1455,17 +1452,6 @@ struct regex {
regex_match_func match;
};
-struct filter_pred {
- filter_pred_fn_t fn;
- u64 val;
- struct regex regex;
- unsigned short *ops;
- struct ftrace_event_field *field;
- int offset;
- int not;
- int op;
-};
-
static inline bool is_string_field(struct ftrace_event_field *field)
{
return field->filter_type == FILTER_DYN_STRING ||
@@ -1954,8 +1940,6 @@ static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { }
static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { }
#endif
-extern struct trace_iterator *tracepoint_print_iter;
-
/*
* Reset the state of the trace_iterator so that it can read consumed data.
* Normally, the trace_iterator is used for reading the data when it is not
@@ -1968,17 +1952,30 @@ static __always_inline void trace_iterator_reset(struct trace_iterator *iter)
}
/* Check the name is good for event/group/fields */
-static inline bool is_good_name(const char *name)
+static inline bool __is_good_name(const char *name, bool hash_ok)
{
- if (!isalpha(*name) && *name != '_')
+ if (!isalpha(*name) && *name != '_' && (!hash_ok || *name != '-'))
return false;
while (*++name != '\0') {
- if (!isalpha(*name) && !isdigit(*name) && *name != '_')
+ if (!isalpha(*name) && !isdigit(*name) && *name != '_' &&
+ (!hash_ok || *name != '-'))
return false;
}
return true;
}
+/* Check the name is good for event/group/fields */
+static inline bool is_good_name(const char *name)
+{
+ return __is_good_name(name, false);
+}
+
+/* Check the name is good for system */
+static inline bool is_good_system_name(const char *name)
+{
+ return __is_good_name(name, true);
+}
+
/* Convert certain expected symbols into '_' when generating event names */
static inline void sanitize_event_name(char *name)
{
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 801c2a7f7605..54d5fa35c90a 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -51,7 +51,7 @@ static void trace_do_benchmark(void)
local_irq_disable();
start = trace_clock_local();
- trace_benchmark_event(bm_str);
+ trace_benchmark_event(bm_str, bm_last);
stop = trace_clock_local();
local_irq_enable();
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
index 79e6fbe5b365..c3e91060dc94 100644
--- a/kernel/trace/trace_benchmark.h
+++ b/kernel/trace/trace_benchmark.h
@@ -14,19 +14,21 @@ extern void trace_benchmark_unreg(void);
TRACE_EVENT_FN(benchmark_event,
- TP_PROTO(const char *str),
+ TP_PROTO(const char *str, u64 delta),
- TP_ARGS(str),
+ TP_ARGS(str, delta),
TP_STRUCT__entry(
__array( char, str, BENCHMARK_EVENT_STRLEN )
+ __field( u64, delta)
),
TP_fast_assign(
memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
+ __entry->delta = delta;
),
- TP_printk("%s", __entry->str),
+ TP_printk("%s delta=%llu", __entry->str, __entry->delta),
trace_benchmark_reg, trace_benchmark_unreg
);
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 154996684fb5..4376887e0d8a 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -118,6 +118,7 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
if (ret)
break;
}
+ tracing_reset_all_online_cpus();
mutex_unlock(&event_mutex);
out:
argv_free(argv);
@@ -214,6 +215,7 @@ int dyn_events_release_all(struct dyn_event_operations *type)
break;
}
out:
+ tracing_reset_all_online_cpus();
mutex_unlock(&event_mutex);
return ret;
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 1783e3478912..352b65e2b910 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -16,6 +16,7 @@
#include "trace_dynevent.h"
#include "trace_probe.h"
#include "trace_probe_tmpl.h"
+#include "trace_probe_kernel.h"
#define EPROBE_EVENT_SYSTEM "eprobes"
@@ -26,6 +27,9 @@ struct trace_eprobe {
/* tracepoint event */
const char *event_name;
+ /* filter string for the tracepoint */
+ char *filter_str;
+
struct trace_event_call *event;
struct dyn_event devent;
@@ -48,6 +52,7 @@ static void trace_event_probe_cleanup(struct trace_eprobe *ep)
kfree(ep->event_system);
if (ep->event)
trace_event_put_ref(ep->event);
+ kfree(ep->filter_str);
kfree(ep);
}
@@ -453,29 +458,14 @@ NOKPROBE_SYMBOL(process_fetch_insn)
static nokprobe_inline int
fetch_store_strlen_user(unsigned long addr)
{
- const void __user *uaddr = (__force const void __user *)addr;
-
- return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+ return kern_fetch_store_strlen_user(addr);
}
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
{
- int ret, len = 0;
- u8 c;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if (addr < TASK_SIZE)
- return fetch_store_strlen_user(addr);
-#endif
-
- do {
- ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
- len++;
- } while (c && ret == 0 && len < MAX_STRING_SIZE);
-
- return (ret < 0) ? ret : len;
+ return kern_fetch_store_strlen(addr);
}
/*
@@ -485,21 +475,7 @@ fetch_store_strlen(unsigned long addr)
static nokprobe_inline int
fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
- const void __user *uaddr = (__force const void __user *)addr;
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
+ return kern_fetch_store_string_user(addr, dest, base);
}
/*
@@ -509,29 +485,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
static nokprobe_inline int
fetch_store_string(unsigned long addr, void *dest, void *base)
{
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)addr < TASK_SIZE)
- return fetch_store_string_user(addr, dest, base);
-#endif
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- /*
- * Try to get string again, since the string can be changed while
- * probing.
- */
- ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
+ return kern_fetch_store_string(addr, dest, base);
}
static nokprobe_inline int
@@ -610,6 +564,9 @@ static void eprobe_trigger_func(struct event_trigger_data *data,
{
struct eprobe_data *edata = data->private_data;
+ if (unlikely(!rec))
+ return;
+
__eprobe_trace_func(edata, rec);
}
@@ -664,14 +621,15 @@ static struct event_trigger_data *
new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
{
struct event_trigger_data *trigger;
+ struct event_filter *filter = NULL;
struct eprobe_data *edata;
+ int ret;
edata = kzalloc(sizeof(*edata), GFP_KERNEL);
trigger = kzalloc(sizeof(*trigger), GFP_KERNEL);
if (!trigger || !edata) {
- kfree(edata);
- kfree(trigger);
- return ERR_PTR(-ENOMEM);
+ ret = -ENOMEM;
+ goto error;
}
trigger->flags = EVENT_TRIGGER_FL_PROBE;
@@ -686,13 +644,25 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
trigger->cmd_ops = &event_trigger_cmd;
INIT_LIST_HEAD(&trigger->list);
- RCU_INIT_POINTER(trigger->filter, NULL);
+
+ if (ep->filter_str) {
+ ret = create_event_filter(file->tr, ep->event,
+ ep->filter_str, false, &filter);
+ if (ret)
+ goto error;
+ }
+ RCU_INIT_POINTER(trigger->filter, filter);
edata->file = file;
edata->ep = ep;
trigger->private_data = edata;
return trigger;
+error:
+ free_event_filter(filter);
+ kfree(edata);
+ kfree(trigger);
+ return ERR_PTR(ret);
}
static int enable_eprobe(struct trace_eprobe *ep,
@@ -726,6 +696,7 @@ static int disable_eprobe(struct trace_eprobe *ep,
{
struct event_trigger_data *trigger = NULL, *iter;
struct trace_event_file *file;
+ struct event_filter *filter;
struct eprobe_data *edata;
file = find_event_file(tr, ep->event_system, ep->event_name);
@@ -752,6 +723,10 @@ static int disable_eprobe(struct trace_eprobe *ep,
/* Make sure nothing is using the edata or trigger */
tracepoint_synchronize_unregister();
+ filter = rcu_access_pointer(trigger->filter);
+
+ if (filter)
+ free_event_filter(filter);
kfree(edata);
kfree(trigger);
@@ -927,12 +902,62 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[
return ret;
}
+static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[])
+{
+ struct event_filter *dummy = NULL;
+ int i, ret, len = 0;
+ char *p;
+
+ if (argc == 0) {
+ trace_probe_log_err(0, NO_EP_FILTER);
+ return -EINVAL;
+ }
+
+ /* Recover the filter string */
+ for (i = 0; i < argc; i++)
+ len += strlen(argv[i]) + 1;
+
+ ep->filter_str = kzalloc(len, GFP_KERNEL);
+ if (!ep->filter_str)
+ return -ENOMEM;
+
+ p = ep->filter_str;
+ for (i = 0; i < argc; i++) {
+ ret = snprintf(p, len, "%s ", argv[i]);
+ if (ret < 0)
+ goto error;
+ if (ret > len) {
+ ret = -E2BIG;
+ goto error;
+ }
+ p += ret;
+ len -= ret;
+ }
+ p[-1] = '\0';
+
+ /*
+ * Ensure the filter string can be parsed correctly. Note, this
+ * filter string is for the original event, not for the eprobe.
+ */
+ ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str,
+ true, &dummy);
+ free_event_filter(dummy);
+ if (ret)
+ goto error;
+
+ return 0;
+error:
+ kfree(ep->filter_str);
+ ep->filter_str = NULL;
+ return ret;
+}
+
static int __trace_eprobe_create(int argc, const char *argv[])
{
/*
* Argument syntax:
- * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS]
- * Fetch args:
+ * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] [if FILTER]
+ * Fetch args (no space):
* <name>=$<field>[:TYPE]
*/
const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
@@ -942,8 +967,8 @@ static int __trace_eprobe_create(int argc, const char *argv[])
char buf1[MAX_EVENT_NAME_LEN];
char buf2[MAX_EVENT_NAME_LEN];
char gbuf[MAX_EVENT_NAME_LEN];
- int ret = 0;
- int i;
+ int ret = 0, filter_idx = 0;
+ int i, filter_cnt;
if (argc < 2 || argv[0][0] != 'e')
return -ECANCELED;
@@ -968,11 +993,19 @@ static int __trace_eprobe_create(int argc, const char *argv[])
}
if (!event) {
- strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN);
- sanitize_event_name(buf1);
+ strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN);
event = buf1;
}
+ for (i = 2; i < argc; i++) {
+ if (!strcmp(argv[i], "if")) {
+ filter_idx = i + 1;
+ filter_cnt = argc - filter_idx;
+ argc = i;
+ break;
+ }
+ }
+
mutex_lock(&event_mutex);
event_call = find_and_get_event(sys_name, sys_event);
ep = alloc_event_probe(group, event, event_call, argc - 2);
@@ -988,6 +1021,14 @@ static int __trace_eprobe_create(int argc, const char *argv[])
goto error;
}
+ if (filter_idx) {
+ trace_probe_log_set_index(filter_idx);
+ ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx);
+ if (ret)
+ goto parse_error;
+ } else
+ ep->filter_str = NULL;
+
argc -= 2; argv += 2;
/* parse arguments */
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 61e3a2620fa3..05e791241812 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -251,16 +251,12 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
struct trace_event_call *tp_event;
if (p_event->attr.kprobe_func) {
- func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL);
- if (!func)
- return -ENOMEM;
- ret = strncpy_from_user(
- func, u64_to_user_ptr(p_event->attr.kprobe_func),
- KSYM_NAME_LEN);
- if (ret == KSYM_NAME_LEN)
- ret = -E2BIG;
- if (ret < 0)
- goto out;
+ func = strndup_user(u64_to_user_ptr(p_event->attr.kprobe_func),
+ KSYM_NAME_LEN);
+ if (IS_ERR(func)) {
+ ret = PTR_ERR(func);
+ return (ret == -EINVAL) ? -E2BIG : ret;
+ }
if (func[0] == '\0') {
kfree(func);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0356cae0cf74..33e0b4f8ebe6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2796,6 +2796,42 @@ trace_create_new_event(struct trace_event_call *call,
return file;
}
+#define MAX_BOOT_TRIGGERS 32
+
+static struct boot_triggers {
+ const char *event;
+ char *trigger;
+} bootup_triggers[MAX_BOOT_TRIGGERS];
+
+static char bootup_trigger_buf[COMMAND_LINE_SIZE];
+static int nr_boot_triggers;
+
+static __init int setup_trace_triggers(char *str)
+{
+ char *trigger;
+ char *buf;
+ int i;
+
+ strlcpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
+ ring_buffer_expanded = true;
+ disable_tracing_selftest("running event triggers");
+
+ buf = bootup_trigger_buf;
+ for (i = 0; i < MAX_BOOT_TRIGGERS; i++) {
+ trigger = strsep(&buf, ",");
+ if (!trigger)
+ break;
+ bootup_triggers[i].event = strsep(&trigger, ".");
+ bootup_triggers[i].trigger = strsep(&trigger, ".");
+ if (!bootup_triggers[i].trigger)
+ break;
+ }
+
+ nr_boot_triggers = i;
+ return 1;
+}
+__setup("trace_trigger=", setup_trace_triggers);
+
/* Add an event to a trace directory */
static int
__trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
@@ -2812,6 +2848,24 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
return event_define_fields(call);
}
+static void trace_early_triggers(struct trace_event_file *file, const char *name)
+{
+ int ret;
+ int i;
+
+ for (i = 0; i < nr_boot_triggers; i++) {
+ if (strcmp(name, bootup_triggers[i].event))
+ continue;
+ mutex_lock(&event_mutex);
+ ret = trigger_process_regex(file, bootup_triggers[i].trigger);
+ mutex_unlock(&event_mutex);
+ if (ret)
+ pr_err("Failed to register trigger '%s' on event %s\n",
+ bootup_triggers[i].trigger,
+ bootup_triggers[i].event);
+ }
+}
+
/*
* Just create a descriptor for early init. A descriptor is required
* for enabling events at boot. We want to enable events before
@@ -2822,12 +2876,19 @@ __trace_early_add_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
struct trace_event_file *file;
+ int ret;
file = trace_create_new_event(call, tr);
if (!file)
return -ENOMEM;
- return event_define_fields(call);
+ ret = event_define_fields(call);
+ if (ret)
+ return ret;
+
+ trace_early_triggers(file, trace_event_name(call));
+
+ return 0;
}
struct ftrace_module_file_ops;
@@ -2880,7 +2941,10 @@ static int probe_remove_event_call(struct trace_event_call *call)
* TRACE_REG_UNREGISTER.
*/
if (file->flags & EVENT_FILE_FL_ENABLED)
- return -EBUSY;
+ goto busy;
+
+ if (file->flags & EVENT_FILE_FL_WAS_ENABLED)
+ tr->clear_trace = true;
/*
* The do_for_each_event_file_safe() is
* a double loop. After finding the call for this
@@ -2893,6 +2957,12 @@ static int probe_remove_event_call(struct trace_event_call *call)
__trace_remove_event_call(call);
return 0;
+ busy:
+ /* No need to clear the trace now */
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ tr->clear_trace = false;
+ }
+ return -EBUSY;
}
/* Remove an event_call */
@@ -2972,7 +3042,7 @@ static void trace_module_remove_events(struct module *mod)
* over from this module may be passed to the new module events and
* unexpected results may occur.
*/
- tracing_reset_all_online_cpus();
+ tracing_reset_all_online_cpus_unlocked();
}
static int trace_module_notify(struct notifier_block *self,
@@ -3726,6 +3796,8 @@ static __init int event_trace_enable(void)
list_add(&call->list, &ftrace_events);
}
+ register_trigger_cmds();
+
/*
* We need the top trace array to have a working set of trace
* points at early init, before the debug files and directories
@@ -3740,7 +3812,6 @@ static __init int event_trace_enable(void)
register_event_cmds();
- register_trigger_cmds();
return 0;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 4b1057ab9d96..96acc2b71ac7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -43,6 +43,42 @@ enum filter_op_ids { OPS };
static const char * ops[] = { OPS };
+enum filter_pred_fn {
+ FILTER_PRED_FN_NOP,
+ FILTER_PRED_FN_64,
+ FILTER_PRED_FN_S64,
+ FILTER_PRED_FN_U64,
+ FILTER_PRED_FN_32,
+ FILTER_PRED_FN_S32,
+ FILTER_PRED_FN_U32,
+ FILTER_PRED_FN_16,
+ FILTER_PRED_FN_S16,
+ FILTER_PRED_FN_U16,
+ FILTER_PRED_FN_8,
+ FILTER_PRED_FN_S8,
+ FILTER_PRED_FN_U8,
+ FILTER_PRED_FN_COMM,
+ FILTER_PRED_FN_STRING,
+ FILTER_PRED_FN_STRLOC,
+ FILTER_PRED_FN_STRRELLOC,
+ FILTER_PRED_FN_PCHAR_USER,
+ FILTER_PRED_FN_PCHAR,
+ FILTER_PRED_FN_CPU,
+ FILTER_PRED_FN_,
+ FILTER_PRED_TEST_VISITED,
+};
+
+struct filter_pred {
+ enum filter_pred_fn fn_num;
+ u64 val;
+ struct regex regex;
+ unsigned short *ops;
+ struct ftrace_event_field *field;
+ int offset;
+ int not;
+ int op;
+};
+
/*
* pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND
* pred_funcs_##type below must match the order of them above.
@@ -590,44 +626,48 @@ out_free:
return ERR_PTR(ret);
}
+enum pred_cmp_types {
+ PRED_CMP_TYPE_NOP,
+ PRED_CMP_TYPE_LT,
+ PRED_CMP_TYPE_LE,
+ PRED_CMP_TYPE_GT,
+ PRED_CMP_TYPE_GE,
+ PRED_CMP_TYPE_BAND,
+};
+
#define DEFINE_COMPARISON_PRED(type) \
-static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \
-{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return *addr < val; \
-} \
-static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \
-{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return *addr <= val; \
-} \
-static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \
+static int filter_pred_##type(struct filter_pred *pred, void *event) \
{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return *addr > val; \
-} \
-static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \
-{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return *addr >= val; \
-} \
-static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \
-{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return !!(*addr & val); \
-} \
-static const filter_pred_fn_t pred_funcs_##type[] = { \
- filter_pred_LE_##type, \
- filter_pred_LT_##type, \
- filter_pred_GE_##type, \
- filter_pred_GT_##type, \
- filter_pred_BAND_##type, \
-};
+ switch (pred->op) { \
+ case OP_LT: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return *addr < val; \
+ } \
+ case OP_LE: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return *addr <= val; \
+ } \
+ case OP_GT: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return *addr > val; \
+ } \
+ case OP_GE: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return *addr >= val; \
+ } \
+ case OP_BAND: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return !!(*addr & val); \
+ } \
+ default: \
+ return 0; \
+ } \
+}
#define DEFINE_EQUALITY_PRED(size) \
static int filter_pred_##size(struct filter_pred *pred, void *event) \
@@ -836,11 +876,6 @@ static int filter_pred_comm(struct filter_pred *pred, void *event)
return cmp ^ pred->not;
}
-static int filter_pred_none(struct filter_pred *pred, void *event)
-{
- return 0;
-}
-
/*
* regex_match_foo - Basic regex callbacks
*
@@ -986,6 +1021,19 @@ static void filter_build_regex(struct filter_pred *pred)
}
}
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+static int test_pred_visited_fn(struct filter_pred *pred, void *event);
+#else
+static int test_pred_visited_fn(struct filter_pred *pred, void *event)
+{
+ return 0;
+}
+#endif
+
+
+static int filter_pred_fn_call(struct filter_pred *pred, void *event);
+
/* return 1 if event matches, 0 otherwise (discard) */
int filter_match_preds(struct event_filter *filter, void *rec)
{
@@ -1003,7 +1051,7 @@ int filter_match_preds(struct event_filter *filter, void *rec)
for (i = 0; prog[i].pred; i++) {
struct filter_pred *pred = prog[i].pred;
- int match = pred->fn(pred, rec);
+ int match = filter_pred_fn_call(pred, rec);
if (match == prog[i].when_to_branch)
i = prog[i].target;
}
@@ -1189,10 +1237,10 @@ int filter_assign_type(const char *type)
return FILTER_OTHER;
}
-static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
- int field_size, int field_is_signed)
+static enum filter_pred_fn select_comparison_fn(enum filter_op_ids op,
+ int field_size, int field_is_signed)
{
- filter_pred_fn_t fn = NULL;
+ enum filter_pred_fn fn = FILTER_PRED_FN_NOP;
int pred_func_index = -1;
switch (op) {
@@ -1201,50 +1249,99 @@ static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
break;
default:
if (WARN_ON_ONCE(op < PRED_FUNC_START))
- return NULL;
+ return fn;
pred_func_index = op - PRED_FUNC_START;
if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX))
- return NULL;
+ return fn;
}
switch (field_size) {
case 8:
if (pred_func_index < 0)
- fn = filter_pred_64;
+ fn = FILTER_PRED_FN_64;
else if (field_is_signed)
- fn = pred_funcs_s64[pred_func_index];
+ fn = FILTER_PRED_FN_S64;
else
- fn = pred_funcs_u64[pred_func_index];
+ fn = FILTER_PRED_FN_U64;
break;
case 4:
if (pred_func_index < 0)
- fn = filter_pred_32;
+ fn = FILTER_PRED_FN_32;
else if (field_is_signed)
- fn = pred_funcs_s32[pred_func_index];
+ fn = FILTER_PRED_FN_S32;
else
- fn = pred_funcs_u32[pred_func_index];
+ fn = FILTER_PRED_FN_U32;
break;
case 2:
if (pred_func_index < 0)
- fn = filter_pred_16;
+ fn = FILTER_PRED_FN_16;
else if (field_is_signed)
- fn = pred_funcs_s16[pred_func_index];
+ fn = FILTER_PRED_FN_S16;
else
- fn = pred_funcs_u16[pred_func_index];
+ fn = FILTER_PRED_FN_U16;
break;
case 1:
if (pred_func_index < 0)
- fn = filter_pred_8;
+ fn = FILTER_PRED_FN_8;
else if (field_is_signed)
- fn = pred_funcs_s8[pred_func_index];
+ fn = FILTER_PRED_FN_S8;
else
- fn = pred_funcs_u8[pred_func_index];
+ fn = FILTER_PRED_FN_U8;
break;
}
return fn;
}
+
+static int filter_pred_fn_call(struct filter_pred *pred, void *event)
+{
+ switch (pred->fn_num) {
+ case FILTER_PRED_FN_64:
+ return filter_pred_64(pred, event);
+ case FILTER_PRED_FN_S64:
+ return filter_pred_s64(pred, event);
+ case FILTER_PRED_FN_U64:
+ return filter_pred_u64(pred, event);
+ case FILTER_PRED_FN_32:
+ return filter_pred_32(pred, event);
+ case FILTER_PRED_FN_S32:
+ return filter_pred_s32(pred, event);
+ case FILTER_PRED_FN_U32:
+ return filter_pred_u32(pred, event);
+ case FILTER_PRED_FN_16:
+ return filter_pred_16(pred, event);
+ case FILTER_PRED_FN_S16:
+ return filter_pred_s16(pred, event);
+ case FILTER_PRED_FN_U16:
+ return filter_pred_u16(pred, event);
+ case FILTER_PRED_FN_8:
+ return filter_pred_8(pred, event);
+ case FILTER_PRED_FN_S8:
+ return filter_pred_s8(pred, event);
+ case FILTER_PRED_FN_U8:
+ return filter_pred_u8(pred, event);
+ case FILTER_PRED_FN_COMM:
+ return filter_pred_comm(pred, event);
+ case FILTER_PRED_FN_STRING:
+ return filter_pred_string(pred, event);
+ case FILTER_PRED_FN_STRLOC:
+ return filter_pred_strloc(pred, event);
+ case FILTER_PRED_FN_STRRELLOC:
+ return filter_pred_strrelloc(pred, event);
+ case FILTER_PRED_FN_PCHAR_USER:
+ return filter_pred_pchar_user(pred, event);
+ case FILTER_PRED_FN_PCHAR:
+ return filter_pred_pchar(pred, event);
+ case FILTER_PRED_FN_CPU:
+ return filter_pred_cpu(pred, event);
+ case FILTER_PRED_TEST_VISITED:
+ return test_pred_visited_fn(pred, event);
+ default:
+ return 0;
+ }
+}
+
/* Called when a predicate is encountered by predicate_parse() */
static int parse_pred(const char *str, void *data,
int pos, struct filter_parse_error *pe,
@@ -1338,7 +1435,7 @@ static int parse_pred(const char *str, void *data,
parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
goto err_free;
}
- pred->fn = filter_pred_none;
+ pred->fn_num = FILTER_PRED_FN_NOP;
/*
* Quotes are not required, but if they exist then we need
@@ -1416,16 +1513,16 @@ static int parse_pred(const char *str, void *data,
filter_build_regex(pred);
if (field->filter_type == FILTER_COMM) {
- pred->fn = filter_pred_comm;
+ pred->fn_num = FILTER_PRED_FN_COMM;
} else if (field->filter_type == FILTER_STATIC_STRING) {
- pred->fn = filter_pred_string;
+ pred->fn_num = FILTER_PRED_FN_STRING;
pred->regex.field_len = field->size;
} else if (field->filter_type == FILTER_DYN_STRING) {
- pred->fn = filter_pred_strloc;
+ pred->fn_num = FILTER_PRED_FN_STRLOC;
} else if (field->filter_type == FILTER_RDYN_STRING)
- pred->fn = filter_pred_strrelloc;
+ pred->fn_num = FILTER_PRED_FN_STRRELLOC;
else {
if (!ustring_per_cpu) {
@@ -1436,9 +1533,9 @@ static int parse_pred(const char *str, void *data,
}
if (ustring)
- pred->fn = filter_pred_pchar_user;
+ pred->fn_num = FILTER_PRED_FN_PCHAR_USER;
else
- pred->fn = filter_pred_pchar;
+ pred->fn_num = FILTER_PRED_FN_PCHAR;
}
/* go past the last quote */
i++;
@@ -1486,10 +1583,10 @@ static int parse_pred(const char *str, void *data,
pred->val = val;
if (field->filter_type == FILTER_CPU)
- pred->fn = filter_pred_cpu;
+ pred->fn_num = FILTER_PRED_FN_CPU;
else {
- pred->fn = select_comparison_fn(pred->op, field->size,
- field->is_signed);
+ pred->fn_num = select_comparison_fn(pred->op, field->size,
+ field->is_signed);
if (pred->op == OP_NE)
pred->not = 1;
}
@@ -2296,7 +2393,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
struct filter_pred *pred = prog[i].pred;
struct ftrace_event_field *field = pred->field;
- WARN_ON_ONCE(!pred->fn);
+ WARN_ON_ONCE(pred->fn_num == FILTER_PRED_FN_NOP);
if (!field) {
WARN_ONCE(1, "all leafs should have field defined %d", i);
@@ -2306,7 +2403,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
if (!strchr(fields, *field->name))
continue;
- pred->fn = test_pred_visited_fn;
+ pred->fn_num = FILTER_PRED_TEST_VISITED;
}
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index fdf784620c28..fcaf226b7744 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -69,7 +69,8 @@
C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \
C(EXPECT_NUMBER, "Expecting numeric literal"), \
C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \
- C(DIVISION_BY_ZERO, "Division by zero"),
+ C(DIVISION_BY_ZERO, "Division by zero"), \
+ C(NEED_NOHC_VAL, "Non-hitcount value is required for 'nohitcount'"),
#undef C
#define C(a, b) HIST_ERR_##a
@@ -104,6 +105,38 @@ enum field_op_id {
FIELD_OP_MULT,
};
+enum hist_field_fn {
+ HIST_FIELD_FN_NOP,
+ HIST_FIELD_FN_VAR_REF,
+ HIST_FIELD_FN_COUNTER,
+ HIST_FIELD_FN_CONST,
+ HIST_FIELD_FN_LOG2,
+ HIST_FIELD_FN_BUCKET,
+ HIST_FIELD_FN_TIMESTAMP,
+ HIST_FIELD_FN_CPU,
+ HIST_FIELD_FN_STRING,
+ HIST_FIELD_FN_DYNSTRING,
+ HIST_FIELD_FN_RELDYNSTRING,
+ HIST_FIELD_FN_PSTRING,
+ HIST_FIELD_FN_S64,
+ HIST_FIELD_FN_U64,
+ HIST_FIELD_FN_S32,
+ HIST_FIELD_FN_U32,
+ HIST_FIELD_FN_S16,
+ HIST_FIELD_FN_U16,
+ HIST_FIELD_FN_S8,
+ HIST_FIELD_FN_U8,
+ HIST_FIELD_FN_UMINUS,
+ HIST_FIELD_FN_MINUS,
+ HIST_FIELD_FN_PLUS,
+ HIST_FIELD_FN_DIV,
+ HIST_FIELD_FN_MULT,
+ HIST_FIELD_FN_DIV_POWER2,
+ HIST_FIELD_FN_DIV_NOT_POWER2,
+ HIST_FIELD_FN_DIV_MULT_SHIFT,
+ HIST_FIELD_FN_EXECNAME,
+};
+
/*
* A hist_var (histogram variable) contains variable information for
* hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF
@@ -123,15 +156,15 @@ struct hist_var {
struct hist_field {
struct ftrace_event_field *field;
unsigned long flags;
- hist_field_fn_t fn;
- unsigned int ref;
- unsigned int size;
- unsigned int offset;
- unsigned int is_signed;
unsigned long buckets;
const char *type;
struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
struct hist_trigger_data *hist_data;
+ enum hist_field_fn fn_num;
+ unsigned int ref;
+ unsigned int size;
+ unsigned int offset;
+ unsigned int is_signed;
/*
* Variable fields contain variable-specific info in var.
@@ -166,14 +199,11 @@ struct hist_field {
u64 div_multiplier;
};
-static u64 hist_field_none(struct hist_field *field,
- struct tracing_map_elt *elt,
- struct trace_buffer *buffer,
- struct ring_buffer_event *rbe,
- void *event)
-{
- return 0;
-}
+static u64 hist_fn_call(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event);
static u64 hist_field_const(struct hist_field *field,
struct tracing_map_elt *elt,
@@ -250,7 +280,7 @@ static u64 hist_field_log2(struct hist_field *hist_field,
{
struct hist_field *operand = hist_field->operands[0];
- u64 val = operand->fn(operand, elt, buffer, rbe, event);
+ u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
return (u64) ilog2(roundup_pow_of_two(val));
}
@@ -264,7 +294,7 @@ static u64 hist_field_bucket(struct hist_field *hist_field,
struct hist_field *operand = hist_field->operands[0];
unsigned long buckets = hist_field->buckets;
- u64 val = operand->fn(operand, elt, buffer, rbe, event);
+ u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
if (WARN_ON_ONCE(!buckets))
return val;
@@ -285,8 +315,8 @@ static u64 hist_field_plus(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+ u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
return val1 + val2;
}
@@ -300,8 +330,8 @@ static u64 hist_field_minus(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+ u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
return val1 - val2;
}
@@ -315,8 +345,8 @@ static u64 hist_field_div(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+ u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
/* Return -1 for the undefined case */
if (!val2)
@@ -338,7 +368,7 @@ static u64 div_by_power_of_two(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
return val1 >> __ffs64(operand2->constant);
}
@@ -352,7 +382,7 @@ static u64 div_by_not_power_of_two(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
return div64_u64(val1, operand2->constant);
}
@@ -366,7 +396,7 @@ static u64 div_by_mult_and_shift(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
/*
* If the divisor is a constant, do a multiplication and shift instead.
@@ -400,8 +430,8 @@ static u64 hist_field_mult(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+ u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
return val1 * val2;
}
@@ -414,7 +444,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field,
{
struct hist_field *operand = hist_field->operands[0];
- s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event);
+ s64 sval = (s64)hist_fn_call(operand, elt, buffer, rbe, event);
u64 val = (u64)-sval;
return val;
@@ -477,6 +507,8 @@ enum hist_field_flags {
HIST_FIELD_FL_ALIAS = 1 << 16,
HIST_FIELD_FL_BUCKET = 1 << 17,
HIST_FIELD_FL_CONST = 1 << 18,
+ HIST_FIELD_FL_PERCENT = 1 << 19,
+ HIST_FIELD_FL_GRAPH = 1 << 20,
};
struct var_defs {
@@ -495,6 +527,7 @@ struct hist_trigger_attrs {
bool cont;
bool clear;
bool ts_in_usecs;
+ bool no_hitcount;
unsigned int map_bits;
char *assignment_str[TRACING_MAP_VARS_MAX];
@@ -588,7 +621,7 @@ struct action_data {
* event param, and is passed to the synthetic event
* invocation.
*/
- unsigned int var_ref_idx[TRACING_MAP_VARS_MAX];
+ unsigned int var_ref_idx[SYNTH_FIELDS_MAX];
struct synth_event *synth_event;
bool use_trace_keyword;
char *synth_event_name;
@@ -657,19 +690,19 @@ struct snapshot_context {
* Returns the specific division function to use if the divisor
* is constant. This avoids extra branches when the trigger is hit.
*/
-static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor)
+static enum hist_field_fn hist_field_get_div_fn(struct hist_field *divisor)
{
u64 div = divisor->constant;
if (!(div & (div - 1)))
- return div_by_power_of_two;
+ return HIST_FIELD_FN_DIV_POWER2;
/* If the divisor is too large, do a regular division */
if (div > (1 << HIST_DIV_SHIFT))
- return div_by_not_power_of_two;
+ return HIST_FIELD_FN_DIV_NOT_POWER2;
divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div);
- return div_by_mult_and_shift;
+ return HIST_FIELD_FN_DIV_MULT_SHIFT;
}
static void track_data_free(struct track_data *track_data)
@@ -954,7 +987,7 @@ static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
* A trigger can define one or more variables. If any one of them is
* currently referenced by any other trigger, this function will
* determine that.
-
+ *
* Typically used to determine whether or not a trigger can be removed
* - if there are any references to a trigger's variables, it cannot.
*
@@ -1327,6 +1360,8 @@ static const char *hist_field_name(struct hist_field *field,
field_name = field->name;
} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
field_name = "common_timestamp";
+ else if (field->flags & HIST_FIELD_FL_HITCOUNT)
+ field_name = "hitcount";
if (field_name == NULL)
field_name = "";
@@ -1334,38 +1369,32 @@ static const char *hist_field_name(struct hist_field *field,
return field_name;
}
-static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
+static enum hist_field_fn select_value_fn(int field_size, int field_is_signed)
{
- hist_field_fn_t fn = NULL;
-
switch (field_size) {
case 8:
if (field_is_signed)
- fn = hist_field_s64;
+ return HIST_FIELD_FN_S64;
else
- fn = hist_field_u64;
- break;
+ return HIST_FIELD_FN_U64;
case 4:
if (field_is_signed)
- fn = hist_field_s32;
+ return HIST_FIELD_FN_S32;
else
- fn = hist_field_u32;
- break;
+ return HIST_FIELD_FN_U32;
case 2:
if (field_is_signed)
- fn = hist_field_s16;
+ return HIST_FIELD_FN_S16;
else
- fn = hist_field_u16;
- break;
+ return HIST_FIELD_FN_U16;
case 1:
if (field_is_signed)
- fn = hist_field_s8;
+ return HIST_FIELD_FN_S8;
else
- fn = hist_field_u8;
- break;
+ return HIST_FIELD_FN_U8;
}
- return fn;
+ return HIST_FIELD_FN_NOP;
}
static int parse_map_size(char *str)
@@ -1523,7 +1552,10 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str)
ret = parse_assignment(tr, str, attrs);
if (ret)
goto free;
- } else if (strcmp(str, "pause") == 0)
+ } else if (strcmp(str, "nohitcount") == 0 ||
+ strcmp(str, "NOHC") == 0)
+ attrs->no_hitcount = true;
+ else if (strcmp(str, "pause") == 0)
attrs->pause = true;
else if ((strcmp(str, "cont") == 0) ||
(strcmp(str, "continue") == 0))
@@ -1682,6 +1714,10 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
flags_str = "buckets";
else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
flags_str = "usecs";
+ else if (hist_field->flags & HIST_FIELD_FL_PERCENT)
+ flags_str = "percent";
+ else if (hist_field->flags & HIST_FIELD_FL_GRAPH)
+ flags_str = "graph";
return flags_str;
}
@@ -1922,19 +1958,19 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
goto out; /* caller will populate */
if (flags & HIST_FIELD_FL_VAR_REF) {
- hist_field->fn = hist_field_var_ref;
+ hist_field->fn_num = HIST_FIELD_FN_VAR_REF;
goto out;
}
if (flags & HIST_FIELD_FL_HITCOUNT) {
- hist_field->fn = hist_field_counter;
+ hist_field->fn_num = HIST_FIELD_FN_COUNTER;
hist_field->size = sizeof(u64);
hist_field->type = "u64";
goto out;
}
if (flags & HIST_FIELD_FL_CONST) {
- hist_field->fn = hist_field_const;
+ hist_field->fn_num = HIST_FIELD_FN_CONST;
hist_field->size = sizeof(u64);
hist_field->type = kstrdup("u64", GFP_KERNEL);
if (!hist_field->type)
@@ -1943,14 +1979,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
}
if (flags & HIST_FIELD_FL_STACKTRACE) {
- hist_field->fn = hist_field_none;
+ hist_field->fn_num = HIST_FIELD_FN_NOP;
goto out;
}
if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) {
unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET);
- hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 :
- hist_field_bucket;
+ hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 :
+ HIST_FIELD_FN_BUCKET;
hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
hist_field->size = hist_field->operands[0]->size;
hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
@@ -1960,14 +1996,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
}
if (flags & HIST_FIELD_FL_TIMESTAMP) {
- hist_field->fn = hist_field_timestamp;
+ hist_field->fn_num = HIST_FIELD_FN_TIMESTAMP;
hist_field->size = sizeof(u64);
hist_field->type = "u64";
goto out;
}
if (flags & HIST_FIELD_FL_CPU) {
- hist_field->fn = hist_field_cpu;
+ hist_field->fn_num = HIST_FIELD_FN_CPU;
hist_field->size = sizeof(int);
hist_field->type = "unsigned int";
goto out;
@@ -1987,14 +2023,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
goto free;
if (field->filter_type == FILTER_STATIC_STRING) {
- hist_field->fn = hist_field_string;
+ hist_field->fn_num = HIST_FIELD_FN_STRING;
hist_field->size = field->size;
} else if (field->filter_type == FILTER_DYN_STRING) {
- hist_field->fn = hist_field_dynstring;
+ hist_field->fn_num = HIST_FIELD_FN_DYNSTRING;
} else if (field->filter_type == FILTER_RDYN_STRING)
- hist_field->fn = hist_field_reldynstring;
+ hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING;
else
- hist_field->fn = hist_field_pstring;
+ hist_field->fn_num = HIST_FIELD_FN_PSTRING;
} else {
hist_field->size = field->size;
hist_field->is_signed = field->is_signed;
@@ -2002,9 +2038,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (!hist_field->type)
goto free;
- hist_field->fn = select_value_fn(field->size,
- field->is_signed);
- if (!hist_field->fn) {
+ hist_field->fn_num = select_value_fn(field->size,
+ field->is_signed);
+ if (hist_field->fn_num == HIST_FIELD_FN_NOP) {
destroy_hist_field(hist_field, 0);
return NULL;
}
@@ -2150,7 +2186,9 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data,
return ref_field;
}
}
-
+ /* Sanity check to avoid out-of-bound write on 'hist_data->var_refs' */
+ if (hist_data->n_var_refs >= TRACING_MAP_VARS_MAX)
+ return NULL;
ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
if (ref_field) {
if (init_var_ref(ref_field, var_field, system, event_name)) {
@@ -2290,6 +2328,14 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
if (ret || !(*buckets))
goto error;
*flags |= HIST_FIELD_FL_BUCKET;
+ } else if (strncmp(modifier, "percent", 7) == 0) {
+ if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY))
+ goto error;
+ *flags |= HIST_FIELD_FL_PERCENT;
+ } else if (strncmp(modifier, "graph", 5) == 0) {
+ if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY))
+ goto error;
+ *flags |= HIST_FIELD_FL_GRAPH;
} else {
error:
hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier));
@@ -2305,6 +2351,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
hist_data->attrs->ts_in_usecs = true;
} else if (strcmp(field_name, "common_cpu") == 0)
*flags |= HIST_FIELD_FL_CPU;
+ else if (strcmp(field_name, "hitcount") == 0)
+ *flags |= HIST_FIELD_FL_HITCOUNT;
else {
field = trace_find_event_field(file->event_call, field_name);
if (!field || !field->size) {
@@ -2340,7 +2388,7 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
if (!alias)
return NULL;
- alias->fn = var_ref->fn;
+ alias->fn_num = var_ref->fn_num;
alias->operands[0] = var_ref;
if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
@@ -2523,7 +2571,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
expr->flags |= operand1->flags &
(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
- expr->fn = hist_field_unary_minus;
+ expr->fn_num = HIST_FIELD_FN_UMINUS;
expr->operands[0] = operand1;
expr->size = operand1->size;
expr->is_signed = operand1->is_signed;
@@ -2595,7 +2643,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
unsigned long operand_flags, operand2_flags;
int field_op, ret = -EINVAL;
char *sep, *operand1_str;
- hist_field_fn_t op_fn;
+ enum hist_field_fn op_fn;
bool combine_consts;
if (*n_subexprs > 3) {
@@ -2654,16 +2702,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
switch (field_op) {
case FIELD_OP_MINUS:
- op_fn = hist_field_minus;
+ op_fn = HIST_FIELD_FN_MINUS;
break;
case FIELD_OP_PLUS:
- op_fn = hist_field_plus;
+ op_fn = HIST_FIELD_FN_PLUS;
break;
case FIELD_OP_DIV:
- op_fn = hist_field_div;
+ op_fn = HIST_FIELD_FN_DIV;
break;
case FIELD_OP_MULT:
- op_fn = hist_field_mult;
+ op_fn = HIST_FIELD_FN_MULT;
break;
default:
ret = -EINVAL;
@@ -2719,13 +2767,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
op_fn = hist_field_get_div_fn(operand2);
}
+ expr->fn_num = op_fn;
+
if (combine_consts) {
if (var1)
expr->operands[0] = var1;
if (var2)
expr->operands[1] = var2;
- expr->constant = op_fn(expr, NULL, NULL, NULL, NULL);
+ expr->constant = hist_fn_call(expr, NULL, NULL, NULL, NULL);
+ expr->fn_num = HIST_FIELD_FN_CONST;
expr->operands[0] = NULL;
expr->operands[1] = NULL;
@@ -2739,8 +2790,6 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
expr->name = expr_str(expr, 0);
} else {
- expr->fn = op_fn;
-
/* The operand sizes should be the same, so just pick one */
expr->size = operand1->size;
expr->is_signed = operand1->is_signed;
@@ -3065,7 +3114,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
struct hist_field *var = field_var->var;
struct hist_field *val = field_var->val;
- var_val = val->fn(val, elt, buffer, rbe, rec);
+ var_val = hist_fn_call(val, elt, buffer, rbe, rec);
var_idx = var->var.idx;
if (val->flags & HIST_FIELD_FL_STRING) {
@@ -3202,7 +3251,7 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
* events. However, for convenience, users are allowed to directly
* specify an event field in an action, which will be automatically
* converted into a variable on their behalf.
-
+ *
* This function creates a field variable with the name var_name on
* the hist trigger currently being defined on the target event. If
* subsys_name and event_name are specified, this function simply
@@ -3562,6 +3611,7 @@ static int parse_action_params(struct trace_array *tr, char *params,
while (params) {
if (data->n_params >= SYNTH_FIELDS_MAX) {
hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0);
+ ret = -EINVAL;
goto out;
}
@@ -3898,6 +3948,10 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
lockdep_assert_held(&event_mutex);
+ /* Sanity check to avoid out-of-bound write on 'data->var_ref_idx' */
+ if (data->n_params > SYNTH_FIELDS_MAX)
+ return -EINVAL;
+
if (data->use_trace_keyword)
synth_event_name = data->synth_event_name;
else
@@ -4186,6 +4240,74 @@ static u64 hist_field_execname(struct hist_field *hist_field,
return (u64)(unsigned long)(elt_data->comm);
}
+static u64 hist_fn_call(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ switch (hist_field->fn_num) {
+ case HIST_FIELD_FN_VAR_REF:
+ return hist_field_var_ref(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_COUNTER:
+ return hist_field_counter(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_CONST:
+ return hist_field_const(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_LOG2:
+ return hist_field_log2(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_BUCKET:
+ return hist_field_bucket(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_TIMESTAMP:
+ return hist_field_timestamp(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_CPU:
+ return hist_field_cpu(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_STRING:
+ return hist_field_string(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DYNSTRING:
+ return hist_field_dynstring(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_RELDYNSTRING:
+ return hist_field_reldynstring(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_PSTRING:
+ return hist_field_pstring(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_S64:
+ return hist_field_s64(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_U64:
+ return hist_field_u64(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_S32:
+ return hist_field_s32(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_U32:
+ return hist_field_u32(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_S16:
+ return hist_field_s16(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_U16:
+ return hist_field_u16(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_S8:
+ return hist_field_s8(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_U8:
+ return hist_field_u8(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_UMINUS:
+ return hist_field_unary_minus(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_MINUS:
+ return hist_field_minus(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_PLUS:
+ return hist_field_plus(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DIV:
+ return hist_field_div(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_MULT:
+ return hist_field_mult(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DIV_POWER2:
+ return div_by_power_of_two(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DIV_NOT_POWER2:
+ return div_by_not_power_of_two(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DIV_MULT_SHIFT:
+ return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_EXECNAME:
+ return hist_field_execname(hist_field, elt, buffer, rbe, event);
+ default:
+ return 0;
+ }
+}
+
/* Convert a var that points to common_pid.execname to a string */
static void update_var_execname(struct hist_field *hist_field)
{
@@ -4197,7 +4319,7 @@ static void update_var_execname(struct hist_field *hist_field)
kfree_const(hist_field->type);
hist_field->type = "char[]";
- hist_field->fn = hist_field_execname;
+ hist_field->fn_num = HIST_FIELD_FN_EXECNAME;
}
static int create_var_field(struct hist_trigger_data *hist_data,
@@ -4236,8 +4358,8 @@ static int create_var_field(struct hist_trigger_data *hist_data,
static int create_val_fields(struct hist_trigger_data *hist_data,
struct trace_event_file *file)
{
+ unsigned int i, j = 1, n_hitcount = 0;
char *fields_str, *field_str;
- unsigned int i, j = 1;
int ret;
ret = create_hitcount_val(hist_data);
@@ -4254,8 +4376,10 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
if (!field_str)
break;
- if (strcmp(field_str, "hitcount") == 0)
- continue;
+ if (strcmp(field_str, "hitcount") == 0) {
+ if (!n_hitcount++)
+ continue;
+ }
ret = create_val_field(hist_data, j++, file, field_str);
if (ret)
@@ -4265,6 +4389,12 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
if (fields_str && (strcmp(fields_str, "hitcount") != 0))
ret = -EINVAL;
out:
+ /* There is only raw hitcount but nohitcount suppresses it. */
+ if (j == 1 && hist_data->attrs->no_hitcount) {
+ hist_err(hist_data->event_file->tr, HIST_ERR_NEED_NOHC_VAL, 0);
+ ret = -ENOENT;
+ }
+
return ret;
}
@@ -4956,7 +5086,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
for_each_hist_val_field(i, hist_data) {
hist_field = hist_data->fields[i];
- hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
+ hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
if (hist_field->flags & HIST_FIELD_FL_VAR) {
var_idx = hist_field->var.idx;
@@ -4987,7 +5117,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
for_each_hist_key_field(i, hist_data) {
hist_field = hist_data->fields[i];
if (hist_field->flags & HIST_FIELD_FL_VAR) {
- hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
+ hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
var_idx = hist_field->var.idx;
tracing_map_set_var(elt, var_idx, hist_val);
}
@@ -5051,6 +5181,9 @@ static void event_hist_trigger(struct event_trigger_data *data,
void *key = NULL;
unsigned int i;
+ if (unlikely(!rbe))
+ return;
+
memset(compound_key, 0, hist_data->key_size);
for_each_hist_key_field(i, hist_data) {
@@ -5062,7 +5195,7 @@ static void event_hist_trigger(struct event_trigger_data *data,
HIST_STACKTRACE_SKIP);
key = entries;
} else {
- field_contents = key_field->fn(key_field, elt, buffer, rbe, rec);
+ field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec);
if (key_field->flags & HIST_FIELD_FL_STRING) {
key = (void *)(unsigned long)field_contents;
use_compound_key = true;
@@ -5190,33 +5323,101 @@ static void hist_trigger_print_key(struct seq_file *m,
seq_puts(m, "}");
}
+/* Get the 100 times of the percentage of @val in @total */
+static inline unsigned int __get_percentage(u64 val, u64 total)
+{
+ if (!total)
+ goto div0;
+
+ if (val < (U64_MAX / 10000))
+ return (unsigned int)div64_ul(val * 10000, total);
+
+ total = div64_u64(total, 10000);
+ if (!total)
+ goto div0;
+
+ return (unsigned int)div64_ul(val, total);
+div0:
+ return val ? UINT_MAX : 0;
+}
+
+#define BAR_CHAR '#'
+
+static inline const char *__fill_bar_str(char *buf, int size, u64 val, u64 max)
+{
+ unsigned int len = __get_percentage(val, max);
+ int i;
+
+ if (len == UINT_MAX) {
+ snprintf(buf, size, "[ERROR]");
+ return buf;
+ }
+
+ len = len * size / 10000;
+ for (i = 0; i < len && i < size; i++)
+ buf[i] = BAR_CHAR;
+ while (i < size)
+ buf[i++] = ' ';
+ buf[size] = '\0';
+
+ return buf;
+}
+
+struct hist_val_stat {
+ u64 max;
+ u64 total;
+};
+
+static void hist_trigger_print_val(struct seq_file *m, unsigned int idx,
+ const char *field_name, unsigned long flags,
+ struct hist_val_stat *stats,
+ struct tracing_map_elt *elt)
+{
+ u64 val = tracing_map_read_sum(elt, idx);
+ unsigned int pc;
+ char bar[21];
+
+ if (flags & HIST_FIELD_FL_PERCENT) {
+ pc = __get_percentage(val, stats[idx].total);
+ if (pc == UINT_MAX)
+ seq_printf(m, " %s (%%):[ERROR]", field_name);
+ else
+ seq_printf(m, " %s (%%): %3u.%02u", field_name,
+ pc / 100, pc % 100);
+ } else if (flags & HIST_FIELD_FL_GRAPH) {
+ seq_printf(m, " %s: %20s", field_name,
+ __fill_bar_str(bar, 20, val, stats[idx].max));
+ } else if (flags & HIST_FIELD_FL_HEX) {
+ seq_printf(m, " %s: %10llx", field_name, val);
+ } else {
+ seq_printf(m, " %s: %10llu", field_name, val);
+ }
+}
+
static void hist_trigger_entry_print(struct seq_file *m,
struct hist_trigger_data *hist_data,
+ struct hist_val_stat *stats,
void *key,
struct tracing_map_elt *elt)
{
const char *field_name;
- unsigned int i;
+ unsigned int i = HITCOUNT_IDX;
+ unsigned long flags;
hist_trigger_print_key(m, hist_data, key, elt);
- seq_printf(m, " hitcount: %10llu",
- tracing_map_read_sum(elt, HITCOUNT_IDX));
+ /* At first, show the raw hitcount if !nohitcount */
+ if (!hist_data->attrs->no_hitcount)
+ hist_trigger_print_val(m, i, "hitcount", 0, stats, elt);
for (i = 1; i < hist_data->n_vals; i++) {
field_name = hist_field_name(hist_data->fields[i], 0);
-
- if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR ||
- hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR)
+ flags = hist_data->fields[i]->flags;
+ if (flags & HIST_FIELD_FL_VAR || flags & HIST_FIELD_FL_EXPR)
continue;
- if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
- seq_printf(m, " %s: %10llx", field_name,
- tracing_map_read_sum(elt, i));
- } else {
- seq_printf(m, " %s: %10llu", field_name,
- tracing_map_read_sum(elt, i));
- }
+ seq_puts(m, " ");
+ hist_trigger_print_val(m, i, field_name, flags, stats, elt);
}
print_actions(m, hist_data, elt);
@@ -5229,7 +5430,9 @@ static int print_entries(struct seq_file *m,
{
struct tracing_map_sort_entry **sort_entries = NULL;
struct tracing_map *map = hist_data->map;
- int i, n_entries;
+ int i, j, n_entries;
+ struct hist_val_stat *stats = NULL;
+ u64 val;
n_entries = tracing_map_sort_entries(map, hist_data->sort_keys,
hist_data->n_sort_keys,
@@ -5237,11 +5440,34 @@ static int print_entries(struct seq_file *m,
if (n_entries < 0)
return n_entries;
+ /* Calculate the max and the total for each field if needed. */
+ for (j = 0; j < hist_data->n_vals; j++) {
+ if (!(hist_data->fields[j]->flags &
+ (HIST_FIELD_FL_PERCENT | HIST_FIELD_FL_GRAPH)))
+ continue;
+ if (!stats) {
+ stats = kcalloc(hist_data->n_vals, sizeof(*stats),
+ GFP_KERNEL);
+ if (!stats) {
+ n_entries = -ENOMEM;
+ goto out;
+ }
+ }
+ for (i = 0; i < n_entries; i++) {
+ val = tracing_map_read_sum(sort_entries[i]->elt, j);
+ stats[j].total += val;
+ if (stats[j].max < val)
+ stats[j].max = val;
+ }
+ }
+
for (i = 0; i < n_entries; i++)
- hist_trigger_entry_print(m, hist_data,
+ hist_trigger_entry_print(m, hist_data, stats,
sort_entries[i]->key,
sort_entries[i]->elt);
+ kfree(stats);
+out:
tracing_map_destroy_sort_entries(sort_entries, n_entries);
return n_entries;
@@ -5631,6 +5857,7 @@ static int event_hist_trigger_print(struct seq_file *m,
struct hist_trigger_data *hist_data = data->private_data;
struct hist_field *field;
bool have_var = false;
+ bool show_val = false;
unsigned int i;
seq_puts(m, HIST_PREFIX);
@@ -5661,12 +5888,16 @@ static int event_hist_trigger_print(struct seq_file *m,
continue;
}
- if (i == HITCOUNT_IDX)
+ if (i == HITCOUNT_IDX) {
+ if (hist_data->attrs->no_hitcount)
+ continue;
seq_puts(m, "hitcount");
- else {
- seq_puts(m, ",");
+ } else {
+ if (show_val)
+ seq_puts(m, ",");
hist_field_print(m, field);
}
+ show_val = true;
}
if (have_var) {
@@ -5717,6 +5948,8 @@ static int event_hist_trigger_print(struct seq_file *m,
seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
if (hist_data->enable_timestamps)
seq_printf(m, ":clock=%s", hist_data->attrs->clock);
+ if (hist_data->attrs->no_hitcount)
+ seq_puts(m, ":nohitcount");
print_actions_spec(m, hist_data);
@@ -6343,7 +6576,7 @@ enable:
if (se)
se->ref++;
out:
- if (ret == 0)
+ if (ret == 0 && glob[0])
hist_err_clear();
return ret;
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 5e8c07aef071..67592eed0be8 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -17,6 +17,8 @@
/* for gfp flag names */
#include <linux/trace_events.h>
#include <trace/events/mmflags.h>
+#include "trace_probe.h"
+#include "trace_probe_kernel.h"
#include "trace_synth.h"
@@ -409,6 +411,7 @@ static unsigned int trace_string(struct synth_trace_event *entry,
{
unsigned int len = 0;
char *str_field;
+ int ret;
if (is_dynamic) {
u32 data_offset;
@@ -417,19 +420,27 @@ static unsigned int trace_string(struct synth_trace_event *entry,
data_offset += event->n_u64 * sizeof(u64);
data_offset += data_size;
- str_field = (char *)entry + data_offset;
-
- len = strlen(str_val) + 1;
- strscpy(str_field, str_val, len);
+ len = kern_fetch_store_strlen((unsigned long)str_val);
data_offset |= len << 16;
*(u32 *)&entry->fields[*n_u64] = data_offset;
+ ret = kern_fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
+
(*n_u64)++;
} else {
str_field = (char *)&entry->fields[*n_u64];
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)str_val < TASK_SIZE)
+ ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX);
+ else
+#endif
+ ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX);
+
+ if (ret < 0)
+ strcpy(str_field, FAULT_STRING);
+
(*n_u64) += STR_VAR_LEN_MAX / sizeof(u64);
}
@@ -462,7 +473,7 @@ static notrace void trace_event_raw_event_synth(void *__data,
val_idx = var_ref_idx[field_pos];
str_val = (char *)(long)var_ref_vals[val_idx];
- len = strlen(str_val) + 1;
+ len = kern_fetch_store_strlen((unsigned long)str_val);
fields_size += len;
}
@@ -817,10 +828,9 @@ static int register_synth_event(struct synth_event *event)
}
ret = set_synth_event_print_fmt(call);
- if (ret < 0) {
+ /* unregister_trace_event() will be called inside */
+ if (ret < 0)
trace_remove_event_call(call);
- goto err;
- }
out:
return ret;
err:
@@ -1272,12 +1282,12 @@ static int __create_synth_event(const char *name, const char *raw_fields)
goto err_free_arg;
}
- fields[n_fields++] = field;
if (n_fields == SYNTH_FIELDS_MAX) {
synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0);
ret = -EINVAL;
goto err_free_arg;
}
+ fields[n_fields++] = field;
n_fields_this_loop++;
}
@@ -1415,7 +1425,6 @@ int synth_event_delete(const char *event_name)
mutex_unlock(&event_mutex);
if (mod) {
- mutex_lock(&trace_types_lock);
/*
* It is safest to reset the ring buffer if the module
* being unloaded registered any events that were
@@ -1427,7 +1436,6 @@ int synth_event_delete(const char *event_name)
* occur.
*/
tracing_reset_all_online_cpus();
- mutex_unlock(&trace_types_lock);
}
return ret;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 918730d74932..e535959939d3 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1067,7 +1067,14 @@ int set_trigger_filter(char *filter_str,
/* The filter is for the 'trigger' event, not the triggered event */
ret = create_event_filter(file->tr, file->event_call,
- filter_str, false, &filter);
+ filter_str, true, &filter);
+
+ /* Only enabled set_str for error handling */
+ if (filter) {
+ kfree(filter->filter_string);
+ filter->filter_string = NULL;
+ }
+
/*
* If create_event_filter() fails, filter still needs to be freed.
* Which the calling code will do with data->filter.
@@ -1078,8 +1085,14 @@ int set_trigger_filter(char *filter_str,
rcu_assign_pointer(data->filter, filter);
if (tmp) {
- /* Make sure the call is done with the filter */
- tracepoint_synchronize_unregister();
+ /*
+ * Make sure the call is done with the filter.
+ * It is possible that a filter could fail at boot up,
+ * and then this path will be called. Avoid the synchronization
+ * in that case.
+ */
+ if (system_state != SYSTEM_BOOTING)
+ tracepoint_synchronize_unregister();
free_event_filter(tmp);
}
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index a6621c52ce45..908e8a13c675 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -14,6 +14,7 @@
#include <linux/uio.h>
#include <linux/ioctl.h>
#include <linux/jhash.h>
+#include <linux/refcount.h>
#include <linux/trace_events.h>
#include <linux/tracefs.h>
#include <linux/types.h>
@@ -39,28 +40,69 @@
*/
#define MAX_PAGE_ORDER 0
#define MAX_PAGES (1 << MAX_PAGE_ORDER)
-#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
+#define MAX_BYTES (MAX_PAGES * PAGE_SIZE)
+#define MAX_EVENTS (MAX_BYTES * 8)
/* Limit how long of an event name plus args within the subsystem. */
#define MAX_EVENT_DESC 512
#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
#define MAX_FIELD_ARRAY_SIZE 1024
-#define MAX_FIELD_ARG_NAME 256
-static char *register_page_data;
+/*
+ * The MAP_STATUS_* macros are used for taking a index and determining the
+ * appropriate byte and the bit in the byte to set/reset for an event.
+ *
+ * The lower 3 bits of the index decide which bit to set.
+ * The remaining upper bits of the index decide which byte to use for the bit.
+ *
+ * This is used when an event has a probe attached/removed to reflect live
+ * status of the event wanting tracing or not to user-programs via shared
+ * memory maps.
+ */
+#define MAP_STATUS_BYTE(index) ((index) >> 3)
+#define MAP_STATUS_MASK(index) BIT((index) & 7)
+
+/*
+ * Internal bits (kernel side only) to keep track of connected probes:
+ * These are used when status is requested in text form about an event. These
+ * bits are compared against an internal byte on the event to determine which
+ * probes to print out to the user.
+ *
+ * These do not reflect the mapped bytes between the user and kernel space.
+ */
+#define EVENT_STATUS_FTRACE BIT(0)
+#define EVENT_STATUS_PERF BIT(1)
+#define EVENT_STATUS_OTHER BIT(7)
+
+/*
+ * Stores the pages, tables, and locks for a group of events.
+ * Each logical grouping of events has its own group, with a
+ * matching page for status checks within user programs. This
+ * allows for isolation of events to user programs by various
+ * means.
+ */
+struct user_event_group {
+ struct page *pages;
+ char *register_page_data;
+ char *system_name;
+ struct hlist_node node;
+ struct mutex reg_mutex;
+ DECLARE_HASHTABLE(register_table, 8);
+ DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+};
-static DEFINE_MUTEX(reg_mutex);
-static DEFINE_HASHTABLE(register_table, 4);
-static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+/* Group for init_user_ns mapping, top-most group */
+static struct user_event_group *init_group;
/*
* Stores per-event properties, as users register events
* within a file a user_event might be created if it does not
* already exist. These are globally used and their lifetime
* is tied to the refcnt member. These cannot go away until the
- * refcnt reaches zero.
+ * refcnt reaches one.
*/
struct user_event {
+ struct user_event_group *group;
struct tracepoint tracepoint;
struct trace_event_call call;
struct trace_event_class class;
@@ -68,10 +110,11 @@ struct user_event {
struct hlist_node node;
struct list_head fields;
struct list_head validators;
- atomic_t refcnt;
+ refcount_t refcnt;
int index;
int flags;
int min_size;
+ char status;
};
/*
@@ -86,6 +129,11 @@ struct user_event_refs {
struct user_event *events[];
};
+struct user_event_file_info {
+ struct user_event_group *group;
+ struct user_event_refs *refs;
+};
+
#define VALIDATOR_ENSURE_NULL (1 << 0)
#define VALIDATOR_REL (1 << 1)
@@ -98,7 +146,8 @@ struct user_event_validator {
typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
void *tpdata, bool *faulted);
-static int user_event_parse(char *name, char *args, char *flags,
+static int user_event_parse(struct user_event_group *group, char *name,
+ char *args, char *flags,
struct user_event **newuser);
static u32 user_event_key(char *name)
@@ -106,6 +155,144 @@ static u32 user_event_key(char *name)
return jhash(name, strlen(name), 0);
}
+static void set_page_reservations(char *pages, bool set)
+{
+ int page;
+
+ for (page = 0; page < MAX_PAGES; ++page) {
+ void *addr = pages + (PAGE_SIZE * page);
+
+ if (set)
+ SetPageReserved(virt_to_page(addr));
+ else
+ ClearPageReserved(virt_to_page(addr));
+ }
+}
+
+static void user_event_group_destroy(struct user_event_group *group)
+{
+ if (group->register_page_data)
+ set_page_reservations(group->register_page_data, false);
+
+ if (group->pages)
+ __free_pages(group->pages, MAX_PAGE_ORDER);
+
+ kfree(group->system_name);
+ kfree(group);
+}
+
+static char *user_event_group_system_name(struct user_namespace *user_ns)
+{
+ char *system_name;
+ int len = sizeof(USER_EVENTS_SYSTEM) + 1;
+
+ if (user_ns != &init_user_ns) {
+ /*
+ * Unexpected at this point:
+ * We only currently support init_user_ns.
+ * When we enable more, this will trigger a failure so log.
+ */
+ pr_warn("user_events: Namespace other than init_user_ns!\n");
+ return NULL;
+ }
+
+ system_name = kmalloc(len, GFP_KERNEL);
+
+ if (!system_name)
+ return NULL;
+
+ snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM);
+
+ return system_name;
+}
+
+static inline struct user_event_group
+*user_event_group_from_user_ns(struct user_namespace *user_ns)
+{
+ if (user_ns == &init_user_ns)
+ return init_group;
+
+ return NULL;
+}
+
+static struct user_event_group *current_user_event_group(void)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ struct user_event_group *group = NULL;
+
+ while (user_ns) {
+ group = user_event_group_from_user_ns(user_ns);
+
+ if (group)
+ break;
+
+ user_ns = user_ns->parent;
+ }
+
+ return group;
+}
+
+static struct user_event_group
+*user_event_group_create(struct user_namespace *user_ns)
+{
+ struct user_event_group *group;
+
+ group = kzalloc(sizeof(*group), GFP_KERNEL);
+
+ if (!group)
+ return NULL;
+
+ group->system_name = user_event_group_system_name(user_ns);
+
+ if (!group->system_name)
+ goto error;
+
+ group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
+
+ if (!group->pages)
+ goto error;
+
+ group->register_page_data = page_address(group->pages);
+
+ set_page_reservations(group->register_page_data, true);
+
+ /* Zero all bits beside 0 (which is reserved for failures) */
+ bitmap_zero(group->page_bitmap, MAX_EVENTS);
+ set_bit(0, group->page_bitmap);
+
+ mutex_init(&group->reg_mutex);
+ hash_init(group->register_table);
+
+ return group;
+error:
+ if (group)
+ user_event_group_destroy(group);
+
+ return NULL;
+};
+
+static __always_inline
+void user_event_register_set(struct user_event *user)
+{
+ int i = user->index;
+
+ user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
+}
+
+static __always_inline
+void user_event_register_clear(struct user_event *user)
+{
+ int i = user->index;
+
+ user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
+}
+
+static __always_inline __must_check
+bool user_event_last_ref(struct user_event *user)
+{
+ return refcount_read(&user->refcnt) == 1;
+}
+
static __always_inline __must_check
size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
{
@@ -141,7 +328,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call)
*
* Upon success user_event has its ref count increased by 1.
*/
-static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
+static int user_event_parse_cmd(struct user_event_group *group,
+ char *raw_command, struct user_event **newuser)
{
char *name = raw_command;
char *args = strpbrk(name, " ");
@@ -155,7 +343,7 @@ static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
if (flags)
*flags++ = '\0';
- return user_event_parse(name, args, flags, newuser);
+ return user_event_parse(group, name, args, flags, newuser);
}
static int user_field_array_size(const char *type)
@@ -277,7 +465,7 @@ static int user_event_add_field(struct user_event *user, const char *type,
goto add_field;
add_validator:
- if (strstr(type, "char") != 0)
+ if (strstr(type, "char") != NULL)
validator_flags |= VALIDATOR_ENSURE_NULL;
validator = kmalloc(sizeof(*validator), GFP_KERNEL);
@@ -458,7 +646,7 @@ static const char *user_field_format(const char *type)
return "%d";
if (strcmp(type, "unsigned char") == 0)
return "%u";
- if (strstr(type, "char[") != 0)
+ if (strstr(type, "char[") != NULL)
return "%s";
/* Unknown, likely struct, allowed treat as 64-bit */
@@ -479,10 +667,52 @@ static bool user_field_is_dyn_string(const char *type, const char **str_func)
return false;
check:
- return strstr(type, "char") != 0;
+ return strstr(type, "char") != NULL;
}
#define LEN_OR_ZERO (len ? len - pos : 0)
+static int user_dyn_field_set_string(int argc, const char **argv, int *iout,
+ char *buf, int len, bool *colon)
+{
+ int pos = 0, i = *iout;
+
+ *colon = false;
+
+ for (; i < argc; ++i) {
+ if (i != *iout)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", argv[i]);
+
+ if (strchr(argv[i], ';')) {
+ ++i;
+ *colon = true;
+ break;
+ }
+ }
+
+ /* Actual set, advance i */
+ if (len != 0)
+ *iout = i;
+
+ return pos + 1;
+}
+
+static int user_field_set_string(struct ftrace_event_field *field,
+ char *buf, int len, bool colon)
+{
+ int pos = 0;
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type);
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name);
+
+ if (colon)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ";");
+
+ return pos + 1;
+}
+
static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
{
struct ftrace_event_field *field, *next;
@@ -600,8 +830,8 @@ static int destroy_user_event(struct user_event *user)
dyn_event_remove(&user->devent);
- register_page_data[user->index] = 0;
- clear_bit(user->index, page_bitmap);
+ user_event_register_clear(user);
+ clear_bit(user->index, user->group->page_bitmap);
hash_del(&user->node);
user_event_destroy_validators(user);
@@ -612,16 +842,17 @@ static int destroy_user_event(struct user_event *user)
return ret;
}
-static struct user_event *find_user_event(char *name, u32 *outkey)
+static struct user_event *find_user_event(struct user_event_group *group,
+ char *name, u32 *outkey)
{
struct user_event *user;
u32 key = user_event_key(name);
*outkey = key;
- hash_for_each_possible(register_table, user, node, key)
+ hash_for_each_possible(group->register_table, user, node, key)
if (!strcmp(EVENT_NAME(user), name)) {
- atomic_inc(&user->refcnt);
+ refcount_inc(&user->refcnt);
return user;
}
@@ -779,7 +1010,12 @@ static void update_reg_page_for(struct user_event *user)
rcu_read_unlock_sched();
}
- register_page_data[user->index] = status;
+ if (status)
+ user_event_register_set(user);
+ else
+ user_event_register_clear(user);
+
+ user->status = status;
}
/*
@@ -835,17 +1071,18 @@ static int user_event_reg(struct trace_event_call *call,
return ret;
inc:
- atomic_inc(&user->refcnt);
+ refcount_inc(&user->refcnt);
update_reg_page_for(user);
return 0;
dec:
update_reg_page_for(user);
- atomic_dec(&user->refcnt);
+ refcount_dec(&user->refcnt);
return 0;
}
static int user_event_create(const char *raw_command)
{
+ struct user_event_group *group;
struct user_event *user;
char *name;
int ret;
@@ -861,14 +1098,21 @@ static int user_event_create(const char *raw_command)
if (!name)
return -ENOMEM;
- mutex_lock(&reg_mutex);
+ group = current_user_event_group();
- ret = user_event_parse_cmd(name, &user);
+ if (!group) {
+ kfree(name);
+ return -ENOENT;
+ }
+
+ mutex_lock(&group->reg_mutex);
+
+ ret = user_event_parse_cmd(group, name, &user);
if (!ret)
- atomic_dec(&user->refcnt);
+ refcount_dec(&user->refcnt);
- mutex_unlock(&reg_mutex);
+ mutex_unlock(&group->reg_mutex);
if (ret)
kfree(name);
@@ -910,14 +1154,14 @@ static bool user_event_is_busy(struct dyn_event *ev)
{
struct user_event *user = container_of(ev, struct user_event, devent);
- return atomic_read(&user->refcnt) != 0;
+ return !user_event_last_ref(user);
}
static int user_event_free(struct dyn_event *ev)
{
struct user_event *user = container_of(ev, struct user_event, devent);
- if (atomic_read(&user->refcnt) != 0)
+ if (!user_event_last_ref(user))
return -EBUSY;
return destroy_user_event(user);
@@ -926,49 +1170,35 @@ static int user_event_free(struct dyn_event *ev)
static bool user_field_match(struct ftrace_event_field *field, int argc,
const char **argv, int *iout)
{
- char *field_name, *arg_name;
- int len, pos, i = *iout;
+ char *field_name = NULL, *dyn_field_name = NULL;
bool colon = false, match = false;
+ int dyn_len, len;
- if (i >= argc)
+ if (*iout >= argc)
return false;
- len = MAX_FIELD_ARG_NAME;
- field_name = kmalloc(len, GFP_KERNEL);
- arg_name = kmalloc(len, GFP_KERNEL);
+ dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
+ 0, &colon);
- if (!arg_name || !field_name)
- goto out;
+ len = user_field_set_string(field, field_name, 0, colon);
- pos = 0;
-
- for (; i < argc; ++i) {
- if (i != *iout)
- pos += snprintf(arg_name + pos, len - pos, " ");
-
- pos += snprintf(arg_name + pos, len - pos, argv[i]);
-
- if (strchr(argv[i], ';')) {
- ++i;
- colon = true;
- break;
- }
- }
+ if (dyn_len != len)
+ return false;
- pos = 0;
+ dyn_field_name = kmalloc(dyn_len, GFP_KERNEL);
+ field_name = kmalloc(len, GFP_KERNEL);
- pos += snprintf(field_name + pos, len - pos, field->type);
- pos += snprintf(field_name + pos, len - pos, " ");
- pos += snprintf(field_name + pos, len - pos, field->name);
+ if (!dyn_field_name || !field_name)
+ goto out;
- if (colon)
- pos += snprintf(field_name + pos, len - pos, ";");
+ user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
+ dyn_len, &colon);
- *iout = i;
+ user_field_set_string(field, field_name, len, colon);
- match = strcmp(arg_name, field_name) == 0;
+ match = strcmp(dyn_field_name, field_name) == 0;
out:
- kfree(arg_name);
+ kfree(dyn_field_name);
kfree(field_name);
return match;
@@ -1036,7 +1266,8 @@ static int user_event_trace_register(struct user_event *user)
* The name buffer lifetime is owned by this method for success cases only.
* Upon success the returned user_event has its ref count increased by 1.
*/
-static int user_event_parse(char *name, char *args, char *flags,
+static int user_event_parse(struct user_event_group *group, char *name,
+ char *args, char *flags,
struct user_event **newuser)
{
int ret;
@@ -1046,7 +1277,7 @@ static int user_event_parse(char *name, char *args, char *flags,
/* Prevent dyn_event from racing */
mutex_lock(&event_mutex);
- user = find_user_event(name, &key);
+ user = find_user_event(group, name, &key);
mutex_unlock(&event_mutex);
if (user) {
@@ -1059,7 +1290,7 @@ static int user_event_parse(char *name, char *args, char *flags,
return 0;
}
- index = find_first_zero_bit(page_bitmap, MAX_EVENTS);
+ index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS);
if (index == MAX_EVENTS)
return -EMFILE;
@@ -1073,6 +1304,7 @@ static int user_event_parse(char *name, char *args, char *flags,
INIT_LIST_HEAD(&user->fields);
INIT_LIST_HEAD(&user->validators);
+ user->group = group;
user->tracepoint.name = name;
ret = user_event_parse_fields(user, args);
@@ -1091,8 +1323,8 @@ static int user_event_parse(char *name, char *args, char *flags,
user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
user->call.tp = &user->tracepoint;
user->call.event.funcs = &user_event_funcs;
+ user->class.system = group->system_name;
- user->class.system = USER_EVENTS_SYSTEM;
user->class.fields_array = user_event_fields_array;
user->class.get_fields = user_event_get_fields;
user->class.reg = user_event_reg;
@@ -1110,13 +1342,13 @@ static int user_event_parse(char *name, char *args, char *flags,
user->index = index;
- /* Ensure we track ref */
- atomic_inc(&user->refcnt);
+ /* Ensure we track self ref and caller ref (2) */
+ refcount_set(&user->refcnt, 2);
dyn_event_init(&user->devent, &user_event_dops);
dyn_event_add(&user->devent, &user->call);
- set_bit(user->index, page_bitmap);
- hash_add(register_table, &user->node, key);
+ set_bit(user->index, group->page_bitmap);
+ hash_add(group->register_table, &user->node, key);
mutex_unlock(&event_mutex);
@@ -1127,6 +1359,7 @@ put_user_lock:
put_user:
user_event_destroy_fields(user);
user_event_destroy_validators(user);
+ kfree(user->call.print_fmt);
kfree(user);
return ret;
}
@@ -1134,32 +1367,20 @@ put_user:
/*
* Deletes a previously created event if it is no longer being used.
*/
-static int delete_user_event(char *name)
+static int delete_user_event(struct user_event_group *group, char *name)
{
u32 key;
- int ret;
- struct user_event *user = find_user_event(name, &key);
+ struct user_event *user = find_user_event(group, name, &key);
if (!user)
return -ENOENT;
- /* Ensure we are the last ref */
- if (atomic_read(&user->refcnt) != 1) {
- ret = -EBUSY;
- goto put_ref;
- }
-
- ret = destroy_user_event(user);
-
- if (ret)
- goto put_ref;
+ refcount_dec(&user->refcnt);
- return ret;
-put_ref:
- /* No longer have this ref */
- atomic_dec(&user->refcnt);
+ if (!user_event_last_ref(user))
+ return -EBUSY;
- return ret;
+ return destroy_user_event(user);
}
/*
@@ -1167,6 +1388,7 @@ put_ref:
*/
static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
{
+ struct user_event_file_info *info = file->private_data;
struct user_event_refs *refs;
struct user_event *user = NULL;
struct tracepoint *tp;
@@ -1178,7 +1400,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
rcu_read_lock_sched();
- refs = rcu_dereference_sched(file->private_data);
+ refs = rcu_dereference_sched(info->refs);
/*
* The refs->events array is protected by RCU, and new items may be
@@ -1236,6 +1458,28 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
return ret;
}
+static int user_events_open(struct inode *node, struct file *file)
+{
+ struct user_event_group *group;
+ struct user_event_file_info *info;
+
+ group = current_user_event_group();
+
+ if (!group)
+ return -ENOENT;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return -ENOMEM;
+
+ info->group = group;
+
+ file->private_data = info;
+
+ return 0;
+}
+
static ssize_t user_events_write(struct file *file, const char __user *ubuf,
size_t count, loff_t *ppos)
{
@@ -1245,7 +1489,8 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf,
if (unlikely(*ppos != 0))
return -EFAULT;
- if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
+ if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf,
+ count, &iov, &i)))
return -EFAULT;
return user_events_write_core(file, &i);
@@ -1256,13 +1501,15 @@ static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
return user_events_write_core(kp->ki_filp, i);
}
-static int user_events_ref_add(struct file *file, struct user_event *user)
+static int user_events_ref_add(struct user_event_file_info *info,
+ struct user_event *user)
{
+ struct user_event_group *group = info->group;
struct user_event_refs *refs, *new_refs;
int i, size, count = 0;
- refs = rcu_dereference_protected(file->private_data,
- lockdep_is_held(&reg_mutex));
+ refs = rcu_dereference_protected(info->refs,
+ lockdep_is_held(&group->reg_mutex));
if (refs) {
count = refs->count;
@@ -1286,9 +1533,9 @@ static int user_events_ref_add(struct file *file, struct user_event *user)
new_refs->events[i] = user;
- atomic_inc(&user->refcnt);
+ refcount_inc(&user->refcnt);
- rcu_assign_pointer(file->private_data, new_refs);
+ rcu_assign_pointer(info->refs, new_refs);
if (refs)
kfree_rcu(refs, rcu);
@@ -1309,13 +1556,24 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
if (size > PAGE_SIZE)
return -E2BIG;
- return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+ if (size < offsetofend(struct user_reg, write_index))
+ return -EINVAL;
+
+ ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+
+ if (ret)
+ return ret;
+
+ kreg->size = size;
+
+ return 0;
}
/*
* Registers a user_event on behalf of a user process.
*/
-static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
+static long user_events_ioctl_reg(struct user_event_file_info *info,
+ unsigned long uarg)
{
struct user_reg __user *ureg = (struct user_reg __user *)uarg;
struct user_reg reg;
@@ -1336,24 +1594,24 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
return ret;
}
- ret = user_event_parse_cmd(name, &user);
+ ret = user_event_parse_cmd(info->group, name, &user);
if (ret) {
kfree(name);
return ret;
}
- ret = user_events_ref_add(file, user);
+ ret = user_events_ref_add(info, user);
/* No longer need parse ref, ref_add either worked or not */
- atomic_dec(&user->refcnt);
+ refcount_dec(&user->refcnt);
/* Positive number is index and valid */
if (ret < 0)
return ret;
put_user((u32)ret, &ureg->write_index);
- put_user(user->index, &ureg->status_index);
+ put_user(user->index, &ureg->status_bit);
return 0;
}
@@ -1361,7 +1619,8 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
/*
* Deletes a user_event on behalf of a user process.
*/
-static long user_events_ioctl_del(struct file *file, unsigned long uarg)
+static long user_events_ioctl_del(struct user_event_file_info *info,
+ unsigned long uarg)
{
void __user *ubuf = (void __user *)uarg;
char *name;
@@ -1374,7 +1633,7 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg)
/* event_mutex prevents dyn_event from racing */
mutex_lock(&event_mutex);
- ret = delete_user_event(name);
+ ret = delete_user_event(info->group, name);
mutex_unlock(&event_mutex);
kfree(name);
@@ -1388,19 +1647,21 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg)
static long user_events_ioctl(struct file *file, unsigned int cmd,
unsigned long uarg)
{
+ struct user_event_file_info *info = file->private_data;
+ struct user_event_group *group = info->group;
long ret = -ENOTTY;
switch (cmd) {
case DIAG_IOCSREG:
- mutex_lock(&reg_mutex);
- ret = user_events_ioctl_reg(file, uarg);
- mutex_unlock(&reg_mutex);
+ mutex_lock(&group->reg_mutex);
+ ret = user_events_ioctl_reg(info, uarg);
+ mutex_unlock(&group->reg_mutex);
break;
case DIAG_IOCSDEL:
- mutex_lock(&reg_mutex);
- ret = user_events_ioctl_del(file, uarg);
- mutex_unlock(&reg_mutex);
+ mutex_lock(&group->reg_mutex);
+ ret = user_events_ioctl_del(info, uarg);
+ mutex_unlock(&group->reg_mutex);
break;
}
@@ -1412,17 +1673,24 @@ static long user_events_ioctl(struct file *file, unsigned int cmd,
*/
static int user_events_release(struct inode *node, struct file *file)
{
+ struct user_event_file_info *info = file->private_data;
+ struct user_event_group *group;
struct user_event_refs *refs;
struct user_event *user;
int i;
+ if (!info)
+ return -EINVAL;
+
+ group = info->group;
+
/*
* Ensure refs cannot change under any situation by taking the
* register mutex during the final freeing of the references.
*/
- mutex_lock(&reg_mutex);
+ mutex_lock(&group->reg_mutex);
- refs = file->private_data;
+ refs = info->refs;
if (!refs)
goto out;
@@ -1436,37 +1704,56 @@ static int user_events_release(struct inode *node, struct file *file)
user = refs->events[i];
if (user)
- atomic_dec(&user->refcnt);
+ refcount_dec(&user->refcnt);
}
out:
file->private_data = NULL;
- mutex_unlock(&reg_mutex);
+ mutex_unlock(&group->reg_mutex);
kfree(refs);
+ kfree(info);
return 0;
}
static const struct file_operations user_data_fops = {
+ .open = user_events_open,
.write = user_events_write,
.write_iter = user_events_write_iter,
.unlocked_ioctl = user_events_ioctl,
.release = user_events_release,
};
+static struct user_event_group *user_status_group(struct file *file)
+{
+ struct seq_file *m = file->private_data;
+
+ if (!m)
+ return NULL;
+
+ return m->private;
+}
+
/*
* Maps the shared page into the user process for checking if event is enabled.
*/
static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
{
+ char *pages;
+ struct user_event_group *group = user_status_group(file);
unsigned long size = vma->vm_end - vma->vm_start;
- if (size != MAX_EVENTS)
+ if (size != MAX_BYTES)
+ return -EINVAL;
+
+ if (!group)
return -EINVAL;
+ pages = group->register_page_data;
+
return remap_pfn_range(vma, vma->vm_start,
- virt_to_phys(register_page_data) >> PAGE_SHIFT,
+ virt_to_phys(pages) >> PAGE_SHIFT,
size, vm_get_page_prot(VM_READ));
}
@@ -1490,14 +1777,18 @@ static void user_seq_stop(struct seq_file *m, void *p)
static int user_seq_show(struct seq_file *m, void *p)
{
+ struct user_event_group *group = m->private;
struct user_event *user;
char status;
int i, active = 0, busy = 0, flags;
- mutex_lock(&reg_mutex);
+ if (!group)
+ return -EINVAL;
+
+ mutex_lock(&group->reg_mutex);
- hash_for_each(register_table, i, user, node) {
- status = register_page_data[user->index];
+ hash_for_each(group->register_table, i, user, node) {
+ status = user->status;
flags = user->flags;
seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
@@ -1520,7 +1811,7 @@ static int user_seq_show(struct seq_file *m, void *p)
active++;
}
- mutex_unlock(&reg_mutex);
+ mutex_unlock(&group->reg_mutex);
seq_puts(m, "\n");
seq_printf(m, "Active: %d\n", active);
@@ -1539,7 +1830,24 @@ static const struct seq_operations user_seq_ops = {
static int user_status_open(struct inode *node, struct file *file)
{
- return seq_open(file, &user_seq_ops);
+ struct user_event_group *group;
+ int ret;
+
+ group = current_user_event_group();
+
+ if (!group)
+ return -ENOENT;
+
+ ret = seq_open(file, &user_seq_ops);
+
+ if (!ret) {
+ /* Chain group to seq_file */
+ struct seq_file *m = file->private_data;
+
+ m->private = group;
+ }
+
+ return ret;
}
static const struct file_operations user_status_fops = {
@@ -1580,42 +1888,21 @@ err:
return -ENODEV;
}
-static void set_page_reservations(bool set)
-{
- int page;
-
- for (page = 0; page < MAX_PAGES; ++page) {
- void *addr = register_page_data + (PAGE_SIZE * page);
-
- if (set)
- SetPageReserved(virt_to_page(addr));
- else
- ClearPageReserved(virt_to_page(addr));
- }
-}
-
static int __init trace_events_user_init(void)
{
- struct page *pages;
int ret;
- /* Zero all bits beside 0 (which is reserved for failures) */
- bitmap_zero(page_bitmap, MAX_EVENTS);
- set_bit(0, page_bitmap);
+ init_group = user_event_group_create(&init_user_ns);
- pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
- if (!pages)
+ if (!init_group)
return -ENOMEM;
- register_page_data = page_address(pages);
-
- set_page_reservations(true);
ret = create_user_tracefs();
if (ret) {
pr_warn("user_events could not register with tracefs\n");
- set_page_reservations(false);
- __free_pages(pages, MAX_PAGE_ORDER);
+ user_event_group_destroy(init_group);
+ init_group = NULL;
return ret;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 23f7f0ec4f4c..ee77c8203bd5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -20,6 +20,7 @@
#include "trace_kprobe_selftest.h"
#include "trace_probe.h"
#include "trace_probe_tmpl.h"
+#include "trace_probe_kernel.h"
#define KPROBE_EVENT_SYSTEM "kprobes"
#define KRETPROBE_MAXACTIVE_MAX 4096
@@ -1223,29 +1224,14 @@ static const struct file_operations kprobe_profile_ops = {
static nokprobe_inline int
fetch_store_strlen_user(unsigned long addr)
{
- const void __user *uaddr = (__force const void __user *)addr;
-
- return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+ return kern_fetch_store_strlen_user(addr);
}
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
{
- int ret, len = 0;
- u8 c;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if (addr < TASK_SIZE)
- return fetch_store_strlen_user(addr);
-#endif
-
- do {
- ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
- len++;
- } while (c && ret == 0 && len < MAX_STRING_SIZE);
-
- return (ret < 0) ? ret : len;
+ return kern_fetch_store_strlen(addr);
}
/*
@@ -1255,21 +1241,7 @@ fetch_store_strlen(unsigned long addr)
static nokprobe_inline int
fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
- const void __user *uaddr = (__force const void __user *)addr;
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
+ return kern_fetch_store_string_user(addr, dest, base);
}
/*
@@ -1279,29 +1251,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
static nokprobe_inline int
fetch_store_string(unsigned long addr, void *dest, void *base)
{
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)addr < TASK_SIZE)
- return fetch_store_string_user(addr, dest, base);
-#endif
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- /*
- * Try to get string again, since the string can be changed while
- * probing.
- */
- ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
+ return kern_fetch_store_string(addr, dest, base);
}
static nokprobe_inline int
@@ -1394,7 +1344,6 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
return;
fbuffer.regs = regs;
- entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->ip = (unsigned long)tk->rp.kp.addr;
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
@@ -1435,7 +1384,6 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
return;
fbuffer.regs = regs;
- entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = get_kretprobe_retaddr(ri);
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 313439920a8c..94c1b5eb1dc0 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -49,6 +49,28 @@
#define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */
/*
+ * osnoise/options entries.
+ */
+enum osnoise_options_index {
+ OSN_DEFAULTS = 0,
+ OSN_WORKLOAD,
+ OSN_PANIC_ON_STOP,
+ OSN_PREEMPT_DISABLE,
+ OSN_IRQ_DISABLE,
+ OSN_MAX
+};
+
+static const char * const osnoise_options_str[OSN_MAX] = {
+ "DEFAULTS",
+ "OSNOISE_WORKLOAD",
+ "PANIC_ON_STOP",
+ "OSNOISE_PREEMPT_DISABLE",
+ "OSNOISE_IRQ_DISABLE" };
+
+#define OSN_DEFAULT_OPTIONS 0x2
+static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS;
+
+/*
* trace_array of the enabled osnoise/timerlat instances.
*/
struct osnoise_instance {
@@ -917,7 +939,7 @@ void osnoise_trace_irq_entry(int id)
void osnoise_trace_irq_exit(int id, const char *desc)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
- int duration;
+ s64 duration;
if (!osn_var->sampling)
return;
@@ -1048,7 +1070,7 @@ static void trace_softirq_entry_callback(void *data, unsigned int vec_nr)
static void trace_softirq_exit_callback(void *data, unsigned int vec_nr)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
- int duration;
+ s64 duration;
if (!osn_var->sampling)
return;
@@ -1144,7 +1166,7 @@ thread_entry(struct osnoise_variables *osn_var, struct task_struct *t)
static void
thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
{
- int duration;
+ s64 duration;
if (!osn_var->sampling)
return;
@@ -1173,11 +1195,12 @@ trace_sched_switch_callback(void *data, bool preempt,
unsigned int prev_state)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
+ int workload = test_bit(OSN_WORKLOAD, &osnoise_options);
- if (p->pid != osn_var->pid)
+ if ((p->pid != osn_var->pid) || !workload)
thread_exit(osn_var, p);
- if (n->pid != osn_var->pid)
+ if ((n->pid != osn_var->pid) || !workload)
thread_entry(osn_var, n);
}
@@ -1255,6 +1278,9 @@ static __always_inline void osnoise_stop_tracing(void)
trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
"stop tracing hit on cpu %d\n", smp_processor_id());
+ if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options))
+ panic("tracer hit stop condition on CPU %d\n", smp_processor_id());
+
tracer_tracing_off(tr);
}
rcu_read_unlock();
@@ -1289,12 +1315,14 @@ static void notify_new_max_latency(u64 latency)
*/
static int run_osnoise(void)
{
+ bool disable_irq = test_bit(OSN_IRQ_DISABLE, &osnoise_options);
struct osnoise_variables *osn_var = this_cpu_osn_var();
u64 start, sample, last_sample;
u64 last_int_count, int_count;
s64 noise = 0, max_noise = 0;
s64 total, last_total = 0;
struct osnoise_sample s;
+ bool disable_preemption;
unsigned int threshold;
u64 runtime, stop_in;
u64 sum_noise = 0;
@@ -1302,6 +1330,12 @@ static int run_osnoise(void)
int ret = -1;
/*
+ * Disabling preemption is only required if IRQs are enabled,
+ * and the options is set on.
+ */
+ disable_preemption = !disable_irq && test_bit(OSN_PREEMPT_DISABLE, &osnoise_options);
+
+ /*
* Considers the current thread as the workload.
*/
osn_var->pid = current->pid;
@@ -1317,6 +1351,15 @@ static int run_osnoise(void)
threshold = tracing_thresh ? : 5000;
/*
+ * Apply PREEMPT and IRQ disabled options.
+ */
+ if (disable_irq)
+ local_irq_disable();
+
+ if (disable_preemption)
+ preempt_disable();
+
+ /*
* Make sure NMIs see sampling first
*/
osn_var->sampling = true;
@@ -1403,16 +1446,21 @@ static int run_osnoise(void)
* cond_resched()
*/
if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
- local_irq_disable();
+ if (!disable_irq)
+ local_irq_disable();
+
rcu_momentary_dyntick_idle();
- local_irq_enable();
+
+ if (!disable_irq)
+ local_irq_enable();
}
/*
* For the non-preemptive kernel config: let threads runs, if
- * they so wish.
+ * they so wish, unless set not do to so.
*/
- cond_resched();
+ if (!disable_irq && !disable_preemption)
+ cond_resched();
last_sample = sample;
last_int_count = int_count;
@@ -1432,6 +1480,15 @@ static int run_osnoise(void)
barrier();
/*
+ * Return to the preemptive state.
+ */
+ if (disable_preemption)
+ preempt_enable();
+
+ if (disable_irq)
+ local_irq_enable();
+
+ /*
* Save noise info.
*/
s.noise = time_to_us(sum_noise);
@@ -1710,9 +1767,16 @@ static void stop_kthread(unsigned int cpu)
struct task_struct *kthread;
kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
- if (kthread)
+ if (kthread) {
kthread_stop(kthread);
- per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+ per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+ } else {
+ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ per_cpu(per_cpu_osnoise_var, cpu).sampling = false;
+ barrier();
+ return;
+ }
+ }
}
/*
@@ -1746,6 +1810,13 @@ static int start_kthread(unsigned int cpu)
snprintf(comm, 24, "timerlat/%d", cpu);
main = timerlat_main;
} else {
+ /* if no workload, just return */
+ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ per_cpu(per_cpu_osnoise_var, cpu).sampling = true;
+ barrier();
+ return 0;
+ }
+
snprintf(comm, 24, "osnoise/%d", cpu);
}
@@ -1786,8 +1857,9 @@ static int start_per_cpu_kthreads(void)
for_each_cpu(cpu, current_mask) {
retval = start_kthread(cpu);
if (retval) {
+ cpus_read_unlock();
stop_per_cpu_kthreads();
- break;
+ return retval;
}
}
@@ -1860,6 +1932,150 @@ static void osnoise_init_hotplug_support(void)
#endif /* CONFIG_HOTPLUG_CPU */
/*
+ * seq file functions for the osnoise/options file.
+ */
+static void *s_options_start(struct seq_file *s, loff_t *pos)
+{
+ int option = *pos;
+
+ mutex_lock(&interface_lock);
+
+ if (option >= OSN_MAX)
+ return NULL;
+
+ return pos;
+}
+
+static void *s_options_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ int option = ++(*pos);
+
+ if (option >= OSN_MAX)
+ return NULL;
+
+ return pos;
+}
+
+static int s_options_show(struct seq_file *s, void *v)
+{
+ loff_t *pos = v;
+ int option = *pos;
+
+ if (option == OSN_DEFAULTS) {
+ if (osnoise_options == OSN_DEFAULT_OPTIONS)
+ seq_printf(s, "%s", osnoise_options_str[option]);
+ else
+ seq_printf(s, "NO_%s", osnoise_options_str[option]);
+ goto out;
+ }
+
+ if (test_bit(option, &osnoise_options))
+ seq_printf(s, "%s", osnoise_options_str[option]);
+ else
+ seq_printf(s, "NO_%s", osnoise_options_str[option]);
+
+out:
+ if (option != OSN_MAX)
+ seq_puts(s, " ");
+
+ return 0;
+}
+
+static void s_options_stop(struct seq_file *s, void *v)
+{
+ seq_puts(s, "\n");
+ mutex_unlock(&interface_lock);
+}
+
+static const struct seq_operations osnoise_options_seq_ops = {
+ .start = s_options_start,
+ .next = s_options_next,
+ .show = s_options_show,
+ .stop = s_options_stop
+};
+
+static int osnoise_options_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &osnoise_options_seq_ops);
+};
+
+/**
+ * osnoise_options_write - Write function for "options" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * Writing the option name sets the option, writing the "NO_"
+ * prefix in front of the option name disables it.
+ *
+ * Writing "DEFAULTS" resets the option values to the default ones.
+ */
+static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int running, option, enable, retval;
+ char buf[256], *option_str;
+
+ if (cnt >= 256)
+ return -EINVAL;
+
+ if (copy_from_user(buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ if (strncmp(buf, "NO_", 3)) {
+ option_str = strstrip(buf);
+ enable = true;
+ } else {
+ option_str = strstrip(&buf[3]);
+ enable = false;
+ }
+
+ option = match_string(osnoise_options_str, OSN_MAX, option_str);
+ if (option < 0)
+ return -EINVAL;
+
+ /*
+ * trace_types_lock is taken to avoid concurrency on start/stop.
+ */
+ mutex_lock(&trace_types_lock);
+ running = osnoise_has_registered_instances();
+ if (running)
+ stop_per_cpu_kthreads();
+
+ mutex_lock(&interface_lock);
+ /*
+ * avoid CPU hotplug operations that might read options.
+ */
+ cpus_read_lock();
+
+ retval = cnt;
+
+ if (enable) {
+ if (option == OSN_DEFAULTS)
+ osnoise_options = OSN_DEFAULT_OPTIONS;
+ else
+ set_bit(option, &osnoise_options);
+ } else {
+ if (option == OSN_DEFAULTS)
+ retval = -EINVAL;
+ else
+ clear_bit(option, &osnoise_options);
+ }
+
+ cpus_read_unlock();
+ mutex_unlock(&interface_lock);
+
+ if (running)
+ start_per_cpu_kthreads();
+ mutex_unlock(&trace_types_lock);
+
+ return retval;
+}
+
+/*
* osnoise_cpus_read - Read function for reading the "cpus" file
* @filp: The active open file structure
* @ubuf: The userspace provided buffer to read value into
@@ -2041,6 +2257,14 @@ static const struct file_operations cpus_fops = {
.llseek = generic_file_llseek,
};
+static const struct file_operations osnoise_options_fops = {
+ .open = osnoise_options_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = osnoise_options_write
+};
+
#ifdef CONFIG_TIMERLAT_TRACER
#ifdef CONFIG_STACKTRACE
static int init_timerlat_stack_tracefs(struct dentry *top_dir)
@@ -2127,6 +2351,11 @@ static int init_tracefs(void)
if (!tmp)
goto err;
+ tmp = trace_create_file("options", TRACE_MODE_WRITE, top_dir, NULL,
+ &osnoise_options_fops);
+ if (!tmp)
+ goto err;
+
ret = init_timerlat_tracefs(top_dir);
if (ret)
goto err;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 67f47ea27921..57a13b61f186 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -11,6 +11,7 @@
#include <linux/kprobes.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
+#include <linux/idr.h>
#include "trace_output.h"
@@ -21,8 +22,6 @@ DECLARE_RWSEM(trace_event_sem);
static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
-static int next_event_type = __TRACE_LAST_TYPE;
-
enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
@@ -323,8 +322,9 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
}
EXPORT_SYMBOL(trace_event_printf);
-static int trace_output_raw(struct trace_iterator *iter, char *name,
- char *fmt, va_list ap)
+static __printf(3, 0)
+int trace_output_raw(struct trace_iterator *iter, char *name,
+ char *fmt, va_list ap)
{
struct trace_seq *s = &iter->seq;
@@ -688,38 +688,23 @@ struct trace_event *ftrace_find_event(int type)
return NULL;
}
-static LIST_HEAD(ftrace_event_list);
+static DEFINE_IDA(trace_event_ida);
-static int trace_search_list(struct list_head **list)
+static void free_trace_event_type(int type)
{
- struct trace_event *e = NULL, *iter;
- int next = __TRACE_LAST_TYPE;
-
- if (list_empty(&ftrace_event_list)) {
- *list = &ftrace_event_list;
- return next;
- }
+ if (type >= __TRACE_LAST_TYPE)
+ ida_free(&trace_event_ida, type);
+}
- /*
- * We used up all possible max events,
- * lets see if somebody freed one.
- */
- list_for_each_entry(iter, &ftrace_event_list, list) {
- if (iter->type != next) {
- e = iter;
- break;
- }
- next++;
- }
+static int alloc_trace_event_type(void)
+{
+ int next;
- /* Did we used up all 65 thousand events??? */
- if (next > TRACE_EVENT_TYPE_MAX)
+ /* Skip static defined type numbers */
+ next = ida_alloc_range(&trace_event_ida, __TRACE_LAST_TYPE,
+ TRACE_EVENT_TYPE_MAX, GFP_KERNEL);
+ if (next < 0)
return 0;
-
- if (e)
- *list = &e->list;
- else
- *list = &ftrace_event_list;
return next;
}
@@ -761,28 +746,10 @@ int register_trace_event(struct trace_event *event)
if (WARN_ON(!event->funcs))
goto out;
- INIT_LIST_HEAD(&event->list);
-
if (!event->type) {
- struct list_head *list = NULL;
-
- if (next_event_type > TRACE_EVENT_TYPE_MAX) {
-
- event->type = trace_search_list(&list);
- if (!event->type)
- goto out;
-
- } else {
-
- event->type = next_event_type++;
- list = &ftrace_event_list;
- }
-
- if (WARN_ON(ftrace_find_event(event->type)))
+ event->type = alloc_trace_event_type();
+ if (!event->type)
goto out;
-
- list_add_tail(&event->list, list);
-
} else if (WARN(event->type > __TRACE_LAST_TYPE,
"Need to add type to trace.h")) {
goto out;
@@ -819,7 +786,7 @@ EXPORT_SYMBOL_GPL(register_trace_event);
int __unregister_trace_event(struct trace_event *event)
{
hlist_del(&event->node);
- list_del(&event->list);
+ free_trace_event_type(event->type);
return 0;
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 36dff277de46..01ebabbbe8c9 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -76,9 +76,11 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
/* Fetch type information table */
static const struct fetch_type probe_fetch_types[] = {
/* Special types */
- __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1,
+ __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, 1,
"__data_loc char[]"),
- __ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1,
+ __ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1, 1,
+ "__data_loc char[]"),
+ __ASSIGN_FETCH_TYPE("symstr", string, string, sizeof(u32), 1, 1,
"__data_loc char[]"),
/* Basic types */
ASSIGN_FETCH_TYPE(u8, u8, 0),
@@ -98,10 +100,15 @@ static const struct fetch_type probe_fetch_types[] = {
ASSIGN_FETCH_TYPE_END
};
-static const struct fetch_type *find_fetch_type(const char *type)
+static const struct fetch_type *find_fetch_type(const char *type, unsigned long flags)
{
int i;
+ /* Reject the symbol/symstr for uprobes */
+ if (type && (flags & TPARG_FL_USER) &&
+ (!strcmp(type, "symbol") || !strcmp(type, "symstr")))
+ return NULL;
+
if (!type)
type = DEFAULT_FETCH_TYPE_STR;
@@ -119,13 +126,13 @@ static const struct fetch_type *find_fetch_type(const char *type)
switch (bs) {
case 8:
- return find_fetch_type("u8");
+ return find_fetch_type("u8", flags);
case 16:
- return find_fetch_type("u16");
+ return find_fetch_type("u16", flags);
case 32:
- return find_fetch_type("u32");
+ return find_fetch_type("u32", flags);
case 64:
- return find_fetch_type("u64");
+ return find_fetch_type("u64", flags);
default:
goto fail;
}
@@ -246,7 +253,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
return -EINVAL;
}
strlcpy(buf, event, slash - event + 1);
- if (!is_good_name(buf)) {
+ if (!is_good_system_name(buf)) {
trace_probe_log_err(offset, BAD_GROUP_NAME);
return -EINVAL;
}
@@ -478,7 +485,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
DEREF_OPEN_BRACE);
return -EINVAL;
} else {
- const struct fetch_type *t2 = find_fetch_type(NULL);
+ const struct fetch_type *t2 = find_fetch_type(NULL, flags);
*tmp = '\0';
ret = parse_probe_arg(arg, t2, &code, end, flags, offs);
@@ -630,9 +637,9 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
/* The type of $comm must be "string", and not an array. */
if (parg->count || (t && strcmp(t, "string")))
goto out;
- parg->type = find_fetch_type("string");
+ parg->type = find_fetch_type("string", flags);
} else
- parg->type = find_fetch_type(t);
+ parg->type = find_fetch_type(t, flags);
if (!parg->type) {
trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE);
goto out;
@@ -662,16 +669,26 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
ret = -EINVAL;
/* Store operation */
- if (!strcmp(parg->type->name, "string") ||
- !strcmp(parg->type->name, "ustring")) {
- if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
- code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
- code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
- trace_probe_log_err(offset + (t ? (t - arg) : 0),
- BAD_STRING);
- goto fail;
+ if (parg->type->is_string) {
+ if (!strcmp(parg->type->name, "symstr")) {
+ if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK &&
+ code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG &&
+ code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) {
+ trace_probe_log_err(offset + (t ? (t - arg) : 0),
+ BAD_SYMSTRING);
+ goto fail;
+ }
+ } else {
+ if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
+ code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
+ code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
+ trace_probe_log_err(offset + (t ? (t - arg) : 0),
+ BAD_STRING);
+ goto fail;
+ }
}
- if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
+ if (!strcmp(parg->type->name, "symstr") ||
+ (code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG ||
parg->count) {
/*
@@ -679,6 +696,8 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
* must be kept, and if parg->count != 0, this is an
* array of string pointers instead of string address
* itself.
+ * For the symstr, it doesn't need to dereference, thus
+ * it just get the value.
*/
code++;
if (code->op != FETCH_OP_NOP) {
@@ -690,6 +709,8 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
if (!strcmp(parg->type->name, "ustring") ||
code->op == FETCH_OP_UDEREF)
code->op = FETCH_OP_ST_USTRING;
+ else if (!strcmp(parg->type->name, "symstr"))
+ code->op = FETCH_OP_ST_SYMSTR;
else
code->op = FETCH_OP_ST_STRING;
code->size = parg->type->size;
@@ -919,8 +940,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
for (i = 0; i < tp->nr_args; i++) {
parg = tp->args + i;
if (parg->count) {
- if ((strcmp(parg->type->name, "string") == 0) ||
- (strcmp(parg->type->name, "ustring") == 0))
+ if (parg->type->is_string)
fmt = ", __get_str(%s[%d])";
else
fmt = ", REC->%s[%d]";
@@ -928,8 +948,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
pos += snprintf(buf + pos, LEN_OR_ZERO,
fmt, parg->name, j);
} else {
- if ((strcmp(parg->type->name, "string") == 0) ||
- (strcmp(parg->type->name, "ustring") == 0))
+ if (parg->type->is_string)
fmt = ", __get_str(%s)";
else
fmt = ", REC->%s";
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 3b3869ae8cfd..23acfd1c3812 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -98,6 +98,7 @@ enum fetch_op {
FETCH_OP_ST_UMEM, /* Mem: .offset, .size */
FETCH_OP_ST_STRING, /* String: .offset, .size */
FETCH_OP_ST_USTRING, /* User String: .offset, .size */
+ FETCH_OP_ST_SYMSTR, /* Kernel Symbol String: .offset, .size */
// Stage 4 (modify) op
FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */
// Stage 5 (loop) op
@@ -133,7 +134,8 @@ struct fetch_insn {
struct fetch_type {
const char *name; /* Name of type */
size_t size; /* Byte size of type */
- int is_signed; /* Signed flag */
+ bool is_signed; /* Signed flag */
+ bool is_string; /* String flag */
print_type_func_t print; /* Print functions */
const char *fmt; /* Format string */
const char *fmttype; /* Name in format file */
@@ -177,16 +179,19 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(symbol);
#define _ADDR_FETCH_TYPE(t) __ADDR_FETCH_TYPE(t)
#define ADDR_FETCH_TYPE _ADDR_FETCH_TYPE(BITS_PER_LONG)
-#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
- {.name = _name, \
+#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, str, _fmttype) \
+ {.name = _name, \
.size = _size, \
- .is_signed = sign, \
+ .is_signed = (bool)sign, \
+ .is_string = (bool)str, \
.print = PRINT_TYPE_FUNC_NAME(ptype), \
.fmt = PRINT_TYPE_FMT_NAME(ptype), \
.fmttype = _fmttype, \
}
+
+/* Non string types can use these macros */
#define _ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
- __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, #_fmttype)
+ __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, 0, #_fmttype)
#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
_ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, ptype)
@@ -353,7 +358,8 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char
#define TPARG_FL_KERNEL BIT(1)
#define TPARG_FL_FENTRY BIT(2)
#define TPARG_FL_TPOINT BIT(3)
-#define TPARG_FL_MASK GENMASK(3, 0)
+#define TPARG_FL_USER BIT(4)
+#define TPARG_FL_MASK GENMASK(4, 0)
extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
const char *argv, unsigned int flags);
@@ -431,6 +437,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(ARRAY_TOO_BIG, "Array number is too big"), \
C(BAD_TYPE, "Unknown type is specified"), \
C(BAD_STRING, "String accepts only memory argument"), \
+ C(BAD_SYMSTRING, "Symbol String doesn't accept data/userdata"), \
C(BAD_BITFIELD, "Invalid bitfield"), \
C(ARG_NAME_TOO_LONG, "Argument name is too long"), \
C(NO_ARG_NAME, "Argument name is not specified"), \
@@ -445,7 +452,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(SAME_PROBE, "There is already the exact same probe event"),\
C(NO_EVENT_INFO, "This requires both group and event name to attach"),\
C(BAD_ATTACH_EVENT, "Attached event does not exist"),\
- C(BAD_ATTACH_ARG, "Attached event does not have this field"),
+ C(BAD_ATTACH_ARG, "Attached event does not have this field"),\
+ C(NO_EP_FILTER, "No filter rule after 'if'"),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h
new file mode 100644
index 000000000000..77dbd9ff9782
--- /dev/null
+++ b/kernel/trace/trace_probe_kernel.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TRACE_PROBE_KERNEL_H_
+#define __TRACE_PROBE_KERNEL_H_
+
+#define FAULT_STRING "(fault)"
+
+/*
+ * This depends on trace_probe.h, but can not include it due to
+ * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c.
+ * Which means that any other user must include trace_probe.h before including
+ * this file.
+ */
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+kern_fetch_store_strlen_user(unsigned long addr)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+ int ret;
+
+ ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+ /*
+ * strnlen_user_nofault returns zero on fault, insert the
+ * FAULT_STRING when that occurs.
+ */
+ if (ret <= 0)
+ return strlen(FAULT_STRING) + 1;
+ return ret;
+}
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+kern_fetch_store_strlen(unsigned long addr)
+{
+ int ret, len = 0;
+ u8 c;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if (addr < TASK_SIZE)
+ return kern_fetch_store_strlen_user(addr);
+#endif
+
+ do {
+ ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
+ len++;
+ } while (c && ret == 0 && len < MAX_STRING_SIZE);
+
+ /* For faults, return enough to hold the FAULT_STRING */
+ return (ret < 0) ? strlen(FAULT_STRING) + 1 : len;
+}
+
+static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len)
+{
+ if (ret >= 0) {
+ *(u32 *)dest = make_data_loc(ret, __dest - base);
+ } else {
+ strscpy(__dest, FAULT_STRING, len);
+ ret = strlen(__dest) + 1;
+ }
+}
+
+/*
+ * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
+ */
+static nokprobe_inline int
+kern_fetch_store_string_user(unsigned long addr, void *dest, void *base)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
+ set_data_loc(ret, dest, __dest, base, maxlen);
+
+ return ret;
+}
+
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
+ * length and relative data location.
+ */
+static nokprobe_inline int
+kern_fetch_store_string(unsigned long addr, void *dest, void *base)
+{
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)addr < TASK_SIZE)
+ return kern_fetch_store_string_user(addr, dest, base);
+#endif
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
+ set_data_loc(ret, dest, __dest, base, maxlen);
+
+ return ret;
+}
+
+#endif /* __TRACE_PROBE_KERNEL_H_ */
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index b3bdb8ddb862..5cea672243f6 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -67,6 +67,37 @@ probe_mem_read(void *dest, void *src, size_t size);
static nokprobe_inline int
probe_mem_read_user(void *dest, void *src, size_t size);
+static nokprobe_inline int
+fetch_store_symstrlen(unsigned long addr)
+{
+ char namebuf[KSYM_SYMBOL_LEN];
+ int ret;
+
+ ret = sprint_symbol(namebuf, addr);
+ if (ret < 0)
+ return 0;
+
+ return ret + 1;
+}
+
+/*
+ * Fetch a null-terminated symbol string + offset. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
+ */
+static nokprobe_inline int
+fetch_store_symstring(unsigned long addr, void *dest, void *base)
+{
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ return sprint_symbol(__dest, addr);
+}
+
/* From the 2nd stage, routine is same */
static nokprobe_inline int
process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val,
@@ -99,16 +130,22 @@ stage2:
stage3:
/* 3rd stage: store value to buffer */
if (unlikely(!dest)) {
- if (code->op == FETCH_OP_ST_STRING) {
+ switch (code->op) {
+ case FETCH_OP_ST_STRING:
ret = fetch_store_strlen(val + code->offset);
code++;
goto array;
- } else if (code->op == FETCH_OP_ST_USTRING) {
+ case FETCH_OP_ST_USTRING:
ret += fetch_store_strlen_user(val + code->offset);
code++;
goto array;
- } else
+ case FETCH_OP_ST_SYMSTR:
+ ret += fetch_store_symstrlen(val + code->offset);
+ code++;
+ goto array;
+ default:
return -EILSEQ;
+ }
}
switch (code->op) {
@@ -129,6 +166,10 @@ stage3:
loc = *(u32 *)dest;
ret = fetch_store_string_user(val + code->offset, dest, base);
break;
+ case FETCH_OP_ST_SYMSTR:
+ loc = *(u32 *)dest;
+ ret = fetch_store_symstring(val + code->offset, dest, base);
+ break;
default:
return -EILSEQ;
}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index a2d301f58ced..ff0536cea968 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -785,7 +785,14 @@ static struct fgraph_ops fgraph_ops __initdata = {
};
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-noinline __noclone static void trace_direct_tramp(void) { }
+#ifndef CALL_DEPTH_ACCOUNT
+#define CALL_DEPTH_ACCOUNT ""
+#endif
+
+noinline __noclone static void trace_direct_tramp(void)
+{
+ asm(CALL_DEPTH_ACCOUNT);
+}
#endif
/*
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index b69e207012c9..942ddbdace4a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -201,8 +201,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
return trace_handle_return(s);
}
-extern char *__bad_type_size(void);
-
#define SYSCALL_FIELD(_type, _name) { \
.type = #_type, .name = #_name, \
.size = sizeof(_type), .align = __alignof__(_type), \
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index fb58e86dd117..8d64b6553aed 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -691,7 +691,8 @@ static int __trace_uprobe_create(int argc, const char **argv)
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
trace_probe_log_set_index(i + 2);
ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i],
- is_return ? TPARG_FL_RETURN : 0);
+ (is_return ? TPARG_FL_RETURN : 0) |
+ TPARG_FL_USER);
if (ret)
goto error;
}
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 9901708ce6b8..c774e560f2f9 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -961,7 +961,7 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
static void detect_dups(struct tracing_map_sort_entry **sort_entries,
int n_entries, unsigned int key_size)
{
- unsigned int dups = 0, total_dups = 0;
+ unsigned int total_dups = 0;
int i;
void *key;
@@ -974,11 +974,10 @@ static void detect_dups(struct tracing_map_sort_entry **sort_entries,
key = sort_entries[0]->key;
for (i = 1; i < n_entries; i++) {
if (!memcmp(sort_entries[i]->key, key, key_size)) {
- dups++; total_dups++;
+ total_dups++;
continue;
}
key = sort_entries[i]->key;
- dups = 0;
}
WARN_ONCE(total_dups > 0,
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index ef42c1a11920..f23144af5743 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -640,7 +640,6 @@ static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv)
static int tracepoint_module_coming(struct module *mod)
{
struct tp_module *tp_mod;
- int ret = 0;
if (!mod->num_tracepoints)
return 0;
@@ -652,19 +651,18 @@ static int tracepoint_module_coming(struct module *mod)
*/
if (trace_module_has_bad_taint(mod))
return 0;
- mutex_lock(&tracepoint_module_list_mutex);
+
tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
- if (!tp_mod) {
- ret = -ENOMEM;
- goto end;
- }
+ if (!tp_mod)
+ return -ENOMEM;
tp_mod->mod = mod;
+
+ mutex_lock(&tracepoint_module_list_mutex);
list_add_tail(&tp_mod->list, &tracepoint_module_list);
blocking_notifier_call_chain(&tracepoint_notify_list,
MODULE_STATE_COMING, tp_mod);
-end:
mutex_unlock(&tracepoint_module_list_mutex);
- return ret;
+ return 0;
}
static void tracepoint_module_going(struct module *mod)
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 06ea04d44685..ee8e57fd6f90 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -87,10 +87,6 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_fanotify_groups"),
UCOUNT_ENTRY("max_fanotify_marks"),
#endif
- { },
- { },
- { },
- { },
{ }
};
#endif /* CONFIG_SYSCTL */
@@ -263,29 +259,29 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
put_ucounts(ucounts);
}
-long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
{
struct ucounts *iter;
long max = LONG_MAX;
long ret = 0;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- long new = atomic_long_add_return(v, &iter->ucount[type]);
+ long new = atomic_long_add_return(v, &iter->rlimit[type]);
if (new < 0 || new > max)
ret = LONG_MAX;
else if (iter == ucounts)
ret = new;
- max = READ_ONCE(iter->ns->ucount_max[type]);
+ max = get_userns_rlimit_max(iter->ns, type);
}
return ret;
}
-bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
{
struct ucounts *iter;
long new = -1; /* Silence compiler warning */
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- long dec = atomic_long_sub_return(v, &iter->ucount[type]);
+ long dec = atomic_long_sub_return(v, &iter->rlimit[type]);
WARN_ON_ONCE(dec < 0);
if (iter == ucounts)
new = dec;
@@ -294,11 +290,11 @@ bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
}
static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
- struct ucounts *last, enum ucount_type type)
+ struct ucounts *last, enum rlimit_type type)
{
struct ucounts *iter, *next;
for (iter = ucounts; iter != last; iter = next) {
- long dec = atomic_long_sub_return(1, &iter->ucount[type]);
+ long dec = atomic_long_sub_return(1, &iter->rlimit[type]);
WARN_ON_ONCE(dec < 0);
next = iter->ns->ucounts;
if (dec == 0)
@@ -306,12 +302,12 @@ static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
}
}
-void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type)
+void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
{
do_dec_rlimit_put_ucounts(ucounts, NULL, type);
}
-long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type)
+long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
{
/* Caller must hold a reference to ucounts */
struct ucounts *iter;
@@ -319,12 +315,12 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type)
long dec, ret = 0;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- long new = atomic_long_add_return(1, &iter->ucount[type]);
+ long new = atomic_long_add_return(1, &iter->rlimit[type]);
if (new < 0 || new > max)
goto unwind;
if (iter == ucounts)
ret = new;
- max = READ_ONCE(iter->ns->ucount_max[type]);
+ max = get_userns_rlimit_max(iter->ns, type);
/*
* Grab an extra ucount reference for the caller when
* the rlimit count was previously 0.
@@ -336,24 +332,24 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type)
}
return ret;
dec_unwind:
- dec = atomic_long_sub_return(1, &iter->ucount[type]);
+ dec = atomic_long_sub_return(1, &iter->rlimit[type]);
WARN_ON_ONCE(dec < 0);
unwind:
do_dec_rlimit_put_ucounts(ucounts, iter, type);
return 0;
}
-bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long rlimit)
+bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit)
{
struct ucounts *iter;
long max = rlimit;
if (rlimit > LONG_MAX)
max = LONG_MAX;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- long val = get_ucounts_value(iter, type);
+ long val = get_rlimit_value(iter, type);
if (val < 0 || val > max)
return true;
- max = READ_ONCE(iter->ns->ucount_max[type]);
+ max = get_userns_rlimit_max(iter->ns, type);
}
return false;
}
diff --git a/kernel/umh.c b/kernel/umh.c
index b989736e8707..850631518665 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -28,6 +28,7 @@
#include <linux/async.h>
#include <linux/uaccess.h>
#include <linux/initrd.h>
+#include <linux/freezer.h>
#include <trace/events/module.h>
@@ -403,6 +404,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
*/
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
+ unsigned int state = TASK_UNINTERRUPTIBLE;
DECLARE_COMPLETION_ONSTACK(done);
int retval = 0;
@@ -436,18 +438,22 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
- if (wait & UMH_KILLABLE) {
- retval = wait_for_completion_killable(&done);
- if (!retval)
- goto wait_done;
+ if (wait & UMH_KILLABLE)
+ state |= TASK_KILLABLE;
+
+ if (wait & UMH_FREEZABLE)
+ state |= TASK_FREEZABLE;
+ retval = wait_for_completion_state(&done, state);
+ if (!retval)
+ goto wait_done;
+
+ if (wait & UMH_KILLABLE) {
/* umh_complete() will see NULL and free sub_info */
if (xchg(&sub_info->complete, NULL))
goto unlock;
- /* fallthrough, umh_complete() was already called */
}
- wait_for_completion(&done);
wait_done:
retval = sub_info->retval;
out:
diff --git a/kernel/user.c b/kernel/user.c
index e2cf8c22b539..d667debeafd6 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -185,6 +185,7 @@ void free_uid(struct user_struct *up)
if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags))
free_user(up, flags);
}
+EXPORT_SYMBOL_GPL(free_uid);
struct user_struct *alloc_uid(kuid_t uid)
{
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 5481ba44a8d6..54211dbd516c 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
+#include <linux/security.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
@@ -113,6 +114,10 @@ int create_user_ns(struct cred *new)
!kgid_has_mapping(parent_ns, group))
goto fail_dec;
+ ret = security_create_user_ns(new);
+ if (ret < 0)
+ goto fail_dec;
+
ret = -ENOMEM;
ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
if (!ns)
@@ -131,13 +136,13 @@ int create_user_ns(struct cred *new)
ns->owner = owner;
ns->group = group;
INIT_WORK(&ns->work, free_user_ns);
- for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
+ for (i = 0; i < UCOUNT_COUNTS; i++) {
ns->ucount_max[i] = INT_MAX;
}
- set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
- set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
- set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
- set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
+ set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
+ set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
+ set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
+ set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
ns->ucounts = ucounts;
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 4ca61d49885b..f50398cb790d 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -8,6 +8,7 @@
#include <linux/export.h>
#include <linux/uts.h>
#include <linux/utsname.h>
+#include <linux/random.h>
#include <linux/sysctl.h>
#include <linux/wait.h>
#include <linux/rwsem.h>
@@ -57,6 +58,7 @@ static int proc_do_uts_string(struct ctl_table *table, int write,
* theoretically be incorrect if there are two parallel writes
* at non-zero offsets to the same sysctl.
*/
+ add_device_randomness(tmp_data, sizeof(tmp_data));
down_write(&uts_sem);
memcpy(get_uts(table), tmp_data, sizeof(tmp_data));
up_write(&uts_sem);
@@ -72,8 +74,16 @@ static int proc_do_uts_string(struct ctl_table *table, int write,
static DEFINE_CTL_TABLE_POLL(hostname_poll);
static DEFINE_CTL_TABLE_POLL(domainname_poll);
+// Note: update 'enum uts_proc' to match any changes to this table
static struct ctl_table uts_kern_table[] = {
{
+ .procname = "arch",
+ .data = init_uts_ns.name.machine,
+ .maxlen = sizeof(init_uts_ns.name.machine),
+ .mode = 0444,
+ .proc_handler = proc_do_uts_string,
+ },
+ {
.procname = "ostype",
.data = init_uts_ns.name.sysname,
.maxlen = sizeof(init_uts_ns.name.sysname),
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index aeea9731ef80..07895deca271 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1651,7 +1651,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work = &dwork->work;
WARN_ON_ONCE(!wq);
- WARN_ON_FUNCTION_MISMATCH(timer->function, delayed_work_timer_fn);
+ WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
WARN_ON_ONCE(timer_pending(timer));
WARN_ON_ONCE(!list_empty(&work->entry));
@@ -1771,7 +1771,7 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
rwork->wq = wq;
- call_rcu(&rwork->rcu, rcu_work_rcufn);
+ call_rcu_hurry(&rwork->rcu, rcu_work_rcufn);
return true;
}
@@ -3066,10 +3066,8 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
if (WARN_ON(!work->func))
return false;
- if (!from_cancel) {
- lock_map_acquire(&work->lockdep_map);
- lock_map_release(&work->lockdep_map);
- }
+ lock_map_acquire(&work->lockdep_map);
+ lock_map_release(&work->lockdep_map);
if (start_flush_work(work, &barr, from_cancel)) {
wait_for_completion(&barr.done);