aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.kexec150
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/audit.c5
-rw-r--r--kernel/audit.h4
-rw-r--r--kernel/auditfilter.c19
-rw-r--r--kernel/auditsc.c10
-rw-r--r--kernel/bpf/Kconfig1
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/bloom_filter.c3
-rw-r--r--kernel/bpf/bpf_local_storage.c3
-rw-r--r--kernel/bpf/bpf_lru_list.c21
-rw-r--r--kernel/bpf/bpf_lru_list.h8
-rw-r--r--kernel/bpf/bpf_struct_ops.c24
-rw-r--r--kernel/bpf/btf.c117
-rw-r--r--kernel/bpf/cgroup.c15
-rw-r--r--kernel/bpf/core.c214
-rw-r--r--kernel/bpf/cpumap.c143
-rw-r--r--kernel/bpf/cpumask.c58
-rw-r--r--kernel/bpf/devmap.c5
-rw-r--r--kernel/bpf/disasm.c58
-rw-r--r--kernel/bpf/hashtab.c28
-rw-r--r--kernel/bpf/helpers.c198
-rw-r--r--kernel/bpf/inode.c33
-rw-r--r--kernel/bpf/log.c3
-rw-r--r--kernel/bpf/lpm_trie.c3
-rw-r--r--kernel/bpf/map_iter.c42
-rw-r--r--kernel/bpf/memalloc.c409
-rw-r--r--kernel/bpf/mprog.c447
-rw-r--r--kernel/bpf/offload.c1
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c4
-rw-r--r--kernel/bpf/preload/iterators/Makefile2
-rw-r--r--kernel/bpf/preload/iterators/iterators.bpf.c9
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-little-endian.h526
-rw-r--r--kernel/bpf/queue_stack_maps.c4
-rw-r--r--kernel/bpf/reuseport_array.c3
-rw-r--r--kernel/bpf/ringbuf.c26
-rw-r--r--kernel/bpf/stackmap.c3
-rw-r--r--kernel/bpf/syscall.c614
-rw-r--r--kernel/bpf/tcx.c352
-rw-r--r--kernel/bpf/trampoline.c32
-rw-r--r--kernel/bpf/verifier.c1585
-rw-r--r--kernel/capability.c4
-rw-r--r--kernel/cgroup/cgroup-internal.h2
-rw-r--r--kernel/cgroup/cgroup-v1.c6
-rw-r--r--kernel/cgroup/cgroup.c205
-rw-r--r--kernel/cgroup/cpuset.c525
-rw-r--r--kernel/cgroup/misc.c56
-rw-r--r--kernel/cgroup/namespace.c6
-rw-r--r--kernel/cgroup/rdma.c2
-rw-r--r--kernel/cgroup/rstat.c38
-rw-r--r--kernel/configs/debug.config2
-rw-r--r--kernel/configs/kvm_guest.config1
-rw-r--r--kernel/configs/nopm.config2
-rw-r--r--kernel/configs/rust.config1
-rw-r--r--kernel/configs/tiny-base.config2
-rw-r--r--kernel/configs/tiny.config1
-rw-r--r--kernel/configs/x86_debug.config1
-rw-r--r--kernel/configs/xen.config2
-rw-r--r--kernel/context_tracking.c12
-rw-r--r--kernel/cpu.c542
-rw-r--r--kernel/crash_core.c395
-rw-r--r--kernel/cred.c27
-rw-r--r--kernel/debug/debug_core.c2
-rw-r--r--kernel/debug/kdb/kdb_io.c18
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c2
-rw-r--r--kernel/debug/kdb/kdb_private.h1
-rw-r--r--kernel/dma/Kconfig35
-rw-r--r--kernel/dma/contiguous.c108
-rw-r--r--kernel/dma/direct.c4
-rw-r--r--kernel/dma/direct.h3
-rw-r--r--kernel/dma/mapping.c6
-rw-r--r--kernel/dma/remap.c4
-rw-r--r--kernel/dma/swiotlb.c751
-rw-r--r--kernel/entry/common.c3
-rw-r--r--kernel/events/core.c139
-rw-r--r--kernel/events/hw_breakpoint.c28
-rw-r--r--kernel/events/ring_buffer.c5
-rw-r--r--kernel/events/uprobes.c17
-rw-r--r--kernel/fork.c56
-rw-r--r--kernel/futex/core.c3
-rw-r--r--kernel/gcov/Makefile2
-rwxr-xr-xkernel/gen_kheaders.sh9
-rw-r--r--kernel/iomem.c13
-rw-r--r--kernel/irq/chip.c28
-rw-r--r--kernel/irq/debugfs.c2
-rw-r--r--kernel/irq/internals.h17
-rw-r--r--kernel/irq/irqdesc.c77
-rw-r--r--kernel/irq/irqdomain.c6
-rw-r--r--kernel/irq/manage.c26
-rw-r--r--kernel/irq/resend.c50
-rw-r--r--kernel/kallsyms.c127
-rw-r--r--kernel/kallsyms_selftest.c45
-rw-r--r--kernel/kcov.c7
-rw-r--r--kernel/kcsan/core.c2
-rw-r--r--kernel/kexec.c5
-rw-r--r--kernel/kexec_core.c136
-rw-r--r--kernel/kexec_file.c202
-rw-r--r--kernel/kprobes.c29
-rw-r--r--kernel/ksyms_common.c43
-rw-r--r--kernel/ksysfs.c15
-rw-r--r--kernel/kthread.c17
-rw-r--r--kernel/livepatch/transition.c2
-rw-r--r--kernel/locking/lock_events.h4
-rw-r--r--kernel/locking/lockdep.c154
-rw-r--r--kernel/locking/locktorture.c63
-rw-r--r--kernel/locking/qspinlock_paravirt.h20
-rw-r--r--kernel/locking/rtmutex.c170
-rw-r--r--kernel/locking/rtmutex_api.c2
-rw-r--r--kernel/locking/rtmutex_common.h47
-rw-r--r--kernel/locking/ww_mutex.h12
-rw-r--r--kernel/module/decompress.c4
-rw-r--r--kernel/module/internal.h12
-rw-r--r--kernel/module/kallsyms.c28
-rw-r--r--kernel/module/main.c145
-rw-r--r--kernel/nsproxy.c4
-rw-r--r--kernel/panic.c5
-rw-r--r--kernel/params.c4
-rw-r--r--kernel/pid.c10
-rw-r--r--kernel/pid_namespace.c8
-rw-r--r--kernel/pid_sysctl.h29
-rw-r--r--kernel/power/hibernate.c180
-rw-r--r--kernel/power/main.c33
-rw-r--r--kernel/power/power.h15
-rw-r--r--kernel/power/poweroff.c2
-rw-r--r--kernel/power/qos.c18
-rw-r--r--kernel/power/snapshot.c241
-rw-r--r--kernel/power/swap.c30
-rw-r--r--kernel/printk/internal.h2
-rw-r--r--kernel/printk/printk.c217
-rw-r--r--kernel/printk/printk_ringbuffer.c2
-rw-r--r--kernel/printk/printk_safe.c9
-rw-r--r--kernel/rcu/Kconfig18
-rw-r--r--kernel/rcu/rcu.h16
-rw-r--r--kernel/rcu/rcuscale.c282
-rw-r--r--kernel/rcu/rcutorture.c7
-rw-r--r--kernel/rcu/refscale.c37
-rw-r--r--kernel/rcu/tasks.h148
-rw-r--r--kernel/rcu/tree.c147
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_nocb.h56
-rw-r--r--kernel/rcu/tree_plugin.h4
-rw-r--r--kernel/rcu/tree_stall.h2
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/scftorture.c12
-rw-r--r--kernel/sched/clock.c21
-rw-r--r--kernel/sched/completion.c26
-rw-r--r--kernel/sched/core.c823
-rw-r--r--kernel/sched/cpufreq_schedutil.c3
-rw-r--r--kernel/sched/deadline.c124
-rw-r--r--kernel/sched/debug.c51
-rw-r--r--kernel/sched/fair.c1652
-rw-r--r--kernel/sched/features.h24
-rw-r--r--kernel/sched/psi.c50
-rw-r--r--kernel/sched/rt.c5
-rw-r--r--kernel/sched/sched.h179
-rw-r--r--kernel/sched/swait.c8
-rw-r--r--kernel/sched/topology.c26
-rw-r--r--kernel/sched/wait.c12
-rw-r--r--kernel/seccomp.c84
-rw-r--r--kernel/signal.c40
-rw-r--r--kernel/smp.c56
-rw-r--r--kernel/smpboot.c163
-rw-r--r--kernel/softirq.c24
-rw-r--r--kernel/sys.c22
-rw-r--r--kernel/sys_ni.c112
-rw-r--r--kernel/sysctl.c107
-rw-r--r--kernel/time/alarmtimer.c4
-rw-r--r--kernel/time/clocksource.c10
-rw-r--r--kernel/time/hrtimer.c3
-rw-r--r--kernel/time/posix-timers.c525
-rw-r--r--kernel/time/sched_clock.c24
-rw-r--r--kernel/time/tick-sched.c4
-rw-r--r--kernel/time/time.c169
-rw-r--r--kernel/time/time_test.c2
-rw-r--r--kernel/time/timekeeping.c4
-rw-r--r--kernel/torture.c39
-rw-r--r--kernel/trace/Kconfig41
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/bpf_trace.c440
-rw-r--r--kernel/trace/fgraph.c27
-rw-r--r--kernel/trace/fprobe.c38
-rw-r--r--kernel/trace/ftrace.c110
-rw-r--r--kernel/trace/ftrace_internal.h5
-rw-r--r--kernel/trace/rethook.c16
-rw-r--r--kernel/trace/ring_buffer.c83
-rw-r--r--kernel/trace/trace.c193
-rw-r--r--kernel/trace/trace.h39
-rw-r--r--kernel/trace/trace_boot.c8
-rw-r--r--kernel/trace/trace_btf.c122
-rw-r--r--kernel/trace/trace_btf.h11
-rw-r--r--kernel/trace/trace_entries.h28
-rw-r--r--kernel/trace/trace_eprobe.c80
-rw-r--r--kernel/trace/trace_events.c94
-rw-r--r--kernel/trace/trace_events_filter.c315
-rw-r--r--kernel/trace/trace_events_hist.c9
-rw-r--r--kernel/trace/trace_events_inject.c4
-rw-r--r--kernel/trace/trace_events_synth.c104
-rw-r--r--kernel/trace/trace_events_trigger.c2
-rw-r--r--kernel/trace/trace_events_user.c23
-rw-r--r--kernel/trace/trace_export.c9
-rw-r--r--kernel/trace/trace_fprobe.c1230
-rw-r--r--kernel/trace/trace_functions_graph.c93
-rw-r--r--kernel/trace/trace_hwlat.c2
-rw-r--r--kernel/trace/trace_irqsoff.c3
-rw-r--r--kernel/trace/trace_kprobe.c51
-rw-r--r--kernel/trace/trace_kprobe_selftest.c3
-rw-r--r--kernel/trace/trace_osnoise.c477
-rw-r--r--kernel/trace/trace_output.c4
-rw-r--r--kernel/trace/trace_probe.c898
-rw-r--r--kernel/trace/trace_probe.h73
-rw-r--r--kernel/trace/trace_probe_kernel.h30
-rw-r--r--kernel/trace/trace_probe_tmpl.h10
-rw-r--r--kernel/trace/trace_sched_wakeup.c2
-rw-r--r--kernel/trace/trace_seq.c1
-rw-r--r--kernel/trace/trace_syscalls.c12
-rw-r--r--kernel/trace/trace_uprobe.c22
-rw-r--r--kernel/trace/tracing_map.h4
-rw-r--r--kernel/ucount.c5
-rw-r--r--kernel/umh.c11
-rw-r--r--kernel/watch_queue.c12
-rw-r--r--kernel/watchdog.c353
-rw-r--r--kernel/watchdog_buddy.c113
-rw-r--r--kernel/watchdog_perf.c (renamed from kernel/watchdog_hld.c)105
-rw-r--r--kernel/workqueue.c1883
-rw-r--r--kernel/workqueue_internal.h26
226 files changed, 16256 insertions, 6640 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
new file mode 100644
index 000000000000..9bfe68fe9676
--- /dev/null
+++ b/kernel/Kconfig.kexec
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Kexec and crash features"
+
+config CRASH_CORE
+ bool
+
+config KEXEC_CORE
+ select CRASH_CORE
+ bool
+
+config KEXEC_ELF
+ bool
+
+config HAVE_IMA_KEXEC
+ bool
+
+config KEXEC
+ bool "Enable kexec system call"
+ depends on ARCH_SUPPORTS_KEXEC
+ select KEXEC_CORE
+ help
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel. It is like a reboot
+ but it is independent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similarity to the exec system call.
+
+ It is an ongoing process to be certain the hardware in a machine
+ is properly shutdown, so do not be surprised if this code does not
+ initially work for you. As of this writing the exact hardware
+ interface is strongly in flux, so no good recommendation can be
+ made.
+
+config KEXEC_FILE
+ bool "Enable kexec file based system call"
+ depends on ARCH_SUPPORTS_KEXEC_FILE
+ select KEXEC_CORE
+ help
+ This is new version of kexec system call. This system call is
+ file based and takes file descriptors as system call argument
+ for kernel and initramfs as opposed to list of segments as
+ accepted by kexec system call.
+
+config KEXEC_SIG
+ bool "Verify kernel signature during kexec_file_load() syscall"
+ depends on ARCH_SUPPORTS_KEXEC_SIG
+ depends on KEXEC_FILE
+ help
+ This option makes the kexec_file_load() syscall check for a valid
+ signature of the kernel image. The image can still be loaded without
+ a valid signature unless you also enable KEXEC_SIG_FORCE, though if
+ there's a signature that we can check, then it must be valid.
+
+ In addition to this option, you need to enable signature
+ verification for the corresponding kernel image type being
+ loaded in order for this to work.
+
+config KEXEC_SIG_FORCE
+ bool "Require a valid signature in kexec_file_load() syscall"
+ depends on ARCH_SUPPORTS_KEXEC_SIG_FORCE
+ depends on KEXEC_SIG
+ help
+ This option makes kernel signature verification mandatory for
+ the kexec_file_load() syscall.
+
+config KEXEC_IMAGE_VERIFY_SIG
+ bool "Enable Image signature verification support (ARM)"
+ default ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_SIG
+ depends on ARCH_SUPPORTS_KEXEC_IMAGE_VERIFY_SIG
+ depends on KEXEC_SIG
+ depends on EFI && SIGNED_PE_FILE_VERIFICATION
+ help
+ Enable Image signature verification support.
+
+config KEXEC_BZIMAGE_VERIFY_SIG
+ bool "Enable bzImage signature verification support"
+ depends on ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG
+ depends on KEXEC_SIG
+ depends on SIGNED_PE_FILE_VERIFICATION
+ select SYSTEM_TRUSTED_KEYRING
+ help
+ Enable bzImage signature verification support.
+
+config KEXEC_JUMP
+ bool "kexec jump"
+ depends on ARCH_SUPPORTS_KEXEC_JUMP
+ depends on KEXEC && HIBERNATION
+ help
+ Jump between original kernel and kexeced kernel and invoke
+ code in physical address mode via KEXEC
+
+config CRASH_DUMP
+ bool "kernel crash dumps"
+ depends on ARCH_SUPPORTS_CRASH_DUMP
+ depends on ARCH_SUPPORTS_KEXEC
+ select CRASH_CORE
+ select KEXEC_CORE
+ select KEXEC
+ help
+ Generate crash dump after being started by kexec.
+ This should be normally only set in special crash dump kernels
+ which are loaded in the main kernel with kexec-tools into
+ a specially reserved region and then later executed after
+ a crash by kdump/kexec. The crash dump kernel must be compiled
+ to a memory address not used by the main kernel or BIOS using
+ PHYSICAL_START, or it must be built as a relocatable image
+ (CONFIG_RELOCATABLE=y).
+ For more details see Documentation/admin-guide/kdump/kdump.rst
+
+ For s390, this option also enables zfcpdump.
+ See also <file:Documentation/s390/zfcpdump.rst>
+
+config CRASH_HOTPLUG
+ bool "Update the crash elfcorehdr on system configuration changes"
+ default y
+ depends on CRASH_DUMP && (HOTPLUG_CPU || MEMORY_HOTPLUG)
+ depends on ARCH_SUPPORTS_CRASH_HOTPLUG
+ help
+ Enable direct update to the crash elfcorehdr (which contains
+ the list of CPUs and memory regions to be dumped upon a crash)
+ in response to hot plug/unplug or online/offline of CPUs or
+ memory. This is a much more advanced approach than userspace
+ attempting that.
+
+ If unsure, say Y.
+
+config CRASH_MAX_MEMORY_RANGES
+ int "Specify the maximum number of memory regions for the elfcorehdr"
+ default 8192
+ depends on CRASH_HOTPLUG
+ help
+ For the kexec_file_load() syscall path, specify the maximum number of
+ memory regions that the elfcorehdr buffer/segment can accommodate.
+ These regions are obtained via walk_system_ram_res(); eg. the
+ 'System RAM' entries in /proc/iomem.
+ This value is combined with NR_CPUS_DEFAULT and multiplied by
+ sizeof(Elf64_Phdr) to determine the final elfcorehdr memory buffer/
+ segment size.
+ The value 8192, for example, covers a (sparsely populated) 1TiB system
+ consisting of 128MiB memblocks, while resulting in an elfcorehdr
+ memory buffer/segment size under 1MiB. This represents a sane choice
+ to accommodate both baremetal and virtual machine configurations.
+
+ For the kexec_load() syscall path, CRASH_MAX_MEMORY_RANGES is part of
+ the computation behind the value provided through the
+ /sys/kernel/crash_elfcorehdr_size attribute.
+
+endmenu
diff --git a/kernel/Makefile b/kernel/Makefile
index b69c95315480..3947122d618b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o ucount.o regset.o
+ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
obj-$(CONFIG_MULTIUSER) += groups.o
@@ -91,7 +91,8 @@ obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
-obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_BUDDY) += watchdog_buddy.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_perf.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 010667ce6080..1a9f929fe629 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -445,7 +445,7 @@ static void fill_ac(acct_t *ac)
memset(ac, 0, sizeof(acct_t));
ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
- strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
+ strscpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
/* calculate run_time in nsec*/
run_time = ktime_get_ns();
@@ -470,7 +470,7 @@ static void fill_ac(acct_t *ac)
do_div(elapsed, AHZ);
btime = ktime_get_real_seconds() - elapsed;
ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
-#if ACCT_VERSION==2
+#if ACCT_VERSION == 2
ac->ac_ahz = AHZ;
#endif
diff --git a/kernel/audit.c b/kernel/audit.c
index 9bc0b0301198..16205dd29843 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -53,9 +53,7 @@
#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
-#ifdef CONFIG_SECURITY
#include <linux/security.h>
-#endif
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>
@@ -323,7 +321,8 @@ static inline int audit_rate_check(void)
unsigned long now;
int retval = 0;
- if (!audit_rate_limit) return 1;
+ if (!audit_rate_limit)
+ return 1;
spin_lock_irqsave(&lock, flags);
if (++messages < audit_rate_limit) {
diff --git a/kernel/audit.h b/kernel/audit.h
index c57b008b9914..a60d2840559e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -259,8 +259,8 @@ extern struct tty_struct *audit_get_tty(void);
extern void audit_put_tty(struct tty_struct *tty);
/* audit watch/mark/tree functions */
-#ifdef CONFIG_AUDITSYSCALL
extern unsigned int audit_serial(void);
+#ifdef CONFIG_AUDITSYSCALL
extern int auditsc_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial);
@@ -334,7 +334,7 @@ static inline int audit_signal_info_syscall(struct task_struct *t)
return 0;
}
-#define audit_filter_inodes(t, c) AUDIT_STATE_DISABLED
+#define audit_filter_inodes(t, c) do { } while (0)
#endif /* CONFIG_AUDITSYSCALL */
extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 42d99896e7a6..8317a37dea0b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -221,7 +221,7 @@ static int audit_match_signal(struct audit_entry *entry)
entry->rule.mask));
}
- switch(audit_classify_arch(arch->val)) {
+ switch (audit_classify_arch(arch->val)) {
case 0: /* native */
return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
entry->rule.mask));
@@ -243,7 +243,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
err = -EINVAL;
listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
- switch(listnr) {
+ switch (listnr) {
default:
goto exit_err;
#ifdef CONFIG_AUDITSYSCALL
@@ -344,7 +344,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
switch (entry->rule.listnr) {
case AUDIT_FILTER_FS:
- switch(f->type) {
+ switch (f->type) {
case AUDIT_FSTYPE:
case AUDIT_FILTERKEY:
break;
@@ -651,7 +651,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->fields[i] = f->type;
data->fieldflags[i] = audit_ops[f->op];
- switch(f->type) {
+ switch (f->type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -694,7 +694,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->values[i] = f->val;
}
}
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ data->mask[i] = krule->mask[i];
return data;
}
@@ -717,7 +718,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
a->fields[i].op != b->fields[i].op)
return 1;
- switch(a->fields[i].type) {
+ switch (a->fields[i].type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -946,7 +947,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
int dont_count = 0;
/* If any of these, don't count towards total */
- switch(entry->rule.listnr) {
+ switch (entry->rule.listnr) {
case AUDIT_FILTER_USER:
case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
@@ -1029,7 +1030,7 @@ int audit_del_rule(struct audit_entry *entry)
int dont_count = 0;
/* If any of these, don't count towards total */
- switch(entry->rule.listnr) {
+ switch (entry->rule.listnr) {
case AUDIT_FILTER_USER:
case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
@@ -1083,7 +1084,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
/* This is a blocking read, so use audit_filter_mutex instead of rcu
* iterator to sync with list writers. */
- for (i=0; i<AUDIT_NR_FILTERS; i++) {
+ for (i = 0; i < AUDIT_NR_FILTERS; i++) {
list_for_each_entry(r, &audit_rules_list[i], list) {
struct audit_rule_data *data;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index addeed3df15d..b0cb7631e48b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -880,7 +880,8 @@ static void audit_filter_syscall(struct task_struct *tsk,
*/
static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_names *n,
- struct audit_context *ctx) {
+ struct audit_context *ctx)
+{
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
@@ -1064,7 +1065,8 @@ int audit_alloc(struct task_struct *tsk)
return 0;
}
- if (!(context = audit_alloc_context(state))) {
+ context = audit_alloc_context(state);
+ if (!context) {
kfree(key);
audit_log_lost("out of memory in audit_alloc");
return -ENOMEM;
@@ -2124,7 +2126,7 @@ retry:
d = dentry;
rcu_read_lock();
seq = read_seqbegin(&rename_lock);
- for(;;) {
+ for (;;) {
struct inode *inode = d_backing_inode(d);
if (inode && unlikely(inode->i_fsnotify_marks)) {
@@ -2456,6 +2458,8 @@ void __audit_inode_child(struct inode *parent,
}
}
+ cond_resched();
+
/* is there a matching child entry? */
list_for_each_entry(n, &context->names_list, list) {
/* can only match entries that have a name */
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 2dfe1079f772..6a906ff93006 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -31,6 +31,7 @@ config BPF_SYSCALL
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if NET
+ select NET_XGRESS if NET
select PAGE_POOL if NET
default n
help
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1d3892168d32..f526b7573e97 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -12,7 +12,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
-obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
obj-$(CONFIG_BPF_JIT) += dispatcher.o
@@ -21,6 +21,7 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
obj-$(CONFIG_BPF_SYSCALL) += offload.o
obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
+obj-$(CONFIG_BPF_SYSCALL) += tcx.o
endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 540331b610a9..addf3dd57b59 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -86,9 +86,6 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_bloom_filter *bloom;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
if (attr->key_size != 0 || attr->value_size == 0 ||
attr->max_entries == 0 ||
attr->map_flags & ~BLOOM_CREATE_FLAG_MASK ||
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 47d9948d768f..b5149cfce7d4 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -723,9 +723,6 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
!attr->btf_key_type_id || !attr->btf_value_type_id)
return -EINVAL;
- if (!bpf_capable())
- return -EPERM;
-
if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
return -E2BIG;
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index d99e89f113c4..3dabdd137d10 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -41,7 +41,12 @@ static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
/* bpf_lru_node helpers */
static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
{
- return node->ref;
+ return READ_ONCE(node->ref);
+}
+
+static void bpf_lru_node_clear_ref(struct bpf_lru_node *node)
+{
+ WRITE_ONCE(node->ref, 0);
}
static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
@@ -89,7 +94,7 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_move(&node->list, &l->lists[tgt_type]);
}
@@ -110,7 +115,7 @@ static void __bpf_lru_node_move(struct bpf_lru_list *l,
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
}
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
/* If the moving node is the next_inactive_rotation candidate,
* move the next_inactive_rotation pointer also.
@@ -353,7 +358,7 @@ static void __local_list_add_pending(struct bpf_lru *lru,
*(u32 *)((void *)node + lru->hash_offset) = hash;
node->cpu = cpu;
node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, local_pending_list(loc_l));
}
@@ -419,7 +424,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
if (!list_empty(free_list)) {
node = list_first_entry(free_list, struct bpf_lru_node, list);
*(u32 *)((void *)node + lru->hash_offset) = hash;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
}
@@ -522,7 +527,7 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
}
node->type = BPF_LRU_LOCAL_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_move(&node->list, local_free_list(loc_l));
raw_spin_unlock_irqrestore(&loc_l->lock, flags);
@@ -568,7 +573,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
node = (struct bpf_lru_node *)(buf + node_offset);
node->type = BPF_LRU_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
buf += elem_size;
}
@@ -594,7 +599,7 @@ again:
node = (struct bpf_lru_node *)(buf + node_offset);
node->cpu = cpu;
node->type = BPF_LRU_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
i++;
buf += elem_size;
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 4ea227c9c1ad..cbd8d3720c2b 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -64,11 +64,8 @@ struct bpf_lru {
static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
{
- /* ref is an approximation on access frequency. It does not
- * have to be very accurate. Hence, no protection is used.
- */
- if (!node->ref)
- node->ref = 1;
+ if (!READ_ONCE(node->ref))
+ WRITE_ONCE(node->ref, 1);
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
@@ -78,6 +75,5 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
void bpf_lru_destroy(struct bpf_lru *lru);
struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash);
void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
-void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node);
#endif
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index d3f0a4825fa6..fdc3e8705a3c 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -374,9 +374,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
struct bpf_struct_ops_value *uvalue, *kvalue;
const struct btf_member *member;
const struct btf_type *t = st_ops->type;
- struct bpf_tramp_links *tlinks = NULL;
+ struct bpf_tramp_links *tlinks;
void *udata, *kdata;
- int prog_fd, err = 0;
+ int prog_fd, err;
void *image, *image_end;
u32 i;
@@ -509,9 +509,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
}
if (st_map->map.map_flags & BPF_F_LINK) {
- err = st_ops->validate(kdata);
- if (err)
- goto reset_unlock;
+ err = 0;
+ if (st_ops->validate) {
+ err = st_ops->validate(kdata);
+ if (err)
+ goto reset_unlock;
+ }
set_memory_rox((long)st_map->image, 1);
/* Let bpf_link handle registration & unregistration.
*
@@ -655,9 +658,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
const struct btf_type *t, *vt;
struct bpf_map *map;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
if (!st_ops)
return ERR_PTR(-ENOTSUPP);
@@ -666,9 +666,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
if (attr->value_size != vt->size)
return ERR_PTR(-EINVAL);
- if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update))
- return ERR_PTR(-EOPNOTSUPP);
-
t = st_ops->type;
st_map_size = sizeof(*st_map) +
@@ -818,7 +815,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
struct bpf_struct_ops_map *st_map, *old_st_map;
struct bpf_map *old_map;
struct bpf_struct_ops_link *st_link;
- int err = 0;
+ int err;
st_link = container_of(link, struct bpf_struct_ops_link, link);
st_map = container_of(new_map, struct bpf_struct_ops_map, map);
@@ -826,6 +823,9 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
if (!bpf_struct_ops_valid_to_reg(new_map))
return -EINVAL;
+ if (!st_map->st_ops->update)
+ return -EOPNOTSUPP;
+
mutex_lock(&update_mutex);
old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 72b32b7cd9cd..1095bbe29859 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -29,6 +29,7 @@
#include <net/netfilter/nf_bpf_link.h>
#include <net/sock.h>
+#include <net/xdp.h>
#include "../tools/lib/bpf/relo_core.h"
/* BTF (BPF Type Format) is the meta data format which describes
@@ -222,10 +223,17 @@ enum btf_kfunc_hook {
enum {
BTF_KFUNC_SET_MAX_CNT = 256,
BTF_DTOR_KFUNC_MAX_CNT = 256,
+ BTF_KFUNC_FILTER_MAX_CNT = 16,
+};
+
+struct btf_kfunc_hook_filter {
+ btf_kfunc_filter_t filters[BTF_KFUNC_FILTER_MAX_CNT];
+ u32 nr_filters;
};
struct btf_kfunc_set_tab {
struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX];
+ struct btf_kfunc_hook_filter hook_filters[BTF_KFUNC_HOOK_MAX];
};
struct btf_id_dtor_kfunc_tab {
@@ -485,25 +493,26 @@ static bool btf_type_is_fwd(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
}
-static bool btf_type_nosize(const struct btf_type *t)
+static bool btf_type_is_datasec(const struct btf_type *t)
{
- return btf_type_is_void(t) || btf_type_is_fwd(t) ||
- btf_type_is_func(t) || btf_type_is_func_proto(t);
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
-static bool btf_type_nosize_or_null(const struct btf_type *t)
+static bool btf_type_is_decl_tag(const struct btf_type *t)
{
- return !t || btf_type_nosize(t);
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
}
-static bool btf_type_is_datasec(const struct btf_type *t)
+static bool btf_type_nosize(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
+ return btf_type_is_void(t) || btf_type_is_fwd(t) ||
+ btf_type_is_func(t) || btf_type_is_func_proto(t) ||
+ btf_type_is_decl_tag(t);
}
-static bool btf_type_is_decl_tag(const struct btf_type *t)
+static bool btf_type_nosize_or_null(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
+ return !t || btf_type_nosize(t);
}
static bool btf_type_is_decl_tag_target(const struct btf_type *t)
@@ -544,7 +553,7 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
return -ENOENT;
}
-static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
+s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
{
struct btf *btf;
s32 ret;
@@ -6125,8 +6134,9 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const char *tname, *mname, *tag_value;
u32 vlen, elem_id, mid;
- *flag = 0;
again:
+ if (btf_type_is_modifier(t))
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
tname = __btf_name_by_offset(btf, t->name_off);
if (!btf_type_is_struct(t)) {
bpf_log(log, "Type '%s' is not a struct\n", tname);
@@ -6134,6 +6144,14 @@ again:
}
vlen = btf_type_vlen(t);
+ if (BTF_INFO_KIND(t->info) == BTF_KIND_UNION && vlen != 1 && !(*flag & PTR_UNTRUSTED))
+ /*
+ * walking unions yields untrusted pointers
+ * with exception of __bpf_md_ptr and other
+ * unions with a single member
+ */
+ *flag |= PTR_UNTRUSTED;
+
if (off + size > t->size) {
/* If the last element is a variable size array, we may
* need to relax the rule.
@@ -6294,15 +6312,6 @@ error:
* of this field or inside of this struct
*/
if (btf_type_is_struct(mtype)) {
- if (BTF_INFO_KIND(mtype->info) == BTF_KIND_UNION &&
- btf_type_vlen(mtype) != 1)
- /*
- * walking unions yields untrusted pointers
- * with exception of __bpf_md_ptr and other
- * unions with a single member
- */
- *flag |= PTR_UNTRUSTED;
-
/* our field must be inside that union or struct */
t = mtype;
@@ -6360,7 +6369,7 @@ error:
* that also allows using an array of int as a scratch
* space. e.g. skb->cb[].
*/
- if (off + size > mtrue_end) {
+ if (off + size > mtrue_end && !(*flag & PTR_UNTRUSTED)) {
bpf_log(log,
"access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n",
mname, mtrue_end, tname, off, size);
@@ -6468,7 +6477,7 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log,
bool strict)
{
const struct btf_type *type;
- enum bpf_type_flag flag;
+ enum bpf_type_flag flag = 0;
int err;
/* Are we already done? */
@@ -7665,9 +7674,12 @@ static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
/* Kernel Function (kfunc) BTF ID set registration API */
static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
- struct btf_id_set8 *add_set)
+ const struct btf_kfunc_id_set *kset)
{
+ struct btf_kfunc_hook_filter *hook_filter;
+ struct btf_id_set8 *add_set = kset->set;
bool vmlinux_set = !btf_is_module(btf);
+ bool add_filter = !!kset->filter;
struct btf_kfunc_set_tab *tab;
struct btf_id_set8 *set;
u32 set_cnt;
@@ -7682,6 +7694,24 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
return 0;
tab = btf->kfunc_set_tab;
+
+ if (tab && add_filter) {
+ u32 i;
+
+ hook_filter = &tab->hook_filters[hook];
+ for (i = 0; i < hook_filter->nr_filters; i++) {
+ if (hook_filter->filters[i] == kset->filter) {
+ add_filter = false;
+ break;
+ }
+ }
+
+ if (add_filter && hook_filter->nr_filters == BTF_KFUNC_FILTER_MAX_CNT) {
+ ret = -E2BIG;
+ goto end;
+ }
+ }
+
if (!tab) {
tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN);
if (!tab)
@@ -7704,7 +7734,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
*/
if (!vmlinux_set) {
tab->sets[hook] = add_set;
- return 0;
+ goto do_add_filter;
}
/* In case of vmlinux sets, there may be more than one set being
@@ -7746,6 +7776,11 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
+do_add_filter:
+ if (add_filter) {
+ hook_filter = &tab->hook_filters[hook];
+ hook_filter->filters[hook_filter->nr_filters++] = kset->filter;
+ }
return 0;
end:
btf_free_kfunc_set_tab(btf);
@@ -7754,15 +7789,22 @@ end:
static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
enum btf_kfunc_hook hook,
- u32 kfunc_btf_id)
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
{
+ struct btf_kfunc_hook_filter *hook_filter;
struct btf_id_set8 *set;
- u32 *id;
+ u32 *id, i;
if (hook >= BTF_KFUNC_HOOK_MAX)
return NULL;
if (!btf->kfunc_set_tab)
return NULL;
+ hook_filter = &btf->kfunc_set_tab->hook_filters[hook];
+ for (i = 0; i < hook_filter->nr_filters; i++) {
+ if (hook_filter->filters[i](prog, kfunc_btf_id))
+ return NULL;
+ }
set = btf->kfunc_set_tab->sets[hook];
if (!set)
return NULL;
@@ -7817,23 +7859,25 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
* protection for looking up a well-formed btf->kfunc_set_tab.
*/
u32 *btf_kfunc_id_set_contains(const struct btf *btf,
- enum bpf_prog_type prog_type,
- u32 kfunc_btf_id)
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
enum btf_kfunc_hook hook;
u32 *kfunc_flags;
- kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id);
+ kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog);
if (kfunc_flags)
return kfunc_flags;
hook = bpf_prog_type_to_kfunc_hook(prog_type);
- return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
+ return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog);
}
-u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id)
+u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
{
- return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id);
+ return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog);
}
static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
@@ -7848,10 +7892,8 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
pr_err("missing vmlinux BTF, cannot register kfuncs\n");
return -ENOENT;
}
- if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
- pr_err("missing module BTF, cannot register kfuncs\n");
- return -ENOENT;
- }
+ if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+ pr_warn("missing module BTF, cannot register kfuncs\n");
return 0;
}
if (IS_ERR(btf))
@@ -7864,7 +7906,8 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
goto err_out;
}
- ret = btf_populate_kfunc_set(btf, hook, kset->set);
+ ret = btf_populate_kfunc_set(btf, hook, kset);
+
err_out:
btf_put(btf);
return ret;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 517b6a5928cc..5b2741aa0d9b 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1826,6 +1826,12 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
ret = 1;
} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
/* optlen is out of bounds */
+ if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
+ pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
+ ctx.optlen, max_optlen);
+ ret = 0;
+ goto out;
+ }
ret = -EFAULT;
} else {
/* optlen within bounds, run kernel handler */
@@ -1881,8 +1887,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
.optname = optname,
.current_task = current,
};
+ int orig_optlen;
int ret;
+ orig_optlen = max_optlen;
ctx.optlen = max_optlen;
max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
@@ -1905,6 +1913,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
ret = -EFAULT;
goto out;
}
+ orig_optlen = ctx.optlen;
if (copy_from_user(ctx.optval, optval,
min(ctx.optlen, max_optlen)) != 0) {
@@ -1922,6 +1931,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) {
+ if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
+ pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
+ ctx.optlen, max_optlen);
+ ret = retval;
+ goto out;
+ }
ret = -EFAULT;
goto out;
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7421487422d4..0f8f036d8bd1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -61,6 +61,7 @@
#define AX regs[BPF_REG_AX]
#define ARG1 regs[BPF_REG_ARG1]
#define CTX regs[BPF_REG_CTX]
+#define OFF insn->off
#define IMM insn->imm
struct bpf_mem_alloc bpf_global_ma;
@@ -372,7 +373,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
{
const s32 off_min = S16_MIN, off_max = S16_MAX;
s32 delta = end_new - end_old;
- s32 off = insn->off;
+ s32 off;
+
+ if (insn->code == (BPF_JMP32 | BPF_JA))
+ off = insn->imm;
+ else
+ off = insn->off;
if (curr < pos && curr + off + 1 >= end_old)
off += delta;
@@ -380,8 +386,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
off -= delta;
if (off < off_min || off > off_max)
return -ERANGE;
- if (!probe_pass)
- insn->off = off;
+ if (!probe_pass) {
+ if (insn->code == (BPF_JMP32 | BPF_JA))
+ insn->imm = off;
+ else
+ insn->off = off;
+ }
return 0;
}
@@ -1271,7 +1281,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_ALU | BPF_MOD | BPF_K:
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
- *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
+ *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
break;
case BPF_ALU64 | BPF_ADD | BPF_K:
@@ -1285,7 +1295,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_ALU64 | BPF_MOD | BPF_K:
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
- *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX);
+ *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
break;
case BPF_JMP | BPF_JEQ | BPF_K:
@@ -1523,6 +1533,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(ALU64, DIV, X), \
INSN_3(ALU64, MOD, X), \
INSN_2(ALU64, NEG), \
+ INSN_3(ALU64, END, TO_LE), \
/* Immediate based. */ \
INSN_3(ALU64, ADD, K), \
INSN_3(ALU64, SUB, K), \
@@ -1591,6 +1602,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(JMP, JSLE, K), \
INSN_3(JMP, JSET, K), \
INSN_2(JMP, JA), \
+ INSN_2(JMP32, JA), \
/* Store instructions. */ \
/* Register based. */ \
INSN_3(STX, MEM, B), \
@@ -1610,6 +1622,9 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(LDX, MEM, H), \
INSN_3(LDX, MEM, W), \
INSN_3(LDX, MEM, DW), \
+ INSN_3(LDX, MEMSX, B), \
+ INSN_3(LDX, MEMSX, H), \
+ INSN_3(LDX, MEMSX, W), \
/* Immediate based. */ \
INSN_3(LD, IMM, DW)
@@ -1635,12 +1650,6 @@ bool bpf_opcode_in_insntable(u8 code)
}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
-u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
-{
- memset(dst, 0, size);
- return -EFAULT;
-}
-
/**
* ___bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
@@ -1666,6 +1675,9 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
[BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W,
};
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
@@ -1733,13 +1745,36 @@ select_insn:
DST = -DST;
CONT;
ALU_MOV_X:
- DST = (u32) SRC;
+ switch (OFF) {
+ case 0:
+ DST = (u32) SRC;
+ break;
+ case 8:
+ DST = (u32)(s8) SRC;
+ break;
+ case 16:
+ DST = (u32)(s16) SRC;
+ break;
+ }
CONT;
ALU_MOV_K:
DST = (u32) IMM;
CONT;
ALU64_MOV_X:
- DST = SRC;
+ switch (OFF) {
+ case 0:
+ DST = SRC;
+ break;
+ case 8:
+ DST = (s8) SRC;
+ break;
+ case 16:
+ DST = (s16) SRC;
+ break;
+ case 32:
+ DST = (s32) SRC;
+ break;
+ }
CONT;
ALU64_MOV_K:
DST = IMM;
@@ -1761,36 +1796,114 @@ select_insn:
(*(s64 *) &DST) >>= IMM;
CONT;
ALU64_MOD_X:
- div64_u64_rem(DST, SRC, &AX);
- DST = AX;
+ switch (OFF) {
+ case 0:
+ div64_u64_rem(DST, SRC, &AX);
+ DST = AX;
+ break;
+ case 1:
+ AX = div64_s64(DST, SRC);
+ DST = DST - AX * SRC;
+ break;
+ }
CONT;
ALU_MOD_X:
- AX = (u32) DST;
- DST = do_div(AX, (u32) SRC);
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) SRC);
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ AX = do_div(AX, abs((s32)SRC));
+ if ((s32)DST < 0)
+ DST = (u32)-AX;
+ else
+ DST = (u32)AX;
+ break;
+ }
CONT;
ALU64_MOD_K:
- div64_u64_rem(DST, IMM, &AX);
- DST = AX;
+ switch (OFF) {
+ case 0:
+ div64_u64_rem(DST, IMM, &AX);
+ DST = AX;
+ break;
+ case 1:
+ AX = div64_s64(DST, IMM);
+ DST = DST - AX * IMM;
+ break;
+ }
CONT;
ALU_MOD_K:
- AX = (u32) DST;
- DST = do_div(AX, (u32) IMM);
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) IMM);
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ AX = do_div(AX, abs((s32)IMM));
+ if ((s32)DST < 0)
+ DST = (u32)-AX;
+ else
+ DST = (u32)AX;
+ break;
+ }
CONT;
ALU64_DIV_X:
- DST = div64_u64(DST, SRC);
+ switch (OFF) {
+ case 0:
+ DST = div64_u64(DST, SRC);
+ break;
+ case 1:
+ DST = div64_s64(DST, SRC);
+ break;
+ }
CONT;
ALU_DIV_X:
- AX = (u32) DST;
- do_div(AX, (u32) SRC);
- DST = (u32) AX;
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ do_div(AX, (u32) SRC);
+ DST = (u32) AX;
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ do_div(AX, abs((s32)SRC));
+ if (((s32)DST < 0) == ((s32)SRC < 0))
+ DST = (u32)AX;
+ else
+ DST = (u32)-AX;
+ break;
+ }
CONT;
ALU64_DIV_K:
- DST = div64_u64(DST, IMM);
+ switch (OFF) {
+ case 0:
+ DST = div64_u64(DST, IMM);
+ break;
+ case 1:
+ DST = div64_s64(DST, IMM);
+ break;
+ }
CONT;
ALU_DIV_K:
- AX = (u32) DST;
- do_div(AX, (u32) IMM);
- DST = (u32) AX;
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ do_div(AX, (u32) IMM);
+ DST = (u32) AX;
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ do_div(AX, abs((s32)IMM));
+ if (((s32)DST < 0) == ((s32)IMM < 0))
+ DST = (u32)AX;
+ else
+ DST = (u32)-AX;
+ break;
+ }
CONT;
ALU_END_TO_BE:
switch (IMM) {
@@ -1818,6 +1931,19 @@ select_insn:
break;
}
CONT;
+ ALU64_END_TO_LE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) __swab16(DST);
+ break;
+ case 32:
+ DST = (__force u32) __swab32(DST);
+ break;
+ case 64:
+ DST = (__force u64) __swab64(DST);
+ break;
+ }
+ CONT;
/* CALL */
JMP_CALL:
@@ -1867,6 +1993,9 @@ out:
JMP_JA:
insn += insn->off;
CONT;
+ JMP32_JA:
+ insn += insn->imm;
+ CONT;
JMP_EXIT:
return BPF_R0;
/* JMP */
@@ -1931,8 +2060,8 @@ out:
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
CONT; \
LDX_PROBE_MEM_##SIZEOP: \
- bpf_probe_read_kernel(&DST, sizeof(SIZE), \
- (const void *)(long) (SRC + insn->off)); \
+ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
+ (const void *)(long) (SRC + insn->off)); \
DST = *((SIZE *)&DST); \
CONT;
@@ -1942,6 +2071,21 @@ out:
LDST(DW, u64)
#undef LDST
+#define LDSX(SIZEOP, SIZE) \
+ LDX_MEMSX_##SIZEOP: \
+ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT; \
+ LDX_PROBE_MEMSX_##SIZEOP: \
+ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
+ (const void *)(long) (SRC + insn->off)); \
+ DST = *((SIZE *)&DST); \
+ CONT;
+
+ LDSX(B, s8)
+ LDSX(H, s16)
+ LDSX(W, s32)
+#undef LDSX
+
#define ATOMIC_ALU_OP(BOP, KOP) \
case BOP: \
if (BPF_SIZE(insn->code) == BPF_W) \
@@ -2064,14 +2208,16 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
-static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
- const struct bpf_insn *insn) = {
+static __maybe_unused
+u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
+ const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
+#ifdef CONFIG_BPF_SYSCALL
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
{
stack_depth = max_t(u32, stack_depth, 1);
@@ -2080,7 +2226,7 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
__bpf_call_base_args;
insn->code = BPF_JMP | BPF_CALL_ARGS;
}
-
+#endif
#else
static unsigned int __bpf_prog_ret0_warn(const void *ctx,
const struct bpf_insn *insn)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8ec18faa74ac..e42a1bdb7f53 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -28,7 +28,7 @@
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
-#include <linux/capability.h>
+#include <linux/completion.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>
@@ -61,8 +61,6 @@ struct bpf_cpu_map_entry {
/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
struct xdp_bulk_queue __percpu *bulkq;
- struct bpf_cpu_map *cmap;
-
/* Queue with potential multi-producers, and single-consumer kthread */
struct ptr_ring *queue;
struct task_struct *kthread;
@@ -70,10 +68,8 @@ struct bpf_cpu_map_entry {
struct bpf_cpumap_val value;
struct bpf_prog *prog;
- atomic_t refcnt; /* Control when this struct can be free'ed */
- struct rcu_head rcu;
-
- struct work_struct kthread_stop_wq;
+ struct completion kthread_running;
+ struct rcu_work free_work;
};
struct bpf_cpu_map {
@@ -89,9 +85,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
u32 value_size = attr->value_size;
struct bpf_cpu_map *cmap;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
(value_size != offsetofend(struct bpf_cpumap_val, qsize) &&
@@ -121,27 +114,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
return &cmap->map;
}
-static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
- atomic_inc(&rcpu->refcnt);
-}
-
-/* called from workqueue, to workaround syscall using preempt_disable */
-static void cpu_map_kthread_stop(struct work_struct *work)
-{
- struct bpf_cpu_map_entry *rcpu;
-
- rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
-
- /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
- * as it waits until all in-flight call_rcu() callbacks complete.
- */
- rcu_barrier();
-
- /* kthread_stop will wake_up_process and wait for it to complete */
- kthread_stop(rcpu->kthread);
-}
-
static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
{
/* The tear-down procedure should have made sure that queue is
@@ -149,23 +121,16 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
* invoked cpu_map_kthread_stop(). Catch any broken behaviour
* gracefully and warn once.
*/
- struct xdp_frame *xdpf;
-
- while ((xdpf = ptr_ring_consume(ring)))
- if (WARN_ON_ONCE(xdpf))
- xdp_return_frame(xdpf);
-}
+ void *ptr;
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
- if (atomic_dec_and_test(&rcpu->refcnt)) {
- if (rcpu->prog)
- bpf_prog_put(rcpu->prog);
- /* The queue should be empty at this point */
- __cpu_map_ring_cleanup(rcpu->queue);
- ptr_ring_cleanup(rcpu->queue, NULL);
- kfree(rcpu->queue);
- kfree(rcpu);
+ while ((ptr = ptr_ring_consume(ring))) {
+ WARN_ON_ONCE(1);
+ if (unlikely(__ptr_test_bit(0, &ptr))) {
+ __ptr_clear_bit(0, &ptr);
+ kfree_skb(ptr);
+ continue;
+ }
+ xdp_return_frame(ptr);
}
}
@@ -294,11 +259,11 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
return nframes;
}
-
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
+ complete(&rcpu->kthread_running);
set_current_state(TASK_INTERRUPTIBLE);
/* When kthread gives stop order, then rcpu have been disconnected
@@ -393,7 +358,6 @@ static int cpu_map_kthread_run(void *data)
}
__set_current_state(TASK_RUNNING);
- put_cpu_map_entry(rcpu);
return 0;
}
@@ -463,19 +427,23 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
goto free_ptr_ring;
/* Setup kthread */
+ init_completion(&rcpu->kthread_running);
rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
"cpumap/%d/map:%d", cpu,
map->id);
if (IS_ERR(rcpu->kthread))
goto free_prog;
- get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
- get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
-
/* Make sure kthread runs on a single CPU */
kthread_bind(rcpu->kthread, cpu);
wake_up_process(rcpu->kthread);
+ /* Make sure kthread has been running, so kthread_stop() will not
+ * stop the kthread prematurely and all pending frames or skbs
+ * will be handled by the kthread before kthread_stop() returns.
+ */
+ wait_for_completion(&rcpu->kthread_running);
+
return rcpu;
free_prog:
@@ -492,40 +460,40 @@ free_rcu:
return NULL;
}
-static void __cpu_map_entry_free(struct rcu_head *rcu)
+static void __cpu_map_entry_free(struct work_struct *work)
{
struct bpf_cpu_map_entry *rcpu;
/* This cpu_map_entry have been disconnected from map and one
- * RCU grace-period have elapsed. Thus, XDP cannot queue any
+ * RCU grace-period have elapsed. Thus, XDP cannot queue any
* new packets and cannot change/set flush_needed that can
* find this entry.
*/
- rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+ rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work);
+
+ /* kthread_stop will wake_up_process and wait for it to complete.
+ * cpu_map_kthread_run() makes sure the pointer ring is empty
+ * before exiting.
+ */
+ kthread_stop(rcpu->kthread);
+ if (rcpu->prog)
+ bpf_prog_put(rcpu->prog);
+ /* The queue should be empty at this point */
+ __cpu_map_ring_cleanup(rcpu->queue);
+ ptr_ring_cleanup(rcpu->queue, NULL);
+ kfree(rcpu->queue);
free_percpu(rcpu->bulkq);
- /* Cannot kthread_stop() here, last put free rcpu resources */
- put_cpu_map_entry(rcpu);
+ kfree(rcpu);
}
-/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
- * ensure any driver rcu critical sections have completed, but this
- * does not guarantee a flush has happened yet. Because driver side
- * rcu_read_lock/unlock only protects the running XDP program. The
- * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
- * pending flush op doesn't fail.
- *
- * The bpf_cpu_map_entry is still used by the kthread, and there can
- * still be pending packets (in queue and percpu bulkq). A refcnt
- * makes sure to last user (kthread_stop vs. call_rcu) free memory
- * resources.
- *
- * The rcu callback __cpu_map_entry_free flush remaining packets in
- * percpu bulkq to queue. Due to caller map_delete_elem() disable
- * preemption, cannot call kthread_stop() to make sure queue is empty.
- * Instead a work_queue is started for stopping kthread,
- * cpu_map_kthread_stop, which waits for an RCU grace period before
- * stopping kthread, emptying the queue.
+/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old
+ * entry is no longer in use before freeing. We use queue_rcu_work() to call
+ * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace
+ * period. This means that (a) all pending enqueue and flush operations have
+ * completed (because of the RCU callback), and (b) we are in a workqueue
+ * context where we can stop the kthread and wait for it to exit before freeing
+ * everything.
*/
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
@@ -534,9 +502,8 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
if (old_rcpu) {
- call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
- INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
- schedule_work(&old_rcpu->kthread_stop_wq);
+ INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
+ queue_rcu_work(system_wq, &old_rcpu->free_work);
}
}
@@ -548,7 +515,7 @@ static long cpu_map_delete_elem(struct bpf_map *map, void *key)
if (key_cpu >= map->max_entries)
return -EINVAL;
- /* notice caller map_delete_elem() use preempt_disable() */
+ /* notice caller map_delete_elem() uses rcu_read_lock() */
__cpu_map_entry_replace(cmap, key_cpu, NULL);
return 0;
}
@@ -584,7 +551,6 @@ static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
if (!rcpu)
return -ENOMEM;
- rcpu->cmap = cmap;
}
rcu_read_lock();
__cpu_map_entry_replace(cmap, key_cpu, rcpu);
@@ -600,16 +566,15 @@ static void cpu_map_free(struct bpf_map *map)
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the bpf programs (can be more than one that used this map) were
* disconnected from events. Wait for outstanding critical sections in
- * these programs to complete. The rcu critical section only guarantees
- * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
- * It does __not__ ensure pending flush operations (if any) are
- * complete.
+ * these programs to complete. synchronize_rcu() below not only
+ * guarantees no further "XDP/bpf-side" reads against
+ * bpf_cpu_map->cpu_map, but also ensure pending flush operations
+ * (if any) are completed.
*/
-
synchronize_rcu();
- /* For cpu_map the remote CPUs can still be using the entries
- * (struct bpf_cpu_map_entry).
+ /* The only possible user of bpf_cpu_map_entry is
+ * cpu_map_kthread_run().
*/
for (i = 0; i < cmap->map.max_entries; i++) {
struct bpf_cpu_map_entry *rcpu;
@@ -618,8 +583,8 @@ static void cpu_map_free(struct bpf_map *map)
if (!rcpu)
continue;
- /* bq flush and cleanup happens after RCU grace-period */
- __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+ /* Stop kthread and cleanup entry directly */
+ __cpu_map_entry_free(&rcpu->free_work.work);
}
bpf_map_area_free(cmap->cpu_map);
bpf_map_area_free(cmap);
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 7efdf5d770ca..6983af8e093c 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -9,7 +9,6 @@
/**
* struct bpf_cpumask - refcounted BPF cpumask wrapper structure
* @cpumask: The actual cpumask embedded in the struct.
- * @rcu: The RCU head used to free the cpumask with RCU safety.
* @usage: Object reference counter. When the refcount goes to 0, the
* memory is released back to the BPF allocator, which provides
* RCU safety.
@@ -25,7 +24,6 @@
*/
struct bpf_cpumask {
cpumask_t cpumask;
- struct rcu_head rcu;
refcount_t usage;
};
@@ -82,16 +80,6 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
return cpumask;
}
-static void cpumask_free_cb(struct rcu_head *head)
-{
- struct bpf_cpumask *cpumask;
-
- cpumask = container_of(head, struct bpf_cpumask, rcu);
- migrate_disable();
- bpf_mem_cache_free(&bpf_cpumask_ma, cpumask);
- migrate_enable();
-}
-
/**
* bpf_cpumask_release() - Release a previously acquired BPF cpumask.
* @cpumask: The cpumask being released.
@@ -102,8 +90,12 @@ static void cpumask_free_cb(struct rcu_head *head)
*/
__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
{
- if (refcount_dec_and_test(&cpumask->usage))
- call_rcu(&cpumask->rcu, cpumask_free_cb);
+ if (!refcount_dec_and_test(&cpumask->usage))
+ return;
+
+ migrate_disable();
+ bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask);
+ migrate_enable();
}
/**
@@ -132,6 +124,21 @@ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
}
/**
+ * bpf_cpumask_first_and() - Return the index of the first nonzero bit from the
+ * AND of two cpumasks.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Find the index of the first nonzero bit of the AND of two cpumasks.
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_first_and(src1, src2);
+}
+
+/**
* bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask.
* @cpu: The CPU to be set in the cpumask.
* @cpumask: The BPF cpumask in which a bit is being set.
@@ -367,7 +374,7 @@ __bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask
}
/**
- * bpf_cpumask_any() - Return a random set CPU from a cpumask.
+ * bpf_cpumask_any_distribute() - Return a random set CPU from a cpumask.
* @cpumask: The cpumask being queried.
*
* Return:
@@ -376,26 +383,28 @@ __bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask
*
* A struct bpf_cpumask pointer may be safely passed to @src.
*/
-__bpf_kfunc u32 bpf_cpumask_any(const struct cpumask *cpumask)
+__bpf_kfunc u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask)
{
- return cpumask_any(cpumask);
+ return cpumask_any_distribute(cpumask);
}
/**
- * bpf_cpumask_any_and() - Return a random set CPU from the AND of two
- * cpumasks.
+ * bpf_cpumask_any_and_distribute() - Return a random set CPU from the AND of
+ * two cpumasks.
* @src1: The first cpumask.
* @src2: The second cpumask.
*
* Return:
- * * A random set bit within [0, num_cpus) if at least one bit is set.
+ * * A random set bit within [0, num_cpus) from the AND of two cpumasks, if at
+ * least one bit is set.
* * >= num_cpus if no bit is set.
*
* struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
*/
-__bpf_kfunc u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2)
+__bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
+ const struct cpumask *src2)
{
- return cpumask_any_and(src1, src2);
+ return cpumask_any_and_distribute(src1, src2);
}
__diag_pop();
@@ -406,6 +415,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU)
@@ -422,8 +432,8 @@ BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
-BTF_ID_FLAGS(func, bpf_cpumask_any, KF_RCU)
-BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
BTF_SET8_END(cpumask_kfunc_btf_ids)
static const struct btf_kfunc_id_set cpumask_kfunc_set = {
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 802692fa3905..4d42f6ed6c11 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -65,7 +65,6 @@ struct xdp_dev_bulk_queue {
struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
struct hlist_node index_hlist;
- struct bpf_dtab *dtab;
struct bpf_prog *xdp_prog;
struct rcu_head rcu;
unsigned int idx;
@@ -160,9 +159,6 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
struct bpf_dtab *dtab;
int err;
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
-
dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
if (!dtab)
return ERR_PTR(-ENOMEM);
@@ -877,7 +873,6 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
}
dev->idx = idx;
- dev->dtab = dtab;
if (prog) {
dev->xdp_prog = prog;
dev->val.bpf_prog.id = prog->aux->id;
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 7b4afb7d96db..49940c26a227 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -87,6 +87,17 @@ const char *const bpf_alu_string[16] = {
[BPF_END >> 4] = "endian",
};
+static const char *const bpf_alu_sign_string[16] = {
+ [BPF_DIV >> 4] = "s/=",
+ [BPF_MOD >> 4] = "s%=",
+};
+
+static const char *const bpf_movsx_string[4] = {
+ [0] = "(s8)",
+ [1] = "(s16)",
+ [3] = "(s32)",
+};
+
static const char *const bpf_atomic_alu_string[16] = {
[BPF_ADD >> 4] = "add",
[BPF_AND >> 4] = "and",
@@ -101,6 +112,12 @@ static const char *const bpf_ldst_string[] = {
[BPF_DW >> 3] = "u64",
};
+static const char *const bpf_ldsx_string[] = {
+ [BPF_W >> 3] = "s32",
+ [BPF_H >> 3] = "s16",
+ [BPF_B >> 3] = "s8",
+};
+
static const char *const bpf_jmp_string[16] = {
[BPF_JA >> 4] = "jmp",
[BPF_JEQ >> 4] = "==",
@@ -128,6 +145,27 @@ static void print_bpf_end_insn(bpf_insn_print_t verbose,
insn->imm, insn->dst_reg);
}
+static void print_bpf_bswap_insn(bpf_insn_print_t verbose,
+ void *private_data,
+ const struct bpf_insn *insn)
+{
+ verbose(private_data, "(%02x) r%d = bswap%d r%d\n",
+ insn->code, insn->dst_reg,
+ insn->imm, insn->dst_reg);
+}
+
+static bool is_sdiv_smod(const struct bpf_insn *insn)
+{
+ return (BPF_OP(insn->code) == BPF_DIV || BPF_OP(insn->code) == BPF_MOD) &&
+ insn->off == 1;
+}
+
+static bool is_movsx(const struct bpf_insn *insn)
+{
+ return BPF_OP(insn->code) == BPF_MOV &&
+ (insn->off == 8 || insn->off == 16 || insn->off == 32);
+}
+
void print_bpf_insn(const struct bpf_insn_cbs *cbs,
const struct bpf_insn *insn,
bool allow_ptr_leaks)
@@ -138,7 +176,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
if (class == BPF_ALU || class == BPF_ALU64) {
if (BPF_OP(insn->code) == BPF_END) {
if (class == BPF_ALU64)
- verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code);
+ print_bpf_bswap_insn(verbose, cbs->private_data, insn);
else
print_bpf_end_insn(verbose, cbs->private_data, insn);
} else if (BPF_OP(insn->code) == BPF_NEG) {
@@ -147,17 +185,20 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n",
+ verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
+ is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4]
+ : bpf_alu_string[BPF_OP(insn->code) >> 4],
+ is_movsx(insn) ? bpf_movsx_string[(insn->off >> 3) - 1] : "",
class == BPF_ALU ? 'w' : 'r',
insn->src_reg);
} else {
verbose(cbs->private_data, "(%02x) %c%d %s %d\n",
insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
+ is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4]
+ : bpf_alu_string[BPF_OP(insn->code) >> 4],
insn->imm);
}
} else if (class == BPF_STX) {
@@ -218,13 +259,15 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
}
} else if (class == BPF_LDX) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
+ if (BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) {
verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
return;
}
verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n",
insn->code, insn->dst_reg,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ BPF_MODE(insn->code) == BPF_MEM ?
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3] :
+ bpf_ldsx_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->off);
} else if (class == BPF_LD) {
if (BPF_MODE(insn->code) == BPF_ABS) {
@@ -279,6 +322,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
} else if (insn->code == (BPF_JMP | BPF_JA)) {
verbose(cbs->private_data, "(%02x) goto pc%+d\n",
insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ verbose(cbs->private_data, "(%02x) gotol pc%+d\n",
+ insn->code, insn->imm);
} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
verbose(cbs->private_data, "(%02x) exit\n", insn->code);
} else if (BPF_SRC(insn->code) == BPF_X) {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 9901efee4339..a8c7e1c5abfa 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -302,6 +302,7 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
struct htab_elem *l;
if (node) {
+ bpf_map_inc_elem_count(&htab->map);
l = container_of(node, struct htab_elem, lru_node);
memcpy(l->key, key, htab->map.key_size);
return l;
@@ -422,12 +423,6 @@ static int htab_map_alloc_check(union bpf_attr *attr)
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
- if (lru && !bpf_capable())
- /* LRU implementation is much complicated than other
- * maps. Hence, limit to CAP_BPF.
- */
- return -EPERM;
-
if (zero_seed && !capable(CAP_SYS_ADMIN))
/* Guard against local DoS, and discourage production use. */
return -EPERM;
@@ -516,12 +511,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
+ err = bpf_map_init_elem_count(&htab->map);
+ if (err)
+ goto free_htab;
+
err = -ENOMEM;
htab->buckets = bpf_map_area_alloc(htab->n_buckets *
sizeof(struct bucket),
htab->map.numa_node);
if (!htab->buckets)
- goto free_htab;
+ goto free_elem_count;
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
@@ -599,6 +598,8 @@ free_map_locked:
bpf_map_area_free(htab->buckets);
bpf_mem_alloc_destroy(&htab->pcpu_ma);
bpf_mem_alloc_destroy(&htab->ma);
+free_elem_count:
+ bpf_map_free_elem_count(&htab->map);
free_htab:
lockdep_unregister_key(&htab->lockdep_key);
bpf_map_area_free(htab);
@@ -810,6 +811,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
check_and_free_fields(htab, l);
+ bpf_map_dec_elem_count(&htab->map);
break;
}
@@ -906,6 +908,8 @@ static bool is_map_full(struct bpf_htab *htab)
static void inc_elem_count(struct bpf_htab *htab)
{
+ bpf_map_inc_elem_count(&htab->map);
+
if (htab->use_percpu_counter)
percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
else
@@ -914,6 +918,8 @@ static void inc_elem_count(struct bpf_htab *htab)
static void dec_elem_count(struct bpf_htab *htab)
{
+ bpf_map_dec_elem_count(&htab->map);
+
if (htab->use_percpu_counter)
percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
else
@@ -926,6 +932,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
htab_put_fd_value(htab, l);
if (htab_is_prealloc(htab)) {
+ bpf_map_dec_elem_count(&htab->map);
check_and_free_fields(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
@@ -1006,6 +1013,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
if (!l)
return ERR_PTR(-E2BIG);
l_new = container_of(l, struct htab_elem, fnode);
+ bpf_map_inc_elem_count(&htab->map);
}
} else {
if (is_map_full(htab))
@@ -1174,6 +1182,7 @@ err:
static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
{
check_and_free_fields(htab, elem);
+ bpf_map_dec_elem_count(&htab->map);
bpf_lru_push_free(&htab->lru, &elem->lru_node);
}
@@ -1363,8 +1372,10 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
err:
htab_unlock_bucket(htab, b, hash, flags);
err_lock_bucket:
- if (l_new)
+ if (l_new) {
+ bpf_map_dec_elem_count(&htab->map);
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ }
return ret;
}
@@ -1529,6 +1540,7 @@ static void htab_map_free(struct bpf_map *map)
prealloc_destroy(htab);
}
+ bpf_map_free_elem_count(map);
free_percpu(htab->extra_elems);
bpf_map_area_free(htab->buckets);
bpf_mem_alloc_destroy(&htab->pcpu_ma);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 8d368fa353f9..8bd3812fb8df 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -286,6 +286,7 @@ static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+ preempt_disable();
arch_spin_lock(l);
}
@@ -294,6 +295,7 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
arch_spinlock_t *l = (void *)lock;
arch_spin_unlock(l);
+ preempt_enable();
}
#else
@@ -1423,7 +1425,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
#define DYNPTR_SIZE_MASK 0xFFFFFF
#define DYNPTR_RDONLY_BIT BIT(31)
-static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
+static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_RDONLY_BIT;
}
@@ -1443,11 +1445,18 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt
return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
}
-u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
+u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_SIZE_MASK;
}
+static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size)
+{
+ u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
+
+ ptr->size = new_size | metadata;
+}
+
int bpf_dynptr_check_size(u32 size)
{
return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
@@ -1469,7 +1478,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
{
- u32 size = bpf_dynptr_get_size(ptr);
+ u32 size = __bpf_dynptr_size(ptr);
if (len > size || offset > size - len)
return -E2BIG;
@@ -1563,7 +1572,7 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v
enum bpf_dynptr_type type;
int err;
- if (!dst->data || bpf_dynptr_is_rdonly(dst))
+ if (!dst->data || __bpf_dynptr_is_rdonly(dst))
return -EINVAL;
err = bpf_dynptr_check_off_len(dst, offset, len);
@@ -1619,7 +1628,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
if (err)
return 0;
- if (bpf_dynptr_is_rdonly(ptr))
+ if (__bpf_dynptr_is_rdonly(ptr))
return 0;
type = bpf_dynptr_get_type(ptr);
@@ -1906,7 +1915,11 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
if (rec)
bpf_obj_free_fields(rec, p);
- bpf_mem_free(&bpf_global_ma, p);
+
+ if (rec && rec->refcount_off >= 0)
+ bpf_mem_free_rcu(&bpf_global_ma, p);
+ else
+ bpf_mem_free(&bpf_global_ma, p);
}
__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1926,28 +1939,38 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta
* bpf_refcount type so that it is emitted in vmlinux BTF
*/
ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off);
+ if (!refcount_inc_not_zero((refcount_t *)ref))
+ return NULL;
- refcount_inc((refcount_t *)ref);
+ /* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null
+ * in verifier.c
+ */
return (void *)p__refcounted_kptr;
}
-static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head,
+static int __bpf_list_add(struct bpf_list_node_kern *node,
+ struct bpf_list_head *head,
bool tail, struct btf_record *rec, u64 off)
{
- struct list_head *n = (void *)node, *h = (void *)head;
+ struct list_head *n = &node->list_head, *h = (void *)head;
/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
* called on its fields, so init here
*/
if (unlikely(!h->next))
INIT_LIST_HEAD(h);
- if (!list_empty(n)) {
+
+ /* node->owner != NULL implies !list_empty(n), no need to separately
+ * check the latter
+ */
+ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
- __bpf_obj_drop_impl(n - off, rec);
+ __bpf_obj_drop_impl((void *)n - off, rec);
return -EINVAL;
}
tail ? list_add_tail(n, h) : list_add(n, h);
+ WRITE_ONCE(node->owner, head);
return 0;
}
@@ -1956,25 +1979,26 @@ __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
struct bpf_list_node *node,
void *meta__ign, u64 off)
{
+ struct bpf_list_node_kern *n = (void *)node;
struct btf_struct_meta *meta = meta__ign;
- return __bpf_list_add(node, head, false,
- meta ? meta->record : NULL, off);
+ return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
}
__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
struct bpf_list_node *node,
void *meta__ign, u64 off)
{
+ struct bpf_list_node_kern *n = (void *)node;
struct btf_struct_meta *meta = meta__ign;
- return __bpf_list_add(node, head, true,
- meta ? meta->record : NULL, off);
+ return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
}
static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
{
struct list_head *n, *h = (void *)head;
+ struct bpf_list_node_kern *node;
/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
* called on its fields, so init here
@@ -1983,8 +2007,14 @@ static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tai
INIT_LIST_HEAD(h);
if (list_empty(h))
return NULL;
+
n = tail ? h->prev : h->next;
+ node = container_of(n, struct bpf_list_node_kern, list_head);
+ if (WARN_ON_ONCE(READ_ONCE(node->owner) != head))
+ return NULL;
+
list_del_init(n);
+ WRITE_ONCE(node->owner, NULL);
return (struct bpf_list_node *)n;
}
@@ -2001,31 +2031,40 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
struct bpf_rb_node *node)
{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
struct rb_root_cached *r = (struct rb_root_cached *)root;
- struct rb_node *n = (struct rb_node *)node;
+ struct rb_node *n = &node_internal->rb_node;
- if (RB_EMPTY_NODE(n))
+ /* node_internal->owner != root implies either RB_EMPTY_NODE(n) or
+ * n is owned by some other tree. No need to check RB_EMPTY_NODE(n)
+ */
+ if (READ_ONCE(node_internal->owner) != root)
return NULL;
rb_erase_cached(n, r);
RB_CLEAR_NODE(n);
+ WRITE_ONCE(node_internal->owner, NULL);
return (struct bpf_rb_node *)n;
}
/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
* program
*/
-static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+static int __bpf_rbtree_add(struct bpf_rb_root *root,
+ struct bpf_rb_node_kern *node,
void *less, struct btf_record *rec, u64 off)
{
struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
- struct rb_node *parent = NULL, *n = (struct rb_node *)node;
+ struct rb_node *parent = NULL, *n = &node->rb_node;
bpf_callback_t cb = (bpf_callback_t)less;
bool leftmost = true;
- if (!RB_EMPTY_NODE(n)) {
+ /* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately
+ * check the latter
+ */
+ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
- __bpf_obj_drop_impl(n - off, rec);
+ __bpf_obj_drop_impl((void *)n - off, rec);
return -EINVAL;
}
@@ -2041,6 +2080,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
rb_link_node(n, parent, link);
rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
+ WRITE_ONCE(node->owner, root);
return 0;
}
@@ -2049,8 +2089,9 @@ __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node
void *meta__ign, u64 off)
{
struct btf_struct_meta *meta = meta__ign;
+ struct bpf_rb_node_kern *n = (void *)node;
- return __bpf_rbtree_add(root, node, (void *)less, meta ? meta->record : NULL, off);
+ return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off);
}
__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
@@ -2142,6 +2183,22 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
return NULL;
return cgrp;
}
+
+/**
+ * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test
+ * task's membership of cgroup ancestry.
+ * @task: the task to be tested
+ * @ancestor: possible ancestor of @task's cgroup
+ *
+ * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
+ * It follows all the same rules as cgroup_is_descendant, and only applies
+ * to the default hierarchy.
+ */
+__bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
+ struct cgroup *ancestor)
+{
+ return task_under_cgroup_hierarchy(task, ancestor);
+}
#endif /* CONFIG_CGROUPS */
/**
@@ -2167,13 +2224,15 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
* bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
* @ptr: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
- * @buffer: User-provided buffer to copy contents into
- * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
- * requested slice. This must be a constant.
+ * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__szk: Size (in bytes) of the buffer if present. This is the
+ * length of the requested slice. This must be a constant.
*
* For non-skb and non-xdp type dynptrs, there is no difference between
* bpf_dynptr_slice and bpf_dynptr_data.
*
+ * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ *
* If the intention is to write to the data slice, please use
* bpf_dynptr_slice_rdwr.
*
@@ -2190,7 +2249,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
* direct pointer)
*/
__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
- void *buffer, u32 buffer__szk)
+ void *buffer__opt, u32 buffer__szk)
{
enum bpf_dynptr_type type;
u32 len = buffer__szk;
@@ -2210,15 +2269,20 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
case BPF_DYNPTR_TYPE_RINGBUF:
return ptr->data + ptr->offset + offset;
case BPF_DYNPTR_TYPE_SKB:
- return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
+ if (buffer__opt)
+ return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt);
+ else
+ return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len);
case BPF_DYNPTR_TYPE_XDP:
{
void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
- if (xdp_ptr)
+ if (!IS_ERR_OR_NULL(xdp_ptr))
return xdp_ptr;
- bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false);
- return buffer;
+ if (!buffer__opt)
+ return NULL;
+ bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
+ return buffer__opt;
}
default:
WARN_ONCE(true, "unknown dynptr type %d\n", type);
@@ -2230,13 +2294,15 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
* bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
* @ptr: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
- * @buffer: User-provided buffer to copy contents into
- * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
- * requested slice. This must be a constant.
+ * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__szk: Size (in bytes) of the buffer if present. This is the
+ * length of the requested slice. This must be a constant.
*
* For non-skb and non-xdp type dynptrs, there is no difference between
* bpf_dynptr_slice and bpf_dynptr_data.
*
+ * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ *
* The returned pointer is writable and may point to either directly the dynptr
* data at the requested offset or to the buffer if unable to obtain a direct
* data pointer to (example: the requested slice is to the paged area of an skb
@@ -2267,9 +2333,9 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
* direct pointer)
*/
__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
- void *buffer, u32 buffer__szk)
+ void *buffer__opt, u32 buffer__szk)
{
- if (!ptr->data || bpf_dynptr_is_rdonly(ptr))
+ if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
return NULL;
/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
@@ -2294,7 +2360,59 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o
* will be copied out into the buffer and the user will need to call
* bpf_dynptr_write() to commit changes.
*/
- return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk);
+ return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk);
+}
+
+__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end)
+{
+ u32 size;
+
+ if (!ptr->data || start > end)
+ return -EINVAL;
+
+ size = __bpf_dynptr_size(ptr);
+
+ if (start > size || end > size)
+ return -ERANGE;
+
+ ptr->offset += start;
+ bpf_dynptr_set_size(ptr, end - start);
+
+ return 0;
+}
+
+__bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr)
+{
+ return !ptr->data;
+}
+
+__bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+{
+ if (!ptr->data)
+ return false;
+
+ return __bpf_dynptr_is_rdonly(ptr);
+}
+
+__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+{
+ if (!ptr->data)
+ return -EINVAL;
+
+ return __bpf_dynptr_size(ptr);
+}
+
+__bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr,
+ struct bpf_dynptr_kern *clone__uninit)
+{
+ if (!ptr->data) {
+ bpf_dynptr_set_null(clone__uninit);
+ return -EINVAL;
+ }
+
+ *clone__uninit = *ptr;
+
+ return 0;
}
__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
@@ -2325,7 +2443,7 @@ BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE)
+BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_push_front_impl)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
@@ -2341,6 +2459,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
BTF_SET8_END(generic_btf_ids)
@@ -2369,6 +2488,11 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_dynptr_adjust)
+BTF_ID_FLAGS(func, bpf_dynptr_is_null)
+BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
+BTF_ID_FLAGS(func, bpf_dynptr_size)
+BTF_ID_FLAGS(func, bpf_dynptr_clone)
BTF_SET8_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9948b542a470..99d0625b6c82 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -118,9 +118,8 @@ static struct inode *bpf_get_inode(struct super_block *sb,
return ERR_PTR(-ENOSPC);
inode->i_ino = get_next_ino();
- inode->i_atime = current_time(inode);
+ inode->i_atime = inode_set_ctime_current(inode);
inode->i_mtime = inode->i_atime;
- inode->i_ctime = inode->i_atime;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
@@ -148,8 +147,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
d_instantiate(dentry, inode);
dget(dentry);
- dir->i_mtime = current_time(dir);
- dir->i_ctime = dir->i_mtime;
+ dir->i_mtime = inode_set_ctime_current(dir);
}
static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
@@ -435,7 +433,7 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent,
return ret;
}
-static int bpf_obj_do_pin(const char __user *pathname, void *raw,
+static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw,
enum bpf_type type)
{
struct dentry *dentry;
@@ -444,22 +442,21 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw,
umode_t mode;
int ret;
- dentry = user_path_create(AT_FDCWD, pathname, &path, 0);
+ dentry = user_path_create(path_fd, pathname, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
-
- ret = security_path_mknod(&path, dentry, mode, 0);
- if (ret)
- goto out;
-
dir = d_inode(path.dentry);
if (dir->i_op != &bpf_dir_iops) {
ret = -EPERM;
goto out;
}
+ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+ ret = security_path_mknod(&path, dentry, mode, 0);
+ if (ret)
+ goto out;
+
switch (type) {
case BPF_TYPE_PROG:
ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
@@ -478,7 +475,7 @@ out:
return ret;
}
-int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
+int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname)
{
enum bpf_type type;
void *raw;
@@ -488,14 +485,14 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
if (IS_ERR(raw))
return PTR_ERR(raw);
- ret = bpf_obj_do_pin(pathname, raw, type);
+ ret = bpf_obj_do_pin(path_fd, pathname, raw, type);
if (ret != 0)
bpf_any_put(raw, type);
return ret;
}
-static void *bpf_obj_do_get(const char __user *pathname,
+static void *bpf_obj_do_get(int path_fd, const char __user *pathname,
enum bpf_type *type, int flags)
{
struct inode *inode;
@@ -503,7 +500,7 @@ static void *bpf_obj_do_get(const char __user *pathname,
void *raw;
int ret;
- ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path);
+ ret = user_path_at(path_fd, pathname, LOOKUP_FOLLOW, &path);
if (ret)
return ERR_PTR(ret);
@@ -527,7 +524,7 @@ out:
return ERR_PTR(ret);
}
-int bpf_obj_get_user(const char __user *pathname, int flags)
+int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
int f_flags;
@@ -538,7 +535,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
if (f_flags < 0)
return f_flags;
- raw = bpf_obj_do_get(pathname, &type, f_flags);
+ raw = bpf_obj_do_get(path_fd, pathname, &type, f_flags);
if (IS_ERR(raw))
return PTR_ERR(raw);
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 046ddff37a76..850494423530 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -62,9 +62,6 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
- WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
- "verifier log line truncated - local buffer too short\n");
-
if (log->level == BPF_LOG_KERNEL) {
bool newline = n > 0 && log->kbuf[n - 1] == '\n';
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index e0d3ddf2037a..17c7e7782a1f 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -544,9 +544,6 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
/* check sanity of attributes */
if (attr->max_entries == 0 ||
!(attr->map_flags & BPF_F_NO_PREALLOC) ||
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index b0fa190b0979..6fc9dae9edc8 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -78,8 +78,7 @@ static const struct seq_operations bpf_map_seq_ops = {
.show = bpf_map_seq_show,
};
-BTF_ID_LIST(btf_bpf_map_id)
-BTF_ID(struct, bpf_map)
+BTF_ID_LIST_GLOBAL_SINGLE(btf_bpf_map_id, struct, bpf_map)
static const struct bpf_iter_seq_info bpf_map_seq_info = {
.seq_ops = &bpf_map_seq_ops,
@@ -93,7 +92,7 @@ static struct bpf_iter_reg bpf_map_reg_info = {
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map, map),
- PTR_TO_BTF_ID_OR_NULL },
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
},
.seq_info = &bpf_map_seq_info,
};
@@ -193,3 +192,40 @@ static int __init bpf_map_iter_init(void)
}
late_initcall(bpf_map_iter_init);
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
+{
+ s64 *pcount;
+ s64 ret = 0;
+ int cpu;
+
+ if (!map || !map->elem_count)
+ return 0;
+
+ for_each_possible_cpu(cpu) {
+ pcount = per_cpu_ptr(map->elem_count, cpu);
+ ret += READ_ONCE(*pcount);
+ }
+ return ret;
+}
+
+__diag_pop();
+
+BTF_SET8_START(bpf_map_iter_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
+BTF_SET8_END(bpf_map_iter_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_map_iter_kfunc_ids,
+};
+
+static int init_subsystem(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_map_iter_kfunc_set);
+}
+late_initcall(init_subsystem);
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 410637c225fb..9c49ae53deaf 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -98,11 +98,23 @@ struct bpf_mem_cache {
int free_cnt;
int low_watermark, high_watermark, batch;
int percpu_size;
+ bool draining;
+ struct bpf_mem_cache *tgt;
- struct rcu_head rcu;
+ /* list of objects to be freed after RCU GP */
struct llist_head free_by_rcu;
+ struct llist_node *free_by_rcu_tail;
struct llist_head waiting_for_gp;
+ struct llist_node *waiting_for_gp_tail;
+ struct rcu_head rcu;
atomic_t call_rcu_in_progress;
+ struct llist_head free_llist_extra_rcu;
+
+ /* list of objects to be freed after RCU tasks trace GP */
+ struct llist_head free_by_rcu_ttrace;
+ struct llist_head waiting_for_gp_ttrace;
+ struct rcu_head rcu_ttrace;
+ atomic_t call_rcu_ttrace_in_progress;
};
struct bpf_mem_caches {
@@ -153,67 +165,95 @@ static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
#endif
}
+static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ /* In RT irq_work runs in per-cpu kthread, so disable
+ * interrupts to avoid preemption and interrupts and
+ * reduce the chance of bpf prog executing on this cpu
+ * when active counter is busy.
+ */
+ local_irq_save(*flags);
+ /* alloc_bulk runs from irq_work which will not preempt a bpf
+ * program that does unit_alloc/unit_free since IRQs are
+ * disabled there. There is no race to increment 'active'
+ * counter. It protects free_llist from corruption in case NMI
+ * bpf prog preempted this loop.
+ */
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+}
+
+static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
+{
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(*flags);
+}
+
+static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
+{
+ unsigned long flags;
+
+ inc_active(c, &flags);
+ __llist_add(obj, &c->free_llist);
+ c->free_cnt++;
+ dec_active(c, &flags);
+}
+
/* Mostly runs from irq_work except __init phase. */
-static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
+static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
{
struct mem_cgroup *memcg = NULL, *old_memcg;
- unsigned long flags;
+ gfp_t gfp;
void *obj;
int i;
- memcg = get_memcg(c);
- old_memcg = set_active_memcg(memcg);
+ gfp = __GFP_NOWARN | __GFP_ACCOUNT;
+ gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;
+
for (i = 0; i < cnt; i++) {
/*
- * free_by_rcu is only manipulated by irq work refill_work().
- * IRQ works on the same CPU are called sequentially, so it is
- * safe to use __llist_del_first() here. If alloc_bulk() is
- * invoked by the initial prefill, there will be no running
- * refill_work(), so __llist_del_first() is fine as well.
- *
- * In most cases, objects on free_by_rcu are from the same CPU.
- * If some objects come from other CPUs, it doesn't incur any
- * harm because NUMA_NO_NODE means the preference for current
- * numa node and it is not a guarantee.
+ * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
+ * done only by one CPU == current CPU. Other CPUs might
+ * llist_add() and llist_del_all() in parallel.
*/
- obj = __llist_del_first(&c->free_by_rcu);
- if (!obj) {
- /* Allocate, but don't deplete atomic reserves that typical
- * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
- * will allocate from the current numa node which is what we
- * want here.
- */
- obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
- if (!obj)
- break;
- }
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- /* In RT irq_work runs in per-cpu kthread, so disable
- * interrupts to avoid preemption and interrupts and
- * reduce the chance of bpf prog executing on this cpu
- * when active counter is busy.
- */
- local_irq_save(flags);
- /* alloc_bulk runs from irq_work which will not preempt a bpf
- * program that does unit_alloc/unit_free since IRQs are
- * disabled there. There is no race to increment 'active'
- * counter. It protects free_llist from corruption in case NMI
- * bpf prog preempted this loop.
+ obj = llist_del_first(&c->free_by_rcu_ttrace);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
+ }
+ if (i >= cnt)
+ return;
+
+ for (; i < cnt; i++) {
+ obj = llist_del_first(&c->waiting_for_gp_ttrace);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
+ }
+ if (i >= cnt)
+ return;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ for (; i < cnt; i++) {
+ /* Allocate, but don't deplete atomic reserves that typical
+ * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+ * will allocate from the current numa node which is what we
+ * want here.
*/
- WARN_ON_ONCE(local_inc_return(&c->active) != 1);
- __llist_add(obj, &c->free_llist);
- c->free_cnt++;
- local_dec(&c->active);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_restore(flags);
+ obj = __alloc(c, node, gfp);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
}
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
}
-static void free_one(struct bpf_mem_cache *c, void *obj)
+static void free_one(void *obj, bool percpu)
{
- if (c->percpu_size) {
+ if (percpu) {
free_percpu(((void **)obj)[1]);
kfree(obj);
return;
@@ -222,15 +262,24 @@ static void free_one(struct bpf_mem_cache *c, void *obj)
kfree(obj);
}
-static void __free_rcu(struct rcu_head *head)
+static int free_all(struct llist_node *llnode, bool percpu)
{
- struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
- struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
struct llist_node *pos, *t;
+ int cnt = 0;
- llist_for_each_safe(pos, t, llnode)
- free_one(c, pos);
- atomic_set(&c->call_rcu_in_progress, 0);
+ llist_for_each_safe(pos, t, llnode) {
+ free_one(pos, percpu);
+ cnt++;
+ }
+ return cnt;
+}
+
+static void __free_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);
+
+ free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
+ atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}
static void __free_rcu_tasks_trace(struct rcu_head *head)
@@ -249,60 +298,128 @@ static void enque_to_free(struct bpf_mem_cache *c, void *obj)
struct llist_node *llnode = obj;
/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
- * Nothing races to add to free_by_rcu list.
+ * Nothing races to add to free_by_rcu_ttrace list.
*/
- __llist_add(llnode, &c->free_by_rcu);
+ llist_add(llnode, &c->free_by_rcu_ttrace);
}
-static void do_call_rcu(struct bpf_mem_cache *c)
+static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
{
struct llist_node *llnode, *t;
- if (atomic_xchg(&c->call_rcu_in_progress, 1))
+ if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
+ if (unlikely(READ_ONCE(c->draining))) {
+ llnode = llist_del_all(&c->free_by_rcu_ttrace);
+ free_all(llnode, !!c->percpu_size);
+ }
return;
+ }
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
+ llist_add(llnode, &c->waiting_for_gp_ttrace);
+
+ if (unlikely(READ_ONCE(c->draining))) {
+ __free_rcu(&c->rcu_ttrace);
+ return;
+ }
- WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
- llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
- /* There is no concurrent __llist_add(waiting_for_gp) access.
- * It doesn't race with llist_del_all either.
- * But there could be two concurrent llist_del_all(waiting_for_gp):
- * from __free_rcu() and from drain_mem_cache().
- */
- __llist_add(llnode, &c->waiting_for_gp);
/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
* If RCU Tasks Trace grace period implies RCU grace period, free
* these elements directly, else use call_rcu() to wait for normal
* progs to finish and finally do free_one() on each element.
*/
- call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
+ call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
}
static void free_bulk(struct bpf_mem_cache *c)
{
+ struct bpf_mem_cache *tgt = c->tgt;
struct llist_node *llnode, *t;
unsigned long flags;
int cnt;
+ WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+
do {
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_save(flags);
- WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+ inc_active(c, &flags);
llnode = __llist_del_first(&c->free_llist);
if (llnode)
cnt = --c->free_cnt;
else
cnt = 0;
- local_dec(&c->active);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_restore(flags);
+ dec_active(c, &flags);
if (llnode)
- enque_to_free(c, llnode);
+ enque_to_free(tgt, llnode);
} while (cnt > (c->high_watermark + c->low_watermark) / 2);
/* and drain free_llist_extra */
llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
- enque_to_free(c, llnode);
- do_call_rcu(c);
+ enque_to_free(tgt, llnode);
+ do_call_rcu_ttrace(tgt);
+}
+
+static void __free_by_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+ struct bpf_mem_cache *tgt = c->tgt;
+ struct llist_node *llnode;
+
+ llnode = llist_del_all(&c->waiting_for_gp);
+ if (!llnode)
+ goto out;
+
+ llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
+
+ /* Objects went through regular RCU GP. Send them to RCU tasks trace */
+ do_call_rcu_ttrace(tgt);
+out:
+ atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void check_free_by_rcu(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+ unsigned long flags;
+
+ /* drain free_llist_extra_rcu */
+ if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
+ inc_active(c, &flags);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
+ if (__llist_add(llnode, &c->free_by_rcu))
+ c->free_by_rcu_tail = llnode;
+ dec_active(c, &flags);
+ }
+
+ if (llist_empty(&c->free_by_rcu))
+ return;
+
+ if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
+ /*
+ * Instead of kmalloc-ing new rcu_head and triggering 10k
+ * call_rcu() to hit rcutree.qhimark and force RCU to notice
+ * the overload just ask RCU to hurry up. There could be many
+ * objects in free_by_rcu list.
+ * This hint reduces memory consumption for an artificial
+ * benchmark from 2 Gbyte to 150 Mbyte.
+ */
+ rcu_request_urgent_qs_task(current);
+ return;
+ }
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+
+ inc_active(c, &flags);
+ WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
+ c->waiting_for_gp_tail = c->free_by_rcu_tail;
+ dec_active(c, &flags);
+
+ if (unlikely(READ_ONCE(c->draining))) {
+ free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
+ atomic_set(&c->call_rcu_in_progress, 0);
+ } else {
+ call_rcu_hurry(&c->rcu, __free_by_rcu);
+ }
}
static void bpf_mem_refill(struct irq_work *work)
@@ -316,9 +433,11 @@ static void bpf_mem_refill(struct irq_work *work)
/* irq_work runs on this cpu and kmalloc will allocate
* from the current numa node which is what we want here.
*/
- alloc_bulk(c, c->batch, NUMA_NO_NODE);
+ alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
else if (cnt > c->high_watermark)
free_bulk(c);
+
+ check_free_by_rcu(c);
}
static void notrace irq_work_raise(struct bpf_mem_cache *c)
@@ -362,7 +481,7 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
* prog won't be doing more than 4 map_update_elem from
* irq disabled region
*/
- alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
+ alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
}
/* When size != 0 bpf_mem_cache for each cpu.
@@ -401,6 +520,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c->unit_size = unit_size;
c->objcg = objcg;
c->percpu_size = percpu_size;
+ c->tgt = c;
prefill_mem_cache(c, cpu);
}
ma->cache = pc;
@@ -423,6 +543,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c = &cc->cache[i];
c->unit_size = sizes[i];
c->objcg = objcg;
+ c->tgt = c;
prefill_mem_cache(c, cpu);
}
}
@@ -432,27 +553,61 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
static void drain_mem_cache(struct bpf_mem_cache *c)
{
- struct llist_node *llnode, *t;
+ bool percpu = !!c->percpu_size;
/* No progs are using this bpf_mem_cache, but htab_map_free() called
* bpf_mem_cache_free() for all remaining elements and they can be in
- * free_by_rcu or in waiting_for_gp lists, so drain those lists now.
+ * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
*
- * Except for waiting_for_gp list, there are no concurrent operations
+ * Except for waiting_for_gp_ttrace list, there are no concurrent operations
* on these lists, so it is safe to use __llist_del_all().
*/
- llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
- free_one(c, llnode);
- llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
- free_one(c, llnode);
- llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist))
- free_one(c, llnode);
- llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist_extra))
- free_one(c, llnode);
+ free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
+ free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
+ free_all(__llist_del_all(&c->free_llist), percpu);
+ free_all(__llist_del_all(&c->free_llist_extra), percpu);
+ free_all(__llist_del_all(&c->free_by_rcu), percpu);
+ free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
+ free_all(llist_del_all(&c->waiting_for_gp), percpu);
+}
+
+static void check_mem_cache(struct bpf_mem_cache *c)
+{
+ WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
+ WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+}
+
+static void check_leaked_objs(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i;
+
+ if (ma->cache) {
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ check_mem_cache(c);
+ }
+ }
+ if (ma->caches) {
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ check_mem_cache(c);
+ }
+ }
+ }
}
static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
+ check_leaked_objs(ma);
free_percpu(ma->cache);
free_percpu(ma->caches);
ma->cache = NULL;
@@ -461,8 +616,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
- /* waiting_for_gp lists was drained, but __free_rcu might
- * still execute. Wait for it now before we freeing percpu caches.
+ /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
+ * might still execute. Wait for them.
*
* rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
* but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
@@ -471,7 +626,8 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
* rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
* using rcu_trace_implies_rcu_gp() as well.
*/
- rcu_barrier_tasks_trace();
+ rcu_barrier(); /* wait for __free_by_rcu */
+ rcu_barrier_tasks_trace(); /* wait for __free_rcu */
if (!rcu_trace_implies_rcu_gp())
rcu_barrier();
free_mem_alloc_no_barrier(ma);
@@ -497,7 +653,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
return;
}
- copy = kmalloc(sizeof(*ma), GFP_KERNEL);
+ copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
if (!copy) {
/* Slow path with inline barrier-s */
free_mem_alloc(ma);
@@ -505,10 +661,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
}
/* Defer barriers into worker to let the rest of map memory to be freed */
- copy->cache = ma->cache;
- ma->cache = NULL;
- copy->caches = ma->caches;
- ma->caches = NULL;
+ memset(ma, 0, sizeof(*ma));
INIT_WORK(&copy->work, free_mem_alloc_deferred);
queue_work(system_unbound_wq, &copy->work);
}
@@ -523,17 +676,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress = 0;
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(ma->cache, cpu);
- /*
- * refill_work may be unfinished for PREEMPT_RT kernel
- * in which irq work is invoked in a per-CPU RT thread.
- * It is also possible for kernel with
- * arch_irq_work_has_interrupt() being false and irq
- * work is invoked in timer interrupt. So waiting for
- * the completion of irq work to ease the handling of
- * concurrency.
- */
+ WRITE_ONCE(c->draining, true);
irq_work_sync(&c->refill_work);
drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
/* objcg is the same across cpus */
@@ -547,8 +693,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
cc = per_cpu_ptr(ma->caches, cpu);
for (i = 0; i < NUM_CACHES; i++) {
c = &cc->cache[i];
+ WRITE_ONCE(c->draining, true);
irq_work_sync(&c->refill_work);
drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
}
@@ -580,8 +728,10 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
local_irq_save(flags);
if (local_inc_return(&c->active) == 1) {
llnode = __llist_del_first(&c->free_llist);
- if (llnode)
+ if (llnode) {
cnt = --c->free_cnt;
+ *(struct bpf_mem_cache **)llnode = c;
+ }
}
local_dec(&c->active);
local_irq_restore(flags);
@@ -605,6 +755,12 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+ /*
+ * Remember bpf_mem_cache that allocated this object.
+ * The hint is not accurate.
+ */
+ c->tgt = *(struct bpf_mem_cache **)llnode;
+
local_irq_save(flags);
if (local_inc_return(&c->active) == 1) {
__llist_add(llnode, &c->free_llist);
@@ -626,6 +782,27 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
irq_work_raise(c);
}
+static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+ unsigned long flags;
+
+ c->tgt = *(struct bpf_mem_cache **)llnode;
+
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ if (__llist_add(llnode, &c->free_by_rcu))
+ c->free_by_rcu_tail = llnode;
+ } else {
+ llist_add(llnode, &c->free_llist_extra_rcu);
+ }
+ local_dec(&c->active);
+ local_irq_restore(flags);
+
+ if (!atomic_read(&c->call_rcu_in_progress))
+ irq_work_raise(c);
+}
+
/* Called from BPF program or from sys_bpf syscall.
* In both cases migration is disabled.
*/
@@ -659,6 +836,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}
+void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+ int idx;
+
+ if (!ptr)
+ return;
+
+ idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+ if (idx < 0)
+ return;
+
+ unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
void *ret;
@@ -675,6 +866,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
unit_free(this_cpu_ptr(ma->cache), ptr);
}
+void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+ if (!ptr)
+ return;
+
+ unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
+}
+
/* Directly does a kfree() without putting 'ptr' back to the free_llist
* for reuse and without waiting for a rcu_tasks_trace gp.
* The caller must first go through the rcu_tasks_trace gp for 'ptr'
diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c
new file mode 100644
index 000000000000..32d2c4829eb8
--- /dev/null
+++ b/kernel/bpf/mprog.c
@@ -0,0 +1,447 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include <linux/bpf.h>
+#include <linux/bpf_mprog.h>
+
+static int bpf_mprog_link(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ struct bpf_link *link = ERR_PTR(-EINVAL);
+ bool id = flags & BPF_F_ID;
+
+ if (id)
+ link = bpf_link_by_id(id_or_fd);
+ else if (id_or_fd)
+ link = bpf_link_get_from_fd(id_or_fd);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+ if (type && link->prog->type != type) {
+ bpf_link_put(link);
+ return -EINVAL;
+ }
+
+ tuple->link = link;
+ tuple->prog = link->prog;
+ return 0;
+}
+
+static int bpf_mprog_prog(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ struct bpf_prog *prog = ERR_PTR(-EINVAL);
+ bool id = flags & BPF_F_ID;
+
+ if (id)
+ prog = bpf_prog_by_id(id_or_fd);
+ else if (id_or_fd)
+ prog = bpf_prog_get(id_or_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ if (type && prog->type != type) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ tuple->link = NULL;
+ tuple->prog = prog;
+ return 0;
+}
+
+static int bpf_mprog_tuple_relative(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ bool link = flags & BPF_F_LINK;
+ bool id = flags & BPF_F_ID;
+
+ memset(tuple, 0, sizeof(*tuple));
+ if (link)
+ return bpf_mprog_link(tuple, id_or_fd, flags, type);
+ /* If no relevant flag is set and no id_or_fd was passed, then
+ * tuple link/prog is just NULLed. This is the case when before/
+ * after selects first/last position without passing fd.
+ */
+ if (!id && !id_or_fd)
+ return 0;
+ return bpf_mprog_prog(tuple, id_or_fd, flags, type);
+}
+
+static void bpf_mprog_tuple_put(struct bpf_tuple *tuple)
+{
+ if (tuple->link)
+ bpf_link_put(tuple->link);
+ else if (tuple->prog)
+ bpf_prog_put(tuple->prog);
+}
+
+/* The bpf_mprog_{replace,delete}() operate on exact idx position with the
+ * one exception that for deletion we support delete from front/back. In
+ * case of front idx is -1, in case of back idx is bpf_mprog_total(entry).
+ * Adjustment to first and last entry is trivial. The bpf_mprog_insert()
+ * we have to deal with the following cases:
+ *
+ * idx + before:
+ *
+ * Insert P4 before P3: idx for old array is 1, idx for new array is 2,
+ * hence we adjust target idx for the new array, so that memmove copies
+ * P1 and P2 to the new entry, and we insert P4 into idx 2. Inserting
+ * before P1 would have old idx -1 and new idx 0.
+ *
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3|
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ *
+ * idx + after:
+ *
+ * Insert P4 after P2: idx for old array is 2, idx for new array is 2.
+ * Again, memmove copies P1 and P2 to the new entry, and we insert P4
+ * into idx 2. Inserting after P3 would have both old/new idx at 4 aka
+ * bpf_mprog_total(entry).
+ *
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3|
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ */
+static int bpf_mprog_replace(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *ntuple, int idx)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ struct bpf_prog *oprog;
+
+ bpf_mprog_read(entry, idx, &fp, &cp);
+ oprog = READ_ONCE(fp->prog);
+ bpf_mprog_write(fp, cp, ntuple);
+ if (!ntuple->link) {
+ WARN_ON_ONCE(cp->link);
+ bpf_prog_put(oprog);
+ }
+ *entry_new = entry;
+ return 0;
+}
+
+static int bpf_mprog_insert(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *ntuple, int idx, u32 flags)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_entry *peer;
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+
+ peer = bpf_mprog_peer(entry);
+ bpf_mprog_entry_copy(peer, entry);
+ if (idx == total)
+ goto insert;
+ else if (flags & BPF_F_BEFORE)
+ idx += 1;
+ bpf_mprog_entry_grow(peer, idx);
+insert:
+ bpf_mprog_read(peer, idx, &fp, &cp);
+ bpf_mprog_write(fp, cp, ntuple);
+ bpf_mprog_inc(peer);
+ *entry_new = peer;
+ return 0;
+}
+
+static int bpf_mprog_delete(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *dtuple, int idx)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_entry *peer;
+
+ peer = bpf_mprog_peer(entry);
+ bpf_mprog_entry_copy(peer, entry);
+ if (idx == -1)
+ idx = 0;
+ else if (idx == total)
+ idx = total - 1;
+ bpf_mprog_entry_shrink(peer, idx);
+ bpf_mprog_dec(peer);
+ bpf_mprog_mark_for_release(peer, dtuple);
+ *entry_new = peer;
+ return 0;
+}
+
+/* In bpf_mprog_pos_*() we evaluate the target position for the BPF
+ * program/link that needs to be replaced, inserted or deleted for
+ * each "rule" independently. If all rules agree on that position
+ * or existing element, then enact replacement, addition or deletion.
+ * If this is not the case, then the request cannot be satisfied and
+ * we bail out with an error.
+ */
+static int bpf_mprog_pos_exact(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog))
+ return tuple->link == cp->link ? i : -EBUSY;
+ }
+ return -ENOENT;
+}
+
+static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog) &&
+ (!tuple->link || tuple->link == cp->link))
+ return i - 1;
+ }
+ return tuple->prog ? -ENOENT : -1;
+}
+
+static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog) &&
+ (!tuple->link || tuple->link == cp->link))
+ return i + 1;
+ }
+ return tuple->prog ? -ENOENT : bpf_mprog_total(entry);
+}
+
+int bpf_mprog_attach(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_prog *prog_new, struct bpf_link *link,
+ struct bpf_prog *prog_old,
+ u32 flags, u32 id_or_fd, u64 revision)
+{
+ struct bpf_tuple rtuple, ntuple = {
+ .prog = prog_new,
+ .link = link,
+ }, otuple = {
+ .prog = prog_old,
+ .link = link,
+ };
+ int ret, idx = -ERANGE, tidx;
+
+ if (revision && revision != bpf_mprog_revision(entry))
+ return -ESTALE;
+ if (bpf_mprog_exists(entry, prog_new))
+ return -EEXIST;
+ ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd,
+ flags & ~BPF_F_REPLACE,
+ prog_new->type);
+ if (ret)
+ return ret;
+ if (flags & BPF_F_REPLACE) {
+ tidx = bpf_mprog_pos_exact(entry, &otuple);
+ if (tidx < 0) {
+ ret = tidx;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_BEFORE) {
+ tidx = bpf_mprog_pos_before(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < -1 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_AFTER) {
+ tidx = bpf_mprog_pos_after(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < 0 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (idx < -1) {
+ if (rtuple.prog || flags) {
+ ret = -EINVAL;
+ goto out;
+ }
+ idx = bpf_mprog_total(entry);
+ flags = BPF_F_AFTER;
+ }
+ if (idx >= bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
+ }
+ if (flags & BPF_F_REPLACE)
+ ret = bpf_mprog_replace(entry, entry_new, &ntuple, idx);
+ else
+ ret = bpf_mprog_insert(entry, entry_new, &ntuple, idx, flags);
+out:
+ bpf_mprog_tuple_put(&rtuple);
+ return ret;
+}
+
+static int bpf_mprog_fetch(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple, int idx)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_cp *cp;
+ struct bpf_mprog_fp *fp;
+ struct bpf_prog *prog;
+ struct bpf_link *link;
+
+ if (idx == -1)
+ idx = 0;
+ else if (idx == total)
+ idx = total - 1;
+ bpf_mprog_read(entry, idx, &fp, &cp);
+ prog = READ_ONCE(fp->prog);
+ link = cp->link;
+ /* The deletion request can either be without filled tuple in which
+ * case it gets populated here based on idx, or with filled tuple
+ * where the only thing we end up doing is the WARN_ON_ONCE() assert.
+ * If we hit a BPF link at the given index, it must not be removed
+ * from opts path.
+ */
+ if (link && !tuple->link)
+ return -EBUSY;
+ WARN_ON_ONCE(tuple->prog && tuple->prog != prog);
+ WARN_ON_ONCE(tuple->link && tuple->link != link);
+ tuple->prog = prog;
+ tuple->link = link;
+ return 0;
+}
+
+int bpf_mprog_detach(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_prog *prog, struct bpf_link *link,
+ u32 flags, u32 id_or_fd, u64 revision)
+{
+ struct bpf_tuple rtuple, dtuple = {
+ .prog = prog,
+ .link = link,
+ };
+ int ret, idx = -ERANGE, tidx;
+
+ if (flags & BPF_F_REPLACE)
+ return -EINVAL;
+ if (revision && revision != bpf_mprog_revision(entry))
+ return -ESTALE;
+ if (!bpf_mprog_total(entry))
+ return -ENOENT;
+ ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags,
+ prog ? prog->type :
+ BPF_PROG_TYPE_UNSPEC);
+ if (ret)
+ return ret;
+ if (dtuple.prog) {
+ tidx = bpf_mprog_pos_exact(entry, &dtuple);
+ if (tidx < 0) {
+ ret = tidx;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_BEFORE) {
+ tidx = bpf_mprog_pos_before(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < -1 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_AFTER) {
+ tidx = bpf_mprog_pos_after(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < 0 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (idx < -1) {
+ if (rtuple.prog || flags) {
+ ret = -EINVAL;
+ goto out;
+ }
+ idx = bpf_mprog_total(entry);
+ flags = BPF_F_AFTER;
+ }
+ if (idx >= bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
+ }
+ ret = bpf_mprog_fetch(entry, &dtuple, idx);
+ if (ret)
+ goto out;
+ ret = bpf_mprog_delete(entry, entry_new, &dtuple, idx);
+out:
+ bpf_mprog_tuple_put(&rtuple);
+ return ret;
+}
+
+int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr,
+ struct bpf_mprog_entry *entry)
+{
+ u32 __user *uprog_flags, *ulink_flags;
+ u32 __user *uprog_id, *ulink_id;
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ struct bpf_prog *prog;
+ const u32 flags = 0;
+ int i, ret = 0;
+ u32 id, count;
+ u64 revision;
+
+ if (attr->query.query_flags || attr->query.attach_flags)
+ return -EINVAL;
+ revision = bpf_mprog_revision(entry);
+ count = bpf_mprog_total(entry);
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.count, &count, sizeof(count)))
+ return -EFAULT;
+ uprog_id = u64_to_user_ptr(attr->query.prog_ids);
+ uprog_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+ ulink_id = u64_to_user_ptr(attr->query.link_ids);
+ ulink_flags = u64_to_user_ptr(attr->query.link_attach_flags);
+ if (attr->query.count == 0 || !uprog_id || !count)
+ return 0;
+ if (attr->query.count < count) {
+ count = attr->query.count;
+ ret = -ENOSPC;
+ }
+ for (i = 0; i < bpf_mprog_max(); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ prog = READ_ONCE(fp->prog);
+ if (!prog)
+ break;
+ id = prog->aux->id;
+ if (copy_to_user(uprog_id + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (uprog_flags &&
+ copy_to_user(uprog_flags + i, &flags, sizeof(flags)))
+ return -EFAULT;
+ id = cp->link ? cp->link->id : 0;
+ if (ulink_id &&
+ copy_to_user(ulink_id + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (ulink_flags &&
+ copy_to_user(ulink_flags + i, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (i + 1 == count)
+ break;
+ }
+ return ret;
+}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 8a26cd8814c1..3e4f2ec1af06 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -25,6 +25,7 @@
#include <linux/rhashtable.h>
#include <linux/rtnetlink.h>
#include <linux/rwsem.h>
+#include <net/xdp.h>
/* Protects offdevs, members of bpf_offload_netdev and offload members
* of all progs.
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index b56f9f3314fd..0c63bc2cd895 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -23,9 +23,9 @@ static void free_links_and_skel(void)
static int preload(struct bpf_preload_info *obj)
{
- strlcpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name));
+ strscpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name));
obj[0].link = maps_link;
- strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name));
+ strscpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name));
obj[1].link = progs_link;
return 0;
}
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
index 8937dc6bc8d0..b83c2f5e9be1 100644
--- a/kernel/bpf/preload/iterators/Makefile
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -50,7 +50,7 @@ iterators.lskel-%.h: $(OUTPUT)/%/iterators.bpf.o | $(BPFTOOL)
$(OUTPUT)/%/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
$(call msg,BPF,$@)
$(Q)mkdir -p $(@D)
- $(Q)$(CLANG) -g -O2 -target bpf -m$* $(INCLUDES) \
+ $(Q)$(CLANG) -g -O2 --target=bpf -m$* $(INCLUDES) \
-c $(filter %.c,$^) -o $@ && \
$(LLVM_STRIP) -g $@
diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c
index 03af863314ea..b78968b63fab 100644
--- a/kernel/bpf/preload/iterators/iterators.bpf.c
+++ b/kernel/bpf/preload/iterators/iterators.bpf.c
@@ -73,6 +73,8 @@ static const char *get_name(struct btf *btf, long btf_id, const char *fallback)
return str + name_off;
}
+__s64 bpf_map_sum_elem_count(struct bpf_map *map) __ksym;
+
SEC("iter/bpf_map")
int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
{
@@ -84,9 +86,12 @@ int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
return 0;
if (seq_num == 0)
- BPF_SEQ_PRINTF(seq, " id name max_entries\n");
+ BPF_SEQ_PRINTF(seq, " id name max_entries cur_entries\n");
+
+ BPF_SEQ_PRINTF(seq, "%4u %-16s %10d %10lld\n",
+ map->id, map->name, map->max_entries,
+ bpf_map_sum_elem_count(map));
- BPF_SEQ_PRINTF(seq, "%4u %-16s%6d\n", map->id, map->name, map->max_entries);
return 0;
}
diff --git a/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
index 70f236a82fe1..5b98ab02025e 100644
--- a/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
+++ b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
-/* THIS FILE IS AUTOGENERATED! */
+/* THIS FILE IS AUTOGENERATED BY BPFTOOL! */
#ifndef __ITERATORS_BPF_SKEL_H__
#define __ITERATORS_BPF_SKEL_H__
@@ -18,8 +18,6 @@ struct iterators_bpf {
int dump_bpf_map_fd;
int dump_bpf_prog_fd;
} links;
- struct iterators_bpf__rodata {
- } *rodata;
};
static inline int
@@ -68,7 +66,6 @@ iterators_bpf__destroy(struct iterators_bpf *skel)
iterators_bpf__detach(skel);
skel_closenz(skel->progs.dump_bpf_map.prog_fd);
skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
- skel_free_map_data(skel->rodata, skel->maps.rodata.initial_value, 4096);
skel_closenz(skel->maps.rodata.map_fd);
skel_free(skel);
}
@@ -81,15 +78,6 @@ iterators_bpf__open(void)
if (!skel)
goto cleanup;
skel->ctx.sz = (void *)&skel->links - (void *)skel;
- skel->rodata = skel_prep_map_data((void *)"\
-\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
-\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\
-\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\
-\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 4096, 98);
- if (!skel->rodata)
- goto cleanup;
- skel->maps.rodata.initial_value = (__u64) (long) skel->rodata;
return skel;
cleanup:
iterators_bpf__destroy(skel);
@@ -103,7 +91,7 @@ iterators_bpf__load(struct iterators_bpf *skel)
int err;
opts.ctx = (struct bpf_loader_ctx *)skel;
- opts.data_sz = 6056;
+ opts.data_sz = 6208;
opts.data = (void *)"\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
@@ -138,190 +126,197 @@ iterators_bpf__load(struct iterators_bpf *skel)
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\
-\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\
+\x18\0\0\0\0\0\0\0\x80\x04\0\0\x80\x04\0\0\x31\x05\0\0\0\0\0\0\0\0\0\x02\x02\0\
\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\
\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\
\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\
-\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\
-\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\
-\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\
-\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\
-\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\
-\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\
-\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\
-\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\
-\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\
-\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\
-\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\
-\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\
-\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\
-\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\
-\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\
-\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\
-\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\
-\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\
-\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\
-\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\
-\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\
-\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\
-\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\
-\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\
-\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\
-\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\
-\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\
-\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\
-\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
-\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\
-\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\
-\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\
-\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\
-\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\
-\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\
-\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\
-\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\
-\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\
-\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\
-\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\
-\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\
-\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\
-\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\
-\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\
-\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\
-\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\
-\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\
-\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
-\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
-\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
-\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
-\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
-\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
-\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
-\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
-\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
-\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
-\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
-\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
-\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
-\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
-\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
-\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
-\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
-\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
-\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
-\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
-\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
-\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
-\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
-\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
-\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
-\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
-\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
-\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
-\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
-\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
-\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
-\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
-\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
-\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
-\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
-\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
-\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
-\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\
-\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\
-\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\
-\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\
-\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
-\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\
-\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\
-\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\
-\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\
-\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\
-\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\
-\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\
-\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\
-\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\
-\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
-\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\
-\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\
-\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\
-\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\
-\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\
-\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\
-\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\
-\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\
-\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\
-\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\
-\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\
-\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\
-\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\
-\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\
-\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\
-\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
-\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\
-\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\
-\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\
-\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\
-\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\
-\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\
-\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\
-\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\
-\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\
-\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\
-\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\
-\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\
-\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\
-\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\
-\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
-\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\
-\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\
-\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\
-\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\
-\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\
-\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\
-\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\
-\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\
-\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\
-\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\
-\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\
-\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\
-\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\
-\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\
-\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\
-\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\
-\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\
-\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\
-\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\
-\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\
-\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\
-\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\
-\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\
-\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\
-\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\
-\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\
-\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\
-\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\
-\0\0\0\0";
- opts.insns_sz = 2216;
+\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xb0\0\0\0\x03\0\0\x04\x18\0\0\0\xbe\0\
+\0\0\x09\0\0\0\0\0\0\0\xc2\0\0\0\x0b\0\0\0\x40\0\0\0\xcd\0\0\0\x0b\0\0\0\x80\0\
+\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd5\0\0\0\0\0\0\x07\0\0\0\0\xde\0\0\0\0\0\0\
+\x08\x0c\0\0\0\xe4\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xae\x01\0\0\x03\0\0\x04\
+\x18\0\0\0\xb6\x01\0\0\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\xbe\
+\x01\0\0\x0e\0\0\0\xa0\0\0\0\xca\x01\0\0\0\0\0\x08\x0f\0\0\0\xd0\x01\0\0\0\0\0\
+\x01\x04\0\0\0\x20\0\0\0\xdd\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\
+\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xe2\x01\0\0\0\0\0\x01\x04\0\0\0\
+\x20\0\0\0\0\0\0\0\x01\0\0\x0d\x14\0\0\0\x26\x05\0\0\x04\0\0\0\x2b\x02\0\0\0\0\
+\0\x08\x15\0\0\0\x31\x02\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\x01\x3b\x02\0\0\x01\0\
+\0\x0c\x13\0\0\0\0\0\0\0\0\0\0\x02\x18\0\0\0\x52\x02\0\0\x02\0\0\x04\x10\0\0\0\
+\x13\0\0\0\x03\0\0\0\0\0\0\0\x65\x02\0\0\x19\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\
+\x1c\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x17\0\0\0\x6a\x02\0\0\x01\0\
+\0\x0c\x1a\0\0\0\xb6\x02\0\0\x01\0\0\x04\x08\0\0\0\xbf\x02\0\0\x1d\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\x02\x1e\0\0\0\x10\x03\0\0\x06\0\0\x04\x38\0\0\0\xb6\x01\0\0\
+\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\x1d\x03\0\0\x1f\0\0\0\xc0\0\
+\0\0\x2e\x03\0\0\x19\0\0\0\0\x01\0\0\x37\x03\0\0\x21\0\0\0\x40\x01\0\0\x41\x03\
+\0\0\x22\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\
+\0\0\0\0\0\0\0\0\0\x02\x23\0\0\0\0\0\0\0\0\0\0\x02\x24\0\0\0\x8b\x03\0\0\x02\0\
+\0\x04\x08\0\0\0\x99\x03\0\0\x0e\0\0\0\0\0\0\0\xa2\x03\0\0\x0e\0\0\0\x20\0\0\0\
+\x41\x03\0\0\x03\0\0\x04\x18\0\0\0\xac\x03\0\0\x1f\0\0\0\0\0\0\0\xb4\x03\0\0\
+\x25\0\0\0\x40\0\0\0\xba\x03\0\0\x27\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x26\0\0\
+\0\0\0\0\0\0\0\0\x02\x28\0\0\0\xbe\x03\0\0\x01\0\0\x04\x04\0\0\0\xc9\x03\0\0\
+\x0e\0\0\0\0\0\0\0\x32\x04\0\0\x01\0\0\x04\x04\0\0\0\x3b\x04\0\0\x0e\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x30\0\0\0\xb1\x04\0\0\0\0\0\
+\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x1a\0\0\0\
+\xc5\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\
+\x12\0\0\0\x20\0\0\0\xdb\x04\0\0\0\0\0\x0e\x2d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\
+\0\0\0\0\x20\0\0\0\x12\0\0\0\x11\0\0\0\xf0\x04\0\0\0\0\0\x0e\x2f\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\x07\x05\0\0\0\0\0\x0e\
+\x31\0\0\0\x01\0\0\0\x0f\x05\0\0\x01\0\0\x0f\x04\0\0\0\x36\0\0\0\0\0\0\0\x04\0\
+\0\0\x16\x05\0\0\x04\0\0\x0f\x7b\0\0\0\x2a\0\0\0\0\0\0\0\x30\0\0\0\x2c\0\0\0\
+\x30\0\0\0\x1a\0\0\0\x2e\0\0\0\x4a\0\0\0\x20\0\0\0\x30\0\0\0\x6a\0\0\0\x11\0\0\
+\0\x1e\x05\0\0\x01\0\0\x0f\x04\0\0\0\x32\0\0\0\0\0\0\0\x04\0\0\0\x26\x05\0\0\0\
+\0\0\x0e\x06\0\0\0\x01\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\
+\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\
+\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x73\
+\x70\x73\x6b\x2f\x73\x72\x63\x2f\x62\x70\x66\x2d\x6e\x65\x78\x74\x2f\x6b\x65\
+\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\
+\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\
+\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\
+\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\
+\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\
+\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\
+\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\
+\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\
+\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\
+\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\
+\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\
+\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\
+\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\
+\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\
+\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\
+\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\
+\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\
+\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\
+\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\
+\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\
+\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\
+\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\x20\x25\x31\x30\
+\x6c\x6c\x64\x5c\x6e\x22\x2c\0\x7d\0\x5f\x5f\x73\x36\x34\0\x6c\x6f\x6e\x67\x20\
+\x6c\x6f\x6e\x67\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\
+\x6d\x5f\x63\x6f\x75\x6e\x74\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\
+\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\
+\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\
+\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\
+\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\
+\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\
+\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\
+\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\
+\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\
+\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\
+\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\
+\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\
+\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\
+\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\
+\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\
+\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\
+\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\
+\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\
+\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\
+\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\
+\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\
+\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\
+\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
+\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\
+\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\
+\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\
+\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\
+\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\
+\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\
+\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\
+\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\
+\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\
+\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x6b\x73\x79\x6d\
+\x73\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x64\x75\x6d\
+\x6d\x79\x5f\x6b\x73\x79\x6d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\xc9\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x7b\0\0\0\x01\0\0\0\
+\x80\0\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\
+\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x34\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\
+\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\
+\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\
+\x31\x30\x64\x20\x20\x20\x25\x31\x30\x6c\x6c\x64\x0a\0\x20\x20\x69\x64\x20\x6e\
+\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\
+\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
+\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\
+\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1d\0\0\0\0\0\x79\x21\
+\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe0\xff\
+\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\
+\x30\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\
+\xe0\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\
+\x7b\x2a\xe8\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf0\xff\0\0\0\0\xbf\x71\
+\0\0\0\0\0\0\x85\x20\0\0\0\0\0\0\x7b\x0a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\
+\x07\x04\0\0\xe0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
+\x30\0\0\0\xb7\x03\0\0\x1a\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\
+\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\
+\x1e\x44\x01\0\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x44\x01\0\x02\0\0\0\x42\0\0\0\
+\xfb\0\0\0\x1d\x4c\x01\0\x03\0\0\0\x42\0\0\0\x1c\x01\0\0\x06\x54\x01\0\x04\0\0\
+\0\x42\0\0\0\x2b\x01\0\0\x1d\x48\x01\0\x05\0\0\0\x42\0\0\0\x50\x01\0\0\x06\x60\
+\x01\0\x07\0\0\0\x42\0\0\0\x63\x01\0\0\x03\x64\x01\0\x0e\0\0\0\x42\0\0\0\xf6\
+\x01\0\0\x02\x6c\x01\0\x21\0\0\0\x42\0\0\0\x29\x02\0\0\x01\x80\x01\0\0\0\0\0\
+\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\
+\x02\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x70\0\0\0\
+\x0d\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xf7\0\0\0\0\0\0\0\xa0\0\0\0\
+\x0d\0\0\0\x27\x01\0\0\0\0\0\0\x1a\0\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\
+\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\
+\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\
+\x65\x72\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x62\x70\x66\x5f\x6d\
+\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\x6d\x5f\x63\x6f\x75\x6e\x74\0\0\x47\
+\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\
+\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\
+\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\
+\x18\x62\0\0\0\0\0\0\0\0\0\0\x4a\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\
+\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\
+\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\
+\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\
+\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\
+\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\
+\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\
+\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\
+\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\
+\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\
+\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\
+\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\
+\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\
+\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x6a\0\0\0\xb7\
+\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\
+\x95\0\0\0\0\0\0\0\0\0\0\0\x1b\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\x1e\x94\x01\0\
+\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x94\x01\0\x02\0\0\0\x42\0\0\0\x86\x02\0\0\
+\x1f\x9c\x01\0\x03\0\0\0\x42\0\0\0\xaa\x02\0\0\x06\xa8\x01\0\x04\0\0\0\x42\0\0\
+\0\xc3\x02\0\0\x0e\xb4\x01\0\x05\0\0\0\x42\0\0\0\x2b\x01\0\0\x1d\x98\x01\0\x06\
+\0\0\0\x42\0\0\0\x50\x01\0\0\x06\xb8\x01\0\x08\0\0\0\x42\0\0\0\xd5\x02\0\0\x03\
+\xbc\x01\0\x10\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x17\0\0\0\x42\0\0\0\
+\x80\x03\0\0\x06\x04\x01\0\x1a\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x1b\0\
+\0\0\x42\0\0\0\xd1\x03\0\0\x0f\x10\x01\0\x1c\0\0\0\x42\0\0\0\xe6\x03\0\0\x2d\
+\x14\x01\0\x1e\0\0\0\x42\0\0\0\x1d\x04\0\0\x0d\x0c\x01\0\x20\0\0\0\x42\0\0\0\
+\x45\x03\0\0\x02\xc4\x01\0\x21\0\0\0\x42\0\0\0\xe6\x03\0\0\x02\x14\x01\0\x24\0\
+\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x27\0\0\0\x42\0\0\0\x45\x03\0\0\x02\
+\xc4\x01\0\x28\0\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x2b\0\0\0\x42\0\0\0\
+\x44\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x72\x04\0\0\x1b\x1c\x01\0\x2d\0\
+\0\0\x42\0\0\0\x72\x04\0\0\x06\x1c\x01\0\x2e\0\0\0\x42\0\0\0\x95\x04\0\0\x0d\
+\x24\x01\0\x30\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x3f\0\0\0\x42\0\0\0\
+\x29\x02\0\0\x01\xd4\x01\0\0\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\
+\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x18\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x1c\0\0\
+\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x80\0\0\0\x1e\0\0\
+\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1e\0\0\0\xf7\0\0\0\0\0\0\0\xa8\0\0\0\x1e\0\0\0\
+\x78\x03\0\0\0\0\0\0\xb0\0\0\0\x1e\0\0\0\x7c\x03\0\0\0\0\0\0\xc0\0\0\0\x23\0\0\
+\0\xaa\x03\0\0\0\0\0\0\xd8\0\0\0\x24\0\0\0\xf7\0\0\0\0\0\0\0\xf0\0\0\0\x24\0\0\
+\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x28\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\0\0\x1e\0\
+\0\0\xf7\0\0\0\0\0\0\0\x60\x01\0\0\x24\0\0\0\x6c\x04\0\0\0\0\0\0\x88\x01\0\0\
+\x1e\0\0\0\x27\x01\0\0\0\0\0\0\x98\x01\0\0\x1e\0\0\0\xad\x04\0\0\0\0\0\0\xa0\
+\x01\0\0\x1c\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\
+\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\
+\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\
+\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0";
+ opts.insns_sz = 2456;
opts.insns = (void *)"\
\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\
\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\
@@ -331,79 +326,83 @@ iterators_bpf__load(struct iterators_bpf *skel)
\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\
\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\
\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
-\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\
-\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\
-\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\
-\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\
-\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
+\xe8\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\xe4\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\
+\0\0\0\0\xd8\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\
-\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\
+\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x0f\0\0\x63\x01\0\0\0\
\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
-\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
-\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
+\xfc\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
+\0\xf0\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\
-\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\
-\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\
+\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x38\
+\x0f\0\0\xb7\x02\0\0\x7b\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\
\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\
-\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\
-\0\0\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\
-\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\
-\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\
+\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xb8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\
+\0\0\0\xc8\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\
\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\
-\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\
+\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\
\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\
-\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
-\x78\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\
-\x61\0\0\0\0\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
-\0\0\0\x40\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\
-\x18\x60\0\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\
-\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\
-\0\0\0\0\0\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\
-\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\
-\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\
-\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\
-\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\
-\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x20\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xf0\x0f\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x18\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\x08\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x60\x12\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x70\x12\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xa0\x11\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\x90\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\
+\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x12\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\
+\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x2c\x12\0\0\x63\x01\0\0\0\0\0\0\x79\x60\
+\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\
+\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x58\x12\0\0\x63\x01\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x12\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\
\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\
-\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\
-\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\
-\0\0\0\0\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\
-\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\
-\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4a\xff\0\0\
-\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\
-\0\0\0\0\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\
-\x18\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\
-\x60\0\0\0\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\
-\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\
-\0\0\0\0\0\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\
-\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\
-\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\
-\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\
-\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\
-\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\
-\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\
-\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\
-\0\0\0\0\0\xc5\x07\x13\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\
-\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\
-\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\
-\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\
-\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\
-\x07\x01\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\
-\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\
-\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\
-\0\0\0\0\x95\0\0\0\0\0\0\0";
+\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x63\x70\x6c\0\0\0\0\0\
+\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\x18\x68\0\0\0\0\0\0\0\0\0\0\xa8\
+\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x12\0\0\xb7\x02\0\0\x17\0\0\0\xb7\x03\
+\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\
+\x07\x4d\xff\0\0\0\0\x75\x07\x03\0\0\0\0\0\x62\x08\x04\0\0\0\0\0\x6a\x08\x02\0\
+\0\0\0\0\x05\0\x0a\0\0\0\0\0\x63\x78\x04\0\0\0\0\0\xbf\x79\0\0\0\0\0\0\x77\x09\
+\0\0\x20\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\x63\x90\0\0\0\0\0\0\x55\
+\x09\x02\0\0\0\0\0\x6a\x08\x02\0\0\0\0\0\x05\0\x01\0\0\0\0\0\x6a\x08\x02\0\x40\
+\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\xb7\x03\0\
+\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\
+\0\0\x01\0\0\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\
+\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x80\x12\0\0\x61\x01\0\0\0\0\0\0\
+\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x2c\xff\
+\0\0\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x12\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\xa8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\xd8\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x17\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\xe0\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe8\x17\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x14\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\xf8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x78\
+\x16\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x18\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x10\x18\0\0\x7b\x01\0\0\
+\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x17\0\0\x63\x01\
+\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb4\x17\0\0\x63\
+\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x17\0\0\
+\x7b\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\
+\x17\0\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x18\0\0\xb7\x02\0\
+\0\x12\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\
+\x07\0\0\0\0\0\0\xc5\x07\xf5\xfe\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x17\0\
+\0\x63\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\
+\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x98\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\
+\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x08\x18\0\0\
+\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\
+\0\0\xc5\x07\xe3\xfe\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\
+\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\
+\0\0\0\0\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\
+\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0";
err = bpf_load_and_run(&opts);
if (err < 0)
return err;
- skel->rodata = skel_finalize_map_data(&skel->maps.rodata.initial_value,
- 4096, PROT_READ, skel->maps.rodata.map_fd);
- if (!skel->rodata)
- return -ENOMEM;
return 0;
}
@@ -422,4 +421,15 @@ iterators_bpf__open_and_load(void)
return skel;
}
+__attribute__((unused)) static void
+iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused)))
+{
+#ifdef __cplusplus
+#define _Static_assert static_assert
+#endif
+#ifdef __cplusplus
+#undef _Static_assert
+#endif
+}
+
#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 601609164ef3..8d2ddcb7566b 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -7,7 +7,6 @@
#include <linux/bpf.h>
#include <linux/list.h>
#include <linux/slab.h>
-#include <linux/capability.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"
@@ -46,9 +45,6 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
/* Called from syscall */
static int queue_stack_map_alloc_check(union bpf_attr *attr)
{
- if (!bpf_capable())
- return -EPERM;
-
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 0 ||
attr->value_size == 0 ||
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index cbf2d8d784b8..4b4f9670f1a9 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -151,9 +151,6 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
int numa_node = bpf_map_attr_numa_node(attr);
struct reuseport_array *array;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
/* allocate all map elements and zero-initialize them */
array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node);
if (!array)
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 875ac9b698d9..f045fde632e5 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -23,15 +23,6 @@
#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)
-/* Maximum size of ring buffer area is limited by 32-bit page offset within
- * record header, counted in pages. Reserve 8 bits for extensibility, and take
- * into account few extra pages for consumer/producer pages and
- * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single
- * ring buffer.
- */
-#define RINGBUF_MAX_DATA_SZ \
- (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
-
struct bpf_ringbuf {
wait_queue_head_t waitq;
struct irq_work work;
@@ -161,6 +152,17 @@ static void bpf_ringbuf_notify(struct irq_work *work)
wake_up_all(&rb->waitq);
}
+/* Maximum size of ring buffer area is limited by 32-bit page offset within
+ * record header, counted in pages. Reserve 8 bits for extensibility, and
+ * take into account few extra pages for consumer/producer pages and
+ * non-mmap()'able parts, the current maximum size would be:
+ *
+ * (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
+ *
+ * This gives 64GB limit, which seems plenty for single ring buffer. Now
+ * considering that the maximum value of data_sz is (4GB - 1), there
+ * will be no overflow, so just note the size limit in the comments.
+ */
static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
{
struct bpf_ringbuf *rb;
@@ -193,12 +195,6 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
!PAGE_ALIGNED(attr->max_entries))
return ERR_PTR(-EINVAL);
-#ifdef CONFIG_64BIT
- /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */
- if (attr->max_entries > RINGBUF_MAX_DATA_SZ)
- return ERR_PTR(-E2BIG);
-#endif
-
rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
if (!rb_map)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b25fce425b2c..458bb80b14d5 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -74,9 +74,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
u64 cost, n_buckets;
int err;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f1c8733f76b8..ebeb0695305a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -37,6 +37,8 @@
#include <linux/trace_events.h>
#include <net/netfilter/nf_bpf_link.h>
+#include <net/tcx.h>
+
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
@@ -109,37 +111,6 @@ const struct bpf_map_ops bpf_map_offload_ops = {
.map_mem_usage = bpf_map_offload_map_mem_usage,
};
-static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
-{
- const struct bpf_map_ops *ops;
- u32 type = attr->map_type;
- struct bpf_map *map;
- int err;
-
- if (type >= ARRAY_SIZE(bpf_map_types))
- return ERR_PTR(-EINVAL);
- type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
- ops = bpf_map_types[type];
- if (!ops)
- return ERR_PTR(-EINVAL);
-
- if (ops->map_alloc_check) {
- err = ops->map_alloc_check(attr);
- if (err)
- return ERR_PTR(err);
- }
- if (attr->map_ifindex)
- ops = &bpf_map_offload_ops;
- if (!ops->map_mem_usage)
- return ERR_PTR(-EINVAL);
- map = ops->map_alloc(attr);
- if (IS_ERR(map))
- return map;
- map->ops = ops;
- map->map_type = type;
- return map;
-}
-
static void bpf_map_write_active_inc(struct bpf_map *map)
{
atomic64_inc(&map->writecnt);
@@ -686,7 +657,6 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
if (!btf_is_kernel(field->kptr.btf)) {
pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
field->kptr.btf_id);
- WARN_ON_ONCE(!pointee_struct_meta);
migrate_disable();
__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
pointee_struct_meta->record :
@@ -1127,7 +1097,9 @@ free_map_tab:
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
+ const struct bpf_map_ops *ops;
int numa_node = bpf_map_attr_numa_node(attr);
+ u32 map_type = attr->map_type;
struct bpf_map *map;
int f_flags;
int err;
@@ -1158,9 +1130,85 @@ static int map_create(union bpf_attr *attr)
return -EINVAL;
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
- map = find_and_alloc_map(attr);
+ map_type = attr->map_type;
+ if (map_type >= ARRAY_SIZE(bpf_map_types))
+ return -EINVAL;
+ map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
+ ops = bpf_map_types[map_type];
+ if (!ops)
+ return -EINVAL;
+
+ if (ops->map_alloc_check) {
+ err = ops->map_alloc_check(attr);
+ if (err)
+ return err;
+ }
+ if (attr->map_ifindex)
+ ops = &bpf_map_offload_ops;
+ if (!ops->map_mem_usage)
+ return -EINVAL;
+
+ /* Intent here is for unprivileged_bpf_disabled to block BPF map
+ * creation for unprivileged users; other actions depend
+ * on fd availability and access to bpffs, so are dependent on
+ * object creation success. Even with unprivileged BPF disabled,
+ * capability checks are still carried out.
+ */
+ if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+ return -EPERM;
+
+ /* check privileged map type permissions */
+ switch (map_type) {
+ case BPF_MAP_TYPE_ARRAY:
+ case BPF_MAP_TYPE_PERCPU_ARRAY:
+ case BPF_MAP_TYPE_PROG_ARRAY:
+ case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+ case BPF_MAP_TYPE_CGROUP_ARRAY:
+ case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+ case BPF_MAP_TYPE_HASH:
+ case BPF_MAP_TYPE_PERCPU_HASH:
+ case BPF_MAP_TYPE_HASH_OF_MAPS:
+ case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ case BPF_MAP_TYPE_CGROUP_STORAGE:
+ case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
+ /* unprivileged */
+ break;
+ case BPF_MAP_TYPE_SK_STORAGE:
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ case BPF_MAP_TYPE_TASK_STORAGE:
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ case BPF_MAP_TYPE_BLOOM_FILTER:
+ case BPF_MAP_TYPE_LPM_TRIE:
+ case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
+ case BPF_MAP_TYPE_STACK_TRACE:
+ case BPF_MAP_TYPE_QUEUE:
+ case BPF_MAP_TYPE_STACK:
+ case BPF_MAP_TYPE_LRU_HASH:
+ case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+ case BPF_MAP_TYPE_STRUCT_OPS:
+ case BPF_MAP_TYPE_CPUMAP:
+ if (!bpf_capable())
+ return -EPERM;
+ break;
+ case BPF_MAP_TYPE_SOCKMAP:
+ case BPF_MAP_TYPE_SOCKHASH:
+ case BPF_MAP_TYPE_DEVMAP:
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ case BPF_MAP_TYPE_XSKMAP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ break;
+ default:
+ WARN(1, "unsupported map type %d", map_type);
+ return -EPERM;
+ }
+
+ map = ops->map_alloc(attr);
if (IS_ERR(map))
return PTR_ERR(map);
+ map->ops = ops;
+ map->map_type = map_type;
err = bpf_obj_name_cpy(map->name, attr->map_name,
sizeof(attr->map_name));
@@ -1931,6 +1979,11 @@ static int map_freeze(const union bpf_attr *attr)
return -ENOTSUPP;
}
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ fdput(f);
+ return -EPERM;
+ }
+
mutex_lock(&map->freeze_mutex);
if (bpf_map_write_active(map)) {
err = -EBUSY;
@@ -1940,10 +1993,6 @@ static int map_freeze(const union bpf_attr *attr)
err = -EBUSY;
goto err_put;
}
- if (!bpf_capable()) {
- err = -EPERM;
- goto err_put;
- }
WRITE_ONCE(map->frozen, true);
err_put:
@@ -2506,7 +2555,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
struct btf *attach_btf = NULL;
int err;
char license[128];
- bool is_gpl;
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
@@ -2525,15 +2573,15 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
!bpf_capable())
return -EPERM;
- /* copy eBPF program license from user space */
- if (strncpy_from_bpfptr(license,
- make_bpfptr(attr->license, uattr.is_kernel),
- sizeof(license) - 1) < 0)
- return -EFAULT;
- license[sizeof(license) - 1] = 0;
-
- /* eBPF programs must be GPL compatible to use GPL-ed functions */
- is_gpl = license_is_gpl_compatible(license);
+ /* Intent here is for unprivileged_bpf_disabled to block BPF program
+ * creation for unprivileged users; other actions depend
+ * on fd availability and access to bpffs, so are dependent on
+ * object creation success. Even with unprivileged BPF disabled,
+ * capability checks are still carried out for these
+ * and other operations.
+ */
+ if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+ return -EPERM;
if (attr->insn_cnt == 0 ||
attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
@@ -2617,12 +2665,20 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
make_bpfptr(attr->insns, uattr.is_kernel),
bpf_prog_insn_size(prog)) != 0)
goto free_prog_sec;
+ /* copy eBPF program license from user space */
+ if (strncpy_from_bpfptr(license,
+ make_bpfptr(attr->license, uattr.is_kernel),
+ sizeof(license) - 1) < 0)
+ goto free_prog_sec;
+ license[sizeof(license) - 1] = 0;
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
prog->orig_prog = NULL;
prog->jited = 0;
atomic64_set(&prog->aux->refcnt, 1);
- prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
err = bpf_prog_dev_bound_init(prog, attr);
@@ -2701,23 +2757,38 @@ free_prog:
return err;
}
-#define BPF_OBJ_LAST_FIELD file_flags
+#define BPF_OBJ_LAST_FIELD path_fd
static int bpf_obj_pin(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
+ int path_fd;
+
+ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD)
+ return -EINVAL;
+
+ /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
+ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
return -EINVAL;
- return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
+ path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
+ return bpf_obj_pin_user(attr->bpf_fd, path_fd,
+ u64_to_user_ptr(attr->pathname));
}
static int bpf_obj_get(const union bpf_attr *attr)
{
+ int path_fd;
+
if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
- attr->file_flags & ~BPF_OBJ_FLAG_MASK)
+ attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD))
return -EINVAL;
- return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+ /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
+ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
+ return -EINVAL;
+
+ path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
+ return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname),
attr->file_flags);
}
@@ -2743,10 +2814,12 @@ static void bpf_link_free_id(int id)
/* Clean up bpf_link and corresponding anon_inode file and FD. After
* anon_inode is created, bpf_link can't be just kfree()'d due to deferred
- * anon_inode's release() call. This helper marksbpf_link as
+ * anon_inode's release() call. This helper marks bpf_link as
* defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
* is not decremented, it's the responsibility of a calling code that failed
* to complete bpf_link initialization.
+ * This helper eventually calls link's dealloc callback, but does not call
+ * link's release callback.
*/
void bpf_link_cleanup(struct bpf_link_primer *primer)
{
@@ -2781,28 +2854,31 @@ static void bpf_link_put_deferred(struct work_struct *work)
bpf_link_free(link);
}
-/* bpf_link_put can be called from atomic context, but ensures that resources
- * are freed from process context
+/* bpf_link_put might be called from atomic context. It needs to be called
+ * from sleepable context in order to acquire sleeping locks during the process.
*/
void bpf_link_put(struct bpf_link *link)
{
if (!atomic64_dec_and_test(&link->refcnt))
return;
- if (in_atomic()) {
- INIT_WORK(&link->work, bpf_link_put_deferred);
- schedule_work(&link->work);
- } else {
- bpf_link_free(link);
- }
+ INIT_WORK(&link->work, bpf_link_put_deferred);
+ schedule_work(&link->work);
}
EXPORT_SYMBOL(bpf_link_put);
+static void bpf_link_put_direct(struct bpf_link *link)
+{
+ if (!atomic64_dec_and_test(&link->refcnt))
+ return;
+ bpf_link_free(link);
+}
+
static int bpf_link_release(struct inode *inode, struct file *filp)
{
struct bpf_link *link = filp->private_data;
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return 0;
}
@@ -2972,10 +3048,17 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
{
struct bpf_tracing_link *tr_link =
container_of(link, struct bpf_tracing_link, link.link);
+ u32 target_btf_id, target_obj_id;
+ bpf_trampoline_unpack_key(tr_link->trampoline->key,
+ &target_obj_id, &target_btf_id);
seq_printf(seq,
- "attach_type:\t%d\n",
- tr_link->attach_type);
+ "attach_type:\t%d\n"
+ "target_obj_id:\t%u\n"
+ "target_btf_id:\t%u\n",
+ tr_link->attach_type,
+ target_obj_id,
+ target_btf_id);
}
static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
@@ -3215,6 +3298,25 @@ static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
raw_tp_link->btp->tp->name);
}
+static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
+ u32 len)
+{
+ if (ulen >= len + 1) {
+ if (copy_to_user(ubuf, buf, len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(ubuf, buf, ulen - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + ulen - 1))
+ return -EFAULT;
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
@@ -3233,20 +3335,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
if (!ubuf)
return 0;
- if (ulen >= tp_len + 1) {
- if (copy_to_user(ubuf, tp_name, tp_len + 1))
- return -EFAULT;
- } else {
- char zero = '\0';
-
- if (copy_to_user(ubuf, tp_name, ulen - 1))
- return -EFAULT;
- if (put_user(zero, ubuf + ulen - 1))
- return -EFAULT;
- return -ENOSPC;
- }
-
- return 0;
+ return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len);
}
static const struct bpf_link_ops bpf_raw_tp_link_lops = {
@@ -3278,9 +3367,154 @@ static void bpf_perf_link_dealloc(struct bpf_link *link)
kfree(perf_link);
}
+static int bpf_perf_link_fill_common(const struct perf_event *event,
+ char __user *uname, u32 ulen,
+ u64 *probe_offset, u64 *probe_addr,
+ u32 *fd_type)
+{
+ const char *buf;
+ u32 prog_id;
+ size_t len;
+ int err;
+
+ if (!ulen ^ !uname)
+ return -EINVAL;
+
+ err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
+ probe_offset, probe_addr);
+ if (err)
+ return err;
+ if (!uname)
+ return 0;
+ if (buf) {
+ len = strlen(buf);
+ err = bpf_copy_to_user(uname, buf, ulen, len);
+ if (err)
+ return err;
+ } else {
+ char zero = '\0';
+
+ if (put_user(zero, uname))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_KPROBE_EVENTS
+static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ char __user *uname;
+ u64 addr, offset;
+ u32 ulen, type;
+ int err;
+
+ uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
+ ulen = info->perf_event.kprobe.name_len;
+ err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
+ &type);
+ if (err)
+ return err;
+ if (type == BPF_FD_TYPE_KRETPROBE)
+ info->perf_event.type = BPF_PERF_EVENT_KRETPROBE;
+ else
+ info->perf_event.type = BPF_PERF_EVENT_KPROBE;
+
+ info->perf_event.kprobe.offset = offset;
+ if (!kallsyms_show_value(current_cred()))
+ addr = 0;
+ info->perf_event.kprobe.addr = addr;
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_UPROBE_EVENTS
+static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ char __user *uname;
+ u64 addr, offset;
+ u32 ulen, type;
+ int err;
+
+ uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
+ ulen = info->perf_event.uprobe.name_len;
+ err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
+ &type);
+ if (err)
+ return err;
+
+ if (type == BPF_FD_TYPE_URETPROBE)
+ info->perf_event.type = BPF_PERF_EVENT_URETPROBE;
+ else
+ info->perf_event.type = BPF_PERF_EVENT_UPROBE;
+ info->perf_event.uprobe.offset = offset;
+ return 0;
+}
+#endif
+
+static int bpf_perf_link_fill_probe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+#ifdef CONFIG_KPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
+ return bpf_perf_link_fill_kprobe(event, info);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
+ return bpf_perf_link_fill_uprobe(event, info);
+#endif
+ return -EOPNOTSUPP;
+}
+
+static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ char __user *uname;
+ u32 ulen;
+
+ uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
+ ulen = info->perf_event.tracepoint.name_len;
+ info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
+ return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL);
+}
+
+static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ info->perf_event.event.type = event->attr.type;
+ info->perf_event.event.config = event->attr.config;
+ info->perf_event.type = BPF_PERF_EVENT_EVENT;
+ return 0;
+}
+
+static int bpf_perf_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_perf_link *perf_link;
+ const struct perf_event *event;
+
+ perf_link = container_of(link, struct bpf_perf_link, link);
+ event = perf_get_event(perf_link->perf_file);
+ if (IS_ERR(event))
+ return PTR_ERR(event);
+
+ switch (event->prog->type) {
+ case BPF_PROG_TYPE_PERF_EVENT:
+ return bpf_perf_link_fill_perf_event(event, info);
+ case BPF_PROG_TYPE_TRACEPOINT:
+ return bpf_perf_link_fill_tracepoint(event, info);
+ case BPF_PROG_TYPE_KPROBE:
+ return bpf_perf_link_fill_probe(event, info);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
static const struct bpf_link_ops bpf_perf_link_lops = {
.release = bpf_perf_link_release,
.dealloc = bpf_perf_link_dealloc,
+ .fill_link_info = bpf_perf_link_fill_link_info,
};
static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
@@ -3422,34 +3656,6 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
return fd;
}
-static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
- enum bpf_attach_type attach_type)
-{
- switch (prog->type) {
- case BPF_PROG_TYPE_CGROUP_SOCK:
- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- case BPF_PROG_TYPE_CGROUP_SOCKOPT:
- case BPF_PROG_TYPE_SK_LOOKUP:
- return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
- case BPF_PROG_TYPE_CGROUP_SKB:
- if (!capable(CAP_NET_ADMIN))
- /* cg-skb progs can be loaded by unpriv user.
- * check permissions at attach time.
- */
- return -EPERM;
- return prog->enforce_expected_attach_type &&
- prog->expected_attach_type != attach_type ?
- -EINVAL : 0;
- case BPF_PROG_TYPE_KPROBE:
- if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
- attach_type != BPF_TRACE_KPROBE_MULTI)
- return -EINVAL;
- return 0;
- default:
- return 0;
- }
-}
-
static enum bpf_prog_type
attach_type_to_prog_type(enum bpf_attach_type attach_type)
{
@@ -3508,31 +3714,101 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_XDP;
case BPF_LSM_CGROUP:
return BPF_PROG_TYPE_LSM;
+ case BPF_TCX_INGRESS:
+ case BPF_TCX_EGRESS:
+ return BPF_PROG_TYPE_SCHED_CLS;
default:
return BPF_PROG_TYPE_UNSPEC;
}
}
-#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
+static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
+ enum bpf_attach_type attach_type)
+{
+ enum bpf_prog_type ptype;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ if (!capable(CAP_NET_ADMIN))
+ /* cg-skb progs can be loaded by unpriv user.
+ * check permissions at attach time.
+ */
+ return -EPERM;
+ return prog->enforce_expected_attach_type &&
+ prog->expected_attach_type != attach_type ?
+ -EINVAL : 0;
+ case BPF_PROG_TYPE_EXT:
+ return 0;
+ case BPF_PROG_TYPE_NETFILTER:
+ if (attach_type != BPF_NETFILTER)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ if (attach_type != BPF_PERF_EVENT)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_KPROBE:
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_KPROBE_MULTI)
+ return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_MULTI)
+ return -EINVAL;
+ if (attach_type != BPF_PERF_EVENT &&
+ attach_type != BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_MULTI)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attach_type != BPF_TCX_INGRESS &&
+ attach_type != BPF_TCX_EGRESS)
+ return -EINVAL;
+ return 0;
+ default:
+ ptype = attach_type_to_prog_type(attach_type);
+ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type)
+ return -EINVAL;
+ return 0;
+ }
+}
+
+#define BPF_PROG_ATTACH_LAST_FIELD expected_revision
+
+#define BPF_F_ATTACH_MASK_BASE \
+ (BPF_F_ALLOW_OVERRIDE | \
+ BPF_F_ALLOW_MULTI | \
+ BPF_F_REPLACE)
-#define BPF_F_ATTACH_MASK \
- (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
+#define BPF_F_ATTACH_MASK_MPROG \
+ (BPF_F_REPLACE | \
+ BPF_F_BEFORE | \
+ BPF_F_AFTER | \
+ BPF_F_ID | \
+ BPF_F_LINK)
static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
struct bpf_prog *prog;
+ u32 mask;
int ret;
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
- if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
- return -EINVAL;
-
ptype = attach_type_to_prog_type(attr->attach_type);
if (ptype == BPF_PROG_TYPE_UNSPEC)
return -EINVAL;
+ mask = bpf_mprog_supported(ptype) ?
+ BPF_F_ATTACH_MASK_MPROG : BPF_F_ATTACH_MASK_BASE;
+ if (attr->attach_flags & ~mask)
+ return -EINVAL;
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
if (IS_ERR(prog))
@@ -3568,6 +3844,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
else
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ ret = tcx_prog_attach(attr, prog);
+ break;
default:
ret = -EINVAL;
}
@@ -3577,25 +3856,41 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return ret;
}
-#define BPF_PROG_DETACH_LAST_FIELD attach_type
+#define BPF_PROG_DETACH_LAST_FIELD expected_revision
static int bpf_prog_detach(const union bpf_attr *attr)
{
+ struct bpf_prog *prog = NULL;
enum bpf_prog_type ptype;
+ int ret;
if (CHECK_ATTR(BPF_PROG_DETACH))
return -EINVAL;
ptype = attach_type_to_prog_type(attr->attach_type);
+ if (bpf_mprog_supported(ptype)) {
+ if (ptype == BPF_PROG_TYPE_UNSPEC)
+ return -EINVAL;
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
+ return -EINVAL;
+ if (attr->attach_bpf_fd) {
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ }
+ }
switch (ptype) {
case BPF_PROG_TYPE_SK_MSG:
case BPF_PROG_TYPE_SK_SKB:
- return sock_map_prog_detach(attr, ptype);
+ ret = sock_map_prog_detach(attr, ptype);
+ break;
case BPF_PROG_TYPE_LIRC_MODE2:
- return lirc_prog_detach(attr);
+ ret = lirc_prog_detach(attr);
+ break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
- return netns_bpf_prog_detach(attr, ptype);
+ ret = netns_bpf_prog_detach(attr, ptype);
+ break;
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -3604,13 +3899,21 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_LSM:
- return cgroup_bpf_prog_detach(attr, ptype);
+ ret = cgroup_bpf_prog_detach(attr, ptype);
+ break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ ret = tcx_prog_detach(attr, prog);
+ break;
default:
- return -EINVAL;
+ ret = -EINVAL;
}
+
+ if (prog)
+ bpf_prog_put(prog);
+ return ret;
}
-#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags
+#define BPF_PROG_QUERY_LAST_FIELD query.link_attach_flags
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -3658,6 +3961,9 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_SK_MSG_VERDICT:
case BPF_SK_SKB_VERDICT:
return sock_map_bpf_prog_query(attr, uattr);
+ case BPF_TCX_INGRESS:
+ case BPF_TCX_EGRESS:
+ return tcx_prog_query(attr, uattr);
default:
return -EINVAL;
}
@@ -4575,10 +4881,9 @@ err_put:
return err;
}
-#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies
+#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
- enum bpf_prog_type ptype;
struct bpf_prog *prog;
int ret;
@@ -4598,38 +4903,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
goto out;
switch (prog->type) {
- case BPF_PROG_TYPE_EXT:
- break;
- case BPF_PROG_TYPE_NETFILTER:
- if (attr->link_create.attach_type != BPF_NETFILTER) {
- ret = -EINVAL;
- goto out;
- }
- break;
- case BPF_PROG_TYPE_PERF_EVENT:
- case BPF_PROG_TYPE_TRACEPOINT:
- if (attr->link_create.attach_type != BPF_PERF_EVENT) {
- ret = -EINVAL;
- goto out;
- }
- break;
- case BPF_PROG_TYPE_KPROBE:
- if (attr->link_create.attach_type != BPF_PERF_EVENT &&
- attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) {
- ret = -EINVAL;
- goto out;
- }
- break;
- default:
- ptype = attach_type_to_prog_type(attr->link_create.attach_type);
- if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
- ret = -EINVAL;
- goto out;
- }
- break;
- }
-
- switch (prog->type) {
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -4671,6 +4944,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_XDP:
ret = bpf_xdp_link_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ ret = tcx_link_attach(attr, prog);
+ break;
case BPF_PROG_TYPE_NETFILTER:
ret = bpf_nf_link_attach(attr, prog);
break;
@@ -4682,8 +4958,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_KPROBE:
if (attr->link_create.attach_type == BPF_PERF_EVENT)
ret = bpf_perf_link_attach(attr, prog);
- else
+ else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
ret = bpf_kprobe_multi_link_attach(attr, prog);
+ else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
+ ret = bpf_uprobe_multi_link_attach(attr, prog);
break;
default:
ret = -EINVAL;
@@ -4778,7 +5056,7 @@ out_put_progs:
if (ret)
bpf_prog_put(new_prog);
out_put_link:
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return ret;
}
@@ -4801,7 +5079,7 @@ static int link_detach(union bpf_attr *attr)
else
ret = -EOPNOTSUPP;
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return ret;
}
@@ -4871,7 +5149,7 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
fd = bpf_link_new_fd(link);
if (fd < 0)
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return fd;
}
@@ -4948,7 +5226,7 @@ static int bpf_iter_create(union bpf_attr *attr)
return PTR_ERR(link);
err = bpf_iter_new_fd(link);
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return err;
}
@@ -5018,23 +5296,8 @@ out_prog_put:
static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
- bool capable;
int err;
- capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled;
-
- /* Intent here is for unprivileged_bpf_disabled to block key object
- * creation commands for unprivileged users; other actions depend
- * of fd availability and access to bpffs, so are dependent on
- * object creation success. Capabilities are later verified for
- * operations such as load and map create, so even with unprivileged
- * BPF disabled, capability checks are still carried out for these
- * and other operations.
- */
- if (!capable &&
- (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD))
- return -EPERM;
-
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
@@ -5394,7 +5657,8 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write,
*(int *)table->data = unpriv_enable;
}
- unpriv_ebpf_notify(unpriv_enable);
+ if (write)
+ unpriv_ebpf_notify(unpriv_enable);
return ret;
}
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
new file mode 100644
index 000000000000..13f0b5dc8262
--- /dev/null
+++ b/kernel/bpf/tcx.c
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include <linux/bpf.h>
+#include <linux/bpf_mprog.h>
+#include <linux/netdevice.h>
+
+#include <net/tcx.h>
+
+int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ bool created, ingress = attr->attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct bpf_prog *replace_prog = NULL;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ if (attr->attach_flags & BPF_F_REPLACE) {
+ replace_prog = bpf_prog_get_type(attr->replace_bpf_fd,
+ prog->type);
+ if (IS_ERR(replace_prog)) {
+ ret = PTR_ERR(replace_prog);
+ replace_prog = NULL;
+ goto out;
+ }
+ }
+ entry = tcx_entry_fetch_or_create(dev, ingress, &created);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog,
+ attr->attach_flags, attr->relative_fd,
+ attr->expected_revision);
+ if (!ret) {
+ if (entry != entry_new) {
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_inc(ingress);
+ }
+ bpf_mprog_commit(entry);
+ } else if (created) {
+ tcx_entry_free(entry);
+ }
+out:
+ if (replace_prog)
+ bpf_prog_put(replace_prog);
+ rtnl_unlock();
+ return ret;
+}
+
+int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ bool ingress = attr->attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags,
+ attr->relative_fd, attr->expected_revision);
+ if (!ret) {
+ if (!tcx_entry_is_active(entry_new))
+ entry_new = NULL;
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_dec(ingress);
+ bpf_mprog_commit(entry);
+ if (!entry_new)
+ tcx_entry_free(entry);
+ }
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+void tcx_uninstall(struct net_device *dev, bool ingress)
+{
+ struct bpf_mprog_entry *entry, *entry_new = NULL;
+ struct bpf_tuple tuple = {};
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ bool active;
+
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry)
+ return;
+ active = tcx_entry(entry)->miniq_active;
+ if (active)
+ bpf_mprog_clear_all(entry, &entry_new);
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ bpf_mprog_foreach_tuple(entry, fp, cp, tuple) {
+ if (tuple.link)
+ tcx_link(tuple.link)->dev = NULL;
+ else
+ bpf_prog_put(tuple.prog);
+ tcx_skeys_dec(ingress);
+ }
+ if (!active)
+ tcx_entry_free(entry);
+}
+
+int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+ bool ingress = attr->query.attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_mprog_entry *entry;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->query.target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_query(attr, uattr, entry);
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd,
+ u64 revision)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool created, ingress = tcx->location == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev = tcx->dev;
+ int ret;
+
+ ASSERT_RTNL();
+ entry = tcx_entry_fetch_or_create(dev, ingress, &created);
+ if (!entry)
+ return -ENOMEM;
+ ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags,
+ id_or_fd, revision);
+ if (!ret) {
+ if (entry != entry_new) {
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_inc(ingress);
+ }
+ bpf_mprog_commit(entry);
+ } else if (created) {
+ tcx_entry_free(entry);
+ }
+ return ret;
+}
+
+static void tcx_link_release(struct bpf_link *link)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool ingress = tcx->location == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ dev = tcx->dev;
+ if (!dev)
+ goto out;
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0);
+ if (!ret) {
+ if (!tcx_entry_is_active(entry_new))
+ entry_new = NULL;
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_dec(ingress);
+ bpf_mprog_commit(entry);
+ if (!entry_new)
+ tcx_entry_free(entry);
+ tcx->dev = NULL;
+ }
+out:
+ WARN_ON_ONCE(ret);
+ rtnl_unlock();
+}
+
+static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog,
+ struct bpf_prog *oprog)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool ingress = tcx->location == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ dev = tcx->dev;
+ if (!dev) {
+ ret = -ENOLINK;
+ goto out;
+ }
+ if (oprog && link->prog != oprog) {
+ ret = -EPERM;
+ goto out;
+ }
+ oprog = link->prog;
+ if (oprog == nprog) {
+ bpf_prog_put(nprog);
+ goto out;
+ }
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog,
+ BPF_F_REPLACE | BPF_F_ID,
+ link->prog->aux->id, 0);
+ if (!ret) {
+ WARN_ON_ONCE(entry != entry_new);
+ oprog = xchg(&link->prog, nprog);
+ bpf_prog_put(oprog);
+ bpf_mprog_commit(entry);
+ }
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+static void tcx_link_dealloc(struct bpf_link *link)
+{
+ kfree(tcx_link(link));
+}
+
+static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
+{
+ const struct tcx_link *tcx = tcx_link_const(link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (tcx->dev)
+ ifindex = tcx->dev->ifindex;
+ rtnl_unlock();
+
+ seq_printf(seq, "ifindex:\t%u\n", ifindex);
+ seq_printf(seq, "attach_type:\t%u (%s)\n",
+ tcx->location,
+ tcx->location == BPF_TCX_INGRESS ? "ingress" : "egress");
+}
+
+static int tcx_link_fill_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ const struct tcx_link *tcx = tcx_link_const(link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (tcx->dev)
+ ifindex = tcx->dev->ifindex;
+ rtnl_unlock();
+
+ info->tcx.ifindex = ifindex;
+ info->tcx.attach_type = tcx->location;
+ return 0;
+}
+
+static int tcx_link_detach(struct bpf_link *link)
+{
+ tcx_link_release(link);
+ return 0;
+}
+
+static const struct bpf_link_ops tcx_link_lops = {
+ .release = tcx_link_release,
+ .detach = tcx_link_detach,
+ .dealloc = tcx_link_dealloc,
+ .update_prog = tcx_link_update,
+ .show_fdinfo = tcx_link_fdinfo,
+ .fill_link_info = tcx_link_fill_info,
+};
+
+static int tcx_link_init(struct tcx_link *tcx,
+ struct bpf_link_primer *link_primer,
+ const union bpf_attr *attr,
+ struct net_device *dev,
+ struct bpf_prog *prog)
+{
+ bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog);
+ tcx->location = attr->link_create.attach_type;
+ tcx->dev = dev;
+ return bpf_link_prime(&tcx->link, link_primer);
+}
+
+int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_link_primer link_primer;
+ struct net_device *dev;
+ struct tcx_link *tcx;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->link_create.target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ tcx = kzalloc(sizeof(*tcx), GFP_USER);
+ if (!tcx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = tcx_link_init(tcx, &link_primer, attr, dev, prog);
+ if (ret) {
+ kfree(tcx);
+ goto out;
+ }
+ ret = tcx_link_prog_attach(&tcx->link, attr->link_create.flags,
+ attr->link_create.tcx.relative_fd,
+ attr->link_create.tcx.expected_revision);
+ if (ret) {
+ tcx->dev = NULL;
+ bpf_link_cleanup(&link_primer);
+ goto out;
+ }
+ ret = bpf_link_settle(&link_primer);
+out:
+ rtnl_unlock();
+ return ret;
+}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index ac021bc43a66..78acf28d4873 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -251,11 +251,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a
return tlinks;
}
-static void __bpf_tramp_image_put_deferred(struct work_struct *work)
+static void bpf_tramp_image_free(struct bpf_tramp_image *im)
{
- struct bpf_tramp_image *im;
-
- im = container_of(work, struct bpf_tramp_image, work);
bpf_image_ksym_del(&im->ksym);
bpf_jit_free_exec(im->image);
bpf_jit_uncharge_modmem(PAGE_SIZE);
@@ -263,6 +260,14 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
kfree_rcu(im, rcu);
}
+static void __bpf_tramp_image_put_deferred(struct work_struct *work)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(work, struct bpf_tramp_image, work);
+ bpf_tramp_image_free(im);
+}
+
/* callback, fexit step 3 or fentry step 2 */
static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
{
@@ -344,7 +349,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
}
-static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
{
struct bpf_tramp_image *im;
struct bpf_ksym *ksym;
@@ -371,7 +376,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
ksym = &im->ksym;
INIT_LIST_HEAD_RCU(&ksym->lnode);
- snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
+ snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
bpf_image_ksym_add(image, ksym);
return im;
@@ -401,11 +406,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
err = unregister_fentry(tr, tr->cur_image->image);
bpf_tramp_image_put(tr->cur_image);
tr->cur_image = NULL;
- tr->selector = 0;
goto out;
}
- im = bpf_tramp_image_alloc(tr->key, tr->selector);
+ im = bpf_tramp_image_alloc(tr->key);
if (IS_ERR(im)) {
err = PTR_ERR(im);
goto out;
@@ -438,12 +442,11 @@ again:
&tr->func.model, tr->flags, tlinks,
tr->func.addr);
if (err < 0)
- goto out;
+ goto out_free;
set_memory_rox((long)im->image, 1);
- WARN_ON(tr->cur_image && tr->selector == 0);
- WARN_ON(!tr->cur_image && tr->selector);
+ WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
/* progs already running at this address */
err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
@@ -468,18 +471,21 @@ again:
}
#endif
if (err)
- goto out;
+ goto out_free;
if (tr->cur_image)
bpf_tramp_image_put(tr->cur_image);
tr->cur_image = im;
- tr->selector++;
out:
/* If any error happens, restore previous flags */
if (err)
tr->flags = orig_flags;
kfree(tlinks);
return err;
+
+out_free:
+ bpf_tramp_image_free(im);
+ goto out;
}
static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cf5f230360f5..bb78212fa5b2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -25,6 +25,8 @@
#include <linux/btf_ids.h>
#include <linux/poison.h>
#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <net/xdp.h>
#include "disasm.h"
@@ -197,6 +199,7 @@ static int ref_set_non_owning(struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
static void specialize_kfunc(struct bpf_verifier_env *env,
u32 func_id, u16 offset, unsigned long *addr);
+static bool is_trusted_reg(const struct bpf_reg_state *reg);
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
@@ -240,6 +243,12 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
(poisoned ? BPF_MAP_KEY_POISON : 0ULL);
}
+static bool bpf_helper_call(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0;
+}
+
static bool bpf_pseudo_call(const struct bpf_insn *insn)
{
return insn->code == (BPF_JMP | BPF_CALL) &&
@@ -273,11 +282,6 @@ struct bpf_call_arg_meta {
struct btf_field *kptr_field;
};
-struct btf_and_id {
- struct btf *btf;
- u32 btf_id;
-};
-
struct bpf_kfunc_call_arg_meta {
/* In parameters */
struct btf *btf;
@@ -296,10 +300,21 @@ struct bpf_kfunc_call_arg_meta {
u64 value;
bool found;
} arg_constant;
- union {
- struct btf_and_id arg_obj_drop;
- struct btf_and_id arg_refcount_acquire;
- };
+
+ /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
+ * generally to pass info about user-defined local kptr types to later
+ * verification logic
+ * bpf_obj_drop
+ * Record the local kptr type to be drop'd
+ * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
+ * Record the local kptr type to be refcount_incr'd and use
+ * arg_owning_ref to determine whether refcount_acquire should be
+ * fallible
+ */
+ struct btf *arg_btf;
+ u32 arg_btf_id;
+ bool arg_owning_ref;
+
struct {
struct btf_field *field;
} arg_list_head;
@@ -309,6 +324,7 @@ struct bpf_kfunc_call_arg_meta {
struct {
enum bpf_dynptr_type type;
u32 id;
+ u32 ref_obj_id;
} initialized_dynptr;
struct {
u8 spi;
@@ -429,8 +445,11 @@ static bool type_may_be_null(u32 type)
return type & PTR_MAYBE_NULL;
}
-static bool reg_type_not_null(enum bpf_reg_type type)
+static bool reg_not_null(const struct bpf_reg_state *reg)
{
+ enum bpf_reg_type type;
+
+ type = reg->type;
if (type_may_be_null(type))
return false;
@@ -440,6 +459,7 @@ static bool reg_type_not_null(enum bpf_reg_type type)
type == PTR_TO_MAP_VALUE ||
type == PTR_TO_MAP_KEY ||
type == PTR_TO_SOCK_COMMON ||
+ (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) ||
type == PTR_TO_MEM;
}
@@ -468,6 +488,13 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
return rec;
}
+static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;
+
+ return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL;
+}
+
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
@@ -515,6 +542,8 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_dynptr_data;
}
+static bool is_callback_calling_kfunc(u32 btf_id);
+
static bool is_callback_calling_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_for_each_map_elem ||
@@ -524,6 +553,11 @@ static bool is_callback_calling_function(enum bpf_func_id func_id)
func_id == BPF_FUNC_user_ringbuf_drain;
}
+static bool is_async_callback_calling_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_timer_set_callback;
+}
+
static bool is_storage_get_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_storage_get ||
@@ -604,9 +638,9 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
type & PTR_TRUSTED ? "trusted_" : ""
);
- snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
+ snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s",
prefix, str[base_type(type)], postfix);
- return env->type_str_buf;
+ return env->tmp_str_buf;
}
static char slot_type_char[] = {
@@ -847,11 +881,11 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
struct bpf_func_state *state, int spi);
static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- enum bpf_arg_type arg_type, int insn_idx)
+ enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id)
{
struct bpf_func_state *state = func(env, reg);
enum bpf_dynptr_type type;
- int spi, i, id, err;
+ int spi, i, err;
spi = dynptr_get_spi(env, reg);
if (spi < 0)
@@ -887,7 +921,13 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (dynptr_type_refcounted(type)) {
/* The id is used to track proper releasing */
- id = acquire_reference_state(env, insn_idx);
+ int id;
+
+ if (clone_ref_obj_id)
+ id = clone_ref_obj_id;
+ else
+ id = acquire_reference_state(env, insn_idx);
+
if (id < 0)
return id;
@@ -901,24 +941,15 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
return 0;
}
-static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi)
{
- struct bpf_func_state *state = func(env, reg);
- int spi, i;
-
- spi = dynptr_get_spi(env, reg);
- if (spi < 0)
- return spi;
+ int i;
for (i = 0; i < BPF_REG_SIZE; i++) {
state->stack[spi].slot_type[i] = STACK_INVALID;
state->stack[spi - 1].slot_type[i] = STACK_INVALID;
}
- /* Invalidate any slices associated with this dynptr */
- if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type))
- WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id));
-
__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
@@ -945,6 +976,50 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
*/
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+}
+
+static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, ref_obj_id, i;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+ invalidate_dynptr(env, state, spi);
+ return 0;
+ }
+
+ ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;
+
+ /* If the dynptr has a ref_obj_id, then we need to invalidate
+ * two things:
+ *
+ * 1) Any dynptrs with a matching ref_obj_id (clones)
+ * 2) Any slices derived from this dynptr.
+ */
+
+ /* Invalidate any slices associated with this dynptr */
+ WARN_ON_ONCE(release_reference(env, ref_obj_id));
+
+ /* Invalidate any dynptr clones */
+ for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id)
+ continue;
+
+ /* it should always be the case that if the ref obj id
+ * matches then the stack slot also belongs to a
+ * dynptr
+ */
+ if (state->stack[i].slot_type[0] != STACK_DYNPTR) {
+ verbose(env, "verifier internal error: misconfigured ref_obj_id\n");
+ return -EFAULT;
+ }
+ if (state->stack[i].spilled_ptr.dynptr.first_slot)
+ invalidate_dynptr(env, state, i);
+ }
return 0;
}
@@ -1254,6 +1329,12 @@ static bool is_spilled_reg(const struct bpf_stack_state *stack)
return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
}
+static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
+{
+ return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL &&
+ stack->spilled_ptr.type == SCALAR_VALUE;
+}
+
static void scrub_spilled_slot(u8 *stype)
{
if (*stype != STACK_INVALID)
@@ -2775,7 +2856,10 @@ static int check_subprogs(struct bpf_verifier_env *env)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
- off = i + insn[i].off + 1;
+ if (code == (BPF_JMP32 | BPF_JA))
+ off = i + insn[i].imm + 1;
+ else
+ off = i + insn[i].off + 1;
if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off);
return -EINVAL;
@@ -2787,6 +2871,7 @@ next:
* or unconditional jump back
*/
if (code != (BPF_JMP | BPF_EXIT) &&
+ code != (BPF_JMP32 | BPF_JA) &&
code != (BPF_JMP | BPF_JA)) {
verbose(env, "last insn is not an exit or jmp\n");
return -EINVAL;
@@ -2932,8 +3017,10 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (class == BPF_ALU64 && op == BPF_END && (insn->imm == 16 || insn->imm == 32))
+ return false;
+
if (class == BPF_ALU64 || class == BPF_JMP ||
- /* BPF_END always use BPF_ALU class. */
(class == BPF_ALU && op == BPF_END && insn->imm == 64))
return true;
@@ -3144,12 +3231,172 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
return btf_name_by_offset(desc_btf, func->name_off);
}
+static inline void bt_init(struct backtrack_state *bt, u32 frame)
+{
+ bt->frame = frame;
+}
+
+static inline void bt_reset(struct backtrack_state *bt)
+{
+ struct bpf_verifier_env *env = bt->env;
+
+ memset(bt, 0, sizeof(*bt));
+ bt->env = env;
+}
+
+static inline u32 bt_empty(struct backtrack_state *bt)
+{
+ u64 mask = 0;
+ int i;
+
+ for (i = 0; i <= bt->frame; i++)
+ mask |= bt->reg_masks[i] | bt->stack_masks[i];
+
+ return mask == 0;
+}
+
+static inline int bt_subprog_enter(struct backtrack_state *bt)
+{
+ if (bt->frame == MAX_CALL_FRAMES - 1) {
+ verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ bt->frame++;
+ return 0;
+}
+
+static inline int bt_subprog_exit(struct backtrack_state *bt)
+{
+ if (bt->frame == 0) {
+ verbose(bt->env, "BUG subprog exit from frame 0\n");
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ bt->frame--;
+ return 0;
+}
+
+static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ bt->reg_masks[frame] |= 1 << reg;
+}
+
+static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ bt->reg_masks[frame] &= ~(1 << reg);
+}
+
+static inline void bt_set_reg(struct backtrack_state *bt, u32 reg)
+{
+ bt_set_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg)
+{
+ bt_clear_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_masks[frame] |= 1ull << slot;
+}
+
+static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_masks[frame] &= ~(1ull << slot);
+}
+
+static inline void bt_set_slot(struct backtrack_state *bt, u32 slot)
+{
+ bt_set_frame_slot(bt, bt->frame, slot);
+}
+
+static inline void bt_clear_slot(struct backtrack_state *bt, u32 slot)
+{
+ bt_clear_frame_slot(bt, bt->frame, slot);
+}
+
+static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->reg_masks[frame];
+}
+
+static inline u32 bt_reg_mask(struct backtrack_state *bt)
+{
+ return bt->reg_masks[bt->frame];
+}
+
+static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->stack_masks[frame];
+}
+
+static inline u64 bt_stack_mask(struct backtrack_state *bt)
+{
+ return bt->stack_masks[bt->frame];
+}
+
+static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
+{
+ return bt->reg_masks[bt->frame] & (1 << reg);
+}
+
+static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot)
+{
+ return bt->stack_masks[bt->frame] & (1ull << slot);
+}
+
+/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
+static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, reg_mask);
+ for_each_set_bit(i, mask, 32) {
+ n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */
+static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, stack_mask);
+ for_each_set_bit(i, mask, 64) {
+ n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+
/* For given verifier state backtrack_insn() is called from the last insn to
* the first insn. Its purpose is to compute a bitmask of registers and
* stack slots that needs precision in the parent verifier state.
+ *
+ * @idx is an index of the instruction we are currently processing;
+ * @subseq_idx is an index of the subsequent instruction that:
+ * - *would be* executed next, if jump history is viewed in forward order;
+ * - *was* processed previously during backtracking.
*/
-static int backtrack_insn(struct bpf_verifier_env *env, int idx,
- u32 *reg_mask, u64 *stack_mask)
+static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
+ struct backtrack_state *bt)
{
const struct bpf_insn_cbs cbs = {
.cb_call = disasm_kfunc_name,
@@ -3160,29 +3407,33 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
u8 class = BPF_CLASS(insn->code);
u8 opcode = BPF_OP(insn->code);
u8 mode = BPF_MODE(insn->code);
- u32 dreg = 1u << insn->dst_reg;
- u32 sreg = 1u << insn->src_reg;
- u32 spi;
+ u32 dreg = insn->dst_reg;
+ u32 sreg = insn->src_reg;
+ u32 spi, i;
if (insn->code == 0)
return 0;
if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
+ verbose(env, "mark_precise: frame%d: regs=%s ",
+ bt->frame, env->tmp_str_buf);
+ fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
+ verbose(env, "stack=%s before ", env->tmp_str_buf);
verbose(env, "%d: ", idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
}
if (class == BPF_ALU || class == BPF_ALU64) {
- if (!(*reg_mask & dreg))
+ if (!bt_is_reg_set(bt, dreg))
return 0;
if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
- /* dreg = sreg
+ /* dreg = sreg or dreg = (s8, s16, s32)sreg
* dreg needs precision after this insn
* sreg needs precision before this insn
*/
- *reg_mask &= ~dreg;
- *reg_mask |= sreg;
+ bt_clear_reg(bt, dreg);
+ bt_set_reg(bt, sreg);
} else {
/* dreg = K
* dreg needs precision after this insn.
@@ -3190,7 +3441,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
* as precise=true in this verifier state.
* No further markings in parent are necessary
*/
- *reg_mask &= ~dreg;
+ bt_clear_reg(bt, dreg);
}
} else {
if (BPF_SRC(insn->code) == BPF_X) {
@@ -3198,15 +3449,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
* both dreg and sreg need precision
* before this insn
*/
- *reg_mask |= sreg;
+ bt_set_reg(bt, sreg);
} /* else dreg += K
* dreg still needs precision before this insn
*/
}
} else if (class == BPF_LDX) {
- if (!(*reg_mask & dreg))
+ if (!bt_is_reg_set(bt, dreg))
return 0;
- *reg_mask &= ~dreg;
+ bt_clear_reg(bt, dreg);
/* scalars can only be spilled into stack w/o losing precision.
* Load from any other memory can be zero extended.
@@ -3227,9 +3478,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
- *stack_mask |= 1ull << spi;
+ bt_set_slot(bt, spi);
} else if (class == BPF_STX || class == BPF_ST) {
- if (*reg_mask & dreg)
+ if (bt_is_reg_set(bt, dreg))
/* stx & st shouldn't be using _scalar_ dst_reg
* to access memory. It means backtracking
* encountered a case of pointer subtraction.
@@ -3244,20 +3495,92 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
- if (!(*stack_mask & (1ull << spi)))
+ if (!bt_is_slot_set(bt, spi))
return 0;
- *stack_mask &= ~(1ull << spi);
+ bt_clear_slot(bt, spi);
if (class == BPF_STX)
- *reg_mask |= sreg;
+ bt_set_reg(bt, sreg);
} else if (class == BPF_JMP || class == BPF_JMP32) {
- if (opcode == BPF_CALL) {
- if (insn->src_reg == BPF_PSEUDO_CALL)
- return -ENOTSUPP;
- /* BPF helpers that invoke callback subprogs are
- * equivalent to BPF_PSEUDO_CALL above
+ if (bpf_pseudo_call(insn)) {
+ int subprog_insn_idx, subprog;
+
+ subprog_insn_idx = idx + insn->imm + 1;
+ subprog = find_subprog(env, subprog_insn_idx);
+ if (subprog < 0)
+ return -EFAULT;
+
+ if (subprog_is_global(env, subprog)) {
+ /* check that jump history doesn't have any
+ * extra instructions from subprog; the next
+ * instruction after call to global subprog
+ * should be literally next instruction in
+ * caller program
+ */
+ WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug");
+ /* r1-r5 are invalidated after subprog call,
+ * so for global func call it shouldn't be set
+ * anymore
+ */
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ /* global subprog always sets R0 */
+ bt_clear_reg(bt, BPF_REG_0);
+ return 0;
+ } else {
+ /* static subprog call instruction, which
+ * means that we are exiting current subprog,
+ * so only r1-r5 could be still requested as
+ * precise, r0 and r6-r10 or any stack slot in
+ * the current frame should be zero by now
+ */
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ /* we don't track register spills perfectly,
+ * so fallback to force-precise instead of failing */
+ if (bt_stack_mask(bt) != 0)
+ return -ENOTSUPP;
+ /* propagate r1-r5 to the caller */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
+ if (bt_is_reg_set(bt, i)) {
+ bt_clear_reg(bt, i);
+ bt_set_frame_reg(bt, bt->frame - 1, i);
+ }
+ }
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ }
+ } else if ((bpf_helper_call(insn) &&
+ is_callback_calling_function(insn->imm) &&
+ !is_async_callback_calling_function(insn->imm)) ||
+ (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) {
+ /* callback-calling helper or kfunc call, which means
+ * we are exiting from subprog, but unlike the subprog
+ * call handling above, we shouldn't propagate
+ * precision of r1-r5 (if any requested), as they are
+ * not actually arguments passed directly to callback
+ * subprogs
*/
- if (insn->src_reg == 0 && is_callback_calling_function(insn->imm))
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ if (bt_stack_mask(bt) != 0)
return -ENOTSUPP;
+ /* clear r1-r5 in callback subprog's mask */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ } else if (opcode == BPF_CALL) {
/* kfunc with imm==0 is invalid and fixup_kfunc_call will
* catch this error later. Make backtracking conservative
* with ENOTSUPP.
@@ -3265,19 +3588,51 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
return -ENOTSUPP;
/* regular helper call sets R0 */
- *reg_mask &= ~1;
- if (*reg_mask & 0x3f) {
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
/* if backtracing was looking for registers R1-R5
* they should have been found already.
*/
- verbose(env, "BUG regs %x\n", *reg_mask);
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
} else if (opcode == BPF_EXIT) {
- return -ENOTSUPP;
+ bool r0_precise;
+
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ /* if backtracing was looking for registers R1-R5
+ * they should have been found already.
+ */
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+
+ /* BPF_EXIT in subprog or callback always returns
+ * right after the call instruction, so by checking
+ * whether the instruction at subseq_idx-1 is subprog
+ * call or not we can distinguish actual exit from
+ * *subprog* from exit from *callback*. In the former
+ * case, we need to propagate r0 precision, if
+ * necessary. In the former we never do that.
+ */
+ r0_precise = subseq_idx - 1 >= 0 &&
+ bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) &&
+ bt_is_reg_set(bt, BPF_REG_0);
+
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_subprog_enter(bt))
+ return -EFAULT;
+
+ if (r0_precise)
+ bt_set_reg(bt, BPF_REG_0);
+ /* r6-r9 and stack slots will stay set in caller frame
+ * bitmasks until we return back from callee(s)
+ */
+ return 0;
} else if (BPF_SRC(insn->code) == BPF_X) {
- if (!(*reg_mask & (dreg | sreg)))
+ if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg))
return 0;
/* dreg <cond> sreg
* Both dreg and sreg need precision before
@@ -3285,7 +3640,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
* before it would be equally necessary to
* propagate it to dreg.
*/
- *reg_mask |= (sreg | dreg);
+ bt_set_reg(bt, dreg);
+ bt_set_reg(bt, sreg);
/* else dreg <cond> K
* Only dreg still needs precision before
* this insn, so for the K-based conditional
@@ -3293,9 +3649,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
*/
}
} else if (class == BPF_LD) {
- if (!(*reg_mask & dreg))
+ if (!bt_is_reg_set(bt, dreg))
return 0;
- *reg_mask &= ~dreg;
+ bt_clear_reg(bt, dreg);
/* It's ld_imm64 or ld_abs or ld_ind.
* For ld_imm64 no further tracking of precision
* into parent is necessary
@@ -3366,6 +3722,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
struct bpf_reg_state *reg;
int i, j;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n",
+ st->curframe);
+ }
+
/* big hammer: mark all scalars precise in this path.
* pop_stack may still get !precise scalars.
* We also skip current state and go straight to first parent state,
@@ -3377,17 +3738,25 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
func = st->frame[i];
for (j = 0; j < BPF_REG_FP; j++) {
reg = &func->regs[j];
- if (reg->type != SCALAR_VALUE)
+ if (reg->type != SCALAR_VALUE || reg->precise)
continue;
reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing r%d to be precise\n",
+ i, j);
+ }
}
for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
if (!is_spilled_reg(&func->stack[j]))
continue;
reg = &func->stack[j].spilled_ptr;
- if (reg->type != SCALAR_VALUE)
+ if (reg->type != SCALAR_VALUE || reg->precise)
continue;
reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n",
+ i, -(j + 1) * 8);
+ }
}
}
}
@@ -3418,6 +3787,96 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
}
}
+static bool idset_contains(struct bpf_idset *s, u32 id)
+{
+ u32 i;
+
+ for (i = 0; i < s->count; ++i)
+ if (s->ids[i] == id)
+ return true;
+
+ return false;
+}
+
+static int idset_push(struct bpf_idset *s, u32 id)
+{
+ if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids)))
+ return -EFAULT;
+ s->ids[s->count++] = id;
+ return 0;
+}
+
+static void idset_reset(struct bpf_idset *s)
+{
+ s->count = 0;
+}
+
+/* Collect a set of IDs for all registers currently marked as precise in env->bt.
+ * Mark all registers with these IDs as precise.
+ */
+static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_idset *precise_ids = &env->idset_scratch;
+ struct backtrack_state *bt = &env->bt;
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ DECLARE_BITMAP(mask, 64);
+ int i, fr;
+
+ idset_reset(precise_ids);
+
+ for (fr = bt->frame; fr >= 0; fr--) {
+ func = st->frame[fr];
+
+ bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
+ for_each_set_bit(i, mask, 32) {
+ reg = &func->regs[i];
+ if (!reg->id || reg->type != SCALAR_VALUE)
+ continue;
+ if (idset_push(precise_ids, reg->id))
+ return -EFAULT;
+ }
+
+ bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
+ for_each_set_bit(i, mask, 64) {
+ if (i >= func->allocated_stack / BPF_REG_SIZE)
+ break;
+ if (!is_spilled_scalar_reg(&func->stack[i]))
+ continue;
+ reg = &func->stack[i].spilled_ptr;
+ if (!reg->id)
+ continue;
+ if (idset_push(precise_ids, reg->id))
+ return -EFAULT;
+ }
+ }
+
+ for (fr = 0; fr <= st->curframe; ++fr) {
+ func = st->frame[fr];
+
+ for (i = BPF_REG_0; i < BPF_REG_10; ++i) {
+ reg = &func->regs[i];
+ if (!reg->id)
+ continue;
+ if (!idset_contains(precise_ids, reg->id))
+ continue;
+ bt_set_frame_reg(bt, fr, i);
+ }
+ for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) {
+ if (!is_spilled_scalar_reg(&func->stack[i]))
+ continue;
+ reg = &func->stack[i].spilled_ptr;
+ if (!reg->id)
+ continue;
+ if (!idset_contains(precise_ids, reg->id))
+ continue;
+ bt_set_frame_slot(bt, fr, i);
+ }
+ }
+
+ return 0;
+}
+
/*
* __mark_chain_precision() backtracks BPF program instruction sequence and
* chain of verifier states making sure that register *regno* (if regno >= 0)
@@ -3505,62 +3964,74 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
* mark_all_scalars_imprecise() to hopefully get more permissive and generic
* finalized states which help in short circuiting more future states.
*/
-static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
- int spi)
+static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
+ struct backtrack_state *bt = &env->bt;
struct bpf_verifier_state *st = env->cur_state;
int first_idx = st->first_insn_idx;
int last_idx = env->insn_idx;
+ int subseq_idx = -1;
struct bpf_func_state *func;
struct bpf_reg_state *reg;
- u32 reg_mask = regno >= 0 ? 1u << regno : 0;
- u64 stack_mask = spi >= 0 ? 1ull << spi : 0;
bool skip_first = true;
- bool new_marks = false;
- int i, err;
+ int i, fr, err;
if (!env->bpf_capable)
return 0;
+ /* set frame number from which we are starting to backtrack */
+ bt_init(bt, env->cur_state->curframe);
+
/* Do sanity checks against current state of register and/or stack
* slot, but don't set precise flag in current state, as precision
* tracking in the current state is unnecessary.
*/
- func = st->frame[frame];
+ func = st->frame[bt->frame];
if (regno >= 0) {
reg = &func->regs[regno];
if (reg->type != SCALAR_VALUE) {
WARN_ONCE(1, "backtracing misuse");
return -EFAULT;
}
- new_marks = true;
+ bt_set_reg(bt, regno);
}
- while (spi >= 0) {
- if (!is_spilled_reg(&func->stack[spi])) {
- stack_mask = 0;
- break;
- }
- reg = &func->stack[spi].spilled_ptr;
- if (reg->type != SCALAR_VALUE) {
- stack_mask = 0;
- break;
- }
- new_marks = true;
- break;
- }
-
- if (!new_marks)
- return 0;
- if (!reg_mask && !stack_mask)
+ if (bt_empty(bt))
return 0;
for (;;) {
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
+ bt->frame, last_idx, first_idx, subseq_idx);
+ }
+
+ /* If some register with scalar ID is marked as precise,
+ * make sure that all registers sharing this ID are also precise.
+ * This is needed to estimate effect of find_equal_scalars().
+ * Do this at the last instruction of each state,
+ * bpf_reg_state::id fields are valid for these instructions.
+ *
+ * Allows to track precision in situation like below:
+ *
+ * r2 = unknown value
+ * ...
+ * --- state #0 ---
+ * ...
+ * r1 = r2 // r1 and r2 now share the same ID
+ * ...
+ * --- state #1 {r1.id = A, r2.id = A} ---
+ * ...
+ * if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1
+ * ...
+ * --- state #2 {r1.id = A, r2.id = A} ---
+ * r3 = r10
+ * r3 += r1 // need to mark both r1 and r2
+ */
+ if (mark_precise_scalar_ids(env, st))
+ return -EFAULT;
if (last_idx < 0) {
/* we are at the entry into subprog, which
@@ -3571,12 +4042,13 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
if (st->curframe == 0 &&
st->frame[0]->subprogno > 0 &&
st->frame[0]->callsite == BPF_MAIN_FUNC &&
- stack_mask == 0 && (reg_mask & ~0x3e) == 0) {
- bitmap_from_u64(mask, reg_mask);
+ bt_stack_mask(bt) == 0 &&
+ (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) {
+ bitmap_from_u64(mask, bt_reg_mask(bt));
for_each_set_bit(i, mask, 32) {
reg = &st->frame[0]->regs[i];
if (reg->type != SCALAR_VALUE) {
- reg_mask &= ~(1u << i);
+ bt_clear_reg(bt, i);
continue;
}
reg->precise = true;
@@ -3584,8 +4056,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
return 0;
}
- verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n",
- st->frame[0]->subprogno, reg_mask, stack_mask);
+ verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n",
+ st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
@@ -3595,15 +4067,16 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
err = 0;
skip_first = false;
} else {
- err = backtrack_insn(env, i, &reg_mask, &stack_mask);
+ err = backtrack_insn(env, i, subseq_idx, bt);
}
if (err == -ENOTSUPP) {
- mark_all_scalars_precise(env, st);
+ mark_all_scalars_precise(env, env->cur_state);
+ bt_reset(bt);
return 0;
} else if (err) {
return err;
}
- if (!reg_mask && !stack_mask)
+ if (bt_empty(bt))
/* Found assignment(s) into tracked register in this state.
* Since this state is already marked, just return.
* Nothing to be tracked further in the parent state.
@@ -3611,6 +4084,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
return 0;
if (i == first_idx)
break;
+ subseq_idx = i;
i = get_prev_insn_idx(st, i, &history);
if (i >= env->prog->len) {
/* This can happen if backtracking reached insn 0
@@ -3628,84 +4102,95 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
if (!st)
break;
- new_marks = false;
- func = st->frame[frame];
- bitmap_from_u64(mask, reg_mask);
- for_each_set_bit(i, mask, 32) {
- reg = &func->regs[i];
- if (reg->type != SCALAR_VALUE) {
- reg_mask &= ~(1u << i);
- continue;
+ for (fr = bt->frame; fr >= 0; fr--) {
+ func = st->frame[fr];
+ bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
+ for_each_set_bit(i, mask, 32) {
+ reg = &func->regs[i];
+ if (reg->type != SCALAR_VALUE) {
+ bt_clear_frame_reg(bt, fr, i);
+ continue;
+ }
+ if (reg->precise)
+ bt_clear_frame_reg(bt, fr, i);
+ else
+ reg->precise = true;
}
- if (!reg->precise)
- new_marks = true;
- reg->precise = true;
- }
- bitmap_from_u64(mask, stack_mask);
- for_each_set_bit(i, mask, 64) {
- if (i >= func->allocated_stack / BPF_REG_SIZE) {
- /* the sequence of instructions:
- * 2: (bf) r3 = r10
- * 3: (7b) *(u64 *)(r3 -8) = r0
- * 4: (79) r4 = *(u64 *)(r10 -8)
- * doesn't contain jmps. It's backtracked
- * as a single block.
- * During backtracking insn 3 is not recognized as
- * stack access, so at the end of backtracking
- * stack slot fp-8 is still marked in stack_mask.
- * However the parent state may not have accessed
- * fp-8 and it's "unallocated" stack space.
- * In such case fallback to conservative.
- */
- mark_all_scalars_precise(env, st);
- return 0;
- }
+ bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
+ for_each_set_bit(i, mask, 64) {
+ if (i >= func->allocated_stack / BPF_REG_SIZE) {
+ /* the sequence of instructions:
+ * 2: (bf) r3 = r10
+ * 3: (7b) *(u64 *)(r3 -8) = r0
+ * 4: (79) r4 = *(u64 *)(r10 -8)
+ * doesn't contain jmps. It's backtracked
+ * as a single block.
+ * During backtracking insn 3 is not recognized as
+ * stack access, so at the end of backtracking
+ * stack slot fp-8 is still marked in stack_mask.
+ * However the parent state may not have accessed
+ * fp-8 and it's "unallocated" stack space.
+ * In such case fallback to conservative.
+ */
+ mark_all_scalars_precise(env, env->cur_state);
+ bt_reset(bt);
+ return 0;
+ }
- if (!is_spilled_reg(&func->stack[i])) {
- stack_mask &= ~(1ull << i);
- continue;
+ if (!is_spilled_scalar_reg(&func->stack[i])) {
+ bt_clear_frame_slot(bt, fr, i);
+ continue;
+ }
+ reg = &func->stack[i].spilled_ptr;
+ if (reg->precise)
+ bt_clear_frame_slot(bt, fr, i);
+ else
+ reg->precise = true;
}
- reg = &func->stack[i].spilled_ptr;
- if (reg->type != SCALAR_VALUE) {
- stack_mask &= ~(1ull << i);
- continue;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_reg_mask(bt, fr));
+ verbose(env, "mark_precise: frame%d: parent state regs=%s ",
+ fr, env->tmp_str_buf);
+ fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_stack_mask(bt, fr));
+ verbose(env, "stack=%s: ", env->tmp_str_buf);
+ print_verifier_state(env, func, true);
}
- if (!reg->precise)
- new_marks = true;
- reg->precise = true;
- }
- if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "parent %s regs=%x stack=%llx marks:",
- new_marks ? "didn't have" : "already had",
- reg_mask, stack_mask);
- print_verifier_state(env, func, true);
}
- if (!reg_mask && !stack_mask)
- break;
- if (!new_marks)
- break;
+ if (bt_empty(bt))
+ return 0;
+ subseq_idx = first_idx;
last_idx = st->last_insn_idx;
first_idx = st->first_insn_idx;
}
+
+ /* if we still have requested precise regs or slots, we missed
+ * something (e.g., stack access through non-r10 register), so
+ * fallback to marking all precise
+ */
+ if (!bt_empty(bt)) {
+ mark_all_scalars_precise(env, env->cur_state);
+ bt_reset(bt);
+ }
+
return 0;
}
int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
- return __mark_chain_precision(env, env->cur_state->curframe, regno, -1);
-}
-
-static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno)
-{
- return __mark_chain_precision(env, frame, regno, -1);
+ return __mark_chain_precision(env, regno);
}
-static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi)
+/* mark_chain_precision_batch() assumes that env->bt is set in the caller to
+ * desired reg and stack masks across all relevant frames
+ */
+static int mark_chain_precision_batch(struct bpf_verifier_env *env)
{
- return __mark_chain_precision(env, frame, -1, spi);
+ return __mark_chain_precision(env, -1);
}
static bool is_spillable_regtype(enum bpf_reg_type type)
@@ -4070,6 +4555,7 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env,
for (i = min_off; i < max_off; i++) {
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
+ mark_stack_slot_scratched(env, spi);
stype = ptr_state->stack[spi].slot_type;
if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
break;
@@ -4121,6 +4607,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
+ mark_stack_slot_scratched(env, spi);
+
if (is_spilled_reg(&reg_state->stack[spi])) {
u8 spill_size = 1;
@@ -4502,20 +4990,22 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, u32 regno)
{
const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
- int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
+ int perm_flags;
const char *reg_name = "";
- /* Only unreferenced case accepts untrusted pointers */
- if (kptr_field->type == BPF_KPTR_UNREF)
- perm_flags |= PTR_UNTRUSTED;
+ if (btf_is_kernel(reg->btf)) {
+ perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
+
+ /* Only unreferenced case accepts untrusted pointers */
+ if (kptr_field->type == BPF_KPTR_UNREF)
+ perm_flags |= PTR_UNTRUSTED;
+ } else {
+ perm_flags = PTR_MAYBE_NULL | MEM_ALLOC;
+ }
if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
goto bad_type;
- if (!btf_is_kernel(reg->btf)) {
- verbose(env, "R%d must point to kernel BTF\n", regno);
- return -EINVAL;
- }
/* We need to verify reg->type and reg->btf, before accessing reg->btf */
reg_name = btf_type_name(reg->btf, reg->btf_id);
@@ -4528,7 +5018,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
if (__check_ptr_off_reg(env, reg, regno, true))
return -EACCES;
- /* A full type match is needed, as BTF can be vmlinux or module BTF, and
+ /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and
* we also need to take into account the reg->off.
*
* We want to support cases like:
@@ -4574,7 +5064,9 @@ bad_type:
*/
static bool in_rcu_cs(struct bpf_verifier_env *env)
{
- return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable;
+ return env->cur_state->active_rcu_lock ||
+ env->cur_state->active_lock.ptr ||
+ !env->prog->aux->sleepable;
}
/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
@@ -4932,12 +5424,25 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
return reg->type == PTR_TO_FLOW_KEYS;
}
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+ [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+ [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+ [CONST_PTR_TO_MAP] = btf_bpf_map_id,
+};
+
static bool is_trusted_reg(const struct bpf_reg_state *reg)
{
/* A referenced register is always trusted. */
if (reg->ref_obj_id)
return true;
+ /* Types listed in the reg2btf_ids are always trusted */
+ if (reg2btf_ids[base_type(reg->type)])
+ return true;
+
/* If a register is not referenced, it is trusted if it has the
* MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the
* other type modifiers may be safe, but we elect to take an opt-in
@@ -5093,16 +5598,17 @@ static int update_stack_depth(struct bpf_verifier_env *env,
* Since recursion is prevented by check_cfg() this algorithm
* only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
-static int check_max_stack_depth(struct bpf_verifier_env *env)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
{
- int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
+ int depth = 0, frame = 0, i, subprog_end;
bool tail_call_reachable = false;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
int j;
+ i = subprog[idx].start;
process_func:
/* protect against potential stack overflow that might happen when
* bpf2bpf calls get combined with tailcalls. Limit the caller's stack
@@ -5141,7 +5647,7 @@ process_func:
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
- int next_insn;
+ int next_insn, sidx;
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
@@ -5151,21 +5657,23 @@ continue_func:
/* find the callee */
next_insn = i + insn[i].imm + 1;
- idx = find_subprog(env, next_insn);
- if (idx < 0) {
+ sidx = find_subprog(env, next_insn);
+ if (sidx < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
next_insn);
return -EFAULT;
}
- if (subprog[idx].is_async_cb) {
- if (subprog[idx].has_tail_call) {
+ if (subprog[sidx].is_async_cb) {
+ if (subprog[sidx].has_tail_call) {
verbose(env, "verifier bug. subprog has tail_call and async cb\n");
return -EFAULT;
}
- /* async callbacks don't increase bpf prog stack size */
- continue;
+ /* async callbacks don't increase bpf prog stack size unless called directly */
+ if (!bpf_pseudo_call(insn + i))
+ continue;
}
i = next_insn;
+ idx = sidx;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@@ -5201,6 +5709,22 @@ continue_func:
goto continue_func;
}
+static int check_max_stack_depth(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *si = env->subprog_info;
+ int ret;
+
+ for (int i = 0; i < env->subprog_cnt; i++) {
+ if (!i || si[i].is_async_cb) {
+ ret = check_max_stack_depth_subprog(env, i);
+ if (ret < 0)
+ return ret;
+ }
+ continue;
+ }
+ return 0;
+}
+
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
static int get_callee_stack_depth(struct bpf_verifier_env *env,
const struct bpf_insn *insn, int idx)
@@ -5314,6 +5838,147 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
__reg_combine_64_into_32(reg);
}
+static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
+{
+ if (size == 1) {
+ reg->smin_value = reg->s32_min_value = S8_MIN;
+ reg->smax_value = reg->s32_max_value = S8_MAX;
+ } else if (size == 2) {
+ reg->smin_value = reg->s32_min_value = S16_MIN;
+ reg->smax_value = reg->s32_max_value = S16_MAX;
+ } else {
+ /* size == 4 */
+ reg->smin_value = reg->s32_min_value = S32_MIN;
+ reg->smax_value = reg->s32_max_value = S32_MAX;
+ }
+ reg->umin_value = reg->u32_min_value = 0;
+ reg->umax_value = U64_MAX;
+ reg->u32_max_value = U32_MAX;
+ reg->var_off = tnum_unknown;
+}
+
+static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size)
+{
+ s64 init_s64_max, init_s64_min, s64_max, s64_min, u64_cval;
+ u64 top_smax_value, top_smin_value;
+ u64 num_bits = size * 8;
+
+ if (tnum_is_const(reg->var_off)) {
+ u64_cval = reg->var_off.value;
+ if (size == 1)
+ reg->var_off = tnum_const((s8)u64_cval);
+ else if (size == 2)
+ reg->var_off = tnum_const((s16)u64_cval);
+ else
+ /* size == 4 */
+ reg->var_off = tnum_const((s32)u64_cval);
+
+ u64_cval = reg->var_off.value;
+ reg->smax_value = reg->smin_value = u64_cval;
+ reg->umax_value = reg->umin_value = u64_cval;
+ reg->s32_max_value = reg->s32_min_value = u64_cval;
+ reg->u32_max_value = reg->u32_min_value = u64_cval;
+ return;
+ }
+
+ top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits;
+ top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits;
+
+ if (top_smax_value != top_smin_value)
+ goto out;
+
+ /* find the s64_min and s64_min after sign extension */
+ if (size == 1) {
+ init_s64_max = (s8)reg->smax_value;
+ init_s64_min = (s8)reg->smin_value;
+ } else if (size == 2) {
+ init_s64_max = (s16)reg->smax_value;
+ init_s64_min = (s16)reg->smin_value;
+ } else {
+ init_s64_max = (s32)reg->smax_value;
+ init_s64_min = (s32)reg->smin_value;
+ }
+
+ s64_max = max(init_s64_max, init_s64_min);
+ s64_min = min(init_s64_max, init_s64_min);
+
+ /* both of s64_max/s64_min positive or negative */
+ if ((s64_max >= 0) == (s64_min >= 0)) {
+ reg->smin_value = reg->s32_min_value = s64_min;
+ reg->smax_value = reg->s32_max_value = s64_max;
+ reg->umin_value = reg->u32_min_value = s64_min;
+ reg->umax_value = reg->u32_max_value = s64_max;
+ reg->var_off = tnum_range(s64_min, s64_max);
+ return;
+ }
+
+out:
+ set_sext64_default_val(reg, size);
+}
+
+static void set_sext32_default_val(struct bpf_reg_state *reg, int size)
+{
+ if (size == 1) {
+ reg->s32_min_value = S8_MIN;
+ reg->s32_max_value = S8_MAX;
+ } else {
+ /* size == 2 */
+ reg->s32_min_value = S16_MIN;
+ reg->s32_max_value = S16_MAX;
+ }
+ reg->u32_min_value = 0;
+ reg->u32_max_value = U32_MAX;
+}
+
+static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
+{
+ s32 init_s32_max, init_s32_min, s32_max, s32_min, u32_val;
+ u32 top_smax_value, top_smin_value;
+ u32 num_bits = size * 8;
+
+ if (tnum_is_const(reg->var_off)) {
+ u32_val = reg->var_off.value;
+ if (size == 1)
+ reg->var_off = tnum_const((s8)u32_val);
+ else
+ reg->var_off = tnum_const((s16)u32_val);
+
+ u32_val = reg->var_off.value;
+ reg->s32_min_value = reg->s32_max_value = u32_val;
+ reg->u32_min_value = reg->u32_max_value = u32_val;
+ return;
+ }
+
+ top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits;
+ top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits;
+
+ if (top_smax_value != top_smin_value)
+ goto out;
+
+ /* find the s32_min and s32_min after sign extension */
+ if (size == 1) {
+ init_s32_max = (s8)reg->s32_max_value;
+ init_s32_min = (s8)reg->s32_min_value;
+ } else {
+ /* size == 2 */
+ init_s32_max = (s16)reg->s32_max_value;
+ init_s32_min = (s16)reg->s32_min_value;
+ }
+ s32_max = max(init_s32_max, init_s32_min);
+ s32_min = min(init_s32_max, init_s32_min);
+
+ if ((s32_min >= 0) == (s32_max >= 0)) {
+ reg->s32_min_value = s32_min;
+ reg->s32_max_value = s32_max;
+ reg->u32_min_value = (u32)s32_min;
+ reg->u32_max_value = (u32)s32_max;
+ return;
+ }
+
+out:
+ set_sext32_default_val(reg, size);
+}
+
static bool bpf_map_is_rdonly(const struct bpf_map *map)
{
/* A map is considered read-only if the following condition are true:
@@ -5334,7 +5999,8 @@ static bool bpf_map_is_rdonly(const struct bpf_map *map)
!bpf_map_write_active(map);
}
-static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
+static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
+ bool is_ldsx)
{
void *ptr;
u64 addr;
@@ -5347,13 +6013,13 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
switch (size) {
case sizeof(u8):
- *val = (u64)*(u8 *)ptr;
+ *val = is_ldsx ? (s64)*(s8 *)ptr : (u64)*(u8 *)ptr;
break;
case sizeof(u16):
- *val = (u64)*(u16 *)ptr;
+ *val = is_ldsx ? (s64)*(s16 *)ptr : (u64)*(u16 *)ptr;
break;
case sizeof(u32):
- *val = (u64)*(u32 *)ptr;
+ *val = is_ldsx ? (s64)*(s32 *)ptr : (u64)*(u32 *)ptr;
break;
case sizeof(u64):
*val = *(u64 *)ptr;
@@ -5537,7 +6203,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
* program allocated objects (which always have ref_obj_id > 0),
* but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
*/
- if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) {
verbose(env, "only read is supported\n");
return -EACCES;
}
@@ -5586,6 +6252,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
type_is_rcu_or_null(env, reg, field_name, btf_id)) {
/* __rcu tagged pointers can be NULL */
flag |= MEM_RCU | PTR_MAYBE_NULL;
+
+ /* We always trust them */
+ if (type_is_rcu_or_null(env, reg, field_name, btf_id) &&
+ flag & PTR_UNTRUSTED)
+ flag &= ~PTR_UNTRUSTED;
} else if (flag & (MEM_PERCPU | MEM_USER)) {
/* keep as-is */
} else {
@@ -5767,7 +6438,7 @@ static int check_stack_access_within_bounds(
*/
static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
int off, int bpf_size, enum bpf_access_type t,
- int value_regno, bool strict_alignment_once)
+ int value_regno, bool strict_alignment_once, bool is_ldsx)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
@@ -5828,7 +6499,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
u64 val = 0;
err = bpf_map_direct_read(map, map_off, size,
- &val);
+ &val, is_ldsx);
if (err)
return err;
@@ -5998,8 +6669,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
regs[value_regno].type == SCALAR_VALUE) {
- /* b/h/w load zero-extends, mark upper bits as known 0 */
- coerce_reg_to_size(&regs[value_regno], size);
+ if (!is_ldsx)
+ /* b/h/w load zero-extends, mark upper bits as known 0 */
+ coerce_reg_to_size(&regs[value_regno], size);
+ else
+ coerce_reg_to_size_sx(&regs[value_regno], size);
}
return err;
}
@@ -6091,17 +6765,17 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
* case to simulate the register fill.
*/
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ, -1, true);
+ BPF_SIZE(insn->code), BPF_READ, -1, true, false);
if (!err && load_reg >= 0)
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, load_reg,
- true);
+ true, false);
if (err)
return err;
/* Check whether we can write into the same memory. */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_WRITE, -1, true);
+ BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
if (err)
return err;
@@ -6347,7 +7021,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return zero_size_allowed ? 0 : -EACCES;
return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
- atype, -1, false);
+ atype, -1, false, false);
}
fallthrough;
@@ -6680,7 +7354,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
* type, and declare it as 'const struct bpf_dynptr *' in their prototype.
*/
static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
- enum bpf_arg_type arg_type)
+ enum bpf_arg_type arg_type, int clone_ref_obj_id)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
int err;
@@ -6719,12 +7393,12 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
/* we write BPF_DW bits (8 bytes) at a time */
for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
err = check_mem_access(env, insn_idx, regno,
- i, BPF_DW, BPF_WRITE, -1, false);
+ i, BPF_DW, BPF_WRITE, -1, false, false);
if (err)
return err;
}
- err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx);
+ err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id);
} else /* MEM_RDONLY and None case from above */ {
/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
@@ -6812,7 +7486,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
err = check_mem_access(env, insn_idx, regno,
- i, BPF_DW, BPF_WRITE, -1, false);
+ i, BPF_DW, BPF_WRITE, -1, false, false);
if (err)
return err;
}
@@ -7146,14 +7820,18 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
* ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
* but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
*
+ * ARG_PTR_TO_MEM is compatible with PTR_TO_MEM that is tagged with a dynptr type.
+ *
* Therefore we fold these flags depending on the arg_type before comparison.
*/
if (arg_type & MEM_RDONLY)
type &= ~MEM_RDONLY;
if (arg_type & PTR_MAYBE_NULL)
type &= ~PTR_MAYBE_NULL;
+ if (base_type(arg_type) == ARG_PTR_TO_MEM)
+ type &= ~DYNPTR_TYPE_FLAG_MASK;
- if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC)
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type))
type &= ~MEM_ALLOC;
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
@@ -7242,7 +7920,10 @@ found:
verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
return -EFAULT;
}
- /* Handled by helper specific checks */
+ if (meta->func_id == BPF_FUNC_kptr_xchg) {
+ if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+ return -EACCES;
+ }
break;
case PTR_TO_BTF_ID | MEM_PERCPU:
case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
@@ -7294,17 +7975,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
return 0;
- if ((type_is_ptr_alloc_obj(type) || type_is_non_owning_ref(type)) && reg->off) {
- if (reg_find_field_offset(reg, reg->off, BPF_GRAPH_NODE_OR_ROOT))
- return __check_ptr_off_reg(env, reg, regno, true);
-
- verbose(env, "R%d must have zero offset when passed to release func\n",
- regno);
- verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno,
- btf_type_name(reg->btf, reg->btf_id), reg->off);
- return -EINVAL;
- }
-
/* Doing check_ptr_off_reg check for the offset will catch this
* because fixed_off_ok is false, but checking here allows us
* to give the user a better error message.
@@ -7339,6 +8009,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
case PTR_TO_BTF_ID | PTR_TRUSTED:
case PTR_TO_BTF_ID | MEM_RCU:
case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
/* When referenced PTR_TO_BTF_ID is passed to release function,
* its fixed offset must be 0. In the other cases, fixed offset
* can be non-zero. This was already checked above. So pass
@@ -7634,7 +8305,7 @@ skip_type_check:
err = check_mem_size_reg(env, reg, regno, true, meta);
break;
case ARG_PTR_TO_DYNPTR:
- err = process_dynptr_func(env, regno, insn_idx, arg_type);
+ err = process_dynptr_func(env, regno, insn_idx, arg_type, 0);
if (err)
return err;
break;
@@ -8181,17 +8852,13 @@ static int set_callee_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee, int insn_idx);
-static bool is_callback_calling_kfunc(u32 btf_id);
-
static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx, int subprog,
set_callee_state_fn set_callee_state_cb)
{
struct bpf_verifier_state *state = env->cur_state;
- struct bpf_func_info_aux *func_info_aux;
struct bpf_func_state *caller, *callee;
int err;
- bool is_global = false;
if (state->curframe + 1 >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep\n",
@@ -8206,13 +8873,10 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EFAULT;
}
- func_info_aux = env->prog->aux->func_info_aux;
- if (func_info_aux)
- is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
err = btf_check_subprog_call(env, subprog, caller->regs);
if (err == -EFAULT)
return err;
- if (is_global) {
+ if (subprog_is_global(env, subprog)) {
if (err) {
verbose(env, "Caller passes invalid args into func#%d\n",
subprog);
@@ -8639,19 +9303,33 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
{
struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
- if (ret_type != RET_INTEGER ||
- (func_id != BPF_FUNC_get_stack &&
- func_id != BPF_FUNC_get_task_stack &&
- func_id != BPF_FUNC_probe_read_str &&
- func_id != BPF_FUNC_probe_read_kernel_str &&
- func_id != BPF_FUNC_probe_read_user_str))
+ if (ret_type != RET_INTEGER)
return;
- ret_reg->smax_value = meta->msize_max_value;
- ret_reg->s32_max_value = meta->msize_max_value;
- ret_reg->smin_value = -MAX_ERRNO;
- ret_reg->s32_min_value = -MAX_ERRNO;
- reg_bounds_sync(ret_reg);
+ switch (func_id) {
+ case BPF_FUNC_get_stack:
+ case BPF_FUNC_get_task_stack:
+ case BPF_FUNC_probe_read_str:
+ case BPF_FUNC_probe_read_kernel_str:
+ case BPF_FUNC_probe_read_user_str:
+ ret_reg->smax_value = meta->msize_max_value;
+ ret_reg->s32_max_value = meta->msize_max_value;
+ ret_reg->smin_value = -MAX_ERRNO;
+ ret_reg->s32_min_value = -MAX_ERRNO;
+ reg_bounds_sync(ret_reg);
+ break;
+ case BPF_FUNC_get_smp_processor_id:
+ ret_reg->umax_value = nr_cpu_ids - 1;
+ ret_reg->u32_max_value = nr_cpu_ids - 1;
+ ret_reg->smax_value = nr_cpu_ids - 1;
+ ret_reg->s32_max_value = nr_cpu_ids - 1;
+ ret_reg->umin_value = 0;
+ ret_reg->u32_min_value = 0;
+ ret_reg->smin_value = 0;
+ ret_reg->s32_min_value = 0;
+ reg_bounds_sync(ret_reg);
+ break;
+ }
}
static int
@@ -8945,7 +9623,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
*/
for (i = 0; i < meta.access_size; i++) {
err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B,
- BPF_WRITE, -1, false);
+ BPF_WRITE, -1, false, false);
if (err)
return err;
}
@@ -9327,11 +10005,6 @@ static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_ACQUIRE;
}
-static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
-{
- return meta->kfunc_flags & KF_RET_NULL;
-}
-
static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
{
return meta->kfunc_flags & KF_RELEASE;
@@ -9401,6 +10074,11 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
return __kfunc_param_match_suffix(btf, arg, "__szk");
}
+static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__opt");
+}
+
static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
{
return __kfunc_param_match_suffix(btf, arg, "__k");
@@ -9554,15 +10232,6 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
return true;
}
-
-static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
-#ifdef CONFIG_NET
- [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
- [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
- [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
-#endif
-};
-
enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_CTX,
KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
@@ -9598,6 +10267,7 @@ enum special_kfunc_type {
KF_bpf_dynptr_from_xdp,
KF_bpf_dynptr_slice,
KF_bpf_dynptr_slice_rdwr,
+ KF_bpf_dynptr_clone,
};
BTF_SET_START(special_kfunc_set)
@@ -9617,6 +10287,7 @@ BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
+BTF_ID(func, bpf_dynptr_clone)
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
@@ -9638,6 +10309,17 @@ BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
+BTF_ID(func, bpf_dynptr_clone)
+
+static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
+{
+ if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
+ meta->arg_owning_ref) {
+ return false;
+ }
+
+ return meta->kfunc_flags & KF_RET_NULL;
+}
static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -9794,6 +10476,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_verifier_state *state = env->cur_state;
+ struct btf_record *rec = reg_btf_record(reg);
if (!state->active_lock.ptr) {
verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
@@ -9806,6 +10489,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
}
reg->type |= NON_OWN_REF;
+ if (rec->refcount_off >= 0)
+ reg->type |= MEM_RCU;
+
return 0;
}
@@ -10116,6 +10802,8 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
node_off, btf_name_by_offset(reg->btf, t->name_off));
return -EINVAL;
}
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
if (node_off != field->graph_root.node_offset) {
verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n",
@@ -10326,13 +11014,14 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
if (meta->btf == btf_vmlinux &&
meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
- meta->arg_obj_drop.btf = reg->btf;
- meta->arg_obj_drop.btf_id = reg->btf_id;
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
}
break;
case KF_ARG_PTR_TO_DYNPTR:
{
enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
+ int clone_ref_obj_id = 0;
if (reg->type != PTR_TO_STACK &&
reg->type != CONST_PTR_TO_DYNPTR) {
@@ -10346,12 +11035,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (is_kfunc_arg_uninit(btf, &args[i]))
dynptr_arg_type |= MEM_UNINIT;
- if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb])
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
dynptr_arg_type |= DYNPTR_TYPE_SKB;
- else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp])
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) {
dynptr_arg_type |= DYNPTR_TYPE_XDP;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
+ (dynptr_arg_type & MEM_UNINIT)) {
+ enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
+
+ if (parent_type == BPF_DYNPTR_TYPE_INVALID) {
+ verbose(env, "verifier internal error: no dynptr type for parent of clone\n");
+ return -EFAULT;
+ }
- ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
+ dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type);
+ clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id;
+ if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) {
+ verbose(env, "verifier internal error: missing ref obj id for parent of clone\n");
+ return -EFAULT;
+ }
+ }
+
+ ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
if (ret < 0)
return ret;
@@ -10364,6 +11069,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
meta->initialized_dynptr.id = id;
meta->initialized_dynptr.type = dynptr_get_type(env, reg);
+ meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg);
}
break;
@@ -10467,13 +11173,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
break;
case KF_ARG_PTR_TO_MEM_SIZE:
{
+ struct bpf_reg_state *buff_reg = &regs[regno];
+ const struct btf_param *buff_arg = &args[i];
struct bpf_reg_state *size_reg = &regs[regno + 1];
const struct btf_param *size_arg = &args[i + 1];
- ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
- if (ret < 0) {
- verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
- return ret;
+ if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) {
+ ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
+ if (ret < 0) {
+ verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
+ return ret;
+ }
}
if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
@@ -10497,10 +11207,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
meta->subprogno = reg->subprogno;
break;
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
- if (!type_is_ptr_alloc_obj(reg->type) && !type_is_non_owning_ref(reg->type)) {
+ if (!type_is_ptr_alloc_obj(reg->type)) {
verbose(env, "arg#%d is neither owning or non-owning ref\n", i);
return -EINVAL;
}
+ if (!type_is_non_owning_ref(reg->type))
+ meta->arg_owning_ref = true;
rec = reg_btf_record(reg);
if (!rec) {
@@ -10512,12 +11224,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i);
return -EINVAL;
}
- if (rec->refcount_off >= 0) {
- verbose(env, "bpf_refcount_acquire calls are disabled for now\n");
- return -EINVAL;
- }
- meta->arg_refcount_acquire.btf = reg->btf;
- meta->arg_refcount_acquire.btf_id = reg->btf_id;
+
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
break;
}
}
@@ -10558,7 +11267,7 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env,
*kfunc_name = func_name;
func_proto = btf_type_by_id(desc_btf, func->type);
- kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id);
+ kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog);
if (!kfunc_flags) {
return -EACCES;
}
@@ -10620,6 +11329,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_func_state *state;
struct bpf_reg_state *reg;
+ if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
+ verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
+ return -EACCES;
+ }
+
if (rcu_lock) {
verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
return -EINVAL;
@@ -10663,6 +11377,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
insn_aux->insert_off = regs[BPF_REG_2].off;
+ insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
err = ref_convert_owning_non_owning(env, release_ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
@@ -10749,12 +11464,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
} else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
- regs[BPF_REG_0].btf = meta.arg_refcount_acquire.btf;
- regs[BPF_REG_0].btf_id = meta.arg_refcount_acquire.btf_id;
+ regs[BPF_REG_0].btf = meta.arg_btf;
+ regs[BPF_REG_0].btf_id = meta.arg_btf_id;
insn_aux->kptr_struct_meta =
- btf_find_struct_meta(meta.arg_refcount_acquire.btf,
- meta.arg_refcount_acquire.btf_id);
+ btf_find_struct_meta(meta.arg_btf,
+ meta.arg_btf_id);
} else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
struct btf_field *field = meta.arg_list_head.field;
@@ -10884,8 +11599,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
insn_aux->kptr_struct_meta =
- btf_find_struct_meta(meta.arg_obj_drop.btf,
- meta.arg_obj_drop.btf_id);
+ btf_find_struct_meta(meta.arg_btf,
+ meta.arg_btf_id);
}
}
}
@@ -12371,7 +13086,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
(insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
- BPF_CLASS(insn->code) == BPF_ALU64) {
+ (BPF_CLASS(insn->code) == BPF_ALU64 &&
+ BPF_SRC(insn->code) != BPF_TO_LE)) {
verbose(env, "BPF_END uses reserved fields\n");
return -EINVAL;
}
@@ -12396,11 +13112,24 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0 || insn->off != 0) {
+ if (insn->imm != 0) {
verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
+ if (BPF_CLASS(insn->code) == BPF_ALU) {
+ if (insn->off != 0 && insn->off != 8 && insn->off != 16) {
+ verbose(env, "BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else {
+ if (insn->off != 0 && insn->off != 8 && insn->off != 16 &&
+ insn->off != 32) {
+ verbose(env, "BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
/* check src operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
@@ -12420,20 +13149,46 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) == BPF_X) {
struct bpf_reg_state *src_reg = regs + insn->src_reg;
struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
+ bool need_id = src_reg->type == SCALAR_VALUE && !src_reg->id &&
+ !tnum_is_const(src_reg->var_off);
if (BPF_CLASS(insn->code) == BPF_ALU64) {
- /* case: R1 = R2
- * copy register state to dest reg
- */
- if (src_reg->type == SCALAR_VALUE && !src_reg->id)
- /* Assign src and dst registers the same ID
- * that will be used by find_equal_scalars()
- * to propagate min/max range.
+ if (insn->off == 0) {
+ /* case: R1 = R2
+ * copy register state to dest reg
*/
- src_reg->id = ++env->id_gen;
- copy_register_state(dst_reg, src_reg);
- dst_reg->live |= REG_LIVE_WRITTEN;
- dst_reg->subreg_def = DEF_NOT_SUBREG;
+ if (need_id)
+ /* Assign src and dst registers the same ID
+ * that will be used by find_equal_scalars()
+ * to propagate min/max range.
+ */
+ src_reg->id = ++env->id_gen;
+ copy_register_state(dst_reg, src_reg);
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
+ } else {
+ /* case: R1 = (s8, s16 s32)R2 */
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose(env,
+ "R%d sign-extension part of pointer\n",
+ insn->src_reg);
+ return -EACCES;
+ } else if (src_reg->type == SCALAR_VALUE) {
+ bool no_sext;
+
+ no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+ if (no_sext && need_id)
+ src_reg->id = ++env->id_gen;
+ copy_register_state(dst_reg, src_reg);
+ if (!no_sext)
+ dst_reg->id = 0;
+ coerce_reg_to_size_sx(dst_reg, insn->off >> 3);
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
+ } else {
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ }
+ }
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
@@ -12442,19 +13197,33 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
- bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX;
-
- if (is_src_reg_u32 && !src_reg->id)
- src_reg->id = ++env->id_gen;
- copy_register_state(dst_reg, src_reg);
- /* Make sure ID is cleared if src_reg is not in u32 range otherwise
- * dst_reg min/max could be incorrectly
- * propagated into src_reg by find_equal_scalars()
- */
- if (!is_src_reg_u32)
- dst_reg->id = 0;
- dst_reg->live |= REG_LIVE_WRITTEN;
- dst_reg->subreg_def = env->insn_idx + 1;
+ if (insn->off == 0) {
+ bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX;
+
+ if (is_src_reg_u32 && need_id)
+ src_reg->id = ++env->id_gen;
+ copy_register_state(dst_reg, src_reg);
+ /* Make sure ID is cleared if src_reg is not in u32
+ * range otherwise dst_reg min/max could be incorrectly
+ * propagated into src_reg by find_equal_scalars()
+ */
+ if (!is_src_reg_u32)
+ dst_reg->id = 0;
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = env->insn_idx + 1;
+ } else {
+ /* case: W1 = (s8, s16)W2 */
+ bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+
+ if (no_sext && need_id)
+ src_reg->id = ++env->id_gen;
+ copy_register_state(dst_reg, src_reg);
+ if (!no_sext)
+ dst_reg->id = 0;
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = env->insn_idx + 1;
+ coerce_subreg_to_size_sx(dst_reg, insn->off >> 3);
+ }
} else {
mark_reg_unknown(env, regs,
insn->dst_reg);
@@ -12485,7 +13254,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else { /* all other ALU ops: and, sub, xor, add, ... */
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0 || insn->off != 0) {
+ if (insn->imm != 0 || insn->off > 1 ||
+ (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
@@ -12494,7 +13264,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
} else {
- if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ if (insn->src_reg != BPF_REG_0 || insn->off > 1 ||
+ (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
@@ -12776,7 +13547,7 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
bool is_jmp32)
{
if (__is_pointer_value(false, reg)) {
- if (!reg_type_not_null(reg->type))
+ if (!reg_not_null(reg))
return -1;
/* If pointer is valid tests against zero will fail so we can
@@ -13279,6 +14050,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EINVAL;
}
+ /* check src2 operand */
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg = &regs[insn->dst_reg];
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -13290,12 +14067,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (err)
return err;
- if (is_pointer_value(env, insn->src_reg)) {
+ src_reg = &regs[insn->src_reg];
+ if (!(reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg)) &&
+ is_pointer_value(env, insn->src_reg)) {
verbose(env, "R%d pointer comparison prohibited\n",
insn->src_reg);
return -EACCES;
}
- src_reg = &regs[insn->src_reg];
} else {
if (insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -13303,12 +14081,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
}
- /* check src2 operand */
- err = check_reg_arg(env, insn->dst_reg, SRC_OP);
- if (err)
- return err;
-
- dst_reg = &regs[insn->dst_reg];
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
if (BPF_SRC(insn->code) == BPF_K) {
@@ -14038,7 +14810,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
static int visit_insn(int t, struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
- int ret;
+ int ret, off;
if (bpf_pseudo_func(insn))
return visit_func_call_insn(t, insns, env, true);
@@ -14086,14 +14858,19 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
if (BPF_SRC(insn->code) != BPF_K)
return -EINVAL;
+ if (BPF_CLASS(insn->code) == BPF_JMP)
+ off = insn->off;
+ else
+ off = insn->imm;
+
/* unconditional jump with single edge */
- ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env,
+ ret = push_insn(t, t + off + 1, FALLTHROUGH, env,
true);
if (ret)
return ret;
- mark_prune_point(env, t + insn->off + 1);
- mark_jmp_point(env, t + insn->off + 1);
+ mark_prune_point(env, t + off + 1);
+ mark_jmp_point(env, t + off + 1);
return ret;
@@ -14600,8 +15377,9 @@ static bool range_within(struct bpf_reg_state *old,
* So we look through our idmap to see if this old id has been seen before. If
* so, we require the new id to match; otherwise, we add the id pair to the map.
*/
-static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
+static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
{
+ struct bpf_id_pair *map = idmap->map;
unsigned int i;
/* either both IDs should be set or both should be zero */
@@ -14612,20 +15390,34 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
return true;
for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
- if (!idmap[i].old) {
+ if (!map[i].old) {
/* Reached an empty slot; haven't seen this id before */
- idmap[i].old = old_id;
- idmap[i].cur = cur_id;
+ map[i].old = old_id;
+ map[i].cur = cur_id;
return true;
}
- if (idmap[i].old == old_id)
- return idmap[i].cur == cur_id;
+ if (map[i].old == old_id)
+ return map[i].cur == cur_id;
+ if (map[i].cur == cur_id)
+ return false;
}
/* We ran out of idmap slots, which should be impossible */
WARN_ON_ONCE(1);
return false;
}
+/* Similar to check_ids(), but allocate a unique temporary ID
+ * for 'old_id' or 'cur_id' of zero.
+ * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid.
+ */
+static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
+{
+ old_id = old_id ? old_id : ++idmap->tmp_id_gen;
+ cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen;
+
+ return check_ids(old_id, cur_id, idmap);
+}
+
static void clean_func_state(struct bpf_verifier_env *env,
struct bpf_func_state *st)
{
@@ -14724,7 +15516,7 @@ next:
static bool regs_exact(const struct bpf_reg_state *rold,
const struct bpf_reg_state *rcur,
- struct bpf_id_pair *idmap)
+ struct bpf_idmap *idmap)
{
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
check_ids(rold->id, rcur->id, idmap) &&
@@ -14733,7 +15525,7 @@ static bool regs_exact(const struct bpf_reg_state *rold,
/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
- struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
+ struct bpf_reg_state *rcur, struct bpf_idmap *idmap)
{
if (!(rold->live & REG_LIVE_READ))
/* explored state didn't use this */
@@ -14770,15 +15562,42 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
switch (base_type(rold->type)) {
case SCALAR_VALUE:
- if (regs_exact(rold, rcur, idmap))
- return true;
- if (env->explore_alu_limits)
- return false;
+ if (env->explore_alu_limits) {
+ /* explore_alu_limits disables tnum_in() and range_within()
+ * logic and requires everything to be strict
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ check_scalar_ids(rold->id, rcur->id, idmap);
+ }
if (!rold->precise)
return true;
- /* new val must satisfy old val knowledge */
+ /* Why check_ids() for scalar registers?
+ *
+ * Consider the following BPF code:
+ * 1: r6 = ... unbound scalar, ID=a ...
+ * 2: r7 = ... unbound scalar, ID=b ...
+ * 3: if (r6 > r7) goto +1
+ * 4: r6 = r7
+ * 5: if (r6 > X) goto ...
+ * 6: ... memory operation using r7 ...
+ *
+ * First verification path is [1-6]:
+ * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
+ * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark
+ * r7 <= X, because r6 and r7 share same id.
+ * Next verification path is [1-4, 6].
+ *
+ * Instruction (6) would be reached in two states:
+ * I. r6{.id=b}, r7{.id=b} via path 1-6;
+ * II. r6{.id=a}, r7{.id=b} via path 1-4, 6.
+ *
+ * Use check_ids() to distinguish these states.
+ * ---
+ * Also verify that new value satisfies old value range knowledge.
+ */
return range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_scalar_ids(rold->id, rcur->id, idmap);
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
case PTR_TO_MEM:
@@ -14824,7 +15643,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
}
static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, struct bpf_id_pair *idmap)
+ struct bpf_func_state *cur, struct bpf_idmap *idmap)
{
int i, spi;
@@ -14927,7 +15746,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
}
static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
- struct bpf_id_pair *idmap)
+ struct bpf_idmap *idmap)
{
int i;
@@ -14975,13 +15794,13 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
for (i = 0; i < MAX_BPF_REG; i++)
if (!regsafe(env, &old->regs[i], &cur->regs[i],
- env->idmap_scratch))
+ &env->idmap_scratch))
return false;
- if (!stacksafe(env, old, cur, env->idmap_scratch))
+ if (!stacksafe(env, old, cur, &env->idmap_scratch))
return false;
- if (!refsafe(old, cur, env->idmap_scratch))
+ if (!refsafe(old, cur, &env->idmap_scratch))
return false;
return true;
@@ -14996,7 +15815,8 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->curframe != cur->curframe)
return false;
- memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
+ env->idmap_scratch.tmp_id_gen = env->id_gen;
+ memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
/* Verification state from speculative execution simulation
* must never prune a non-speculative execution one.
@@ -15014,7 +15834,7 @@ static bool states_equal(struct bpf_verifier_env *env,
return false;
if (old->active_lock.id &&
- !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch))
+ !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch))
return false;
if (old->active_rcu_lock != cur->active_rcu_lock)
@@ -15121,20 +15941,25 @@ static int propagate_precision(struct bpf_verifier_env *env,
struct bpf_reg_state *state_reg;
struct bpf_func_state *state;
int i, err = 0, fr;
+ bool first;
for (fr = old->curframe; fr >= 0; fr--) {
state = old->frame[fr];
state_reg = state->regs;
+ first = true;
for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
if (state_reg->type != SCALAR_VALUE ||
!state_reg->precise ||
!(state_reg->live & REG_LIVE_READ))
continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "frame %d: propagating r%d\n", fr, i);
- err = mark_chain_precision_frame(env, fr, i);
- if (err < 0)
- return err;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ if (first)
+ verbose(env, "frame %d: propagating r%d", fr, i);
+ else
+ verbose(env, ",r%d", i);
+ }
+ bt_set_frame_reg(&env->bt, fr, i);
+ first = false;
}
for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
@@ -15145,14 +15970,24 @@ static int propagate_precision(struct bpf_verifier_env *env,
!state_reg->precise ||
!(state_reg->live & REG_LIVE_READ))
continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "frame %d: propagating fp%d\n",
- fr, (-i - 1) * BPF_REG_SIZE);
- err = mark_chain_precision_stack_frame(env, fr, i);
- if (err < 0)
- return err;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ if (first)
+ verbose(env, "frame %d: propagating fp%d",
+ fr, (-i - 1) * BPF_REG_SIZE);
+ else
+ verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE);
+ }
+ bt_set_frame_slot(&env->bt, fr, i);
+ first = false;
}
+ if (!first)
+ verbose(env, "\n");
}
+
+ err = mark_chain_precision_batch(env);
+ if (err < 0)
+ return err;
+
return 0;
}
@@ -15582,7 +16417,7 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ
* Have to support a use case when one path through
* the program yields TRUSTED pointer while another
* is UNTRUSTED. Fallback to UNTRUSTED to generate
- * BPF_PROBE_MEM.
+ * BPF_PROBE_MEM/BPF_PROBE_MEMSX.
*/
*prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
} else {
@@ -15723,7 +16558,8 @@ static int do_check(struct bpf_verifier_env *env)
*/
err = check_mem_access(env, env->insn_idx, insn->src_reg,
insn->off, BPF_SIZE(insn->code),
- BPF_READ, insn->dst_reg, false);
+ BPF_READ, insn->dst_reg, false,
+ BPF_MODE(insn->code) == BPF_MEMSX);
if (err)
return err;
@@ -15760,7 +16596,7 @@ static int do_check(struct bpf_verifier_env *env)
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
insn->off, BPF_SIZE(insn->code),
- BPF_WRITE, insn->src_reg, false);
+ BPF_WRITE, insn->src_reg, false, false);
if (err)
return err;
@@ -15785,7 +16621,7 @@ static int do_check(struct bpf_verifier_env *env)
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
insn->off, BPF_SIZE(insn->code),
- BPF_WRITE, -1, false);
+ BPF_WRITE, -1, false, false);
if (err)
return err;
@@ -15830,15 +16666,18 @@ static int do_check(struct bpf_verifier_env *env)
mark_reg_scratched(env, BPF_REG_0);
} else if (opcode == BPF_JA) {
if (BPF_SRC(insn->code) != BPF_K ||
- insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0 ||
- class == BPF_JMP32) {
+ (class == BPF_JMP && insn->imm != 0) ||
+ (class == BPF_JMP32 && insn->off != 0)) {
verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
- env->insn_idx += insn->off + 1;
+ if (class == BPF_JMP)
+ env->insn_idx += insn->off + 1;
+ else
+ env->insn_idx += insn->imm + 1;
continue;
} else if (opcode == BPF_EXIT) {
@@ -15857,7 +16696,8 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_rcu_lock) {
+ if (env->cur_state->active_rcu_lock &&
+ !in_rbtree_lock_required_cb(env)) {
verbose(env, "bpf_rcu_read_unlock is missing\n");
return -EINVAL;
}
@@ -16137,11 +16977,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
return -EINVAL;
}
-
- if (prog->aux->sleepable) {
- verbose(env, "sleepable progs cannot use bpf_spin_lock yet\n");
- return -EINVAL;
- }
}
if (btf_record_has_field(map->record, BPF_TIMER)) {
@@ -16213,7 +17048,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
- (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
+ ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) ||
+ insn->imm != 0)) {
verbose(env, "BPF_LDX uses reserved fields\n");
return -EINVAL;
}
@@ -16684,13 +17520,13 @@ static bool insn_is_cond_jump(u8 code)
{
u8 op;
+ op = BPF_OP(code);
if (BPF_CLASS(code) == BPF_JMP32)
- return true;
+ return op != BPF_JA;
if (BPF_CLASS(code) != BPF_JMP)
return false;
- op = BPF_OP(code);
return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
}
@@ -16907,11 +17743,15 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
bpf_convert_ctx_access_t convert_ctx_access;
+ u8 mode;
if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) {
type = BPF_READ;
} else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
@@ -16970,8 +17810,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
*/
case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
if (type == BPF_READ) {
- insn->code = BPF_LDX | BPF_PROBE_MEM |
- BPF_SIZE((insn)->code);
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ insn->code = BPF_LDX | BPF_PROBE_MEM |
+ BPF_SIZE((insn)->code);
+ else
+ insn->code = BPF_LDX | BPF_PROBE_MEMSX |
+ BPF_SIZE((insn)->code);
env->prog->aux->num_exentries++;
}
continue;
@@ -16981,6 +17825,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
size = BPF_LDST_BYTES(insn);
+ mode = BPF_MODE(insn->code);
/* If the read access is a narrower load of the field,
* convert to a 4/8-byte load, to minimum program type specific
@@ -17040,6 +17885,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
(1ULL << size * 8) - 1);
}
}
+ if (mode == BPF_MEMSX)
+ insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X,
+ insn->dst_reg, insn->dst_reg,
+ size * 8, 0);
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
@@ -17159,7 +18008,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
- BPF_MODE(insn->code) == BPF_PROBE_MEM)
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
num_exentries++;
}
func[i]->aux->num_exentries = num_exentries;
@@ -17431,6 +18281,13 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
+ !kptr_struct_meta) {
+ verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
insn_buf[0] = addr[0];
insn_buf[1] = addr[1];
insn_buf[2] = *insn;
@@ -17438,6 +18295,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
} else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
int struct_meta_reg = BPF_REG_3;
int node_offset_reg = BPF_REG_4;
@@ -17447,6 +18305,12 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
node_offset_reg = BPF_REG_5;
}
+ if (!kptr_struct_meta) {
+ verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
__fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
node_offset_reg, insn, insn_buf, cnt);
} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
@@ -18617,7 +19481,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
* in the fmodret id set with the KF_SLEEPABLE flag.
*/
else {
- u32 *flags = btf_kfunc_is_modify_return(btf, btf_id);
+ u32 *flags = btf_kfunc_is_modify_return(btf, btf_id,
+ prog);
if (flags && (*flags & KF_SLEEPABLE))
ret = 0;
@@ -18645,7 +19510,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = -EINVAL;
- if (btf_kfunc_is_modify_return(btf, btf_id) ||
+ if (btf_kfunc_is_modify_return(btf, btf_id, prog) ||
!check_attach_modify_return(addr, tname))
ret = 0;
if (ret) {
@@ -18812,6 +19677,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (!env)
return -ENOMEM;
+ env->bt.env = env;
+
len = (*prog)->len;
env->insn_aux_data =
vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
diff --git a/kernel/capability.c b/kernel/capability.c
index 3e058f41df32..dac4df77e376 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -112,7 +112,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
int ret;
if (pid && (pid != task_pid_vnr(current))) {
- struct task_struct *target;
+ const struct task_struct *target;
rcu_read_lock();
@@ -467,6 +467,7 @@ EXPORT_SYMBOL(file_ns_capable);
/**
* privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
* @ns: The user namespace in question
+ * @idmap: idmap of the mount @inode was found from
* @inode: The inode in question
*
* Return true if the inode uid and gid are within the namespace.
@@ -481,6 +482,7 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
+ * @idmap: idmap of the mount @inode was found from
* @inode: The inode in question
* @cap: The capability in question
*
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 367b0a42ada9..c56071f150f2 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -220,8 +220,6 @@ static inline void get_css_set(struct css_set *cset)
bool cgroup_ssid_enabled(int ssid);
bool cgroup_on_dfl(const struct cgroup *cgrp);
-bool cgroup_is_thread_root(struct cgroup *cgrp);
-bool cgroup_is_threaded(struct cgroup *cgrp);
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 5407241dbb45..c487ffef6652 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -431,7 +431,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
if (l->list[mid] == pid) {
index = mid;
break;
- } else if (l->list[mid] <= pid)
+ } else if (l->list[mid] < pid)
index = mid + 1;
else
end = mid;
@@ -563,7 +563,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
if (!cgrp)
return -ENODEV;
spin_lock(&release_agent_path_lock);
- strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+ strscpy(cgrp->root->release_agent_path, strstrip(buf),
sizeof(cgrp->root->release_agent_path));
spin_unlock(&release_agent_path_lock);
cgroup_kn_unlock(of->kn);
@@ -797,7 +797,7 @@ void cgroup1_release_agent(struct work_struct *work)
goto out_free;
spin_lock(&release_agent_path_lock);
- strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
+ strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
spin_unlock(&release_agent_path_lock);
if (!agentbuf[0])
goto out_free;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4d42f0cbc11e..1fb7f562289d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -57,6 +57,7 @@
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/deadline.h>
#include <linux/psi.h>
#include <net/sock.h>
@@ -312,8 +313,6 @@ bool cgroup_ssid_enabled(int ssid)
* masks of ancestors.
*
* - blkcg: blk-throttle becomes properly hierarchical.
- *
- * - debug: disallowed on the default hierarchy.
*/
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
@@ -356,7 +355,7 @@ static bool cgroup_has_tasks(struct cgroup *cgrp)
return cgrp->nr_populated_csets;
}
-bool cgroup_is_threaded(struct cgroup *cgrp)
+static bool cgroup_is_threaded(struct cgroup *cgrp)
{
return cgrp->dom_cgrp != cgrp;
}
@@ -395,7 +394,7 @@ static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
}
/* is @cgrp root of a threaded subtree? */
-bool cgroup_is_thread_root(struct cgroup *cgrp)
+static bool cgroup_is_thread_root(struct cgroup *cgrp)
{
/* thread root should be a domain */
if (cgroup_is_threaded(cgrp))
@@ -494,28 +493,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
}
/**
- * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
- * @cgrp: the cgroup of interest
- * @ss: the subsystem of interest
- *
- * Find and get @cgrp's css associated with @ss. If the css doesn't exist
- * or is offline, %NULL is returned.
- */
-static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
- struct cgroup_subsys *ss)
-{
- struct cgroup_subsys_state *css;
-
- rcu_read_lock();
- css = cgroup_css(cgrp, ss);
- if (css && !css_tryget_online(css))
- css = NULL;
- rcu_read_unlock();
-
- return css;
-}
-
-/**
* cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -618,7 +595,7 @@ EXPORT_SYMBOL_GPL(cgroup_get_e_css);
static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
- css_get(&cgrp->self);
+ cgroup_get(cgrp);
}
/**
@@ -680,7 +657,7 @@ EXPORT_SYMBOL_GPL(of_css);
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
- * Should be called under cgroup_[tree_]mutex.
+ * Should be called under cgroup_mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
@@ -690,21 +667,6 @@ EXPORT_SYMBOL_GPL(of_css);
else
/**
- * for_each_e_css - iterate all effective css's of a cgroup
- * @css: the iteration cursor
- * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
- * @cgrp: the target cgroup to iterate css's of
- *
- * Should be called under cgroup_[tree_]mutex.
- */
-#define for_each_e_css(css, ssid, cgrp) \
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
- if (!((css) = cgroup_e_css_by_mask(cgrp, \
- cgroup_subsys[(ssid)]))) \
- ; \
- else
-
-/**
* do_each_subsys_mask - filter for_each_subsys with a bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -945,7 +907,7 @@ static void css_set_move_task(struct task_struct *task,
#define CSS_SET_HASH_BITS 7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state **css)
{
unsigned long key = 0UL;
struct cgroup_subsys *ss;
@@ -1086,7 +1048,7 @@ static bool compare_css_sets(struct css_set *cset,
*/
static struct css_set *find_existing_css_set(struct css_set *old_cset,
struct cgroup *cgrp,
- struct cgroup_subsys_state *template[])
+ struct cgroup_subsys_state **template)
{
struct cgroup_root *root = cgrp->root;
struct cgroup_subsys *ss;
@@ -1752,7 +1714,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
struct cftype *cfts, *failed_cfts;
int ret;
- if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
+ if (css->flags & CSS_VISIBLE)
return 0;
if (!css->ss) {
@@ -2393,45 +2355,6 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
EXPORT_SYMBOL_GPL(cgroup_path_ns);
/**
- * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
- * @task: target task
- * @buf: the buffer to write the path into
- * @buflen: the length of the buffer
- *
- * Determine @task's cgroup on the first (the one with the lowest non-zero
- * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
- * function grabs cgroup_mutex and shouldn't be used inside locks used by
- * cgroup controller callbacks.
- *
- * Return value is the same as kernfs_path().
- */
-int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
-{
- struct cgroup_root *root;
- struct cgroup *cgrp;
- int hierarchy_id = 1;
- int ret;
-
- cgroup_lock();
- spin_lock_irq(&css_set_lock);
-
- root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
-
- if (root) {
- cgrp = task_cgroup_from_root(task, root);
- ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
- } else {
- /* if no hierarchy exists, everyone is in "/" */
- ret = strscpy(buf, "/", buflen);
- }
-
- spin_unlock_irq(&css_set_lock);
- cgroup_unlock();
- return ret;
-}
-EXPORT_SYMBOL_GPL(task_cgroup_path);
-
-/**
* cgroup_attach_lock - Lock for ->attach()
* @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
*
@@ -2554,7 +2477,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
/*
* This function may be called both before and
- * after cgroup_taskset_migrate(). The two cases
+ * after cgroup_migrate_execute(). The two cases
* can be distinguished by looking at whether @cset
* has its ->mg_dst_cset set.
*/
@@ -2885,9 +2808,9 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
struct task_struct *task;
/*
- * Prevent freeing of tasks while we take a snapshot. Tasks that are
- * already PF_EXITING could be freed from underneath us unless we
- * take an rcu_read_lock.
+ * The following thread iteration should be inside an RCU critical
+ * section to prevent tasks from being freed while taking the snapshot.
+ * spin_lock_irq() implies RCU critical section here.
*/
spin_lock_irq(&css_set_lock);
task = leader;
@@ -3709,9 +3632,32 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
return 0;
}
-static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
- struct cgroup *cgrp, int ssid)
+#ifdef CONFIG_CGROUP_SCHED
+/**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css associated with @ss. If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+ css = cgroup_css(cgrp, ss);
+ if (css && !css_tryget_online(css))
+ css = NULL;
+ rcu_read_unlock();
+
+ return css;
+}
+
+static int cgroup_extra_stat_show(struct seq_file *seq, int ssid)
{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
struct cgroup_subsys *ss = cgroup_subsys[ssid];
struct cgroup_subsys_state *css;
int ret;
@@ -3728,14 +3674,44 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
return ret;
}
+static int cgroup_local_stat_show(struct seq_file *seq,
+ struct cgroup *cgrp, int ssid)
+{
+ struct cgroup_subsys *ss = cgroup_subsys[ssid];
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (!ss->css_local_stat_show)
+ return 0;
+
+ css = cgroup_tryget_css(cgrp, ss);
+ if (!css)
+ return 0;
+
+ ret = ss->css_local_stat_show(seq, css);
+ css_put(css);
+ return ret;
+}
+#endif
+
static int cpu_stat_show(struct seq_file *seq, void *v)
{
- struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
int ret = 0;
cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
- ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+ ret = cgroup_extra_stat_show(seq, cpu_cgrp_id);
+#endif
+ return ret;
+}
+
+static int cpu_local_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+ int ret = 0;
+
+#ifdef CONFIG_CGROUP_SCHED
+ ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id);
#endif
return ret;
}
@@ -3785,7 +3761,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
}
psi = cgroup_psi(cgrp);
- new = psi_trigger_create(psi, buf, res, of->file);
+ new = psi_trigger_create(psi, buf, res, of->file, of);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
@@ -3891,6 +3867,14 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
+static int cgroup_pressure_open(struct kernfs_open_file *of)
+{
+ if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ return 0;
+}
+
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
struct cgroup_file_ctx *ctx = of->priv;
@@ -4367,14 +4351,13 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
return ret;
}
-static int cgroup_rm_cftypes_locked(struct cftype *cfts)
+static void cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
- return 0;
}
/**
@@ -4390,8 +4373,6 @@ static int cgroup_rm_cftypes_locked(struct cftype *cfts)
*/
int cgroup_rm_cftypes(struct cftype *cfts)
{
- int ret;
-
if (!cfts || cfts[0].name[0] == '\0')
return 0;
@@ -4399,9 +4380,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)
return -ENOENT;
cgroup_lock();
- ret = cgroup_rm_cftypes_locked(cfts);
+ cgroup_rm_cftypes_locked(cfts);
cgroup_unlock();
- return ret;
+ return 0;
}
/**
@@ -5282,6 +5263,10 @@ static struct cftype cgroup_base_files[] = {
.name = "cpu.stat",
.seq_show = cpu_stat_show,
},
+ {
+ .name = "cpu.stat.local",
+ .seq_show = cpu_local_stat_show,
+ },
{ } /* terminate */
};
@@ -5290,6 +5275,7 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "io.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
+ .open = cgroup_pressure_open,
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5298,6 +5284,7 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "memory.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
+ .open = cgroup_pressure_open,
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5306,6 +5293,7 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "cpu.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
+ .open = cgroup_pressure_open,
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5315,6 +5303,7 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "irq.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+ .open = cgroup_pressure_open,
.seq_show = cgroup_irq_pressure_show,
.write = cgroup_irq_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5346,7 +5335,7 @@ static struct cftype cgroup_psi_files[] = {
* RCU callback.
*
* 4. After the grace period, the css can be freed. Implemented in
- * css_free_work_fn().
+ * css_free_rwork_fn().
*
* It is actually hairier because both step 2 and 4 require process context
* and thus involve punting to css->destroy_work adding two additional
@@ -5590,8 +5579,7 @@ err_free_css:
/*
* The returned cgroup is fully initialized including its control mask, but
- * it isn't associated with its kernfs_node and doesn't have the control
- * mask applied.
+ * it doesn't have the control mask applied.
*/
static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
umode_t mode)
@@ -5917,7 +5905,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/*
* Mark @cgrp and the associated csets dead. The former prevents
* further task migration and child creation by disabling
- * cgroup_lock_live_group(). The latter makes the csets ignored by
+ * cgroup_kn_lock_live(). The latter makes the csets ignored by
* the migration path.
*/
cgrp->self.flags &= ~CSS_ONLINE;
@@ -5939,7 +5927,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
parent->nr_threaded_children--;
spin_lock_irq(&css_set_lock);
- for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+ for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
tcgrp->nr_descendants--;
tcgrp->nr_dying_descendants++;
/*
@@ -6132,8 +6120,8 @@ int __init cgroup_init(void)
continue;
if (cgroup1_ssid_disabled(ssid))
- printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
- ss->name);
+ pr_info("Disabling %s control group subsystem in v1 mounts\n",
+ ss->name);
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
@@ -6696,6 +6684,9 @@ void cgroup_exit(struct task_struct *tsk)
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;
+ if (dl_task(tsk))
+ dec_dl_tasks_cs(tsk);
+
WARN_ON_ONCE(cgroup_task_frozen(tsk));
if (unlikely(!(tsk->flags & PF_KTHREAD) &&
test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e4ca2dd2b764..58ec88efa4f8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -25,45 +25,22 @@
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/file.h>
-#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
-#include <linux/kmod.h>
-#include <linux/kthread.h>
-#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
-#include <linux/mount.h>
-#include <linux/fs_context.h>
-#include <linux/namei.h>
-#include <linux/pagemap.h>
-#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
-#include <linux/seq_file.h>
#include <linux/security.h>
-#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/time.h>
-#include <linux/time64.h>
-#include <linux/backing-dev.h>
-#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
-#include <linux/uaccess.h>
-#include <linux/atomic.h>
-#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
@@ -193,6 +170,14 @@ struct cpuset {
int use_parent_ecpus;
int child_ecpus_count;
+ /*
+ * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
+ * know when to rebuild associated root domain bandwidth information.
+ */
+ int nr_deadline_tasks;
+ int nr_migrate_dl_tasks;
+ u64 sum_migrate_dl_bw;
+
/* Invalid partition error code, not lock protected */
enum prs_errcode prs_err;
@@ -245,6 +230,20 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
return css_cs(cs->css.parent);
}
+void inc_dl_tasks_cs(struct task_struct *p)
+{
+ struct cpuset *cs = task_cs(p);
+
+ cs->nr_deadline_tasks++;
+}
+
+void dec_dl_tasks_cs(struct task_struct *p)
+{
+ struct cpuset *cs = task_cs(p);
+
+ cs->nr_deadline_tasks--;
+}
+
/* bits in struct cpuset flags field */
typedef enum {
CS_ONLINE,
@@ -366,22 +365,23 @@ static struct cpuset top_cpuset = {
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
/*
- * There are two global locks guarding cpuset structures - cpuset_rwsem and
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
* callback_lock. We also require taking task_lock() when dereferencing a
* task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment. The cpuset code uses only cpuset_rwsem write lock. Other
- * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
- * prevent change to cpuset structures.
+ * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems
+ * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
+ * structures. Note that cpuset_mutex needs to be a mutex as it is used in
+ * paths that rely on priority inheritance (e.g. scheduler - on RT) for
+ * correctness.
*
* A task must hold both locks to modify cpusets. If a task holds
- * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
- * is the only task able to also acquire callback_lock and be able to
- * modify cpusets. It can perform various checks on the cpuset structure
- * first, knowing nothing will change. It can also allocate memory while
- * just holding cpuset_rwsem. While it is performing these checks, various
- * callback routines can briefly acquire callback_lock to query cpusets.
- * Once it is ready to make the changes, it takes callback_lock, blocking
- * everyone else.
+ * cpuset_mutex, it blocks others, ensuring that it is the only task able to
+ * also acquire callback_lock and be able to modify cpusets. It can perform
+ * various checks on the cpuset structure first, knowing nothing will change.
+ * It can also allocate memory while just holding cpuset_mutex. While it is
+ * performing these checks, various callback routines can briefly acquire
+ * callback_lock to query cpusets. Once it is ready to make the changes, it
+ * takes callback_lock, blocking everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_lock, as that would risk double tripping on callback_lock
@@ -403,16 +403,16 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
-DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
+static DEFINE_MUTEX(cpuset_mutex);
-void cpuset_read_lock(void)
+void cpuset_lock(void)
{
- percpu_down_read(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
}
-void cpuset_read_unlock(void)
+void cpuset_unlock(void)
{
- percpu_up_read(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
static DEFINE_SPINLOCK(callback_lock);
@@ -496,7 +496,7 @@ static inline bool partition_is_populated(struct cpuset *cs,
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
- * Call with callback_lock or cpuset_rwsem held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_cpus(struct task_struct *tsk,
struct cpumask *pmask)
@@ -538,7 +538,7 @@ out_unlock:
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_lock or cpuset_rwsem held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
@@ -550,7 +550,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Call with callback_lock or cpuset_rwsem held. The check can be skipped
+ * Call with callback_lock or cpuset_mutex held. The check can be skipped
* if on default hierarchy.
*/
static void cpuset_update_task_spread_flags(struct cpuset *cs,
@@ -575,7 +575,7 @@ static void cpuset_update_task_spread_flags(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_rwsem.
+ * are only set if the other's are set. Call holding cpuset_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -713,7 +713,7 @@ out:
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cpuset_rwsem held.
+ * cpuset_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -829,7 +829,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
-/* Must be called with cpuset_rwsem held. */
+/* Must be called with cpuset_mutex held. */
static inline int nr_cpusets(void)
{
/* jump label reference count + the top-level cpuset */
@@ -855,7 +855,7 @@ static inline int nr_cpusets(void)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cpuset_rwsem held.
+ * Must be called with cpuset_mutex held.
*
* The three key local variables below are:
* cp - cpuset pointer, used (together with pos_css) to perform a
@@ -1066,11 +1066,14 @@ done:
return ndoms;
}
-static void update_tasks_root_domain(struct cpuset *cs)
+static void dl_update_tasks_root_domain(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
+ if (cs->nr_deadline_tasks == 0)
+ return;
+
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
@@ -1079,12 +1082,12 @@ static void update_tasks_root_domain(struct cpuset *cs)
css_task_iter_end(&it);
}
-static void rebuild_root_domains(void)
+static void dl_rebuild_rd_accounting(void)
{
struct cpuset *cs = NULL;
struct cgroup_subsys_state *pos_css;
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
lockdep_assert_cpus_held();
lockdep_assert_held(&sched_domains_mutex);
@@ -1107,7 +1110,7 @@ static void rebuild_root_domains(void)
rcu_read_unlock();
- update_tasks_root_domain(cs);
+ dl_update_tasks_root_domain(cs);
rcu_read_lock();
css_put(&cs->css);
@@ -1121,7 +1124,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
{
mutex_lock(&sched_domains_mutex);
partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
- rebuild_root_domains();
+ dl_rebuild_rd_accounting();
mutex_unlock(&sched_domains_mutex);
}
@@ -1134,7 +1137,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
- * Call with cpuset_rwsem held. Takes cpus_read_lock().
+ * Call with cpuset_mutex held. Takes cpus_read_lock().
*/
static void rebuild_sched_domains_locked(void)
{
@@ -1145,7 +1148,7 @@ static void rebuild_sched_domains_locked(void)
int ndoms;
lockdep_assert_cpus_held();
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
/*
* If we have raced with CPU hotplug, return early to avoid
@@ -1196,9 +1199,9 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void)
{
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
rebuild_sched_domains_locked();
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
}
@@ -1208,7 +1211,7 @@ void rebuild_sched_domains(void)
* @new_cpus: the temp variable for the new effective_cpus mask
*
* Iterate through each task of @cs updating its cpus_allowed to the
- * effective cpuset's. As this function is called with cpuset_rwsem held,
+ * effective cpuset's. As this function is called with cpuset_mutex held,
* cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
* is used instead of effective_cpus to make sure all offline CPUs are also
* included as hotplug code won't update cpumasks for tasks in top_cpuset.
@@ -1227,7 +1230,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
/*
* Percpu kthreads in top_cpuset are ignored
*/
- if ((task->flags & PF_KTHREAD) && kthread_is_per_cpu(task))
+ if (kthread_is_per_cpu(task))
continue;
cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus);
} else {
@@ -1252,7 +1255,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
static void compute_effective_cpumask(struct cpumask *new_cpus,
struct cpuset *cs, struct cpuset *parent)
{
- if (parent->nr_subparts_cpus) {
+ if (parent->nr_subparts_cpus && is_partition_valid(cs)) {
cpumask_or(new_cpus, parent->effective_cpus,
parent->subparts_cpus);
cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
@@ -1274,6 +1277,52 @@ enum subparts_cmd {
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on);
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ struct tmpmasks *tmp);
+
+/*
+ * Update partition exclusive flag
+ *
+ * Return: 0 if successful, an error code otherwise
+ */
+static int update_partition_exclusive(struct cpuset *cs, int new_prs)
+{
+ bool exclusive = (new_prs > 0);
+
+ if (exclusive && !is_cpu_exclusive(cs)) {
+ if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
+ return PERR_NOTEXCL;
+ } else if (!exclusive && is_cpu_exclusive(cs)) {
+ /* Turning off CS_CPU_EXCLUSIVE will not return error */
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ }
+ return 0;
+}
+
+/*
+ * Update partition load balance flag and/or rebuild sched domain
+ *
+ * Changing load balance flag will automatically call
+ * rebuild_sched_domains_locked().
+ */
+static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
+{
+ int new_prs = cs->partition_root_state;
+ bool new_lb = (new_prs != PRS_ISOLATED);
+ bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
+
+ if (new_lb != !!is_sched_load_balance(cs)) {
+ rebuild_domains = true;
+ if (new_lb)
+ set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ else
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ }
+
+ if (rebuild_domains)
+ rebuild_sched_domains_locked();
+}
+
/**
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
* @cs: The cpuset that requests change in partition root state
@@ -1322,7 +1371,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
int old_prs, new_prs;
int part_error = PERR_NONE; /* Partition error? */
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
/*
* The parent must be a partition root.
@@ -1333,8 +1382,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
return is_partition_invalid(parent)
? PERR_INVPARENT : PERR_NOTPART;
}
- if ((newmask && cpumask_empty(newmask)) ||
- (!newmask && cpumask_empty(cs->cpus_allowed)))
+ if (!newmask && cpumask_empty(cs->cpus_allowed))
return PERR_CPUSEMPTY;
/*
@@ -1401,10 +1449,15 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
adding = cpumask_andnot(tmp->addmask, tmp->addmask,
parent->subparts_cpus);
/*
+ * Empty cpumask is not allowed
+ */
+ if (cpumask_empty(newmask)) {
+ part_error = PERR_CPUSEMPTY;
+ /*
* Make partition invalid if parent's effective_cpus could
* become empty and there are tasks in the parent.
*/
- if (adding &&
+ } else if (adding &&
cpumask_subset(parent->effective_cpus, tmp->addmask) &&
!cpumask_intersects(tmp->delmask, cpu_active_mask) &&
partition_is_populated(parent, cs)) {
@@ -1477,14 +1530,13 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
/*
* Transitioning between invalid to valid or vice versa may require
- * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE.
+ * changing CS_CPU_EXCLUSIVE.
*/
if (old_prs != new_prs) {
- if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) &&
- (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0))
- return PERR_NOTEXCL;
- if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs))
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ int err = update_partition_exclusive(cs, new_prs);
+
+ if (err)
+ return err;
}
/*
@@ -1517,24 +1569,34 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
spin_unlock_irq(&callback_lock);
- if (adding || deleting)
+ if (adding || deleting) {
update_tasks_cpumask(parent, tmp->addmask);
+ if (parent->child_ecpus_count)
+ update_sibling_cpumasks(parent, cs, tmp);
+ }
/*
- * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
- * rebuild_sched_domains_locked() may be called.
+ * For partcmd_update without newmask, it is being called from
+ * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
+ * Update the load balance flag and scheduling domain if
+ * cpus_read_trylock() is successful.
*/
- if (old_prs != new_prs) {
- if (old_prs == PRS_ISOLATED)
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
- else if (new_prs == PRS_ISOLATED)
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
+ update_partition_sd_lb(cs, old_prs);
+ cpus_read_unlock();
}
+
notify_partition_change(cs, old_prs);
return 0;
}
/*
+ * update_cpumasks_hier() flags
+ */
+#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */
+#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */
+
+/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
@@ -1545,10 +1607,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
*
* On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
*
- * Called with cpuset_rwsem held
+ * Called with cpuset_mutex held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
- bool force)
+ int flags)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@@ -1585,11 +1647,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
}
/*
- * Skip the whole subtree if the cpumask remains the same
- * and has no partition root state and force flag not set.
+ * Skip the whole subtree if
+ * 1) the cpumask remains the same,
+ * 2) has no partition root state,
+ * 3) HIER_CHECKALL flag not set, and
+ * 4) for v2 load balance state same as its parent.
*/
- if (!cp->partition_root_state && !force &&
- cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
+ if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
+ (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
@@ -1673,6 +1740,20 @@ update_parent_subparts:
update_tasks_cpumask(cp, tmp->new_cpus);
/*
+ * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
+ * from parent if current cpuset isn't a valid partition root
+ * and their load balance states differ.
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !is_partition_valid(cp) &&
+ (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
+ if (is_sched_load_balance(parent))
+ set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+ else
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+ }
+
+ /*
* On legacy hierarchy, if the effective cpumask of any non-
* empty cpuset is changed, we need to rebuild sched domains.
* On default hierarchy, the cpuset needs to be a partition
@@ -1689,7 +1770,7 @@ update_parent_subparts:
}
rcu_read_unlock();
- if (need_rebuild_sched_domains)
+ if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD))
rebuild_sched_domains_locked();
}
@@ -1705,7 +1786,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
/*
* Check all its siblings and call update_cpumasks_hier()
@@ -1713,7 +1794,9 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
* to use the right effective_cpus value.
*
* The update_cpumasks_hier() function may sleep. So we have to
- * release the RCU read lock before calling it.
+ * release the RCU read lock before calling it. HIER_NO_SD_REBUILD
+ * flag is used to suppress rebuild of sched domains as the callers
+ * will take care of that.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
@@ -1725,7 +1808,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
rcu_read_unlock();
- update_cpumasks_hier(sibling, tmp, false);
+ update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
rcu_read_lock();
css_put(&sibling->css);
}
@@ -1744,6 +1827,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
int retval;
struct tmpmasks tmp;
bool invalidate = false;
+ int old_prs = cs->partition_root_state;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -1771,18 +1855,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
-#ifdef CONFIG_CPUMASK_OFFSTACK
- /*
- * Use the cpumasks in trialcs for tmpmasks when they are pointers
- * to allocated cpumasks.
- *
- * Note that update_parent_subparts_cpumask() uses only addmask &
- * delmask, but not new_cpus.
- */
- tmp.addmask = trialcs->subparts_cpus;
- tmp.delmask = trialcs->effective_cpus;
- tmp.new_cpus = NULL;
-#endif
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
retval = validate_change(cs, trialcs);
@@ -1811,7 +1885,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
retval = 0;
}
if (retval < 0)
- return retval;
+ goto out_free;
if (cs->partition_root_state) {
if (invalidate)
@@ -1846,13 +1920,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
}
spin_unlock_irq(&callback_lock);
-#ifdef CONFIG_CPUMASK_OFFSTACK
- /* Now trialcs->cpus_allowed is available */
- tmp.new_cpus = trialcs->cpus_allowed;
-#endif
-
/* effective_cpus will be updated here */
- update_cpumasks_hier(cs, &tmp, false);
+ update_cpumasks_hier(cs, &tmp, 0);
if (cs->partition_root_state) {
struct cpuset *parent = parent_cs(cs);
@@ -1863,7 +1932,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
*/
if (parent->child_ecpus_count)
update_sibling_cpumasks(parent, cs, &tmp);
+
+ /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */
+ update_partition_sd_lb(cs, old_prs);
}
+out_free:
+ free_cpumasks(NULL, &tmp);
return 0;
}
@@ -1955,12 +2029,12 @@ static void *cpuset_being_rebound;
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its mems_allowed to the
- * effective cpuset's. As this function is called with cpuset_rwsem held,
+ * effective cpuset's. As this function is called with cpuset_mutex held,
* cpuset membership stays stable.
*/
static void update_tasks_nodemask(struct cpuset *cs)
{
- static nodemask_t newmems; /* protected by cpuset_rwsem */
+ static nodemask_t newmems; /* protected by cpuset_mutex */
struct css_task_iter it;
struct task_struct *task;
@@ -1973,7 +2047,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cpuset_rwsem, we know that no other rebind effort
+ * the global cpuset_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -2019,7 +2093,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
*
* On legacy hierarchy, effective_mems will be the same with mems_allowed.
*
- * Called with cpuset_rwsem held
+ * Called with cpuset_mutex held
*/
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
@@ -2072,7 +2146,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_rwsem held. May take callback_lock during call.
+ * Call with cpuset_mutex held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_lock, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -2164,7 +2238,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
* @cs: the cpuset in which each task's spread flags needs to be changed
*
* Iterate through each task of @cs updating its spread flags. As this
- * function is called with cpuset_rwsem held, cpuset membership stays
+ * function is called with cpuset_mutex held, cpuset membership stays
* stable.
*/
static void update_tasks_flags(struct cpuset *cs)
@@ -2184,7 +2258,7 @@ static void update_tasks_flags(struct cpuset *cs)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cpuset_rwsem held.
+ * Call with cpuset_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -2234,12 +2308,11 @@ out:
* @new_prs: new partition root state
* Return: 0 if successful, != 0 if error
*
- * Call with cpuset_rwsem held.
+ * Call with cpuset_mutex held.
*/
static int update_prstate(struct cpuset *cs, int new_prs)
{
int err = PERR_NONE, old_prs = cs->partition_root_state;
- bool sched_domain_rebuilt = false;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
@@ -2258,45 +2331,26 @@ static int update_prstate(struct cpuset *cs, int new_prs)
if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
+ err = update_partition_exclusive(cs, new_prs);
+ if (err)
+ goto out;
+
if (!old_prs) {
/*
- * Turning on partition root requires setting the
- * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
- * cannot be empty.
+ * cpus_allowed cannot be empty.
*/
if (cpumask_empty(cs->cpus_allowed)) {
err = PERR_CPUSEMPTY;
goto out;
}
- err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
- if (err) {
- err = PERR_NOTEXCL;
- goto out;
- }
-
err = update_parent_subparts_cpumask(cs, partcmd_enable,
NULL, &tmpmask);
- if (err) {
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
- goto out;
- }
-
- if (new_prs == PRS_ISOLATED) {
- /*
- * Disable the load balance flag should not return an
- * error unless the system is running out of memory.
- */
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
- sched_domain_rebuilt = true;
- }
} else if (old_prs && new_prs) {
/*
* A change in load balance state only, no change in cpumasks.
*/
- update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED));
- sched_domain_rebuilt = true;
- goto out; /* Sched domain is rebuilt in update_flag() */
+ ;
} else {
/*
* Switching back to member is always allowed even if it
@@ -2315,40 +2369,31 @@ static int update_prstate(struct cpuset *cs, int new_prs)
compute_effective_cpumask(cs->effective_cpus, cs, parent);
spin_unlock_irq(&callback_lock);
}
-
- /* Turning off CS_CPU_EXCLUSIVE will not return error */
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
-
- if (!is_sched_load_balance(cs)) {
- /* Make sure load balance is on */
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
- sched_domain_rebuilt = true;
- }
}
-
- update_tasks_cpumask(parent, tmpmask.new_cpus);
-
- if (parent->child_ecpus_count)
- update_sibling_cpumasks(parent, cs, &tmpmask);
-
- if (!sched_domain_rebuilt)
- rebuild_sched_domains_locked();
out:
/*
- * Make partition invalid if an error happen
+ * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
+ * happens.
*/
- if (err)
+ if (err) {
new_prs = -new_prs;
+ update_partition_exclusive(cs, new_prs);
+ }
+
spin_lock_irq(&callback_lock);
cs->partition_root_state = new_prs;
WRITE_ONCE(cs->prs_err, err);
spin_unlock_irq(&callback_lock);
+
/*
* Update child cpusets, if present.
* Force update if switching back to member.
*/
if (!list_empty(&cs->css.children))
- update_cpumasks_hier(cs, &tmpmask, !new_prs);
+ update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
+
+ /* Update sched domains and load balance flag */
+ update_partition_sd_lb(cs, old_prs);
notify_partition_change(cs, old_prs);
free_cpumasks(NULL, &tmpmask);
@@ -2472,41 +2517,86 @@ static int cpuset_can_attach_check(struct cpuset *cs)
return 0;
}
-/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
+static void reset_migrate_dl_data(struct cpuset *cs)
+{
+ cs->nr_migrate_dl_tasks = 0;
+ cs->sum_migrate_dl_bw = 0;
+}
+
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
- struct cpuset *cs;
+ struct cpuset *cs, *oldcs;
struct task_struct *task;
+ bool cpus_updated, mems_updated;
int ret;
/* used later by cpuset_attach() */
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
+ oldcs = cpuset_attach_old_cs;
cs = css_cs(css);
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
/* Check to see if task is allowed in the cpuset */
ret = cpuset_can_attach_check(cs);
if (ret)
goto out_unlock;
+ cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
cgroup_taskset_for_each(task, css, tset) {
- ret = task_can_attach(task, cs->effective_cpus);
+ ret = task_can_attach(task);
if (ret)
goto out_unlock;
- ret = security_task_setscheduler(task);
- if (ret)
+
+ /*
+ * Skip rights over task check in v2 when nothing changes,
+ * migration permission derives from hierarchy ownership in
+ * cgroup_procs_write_permission()).
+ */
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (cpus_updated || mems_updated)) {
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+ }
+
+ if (dl_task(task)) {
+ cs->nr_migrate_dl_tasks++;
+ cs->sum_migrate_dl_bw += task->dl.dl_bw;
+ }
+ }
+
+ if (!cs->nr_migrate_dl_tasks)
+ goto out_success;
+
+ if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
+ int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
+
+ if (unlikely(cpu >= nr_cpu_ids)) {
+ reset_migrate_dl_data(cs);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
+ if (ret) {
+ reset_migrate_dl_data(cs);
goto out_unlock;
+ }
}
+out_success:
/*
* Mark attach is in progress. This makes validate_change() fail
* changes which zero cpus/mems_allowed.
*/
cs->attach_in_progress++;
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
return ret;
}
@@ -2518,15 +2608,23 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
cs->attach_in_progress--;
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
- percpu_up_write(&cpuset_rwsem);
+
+ if (cs->nr_migrate_dl_tasks) {
+ int cpu = cpumask_any(cs->effective_cpus);
+
+ dl_bw_free(cpu, cs->sum_migrate_dl_bw);
+ reset_migrate_dl_data(cs);
+ }
+
+ mutex_unlock(&cpuset_mutex);
}
/*
- * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task()
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
@@ -2535,7 +2633,7 @@ static nodemask_t cpuset_attach_nodemask_to;
static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
{
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
if (cs != &top_cpuset)
guarantee_online_cpus(task, cpus_attach);
@@ -2565,7 +2663,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cs = css_cs(css);
lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
cpus_updated = !cpumask_equal(cs->effective_cpus,
oldcs->effective_cpus);
mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
@@ -2622,11 +2720,17 @@ static void cpuset_attach(struct cgroup_taskset *tset)
out:
cs->old_mems_allowed = cpuset_attach_nodemask_to;
+ if (cs->nr_migrate_dl_tasks) {
+ cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
+ oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
+ reset_migrate_dl_data(cs);
+ }
+
cs->attach_in_progress--;
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
/* The various types of files and directories in a cpuset file system */
@@ -2658,7 +2762,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
int retval = 0;
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
goto out_unlock;
@@ -2694,7 +2798,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
return retval;
}
@@ -2707,7 +2811,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
int retval = -ENODEV;
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2720,7 +2824,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
return retval;
}
@@ -2753,7 +2857,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
* operation like this one can lead to a deadlock through kernfs
* active_ref protection. Let's break the protection. Losing the
* protection is okay as we check whether @cs is online after
- * grabbing cpuset_rwsem anyway. This only happens on the legacy
+ * grabbing cpuset_mutex anyway. This only happens on the legacy
* hierarchies.
*/
css_get(&cs->css);
@@ -2761,7 +2865,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
flush_work(&cpuset_hotplug_work);
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2785,7 +2889,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_cpuset(trialcs);
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
@@ -2933,13 +3037,13 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
css_get(&cs->css);
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
retval = update_prstate(cs, val);
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
css_put(&cs->css);
return retval ?: nbytes;
@@ -3156,7 +3260,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
return 0;
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
set_bit(CS_ONLINE, &cs->flags);
if (is_spread_page(parent))
@@ -3173,6 +3277,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
}
+
+ /*
+ * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !is_sched_load_balance(parent))
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+
spin_unlock_irq(&callback_lock);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
@@ -3207,7 +3319,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
return 0;
}
@@ -3228,7 +3340,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
struct cpuset *cs = css_cs(css);
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (is_partition_valid(cs))
update_prstate(cs, 0);
@@ -3247,7 +3359,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
}
@@ -3260,7 +3372,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
@@ -3273,7 +3385,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
}
spin_unlock_irq(&callback_lock);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
/*
@@ -3294,14 +3406,14 @@ static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
return 0;
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
/* Check to see if task is allowed in the cpuset */
ret = cpuset_can_attach_check(cs);
if (ret)
goto out_unlock;
- ret = task_can_attach(task, cs->effective_cpus);
+ ret = task_can_attach(task);
if (ret)
goto out_unlock;
@@ -3315,7 +3427,7 @@ static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
*/
cs->attach_in_progress++;
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
return ret;
}
@@ -3331,11 +3443,11 @@ static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
if (same_cs)
return;
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
cs->attach_in_progress--;
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
/*
@@ -3363,7 +3475,7 @@ static void cpuset_fork(struct task_struct *task)
}
/* CLONE_INTO_CGROUP */
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cpuset_attach_task(cs, task);
@@ -3371,7 +3483,7 @@ static void cpuset_fork(struct task_struct *task)
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
struct cgroup_subsys cpuset_cgrp_subsys = {
@@ -3472,17 +3584,16 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
- percpu_up_write(&cpuset_rwsem);
-
/*
* Move tasks to the nearest ancestor with execution resources,
* This is full cgroup operation which will also call back into
* cpuset. Should be done outside any lock.
*/
- if (is_empty)
+ if (is_empty) {
+ mutex_unlock(&cpuset_mutex);
remove_tasks_in_empty_cpuset(cs);
-
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
+ }
}
static void
@@ -3533,14 +3644,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
/*
* We have raced with task attaching. We wait until attaching
* is finished, so we won't attach a task to an empty cpuset.
*/
if (cs->attach_in_progress) {
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
goto retry;
}
@@ -3637,11 +3748,12 @@ update_tasks:
cpus_updated, mems_updated);
unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ * @work: unused
*
* This function is called after either CPU or memory configuration has
* changed and updates cpuset accordingly. The top_cpuset is always
@@ -3667,7 +3779,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
if (on_dfl && !alloc_cpumasks(NULL, &tmp))
ptmp = &tmp;
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
/* fetch the available cpus/mems and find out which changed how */
cpumask_copy(&new_cpus, cpu_active_mask);
@@ -3724,7 +3836,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
update_tasks_nodemask(&top_cpuset);
}
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
/* if cpus or mems changed, we need to propagate to descendants */
if (cpus_updated || mems_updated) {
@@ -4024,6 +4136,7 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask)
/**
* cpuset_spread_node() - On which node to begin search for a page
+ * @rotor: round robin rotor
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
* tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -4155,7 +4268,7 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cpuset_rwsem, keeping cpuset_attach() from changing it
+ * and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index fe3e8a0eb7ed..79a3717a5803 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -14,7 +14,7 @@
#include <linux/misc_cgroup.h>
#define MAX_STR "max"
-#define MAX_NUM ULONG_MAX
+#define MAX_NUM U64_MAX
/* Miscellaneous res name, keep it in sync with enum misc_res_type */
static const char *const misc_res_name[] = {
@@ -37,7 +37,7 @@ static struct misc_cg root_cg;
* more than the actual capacity. We are using Limits resource distribution
* model of cgroup for miscellaneous controller.
*/
-static unsigned long misc_res_capacity[MISC_CG_RES_TYPES];
+static u64 misc_res_capacity[MISC_CG_RES_TYPES];
/**
* parent_misc() - Get the parent of the passed misc cgroup.
@@ -74,10 +74,10 @@ static inline bool valid_type(enum misc_res_type type)
* Context: Any context.
* Return: Current total usage of the resource.
*/
-unsigned long misc_cg_res_total_usage(enum misc_res_type type)
+u64 misc_cg_res_total_usage(enum misc_res_type type)
{
if (valid_type(type))
- return atomic_long_read(&root_cg.res[type].usage);
+ return atomic64_read(&root_cg.res[type].usage);
return 0;
}
@@ -95,7 +95,7 @@ EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
* * %0 - Successfully registered the capacity.
* * %-EINVAL - If @type is invalid.
*/
-int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity)
+int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
{
if (!valid_type(type))
return -EINVAL;
@@ -114,9 +114,9 @@ EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
* Context: Any context.
*/
static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
- unsigned long amount)
+ u64 amount)
{
- WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage),
+ WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage),
"misc cgroup resource %s became less than 0",
misc_res_name[type]);
}
@@ -137,13 +137,12 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
* * -EBUSY - If max limit will be crossed or total usage will be more than the
* capacity.
*/
-int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
- unsigned long amount)
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
{
struct misc_cg *i, *j;
int ret;
struct misc_res *res;
- int new_usage;
+ u64 new_usage;
if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
return -EINVAL;
@@ -154,7 +153,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
for (i = cg; i; i = parent_misc(i)) {
res = &i->res[type];
- new_usage = atomic_long_add_return(amount, &res->usage);
+ new_usage = atomic64_add_return(amount, &res->usage);
if (new_usage > READ_ONCE(res->max) ||
new_usage > READ_ONCE(misc_res_capacity[type])) {
ret = -EBUSY;
@@ -165,7 +164,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
err_charge:
for (j = i; j; j = parent_misc(j)) {
- atomic_long_inc(&j->res[type].events);
+ atomic64_inc(&j->res[type].events);
cgroup_file_notify(&j->events_file);
}
@@ -184,8 +183,7 @@ EXPORT_SYMBOL_GPL(misc_cg_try_charge);
*
* Context: Any context.
*/
-void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
- unsigned long amount)
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
{
struct misc_cg *i;
@@ -209,7 +207,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
{
int i;
struct misc_cg *cg = css_misc(seq_css(sf));
- unsigned long max;
+ u64 max;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
if (READ_ONCE(misc_res_capacity[i])) {
@@ -217,7 +215,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
if (max == MAX_NUM)
seq_printf(sf, "%s max\n", misc_res_name[i]);
else
- seq_printf(sf, "%s %lu\n", misc_res_name[i],
+ seq_printf(sf, "%s %llu\n", misc_res_name[i],
max);
}
}
@@ -241,13 +239,13 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
* Return:
* * >= 0 - Number of bytes processed in the input.
* * -EINVAL - If buf is not valid.
- * * -ERANGE - If number is bigger than the unsigned long capacity.
+ * * -ERANGE - If number is bigger than the u64 capacity.
*/
static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct misc_cg *cg;
- unsigned long max;
+ u64 max;
int ret = 0, i;
enum misc_res_type type = MISC_CG_RES_TYPES;
char *token;
@@ -271,7 +269,7 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
if (!strcmp(MAX_STR, buf)) {
max = MAX_NUM;
} else {
- ret = kstrtoul(buf, 0, &max);
+ ret = kstrtou64(buf, 0, &max);
if (ret)
return ret;
}
@@ -297,13 +295,13 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
static int misc_cg_current_show(struct seq_file *sf, void *v)
{
int i;
- unsigned long usage;
+ u64 usage;
struct misc_cg *cg = css_misc(seq_css(sf));
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
- usage = atomic_long_read(&cg->res[i].usage);
+ usage = atomic64_read(&cg->res[i].usage);
if (READ_ONCE(misc_res_capacity[i]) || usage)
- seq_printf(sf, "%s %lu\n", misc_res_name[i], usage);
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], usage);
}
return 0;
@@ -322,12 +320,12 @@ static int misc_cg_current_show(struct seq_file *sf, void *v)
static int misc_cg_capacity_show(struct seq_file *sf, void *v)
{
int i;
- unsigned long cap;
+ u64 cap;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
cap = READ_ONCE(misc_res_capacity[i]);
if (cap)
- seq_printf(sf, "%s %lu\n", misc_res_name[i], cap);
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], cap);
}
return 0;
@@ -336,12 +334,13 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
static int misc_events_show(struct seq_file *sf, void *v)
{
struct misc_cg *cg = css_misc(seq_css(sf));
- unsigned long events, i;
+ u64 events;
+ int i;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
- events = atomic_long_read(&cg->res[i].events);
+ events = atomic64_read(&cg->res[i].events);
if (READ_ONCE(misc_res_capacity[i]) || events)
- seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
+ seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events);
}
return 0;
}
@@ -357,7 +356,6 @@ static struct cftype misc_cg_files[] = {
{
.name = "current",
.seq_show = misc_cg_current_show,
- .flags = CFTYPE_NOT_ON_ROOT,
},
{
.name = "capacity",
@@ -398,7 +396,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css)
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
WRITE_ONCE(cg->res[i].max, MAX_NUM);
- atomic_long_set(&cg->res[i].usage, 0);
+ atomic64_set(&cg->res[i].usage, 0);
}
return &cg->css;
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 0d5c29879a50..144a464e45c6 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -149,9 +149,3 @@ const struct proc_ns_operations cgroupns_operations = {
.install = cgroupns_install,
.owner = cgroupns_owner,
};
-
-static __init int cgroup_namespaces_init(void)
-{
- return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 3135406608c7..ef5878fb2005 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -197,6 +197,7 @@ uncharge_cg_locked(struct rdma_cgroup *cg,
/**
* rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
* @device: pointer to rdmacg device
* @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
* stop uncharging
@@ -221,6 +222,7 @@ static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
/**
* rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
* @device: pointer to rdmacg device
* @index: index of the resource to uncharge in cgroup in given resource pool
*/
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 9c4c55228567..d80d7a608141 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -171,7 +171,7 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
__diag_pop();
/* see cgroup_rstat_flush() */
-static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
+static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
int cpu;
@@ -207,9 +207,8 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
}
raw_spin_unlock_irqrestore(cpu_lock, flags);
- /* if @may_sleep, play nice and yield if necessary */
- if (may_sleep && (need_resched() ||
- spin_needbreak(&cgroup_rstat_lock))) {
+ /* play nice and yield if necessary */
+ if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
spin_unlock_irq(&cgroup_rstat_lock);
if (!cond_resched())
cpu_relax();
@@ -236,26 +235,11 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
might_sleep();
spin_lock_irq(&cgroup_rstat_lock);
- cgroup_rstat_flush_locked(cgrp, true);
+ cgroup_rstat_flush_locked(cgrp);
spin_unlock_irq(&cgroup_rstat_lock);
}
/**
- * cgroup_rstat_flush_atomic- atomic version of cgroup_rstat_flush()
- * @cgrp: target cgroup
- *
- * This function can be called from any context.
- */
-void cgroup_rstat_flush_atomic(struct cgroup *cgrp)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&cgroup_rstat_lock, flags);
- cgroup_rstat_flush_locked(cgrp, false);
- spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
-}
-
-/**
* cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
* @cgrp: target cgroup
*
@@ -269,7 +253,7 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
{
might_sleep();
spin_lock_irq(&cgroup_rstat_lock);
- cgroup_rstat_flush_locked(cgrp, true);
+ cgroup_rstat_flush_locked(cgrp);
}
/**
@@ -360,6 +344,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_rstat_cpu *prstatc;
struct cgroup_base_stat delta;
unsigned seq;
@@ -373,17 +358,24 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
delta = rstatc->bstat;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
- /* propagate percpu delta to global */
+ /* propagate per-cpu delta to cgroup and per-cpu global statistics */
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
+ cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);
- /* propagate global delta to parent (unless that's root) */
+ /* propagate cgroup and per-cpu global delta to parent (unless that's root) */
if (cgroup_parent(parent)) {
delta = cgrp->bstat;
cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
cgroup_base_stat_add(&parent->bstat, &delta);
cgroup_base_stat_add(&cgrp->last_bstat, &delta);
+
+ delta = rstatc->subtree_bstat;
+ prstatc = cgroup_rstat_cpu(parent, cpu);
+ cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
+ cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
+ cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
}
}
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index e8db8d938661..4722b998a324 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -1,3 +1,5 @@
+# Help: Debugging for CI systems and finding regressions
+#
# The config is based on running daily CI for enterprise Linux distros to
# seek regressions on linux-next builds on different bare-metal and virtual
# platforms. It can be used for example,
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
index 208481d91090..d0877063d925 100644
--- a/kernel/configs/kvm_guest.config
+++ b/kernel/configs/kvm_guest.config
@@ -1,3 +1,4 @@
+# Help: Bootable as a KVM guest
CONFIG_NET=y
CONFIG_NET_CORE=y
CONFIG_NETDEVICES=y
diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config
index 81ff07863576..ebfdc3d8aa9a 100644
--- a/kernel/configs/nopm.config
+++ b/kernel/configs/nopm.config
@@ -1,3 +1,5 @@
+# Help: Disable Power Management
+
CONFIG_PM=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config
index 38a7c5362c9c..2c6e001a7284 100644
--- a/kernel/configs/rust.config
+++ b/kernel/configs/rust.config
@@ -1 +1,2 @@
+# Help: Enable Rust
CONFIG_RUST=y
diff --git a/kernel/configs/tiny-base.config b/kernel/configs/tiny-base.config
index 2f0e6bf6db2c..ffb9dcafca26 100644
--- a/kernel/configs/tiny-base.config
+++ b/kernel/configs/tiny-base.config
@@ -1 +1 @@
-CONFIG_EMBEDDED=y
+CONFIG_EXPERT=y
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index 144b2bd86b14..00009f7d0835 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -6,6 +6,5 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_KERNEL_XZ=y
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
-# CONFIG_SLAB is not set
CONFIG_SLUB=y
CONFIG_SLUB_TINY=y
diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config
index 6fac5b405334..35f48671b8d5 100644
--- a/kernel/configs/x86_debug.config
+++ b/kernel/configs/x86_debug.config
@@ -1,3 +1,4 @@
+# Help: Debugging options for tip tree testing
CONFIG_X86_DEBUG_FPU=y
CONFIG_LOCK_STAT=y
CONFIG_DEBUG_VM=y
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
index 436f806aa1ed..6878b9a49be8 100644
--- a/kernel/configs/xen.config
+++ b/kernel/configs/xen.config
@@ -1,3 +1,5 @@
+# Help: Bootable as a Xen guest
+#
# global stuff - these enable us to allow some
# of the not so generic stuff below for xen
CONFIG_PARAVIRT=y
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index a09f1c19336a..6ef0b35fc28c 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -510,7 +510,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
* In this we case we don't care about any concurrency/ordering.
*/
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
- arch_atomic_set(&ct->state, state);
+ raw_atomic_set(&ct->state, state);
} else {
/*
* Even if context tracking is disabled on this CPU, because it's outside
@@ -527,7 +527,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
*/
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
/* Tracking for vtime only, no concurrent RCU EQS accounting */
- arch_atomic_set(&ct->state, state);
+ raw_atomic_set(&ct->state, state);
} else {
/*
* Tracking for vtime and RCU EQS. Make sure we don't race
@@ -535,7 +535,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
* RCU only requires RCU_DYNTICKS_IDX increments to be fully
* ordered.
*/
- arch_atomic_add(state, &ct->state);
+ raw_atomic_add(state, &ct->state);
}
}
}
@@ -630,12 +630,12 @@ void noinstr __ct_user_exit(enum ctx_state state)
* In this we case we don't care about any concurrency/ordering.
*/
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
- arch_atomic_set(&ct->state, CONTEXT_KERNEL);
+ raw_atomic_set(&ct->state, CONTEXT_KERNEL);
} else {
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
/* Tracking for vtime only, no concurrent RCU EQS accounting */
- arch_atomic_set(&ct->state, CONTEXT_KERNEL);
+ raw_atomic_set(&ct->state, CONTEXT_KERNEL);
} else {
/*
* Tracking for vtime and RCU EQS. Make sure we don't race
@@ -643,7 +643,7 @@ void noinstr __ct_user_exit(enum ctx_state state)
* RCU only requires RCU_DYNTICKS_IDX increments to be fully
* ordered.
*/
- arch_atomic_sub(state, &ct->state);
+ raw_atomic_sub(state, &ct->state);
}
}
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f4a2c5845bcb..6de7c6bb74ee 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,7 @@
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
+#include <linux/delay.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
@@ -59,6 +60,7 @@
* @last: For multi-instance rollback, remember how far we got
* @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
+ * @ap_sync_state: State for AP synchronization
* @done_up: Signal completion to the issuer of the task for cpu-up
* @done_down: Signal completion to the issuer of the task for cpu-down
*/
@@ -76,6 +78,7 @@ struct cpuhp_cpu_state {
struct hlist_node *last;
enum cpuhp_state cb_state;
int result;
+ atomic_t ap_sync_state;
struct completion done_up;
struct completion done_down;
#endif
@@ -276,6 +279,182 @@ static bool cpuhp_is_atomic_state(enum cpuhp_state state)
return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
}
+/* Synchronization state management */
+enum cpuhp_sync_state {
+ SYNC_STATE_DEAD,
+ SYNC_STATE_KICKED,
+ SYNC_STATE_SHOULD_DIE,
+ SYNC_STATE_ALIVE,
+ SYNC_STATE_SHOULD_ONLINE,
+ SYNC_STATE_ONLINE,
+};
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC
+/**
+ * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown
+ * @state: The synchronization state to set
+ *
+ * No synchronization point. Just update of the synchronization state, but implies
+ * a full barrier so that the AP changes are visible before the control CPU proceeds.
+ */
+static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state)
+{
+ atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);
+
+ (void)atomic_xchg(st, state);
+}
+
+void __weak arch_cpuhp_sync_state_poll(void) { cpu_relax(); }
+
+static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state,
+ enum cpuhp_sync_state next_state)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ ktime_t now, end, start = ktime_get();
+ int sync;
+
+ end = start + 10ULL * NSEC_PER_SEC;
+
+ sync = atomic_read(st);
+ while (1) {
+ if (sync == state) {
+ if (!atomic_try_cmpxchg(st, &sync, next_state))
+ continue;
+ return true;
+ }
+
+ now = ktime_get();
+ if (now > end) {
+ /* Timeout. Leave the state unchanged */
+ return false;
+ } else if (now - start < NSEC_PER_MSEC) {
+ /* Poll for one millisecond */
+ arch_cpuhp_sync_state_poll();
+ } else {
+ usleep_range_state(USEC_PER_MSEC, 2 * USEC_PER_MSEC, TASK_UNINTERRUPTIBLE);
+ }
+ sync = atomic_read(st);
+ }
+ return true;
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC */
+static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC */
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD
+/**
+ * cpuhp_ap_report_dead - Update synchronization state to DEAD
+ *
+ * No synchronization point. Just update of the synchronization state.
+ */
+void cpuhp_ap_report_dead(void)
+{
+ cpuhp_ap_update_sync_state(SYNC_STATE_DEAD);
+}
+
+void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { }
+
+/*
+ * Late CPU shutdown synchronization point. Cannot use cpuhp_state::done_down
+ * because the AP cannot issue complete() at this stage.
+ */
+static void cpuhp_bp_sync_dead(unsigned int cpu)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ int sync = atomic_read(st);
+
+ do {
+ /* CPU can have reported dead already. Don't overwrite that! */
+ if (sync == SYNC_STATE_DEAD)
+ break;
+ } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_SHOULD_DIE));
+
+ if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) {
+ /* CPU reached dead state. Invoke the cleanup function */
+ arch_cpuhp_cleanup_dead_cpu(cpu);
+ return;
+ }
+
+ /* No further action possible. Emit message and give up. */
+ pr_err("CPU%u failed to report dead state\n", cpu);
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */
+static inline void cpuhp_bp_sync_dead(unsigned int cpu) { }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL
+/**
+ * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive
+ *
+ * Updates the AP synchronization state to SYNC_STATE_ALIVE and waits
+ * for the BP to release it.
+ */
+void cpuhp_ap_sync_alive(void)
+{
+ atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);
+
+ cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE);
+
+ /* Wait for the control CPU to release it. */
+ while (atomic_read(st) != SYNC_STATE_SHOULD_ONLINE)
+ cpu_relax();
+}
+
+static bool cpuhp_can_boot_ap(unsigned int cpu)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ int sync = atomic_read(st);
+
+again:
+ switch (sync) {
+ case SYNC_STATE_DEAD:
+ /* CPU is properly dead */
+ break;
+ case SYNC_STATE_KICKED:
+ /* CPU did not come up in previous attempt */
+ break;
+ case SYNC_STATE_ALIVE:
+ /* CPU is stuck cpuhp_ap_sync_alive(). */
+ break;
+ default:
+ /* CPU failed to report online or dead and is in limbo state. */
+ return false;
+ }
+
+ /* Prepare for booting */
+ if (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_KICKED))
+ goto again;
+
+ return true;
+}
+
+void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { }
+
+/*
+ * Early CPU bringup synchronization point. Cannot use cpuhp_state::done_up
+ * because the AP cannot issue complete() so early in the bringup.
+ */
+static int cpuhp_bp_sync_alive(unsigned int cpu)
+{
+ int ret = 0;
+
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CORE_SYNC_FULL))
+ return 0;
+
+ if (!cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE)) {
+ pr_err("CPU%u failed to report alive state\n", cpu);
+ ret = -EIO;
+ }
+
+ /* Let the architecture cleanup the kick alive mechanics. */
+ arch_cpuhp_cleanup_kick_cpu(cpu);
+ return ret;
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */
+static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; }
+static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */
+
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
bool cpuhp_tasks_frozen;
@@ -413,7 +592,10 @@ static void lockdep_release_cpus_lock(void)
void __weak arch_smt_update(void) { }
#ifdef CONFIG_HOTPLUG_SMT
+
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
+static unsigned int cpu_smt_max_threads __ro_after_init;
+unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX;
void __init cpu_smt_disable(bool force)
{
@@ -427,16 +609,33 @@ void __init cpu_smt_disable(bool force)
pr_info("SMT: disabled\n");
cpu_smt_control = CPU_SMT_DISABLED;
}
+ cpu_smt_num_threads = 1;
}
/*
* The decision whether SMT is supported can only be done after the full
* CPU identification. Called from architecture code.
*/
-void __init cpu_smt_check_topology(void)
+void __init cpu_smt_set_num_threads(unsigned int num_threads,
+ unsigned int max_threads)
{
- if (!topology_smt_supported())
+ WARN_ON(!num_threads || (num_threads > max_threads));
+
+ if (max_threads == 1)
cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+
+ cpu_smt_max_threads = max_threads;
+
+ /*
+ * If SMT has been disabled via the kernel command line or SMT is
+ * not supported, set cpu_smt_num_threads to 1 for consistency.
+ * If enabled, take the architecture requested number of threads
+ * to bring up into account.
+ */
+ if (cpu_smt_control != CPU_SMT_ENABLED)
+ cpu_smt_num_threads = 1;
+ else if (num_threads < cpu_smt_num_threads)
+ cpu_smt_num_threads = num_threads;
}
static int __init smt_cmdline_disable(char *str)
@@ -446,9 +645,23 @@ static int __init smt_cmdline_disable(char *str)
}
early_param("nosmt", smt_cmdline_disable);
+/*
+ * For Archicture supporting partial SMT states check if the thread is allowed.
+ * Otherwise this has already been checked through cpu_smt_max_threads when
+ * setting the SMT level.
+ */
+static inline bool cpu_smt_thread_allowed(unsigned int cpu)
+{
+#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
+ return topology_smt_thread_allowed(cpu);
+#else
+ return true;
+#endif
+}
+
static inline bool cpu_smt_allowed(unsigned int cpu)
{
- if (cpu_smt_control == CPU_SMT_ENABLED)
+ if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
return true;
if (topology_is_primary_thread(cpu))
@@ -463,13 +676,14 @@ static inline bool cpu_smt_allowed(unsigned int cpu)
return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
}
-/* Returns true if SMT is not supported of forcefully (irreversibly) disabled */
+/* Returns true if SMT is supported and not forcefully (irreversibly) disabled */
bool cpu_smt_possible(void)
{
return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
}
EXPORT_SYMBOL_GPL(cpu_smt_possible);
+
#else
static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
#endif
@@ -558,7 +772,7 @@ static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
return ret;
}
-static int bringup_wait_for_ap(unsigned int cpu)
+static int bringup_wait_for_ap_online(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -579,38 +793,94 @@ static int bringup_wait_for_ap(unsigned int cpu)
*/
if (!cpu_smt_allowed(cpu))
return -ECANCELED;
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
+static int cpuhp_kick_ap_alive(unsigned int cpu)
+{
+ if (!cpuhp_can_boot_ap(cpu))
+ return -EAGAIN;
+
+ return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu));
+}
+
+static int cpuhp_bringup_ap(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int ret;
+
+ /*
+ * Some architectures have to walk the irq descriptors to
+ * setup the vector space for the cpu which comes online.
+ * Prevent irq alloc/free across the bringup.
+ */
+ irq_lock_sparse();
+
+ ret = cpuhp_bp_sync_alive(cpu);
+ if (ret)
+ goto out_unlock;
+
+ ret = bringup_wait_for_ap_online(cpu);
+ if (ret)
+ goto out_unlock;
+
+ irq_unlock_sparse();
if (st->target <= CPUHP_AP_ONLINE_IDLE)
return 0;
return cpuhp_kick_ap(cpu, st, st->target);
-}
+out_unlock:
+ irq_unlock_sparse();
+ return ret;
+}
+#else
static int bringup_cpu(unsigned int cpu)
{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct task_struct *idle = idle_thread_get(cpu);
int ret;
- /*
- * Reset stale stack state from the last time this CPU was online.
- */
- scs_task_reset(idle);
- kasan_unpoison_task_stack(idle);
+ if (!cpuhp_can_boot_ap(cpu))
+ return -EAGAIN;
/*
* Some architectures have to walk the irq descriptors to
* setup the vector space for the cpu which comes online.
- * Prevent irq alloc/free across the bringup.
+ *
+ * Prevent irq alloc/free across the bringup by acquiring the
+ * sparse irq lock. Hold it until the upcoming CPU completes the
+ * startup in cpuhp_online_idle() which allows to avoid
+ * intermediate synchronization points in the architecture code.
*/
irq_lock_sparse();
- /* Arch-specific enabling code. */
ret = __cpu_up(cpu, idle);
- irq_unlock_sparse();
if (ret)
- return ret;
- return bringup_wait_for_ap(cpu);
+ goto out_unlock;
+
+ ret = cpuhp_bp_sync_alive(cpu);
+ if (ret)
+ goto out_unlock;
+
+ ret = bringup_wait_for_ap_online(cpu);
+ if (ret)
+ goto out_unlock;
+
+ irq_unlock_sparse();
+
+ if (st->target <= CPUHP_AP_ONLINE_IDLE)
+ return 0;
+
+ return cpuhp_kick_ap(cpu, st, st->target);
+
+out_unlock:
+ irq_unlock_sparse();
+ return ret;
}
+#endif
static int finish_cpu(unsigned int cpu)
{
@@ -1099,6 +1369,8 @@ static int takedown_cpu(unsigned int cpu)
/* This actually kills the CPU. */
__cpu_die(cpu);
+ cpuhp_bp_sync_dead(cpu);
+
tick_cleanup_dead_cpu(cpu);
rcutree_migrate_callbacks(cpu);
return 0;
@@ -1215,8 +1487,22 @@ out:
return ret;
}
+struct cpu_down_work {
+ unsigned int cpu;
+ enum cpuhp_state target;
+};
+
+static long __cpu_down_maps_locked(void *arg)
+{
+ struct cpu_down_work *work = arg;
+
+ return _cpu_down(work->cpu, 0, work->target);
+}
+
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
+ struct cpu_down_work work = { .cpu = cpu, .target = target, };
+
/*
* If the platform does not support hotplug, report it explicitly to
* differentiate it from a transient offlining failure.
@@ -1225,7 +1511,15 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
return -EOPNOTSUPP;
if (cpu_hotplug_disabled)
return -EBUSY;
- return _cpu_down(cpu, 0, target);
+
+ /*
+ * Ensure that the control task does not run on the to be offlined
+ * CPU to prevent a deadlock against cfs_b->period_timer.
+ */
+ cpu = cpumask_any_but(cpu_online_mask, cpu);
+ if (cpu >= nr_cpu_ids)
+ return -EBUSY;
+ return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
}
static int cpu_down(unsigned int cpu, enum cpuhp_state target)
@@ -1345,8 +1639,10 @@ void cpuhp_online_idle(enum cpuhp_state state)
if (state != CPUHP_AP_ONLINE_IDLE)
return;
+ cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE);
+
/*
- * Unpart the stopper thread before we start the idle loop (and start
+ * Unpark the stopper thread before we start the idle loop (and start
* scheduling); this ensures the stopper task is always available.
*/
stop_machine_unpark(smp_processor_id());
@@ -1383,6 +1679,12 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
ret = PTR_ERR(idle);
goto out;
}
+
+ /*
+ * Reset stale stack state from the last time this CPU was online.
+ */
+ scs_task_reset(idle);
+ kasan_unpoison_task_stack(idle);
}
cpuhp_tasks_frozen = tasks_frozen;
@@ -1502,18 +1804,106 @@ int bringup_hibernate_cpu(unsigned int sleep_cpu)
return 0;
}
-void bringup_nonboot_cpus(unsigned int setup_max_cpus)
+static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus,
+ enum cpuhp_state target)
{
unsigned int cpu;
- for_each_present_cpu(cpu) {
- if (num_online_cpus() >= setup_max_cpus)
+ for_each_cpu(cpu, mask) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+ if (cpu_up(cpu, target) && can_rollback_cpu(st)) {
+ /*
+ * If this failed then cpu_up() might have only
+ * rolled back to CPUHP_BP_KICK_AP for the final
+ * online. Clean it up. NOOP if already rolled back.
+ */
+ WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE));
+ }
+
+ if (!--ncpus)
break;
- if (!cpu_online(cpu))
- cpu_up(cpu, CPUHP_ONLINE);
}
}
+#ifdef CONFIG_HOTPLUG_PARALLEL
+static bool __cpuhp_parallel_bringup __ro_after_init = true;
+
+static int __init parallel_bringup_parse_param(char *arg)
+{
+ return kstrtobool(arg, &__cpuhp_parallel_bringup);
+}
+early_param("cpuhp.parallel", parallel_bringup_parse_param);
+
+static inline bool cpuhp_smt_aware(void)
+{
+ return cpu_smt_max_threads > 1;
+}
+
+static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
+{
+ return cpu_primary_thread_mask;
+}
+
+/*
+ * On architectures which have enabled parallel bringup this invokes all BP
+ * prepare states for each of the to be onlined APs first. The last state
+ * sends the startup IPI to the APs. The APs proceed through the low level
+ * bringup code in parallel and then wait for the control CPU to release
+ * them one by one for the final onlining procedure.
+ *
+ * This avoids waiting for each AP to respond to the startup IPI in
+ * CPUHP_BRINGUP_CPU.
+ */
+static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus)
+{
+ const struct cpumask *mask = cpu_present_mask;
+
+ if (__cpuhp_parallel_bringup)
+ __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup();
+ if (!__cpuhp_parallel_bringup)
+ return false;
+
+ if (cpuhp_smt_aware()) {
+ const struct cpumask *pmask = cpuhp_get_primary_thread_mask();
+ static struct cpumask tmp_mask __initdata;
+
+ /*
+ * X86 requires to prevent that SMT siblings stopped while
+ * the primary thread does a microcode update for various
+ * reasons. Bring the primary threads up first.
+ */
+ cpumask_and(&tmp_mask, mask, pmask);
+ cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP);
+ cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE);
+ /* Account for the online CPUs */
+ ncpus -= num_online_cpus();
+ if (!ncpus)
+ return true;
+ /* Create the mask for secondary CPUs */
+ cpumask_andnot(&tmp_mask, mask, pmask);
+ mask = &tmp_mask;
+ }
+
+ /* Bring the not-yet started CPUs up */
+ cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP);
+ cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE);
+ return true;
+}
+#else
+static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return false; }
+#endif /* CONFIG_HOTPLUG_PARALLEL */
+
+void __init bringup_nonboot_cpus(unsigned int setup_max_cpus)
+{
+ /* Try parallel bringup optimization if enabled */
+ if (cpuhp_bringup_cpus_parallel(setup_max_cpus))
+ return;
+
+ /* Full per CPU serialized bringup */
+ cpuhp_bringup_mask(cpu_present_mask, setup_max_cpus, CPUHP_ONLINE);
+}
+
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
@@ -1740,13 +2130,38 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = timers_prepare_cpu,
.teardown.single = timers_dead_cpu,
},
- /* Kicks the plugged cpu into life */
+
+#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
+ /*
+ * Kicks the AP alive. AP will wait in cpuhp_ap_sync_alive() until
+ * the next step will release it.
+ */
+ [CPUHP_BP_KICK_AP] = {
+ .name = "cpu:kick_ap",
+ .startup.single = cpuhp_kick_ap_alive,
+ },
+
+ /*
+ * Waits for the AP to reach cpuhp_ap_sync_alive() and then
+ * releases it for the complete bringup.
+ */
+ [CPUHP_BRINGUP_CPU] = {
+ .name = "cpu:bringup",
+ .startup.single = cpuhp_bringup_ap,
+ .teardown.single = finish_cpu,
+ .cant_stop = true,
+ },
+#else
+ /*
+ * All-in-one CPU bringup state which includes the kick alive.
+ */
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
.teardown.single = finish_cpu,
.cant_stop = true,
},
+#endif
/* Final state before CPU kills itself */
[CPUHP_AP_IDLE_DEAD] = {
.name = "idle:dead",
@@ -2263,6 +2678,12 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
for_each_online_cpu(cpu) {
if (topology_is_primary_thread(cpu))
continue;
+ /*
+ * Disable can be called with CPU_SMT_ENABLED when changing
+ * from a higher to lower number of SMT threads per core.
+ */
+ if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
+ continue;
ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
if (ret)
break;
@@ -2297,6 +2718,8 @@ int cpuhp_smt_enable(void)
/* Skip online CPUs and CPUs on offline nodes */
if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
continue;
+ if (!cpu_smt_thread_allowed(cpu))
+ continue;
ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
if (ret)
break;
@@ -2475,20 +2898,19 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
#ifdef CONFIG_HOTPLUG_SMT
+static bool cpu_smt_num_threads_valid(unsigned int threads)
+{
+ if (IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC))
+ return threads >= 1 && threads <= cpu_smt_max_threads;
+ return threads == 1 || threads == cpu_smt_max_threads;
+}
+
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
- int ctrlval, ret;
-
- if (sysfs_streq(buf, "on"))
- ctrlval = CPU_SMT_ENABLED;
- else if (sysfs_streq(buf, "off"))
- ctrlval = CPU_SMT_DISABLED;
- else if (sysfs_streq(buf, "forceoff"))
- ctrlval = CPU_SMT_FORCE_DISABLED;
- else
- return -EINVAL;
+ int ctrlval, ret, num_threads, orig_threads;
+ bool force_off;
if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
return -EPERM;
@@ -2496,21 +2918,39 @@ __store_smt_control(struct device *dev, struct device_attribute *attr,
if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
return -ENODEV;
+ if (sysfs_streq(buf, "on")) {
+ ctrlval = CPU_SMT_ENABLED;
+ num_threads = cpu_smt_max_threads;
+ } else if (sysfs_streq(buf, "off")) {
+ ctrlval = CPU_SMT_DISABLED;
+ num_threads = 1;
+ } else if (sysfs_streq(buf, "forceoff")) {
+ ctrlval = CPU_SMT_FORCE_DISABLED;
+ num_threads = 1;
+ } else if (kstrtoint(buf, 10, &num_threads) == 0) {
+ if (num_threads == 1)
+ ctrlval = CPU_SMT_DISABLED;
+ else if (cpu_smt_num_threads_valid(num_threads))
+ ctrlval = CPU_SMT_ENABLED;
+ else
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+
ret = lock_device_hotplug_sysfs();
if (ret)
return ret;
- if (ctrlval != cpu_smt_control) {
- switch (ctrlval) {
- case CPU_SMT_ENABLED:
- ret = cpuhp_smt_enable();
- break;
- case CPU_SMT_DISABLED:
- case CPU_SMT_FORCE_DISABLED:
- ret = cpuhp_smt_disable(ctrlval);
- break;
- }
- }
+ orig_threads = cpu_smt_num_threads;
+ cpu_smt_num_threads = num_threads;
+
+ force_off = ctrlval != cpu_smt_control && ctrlval == CPU_SMT_FORCE_DISABLED;
+
+ if (num_threads > orig_threads)
+ ret = cpuhp_smt_enable();
+ else if (num_threads < orig_threads || force_off)
+ ret = cpuhp_smt_disable(ctrlval);
unlock_device_hotplug();
return ret ? ret : count;
@@ -2538,6 +2978,17 @@ static ssize_t control_show(struct device *dev,
{
const char *state = smt_states[cpu_smt_control];
+#ifdef CONFIG_HOTPLUG_SMT
+ /*
+ * If SMT is enabled but not all threads are enabled then show the
+ * number of threads. If all threads are enabled show "on". Otherwise
+ * show the state name.
+ */
+ if (cpu_smt_control == CPU_SMT_ENABLED &&
+ cpu_smt_num_threads != cpu_smt_max_threads)
+ return sysfs_emit(buf, "%d\n", cpu_smt_num_threads);
+#endif
+
return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
}
@@ -2723,6 +3174,7 @@ void __init boot_cpu_hotplug_init(void)
{
#ifdef CONFIG_SMP
cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
+ atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE);
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 90ce1dfd591c..03a7932cde0a 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -10,6 +10,9 @@
#include <linux/utsname.h>
#include <linux/vmalloc.h>
#include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/cpuhotplug.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -17,6 +20,10 @@
#include <crypto/sha1.h>
#include "kallsyms_internal.h"
+#include "kexec_internal.h"
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
/* vmcoreinfo stuff */
unsigned char *vmcoreinfo_data;
@@ -314,6 +321,187 @@ static int __init parse_crashkernel_dummy(char *arg)
}
early_param("crashkernel", parse_crashkernel_dummy);
+int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
+ void **addr, unsigned long *sz)
+{
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+ unsigned char *buf;
+ unsigned int cpu, i;
+ unsigned long long notes_addr;
+ unsigned long mstart, mend;
+
+ /* extra phdr for vmcoreinfo ELF note */
+ nr_phdr = nr_cpus + 1;
+ nr_phdr += mem->nr_ranges;
+
+ /*
+ * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+ * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
+ * I think this is required by tools like gdb. So same physical
+ * memory will be mapped in two ELF headers. One will contain kernel
+ * text virtual addresses and other will have __va(physical) addresses.
+ */
+
+ nr_phdr++;
+ elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+ elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+ buf = vzalloc(elf_sz);
+ if (!buf)
+ return -ENOMEM;
+
+ ehdr = (Elf64_Ehdr *)buf;
+ phdr = (Elf64_Phdr *)(ehdr + 1);
+ memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+ ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+ ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_phoff = sizeof(Elf64_Ehdr);
+ ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+ /* Prepare one phdr of type PT_NOTE for each possible CPU */
+ for_each_possible_cpu(cpu) {
+ phdr->p_type = PT_NOTE;
+ notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+ phdr->p_offset = phdr->p_paddr = notes_addr;
+ phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+ (ehdr->e_phnum)++;
+ phdr++;
+ }
+
+ /* Prepare one PT_NOTE header for vmcoreinfo */
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+ phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
+ (ehdr->e_phnum)++;
+ phdr++;
+
+ /* Prepare PT_LOAD type program header for kernel text region */
+ if (need_kernel_map) {
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_vaddr = (unsigned long) _text;
+ phdr->p_filesz = phdr->p_memsz = _end - _text;
+ phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+ ehdr->e_phnum++;
+ phdr++;
+ }
+
+ /* Go through all the ranges in mem->ranges[] and prepare phdr */
+ for (i = 0; i < mem->nr_ranges; i++) {
+ mstart = mem->ranges[i].start;
+ mend = mem->ranges[i].end;
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_offset = mstart;
+
+ phdr->p_paddr = mstart;
+ phdr->p_vaddr = (unsigned long) __va(mstart);
+ phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+ phdr->p_align = 0;
+ ehdr->e_phnum++;
+ pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+ phdr++;
+ }
+
+ *addr = buf;
+ *sz = elf_sz;
+ return 0;
+}
+
+int crash_exclude_mem_range(struct crash_mem *mem,
+ unsigned long long mstart, unsigned long long mend)
+{
+ int i, j;
+ unsigned long long start, end, p_start, p_end;
+ struct range temp_range = {0, 0};
+
+ for (i = 0; i < mem->nr_ranges; i++) {
+ start = mem->ranges[i].start;
+ end = mem->ranges[i].end;
+ p_start = mstart;
+ p_end = mend;
+
+ if (mstart > end || mend < start)
+ continue;
+
+ /* Truncate any area outside of range */
+ if (mstart < start)
+ p_start = start;
+ if (mend > end)
+ p_end = end;
+
+ /* Found completely overlapping range */
+ if (p_start == start && p_end == end) {
+ mem->ranges[i].start = 0;
+ mem->ranges[i].end = 0;
+ if (i < mem->nr_ranges - 1) {
+ /* Shift rest of the ranges to left */
+ for (j = i; j < mem->nr_ranges - 1; j++) {
+ mem->ranges[j].start =
+ mem->ranges[j+1].start;
+ mem->ranges[j].end =
+ mem->ranges[j+1].end;
+ }
+
+ /*
+ * Continue to check if there are another overlapping ranges
+ * from the current position because of shifting the above
+ * mem ranges.
+ */
+ i--;
+ mem->nr_ranges--;
+ continue;
+ }
+ mem->nr_ranges--;
+ return 0;
+ }
+
+ if (p_start > start && p_end < end) {
+ /* Split original range */
+ mem->ranges[i].end = p_start - 1;
+ temp_range.start = p_end + 1;
+ temp_range.end = end;
+ } else if (p_start != start)
+ mem->ranges[i].end = p_start - 1;
+ else
+ mem->ranges[i].start = p_end + 1;
+ break;
+ }
+
+ /* If a split happened, add the split to array */
+ if (!temp_range.end)
+ return 0;
+
+ /* Split happened */
+ if (i == mem->max_nr_ranges - 1)
+ return -ENOMEM;
+
+ /* Location where new range should go */
+ j = i + 1;
+ if (j < mem->nr_ranges) {
+ /* Move over all ranges one slot towards the end */
+ for (i = mem->nr_ranges - 1; i >= j; i--)
+ mem->ranges[i + 1] = mem->ranges[i];
+ }
+
+ mem->ranges[j].start = temp_range.start;
+ mem->ranges[j].end = temp_range.end;
+ mem->nr_ranges++;
+ return 0;
+}
+
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
{
@@ -455,8 +643,6 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(folio, _folio_dtor);
- VMCOREINFO_OFFSET(folio, _folio_order);
VMCOREINFO_OFFSET(page, compound_head);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
@@ -490,7 +676,7 @@ static int __init crash_save_vmcoreinfo_init(void)
#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
- VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
+ VMCOREINFO_NUMBER(PG_hugetlb);
#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
@@ -515,3 +701,206 @@ static int __init crash_save_vmcoreinfo_init(void)
}
subsys_initcall(crash_save_vmcoreinfo_init);
+
+static int __init crash_notes_memory_init(void)
+{
+ /* Allocate memory for saving cpu registers. */
+ size_t size, align;
+
+ /*
+ * crash_notes could be allocated across 2 vmalloc pages when percpu
+ * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc
+ * pages are also on 2 continuous physical pages. In this case the
+ * 2nd part of crash_notes in 2nd page could be lost since only the
+ * starting address and size of crash_notes are exported through sysfs.
+ * Here round up the size of crash_notes to the nearest power of two
+ * and pass it to __alloc_percpu as align value. This can make sure
+ * crash_notes is allocated inside one physical page.
+ */
+ size = sizeof(note_buf_t);
+ align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
+
+ /*
+ * Break compile if size is bigger than PAGE_SIZE since crash_notes
+ * definitely will be in 2 pages with that.
+ */
+ BUILD_BUG_ON(size > PAGE_SIZE);
+
+ crash_notes = __alloc_percpu(size, align);
+ if (!crash_notes) {
+ pr_warn("Memory allocation for saving cpu register states failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+subsys_initcall(crash_notes_memory_init);
+
+#ifdef CONFIG_CRASH_HOTPLUG
+#undef pr_fmt
+#define pr_fmt(fmt) "crash hp: " fmt
+
+/*
+ * This routine utilized when the crash_hotplug sysfs node is read.
+ * It reflects the kernel's ability/permission to update the crash
+ * elfcorehdr directly.
+ */
+int crash_check_update_elfcorehdr(void)
+{
+ int rc = 0;
+
+ /* Obtain lock while reading crash information */
+ if (!kexec_trylock()) {
+ pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ return 0;
+ }
+ if (kexec_crash_image) {
+ if (kexec_crash_image->file_mode)
+ rc = 1;
+ else
+ rc = kexec_crash_image->update_elfcorehdr;
+ }
+ /* Release lock now that update complete */
+ kexec_unlock();
+
+ return rc;
+}
+
+/*
+ * To accurately reflect hot un/plug changes of cpu and memory resources
+ * (including onling and offlining of those resources), the elfcorehdr
+ * (which is passed to the crash kernel via the elfcorehdr= parameter)
+ * must be updated with the new list of CPUs and memories.
+ *
+ * In order to make changes to elfcorehdr, two conditions are needed:
+ * First, the segment containing the elfcorehdr must be large enough
+ * to permit a growing number of resources; the elfcorehdr memory size
+ * is based on NR_CPUS_DEFAULT and CRASH_MAX_MEMORY_RANGES.
+ * Second, purgatory must explicitly exclude the elfcorehdr from the
+ * list of segments it checks (since the elfcorehdr changes and thus
+ * would require an update to purgatory itself to update the digest).
+ */
+static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
+{
+ struct kimage *image;
+
+ /* Obtain lock while changing crash information */
+ if (!kexec_trylock()) {
+ pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ return;
+ }
+
+ /* Check kdump is not loaded */
+ if (!kexec_crash_image)
+ goto out;
+
+ image = kexec_crash_image;
+
+ /* Check that updating elfcorehdr is permitted */
+ if (!(image->file_mode || image->update_elfcorehdr))
+ goto out;
+
+ if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
+ hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
+ pr_debug("hp_action %u, cpu %u\n", hp_action, cpu);
+ else
+ pr_debug("hp_action %u\n", hp_action);
+
+ /*
+ * The elfcorehdr_index is set to -1 when the struct kimage
+ * is allocated. Find the segment containing the elfcorehdr,
+ * if not already found.
+ */
+ if (image->elfcorehdr_index < 0) {
+ unsigned long mem;
+ unsigned char *ptr;
+ unsigned int n;
+
+ for (n = 0; n < image->nr_segments; n++) {
+ mem = image->segment[n].mem;
+ ptr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT));
+ if (ptr) {
+ /* The segment containing elfcorehdr */
+ if (memcmp(ptr, ELFMAG, SELFMAG) == 0)
+ image->elfcorehdr_index = (int)n;
+ kunmap_local(ptr);
+ }
+ }
+ }
+
+ if (image->elfcorehdr_index < 0) {
+ pr_err("unable to locate elfcorehdr segment");
+ goto out;
+ }
+
+ /* Needed in order for the segments to be updated */
+ arch_kexec_unprotect_crashkres();
+
+ /* Differentiate between normal load and hotplug update */
+ image->hp_action = hp_action;
+
+ /* Now invoke arch-specific update handler */
+ arch_crash_handle_hotplug_event(image);
+
+ /* No longer handling a hotplug event */
+ image->hp_action = KEXEC_CRASH_HP_NONE;
+ image->elfcorehdr_updated = true;
+
+ /* Change back to read-only */
+ arch_kexec_protect_crashkres();
+
+ /* Errors in the callback is not a reason to rollback state */
+out:
+ /* Release lock now that update complete */
+ kexec_unlock();
+}
+
+static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
+{
+ switch (val) {
+ case MEM_ONLINE:
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY,
+ KEXEC_CRASH_HP_INVALID_CPU);
+ break;
+
+ case MEM_OFFLINE:
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY,
+ KEXEC_CRASH_HP_INVALID_CPU);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block crash_memhp_nb = {
+ .notifier_call = crash_memhp_notifier,
+ .priority = 0
+};
+
+static int crash_cpuhp_online(unsigned int cpu)
+{
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu);
+ return 0;
+}
+
+static int crash_cpuhp_offline(unsigned int cpu)
+{
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu);
+ return 0;
+}
+
+static int __init crash_hotplug_init(void)
+{
+ int result = 0;
+
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+ register_memory_notifier(&crash_memhp_nb);
+
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+ result = cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN,
+ "crash/cpuhp", crash_cpuhp_online, crash_cpuhp_offline);
+ }
+
+ return result;
+}
+
+subsys_initcall(crash_hotplug_init);
+#endif
diff --git a/kernel/cred.c b/kernel/cred.c
index 811ad654abd1..98cb4eca23fb 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -4,6 +4,9 @@
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells ([email protected])
*/
+
+#define pr_fmt(fmt) "CRED: " fmt
+
#include <linux/export.h>
#include <linux/cred.h>
#include <linux/slab.h>
@@ -835,32 +838,32 @@ EXPORT_SYMBOL(creds_are_invalid);
static void dump_invalid_creds(const struct cred *cred, const char *label,
const struct task_struct *tsk)
{
- printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
+ pr_err("%s credentials: %p %s%s%s\n",
label, cred,
cred == &init_cred ? "[init]" : "",
cred == tsk->real_cred ? "[real]" : "",
cred == tsk->cred ? "[eff]" : "");
- printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
+ pr_err("->magic=%x, put_addr=%p\n",
cred->magic, cred->put_addr);
- printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
+ pr_err("->usage=%d, subscr=%d\n",
atomic_read(&cred->usage),
read_cred_subscribers(cred));
- printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
+ pr_err("->*uid = { %d,%d,%d,%d }\n",
from_kuid_munged(&init_user_ns, cred->uid),
from_kuid_munged(&init_user_ns, cred->euid),
from_kuid_munged(&init_user_ns, cred->suid),
from_kuid_munged(&init_user_ns, cred->fsuid));
- printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
+ pr_err("->*gid = { %d,%d,%d,%d }\n",
from_kgid_munged(&init_user_ns, cred->gid),
from_kgid_munged(&init_user_ns, cred->egid),
from_kgid_munged(&init_user_ns, cred->sgid),
from_kgid_munged(&init_user_ns, cred->fsgid));
#ifdef CONFIG_SECURITY
- printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
+ pr_err("->security is %p\n", cred->security);
if ((unsigned long) cred->security >= PAGE_SIZE &&
(((unsigned long) cred->security & 0xffffff00) !=
(POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
- printk(KERN_ERR "CRED: ->security {%x, %x}\n",
+ pr_err("->security {%x, %x}\n",
((u32*)cred->security)[0],
((u32*)cred->security)[1]);
#endif
@@ -871,8 +874,8 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
*/
void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line)
{
- printk(KERN_ERR "CRED: Invalid credentials\n");
- printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+ pr_err("Invalid credentials\n");
+ pr_err("At %s:%u\n", file, line);
dump_invalid_creds(cred, "Specified", current);
BUG();
}
@@ -898,14 +901,14 @@ void __validate_process_creds(struct task_struct *tsk,
return;
invalid_creds:
- printk(KERN_ERR "CRED: Invalid process credentials\n");
- printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+ pr_err("Invalid process credentials\n");
+ pr_err("At %s:%u\n", file, line);
dump_invalid_creds(tsk->real_cred, "Real", tsk);
if (tsk->cred != tsk->real_cred)
dump_invalid_creds(tsk->cred, "Effective", tsk);
else
- printk(KERN_ERR "CRED: Effective creds == Real creds\n");
+ pr_err("Effective creds == Real creds\n");
BUG();
}
EXPORT_SYMBOL(__validate_process_creds);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index d5e9ccde3ab8..621037a0aa87 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -968,7 +968,7 @@ static int __init opt_kgdb_con(char *str)
early_param("kgdbcon", opt_kgdb_con);
#ifdef CONFIG_MAGIC_SYSRQ
-static void sysrq_handle_dbg(int key)
+static void sysrq_handle_dbg(u8 key)
{
if (!dbg_io_ops) {
pr_crit("ERROR: No KGDB I/O module available\n");
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 5c7e9ba7cd6b..9443bc63c5a2 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -131,6 +131,7 @@ char kdb_getchar(void)
int escape_delay = 0;
get_char_func *f, *f_prev = NULL;
int key;
+ static bool last_char_was_cr;
for (f = &kdb_poll_funcs[0]; ; ++f) {
if (*f == NULL) {
@@ -150,6 +151,18 @@ char kdb_getchar(void)
}
/*
+ * The caller expects that newlines are either CR or LF. However
+ * some terminals send _both_ CR and LF. Avoid having to handle
+ * this in the caller by stripping the LF if we saw a CR right
+ * before.
+ */
+ if (last_char_was_cr && key == '\n') {
+ last_char_was_cr = false;
+ continue;
+ }
+ last_char_was_cr = (key == '\r');
+
+ /*
* When the first character is received (or we get a change
* input source) we set ourselves up to handle an escape
* sequences (just in case).
@@ -244,7 +257,8 @@ poll_again:
*cp = tmp;
}
break;
- case 13: /* enter */
+ case 10: /* linefeed */
+ case 13: /* carriage return */
*lastchar++ = '\n';
*lastchar++ = '\0';
if (!KDB_STATE(KGDB_TRANS)) {
@@ -576,6 +590,8 @@ static void kdb_msg_write(const char *msg, int msg_len)
continue;
if (c == dbg_io_ops->cons)
continue;
+ if (!c->write)
+ continue;
/*
* Set oops_in_progress to encourage the console drivers to
* disregard their internal spin locks: in the current calling
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index f87c750d3eb3..3c2987f46f6e 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -13,6 +13,8 @@
#include <linux/ctype.h>
#include <linux/io.h>
+#include "kdb_private.h"
+
/* Keyboard Controller Registers on normal PCs. */
#define KBD_STATUS_REG 0x64 /* Status register (R) */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 1f8c519a5f81..548fd4059bf9 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -194,7 +194,6 @@ extern char kdb_task_state_char (const struct task_struct *);
extern bool kdb_task_state(const struct task_struct *p, const char *mask);
extern void kdb_ps_suppressed(void);
extern void kdb_ps1(const struct task_struct *p);
-extern void kdb_send_sig(struct task_struct *p, int sig);
extern char kdb_getchar(void);
extern char *kdb_getstr(char *, size_t, const char *);
extern void kdb_gdb_state_pass(char *buf);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 6677d0e64d27..4c1e9a3c0ab6 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -24,6 +24,9 @@ config DMA_OPS_BYPASS
config ARCH_HAS_DMA_MAP_DIRECT
bool
+config NEED_SG_DMA_FLAGS
+ bool
+
config NEED_SG_DMA_LENGTH
bool
@@ -39,7 +42,7 @@ config ARCH_HAS_DMA_SET_MASK
#
# Select this option if the architecture needs special handling for
# DMA_ATTR_WRITE_COMBINE. Normally the "uncached" mapping should be what
-# people thing of when saying write combine, so very few platforms should
+# people think of when saying write combine, so very few platforms should
# need to enable this.
#
config ARCH_HAS_DMA_WRITE_COMBINE
@@ -87,6 +90,23 @@ config SWIOTLB
bool
select NEED_DMA_MAP_STATE
+config SWIOTLB_DYNAMIC
+ bool "Dynamic allocation of DMA bounce buffers"
+ default n
+ depends on SWIOTLB
+ help
+ This enables dynamic resizing of the software IO TLB. The kernel
+ starts with one memory pool at boot and it will allocate additional
+ pools as needed. To reduce run-time kernel memory requirements, you
+ may have to specify a smaller size of the initial pool using
+ "swiotlb=" on the kernel command line.
+
+ If unsure, say N.
+
+config DMA_BOUNCE_UNALIGNED_KMALLOC
+ bool
+ depends on SWIOTLB
+
config DMA_RESTRICTED_POOL
bool "DMA Restricted Pool"
depends on OF && OF_RESERVED_MEM && SWIOTLB
@@ -138,15 +158,16 @@ config DMA_CMA
if DMA_CMA
-config DMA_PERNUMA_CMA
- bool "Enable separate DMA Contiguous Memory Area for each NUMA Node"
- default NUMA && ARM64
+config DMA_NUMA_CMA
+ bool "Enable separate DMA Contiguous Memory Area for NUMA Node"
+ default NUMA
help
- Enable this option to get pernuma CMA areas so that devices like
- ARM64 SMMU can get local memory by DMA coherent APIs.
+ Enable this option to get numa CMA areas so that NUMA devices
+ can get local memory by DMA coherent APIs.
You can set the size of pernuma CMA by specifying "cma_pernuma=size"
- on the kernel's command line.
+ or set the node id and its size of CMA by specifying "numa_cma=
+ <node>:size[,<node>:size]" on the kernel's command line.
comment "Default contiguous memory area size:"
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 6ea80ae42622..88c595e49e34 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -50,6 +50,7 @@
#include <linux/sizes.h>
#include <linux/dma-map-ops.h>
#include <linux/cma.h>
+#include <linux/nospec.h>
#ifdef CONFIG_CMA_SIZE_MBYTES
#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
@@ -96,11 +97,44 @@ static int __init early_cma(char *p)
}
early_param("cma", early_cma);
-#ifdef CONFIG_DMA_PERNUMA_CMA
+#ifdef CONFIG_DMA_NUMA_CMA
+static struct cma *dma_contiguous_numa_area[MAX_NUMNODES];
+static phys_addr_t numa_cma_size[MAX_NUMNODES] __initdata;
static struct cma *dma_contiguous_pernuma_area[MAX_NUMNODES];
static phys_addr_t pernuma_size_bytes __initdata;
+static int __init early_numa_cma(char *p)
+{
+ int nid, count = 0;
+ unsigned long tmp;
+ char *s = p;
+
+ while (*s) {
+ if (sscanf(s, "%lu%n", &tmp, &count) != 1)
+ break;
+
+ if (s[count] == ':') {
+ if (tmp >= MAX_NUMNODES)
+ break;
+ nid = array_index_nospec(tmp, MAX_NUMNODES);
+
+ s += count + 1;
+ tmp = memparse(s, &s);
+ numa_cma_size[nid] = tmp;
+
+ if (*s == ',')
+ s++;
+ else
+ break;
+ } else
+ break;
+ }
+
+ return 0;
+}
+early_param("numa_cma", early_numa_cma);
+
static int __init early_cma_pernuma(char *p)
{
pernuma_size_bytes = memparse(p, &p);
@@ -127,32 +161,49 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void)
#endif
-#ifdef CONFIG_DMA_PERNUMA_CMA
-void __init dma_pernuma_cma_reserve(void)
+#ifdef CONFIG_DMA_NUMA_CMA
+static void __init dma_numa_cma_reserve(void)
{
int nid;
- if (!pernuma_size_bytes)
- return;
-
- for_each_online_node(nid) {
+ for_each_node(nid) {
int ret;
char name[CMA_MAX_NAME];
- struct cma **cma = &dma_contiguous_pernuma_area[nid];
-
- snprintf(name, sizeof(name), "pernuma%d", nid);
- ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0,
- 0, false, name, cma, nid);
- if (ret) {
- pr_warn("%s: reservation failed: err %d, node %d", __func__,
- ret, nid);
+ struct cma **cma;
+
+ if (!node_online(nid)) {
+ if (pernuma_size_bytes || numa_cma_size[nid])
+ pr_warn("invalid node %d specified\n", nid);
continue;
}
- pr_debug("%s: reserved %llu MiB on node %d\n", __func__,
- (unsigned long long)pernuma_size_bytes / SZ_1M, nid);
+ if (pernuma_size_bytes) {
+
+ cma = &dma_contiguous_pernuma_area[nid];
+ snprintf(name, sizeof(name), "pernuma%d", nid);
+ ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0,
+ 0, false, name, cma, nid);
+ if (ret)
+ pr_warn("%s: reservation failed: err %d, node %d", __func__,
+ ret, nid);
+ }
+
+ if (numa_cma_size[nid]) {
+
+ cma = &dma_contiguous_numa_area[nid];
+ snprintf(name, sizeof(name), "numa%d", nid);
+ ret = cma_declare_contiguous_nid(0, numa_cma_size[nid], 0, 0, 0, false,
+ name, cma, nid);
+ if (ret)
+ pr_warn("%s: reservation failed: err %d, node %d", __func__,
+ ret, nid);
+ }
}
}
+#else
+static inline void __init dma_numa_cma_reserve(void)
+{
+}
#endif
/**
@@ -171,6 +222,8 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
phys_addr_t selected_limit = limit;
bool fixed = false;
+ dma_numa_cma_reserve();
+
pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
if (size_cmdline != -1) {
@@ -303,7 +356,7 @@ static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp)
*/
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
{
-#ifdef CONFIG_DMA_PERNUMA_CMA
+#ifdef CONFIG_DMA_NUMA_CMA
int nid = dev_to_node(dev);
#endif
@@ -315,7 +368,7 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
if (size <= PAGE_SIZE)
return NULL;
-#ifdef CONFIG_DMA_PERNUMA_CMA
+#ifdef CONFIG_DMA_NUMA_CMA
if (nid != NUMA_NO_NODE && !(gfp & (GFP_DMA | GFP_DMA32))) {
struct cma *cma = dma_contiguous_pernuma_area[nid];
struct page *page;
@@ -325,6 +378,13 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
if (page)
return page;
}
+
+ cma = dma_contiguous_numa_area[nid];
+ if (cma) {
+ page = cma_alloc_aligned(cma, size, gfp);
+ if (page)
+ return page;
+ }
}
#endif
if (!dma_contiguous_default_area)
@@ -356,10 +416,13 @@ void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
/*
* otherwise, page is from either per-numa cma or default cma
*/
-#ifdef CONFIG_DMA_PERNUMA_CMA
+#ifdef CONFIG_DMA_NUMA_CMA
if (cma_release(dma_contiguous_pernuma_area[page_to_nid(page)],
page, count))
return;
+ if (cma_release(dma_contiguous_numa_area[page_to_nid(page)],
+ page, count))
+ return;
#endif
if (cma_release(dma_contiguous_default_area, page, count))
return;
@@ -410,6 +473,11 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
return -EBUSY;
}
+ if (memblock_is_region_reserved(rmem->base, rmem->size)) {
+ pr_info("Reserved memory: overlap with other memblock reserved region\n");
+ return -EBUSY;
+ }
+
if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 5595d1d5cdcc..9596ae1aa0da 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -66,7 +66,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
return 0;
}
-static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
+bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
{
dma_addr_t dma_addr = phys_to_dma_direct(dev, phys);
@@ -463,7 +463,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
int i;
for_each_sg(sgl, sg, nents, i) {
- if (sg_is_dma_bus_address(sg))
+ if (sg_dma_is_bus_address(sg))
sg_dma_unmark_bus_address(sg);
else
dma_direct_unmap_page(dev, sg->dma_address,
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index e38ffc5e6bdd..97ec892ea0b5 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -94,7 +94,8 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
return swiotlb_map(dev, phys, size, dir, attrs);
}
- if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
+ if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
+ dma_kmalloc_needs_bounce(dev, size, dir)) {
if (is_pci_p2pdma_page(page))
return DMA_MAPPING_ERROR;
if (is_swiotlb_active(dev))
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 9a4db5cce600..e323ca48f7f2 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -760,12 +760,6 @@ bool dma_pci_p2pdma_supported(struct device *dev)
}
EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported);
-#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
-void arch_dma_set_mask(struct device *dev, u64 mask);
-#else
-#define arch_dma_set_mask(dev, mask) do { } while (0)
-#endif
-
int dma_set_mask(struct device *dev, u64 mask)
{
/*
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index b4526668072e..27596f3b4aef 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -43,13 +43,13 @@ void *dma_common_contiguous_remap(struct page *page, size_t size,
void *vaddr;
int i;
- pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
+ pages = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return NULL;
for (i = 0; i < count; i++)
pages[i] = nth_page(page, i);
vaddr = vmap(pages, count, VM_DMA_COHERENT, prot);
- kfree(pages);
+ kvfree(pages);
return vaddr;
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index af2e304c672c..394494a6b1f3 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -35,6 +35,7 @@
#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/pfn.h>
+#include <linux/rculist.h>
#include <linux/scatterlist.h>
#include <linux/set_memory.h>
#include <linux/spinlock.h>
@@ -62,6 +63,13 @@
#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+/**
+ * struct io_tlb_slot - IO TLB slot descriptor
+ * @orig_addr: The original address corresponding to a mapped entry.
+ * @alloc_size: Size of the allocated buffer.
+ * @list: The free list describing the number of free entries available
+ * from each index.
+ */
struct io_tlb_slot {
phys_addr_t orig_addr;
size_t alloc_size;
@@ -71,7 +79,22 @@ struct io_tlb_slot {
static bool swiotlb_force_bounce;
static bool swiotlb_force_disable;
-struct io_tlb_mem io_tlb_default_mem;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+static void swiotlb_dyn_alloc(struct work_struct *work);
+
+static struct io_tlb_mem io_tlb_default_mem = {
+ .lock = __SPIN_LOCK_UNLOCKED(io_tlb_default_mem.lock),
+ .pools = LIST_HEAD_INIT(io_tlb_default_mem.pools),
+ .dyn_alloc = __WORK_INITIALIZER(io_tlb_default_mem.dyn_alloc,
+ swiotlb_dyn_alloc),
+};
+
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+static struct io_tlb_mem io_tlb_default_mem;
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
static unsigned long default_nareas;
@@ -115,9 +138,16 @@ static bool round_up_default_nslabs(void)
return true;
}
+/**
+ * swiotlb_adjust_nareas() - adjust the number of areas and slots
+ * @nareas: Desired number of areas. Zero is treated as 1.
+ *
+ * Adjust the default number of areas in a memory pool.
+ * The default size of the memory pool may also change to meet minimum area
+ * size requirements.
+ */
static void swiotlb_adjust_nareas(unsigned int nareas)
{
- /* use a single area when non is specified */
if (!nareas)
nareas = 1;
else if (!is_power_of_2(nareas))
@@ -131,6 +161,23 @@ static void swiotlb_adjust_nareas(unsigned int nareas)
(default_nslabs << IO_TLB_SHIFT) >> 20);
}
+/**
+ * limit_nareas() - get the maximum number of areas for a given memory pool size
+ * @nareas: Desired number of areas.
+ * @nslots: Total number of slots in the memory pool.
+ *
+ * Limit the number of areas to the maximum possible number of areas in
+ * a memory pool of the given size.
+ *
+ * Return: Maximum possible number of areas.
+ */
+static unsigned int limit_nareas(unsigned int nareas, unsigned long nslots)
+{
+ if (nslots < nareas * IO_TLB_SEGSIZE)
+ return nslots / IO_TLB_SEGSIZE;
+ return nareas;
+}
+
static int __init
setup_io_tlb_npages(char *str)
{
@@ -178,7 +225,7 @@ void __init swiotlb_adjust_size(unsigned long size)
void swiotlb_print_info(void)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
if (!mem->nslabs) {
pr_warn("No low mem\n");
@@ -207,7 +254,7 @@ static inline unsigned long nr_slots(u64 val)
*/
void __init swiotlb_update_mem_attributes(void)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
unsigned long bytes;
if (!mem->nslabs || mem->late_alloc)
@@ -216,9 +263,8 @@ void __init swiotlb_update_mem_attributes(void)
set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
}
-static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
- unsigned long nslabs, unsigned int flags,
- bool late_alloc, unsigned int nareas)
+static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start,
+ unsigned long nslabs, bool late_alloc, unsigned int nareas)
{
void *vaddr = phys_to_virt(start);
unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
@@ -230,8 +276,6 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->nareas = nareas;
mem->area_nslabs = nslabs / mem->nareas;
- mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
-
for (i = 0; i < mem->nareas; i++) {
spin_lock_init(&mem->areas[i].lock);
mem->areas[i].index = 0;
@@ -249,6 +293,23 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
return;
}
+/**
+ * add_mem_pool() - add a memory pool to the allocator
+ * @mem: Software IO TLB allocator.
+ * @pool: Memory pool to be added.
+ */
+static void add_mem_pool(struct io_tlb_mem *mem, struct io_tlb_pool *pool)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ spin_lock(&mem->lock);
+ list_add_rcu(&pool->node, &mem->pools);
+ mem->nslabs += pool->nslabs;
+ spin_unlock(&mem->lock);
+#else
+ mem->nslabs = pool->nslabs;
+#endif
+}
+
static void __init *swiotlb_memblock_alloc(unsigned long nslabs,
unsigned int flags,
int (*remap)(void *tlb, unsigned long nslabs))
@@ -288,8 +349,9 @@ static void __init *swiotlb_memblock_alloc(unsigned long nslabs,
void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
int (*remap)(void *tlb, unsigned long nslabs))
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
unsigned long nslabs;
+ unsigned int nareas;
size_t alloc_size;
void *tlb;
@@ -298,18 +360,28 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
if (swiotlb_force_disable)
return;
- /*
- * default_nslabs maybe changed when adjust area number.
- * So allocate bounce buffer after adjusting area number.
- */
+ io_tlb_default_mem.force_bounce =
+ swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ if (!remap)
+ io_tlb_default_mem.can_grow = true;
+ if (flags & SWIOTLB_ANY)
+ io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
+ else
+ io_tlb_default_mem.phys_limit = ARCH_LOW_ADDRESS_LIMIT;
+#endif
+
if (!default_nareas)
swiotlb_adjust_nareas(num_possible_cpus());
nslabs = default_nslabs;
+ nareas = limit_nareas(default_nareas, nslabs);
while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) {
if (nslabs <= IO_TLB_MIN_SLABS)
return;
nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ nareas = limit_nareas(nareas, nslabs);
}
if (default_nslabs != nslabs) {
@@ -333,8 +405,9 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
return;
}
- swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false,
- default_nareas);
+ swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false,
+ default_nareas);
+ add_mem_pool(&io_tlb_default_mem, mem);
if (flags & SWIOTLB_VERBOSE)
swiotlb_print_info();
@@ -353,16 +426,36 @@ void __init swiotlb_init(bool addressing_limit, unsigned int flags)
int swiotlb_init_late(size_t size, gfp_t gfp_mask,
int (*remap)(void *tlb, unsigned long nslabs))
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+ unsigned int nareas;
unsigned char *vstart = NULL;
unsigned int order, area_order;
bool retried = false;
int rc = 0;
+ if (io_tlb_default_mem.nslabs)
+ return 0;
+
if (swiotlb_force_disable)
return 0;
+ io_tlb_default_mem.force_bounce = swiotlb_force_bounce;
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ if (!remap)
+ io_tlb_default_mem.can_grow = true;
+ if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
+ io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
+ else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
+ io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
+ else
+ io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
+#endif
+
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
+
retry:
order = get_order(nslabs << IO_TLB_SHIFT);
nslabs = SLABS_PER_PAGE << order;
@@ -397,11 +490,8 @@ retry:
(PAGE_SIZE << order) >> 20);
}
- if (!default_nareas)
- swiotlb_adjust_nareas(num_possible_cpus());
-
- area_order = get_order(array_size(sizeof(*mem->areas),
- default_nareas));
+ nareas = limit_nareas(default_nareas, nslabs);
+ area_order = get_order(array_size(sizeof(*mem->areas), nareas));
mem->areas = (struct io_tlb_area *)
__get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order);
if (!mem->areas)
@@ -414,8 +504,9 @@ retry:
set_memory_decrypted((unsigned long)vstart,
(nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
- swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true,
- default_nareas);
+ swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true,
+ nareas);
+ add_mem_pool(&io_tlb_default_mem, mem);
swiotlb_print_info();
return 0;
@@ -429,7 +520,7 @@ error_area:
void __init swiotlb_exit(void)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
unsigned long tbl_vaddr;
size_t tbl_size, slots_size;
unsigned int area_order;
@@ -462,6 +553,265 @@ void __init swiotlb_exit(void)
memset(mem, 0, sizeof(*mem));
}
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * alloc_dma_pages() - allocate pages to be used for DMA
+ * @gfp: GFP flags for the allocation.
+ * @bytes: Size of the buffer.
+ *
+ * Allocate pages from the buddy allocator. If successful, make the allocated
+ * pages decrypted that they can be used for DMA.
+ *
+ * Return: Decrypted pages, or %NULL on failure.
+ */
+static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes)
+{
+ unsigned int order = get_order(bytes);
+ struct page *page;
+ void *vaddr;
+
+ page = alloc_pages(gfp, order);
+ if (!page)
+ return NULL;
+
+ vaddr = page_address(page);
+ if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ goto error;
+ return page;
+
+error:
+ __free_pages(page, order);
+ return NULL;
+}
+
+/**
+ * swiotlb_alloc_tlb() - allocate a dynamic IO TLB buffer
+ * @dev: Device for which a memory pool is allocated.
+ * @bytes: Size of the buffer.
+ * @phys_limit: Maximum allowed physical address of the buffer.
+ * @gfp: GFP flags for the allocation.
+ *
+ * Return: Allocated pages, or %NULL on allocation failure.
+ */
+static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
+ u64 phys_limit, gfp_t gfp)
+{
+ struct page *page;
+
+ /*
+ * Allocate from the atomic pools if memory is encrypted and
+ * the allocation is atomic, because decrypting may block.
+ */
+ if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) {
+ void *vaddr;
+
+ if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
+ return NULL;
+
+ return dma_alloc_from_pool(dev, bytes, &vaddr, gfp,
+ dma_coherent_ok);
+ }
+
+ gfp &= ~GFP_ZONEMASK;
+ if (phys_limit <= DMA_BIT_MASK(zone_dma_bits))
+ gfp |= __GFP_DMA;
+ else if (phys_limit <= DMA_BIT_MASK(32))
+ gfp |= __GFP_DMA32;
+
+ while ((page = alloc_dma_pages(gfp, bytes)) &&
+ page_to_phys(page) + bytes - 1 > phys_limit) {
+ /* allocated, but too high */
+ __free_pages(page, get_order(bytes));
+
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
+ phys_limit < DMA_BIT_MASK(64) &&
+ !(gfp & (__GFP_DMA32 | __GFP_DMA)))
+ gfp |= __GFP_DMA32;
+ else if (IS_ENABLED(CONFIG_ZONE_DMA) &&
+ !(gfp & __GFP_DMA))
+ gfp = (gfp & ~__GFP_DMA32) | __GFP_DMA;
+ else
+ return NULL;
+ }
+
+ return page;
+}
+
+/**
+ * swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer
+ * @vaddr: Virtual address of the buffer.
+ * @bytes: Size of the buffer.
+ */
+static void swiotlb_free_tlb(void *vaddr, size_t bytes)
+{
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+ dma_free_from_pool(NULL, vaddr, bytes))
+ return;
+
+ /* Intentional leak if pages cannot be encrypted again. */
+ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ __free_pages(virt_to_page(vaddr), get_order(bytes));
+}
+
+/**
+ * swiotlb_alloc_pool() - allocate a new IO TLB memory pool
+ * @dev: Device for which a memory pool is allocated.
+ * @minslabs: Minimum number of slabs.
+ * @nslabs: Desired (maximum) number of slabs.
+ * @nareas: Number of areas.
+ * @phys_limit: Maximum DMA buffer physical address.
+ * @gfp: GFP flags for the allocations.
+ *
+ * Allocate and initialize a new IO TLB memory pool. The actual number of
+ * slabs may be reduced if allocation of @nslabs fails. If even
+ * @minslabs cannot be allocated, this function fails.
+ *
+ * Return: New memory pool, or %NULL on allocation failure.
+ */
+static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
+ unsigned long minslabs, unsigned long nslabs,
+ unsigned int nareas, u64 phys_limit, gfp_t gfp)
+{
+ struct io_tlb_pool *pool;
+ unsigned int slot_order;
+ struct page *tlb;
+ size_t pool_size;
+ size_t tlb_size;
+
+ pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas);
+ pool = kzalloc(pool_size, gfp);
+ if (!pool)
+ goto error;
+ pool->areas = (void *)pool + sizeof(*pool);
+
+ tlb_size = nslabs << IO_TLB_SHIFT;
+ while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) {
+ if (nslabs <= minslabs)
+ goto error_tlb;
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ nareas = limit_nareas(nareas, nslabs);
+ tlb_size = nslabs << IO_TLB_SHIFT;
+ }
+
+ slot_order = get_order(array_size(sizeof(*pool->slots), nslabs));
+ pool->slots = (struct io_tlb_slot *)
+ __get_free_pages(gfp, slot_order);
+ if (!pool->slots)
+ goto error_slots;
+
+ swiotlb_init_io_tlb_pool(pool, page_to_phys(tlb), nslabs, true, nareas);
+ return pool;
+
+error_slots:
+ swiotlb_free_tlb(page_address(tlb), tlb_size);
+error_tlb:
+ kfree(pool);
+error:
+ return NULL;
+}
+
+/**
+ * swiotlb_dyn_alloc() - dynamic memory pool allocation worker
+ * @work: Pointer to dyn_alloc in struct io_tlb_mem.
+ */
+static void swiotlb_dyn_alloc(struct work_struct *work)
+{
+ struct io_tlb_mem *mem =
+ container_of(work, struct io_tlb_mem, dyn_alloc);
+ struct io_tlb_pool *pool;
+
+ pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs,
+ default_nareas, mem->phys_limit, GFP_KERNEL);
+ if (!pool) {
+ pr_warn_ratelimited("Failed to allocate new pool");
+ return;
+ }
+
+ add_mem_pool(mem, pool);
+
+ /* Pairs with smp_rmb() in is_swiotlb_buffer(). */
+ smp_wmb();
+}
+
+/**
+ * swiotlb_dyn_free() - RCU callback to free a memory pool
+ * @rcu: RCU head in the corresponding struct io_tlb_pool.
+ */
+static void swiotlb_dyn_free(struct rcu_head *rcu)
+{
+ struct io_tlb_pool *pool = container_of(rcu, struct io_tlb_pool, rcu);
+ size_t slots_size = array_size(sizeof(*pool->slots), pool->nslabs);
+ size_t tlb_size = pool->end - pool->start;
+
+ free_pages((unsigned long)pool->slots, get_order(slots_size));
+ swiotlb_free_tlb(pool->vaddr, tlb_size);
+ kfree(pool);
+}
+
+/**
+ * swiotlb_find_pool() - find the IO TLB pool for a physical address
+ * @dev: Device which has mapped the DMA buffer.
+ * @paddr: Physical address within the DMA buffer.
+ *
+ * Find the IO TLB memory pool descriptor which contains the given physical
+ * address, if any.
+ *
+ * Return: Memory pool which contains @paddr, or %NULL if none.
+ */
+struct io_tlb_pool *swiotlb_find_pool(struct device *dev, phys_addr_t paddr)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node) {
+ if (paddr >= pool->start && paddr < pool->end)
+ goto out;
+ }
+
+ list_for_each_entry_rcu(pool, &dev->dma_io_tlb_pools, node) {
+ if (paddr >= pool->start && paddr < pool->end)
+ goto out;
+ }
+ pool = NULL;
+out:
+ rcu_read_unlock();
+ return pool;
+}
+
+/**
+ * swiotlb_del_pool() - remove an IO TLB pool from a device
+ * @dev: Owning device.
+ * @pool: Memory pool to be removed.
+ */
+static void swiotlb_del_pool(struct device *dev, struct io_tlb_pool *pool)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->dma_io_tlb_lock, flags);
+ list_del_rcu(&pool->node);
+ spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
+
+ call_rcu(&pool->rcu, swiotlb_dyn_free);
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+/**
+ * swiotlb_dev_init() - initialize swiotlb fields in &struct device
+ * @dev: Device to be initialized.
+ */
+void swiotlb_dev_init(struct device *dev)
+{
+ dev->dma_io_tlb_mem = &io_tlb_default_mem;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ INIT_LIST_HEAD(&dev->dma_io_tlb_pools);
+ spin_lock_init(&dev->dma_io_tlb_lock);
+ dev->dma_uses_io_tlb = false;
+#endif
+}
+
/*
* Return the offset into a iotlb slot required to keep the device happy.
*/
@@ -476,7 +826,7 @@ static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
enum dma_data_direction dir)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr);
int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = mem->slots[index].orig_addr;
size_t alloc_size = mem->slots[index].alloc_size;
@@ -553,12 +903,10 @@ static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx)
*/
static inline unsigned long get_max_slots(unsigned long boundary_mask)
{
- if (boundary_mask == ~0UL)
- return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
- return nr_slots(boundary_mask + 1);
+ return (boundary_mask >> IO_TLB_SHIFT) + 1;
}
-static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index)
+static unsigned int wrap_area_index(struct io_tlb_pool *mem, unsigned int index)
{
if (index >= mem->area_nslabs)
return 0;
@@ -599,19 +947,30 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
}
#endif /* CONFIG_DEBUG_FS */
-/*
- * Find a suitable number of IO TLB entries size that will fit this request and
- * allocate a buffer from that IO TLB pool.
+/**
+ * swiotlb_area_find_slots() - search for slots in one IO TLB memory area
+ * @dev: Device which maps the buffer.
+ * @pool: Memory pool to be searched.
+ * @area_index: Index of the IO TLB memory area to be searched.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ *
+ * Find a suitable sequence of IO TLB entries for the request and allocate
+ * a buffer from the given IO TLB memory area.
+ * This function takes care of locking.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
*/
-static int swiotlb_do_find_slots(struct device *dev, int area_index,
- phys_addr_t orig_addr, size_t alloc_size,
+static int swiotlb_area_find_slots(struct device *dev, struct io_tlb_pool *pool,
+ int area_index, phys_addr_t orig_addr, size_t alloc_size,
unsigned int alloc_align_mask)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
- struct io_tlb_area *area = mem->areas + area_index;
+ struct io_tlb_area *area = pool->areas + area_index;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
dma_addr_t tbl_dma_addr =
- phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
+ phys_to_dma_unencrypted(dev, pool->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
unsigned int iotlb_align_mask =
dma_get_min_align_mask(dev) | alloc_align_mask;
@@ -623,7 +982,7 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
unsigned int slot_index;
BUG_ON(!nslots);
- BUG_ON(area_index >= mem->nareas);
+ BUG_ON(area_index >= pool->nareas);
/*
* For allocations of PAGE_SIZE or larger only look for page aligned
@@ -640,35 +999,30 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
spin_lock_irqsave(&area->lock, flags);
- if (unlikely(nslots > mem->area_nslabs - area->used))
+ if (unlikely(nslots > pool->area_nslabs - area->used))
goto not_found;
- slot_base = area_index * mem->area_nslabs;
+ slot_base = area_index * pool->area_nslabs;
index = area->index;
- for (slots_checked = 0; slots_checked < mem->area_nslabs; ) {
+ for (slots_checked = 0; slots_checked < pool->area_nslabs; ) {
slot_index = slot_base + index;
if (orig_addr &&
(slot_addr(tbl_dma_addr, slot_index) &
iotlb_align_mask) != (orig_addr & iotlb_align_mask)) {
- index = wrap_area_index(mem, index + 1);
+ index = wrap_area_index(pool, index + 1);
slots_checked++;
continue;
}
- /*
- * If we find a slot that indicates we have 'nslots' number of
- * contiguous buffers, we allocate the buffers from that slot
- * and mark the entries as '0' indicating unavailable.
- */
if (!iommu_is_span_boundary(slot_index, nslots,
nr_slots(tbl_dma_addr),
max_slots)) {
- if (mem->slots[slot_index].list >= nslots)
+ if (pool->slots[slot_index].list >= nslots)
goto found;
}
- index = wrap_area_index(mem, index + stride);
+ index = wrap_area_index(pool, index + stride);
slots_checked += stride;
}
@@ -677,56 +1031,212 @@ not_found:
return -1;
found:
+ /*
+ * If we find a slot that indicates we have 'nslots' number of
+ * contiguous buffers, we allocate the buffers from that slot onwards
+ * and set the list of free entries to '0' indicating unavailable.
+ */
for (i = slot_index; i < slot_index + nslots; i++) {
- mem->slots[i].list = 0;
- mem->slots[i].alloc_size = alloc_size - (offset +
+ pool->slots[i].list = 0;
+ pool->slots[i].alloc_size = alloc_size - (offset +
((i - slot_index) << IO_TLB_SHIFT));
}
for (i = slot_index - 1;
io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
- mem->slots[i].list; i--)
- mem->slots[i].list = ++count;
+ pool->slots[i].list; i--)
+ pool->slots[i].list = ++count;
/*
* Update the indices to avoid searching in the next round.
*/
- area->index = wrap_area_index(mem, index + nslots);
+ area->index = wrap_area_index(pool, index + nslots);
area->used += nslots;
spin_unlock_irqrestore(&area->lock, flags);
- inc_used_and_hiwater(mem, nslots);
+ inc_used_and_hiwater(dev->dma_io_tlb_mem, nslots);
return slot_index;
}
-static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
- size_t alloc_size, unsigned int alloc_align_mask)
+/**
+ * swiotlb_pool_find_slots() - search for slots in one memory pool
+ * @dev: Device which maps the buffer.
+ * @pool: Memory pool to be searched.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ *
+ * Search through one memory pool to find a sequence of slots that match the
+ * allocation constraints.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
+ */
+static int swiotlb_pool_find_slots(struct device *dev, struct io_tlb_pool *pool,
+ phys_addr_t orig_addr, size_t alloc_size,
+ unsigned int alloc_align_mask)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
- int start = raw_smp_processor_id() & (mem->nareas - 1);
+ int start = raw_smp_processor_id() & (pool->nareas - 1);
int i = start, index;
do {
- index = swiotlb_do_find_slots(dev, i, orig_addr, alloc_size,
- alloc_align_mask);
+ index = swiotlb_area_find_slots(dev, pool, i, orig_addr,
+ alloc_size, alloc_align_mask);
if (index >= 0)
return index;
- if (++i >= mem->nareas)
+ if (++i >= pool->nareas)
i = 0;
} while (i != start);
return -1;
}
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * swiotlb_find_slots() - search for slots in the whole swiotlb
+ * @dev: Device which maps the buffer.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ * @retpool: Used memory pool, updated on return.
+ *
+ * Search through the whole software IO TLB to find a sequence of slots that
+ * match the allocation constraints.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
+ */
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size, unsigned int alloc_align_mask,
+ struct io_tlb_pool **retpool)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+ unsigned long nslabs;
+ unsigned long flags;
+ u64 phys_limit;
+ int index;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node) {
+ index = swiotlb_pool_find_slots(dev, pool, orig_addr,
+ alloc_size, alloc_align_mask);
+ if (index >= 0) {
+ rcu_read_unlock();
+ goto found;
+ }
+ }
+ rcu_read_unlock();
+ if (!mem->can_grow)
+ return -1;
+
+ schedule_work(&mem->dyn_alloc);
+
+ nslabs = nr_slots(alloc_size);
+ phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
+ pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit,
+ GFP_NOWAIT | __GFP_NOWARN);
+ if (!pool)
+ return -1;
+
+ index = swiotlb_pool_find_slots(dev, pool, orig_addr,
+ alloc_size, alloc_align_mask);
+ if (index < 0) {
+ swiotlb_dyn_free(&pool->rcu);
+ return -1;
+ }
+
+ pool->transient = true;
+ spin_lock_irqsave(&dev->dma_io_tlb_lock, flags);
+ list_add_rcu(&pool->node, &dev->dma_io_tlb_pools);
+ spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
+
+found:
+ dev->dma_uses_io_tlb = true;
+ /* Pairs with smp_rmb() in is_swiotlb_buffer() */
+ smp_wmb();
+
+ *retpool = pool;
+ return index;
+}
+
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size, unsigned int alloc_align_mask,
+ struct io_tlb_pool **retpool)
+{
+ *retpool = &dev->dma_io_tlb_mem->defpool;
+ return swiotlb_pool_find_slots(dev, *retpool,
+ orig_addr, alloc_size, alloc_align_mask);
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+#ifdef CONFIG_DEBUG_FS
+
+/**
+ * mem_used() - get number of used slots in an allocator
+ * @mem: Software IO TLB allocator.
+ *
+ * The result is accurate in this version of the function, because an atomic
+ * counter is available if CONFIG_DEBUG_FS is set.
+ *
+ * Return: Number of used slots.
+ */
static unsigned long mem_used(struct io_tlb_mem *mem)
{
+ return atomic_long_read(&mem->total_used);
+}
+
+#else /* !CONFIG_DEBUG_FS */
+
+/**
+ * mem_pool_used() - get number of used slots in a memory pool
+ * @pool: Software IO TLB memory pool.
+ *
+ * The result is not accurate, see mem_used().
+ *
+ * Return: Approximate number of used slots.
+ */
+static unsigned long mem_pool_used(struct io_tlb_pool *pool)
+{
int i;
unsigned long used = 0;
- for (i = 0; i < mem->nareas; i++)
- used += mem->areas[i].used;
+ for (i = 0; i < pool->nareas; i++)
+ used += pool->areas[i].used;
return used;
}
+/**
+ * mem_used() - get number of used slots in an allocator
+ * @mem: Software IO TLB allocator.
+ *
+ * The result is not accurate, because there is no locking of individual
+ * areas.
+ *
+ * Return: Approximate number of used slots.
+ */
+static unsigned long mem_used(struct io_tlb_mem *mem)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ struct io_tlb_pool *pool;
+ unsigned long used = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node)
+ used += mem_pool_used(pool);
+ rcu_read_unlock();
+
+ return used;
+#else
+ return mem_pool_used(&mem->defpool);
+#endif
+}
+
+#endif /* CONFIG_DEBUG_FS */
+
phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
size_t mapping_size, size_t alloc_size,
unsigned int alloc_align_mask, enum dma_data_direction dir,
@@ -734,6 +1244,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ struct io_tlb_pool *pool;
unsigned int i;
int index;
phys_addr_t tlb_addr;
@@ -754,7 +1265,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
}
index = swiotlb_find_slots(dev, orig_addr,
- alloc_size + offset, alloc_align_mask);
+ alloc_size + offset, alloc_align_mask, &pool);
if (index == -1) {
if (!(attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
@@ -769,8 +1280,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
* needed.
*/
for (i = 0; i < nr_slots(alloc_size + offset); i++)
- mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
- tlb_addr = slot_addr(mem->start, index) + offset;
+ pool->slots[index + i].orig_addr = slot_addr(orig_addr, i);
+ tlb_addr = slot_addr(pool->start, index) + offset;
/*
* When dir == DMA_FROM_DEVICE we could omit the copy from the orig
* to the tlb buffer, if we knew for sure the device will
@@ -784,7 +1295,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr);
unsigned long flags;
unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
@@ -828,9 +1339,44 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
area->used -= nslots;
spin_unlock_irqrestore(&area->lock, flags);
- dec_used(mem, nslots);
+ dec_used(dev->dma_io_tlb_mem, nslots);
+}
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * swiotlb_del_transient() - delete a transient memory pool
+ * @dev: Device which mapped the buffer.
+ * @tlb_addr: Physical address within a bounce buffer.
+ *
+ * Check whether the address belongs to a transient SWIOTLB memory pool.
+ * If yes, then delete the pool.
+ *
+ * Return: %true if @tlb_addr belonged to a transient pool that was released.
+ */
+static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr)
+{
+ struct io_tlb_pool *pool;
+
+ pool = swiotlb_find_pool(dev, tlb_addr);
+ if (!pool->transient)
+ return false;
+
+ dec_used(dev->dma_io_tlb_mem, pool->nslabs);
+ swiotlb_del_pool(dev, pool);
+ return true;
+}
+
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+static inline bool swiotlb_del_transient(struct device *dev,
+ phys_addr_t tlb_addr)
+{
+ return false;
}
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
/*
* tlb_addr is the physical address of the bounce buffer to unmap.
*/
@@ -845,6 +1391,8 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+ if (swiotlb_del_transient(dev, tlb_addr))
+ return;
swiotlb_release_slots(dev, tlb_addr);
}
@@ -915,13 +1463,47 @@ size_t swiotlb_max_mapping_size(struct device *dev)
return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align;
}
+/**
+ * is_swiotlb_allocated() - check if the default software IO TLB is initialized
+ */
+bool is_swiotlb_allocated(void)
+{
+ return io_tlb_default_mem.nslabs;
+}
+
bool is_swiotlb_active(struct device *dev)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
return mem && mem->nslabs;
}
-EXPORT_SYMBOL_GPL(is_swiotlb_active);
+
+/**
+ * default_swiotlb_base() - get the base address of the default SWIOTLB
+ *
+ * Get the lowest physical address used by the default software IO TLB pool.
+ */
+phys_addr_t default_swiotlb_base(void)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ io_tlb_default_mem.can_grow = false;
+#endif
+ return io_tlb_default_mem.defpool.start;
+}
+
+/**
+ * default_swiotlb_limit() - get the address limit of the default SWIOTLB
+ *
+ * Get the highest physical address used by the default software IO TLB pool.
+ */
+phys_addr_t default_swiotlb_limit(void)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ return io_tlb_default_mem.phys_limit;
+#else
+ return io_tlb_default_mem.defpool.end - 1;
+#endif
+}
#ifdef CONFIG_DEBUG_FS
@@ -996,17 +1578,18 @@ static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
struct page *swiotlb_alloc(struct device *dev, size_t size)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
phys_addr_t tlb_addr;
int index;
if (!mem)
return NULL;
- index = swiotlb_find_slots(dev, 0, size, 0);
+ index = swiotlb_find_slots(dev, 0, size, 0, &pool);
if (index == -1)
return NULL;
- tlb_addr = slot_addr(mem->start, index);
+ tlb_addr = slot_addr(pool->start, index);
return pfn_to_page(PFN_DOWN(tlb_addr));
}
@@ -1043,29 +1626,37 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
* to it.
*/
if (!mem) {
+ struct io_tlb_pool *pool;
+
mem = kzalloc(sizeof(*mem), GFP_KERNEL);
if (!mem)
return -ENOMEM;
+ pool = &mem->defpool;
- mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL);
- if (!mem->slots) {
+ pool->slots = kcalloc(nslabs, sizeof(*pool->slots), GFP_KERNEL);
+ if (!pool->slots) {
kfree(mem);
return -ENOMEM;
}
- mem->areas = kcalloc(nareas, sizeof(*mem->areas),
+ pool->areas = kcalloc(nareas, sizeof(*pool->areas),
GFP_KERNEL);
- if (!mem->areas) {
- kfree(mem->slots);
+ if (!pool->areas) {
+ kfree(pool->slots);
kfree(mem);
return -ENOMEM;
}
set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
rmem->size >> PAGE_SHIFT);
- swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE,
- false, nareas);
+ swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs,
+ false, nareas);
+ mem->force_bounce = true;
mem->for_alloc = true;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ spin_lock_init(&mem->lock);
+#endif
+ add_mem_pool(mem, pool);
rmem->priv = mem;
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index be61332c66b5..d7ee4bc3f2ba 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -205,8 +205,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
arch_exit_to_user_mode_prepare(regs, ti_work);
- /* Ensure that the address limit is intact and no locks are held */
- addr_limit_user_check();
+ /* Ensure that kernel state is sane for a return to userspace */
kmap_assert_nomap();
lockdep_assert_irqs_disabled();
lockdep_sys_exit();
diff --git a/kernel/events/core.c b/kernel/events/core.c
index db016e418931..4c72a41f11af 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6647,7 +6647,7 @@ static void perf_sigtrap(struct perf_event *event)
return;
send_sig_perf((void __user *)event->pending_addr,
- event->attr.type, event->attr.sig_data);
+ event->orig_type, event->attr.sig_data);
}
/*
@@ -7490,6 +7490,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
return pud_leaf_size(pud);
pmdp = pmd_offset_lockless(pudp, pud, addr);
+again:
pmd = pmdp_get_lockless(pmdp);
if (!pmd_present(pmd))
return 0;
@@ -7498,6 +7499,9 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
return pmd_leaf_size(pmd);
ptep = pte_offset_map(&pmd, addr);
+ if (!ptep)
+ goto again;
+
pte = ptep_get_lockless(ptep);
if (pte_present(pte))
size = pte_leaf_size(pte);
@@ -8245,7 +8249,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
unsigned int size;
memset(comm, 0, sizeof(comm));
- strlcpy(comm, comm_event->task->comm, sizeof(comm));
+ strscpy(comm, comm_event->task->comm, sizeof(comm));
size = ALIGN(strlen(comm)+1, sizeof(u64));
comm_event->comm = comm;
@@ -8627,7 +8631,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
unsigned int size;
char tmp[16];
char *buf = NULL;
- char *name;
+ char *name = NULL;
if (vma->vm_flags & VM_READ)
prot |= PROT_READ;
@@ -8674,33 +8678,22 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
goto got_name;
} else {
- if (vma->vm_ops && vma->vm_ops->name) {
+ if (vma->vm_ops && vma->vm_ops->name)
name = (char *) vma->vm_ops->name(vma);
- if (name)
- goto cpy_name;
- }
-
- name = (char *)arch_vma_name(vma);
- if (name)
- goto cpy_name;
-
- if (vma->vm_start <= vma->vm_mm->start_brk &&
- vma->vm_end >= vma->vm_mm->brk) {
- name = "[heap]";
- goto cpy_name;
- }
- if (vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack) {
- name = "[stack]";
- goto cpy_name;
+ if (!name)
+ name = (char *)arch_vma_name(vma);
+ if (!name) {
+ if (vma_is_initial_heap(vma))
+ name = "[heap]";
+ else if (vma_is_initial_stack(vma))
+ name = "[stack]";
+ else
+ name = "//anon";
}
-
- name = "//anon";
- goto cpy_name;
}
cpy_name:
- strlcpy(tmp, name, sizeof(tmp));
+ strscpy(tmp, name, sizeof(tmp));
name = tmp;
got_name:
/*
@@ -9124,7 +9117,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
goto err;
- strlcpy(name, sym, KSYM_NAME_LEN);
+ strscpy(name, sym, KSYM_NAME_LEN);
name_len = strlen(name) + 1;
while (!IS_ALIGNED(name_len, sizeof(u64)))
name[name_len++] = '\0';
@@ -9591,16 +9584,16 @@ u64 perf_swevent_set_period(struct perf_event *event)
hwc->last_period = hwc->sample_period;
-again:
- old = val = local64_read(&hwc->period_left);
- if (val < 0)
- return 0;
+ old = local64_read(&hwc->period_left);
+ do {
+ val = old;
+ if (val < 0)
+ return 0;
- nr = div64_u64(period + val, period);
- offset = nr * period;
- val -= offset;
- if (local64_cmpxchg(&hwc->period_left, old, val) != old)
- goto again;
+ nr = div64_u64(period + val, period);
+ offset = nr * period;
+ val -= offset;
+ } while (!local64_try_cmpxchg(&hwc->period_left, &old, val));
return nr;
}
@@ -9951,6 +9944,9 @@ static void sw_perf_event_destroy(struct perf_event *event)
swevent_hlist_put();
}
+static struct pmu perf_cpu_clock; /* fwd declaration */
+static struct pmu perf_task_clock;
+
static int perf_swevent_init(struct perf_event *event)
{
u64 event_id = event->attr.config;
@@ -9966,7 +9962,10 @@ static int perf_swevent_init(struct perf_event *event)
switch (event_id) {
case PERF_COUNT_SW_CPU_CLOCK:
+ event->attr.type = perf_cpu_clock.type;
+ return -ENOENT;
case PERF_COUNT_SW_TASK_CLOCK:
+ event->attr.type = perf_task_clock.type;
return -ENOENT;
default:
@@ -11098,7 +11097,7 @@ static void cpu_clock_event_read(struct perf_event *event)
static int cpu_clock_event_init(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
+ if (event->attr.type != perf_cpu_clock.type)
return -ENOENT;
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
@@ -11119,6 +11118,7 @@ static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
+ .dev = PMU_NULL_DEV,
.event_init = cpu_clock_event_init,
.add = cpu_clock_event_add,
@@ -11179,7 +11179,7 @@ static void task_clock_event_read(struct perf_event *event)
static int task_clock_event_init(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
+ if (event->attr.type != perf_task_clock.type)
return -ENOENT;
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
@@ -11200,6 +11200,7 @@ static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
+ .dev = PMU_NULL_DEV,
.event_init = task_clock_event_init,
.add = task_clock_event_add,
@@ -11379,6 +11380,7 @@ static int pmu_dev_alloc(struct pmu *pmu)
dev_set_drvdata(pmu->dev, pmu);
pmu->dev->bus = &pmu_bus;
+ pmu->dev->parent = pmu->parent;
pmu->dev->release = pmu_dev_release;
ret = dev_set_name(pmu->dev, "%s", pmu->name);
@@ -11427,31 +11429,31 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto unlock;
pmu->type = -1;
- if (!name)
- goto skip_type;
+ if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) {
+ ret = -EINVAL;
+ goto free_pdc;
+ }
+
pmu->name = name;
- if (type != PERF_TYPE_SOFTWARE) {
- if (type >= 0)
- max = type;
+ if (type >= 0)
+ max = type;
- ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
- if (ret < 0)
- goto free_pdc;
+ ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
+ if (ret < 0)
+ goto free_pdc;
- WARN_ON(type >= 0 && ret != type);
+ WARN_ON(type >= 0 && ret != type);
- type = ret;
- }
+ type = ret;
pmu->type = type;
- if (pmu_bus_running) {
+ if (pmu_bus_running && !pmu->dev) {
ret = pmu_dev_alloc(pmu);
if (ret)
goto free_idr;
}
-skip_type:
ret = -ENOMEM;
pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
if (!pmu->cpu_pmu_context)
@@ -11493,16 +11495,7 @@ skip_type:
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
- /*
- * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
- * since these cannot be in the IDR. This way the linear search
- * is fast, provided a valid software event is provided.
- */
- if (type == PERF_TYPE_SOFTWARE || !name)
- list_add_rcu(&pmu->entry, &pmus);
- else
- list_add_tail_rcu(&pmu->entry, &pmus);
-
+ list_add_rcu(&pmu->entry, &pmus);
atomic_set(&pmu->exclusive_cnt, 0);
ret = 0;
unlock:
@@ -11511,12 +11504,13 @@ unlock:
return ret;
free_dev:
- device_del(pmu->dev);
- put_device(pmu->dev);
+ if (pmu->dev && pmu->dev != PMU_NULL_DEV) {
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ }
free_idr:
- if (pmu->type != PERF_TYPE_SOFTWARE)
- idr_remove(&pmu_idr, pmu->type);
+ idr_remove(&pmu_idr, pmu->type);
free_pdc:
free_percpu(pmu->pmu_disable_count);
@@ -11537,9 +11531,8 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_rcu();
free_percpu(pmu->pmu_disable_count);
- if (pmu->type != PERF_TYPE_SOFTWARE)
- idr_remove(&pmu_idr, pmu->type);
- if (pmu_bus_running) {
+ idr_remove(&pmu_idr, pmu->type);
+ if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
if (pmu->nr_addr_filters)
device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
device_del(pmu->dev);
@@ -11613,6 +11606,12 @@ static struct pmu *perf_init_event(struct perf_event *event)
idx = srcu_read_lock(&pmus_srcu);
+ /*
+ * Save original type before calling pmu->event_init() since certain
+ * pmus overwrites event->attr.type to forward event to another pmu.
+ */
+ event->orig_type = event->attr.type;
+
/* Try parent's PMU first: */
if (event->parent && event->parent->pmu) {
pmu = event->parent->pmu;
@@ -13652,8 +13651,8 @@ void __init perf_event_init(void)
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
- perf_pmu_register(&perf_cpu_clock, NULL, -1);
- perf_pmu_register(&perf_task_clock, NULL, -1);
+ perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
+ perf_pmu_register(&perf_task_clock, "task_clock", -1);
perf_tp_register();
perf_event_init_cpu(smp_processor_id());
register_reboot_notifier(&perf_reboot_notifier);
@@ -13696,7 +13695,7 @@ static int __init perf_event_sysfs_init(void)
goto unlock;
list_for_each_entry(pmu, &pmus, entry) {
- if (!pmu->name || pmu->type < 0)
+ if (pmu->dev)
continue;
ret = pmu_dev_alloc(pmu);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index c3797701339c..6c2cb4e4f48d 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -523,26 +523,6 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int we
return 0;
}
-__weak int arch_reserve_bp_slot(struct perf_event *bp)
-{
- return 0;
-}
-
-__weak void arch_release_bp_slot(struct perf_event *bp)
-{
-}
-
-/*
- * Function to perform processor-specific cleanup during unregistration
- */
-__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
-{
- /*
- * A weak stub function here for those archs that don't define
- * it inside arch/.../kernel/hw_breakpoint.c
- */
-}
-
/*
* Constraints to check before allowing this new breakpoint counter.
*
@@ -594,7 +574,6 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
enum bp_type_idx type;
int max_pinned_slots;
int weight;
- int ret;
/* We couldn't initialize breakpoint constraints on boot */
if (!constraints_initialized)
@@ -613,10 +592,6 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
if (max_pinned_slots > hw_breakpoint_slots_cached(type))
return -ENOSPC;
- ret = arch_reserve_bp_slot(bp);
- if (ret)
- return ret;
-
return toggle_bp_slot(bp, true, type, weight);
}
@@ -634,8 +609,6 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
enum bp_type_idx type;
int weight;
- arch_release_bp_slot(bp);
-
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
WARN_ON(toggle_bp_slot(bp, false, type, weight));
@@ -645,7 +618,6 @@ void release_bp_slot(struct perf_event *bp)
{
struct mutex *mtx = bp_constraints_lock(bp);
- arch_unregister_hw_breakpoint(bp);
__release_bp_slot(bp, bp->attr.bp_type);
bp_constraints_unlock(mtx);
}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index a0433f37b024..fb1e180b5f0a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -191,9 +191,10 @@ __perf_output_begin(struct perf_output_handle *handle,
perf_output_get_handle(handle);
+ offset = local_read(&rb->head);
do {
+ head = offset;
tail = READ_ONCE(rb->user_page->data_tail);
- offset = head = local_read(&rb->head);
if (!rb->overwrite) {
if (unlikely(!ring_buffer_has_space(head, tail,
perf_data_size(rb),
@@ -217,7 +218,7 @@ __perf_output_begin(struct perf_output_handle *handle,
head += size;
else
head -= size;
- } while (local_cmpxchg(&rb->head, offset, head) != offset);
+ } while (!local_try_cmpxchg(&rb->head, &offset, head));
if (backward) {
offset = head;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 59887c69d54c..3048589e2e85 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -192,8 +192,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
inc_mm_counter(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
- ptep_clear_flush_notify(vma, addr, pvmw.pte);
+ flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
+ ptep_clear_flush(vma, addr, pvmw.pte);
if (new_page)
set_pte_at_notify(mm, addr, pvmw.pte,
mk_pte(new_page, vma->vm_page_prot));
@@ -365,7 +365,6 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
{
void *kaddr;
struct page *page;
- struct vm_area_struct *vma;
int ret;
short *ptr;
@@ -373,7 +372,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
return -EINVAL;
ret = get_user_pages_remote(mm, vaddr, 1,
- FOLL_WRITE, &page, &vma, NULL);
+ FOLL_WRITE, &page, NULL);
if (unlikely(ret <= 0)) {
/*
* We are asking for 1 page. If get_user_pages_remote() fails,
@@ -474,10 +473,9 @@ retry:
if (is_register)
gup_flags |= FOLL_SPLIT_PMD;
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
- &old_page, &vma, NULL);
- if (ret <= 0)
- return ret;
+ old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+ if (IS_ERR_OR_NULL(old_page))
+ return old_page ? PTR_ERR(old_page) : 0;
ret = verify_opcode(old_page, vaddr, &opcode);
if (ret <= 0)
@@ -2027,8 +2025,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
- result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
- NULL, NULL);
+ result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL);
if (result < 0)
return result;
diff --git a/kernel/fork.c b/kernel/fork.c
index 41c964104b58..3b6d20dfb9a8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -252,23 +252,19 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
int i;
int ret;
+ int nr_charged = 0;
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
if (ret)
goto err;
+ nr_charged++;
}
return 0;
err:
- /*
- * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is
- * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will
- * ignore this page.
- */
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ for (i = 0; i < nr_charged; i++)
memcg_kmem_uncharge_page(vm->pages[i], 0);
return ret;
}
@@ -690,6 +686,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
for_each_vma(old_vmi, mpnt) {
struct file *file;
+ vma_start_write(mpnt);
if (mpnt->vm_flags & VM_DONTCOPY) {
vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
@@ -912,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
*/
void __mmdrop(struct mm_struct *mm)
{
- int i;
-
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
@@ -928,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
mm_destroy_cid(mm);
+ percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
- for (i = 0; i < NR_MM_COUNTERS; i++)
- percpu_counter_destroy(&mm->rss_stat[i]);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -988,6 +982,14 @@ void __put_task_struct(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(__put_task_struct);
+void __put_task_struct_rcu_cb(struct rcu_head *rhp)
+{
+ struct task_struct *task = container_of(rhp, struct task_struct, rcu);
+
+ __put_task_struct(task);
+}
+EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
+
void __init __weak arch_task_cache_init(void) { }
/*
@@ -1255,8 +1257,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
- int i;
-
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
@@ -1304,17 +1304,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (mm_alloc_cid(mm))
goto fail_cid;
- for (i = 0; i < NR_MM_COUNTERS; i++)
- if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
- goto fail_pcpu;
+ if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
+ NR_MM_COUNTERS))
+ goto fail_pcpu;
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;
fail_pcpu:
- while (i > 0)
- percpu_counter_destroy(&mm->rss_stat[--i]);
mm_destroy_cid(mm);
fail_cid:
destroy_context(mm);
@@ -1399,8 +1397,8 @@ EXPORT_SYMBOL_GPL(mmput_async);
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
* Main users are mmput() and sys_execve(). Callers prevent concurrent
- * invocations: in mmput() nobody alive left, in execve task is single
- * threaded.
+ * invocations: in mmput() nobody alive left, in execve it happens before
+ * the new mm is made visible to anyone.
*
* Can only fail if new_exe_file != NULL.
*/
@@ -1435,9 +1433,7 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
/**
* replace_mm_exe_file - replace a reference to the mm's executable file
*
- * This changes mm's executable file (shown as symlink /proc/[pid]/exe),
- * dealing with concurrent invocation and without grabbing the mmap lock in
- * write mode.
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
* Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
*/
@@ -1467,22 +1463,20 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
return ret;
}
- /* set the new file, lockless */
ret = deny_write_access(new_exe_file);
if (ret)
return -EACCES;
get_file(new_exe_file);
- old_exe_file = xchg(&mm->exe_file, new_exe_file);
+ /* set the new file */
+ mmap_write_lock(mm);
+ old_exe_file = rcu_dereference_raw(mm->exe_file);
+ rcu_assign_pointer(mm->exe_file, new_exe_file);
+ mmap_write_unlock(mm);
+
if (old_exe_file) {
- /*
- * Don't race with dup_mmap() getting the file and disallowing
- * write access while someone might open the file writable.
- */
- mmap_read_lock(mm);
allow_write_access(old_exe_file);
fput(old_exe_file);
- mmap_read_unlock(mm);
}
return 0;
}
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 514e4582b863..f10587d1d481 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1132,8 +1132,7 @@ static int __init futex_init(void)
#endif
futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
- futex_hashsize, 0,
- futex_hashsize < 256 ? HASH_SMALL : 0,
+ futex_hashsize, 0, 0,
&futex_shift, NULL,
futex_hashsize, futex_hashsize);
futex_hashsize = 1UL << futex_shift;
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 16f8ecc7d882..ccd02afaeffb 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -3,4 +3,6 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-y := base.o fs.o
obj-$(CONFIG_CC_IS_GCC) += gcc_base.o gcc_4_7.o
+CFLAGS_gcc_base.o += -Wno-missing-prototypes -Wno-missing-declarations
obj-$(CONFIG_CC_IS_CLANG) += clang.o
+CFLAGS_clang.o += -Wno-missing-prototypes -Wno-missing-declarations
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 1ef9a87511f5..6d443ea22bb7 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -83,12 +83,9 @@ find $cpio_dir -type f -print0 |
xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;'
# Create archive and try to normalize metadata for reproducibility.
-# For compatibility with older versions of tar, files are fed to tar
-# pre-sorted, as --sort=name might not be available.
-find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \
- tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \
- --owner=0 --group=0 --numeric-owner --no-recursion \
- -I $XZ -cf $tarfile -C $cpio_dir/ -T - > /dev/null
+tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \
+ --owner=0 --group=0 --sort=name --numeric-owner \
+ -I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null
echo $headers_md5 > kernel/kheaders.md5
echo "$this_file_md5" >> kernel/kheaders.md5
diff --git a/kernel/iomem.c b/kernel/iomem.c
index 62c92e43aa0d..dc2120776e1c 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -3,19 +3,16 @@
#include <linux/types.h>
#include <linux/io.h>
#include <linux/mm.h>
-
-#ifndef ioremap_cache
-/* temporary while we convert existing ioremap_cache users to memremap */
-__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
-{
- return ioremap(offset, size);
-}
-#endif
+#include <linux/ioremap.h>
#ifndef arch_memremap_wb
static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
{
+#ifdef ioremap_cache
return (__force void *)ioremap_cache(offset, size);
+#else
+ return (__force void *)ioremap(offset, size);
+#endif
}
#endif
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 49e7bc871fec..dc94e0bf2c94 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -306,6 +306,7 @@ static void __irq_disable(struct irq_desc *desc, bool mask);
void irq_shutdown(struct irq_desc *desc)
{
if (irqd_is_started(&desc->irq_data)) {
+ clear_irq_resend(desc);
desc->depth = 1;
if (desc->irq_data.chip->irq_shutdown) {
desc->irq_data.chip->irq_shutdown(&desc->irq_data);
@@ -472,11 +473,12 @@ void handle_nested_irq(unsigned int irq)
action = desc->action;
if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
- goto out_unlock;
+ raw_spin_unlock_irq(&desc->lock);
+ return;
}
kstat_incr_irqs_this_cpu(desc);
- irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ atomic_inc(&desc->threads_active);
raw_spin_unlock_irq(&desc->lock);
action_ret = IRQ_NONE;
@@ -486,11 +488,7 @@ void handle_nested_irq(unsigned int irq)
if (!irq_settings_no_debug(desc))
note_interrupt(desc, action_ret);
- raw_spin_lock_irq(&desc->lock);
- irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
-
-out_unlock:
- raw_spin_unlock_irq(&desc->lock);
+ wake_threads_waitq(desc);
}
EXPORT_SYMBOL_GPL(handle_nested_irq);
@@ -692,8 +690,16 @@ void handle_fasteoi_irq(struct irq_desc *desc)
raw_spin_lock(&desc->lock);
- if (!irq_may_run(desc))
+ /*
+ * When an affinity change races with IRQ handling, the next interrupt
+ * can arrive on the new CPU before the original CPU has completed
+ * handling the previous one - it may need to be resent.
+ */
+ if (!irq_may_run(desc)) {
+ if (irqd_needs_resend_when_in_progress(&desc->irq_data))
+ desc->istate |= IRQS_PENDING;
goto out;
+ }
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
@@ -715,6 +721,12 @@ void handle_fasteoi_irq(struct irq_desc *desc)
cond_unmask_eoi_irq(desc, chip);
+ /*
+ * When the race described above happens this will resend the interrupt.
+ */
+ if (unlikely(desc->istate & IRQS_PENDING))
+ check_irq_resend(desc, false);
+
raw_spin_unlock(&desc->lock);
return;
out:
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index bbcaac64038e..5971a66be034 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -133,6 +133,8 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX),
BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND),
+
+ BIT_MASK_DESCR(IRQD_RESEND_WHEN_IN_PROGRESS),
};
static const struct irq_bit_descr irqdesc_states[] = {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 5fdc0b557579..bcc7f21db9ee 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -12,9 +12,9 @@
#include <linux/sched/clock.h>
#ifdef CONFIG_SPARSE_IRQ
-# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
+# define MAX_SPARSE_IRQS INT_MAX
#else
-# define IRQ_BITMAP_BITS NR_IRQS
+# define MAX_SPARSE_IRQS NR_IRQS
#endif
#define istate core_internal_state__do_not_mess_with_it
@@ -47,9 +47,12 @@ enum {
* detection
* IRQS_POLL_INPROGRESS - polling in progress
* IRQS_ONESHOT - irq is not unmasked in primary handler
- * IRQS_REPLAY - irq is replayed
+ * IRQS_REPLAY - irq has been resent and will not be resent
+ * again until the handler has run and cleared
+ * this flag.
* IRQS_WAITING - irq is waiting
- * IRQS_PENDING - irq is pending and replayed later
+ * IRQS_PENDING - irq needs to be resent and should be resent
+ * at the next available opportunity.
* IRQS_SUSPENDED - irq is suspended
* IRQS_NMI - irq line is used to deliver NMIs
* IRQS_SYSFS - descriptor has been added to sysfs
@@ -105,17 +108,19 @@ extern int __irq_get_irqchip_state(struct irq_data *data,
enum irqchip_irq_state which,
bool *state);
-extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
/* Resending of interrupts :*/
int check_irq_resend(struct irq_desc *desc, bool inject);
+void clear_irq_resend(struct irq_desc *desc);
+void irq_resend_init(struct irq_desc *desc);
bool irq_wait_for_poll(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
+void wake_threads_waitq(struct irq_desc *desc);
+
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 240e145e969f..27ca1c866f29 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -12,8 +12,7 @@
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
-#include <linux/radix-tree.h>
-#include <linux/bitmap.h>
+#include <linux/maple_tree.h>
#include <linux/irqdomain.h>
#include <linux/sysfs.h>
@@ -131,7 +130,40 @@ int nr_irqs = NR_IRQS;
EXPORT_SYMBOL_GPL(nr_irqs);
static DEFINE_MUTEX(sparse_irq_lock);
-static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
+static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs,
+ MT_FLAGS_ALLOC_RANGE |
+ MT_FLAGS_LOCK_EXTERN |
+ MT_FLAGS_USE_RCU,
+ sparse_irq_lock);
+
+static int irq_find_free_area(unsigned int from, unsigned int cnt)
+{
+ MA_STATE(mas, &sparse_irqs, 0, 0);
+
+ if (mas_empty_area(&mas, from, MAX_SPARSE_IRQS, cnt))
+ return -ENOSPC;
+ return mas.index;
+}
+
+static unsigned int irq_find_at_or_after(unsigned int offset)
+{
+ unsigned long index = offset;
+ struct irq_desc *desc = mt_find(&sparse_irqs, &index, nr_irqs);
+
+ return desc ? irq_desc_get_irq(desc) : nr_irqs;
+}
+
+static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
+{
+ MA_STATE(mas, &sparse_irqs, irq, irq);
+ WARN_ON(mas_store_gfp(&mas, desc, GFP_KERNEL) != 0);
+}
+
+static void delete_irq_desc(unsigned int irq)
+{
+ MA_STATE(mas, &sparse_irqs, irq, irq);
+ mas_erase(&mas);
+}
#ifdef CONFIG_SPARSE_IRQ
@@ -344,26 +376,14 @@ static void irq_sysfs_del(struct irq_desc *desc) {}
#endif /* CONFIG_SYSFS */
-static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
-
-static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
-{
- radix_tree_insert(&irq_desc_tree, irq, desc);
-}
-
struct irq_desc *irq_to_desc(unsigned int irq)
{
- return radix_tree_lookup(&irq_desc_tree, irq);
+ return mtree_load(&sparse_irqs, irq);
}
#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE
EXPORT_SYMBOL_GPL(irq_to_desc);
#endif
-static void delete_irq_desc(unsigned int irq)
-{
- radix_tree_delete(&irq_desc_tree, irq);
-}
-
#ifdef CONFIG_SMP
static void free_masks(struct irq_desc *desc)
{
@@ -415,6 +435,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
desc_set_defaults(irq, desc, node, affinity, owner);
irqd_set(&desc->irq_data, flags);
kobject_init(&desc->kobj, &irq_kobj_type);
+ irq_resend_init(desc);
return desc;
@@ -505,7 +526,6 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
irq_sysfs_add(start + i, desc);
irq_add_debugfs_entry(start + i, desc);
}
- bitmap_set(allocated_irqs, start, cnt);
return start;
err:
@@ -516,7 +536,7 @@ err:
static int irq_expand_nr_irqs(unsigned int nr)
{
- if (nr > IRQ_BITMAP_BITS)
+ if (nr > MAX_SPARSE_IRQS)
return -ENOMEM;
nr_irqs = nr;
return 0;
@@ -534,18 +554,17 @@ int __init early_irq_init(void)
printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n",
NR_IRQS, nr_irqs, initcnt);
- if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
- nr_irqs = IRQ_BITMAP_BITS;
+ if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS))
+ nr_irqs = MAX_SPARSE_IRQS;
- if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
- initcnt = IRQ_BITMAP_BITS;
+ if (WARN_ON(initcnt > MAX_SPARSE_IRQS))
+ initcnt = MAX_SPARSE_IRQS;
if (initcnt > nr_irqs)
nr_irqs = initcnt;
for (i = 0; i < initcnt; i++) {
desc = alloc_desc(i, node, 0, NULL, NULL);
- set_bit(i, allocated_irqs);
irq_insert_desc(i, desc);
}
return arch_early_irq_init();
@@ -581,6 +600,7 @@ int __init early_irq_init(void)
mutex_init(&desc[i].request_mutex);
init_waitqueue_head(&desc[i].wait_for_threads);
desc_set_defaults(i, &desc[i], node, NULL, NULL);
+ irq_resend_init(desc);
}
return arch_early_irq_init();
}
@@ -599,6 +619,7 @@ static void free_desc(unsigned int irq)
raw_spin_lock_irqsave(&desc->lock, flags);
desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ delete_irq_desc(irq);
}
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
@@ -611,8 +632,8 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
struct irq_desc *desc = irq_to_desc(start + i);
desc->owner = owner;
+ irq_insert_desc(start + i, desc);
}
- bitmap_set(allocated_irqs, start, cnt);
return start;
}
@@ -624,7 +645,7 @@ static int irq_expand_nr_irqs(unsigned int nr)
void irq_mark_irq(unsigned int irq)
{
mutex_lock(&sparse_irq_lock);
- bitmap_set(allocated_irqs, irq, 1);
+ irq_insert_desc(irq, irq_desc + irq);
mutex_unlock(&sparse_irq_lock);
}
@@ -768,7 +789,6 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
for (i = 0; i < cnt; i++)
free_desc(from + i);
- bitmap_clear(allocated_irqs, from, cnt);
mutex_unlock(&sparse_irq_lock);
}
EXPORT_SYMBOL_GPL(irq_free_descs);
@@ -810,8 +830,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
mutex_lock(&sparse_irq_lock);
- start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
- from, cnt, 0);
+ start = irq_find_free_area(from, cnt);
ret = -EEXIST;
if (irq >=0 && start != irq)
goto unlock;
@@ -836,7 +855,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs);
*/
unsigned int irq_get_next_irq(unsigned int offset)
{
- return find_next_bit(allocated_irqs, nr_irqs, offset);
+ return irq_find_at_or_after(offset);
}
struct irq_desc *
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index f34760a1e222..0bdef4fe925b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -182,9 +182,7 @@ static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode,
return NULL;
}
- strreplace(name, '/', ':');
-
- domain->name = name;
+ domain->name = strreplace(name, '/', ':');
domain->fwnode = fwnode;
domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
}
@@ -1915,6 +1913,8 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+#include "internals.h"
+
static struct dentry *domain_dir;
static void
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d2742af0f0fd..d309ba84e08a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -108,6 +108,16 @@ bool synchronize_hardirq(unsigned int irq)
}
EXPORT_SYMBOL(synchronize_hardirq);
+static void __synchronize_irq(struct irq_desc *desc)
+{
+ __synchronize_hardirq(desc, true);
+ /*
+ * We made sure that no hardirq handler is running. Now verify that no
+ * threaded handlers are active.
+ */
+ wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
+}
+
/**
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
* @irq: interrupt number to wait for
@@ -127,16 +137,8 @@ void synchronize_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (desc) {
- __synchronize_hardirq(desc, true);
- /*
- * We made sure that no hardirq handler is
- * running. Now verify that no threaded handlers are
- * active.
- */
- wait_event(desc->wait_for_threads,
- !atomic_read(&desc->threads_active));
- }
+ if (desc)
+ __synchronize_irq(desc);
}
EXPORT_SYMBOL(synchronize_irq);
@@ -1216,7 +1218,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
return ret;
}
-static void wake_threads_waitq(struct irq_desc *desc)
+void wake_threads_waitq(struct irq_desc *desc)
{
if (atomic_dec_and_test(&desc->threads_active))
wake_up(&desc->wait_for_threads);
@@ -1944,7 +1946,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* supports it also make sure that there is no (not yet serviced)
* interrupt in flight at the hardware level.
*/
- __synchronize_hardirq(desc, true);
+ __synchronize_irq(desc);
#ifdef CONFIG_DEBUG_SHIRQ
/*
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 0c46e9fe3a89..5f2c66860ac6 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -21,8 +21,9 @@
#ifdef CONFIG_HARDIRQS_SW_RESEND
-/* Bitmap to handle software resend of interrupts: */
-static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
+/* hlist_head to handle software resend of interrupts: */
+static HLIST_HEAD(irq_resend_list);
+static DEFINE_RAW_SPINLOCK(irq_resend_lock);
/*
* Run software resends of IRQ's
@@ -30,18 +31,17 @@ static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
static void resend_irqs(struct tasklet_struct *unused)
{
struct irq_desc *desc;
- int irq;
- while (!bitmap_empty(irqs_resend, nr_irqs)) {
- irq = find_first_bit(irqs_resend, nr_irqs);
- clear_bit(irq, irqs_resend);
- desc = irq_to_desc(irq);
- if (!desc)
- continue;
- local_irq_disable();
+ raw_spin_lock_irq(&irq_resend_lock);
+ while (!hlist_empty(&irq_resend_list)) {
+ desc = hlist_entry(irq_resend_list.first, struct irq_desc,
+ resend_node);
+ hlist_del_init(&desc->resend_node);
+ raw_spin_unlock(&irq_resend_lock);
desc->handle_irq(desc);
- local_irq_enable();
+ raw_spin_lock(&irq_resend_lock);
}
+ raw_spin_unlock_irq(&irq_resend_lock);
}
/* Tasklet to handle resend: */
@@ -49,8 +49,6 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs);
static int irq_sw_resend(struct irq_desc *desc)
{
- unsigned int irq = irq_desc_get_irq(desc);
-
/*
* Validate whether this interrupt can be safely injected from
* non interrupt context
@@ -70,16 +68,36 @@ static int irq_sw_resend(struct irq_desc *desc)
*/
if (!desc->parent_irq)
return -EINVAL;
- irq = desc->parent_irq;
+
+ desc = irq_to_desc(desc->parent_irq);
+ if (!desc)
+ return -EINVAL;
}
- /* Set it pending and activate the softirq: */
- set_bit(irq, irqs_resend);
+ /* Add to resend_list and activate the softirq: */
+ raw_spin_lock(&irq_resend_lock);
+ if (hlist_unhashed(&desc->resend_node))
+ hlist_add_head(&desc->resend_node, &irq_resend_list);
+ raw_spin_unlock(&irq_resend_lock);
tasklet_schedule(&resend_tasklet);
return 0;
}
+void clear_irq_resend(struct irq_desc *desc)
+{
+ raw_spin_lock(&irq_resend_lock);
+ hlist_del_init(&desc->resend_node);
+ raw_spin_unlock(&irq_resend_lock);
+}
+
+void irq_resend_init(struct irq_desc *desc)
+{
+ INIT_HLIST_NODE(&desc->resend_node);
+}
#else
+void clear_irq_resend(struct irq_desc *desc) {}
+void irq_resend_init(struct irq_desc *desc) {}
+
static int irq_sw_resend(struct irq_desc *desc)
{
return -EINVAL;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 77747391f49b..18edd57b5fe8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -163,42 +163,36 @@ unsigned long kallsyms_sym_address(int idx)
return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
}
-static bool cleanup_symbol_name(char *s)
+static void cleanup_symbol_name(char *s)
{
char *res;
if (!IS_ENABLED(CONFIG_LTO_CLANG))
- return false;
+ return;
/*
* LLVM appends various suffixes for local functions and variables that
* must be promoted to global scope as part of LTO. This can break
* hooking of static functions with kprobes. '.' is not a valid
- * character in an identifier in C. Suffixes observed:
+ * character in an identifier in C. Suffixes only in LLVM LTO observed:
* - foo.llvm.[0-9a-f]+
- * - foo.[0-9a-f]+
*/
- res = strchr(s, '.');
- if (res) {
+ res = strstr(s, ".llvm.");
+ if (res)
*res = '\0';
- return true;
- }
- return false;
+ return;
}
static int compare_symbol_name(const char *name, char *namebuf)
{
- int ret;
-
- ret = strcmp(name, namebuf);
- if (!ret)
- return ret;
-
- if (cleanup_symbol_name(namebuf) && !strcmp(name, namebuf))
- return 0;
-
- return ret;
+ /* The kallsyms_seqs_of_names is sorted based on names after
+ * cleanup_symbol_name() (see scripts/kallsyms.c) if clang lto is enabled.
+ * To ensure correct bisection in kallsyms_lookup_names(), do
+ * cleanup_symbol_name(namebuf) before comparing name and namebuf.
+ */
+ cleanup_symbol_name(namebuf);
+ return strcmp(name, namebuf);
}
static unsigned int get_symbol_seq(int index)
@@ -484,34 +478,6 @@ found:
return 0;
}
-int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
- unsigned long *offset, char *modname, char *name)
-{
- int res;
-
- name[0] = '\0';
- name[KSYM_NAME_LEN - 1] = '\0';
-
- if (is_ksym_addr(addr)) {
- unsigned long pos;
-
- pos = get_symbol_pos(addr, size, offset);
- /* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos),
- name, KSYM_NAME_LEN);
- modname[0] = '\0';
- goto found;
- }
- /* See if it's in a module. */
- res = lookup_module_symbol_attrs(addr, size, offset, modname, name);
- if (res)
- return res;
-
-found:
- cleanup_symbol_name(name);
- return 0;
-}
-
/* Look up a kernel symbol and return it in a text buffer. */
static int __sprint_symbol(char *buffer, unsigned long address,
int symbol_offset, int add_offset, int add_buildid)
@@ -646,7 +612,6 @@ int sprint_backtrace_build_id(char *buffer, unsigned long address)
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
struct kallsym_iter {
loff_t pos;
- loff_t pos_arch_end;
loff_t pos_mod_end;
loff_t pos_ftrace_mod_end;
loff_t pos_bpf_end;
@@ -659,29 +624,9 @@ struct kallsym_iter {
int show_value;
};
-int __weak arch_get_kallsym(unsigned int symnum, unsigned long *value,
- char *type, char *name)
-{
- return -EINVAL;
-}
-
-static int get_ksymbol_arch(struct kallsym_iter *iter)
-{
- int ret = arch_get_kallsym(iter->pos - kallsyms_num_syms,
- &iter->value, &iter->type,
- iter->name);
-
- if (ret < 0) {
- iter->pos_arch_end = iter->pos;
- return 0;
- }
-
- return 1;
-}
-
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
- int ret = module_get_kallsym(iter->pos - iter->pos_arch_end,
+ int ret = module_get_kallsym(iter->pos - kallsyms_num_syms,
&iter->value, &iter->type,
iter->name, iter->module_name,
&iter->exported);
@@ -716,7 +661,7 @@ static int get_ksymbol_bpf(struct kallsym_iter *iter)
{
int ret;
- strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN);
+ strscpy(iter->module_name, "bpf", MODULE_NAME_LEN);
iter->exported = 0;
ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
&iter->value, &iter->type,
@@ -736,7 +681,7 @@ static int get_ksymbol_bpf(struct kallsym_iter *iter)
*/
static int get_ksymbol_kprobe(struct kallsym_iter *iter)
{
- strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN);
+ strscpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN);
iter->exported = 0;
return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end,
&iter->value, &iter->type,
@@ -764,7 +709,6 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
iter->nameoff = get_symbol_offset(new_pos);
iter->pos = new_pos;
if (new_pos == 0) {
- iter->pos_arch_end = 0;
iter->pos_mod_end = 0;
iter->pos_ftrace_mod_end = 0;
iter->pos_bpf_end = 0;
@@ -780,10 +724,6 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
{
iter->pos = pos;
- if ((!iter->pos_arch_end || iter->pos_arch_end > pos) &&
- get_ksymbol_arch(iter))
- return 1;
-
if ((!iter->pos_mod_end || iter->pos_mod_end > pos) &&
get_ksymbol_mod(iter))
return 1;
@@ -961,41 +901,6 @@ late_initcall(bpf_ksym_iter_register);
#endif /* CONFIG_BPF_SYSCALL */
-static inline int kallsyms_for_perf(void)
-{
-#ifdef CONFIG_PERF_EVENTS
- extern int sysctl_perf_event_paranoid;
- if (sysctl_perf_event_paranoid <= 1)
- return 1;
-#endif
- return 0;
-}
-
-/*
- * We show kallsyms information even to normal users if we've enabled
- * kernel profiling and are explicitly not paranoid (so kptr_restrict
- * is clear, and sysctl_perf_event_paranoid isn't set).
- *
- * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
- * block even that).
- */
-bool kallsyms_show_value(const struct cred *cred)
-{
- switch (kptr_restrict) {
- case 0:
- if (kallsyms_for_perf())
- return true;
- fallthrough;
- case 1:
- if (security_capable(cred, &init_user_ns, CAP_SYSLOG,
- CAP_OPT_NOAUDIT) == 0)
- return true;
- fallthrough;
- default:
- return false;
- }
-}
-
static int kallsyms_open(struct inode *inode, struct file *file)
{
/*
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index a2e3745d15c4..b4cac76ea5e9 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -196,7 +196,7 @@ static bool match_cleanup_name(const char *s, const char *name)
if (!IS_ENABLED(CONFIG_LTO_CLANG))
return false;
- p = strchr(s, '.');
+ p = strstr(s, ".llvm.");
if (!p)
return false;
@@ -341,30 +341,10 @@ static int test_kallsyms_basic_function(void)
ret = lookup_symbol_name(addr, namebuf);
if (unlikely(ret)) {
namebuf[0] = 0;
+ pr_info("%d: lookup_symbol_name(%lx) failed\n", i, addr);
goto failed;
}
- /*
- * The first '.' may be the initial letter, in which case the
- * entire symbol name will be truncated to an empty string in
- * cleanup_symbol_name(). Do not test these symbols.
- *
- * For example:
- * cat /proc/kallsyms | awk '{print $3}' | grep -E "^\." | head
- * .E_read_words
- * .E_leading_bytes
- * .E_trailing_bytes
- * .E_write_words
- * .E_copy
- * .str.292.llvm.12122243386960820698
- * .str.24.llvm.12122243386960820698
- * .str.29.llvm.12122243386960820698
- * .str.75.llvm.12122243386960820698
- * .str.99.llvm.12122243386960820698
- */
- if (IS_ENABLED(CONFIG_LTO_CLANG) && !namebuf[0])
- continue;
-
lookup_addr = kallsyms_lookup_name(namebuf);
memset(stat, 0, sizeof(*stat));
@@ -388,8 +368,11 @@ static int test_kallsyms_basic_function(void)
if (stat->addr != stat2->addr ||
stat->real_cnt != stat2->real_cnt ||
memcmp(stat->addrs, stat2->addrs,
- stat->save_cnt * sizeof(stat->addrs[0])))
+ stat->save_cnt * sizeof(stat->addrs[0]))) {
+ pr_info("%s: mismatch between kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol()\n",
+ namebuf);
goto failed;
+ }
/*
* The average of random increments is 128, that is, one of
@@ -400,15 +383,23 @@ static int test_kallsyms_basic_function(void)
}
/* Need to be found at least once */
- if (!stat->real_cnt)
+ if (!stat->real_cnt) {
+ pr_info("%s: Never found\n", namebuf);
goto failed;
+ }
/*
* kallsyms_lookup_name() returns the address of the first
* symbol found and cannot be NULL.
*/
- if (!lookup_addr || lookup_addr != stat->addrs[0])
+ if (!lookup_addr) {
+ pr_info("%s: NULL lookup_addr?!\n", namebuf);
+ goto failed;
+ }
+ if (lookup_addr != stat->addrs[0]) {
+ pr_info("%s: lookup_addr != stat->addrs[0]\n", namebuf);
goto failed;
+ }
/*
* If the addresses of all matching symbols are recorded, the
@@ -420,8 +411,10 @@ static int test_kallsyms_basic_function(void)
break;
}
- if (j == stat->save_cnt)
+ if (j == stat->save_cnt) {
+ pr_info("%s: j == save_cnt?!\n", namebuf);
goto failed;
+ }
}
}
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 84c717337df0..f9ac2e9e460f 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -279,7 +279,7 @@ void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
}
EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
-void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+void notrace __sanitizer_cov_trace_cmp8(kcov_u64 arg1, kcov_u64 arg2)
{
write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
}
@@ -306,16 +306,17 @@ void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
}
EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
-void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+void notrace __sanitizer_cov_trace_const_cmp8(kcov_u64 arg1, kcov_u64 arg2)
{
write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
_RET_IP_);
}
EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
-void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+void notrace __sanitizer_cov_trace_switch(kcov_u64 val, void *arg)
{
u64 i;
+ u64 *cases = arg;
u64 count = cases[0];
u64 size = cases[1];
u64 type = KCOV_CMP_CONST;
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 5a60cc52adc0..8a7baf4e332e 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -1270,7 +1270,9 @@ static __always_inline void kcsan_atomic_builtin_memorder(int memorder)
DEFINE_TSAN_ATOMIC_OPS(8);
DEFINE_TSAN_ATOMIC_OPS(16);
DEFINE_TSAN_ATOMIC_OPS(32);
+#ifdef CONFIG_64BIT
DEFINE_TSAN_ATOMIC_OPS(64);
+#endif
void __tsan_atomic_thread_fence(int memorder);
void __tsan_atomic_thread_fence(int memorder)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 92d301f98776..107f355eac10 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -129,6 +129,11 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
if (flags & KEXEC_PRESERVE_CONTEXT)
image->preserve_context = 1;
+#ifdef CONFIG_CRASH_HOTPLUG
+ if (flags & KEXEC_UPDATE_ELFCOREHDR)
+ image->update_elfcorehdr = 1;
+#endif
+
ret = machine_kexec_prepare(image);
if (ret)
goto out;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 3d578c6fefee..9dc728982d79 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -49,9 +49,6 @@
atomic_t __kexec_lock = ATOMIC_INIT(0);
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
-
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
@@ -277,6 +274,12 @@ struct kimage *do_kimage_alloc_init(void)
/* Initialize the list of unusable pages */
INIT_LIST_HEAD(&image->unusable_pages);
+#ifdef CONFIG_CRASH_HOTPLUG
+ image->hp_action = KEXEC_CRASH_HP_NONE;
+ image->elfcorehdr_index = -1;
+ image->elfcorehdr_updated = false;
+#endif
+
return image;
}
@@ -1091,6 +1094,11 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs)
}
}
+static inline resource_size_t crash_resource_size(const struct resource *res)
+{
+ return !res->end ? 0 : resource_size(res);
+}
+
ssize_t crash_get_memory_size(void)
{
ssize_t size = 0;
@@ -1098,19 +1106,45 @@ ssize_t crash_get_memory_size(void)
if (!kexec_trylock())
return -EBUSY;
- if (crashk_res.end != crashk_res.start)
- size = resource_size(&crashk_res);
+ size += crash_resource_size(&crashk_res);
+ size += crash_resource_size(&crashk_low_res);
kexec_unlock();
return size;
}
+static int __crash_shrink_memory(struct resource *old_res,
+ unsigned long new_size)
+{
+ struct resource *ram_res;
+
+ ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+ if (!ram_res)
+ return -ENOMEM;
+
+ ram_res->start = old_res->start + new_size;
+ ram_res->end = old_res->end;
+ ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
+ ram_res->name = "System RAM";
+
+ if (!new_size) {
+ release_resource(old_res);
+ old_res->start = 0;
+ old_res->end = 0;
+ } else {
+ crashk_res.end = ram_res->start - 1;
+ }
+
+ crash_free_reserved_phys_range(ram_res->start, ram_res->end);
+ insert_resource(&iomem_resource, ram_res);
+
+ return 0;
+}
+
int crash_shrink_memory(unsigned long new_size)
{
int ret = 0;
- unsigned long start, end;
- unsigned long old_size;
- struct resource *ram_res;
+ unsigned long old_size, low_size;
if (!kexec_trylock())
return -EBUSY;
@@ -1119,36 +1153,42 @@ int crash_shrink_memory(unsigned long new_size)
ret = -ENOENT;
goto unlock;
}
- start = crashk_res.start;
- end = crashk_res.end;
- old_size = (end == 0) ? 0 : end - start + 1;
+
+ low_size = crash_resource_size(&crashk_low_res);
+ old_size = crash_resource_size(&crashk_res) + low_size;
+ new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN);
if (new_size >= old_size) {
ret = (new_size == old_size) ? 0 : -EINVAL;
goto unlock;
}
- ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
- if (!ram_res) {
- ret = -ENOMEM;
- goto unlock;
- }
-
- start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
- end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
-
- crash_free_reserved_phys_range(end, crashk_res.end);
-
- if ((start == end) && (crashk_res.parent != NULL))
- release_resource(&crashk_res);
-
- ram_res->start = end;
- ram_res->end = crashk_res.end;
- ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
- ram_res->name = "System RAM";
+ /*
+ * (low_size > new_size) implies that low_size is greater than zero.
+ * This also means that if low_size is zero, the else branch is taken.
+ *
+ * If low_size is greater than 0, (low_size > new_size) indicates that
+ * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res
+ * needs to be shrunken.
+ */
+ if (low_size > new_size) {
+ ret = __crash_shrink_memory(&crashk_res, 0);
+ if (ret)
+ goto unlock;
- crashk_res.end = end - 1;
+ ret = __crash_shrink_memory(&crashk_low_res, new_size);
+ } else {
+ ret = __crash_shrink_memory(&crashk_res, new_size - low_size);
+ }
- insert_resource(&iomem_resource, ram_res);
+ /* Swap crashk_res and crashk_low_res if needed */
+ if (!crashk_res.end && crashk_low_res.end) {
+ crashk_res.start = crashk_low_res.start;
+ crashk_res.end = crashk_low_res.end;
+ release_resource(&crashk_low_res);
+ crashk_low_res.start = 0;
+ crashk_low_res.end = 0;
+ insert_resource(&iomem_resource, &crashk_res);
+ }
unlock:
kexec_unlock();
@@ -1181,40 +1221,6 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
final_note(buf);
}
-static int __init crash_notes_memory_init(void)
-{
- /* Allocate memory for saving cpu registers. */
- size_t size, align;
-
- /*
- * crash_notes could be allocated across 2 vmalloc pages when percpu
- * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc
- * pages are also on 2 continuous physical pages. In this case the
- * 2nd part of crash_notes in 2nd page could be lost since only the
- * starting address and size of crash_notes are exported through sysfs.
- * Here round up the size of crash_notes to the nearest power of two
- * and pass it to __alloc_percpu as align value. This can make sure
- * crash_notes is allocated inside one physical page.
- */
- size = sizeof(note_buf_t);
- align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
-
- /*
- * Break compile if size is bigger than PAGE_SIZE since crash_notes
- * definitely will be in 2 pages with that.
- */
- BUILD_BUG_ON(size > PAGE_SIZE);
-
- crash_notes = __alloc_percpu(size, align);
- if (!crash_notes) {
- pr_warn("Memory allocation for saving cpu register states failed\n");
- return -ENOMEM;
- }
- return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
/*
* Move into place and start executing a preloaded standalone
* executable. If nothing was preloaded return an error.
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 69ee4a29136f..f9a419cd22d4 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -624,7 +624,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
* kexec_add_buffer - place a buffer in a kexec segment
* @kbuf: Buffer contents and memory parameters.
*
- * This function assumes that kexec_mutex is held.
+ * This function assumes that kexec_lock is held.
* On successful return, @kbuf->mem will have the physical address of
* the buffer in memory.
*
@@ -685,7 +685,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
struct kexec_sha_region *sha_regions;
struct purgatory_info *pi = &image->purgatory_info;
- if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY))
+ if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY))
return 0;
zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
@@ -726,6 +726,12 @@ static int kexec_calculate_store_digests(struct kimage *image)
for (j = i = 0; i < image->nr_segments; i++) {
struct kexec_segment *ksegment;
+#ifdef CONFIG_CRASH_HOTPLUG
+ /* Exclude elfcorehdr segment to allow future changes via hotplug */
+ if (j == image->elfcorehdr_index)
+ continue;
+#endif
+
ksegment = &image->segment[i];
/*
* Skip purgatory as it will be modified once we put digest
@@ -790,7 +796,7 @@ out:
return ret;
}
-#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY
+#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY
/*
* kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
* @pi: Purgatory to be loaded.
@@ -867,6 +873,7 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
{
unsigned long bss_addr;
unsigned long offset;
+ size_t sechdrs_size;
Elf_Shdr *sechdrs;
int i;
@@ -874,11 +881,11 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
* The section headers in kexec_purgatory are read-only. In order to
* have them modifiable make a temporary copy.
*/
- sechdrs = vzalloc(array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum));
+ sechdrs_size = array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum);
+ sechdrs = vzalloc(sechdrs_size);
if (!sechdrs)
return -ENOMEM;
- memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff,
- pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, sechdrs_size);
pi->sechdrs = sechdrs;
offset = 0;
@@ -1149,185 +1156,4 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
return 0;
}
-#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */
-
-int crash_exclude_mem_range(struct crash_mem *mem,
- unsigned long long mstart, unsigned long long mend)
-{
- int i, j;
- unsigned long long start, end, p_start, p_end;
- struct range temp_range = {0, 0};
-
- for (i = 0; i < mem->nr_ranges; i++) {
- start = mem->ranges[i].start;
- end = mem->ranges[i].end;
- p_start = mstart;
- p_end = mend;
-
- if (mstart > end || mend < start)
- continue;
-
- /* Truncate any area outside of range */
- if (mstart < start)
- p_start = start;
- if (mend > end)
- p_end = end;
-
- /* Found completely overlapping range */
- if (p_start == start && p_end == end) {
- mem->ranges[i].start = 0;
- mem->ranges[i].end = 0;
- if (i < mem->nr_ranges - 1) {
- /* Shift rest of the ranges to left */
- for (j = i; j < mem->nr_ranges - 1; j++) {
- mem->ranges[j].start =
- mem->ranges[j+1].start;
- mem->ranges[j].end =
- mem->ranges[j+1].end;
- }
-
- /*
- * Continue to check if there are another overlapping ranges
- * from the current position because of shifting the above
- * mem ranges.
- */
- i--;
- mem->nr_ranges--;
- continue;
- }
- mem->nr_ranges--;
- return 0;
- }
-
- if (p_start > start && p_end < end) {
- /* Split original range */
- mem->ranges[i].end = p_start - 1;
- temp_range.start = p_end + 1;
- temp_range.end = end;
- } else if (p_start != start)
- mem->ranges[i].end = p_start - 1;
- else
- mem->ranges[i].start = p_end + 1;
- break;
- }
-
- /* If a split happened, add the split to array */
- if (!temp_range.end)
- return 0;
-
- /* Split happened */
- if (i == mem->max_nr_ranges - 1)
- return -ENOMEM;
-
- /* Location where new range should go */
- j = i + 1;
- if (j < mem->nr_ranges) {
- /* Move over all ranges one slot towards the end */
- for (i = mem->nr_ranges - 1; i >= j; i--)
- mem->ranges[i + 1] = mem->ranges[i];
- }
-
- mem->ranges[j].start = temp_range.start;
- mem->ranges[j].end = temp_range.end;
- mem->nr_ranges++;
- return 0;
-}
-
-int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
- void **addr, unsigned long *sz)
-{
- Elf64_Ehdr *ehdr;
- Elf64_Phdr *phdr;
- unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
- unsigned char *buf;
- unsigned int cpu, i;
- unsigned long long notes_addr;
- unsigned long mstart, mend;
-
- /* extra phdr for vmcoreinfo ELF note */
- nr_phdr = nr_cpus + 1;
- nr_phdr += mem->nr_ranges;
-
- /*
- * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
- * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
- * I think this is required by tools like gdb. So same physical
- * memory will be mapped in two ELF headers. One will contain kernel
- * text virtual addresses and other will have __va(physical) addresses.
- */
-
- nr_phdr++;
- elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
- elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
-
- buf = vzalloc(elf_sz);
- if (!buf)
- return -ENOMEM;
-
- ehdr = (Elf64_Ehdr *)buf;
- phdr = (Elf64_Phdr *)(ehdr + 1);
- memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
- ehdr->e_ident[EI_CLASS] = ELFCLASS64;
- ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
- ehdr->e_ident[EI_VERSION] = EV_CURRENT;
- ehdr->e_ident[EI_OSABI] = ELF_OSABI;
- memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
- ehdr->e_type = ET_CORE;
- ehdr->e_machine = ELF_ARCH;
- ehdr->e_version = EV_CURRENT;
- ehdr->e_phoff = sizeof(Elf64_Ehdr);
- ehdr->e_ehsize = sizeof(Elf64_Ehdr);
- ehdr->e_phentsize = sizeof(Elf64_Phdr);
-
- /* Prepare one phdr of type PT_NOTE for each present CPU */
- for_each_present_cpu(cpu) {
- phdr->p_type = PT_NOTE;
- notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
- phdr->p_offset = phdr->p_paddr = notes_addr;
- phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
- (ehdr->e_phnum)++;
- phdr++;
- }
-
- /* Prepare one PT_NOTE header for vmcoreinfo */
- phdr->p_type = PT_NOTE;
- phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
- phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
- (ehdr->e_phnum)++;
- phdr++;
-
- /* Prepare PT_LOAD type program header for kernel text region */
- if (need_kernel_map) {
- phdr->p_type = PT_LOAD;
- phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_vaddr = (unsigned long) _text;
- phdr->p_filesz = phdr->p_memsz = _end - _text;
- phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
- ehdr->e_phnum++;
- phdr++;
- }
-
- /* Go through all the ranges in mem->ranges[] and prepare phdr */
- for (i = 0; i < mem->nr_ranges; i++) {
- mstart = mem->ranges[i].start;
- mend = mem->ranges[i].end;
-
- phdr->p_type = PT_LOAD;
- phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_offset = mstart;
-
- phdr->p_paddr = mstart;
- phdr->p_vaddr = (unsigned long) __va(mstart);
- phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
- phdr->p_align = 0;
- ehdr->e_phnum++;
- pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
- phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
- ehdr->e_phnum, phdr->p_offset);
- phdr++;
- }
-
- *addr = buf;
- *sz = elf_sz;
- return 0;
-}
+#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 00e177de91cc..0c6185aefaef 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1072,7 +1072,7 @@ static int kprobe_ftrace_enabled;
static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int *cnt)
{
- int ret = 0;
+ int ret;
lockdep_assert_held(&kprobe_mutex);
@@ -1110,7 +1110,7 @@ static int arm_kprobe_ftrace(struct kprobe *p)
static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int *cnt)
{
- int ret = 0;
+ int ret;
lockdep_assert_held(&kprobe_mutex);
@@ -1545,6 +1545,17 @@ static int check_ftrace_location(struct kprobe *p)
return 0;
}
+static bool is_cfi_preamble_symbol(unsigned long addr)
+{
+ char symbuf[KSYM_NAME_LEN];
+
+ if (lookup_symbol_name(addr, symbuf))
+ return false;
+
+ return str_has_prefix("__cfi_", symbuf) ||
+ str_has_prefix("__pfx_", symbuf);
+}
+
static int check_kprobe_address_safe(struct kprobe *p,
struct module **probed_mod)
{
@@ -1563,7 +1574,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr) ||
static_call_text_reserved(p->addr, p->addr) ||
- find_bug((unsigned long)p->addr)) {
+ find_bug((unsigned long)p->addr) ||
+ is_cfi_preamble_symbol((unsigned long)p->addr)) {
ret = -EINVAL;
goto out;
}
@@ -2007,9 +2019,9 @@ void __weak arch_kretprobe_fixup_return(struct pt_regs *regs,
unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
void *frame_pointer)
{
- kprobe_opcode_t *correct_ret_addr = NULL;
struct kretprobe_instance *ri = NULL;
struct llist_node *first, *node = NULL;
+ kprobe_opcode_t *correct_ret_addr;
struct kretprobe *rp;
/* Find correct address and all nodes for this frame. */
@@ -2127,6 +2139,7 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
NOKPROBE_SYMBOL(pre_handler_kretprobe);
static void kretprobe_rethook_handler(struct rethook_node *rh, void *data,
+ unsigned long ret_addr,
struct pt_regs *regs)
{
struct kretprobe *rp = (struct kretprobe *)data;
@@ -2219,8 +2232,7 @@ int register_kretprobe(struct kretprobe *rp)
return -ENOMEM;
for (i = 0; i < rp->maxactive; i++) {
- inst = kzalloc(sizeof(struct kretprobe_instance) +
- rp->data_size, GFP_KERNEL);
+ inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL);
if (inst == NULL) {
rethook_free(rp->rh);
rp->rh = NULL;
@@ -2243,8 +2255,7 @@ int register_kretprobe(struct kretprobe *rp)
rp->rph->rp = rp;
for (i = 0; i < rp->maxactive; i++) {
- inst = kzalloc(sizeof(struct kretprobe_instance) +
- rp->data_size, GFP_KERNEL);
+ inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL);
if (inst == NULL) {
refcount_set(&rp->rph->ref, i);
free_rp_inst(rp);
@@ -2692,7 +2703,7 @@ void kprobe_free_init_mem(void)
static int __init init_kprobes(void)
{
- int i, err = 0;
+ int i, err;
/* FIXME allocate the probe table, currently defined statically */
/* initialize all list heads */
diff --git a/kernel/ksyms_common.c b/kernel/ksyms_common.c
new file mode 100644
index 000000000000..cf1a73cbf2f6
--- /dev/null
+++ b/kernel/ksyms_common.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ksyms_common.c: A split of kernel/kallsyms.c
+ * Contains a few generic function definations independent of config KALLSYMS.
+ */
+#include <linux/kallsyms.h>
+#include <linux/security.h>
+
+static inline int kallsyms_for_perf(void)
+{
+#ifdef CONFIG_PERF_EVENTS
+ extern int sysctl_perf_event_paranoid;
+
+ if (sysctl_perf_event_paranoid <= 1)
+ return 1;
+#endif
+ return 0;
+}
+
+/*
+ * We show kallsyms information even to normal users if we've enabled
+ * kernel profiling and are explicitly not paranoid (so kptr_restrict
+ * is clear, and sysctl_perf_event_paranoid isn't set).
+ *
+ * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
+ * block even that).
+ */
+bool kallsyms_show_value(const struct cred *cred)
+{
+ switch (kptr_restrict) {
+ case 0:
+ if (kallsyms_for_perf())
+ return true;
+ fallthrough;
+ case 1:
+ if (security_capable(cred, &init_user_ns, CAP_SYSLOG,
+ CAP_OPT_NOAUDIT) == 0)
+ return true;
+ fallthrough;
+ default:
+ return false;
+ }
+}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index aad7a3bfd846..1d4bc493b2f4 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -165,6 +165,18 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(vmcoreinfo);
+#ifdef CONFIG_CRASH_HOTPLUG
+static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ unsigned int sz = crash_get_elfcorehdr_size();
+
+ return sysfs_emit(buf, "%u\n", sz);
+}
+KERNEL_ATTR_RO(crash_elfcorehdr_size);
+
+#endif
+
#endif /* CONFIG_CRASH_CORE */
/* whether file capabilities are enabled */
@@ -255,6 +267,9 @@ static struct attribute * kernel_attrs[] = {
#endif
#ifdef CONFIG_CRASH_CORE
&vmcoreinfo_attr.attr,
+#ifdef CONFIG_CRASH_HOTPLUG
+ &crash_elfcorehdr_size_attr.attr,
+#endif
#endif
#ifndef CONFIG_TINY_RCU
&rcu_expedited_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 490792b1066e..1eea53050bab 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -159,11 +159,10 @@ bool kthread_should_stop(void)
}
EXPORT_SYMBOL(kthread_should_stop);
-bool __kthread_should_park(struct task_struct *k)
+static bool __kthread_should_park(struct task_struct *k)
{
return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
}
-EXPORT_SYMBOL_GPL(__kthread_should_park);
/**
* kthread_should_park - should this kthread park now?
@@ -182,6 +181,16 @@ bool kthread_should_park(void)
}
EXPORT_SYMBOL_GPL(kthread_should_park);
+bool kthread_should_stop_or_park(void)
+{
+ struct kthread *kthread = __to_kthread(current);
+
+ if (!kthread)
+ return false;
+
+ return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK));
+}
+
/**
* kthread_freezable_should_stop - should this freezable kthread return now?
* @was_frozen: optional out parameter, indicates whether %current was frozen
@@ -312,10 +321,10 @@ void __noreturn kthread_exit(long result)
* @comp: Completion to complete
* @code: The integer value to return to kthread_stop().
*
- * If present complete @comp and the reuturn code to kthread_stop().
+ * If present, complete @comp and then return code to kthread_stop().
*
* A kernel thread whose module may be removed after the completion of
- * @comp can use this function exit safely.
+ * @comp can use this function to exit safely.
*
* Does not return.
*/
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index e9fd83a02228..e54c3d60a904 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -15,7 +15,7 @@
#include "transition.h"
#define MAX_STACK_ENTRIES 100
-DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries);
+static DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries);
#define STACK_ERR_BUF_SIZE 128
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
index 8c7e7d25f09c..a6016b91803d 100644
--- a/kernel/locking/lock_events.h
+++ b/kernel/locking/lock_events.h
@@ -57,4 +57,8 @@ static inline void __lockevent_add(enum lock_events event, int inc)
#define lockevent_cond_inc(ev, c)
#endif /* CONFIG_LOCK_EVENT_COUNTS */
+
+ssize_t lockevent_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos);
+
#endif /* __LOCKING_LOCK_EVENTS_H */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4dfd2f3e09b2..e85b5ad3e206 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -709,7 +709,7 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
usage[i] = '\0';
}
-static void __print_lock_name(struct lock_class *class)
+static void __print_lock_name(struct held_lock *hlock, struct lock_class *class)
{
char str[KSYM_NAME_LEN];
const char *name;
@@ -724,17 +724,19 @@ static void __print_lock_name(struct lock_class *class)
printk(KERN_CONT "#%d", class->name_version);
if (class->subclass)
printk(KERN_CONT "/%d", class->subclass);
+ if (hlock && class->print_fn)
+ class->print_fn(hlock->instance);
}
}
-static void print_lock_name(struct lock_class *class)
+static void print_lock_name(struct held_lock *hlock, struct lock_class *class)
{
char usage[LOCK_USAGE_CHARS];
get_usage_chars(class, usage);
printk(KERN_CONT " (");
- __print_lock_name(class);
+ __print_lock_name(hlock, class);
printk(KERN_CONT "){%s}-{%d:%d}", usage,
class->wait_type_outer ?: class->wait_type_inner,
class->wait_type_inner);
@@ -772,7 +774,7 @@ static void print_lock(struct held_lock *hlock)
}
printk(KERN_CONT "%px", hlock->instance);
- print_lock_name(lock);
+ print_lock_name(hlock, lock);
printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
@@ -817,34 +819,26 @@ static int very_verbose(struct lock_class *class)
* Is this the address of a static object:
*/
#ifdef __KERNEL__
-/*
- * Check if an address is part of freed initmem. After initmem is freed,
- * memory can be allocated from it, and such allocations would then have
- * addresses within the range [_stext, _end].
- */
-#ifndef arch_is_kernel_initmem_freed
-static int arch_is_kernel_initmem_freed(unsigned long addr)
-{
- if (system_state < SYSTEM_FREEING_INITMEM)
- return 0;
-
- return init_section_contains((void *)addr, 1);
-}
-#endif
-
static int static_obj(const void *obj)
{
- unsigned long start = (unsigned long) &_stext,
- end = (unsigned long) &_end,
- addr = (unsigned long) obj;
+ unsigned long addr = (unsigned long) obj;
- if (arch_is_kernel_initmem_freed(addr))
- return 0;
+ if (is_kernel_core_data(addr))
+ return 1;
/*
- * static variable?
+ * keys are allowed in the __ro_after_init section.
*/
- if ((addr >= start) && (addr < end))
+ if (is_kernel_rodata(addr))
+ return 1;
+
+ /*
+ * in initdata section and used during bootup only?
+ * NOTE: On some platforms the initdata section is
+ * outside of the _stext ... _end range.
+ */
+ if (system_state < SYSTEM_FREEING_INITMEM &&
+ init_section_contains((void *)addr, 1))
return 1;
/*
@@ -1868,7 +1862,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)
if (debug_locks_silent)
return;
printk("\n-> #%u", depth);
- print_lock_name(target->class);
+ print_lock_name(NULL, target->class);
printk(KERN_CONT ":\n");
print_lock_trace(target->trace, 6);
}
@@ -1899,11 +1893,11 @@ print_circular_lock_scenario(struct held_lock *src,
*/
if (parent != source) {
printk("Chain exists of:\n ");
- __print_lock_name(source);
+ __print_lock_name(src, source);
printk(KERN_CONT " --> ");
- __print_lock_name(parent);
+ __print_lock_name(NULL, parent);
printk(KERN_CONT " --> ");
- __print_lock_name(target);
+ __print_lock_name(tgt, target);
printk(KERN_CONT "\n\n");
}
@@ -1914,13 +1908,13 @@ print_circular_lock_scenario(struct held_lock *src,
printk(" rlock(");
else
printk(" lock(");
- __print_lock_name(target);
+ __print_lock_name(tgt, target);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(parent);
+ __print_lock_name(NULL, parent);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(target);
+ __print_lock_name(tgt, target);
printk(KERN_CONT ");\n");
if (src_read != 0)
printk(" rlock(");
@@ -1928,7 +1922,7 @@ print_circular_lock_scenario(struct held_lock *src,
printk(" sync(");
else
printk(" lock(");
- __print_lock_name(source);
+ __print_lock_name(src, source);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -2154,6 +2148,8 @@ check_path(struct held_lock *target, struct lock_list *src_entry,
return ret;
}
+static void print_deadlock_bug(struct task_struct *, struct held_lock *, struct held_lock *);
+
/*
* Prove that the dependency graph starting at <src> can not
* lead to <target>. If it can, there is a circle when adding
@@ -2185,7 +2181,10 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
*trace = save_trace();
}
- print_circular_bug(&src_entry, target_entry, src, target);
+ if (src->class_idx == target->class_idx)
+ print_deadlock_bug(current, src, target);
+ else
+ print_circular_bug(&src_entry, target_entry, src, target);
}
return ret;
@@ -2346,7 +2345,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
int bit;
printk("%*s->", depth, "");
- print_lock_name(class);
+ print_lock_name(NULL, class);
#ifdef CONFIG_DEBUG_LOCKDEP
printk(KERN_CONT " ops: %lu", debug_class_ops_read(class));
#endif
@@ -2528,11 +2527,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
*/
if (middle_class != unsafe_class) {
printk("Chain exists of:\n ");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT " --> ");
- __print_lock_name(middle_class);
+ __print_lock_name(NULL, middle_class);
printk(KERN_CONT " --> ");
- __print_lock_name(unsafe_class);
+ __print_lock_name(NULL, unsafe_class);
printk(KERN_CONT "\n\n");
}
@@ -2540,18 +2539,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
printk(" lock(");
- __print_lock_name(unsafe_class);
+ __print_lock_name(NULL, unsafe_class);
printk(KERN_CONT ");\n");
printk(" local_irq_disable();\n");
printk(" lock(");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(middle_class);
+ __print_lock_name(NULL, middle_class);
printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -2588,20 +2587,20 @@ print_bad_irq_dependency(struct task_struct *curr,
pr_warn("\nand this task is already holding:\n");
print_lock(prev);
pr_warn("which would create a new lock dependency:\n");
- print_lock_name(hlock_class(prev));
+ print_lock_name(prev, hlock_class(prev));
pr_cont(" ->");
- print_lock_name(hlock_class(next));
+ print_lock_name(next, hlock_class(next));
pr_cont("\n");
pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n",
irqclass);
- print_lock_name(backwards_entry->class);
+ print_lock_name(NULL, backwards_entry->class);
pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
print_lock_trace(backwards_entry->class->usage_traces[bit1], 1);
pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
- print_lock_name(forwards_entry->class);
+ print_lock_name(NULL, forwards_entry->class);
pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
pr_warn("...");
@@ -2971,10 +2970,10 @@ print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv)
printk(" CPU0\n");
printk(" ----\n");
printk(" lock(");
- __print_lock_name(prev);
+ __print_lock_name(prv, prev);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(next);
+ __print_lock_name(nxt, next);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
printk(" May be due to missing lock nesting notation\n\n");
@@ -2984,6 +2983,8 @@ static void
print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next)
{
+ struct lock_class *class = hlock_class(prev);
+
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return;
@@ -2998,6 +2999,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
pr_warn("\nbut task is already holding lock:\n");
print_lock(prev);
+ if (class->cmp_fn) {
+ pr_warn("and the lock comparison function returns %i:\n",
+ class->cmp_fn(prev->instance, next->instance));
+ }
+
pr_warn("\nother info that might help us debug this:\n");
print_deadlock_scenario(next, prev);
lockdep_print_held_locks(curr);
@@ -3019,6 +3025,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
static int
check_deadlock(struct task_struct *curr, struct held_lock *next)
{
+ struct lock_class *class;
struct held_lock *prev;
struct held_lock *nest = NULL;
int i;
@@ -3039,6 +3046,12 @@ check_deadlock(struct task_struct *curr, struct held_lock *next)
if ((next->read == 2) && prev->read)
continue;
+ class = hlock_class(prev);
+
+ if (class->cmp_fn &&
+ class->cmp_fn(prev->instance, next->instance) < 0)
+ continue;
+
/*
* We're holding the nest_lock, which serializes this lock's
* nesting behaviour.
@@ -3100,6 +3113,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return 2;
}
+ if (prev->class_idx == next->class_idx) {
+ struct lock_class *class = hlock_class(prev);
+
+ if (class->cmp_fn &&
+ class->cmp_fn(prev->instance, next->instance) < 0)
+ return 2;
+ }
+
/*
* Prove that the new <prev> -> <next> dependency would not
* create a circular dependency in the graph. (We do this by
@@ -3576,7 +3597,7 @@ static void print_chain_keys_chain(struct lock_chain *chain)
hlock_id = chain_hlocks[chain->base + i];
chain_key = print_chain_key_iteration(hlock_id, chain_key);
- print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id));
+ print_lock_name(NULL, lock_classes + chain_hlock_class_idx(hlock_id));
printk("\n");
}
}
@@ -3933,11 +3954,11 @@ static void print_usage_bug_scenario(struct held_lock *lock)
printk(" CPU0\n");
printk(" ----\n");
printk(" lock(");
- __print_lock_name(class);
+ __print_lock_name(lock, class);
printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
- __print_lock_name(class);
+ __print_lock_name(lock, class);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -4023,7 +4044,7 @@ print_irq_inversion_bug(struct task_struct *curr,
pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
else
pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
- print_lock_name(other->class);
+ print_lock_name(NULL, other->class);
pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n");
pr_warn("\nother info that might help us debug this:\n");
@@ -4896,6 +4917,33 @@ EXPORT_SYMBOL_GPL(lockdep_init_map_type);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
+#ifdef CONFIG_PROVE_LOCKING
+void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn,
+ lock_print_fn print_fn)
+{
+ struct lock_class *class = lock->class_cache[0];
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ lockdep_recursion_inc();
+
+ if (!class)
+ class = register_lock_class(lock, 0, 0);
+
+ if (class) {
+ WARN_ON(class->cmp_fn && class->cmp_fn != cmp_fn);
+ WARN_ON(class->print_fn && class->print_fn != print_fn);
+
+ class->cmp_fn = cmp_fn;
+ class->print_fn = print_fn;
+ }
+
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lockdep_set_lock_cmp_fn);
+#endif
+
static void
print_lock_nested_lock_not_held(struct task_struct *curr,
struct held_lock *hlock)
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 153ddc4c47ef..270c7f80ce84 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -33,24 +33,20 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <[email protected]>");
-torture_param(int, nwriters_stress, -1,
- "Number of write-locking stress-test threads");
-torture_param(int, nreaders_stress, -1,
- "Number of read-locking stress-test threads");
+torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads");
+torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
+torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
- "Time between CPU hotplugs (s), 0=disable");
-torture_param(int, shuffle_interval, 3,
- "Number of jiffies between shuffles, 0=disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable");
torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
-torture_param(int, stat_interval, 60,
- "Number of seconds between stats printk()s");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
torture_param(int, rt_boost, 2,
- "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
+ "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
-torture_param(int, verbose, 1,
- "Enable verbose debugging printk()s");
+torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
#define MAX_NESTED_LOCKS 8
@@ -120,7 +116,7 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused)
static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
+ const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
/* We want a long delay occasionally to force massive contention. */
if (!(torture_random(trsp) %
@@ -198,16 +194,18 @@ __acquires(torture_spinlock)
static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
+ const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
+ unsigned long j;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) {
+ j = jiffies;
mdelay(longdelay_ms);
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j);
+ }
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
@@ -322,7 +320,7 @@ __acquires(torture_rwlock)
static void torture_rwlock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
+ const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
@@ -455,14 +453,12 @@ __acquires(torture_mutex)
static void torture_mutex_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
+ const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
/* We want a long delay occasionally to force massive contention. */
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2000 * longdelay_ms)))
mdelay(longdelay_ms * 5);
- else
- mdelay(longdelay_ms / 5);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
@@ -630,7 +626,7 @@ __acquires(torture_rtmutex)
static void torture_rtmutex_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
+ const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
/*
* We want a short delay mostly to emulate likely code, and
@@ -640,7 +636,7 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp)
(cxt.nrealwriters_stress * 2000 * longdelay_ms)))
mdelay(longdelay_ms);
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ (cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
@@ -695,14 +691,12 @@ __acquires(torture_rwsem)
static void torture_rwsem_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
+ const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
/* We want a long delay occasionally to force massive contention. */
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2000 * longdelay_ms)))
mdelay(longdelay_ms * 10);
- else
- mdelay(longdelay_ms / 10);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
@@ -816,7 +810,8 @@ static int lock_torture_writer(void *arg)
bool skip_main_lock;
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
- set_user_nice(current, MAX_NICE);
+ if (!rt_task(current))
+ set_user_nice(current, MAX_NICE);
do {
if ((torture_random(&rand) & 0xfffff) == 0)
@@ -848,8 +843,8 @@ static int lock_torture_writer(void *arg)
lwsp->n_lock_acquired++;
}
- cxt.cur_ops->write_delay(&rand);
if (!skip_main_lock) {
+ cxt.cur_ops->write_delay(&rand);
lock_is_write_held = false;
WRITE_ONCE(last_lock_release, jiffies);
cxt.cur_ops->writeunlock(tid);
@@ -1022,8 +1017,7 @@ static void lock_torture_cleanup(void)
if (writer_tasks) {
for (i = 0; i < cxt.nrealwriters_stress; i++)
- torture_stop_kthread(lock_torture_writer,
- writer_tasks[i]);
+ torture_stop_kthread(lock_torture_writer, writer_tasks[i]);
kfree(writer_tasks);
writer_tasks = NULL;
}
@@ -1251,8 +1245,9 @@ static int __init lock_torture_init(void)
goto create_reader;
/* Create writer. */
- firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
- writer_tasks[i]);
+ firsterr = torture_create_kthread_cb(lock_torture_writer, &cxt.lwsa[i],
+ writer_tasks[i],
+ writer_fifo ? sched_set_fifo : NULL);
if (torture_init_error(firsterr))
goto unwind;
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 6afc249ce697..6a0184e9c234 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -486,6 +486,16 @@ gotlock:
}
/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
+ */
+#include <asm/qspinlock_paravirt.h>
+
+/*
* PV versions of the unlock fastpath and slowpath functions to be used
* instead of queued_spin_unlock().
*/
@@ -533,16 +543,6 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
pv_kick(node->cpu);
}
-/*
- * Include the architecture specific callee-save thunk of the
- * __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
- * function close to each other sharing consecutive instruction cachelines.
- * Alternatively, architecture specific version of __pv_queued_spin_unlock()
- * can be defined.
- */
-#include <asm/qspinlock_paravirt.h>
-
#ifndef __pv_queued_spin_unlock
__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock)
{
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 728f434de2bb..21db0df0eb00 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -333,21 +333,43 @@ static __always_inline int __waiter_prio(struct task_struct *task)
return prio;
}
+/*
+ * Update the waiter->tree copy of the sort keys.
+ */
static __always_inline void
waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
{
- waiter->prio = __waiter_prio(task);
- waiter->deadline = task->dl.deadline;
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry));
+
+ waiter->tree.prio = __waiter_prio(task);
+ waiter->tree.deadline = task->dl.deadline;
+}
+
+/*
+ * Update the waiter->pi_tree copy of the sort keys (from the tree copy).
+ */
+static __always_inline void
+waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert_held(&task->pi_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry));
+
+ waiter->pi_tree.prio = waiter->tree.prio;
+ waiter->pi_tree.deadline = waiter->tree.deadline;
}
/*
- * Only use with rt_mutex_waiter_{less,equal}()
+ * Only use with rt_waiter_node_{less,equal}()
*/
+#define task_to_waiter_node(p) \
+ &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
#define task_to_waiter(p) \
- &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
+ &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) }
-static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
- struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
if (left->prio < right->prio)
return 1;
@@ -364,8 +386,8 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
return 0;
}
-static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
- struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
if (left->prio != right->prio)
return 0;
@@ -385,7 +407,7 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
struct rt_mutex_waiter *top_waiter)
{
- if (rt_mutex_waiter_less(waiter, top_waiter))
+ if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree))
return true;
#ifdef RT_MUTEX_BUILD_SPINLOCKS
@@ -393,30 +415,30 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
* Note that RT tasks are excluded from same priority (lateral)
* steals to prevent the introduction of an unbounded latency.
*/
- if (rt_prio(waiter->prio) || dl_prio(waiter->prio))
+ if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio))
return false;
- return rt_mutex_waiter_equal(waiter, top_waiter);
+ return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
#else
return false;
#endif
}
#define __node_2_waiter(node) \
- rb_entry((node), struct rt_mutex_waiter, tree_entry)
+ rb_entry((node), struct rt_mutex_waiter, tree.entry)
static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
{
struct rt_mutex_waiter *aw = __node_2_waiter(a);
struct rt_mutex_waiter *bw = __node_2_waiter(b);
- if (rt_mutex_waiter_less(aw, bw))
+ if (rt_waiter_node_less(&aw->tree, &bw->tree))
return 1;
if (!build_ww_mutex())
return 0;
- if (rt_mutex_waiter_less(bw, aw))
+ if (rt_waiter_node_less(&bw->tree, &aw->tree))
return 0;
/* NOTE: relies on waiter->ww_ctx being set before insertion */
@@ -434,48 +456,58 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod
static __always_inline void
rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
- rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
+ lockdep_assert_held(&lock->wait_lock);
+
+ rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less);
}
static __always_inline void
rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
- if (RB_EMPTY_NODE(&waiter->tree_entry))
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (RB_EMPTY_NODE(&waiter->tree.entry))
return;
- rb_erase_cached(&waiter->tree_entry, &lock->waiters);
- RB_CLEAR_NODE(&waiter->tree_entry);
+ rb_erase_cached(&waiter->tree.entry, &lock->waiters);
+ RB_CLEAR_NODE(&waiter->tree.entry);
}
-#define __node_2_pi_waiter(node) \
- rb_entry((node), struct rt_mutex_waiter, pi_tree_entry)
+#define __node_2_rt_node(node) \
+ rb_entry((node), struct rt_waiter_node, entry)
-static __always_inline bool
-__pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
{
- return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b));
+ return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b));
}
static __always_inline void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less);
+ lockdep_assert_held(&task->pi_lock);
+
+ rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less);
}
static __always_inline void
rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
+ lockdep_assert_held(&task->pi_lock);
+
+ if (RB_EMPTY_NODE(&waiter->pi_tree.entry))
return;
- rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
}
-static __always_inline void rt_mutex_adjust_prio(struct task_struct *p)
+static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock,
+ struct task_struct *p)
{
struct task_struct *pi_task = NULL;
+ lockdep_assert_held(&lock->wait_lock);
+ lockdep_assert(rt_mutex_owner(lock) == p);
lockdep_assert_held(&p->pi_lock);
if (task_has_pi_waiters(p))
@@ -571,9 +603,14 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st
* Chain walk basics and protection scope
*
* [R] refcount on task
- * [P] task->pi_lock held
+ * [Pn] task->pi_lock held
* [L] rtmutex->wait_lock held
*
+ * Normal locking order:
+ *
+ * rtmutex->wait_lock
+ * task->pi_lock
+ *
* Step Description Protected by
* function arguments:
* @task [R]
@@ -588,27 +625,32 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st
* again:
* loop_sanity_check();
* retry:
- * [1] lock(task->pi_lock); [R] acquire [P]
- * [2] waiter = task->pi_blocked_on; [P]
- * [3] check_exit_conditions_1(); [P]
- * [4] lock = waiter->lock; [P]
- * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
- * unlock(task->pi_lock); release [P]
+ * [1] lock(task->pi_lock); [R] acquire [P1]
+ * [2] waiter = task->pi_blocked_on; [P1]
+ * [3] check_exit_conditions_1(); [P1]
+ * [4] lock = waiter->lock; [P1]
+ * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L]
+ * unlock(task->pi_lock); release [P1]
* goto retry;
* }
- * [6] check_exit_conditions_2(); [P] + [L]
- * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
- * [8] unlock(task->pi_lock); release [P]
+ * [6] check_exit_conditions_2(); [P1] + [L]
+ * [7] requeue_lock_waiter(lock, waiter); [P1] + [L]
+ * [8] unlock(task->pi_lock); release [P1]
* put_task_struct(task); release [R]
* [9] check_exit_conditions_3(); [L]
* [10] task = owner(lock); [L]
* get_task_struct(task); [L] acquire [R]
- * lock(task->pi_lock); [L] acquire [P]
- * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
- * [12] check_exit_conditions_4(); [P] + [L]
- * [13] unlock(task->pi_lock); release [P]
+ * lock(task->pi_lock); [L] acquire [P2]
+ * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L]
+ * [12] check_exit_conditions_4(); [P2] + [L]
+ * [13] unlock(task->pi_lock); release [P2]
* unlock(lock->wait_lock); release [L]
* goto again;
+ *
+ * Where P1 is the blocking task and P2 is the lock owner; going up one step
+ * the owner becomes the next blocked task etc..
+ *
+*
*/
static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
enum rtmutex_chainwalk chwalk,
@@ -756,7 +798,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* enabled we continue, but stop the requeueing in the chain
* walk.
*/
- if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
if (!detect_deadlock)
goto out_unlock_pi;
else
@@ -764,13 +806,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/*
- * [4] Get the next lock
+ * [4] Get the next lock; per holding task->pi_lock we can't unblock
+ * and guarantee @lock's existence.
*/
lock = waiter->lock;
/*
* [5] We need to trylock here as we are holding task->pi_lock,
* which is the reverse lock order versus the other rtmutex
* operations.
+ *
+ * Per the above, holding task->pi_lock guarantees lock exists, so
+ * inverting this lock order is infeasible from a life-time
+ * perspective.
*/
if (!raw_spin_trylock(&lock->wait_lock)) {
raw_spin_unlock_irq(&task->pi_lock);
@@ -874,17 +921,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* or
*
* DL CBS enforcement advancing the effective deadline.
- *
- * Even though pi_waiters also uses these fields, and that tree is only
- * updated in [11], we can do this here, since we hold [L], which
- * serializes all pi_waiters access and rb_erase() does not care about
- * the values of the node being removed.
*/
waiter_update_prio(waiter, task);
rt_mutex_enqueue(lock, waiter);
- /* [8] Release the task */
+ /*
+ * [8] Release the (blocking) task in preparation for
+ * taking the owner task in [10].
+ *
+ * Since we hold lock->waiter_lock, task cannot unblock, even if we
+ * release task->pi_lock.
+ */
raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
@@ -908,7 +956,12 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
return 0;
}
- /* [10] Grab the next task, i.e. the owner of @lock */
+ /*
+ * [10] Grab the next task, i.e. the owner of @lock
+ *
+ * Per holding lock->wait_lock and checking for !owner above, there
+ * must be an owner and it cannot go away.
+ */
task = get_task_struct(rt_mutex_owner(lock));
raw_spin_lock(&task->pi_lock);
@@ -921,8 +974,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* and adjust the priority of the owner.
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else if (prerequeue_top_waiter == waiter) {
/*
@@ -937,8 +991,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else {
/*
* Nothing changed. No need to do any priority
@@ -1154,6 +1209,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
waiter->task = task;
waiter->lock = lock;
waiter_update_prio(waiter, task);
+ waiter_clone_prio(waiter, task);
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -1187,7 +1243,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
- rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(lock, owner);
if (owner->pi_blocked_on)
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1234,6 +1290,8 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
{
struct rt_mutex_waiter *waiter;
+ lockdep_assert_held(&lock->wait_lock);
+
raw_spin_lock(&current->pi_lock);
waiter = rt_mutex_top_waiter(lock);
@@ -1246,7 +1304,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
* task unblocks.
*/
rt_mutex_dequeue_pi(current, waiter);
- rt_mutex_adjust_prio(current);
+ rt_mutex_adjust_prio(lock, current);
/*
* As we are waking up the top waiter, and the waiter stays
@@ -1482,7 +1540,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock,
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
- rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(lock, owner);
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index cb9fdff76a8a..a6974d044593 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -459,7 +459,7 @@ void __sched rt_mutex_adjust_pi(struct task_struct *task)
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index c47e8361bfb5..1162e07cdaea 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -17,27 +17,44 @@
#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
+
+/*
+ * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two
+ * separate trees and they need their own copy of the sort keys because of
+ * different locking requirements.
+ *
+ * @entry: rbtree node to enqueue into the waiters tree
+ * @prio: Priority of the waiter
+ * @deadline: Deadline of the waiter if applicable
+ *
+ * See rt_waiter_node_less() and waiter_*_prio().
+ */
+struct rt_waiter_node {
+ struct rb_node entry;
+ int prio;
+ u64 deadline;
+};
+
/*
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
- * @tree_entry: pi node to enqueue into the mutex waiters tree
- * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
+ * @tree: node to enqueue into the mutex waiters tree
+ * @pi_tree: node to enqueue into the mutex owner waiters tree
* @task: task reference to the blocked task
* @lock: Pointer to the rt_mutex on which the waiter blocks
* @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
- * @prio: Priority of the waiter
- * @deadline: Deadline of the waiter if applicable
* @ww_ctx: WW context pointer
+ *
+ * @tree is ordered by @lock->wait_lock
+ * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock
*/
struct rt_mutex_waiter {
- struct rb_node tree_entry;
- struct rb_node pi_tree_entry;
+ struct rt_waiter_node tree;
+ struct rt_waiter_node pi_tree;
struct task_struct *task;
struct rt_mutex_base *lock;
unsigned int wake_state;
- int prio;
- u64 deadline;
struct ww_acquire_ctx *ww_ctx;
};
@@ -105,7 +122,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
{
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
- return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter;
+ return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter;
}
static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
@@ -113,8 +130,10 @@ static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
struct rt_mutex_waiter *w = NULL;
+ lockdep_assert_held(&lock->wait_lock);
+
if (leftmost) {
- w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
+ w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry);
BUG_ON(w->lock != lock);
}
return w;
@@ -127,8 +146,10 @@ static inline int task_has_pi_waiters(struct task_struct *p)
static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
{
+ lockdep_assert_held(&p->pi_lock);
+
return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter,
- pi_tree_entry);
+ pi_tree.entry);
}
#define RT_MUTEX_HAS_WAITERS 1UL
@@ -190,8 +211,8 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
{
debug_rt_mutex_init_waiter(waiter);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
- RB_CLEAR_NODE(&waiter->tree_entry);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
+ RB_CLEAR_NODE(&waiter->tree.entry);
waiter->wake_state = TASK_NORMAL;
waiter->task = NULL;
}
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 56f139201f24..3ad2cc4823e5 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -96,25 +96,25 @@ __ww_waiter_first(struct rt_mutex *lock)
struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
if (!n)
return NULL;
- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
}
static inline struct rt_mutex_waiter *
__ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w)
{
- struct rb_node *n = rb_next(&w->tree_entry);
+ struct rb_node *n = rb_next(&w->tree.entry);
if (!n)
return NULL;
- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
}
static inline struct rt_mutex_waiter *
__ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
{
- struct rb_node *n = rb_prev(&w->tree_entry);
+ struct rb_node *n = rb_prev(&w->tree.entry);
if (!n)
return NULL;
- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
}
static inline struct rt_mutex_waiter *
@@ -123,7 +123,7 @@ __ww_waiter_last(struct rt_mutex *lock)
struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
if (!n)
return NULL;
- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
}
static inline void
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index 8a5d6d63b06c..87440f714c0c 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -241,7 +241,7 @@ static ssize_t module_zstd_decompress(struct load_info *info,
}
wksp_size = zstd_dstream_workspace_bound(header.windowSize);
- wksp = kmalloc(wksp_size, GFP_KERNEL);
+ wksp = vmalloc(wksp_size);
if (!wksp) {
retval = -ENOMEM;
goto out;
@@ -284,7 +284,7 @@ static ssize_t module_zstd_decompress(struct load_info *info,
retval = new_size;
out:
- kfree(wksp);
+ vfree(wksp);
return retval;
}
#else
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index dc7b0160c480..c8b7b4dcf782 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -32,6 +32,18 @@
/* Maximum number of characters written by module_flags() */
#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
+struct kernel_symbol {
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ int value_offset;
+ int name_offset;
+ int namespace_offset;
+#else
+ unsigned long value;
+ const char *name;
+ const char *namespace;
+#endif
+};
+
extern struct mutex module_mutex;
extern struct list_head modules;
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index c550d7d45f2f..ef73ae7c8909 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -381,34 +381,6 @@ out:
return -ERANGE;
}
-int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
- unsigned long *offset, char *modname, char *name)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- const char *sym;
-
- sym = find_kallsyms_symbol(mod, addr, size, offset);
- if (!sym)
- goto out;
- if (modname)
- strscpy(modname, mod->name, MODULE_NAME_LEN);
- if (name)
- strscpy(name, sym, KSYM_NAME_LEN);
- preempt_enable();
- return 0;
- }
- }
-out:
- preempt_enable();
- return -ERANGE;
-}
-
int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
char *name, char *module_name, int *exported)
{
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 4e2cf784cf8c..98fedfdb8db5 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -820,10 +820,8 @@ static struct module_attribute modinfo_refcnt =
void __module_get(struct module *module)
{
if (module) {
- preempt_disable();
atomic_inc(&module->refcnt);
trace_module_get(module, _RET_IP_);
- preempt_enable();
}
}
EXPORT_SYMBOL(__module_get);
@@ -833,15 +831,12 @@ bool try_module_get(struct module *module)
bool ret = true;
if (module) {
- preempt_disable();
/* Note: here, we can fail to get a reference */
if (likely(module_is_live(module) &&
atomic_inc_not_zero(&module->refcnt) != 0))
trace_module_get(module, _RET_IP_);
else
ret = false;
-
- preempt_enable();
}
return ret;
}
@@ -852,11 +847,9 @@ void module_put(struct module *module)
int ret;
if (module) {
- preempt_disable();
ret = atomic_dec_if_positive(&module->refcnt);
WARN_ON(ret < 0); /* Failed to put refcount */
trace_module_put(module, _RET_IP_);
- preempt_enable();
}
}
EXPORT_SYMBOL(module_put);
@@ -1302,12 +1295,20 @@ void *__symbol_get(const char *symbol)
};
preempt_disable();
- if (!find_symbol(&fsa) || strong_try_module_get(fsa.owner)) {
- preempt_enable();
- return NULL;
+ if (!find_symbol(&fsa))
+ goto fail;
+ if (fsa.license != GPL_ONLY) {
+ pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n",
+ symbol);
+ goto fail;
}
+ if (strong_try_module_get(fsa.owner))
+ goto fail;
preempt_enable();
return (void *)kernel_symbol_value(fsa.sym);
+fail:
+ preempt_enable();
+ return NULL;
}
EXPORT_SYMBOL_GPL(__symbol_get);
@@ -1491,7 +1492,7 @@ long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
return offset | mask;
}
-static bool module_init_layout_section(const char *sname)
+bool module_init_layout_section(const char *sname)
{
#ifndef CONFIG_MODULE_UNLOAD
if (module_exit_section(sname))
@@ -3057,34 +3058,82 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
return load_module(&info, uargs, 0);
}
-SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
+struct idempotent {
+ const void *cookie;
+ struct hlist_node entry;
+ struct completion complete;
+ int ret;
+};
+
+#define IDEM_HASH_BITS 8
+static struct hlist_head idem_hash[1 << IDEM_HASH_BITS];
+static DEFINE_SPINLOCK(idem_lock);
+
+static bool idempotent(struct idempotent *u, const void *cookie)
{
- struct load_info info = { };
- void *buf = NULL;
- int len;
- int err;
+ int hash = hash_ptr(cookie, IDEM_HASH_BITS);
+ struct hlist_head *head = idem_hash + hash;
+ struct idempotent *existing;
+ bool first;
- err = may_init_module();
- if (err)
- return err;
+ u->ret = 0;
+ u->cookie = cookie;
+ init_completion(&u->complete);
- pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
+ spin_lock(&idem_lock);
+ first = true;
+ hlist_for_each_entry(existing, head, entry) {
+ if (existing->cookie != cookie)
+ continue;
+ first = false;
+ break;
+ }
+ hlist_add_head(&u->entry, idem_hash + hash);
+ spin_unlock(&idem_lock);
- if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
- |MODULE_INIT_IGNORE_VERMAGIC
- |MODULE_INIT_COMPRESSED_FILE))
- return -EINVAL;
+ return !first;
+}
+
+/*
+ * We were the first one with 'cookie' on the list, and we ended
+ * up completing the operation. We now need to walk the list,
+ * remove everybody - which includes ourselves - fill in the return
+ * value, and then complete the operation.
+ */
+static int idempotent_complete(struct idempotent *u, int ret)
+{
+ const void *cookie = u->cookie;
+ int hash = hash_ptr(cookie, IDEM_HASH_BITS);
+ struct hlist_head *head = idem_hash + hash;
+ struct hlist_node *next;
+ struct idempotent *pos;
+
+ spin_lock(&idem_lock);
+ hlist_for_each_entry_safe(pos, next, head, entry) {
+ if (pos->cookie != cookie)
+ continue;
+ hlist_del(&pos->entry);
+ pos->ret = ret;
+ complete(&pos->complete);
+ }
+ spin_unlock(&idem_lock);
+ return ret;
+}
+
+static int init_module_from_file(struct file *f, const char __user * uargs, int flags)
+{
+ struct load_info info = { };
+ void *buf = NULL;
+ int len;
- len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL,
- READING_MODULE);
+ len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, READING_MODULE);
if (len < 0) {
mod_stat_inc(&failed_kreads);
- mod_stat_add_long(len, &invalid_kread_bytes);
return len;
}
if (flags & MODULE_INIT_COMPRESSED_FILE) {
- err = module_decompress(&info, buf, len);
+ int err = module_decompress(&info, buf, len);
vfree(buf); /* compressed data is no longer needed */
if (err) {
mod_stat_inc(&failed_decompress);
@@ -3099,6 +3148,46 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
return load_module(&info, uargs, flags);
}
+static int idempotent_init_module(struct file *f, const char __user * uargs, int flags)
+{
+ struct idempotent idem;
+
+ if (!f || !(f->f_mode & FMODE_READ))
+ return -EBADF;
+
+ /* See if somebody else is doing the operation? */
+ if (idempotent(&idem, file_inode(f))) {
+ wait_for_completion(&idem.complete);
+ return idem.ret;
+ }
+
+ /* Otherwise, we'll do it and complete others */
+ return idempotent_complete(&idem,
+ init_module_from_file(f, uargs, flags));
+}
+
+SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
+{
+ int err;
+ struct fd f;
+
+ err = may_init_module();
+ if (err)
+ return err;
+
+ pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
+
+ if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
+ |MODULE_INIT_IGNORE_VERMAGIC
+ |MODULE_INIT_COMPRESSED_FILE))
+ return -EINVAL;
+
+ f = fdget(fd);
+ err = idempotent_init_module(f.file, uargs, flags);
+ fdput(f);
+ return err;
+}
+
/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
char *module_flags(struct module *mod, char *buf, bool show_state)
{
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 80d9c6d77a45..15781acaac1c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -30,7 +30,7 @@
static struct kmem_cache *nsproxy_cachep;
struct nsproxy init_nsproxy = {
- .count = ATOMIC_INIT(1),
+ .count = REFCOUNT_INIT(1),
.uts_ns = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
.ipc_ns = &init_ipc_ns,
@@ -55,7 +55,7 @@ static inline struct nsproxy *create_nsproxy(void)
nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
if (nsproxy)
- atomic_set(&nsproxy->count, 1);
+ refcount_set(&nsproxy->count, 1);
return nsproxy;
}
diff --git a/kernel/panic.c b/kernel/panic.c
index 886d2ebd0a0d..07239d4ad81e 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -216,7 +216,7 @@ static void panic_print_sys_info(bool console_flush)
show_state();
if (panic_print & PANIC_PRINT_MEM_INFO)
- show_mem(0, NULL);
+ show_mem();
if (panic_print & PANIC_PRINT_TIMER_INFO)
sysrq_timer_list_show();
@@ -684,6 +684,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
add_taint(taint, LOCKDEP_STILL_OK);
}
+#ifdef CONFIG_BUG
#ifndef __WARN_FLAGS
void warn_slowpath_fmt(const char *file, int line, unsigned taint,
const char *fmt, ...)
@@ -722,8 +723,6 @@ void __warn_printk(const char *fmt, ...)
EXPORT_SYMBOL(__warn_printk);
#endif
-#ifdef CONFIG_BUG
-
/* Support resetting WARN*_ONCE state */
static int clear_warn_once_set(void *data, u64 val)
diff --git a/kernel/params.c b/kernel/params.c
index 6a7548979aa9..2d4a0564697e 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -331,7 +331,7 @@ EXPORT_SYMBOL(param_ops_bool);
int param_set_bool_enable_only(const char *val, const struct kernel_param *kp)
{
- int err = 0;
+ int err;
bool new_value;
bool orig_value = *(bool *)kp->arg;
struct kernel_param dummy_kp = *kp;
@@ -847,7 +847,7 @@ static void __init param_sysfs_builtin(void)
name_len = 0;
} else {
name_len = dot - kp->name + 1;
- strlcpy(modname, kp->name, name_len);
+ strscpy(modname, kp->name, name_len);
}
kernel_add_sysfs_param(modname, kp, name_len);
}
diff --git a/kernel/pid.c b/kernel/pid.c
index f93954a0384d..fee14a4486a3 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -83,6 +83,9 @@ struct pid_namespace init_pid_ns = {
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
+#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
@@ -656,8 +659,11 @@ void __init pid_idr_init(void)
idr_init(&init_pid_ns.idr);
- init_pid_ns.pid_cachep = KMEM_CACHE(pid,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
+ init_pid_ns.pid_cachep = kmem_cache_create("pid",
+ struct_size_t(struct pid, numbers, 1),
+ __alignof__(struct pid),
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+ NULL);
}
static struct file *__pidfd_fget(struct task_struct *task, int fd)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index b43eee07b00c..619972c78774 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -48,7 +48,7 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
return kc;
snprintf(name, sizeof(name), "pid_%u", level + 1);
- len = sizeof(struct pid) + level * sizeof(struct upid);
+ len = struct_size_t(struct pid, numbers, level + 1);
mutex_lock(&pid_caches_mutex);
/* Name collision forces to do allocation under mutex. */
if (!*pkc)
@@ -110,9 +110,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
-
- initialize_memfd_noexec_scope(ns);
-
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
+#endif
return ns;
out_free_idr:
diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h
index d67a4d45bb42..2ee41a3a1dfd 100644
--- a/kernel/pid_sysctl.h
+++ b/kernel/pid_sysctl.h
@@ -5,33 +5,30 @@
#include <linux/pid_namespace.h>
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
-static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns)
-{
- ns->memfd_noexec_scope =
- task_active_pid_ns(current)->memfd_noexec_scope;
-}
-
static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table,
int write, void *buf, size_t *lenp, loff_t *ppos)
{
struct pid_namespace *ns = task_active_pid_ns(current);
struct ctl_table table_copy;
+ int err, scope, parent_scope;
if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
table_copy = *table;
- if (ns != &init_pid_ns)
- table_copy.data = &ns->memfd_noexec_scope;
- /*
- * set minimum to current value, the effect is only bigger
- * value is accepted.
- */
- if (*(int *)table_copy.data > *(int *)table_copy.extra1)
- table_copy.extra1 = table_copy.data;
+ /* You cannot set a lower enforcement value than your parent. */
+ parent_scope = pidns_memfd_noexec_scope(ns->parent);
+ /* Equivalent to pidns_memfd_noexec_scope(ns). */
+ scope = max(READ_ONCE(ns->memfd_noexec_scope), parent_scope);
+
+ table_copy.data = &scope;
+ table_copy.extra1 = &parent_scope;
- return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
+ err = proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
+ if (!err && write)
+ WRITE_ONCE(ns->memfd_noexec_scope, scope);
+ return err;
}
static struct ctl_table pid_ns_ctl_table_vm[] = {
@@ -51,8 +48,6 @@ static inline void register_pid_ns_sysctl_table_vm(void)
register_sysctl("vm", pid_ns_ctl_table_vm);
}
#else
-static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {}
-static inline void set_memfd_noexec_scope(struct pid_namespace *ns) {}
static inline void register_pid_ns_sysctl_table_vm(void) {}
#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 30d1274f03f6..2b4a946a6ff5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) "PM: hibernation: " fmt
+#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/reboot.h>
@@ -64,7 +65,6 @@ enum {
static int hibernation_mode = HIBERNATION_SHUTDOWN;
bool freezer_test_done;
-bool snapshot_test;
static const struct platform_hibernation_ops *hibernation_ops;
@@ -684,26 +684,22 @@ static void power_down(void)
cpu_relax();
}
-static int load_image_and_restore(void)
+static int load_image_and_restore(bool snapshot_test)
{
int error;
unsigned int flags;
- fmode_t mode = FMODE_READ;
-
- if (snapshot_test)
- mode |= FMODE_EXCL;
pm_pr_dbg("Loading hibernation image.\n");
lock_device_hotplug();
error = create_basic_memory_bitmaps();
if (error) {
- swsusp_close(mode);
+ swsusp_close(snapshot_test);
goto Unlock;
}
error = swsusp_read(&flags);
- swsusp_close(mode);
+ swsusp_close(snapshot_test);
if (!error)
error = hibernation_restore(flags & SF_PLATFORM_MODE);
@@ -721,6 +717,7 @@ static int load_image_and_restore(void)
*/
int hibernate(void)
{
+ bool snapshot_test = false;
unsigned int sleep_flags;
int error;
@@ -748,9 +745,6 @@ int hibernate(void)
if (error)
goto Exit;
- /* protected by system_transition_mutex */
- snapshot_test = false;
-
lock_device_hotplug();
/* Allocate memory management structures */
error = create_basic_memory_bitmaps();
@@ -792,9 +786,9 @@ int hibernate(void)
unlock_device_hotplug();
if (snapshot_test) {
pm_pr_dbg("Checking hibernation image\n");
- error = swsusp_check();
+ error = swsusp_check(snapshot_test);
if (!error)
- error = load_image_and_restore();
+ error = load_image_and_restore(snapshot_test);
}
thaw_processes();
@@ -910,52 +904,10 @@ unlock:
}
EXPORT_SYMBOL_GPL(hibernate_quiet_exec);
-/**
- * software_resume - Resume from a saved hibernation image.
- *
- * This routine is called as a late initcall, when all devices have been
- * discovered and initialized already.
- *
- * The image reading code is called to see if there is a hibernation image
- * available for reading. If that is the case, devices are quiesced and the
- * contents of memory is restored from the saved image.
- *
- * If this is successful, control reappears in the restored target kernel in
- * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
- * attempts to recover gracefully and make the kernel return to the normal mode
- * of operation.
- */
-static int software_resume(void)
+static int __init find_resume_device(void)
{
- int error;
-
- /*
- * If the user said "noresume".. bail out early.
- */
- if (noresume || !hibernation_available())
- return 0;
-
- /*
- * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
- * is configured into the kernel. Since the regular hibernate
- * trigger path is via sysfs which takes a buffer mutex before
- * calling hibernate functions (which take system_transition_mutex)
- * this can cause lockdep to complain about a possible ABBA deadlock
- * which cannot happen since we're in the boot code here and
- * sysfs can't be invoked yet. Therefore, we use a subclass
- * here to avoid lockdep complaining.
- */
- mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING);
-
- snapshot_test = false;
-
- if (swsusp_resume_device)
- goto Check_image;
-
- if (!strlen(resume_file)) {
- error = -ENOENT;
- goto Unlock;
- }
+ if (!strlen(resume_file))
+ return -ENOENT;
pm_pr_dbg("Checking hibernation image partition %s\n", resume_file);
@@ -966,40 +918,41 @@ static int software_resume(void)
}
/* Check if the device is there */
- swsusp_resume_device = name_to_dev_t(resume_file);
- if (!swsusp_resume_device) {
- /*
- * Some device discovery might still be in progress; we need
- * to wait for this to finish.
- */
- wait_for_device_probe();
-
- if (resume_wait) {
- while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0)
- msleep(10);
- async_synchronize_full();
- }
+ if (!early_lookup_bdev(resume_file, &swsusp_resume_device))
+ return 0;
- swsusp_resume_device = name_to_dev_t(resume_file);
- if (!swsusp_resume_device) {
- error = -ENODEV;
- goto Unlock;
- }
+ /*
+ * Some device discovery might still be in progress; we need to wait for
+ * this to finish.
+ */
+ wait_for_device_probe();
+ if (resume_wait) {
+ while (early_lookup_bdev(resume_file, &swsusp_resume_device))
+ msleep(10);
+ async_synchronize_full();
}
- Check_image:
+ return early_lookup_bdev(resume_file, &swsusp_resume_device);
+}
+
+static int software_resume(void)
+{
+ int error;
+
pm_pr_dbg("Hibernation image partition %d:%d present\n",
MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
pm_pr_dbg("Looking for hibernation image.\n");
- error = swsusp_check();
+
+ mutex_lock(&system_transition_mutex);
+ error = swsusp_check(false);
if (error)
goto Unlock;
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
- swsusp_close(FMODE_READ | FMODE_EXCL);
+ swsusp_close(false);
goto Unlock;
}
@@ -1020,7 +973,7 @@ static int software_resume(void)
goto Close_Finish;
}
- error = load_image_and_restore();
+ error = load_image_and_restore(false);
thaw_processes();
Finish:
pm_notifier_call_chain(PM_POST_RESTORE);
@@ -1034,11 +987,43 @@ static int software_resume(void)
pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
- swsusp_close(FMODE_READ | FMODE_EXCL);
+ swsusp_close(false);
goto Finish;
}
-late_initcall_sync(software_resume);
+/**
+ * software_resume_initcall - Resume from a saved hibernation image.
+ *
+ * This routine is called as a late initcall, when all devices have been
+ * discovered and initialized already.
+ *
+ * The image reading code is called to see if there is a hibernation image
+ * available for reading. If that is the case, devices are quiesced and the
+ * contents of memory is restored from the saved image.
+ *
+ * If this is successful, control reappears in the restored target kernel in
+ * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
+ * attempts to recover gracefully and make the kernel return to the normal mode
+ * of operation.
+ */
+static int __init software_resume_initcall(void)
+{
+ /*
+ * If the user said "noresume".. bail out early.
+ */
+ if (noresume || !hibernation_available())
+ return 0;
+
+ if (!swsusp_resume_device) {
+ int error = find_resume_device();
+
+ if (error)
+ return error;
+ }
+
+ return software_resume();
+}
+late_initcall_sync(software_resume_initcall);
static const char * const hibernation_modes[] = {
@@ -1177,7 +1162,11 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
unsigned int sleep_flags;
int len = n;
char *name;
- dev_t res;
+ dev_t dev;
+ int error;
+
+ if (!hibernation_available())
+ return n;
if (len && buf[len-1] == '\n')
len--;
@@ -1185,13 +1174,30 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
if (!name)
return -ENOMEM;
- res = name_to_dev_t(name);
+ error = lookup_bdev(name, &dev);
+ if (error) {
+ unsigned maj, min, offset;
+ char *p, dummy;
+
+ error = 0;
+ if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 ||
+ sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset,
+ &dummy) == 3) {
+ dev = MKDEV(maj, min);
+ if (maj != MAJOR(dev) || min != MINOR(dev))
+ error = -EINVAL;
+ } else {
+ dev = new_decode_dev(simple_strtoul(name, &p, 16));
+ if (*p)
+ error = -EINVAL;
+ }
+ }
kfree(name);
- if (!res)
- return -EINVAL;
+ if (error)
+ return error;
sleep_flags = lock_system_sleep();
- swsusp_resume_device = res;
+ swsusp_resume_device = dev;
unlock_system_sleep(sleep_flags);
pm_pr_dbg("Configured hibernation resume from disk to %u\n",
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3113ec2f1db4..f6425ae3e8b0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -21,6 +21,33 @@
#include "power.h"
#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with system_transition_mutex held
+ * (gfp_allowed_mask also should only be modified with system_transition_mutex
+ * held, unless the suspend/hibernate code is guaranteed not to run in parallel
+ * with that modification).
+ */
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
+{
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
+ if (saved_gfp_mask) {
+ gfp_allowed_mask = saved_gfp_mask;
+ saved_gfp_mask = 0;
+ }
+}
+
+void pm_restrict_gfp_mask(void)
+{
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
+ WARN_ON(saved_gfp_mask);
+ saved_gfp_mask = gfp_allowed_mask;
+ gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
+}
unsigned int lock_system_sleep(void)
{
@@ -556,6 +583,12 @@ power_attr_ro(pm_wakeup_irq);
bool pm_debug_messages_on __read_mostly;
+bool pm_debug_messages_should_print(void)
+{
+ return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON;
+}
+EXPORT_SYMBOL_GPL(pm_debug_messages_should_print);
+
static ssize_t pm_debug_messages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
diff --git a/kernel/power/power.h b/kernel/power/power.h
index b83c8d5e188d..46eb14dc50c3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -26,9 +26,6 @@ extern void __init hibernate_image_size_init(void);
/* Maximum size of architecture specific data in a hibernation header */
#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
-extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
-extern int arch_hibernation_header_restore(void *addr);
-
static inline int init_header_complete(struct swsusp_info *info)
{
return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
@@ -41,8 +38,6 @@ static inline const char *check_image_kernel(struct swsusp_info *info)
}
#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
-extern int hibernate_resume_nonboot_cpu_disable(void);
-
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
@@ -59,7 +54,6 @@ asmlinkage int swsusp_save(void);
/* kernel/power/hibernate.c */
extern bool freezer_test_done;
-extern bool snapshot_test;
extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
@@ -174,11 +168,11 @@ extern int swsusp_swap_in_use(void);
#define SF_HW_SIG 8
/* kernel/power/hibernate.c */
-extern int swsusp_check(void);
+int swsusp_check(bool snapshot_test);
extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
-extern void swsusp_close(fmode_t);
+void swsusp_close(bool snapshot_test);
#ifdef CONFIG_SUSPEND
extern int swsusp_unmark(void);
#endif
@@ -216,6 +210,11 @@ static inline void suspend_test_finish(const char *label) {}
/* kernel/power/main.c */
extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down);
extern int pm_notifier_call_chain(unsigned long val);
+void pm_restrict_gfp_mask(void);
+void pm_restore_gfp_mask(void);
+#else
+static inline void pm_restrict_gfp_mask(void) {}
+static inline void pm_restore_gfp_mask(void) {}
#endif
#ifdef CONFIG_HIGHMEM
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 562aa0e450ed..1f306f158696 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -23,7 +23,7 @@ static void do_poweroff(struct work_struct *dummy)
static DECLARE_WORK(poweroff_work, do_poweroff);
-static void handle_poweroff(int key)
+static void handle_poweroff(u8 key)
{
/* run sysrq poweroff on boot cpu */
schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index af51ed6d45ef..4244b069442e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -220,6 +220,11 @@ static struct pm_qos_constraints cpu_latency_constraints = {
.type = PM_QOS_MIN,
};
+static inline bool cpu_latency_qos_value_invalid(s32 value)
+{
+ return value < 0 && value != PM_QOS_DEFAULT_VALUE;
+}
+
/**
* cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit.
*/
@@ -263,7 +268,7 @@ static void cpu_latency_qos_apply(struct pm_qos_request *req,
*/
void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value)
{
- if (!req)
+ if (!req || cpu_latency_qos_value_invalid(value))
return;
if (cpu_latency_qos_request_active(req)) {
@@ -289,7 +294,7 @@ EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request);
*/
void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value)
{
- if (!req)
+ if (!req || cpu_latency_qos_value_invalid(new_value))
return;
if (!cpu_latency_qos_request_active(req)) {
@@ -426,6 +431,11 @@ late_initcall(cpu_latency_qos_init);
/* Definitions related to the frequency QoS below. */
+static inline bool freq_qos_value_invalid(s32 value)
+{
+ return value < 0 && value != PM_QOS_DEFAULT_VALUE;
+}
+
/**
* freq_constraints_init - Initialize frequency QoS constraints.
* @qos: Frequency QoS constraints to initialize.
@@ -531,7 +541,7 @@ int freq_qos_add_request(struct freq_constraints *qos,
{
int ret;
- if (IS_ERR_OR_NULL(qos) || !req || value < 0)
+ if (IS_ERR_OR_NULL(qos) || !req || freq_qos_value_invalid(value))
return -EINVAL;
if (WARN(freq_qos_request_active(req),
@@ -563,7 +573,7 @@ EXPORT_SYMBOL_GPL(freq_qos_add_request);
*/
int freq_qos_update_request(struct freq_qos_request *req, s32 new_value)
{
- if (!req || new_value < 0)
+ if (!req || freq_qos_value_invalid(new_value))
return -EINVAL;
if (WARN(!freq_qos_request_active(req),
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cd8b7b35f1e8..87e9f7e2bdc0 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -398,12 +398,13 @@ struct mem_zone_bm_rtree {
unsigned int blocks; /* Number of Bitmap Blocks */
};
-/* strcut bm_position is used for browsing memory bitmaps */
+/* struct bm_position is used for browsing memory bitmaps */
struct bm_position {
struct mem_zone_bm_rtree *zone;
struct rtree_node *node;
unsigned long node_pfn;
+ unsigned long cur_pfn;
int node_bit;
};
@@ -589,6 +590,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm)
bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
bm->cur.node_pfn = 0;
+ bm->cur.cur_pfn = BM_END_OF_MAP;
bm->cur.node_bit = 0;
}
@@ -799,6 +801,7 @@ node_found:
bm->cur.zone = zone;
bm->cur.node = node;
bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+ bm->cur.cur_pfn = pfn;
/* Set return values */
*addr = node->data;
@@ -850,6 +853,11 @@ static void memory_bm_clear_current(struct memory_bitmap *bm)
clear_bit(bit, bm->cur.node->data);
}
+static unsigned long memory_bm_get_current(struct memory_bitmap *bm)
+{
+ return bm->cur.cur_pfn;
+}
+
static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
@@ -929,10 +937,12 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
if (bit < bits) {
pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
bm->cur.node_bit = bit + 1;
+ bm->cur.cur_pfn = pfn;
return pfn;
}
} while (rtree_next_node(bm));
+ bm->cur.cur_pfn = BM_END_OF_MAP;
return BM_END_OF_MAP;
}
@@ -1228,6 +1238,58 @@ unsigned int snapshot_additional_pages(struct zone *zone)
return 2 * rtree;
}
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT (128*1024)
+
+static void mark_free_pages(struct zone *zone)
+{
+ unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
+ unsigned long flags;
+ unsigned int order, t;
+ struct page *page;
+
+ if (zone_is_empty(zone))
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ max_zone_pfn = zone_end_pfn(zone);
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (pfn_valid(pfn)) {
+ page = pfn_to_page(pfn);
+
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+
+ if (page_zone(page) != zone)
+ continue;
+
+ if (!swsusp_page_is_forbidden(page))
+ swsusp_unset_page_free(page);
+ }
+
+ for_each_migratetype_order(order, t) {
+ list_for_each_entry(page,
+ &zone->free_area[order].free_list[t], buddy_list) {
+ unsigned long i;
+
+ pfn = page_to_pfn(page);
+ for (i = 0; i < (1UL << order); i++) {
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+ swsusp_set_page_free(pfn_to_page(pfn + i));
+ }
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
#ifdef CONFIG_HIGHMEM
/**
* count_free_highmem_pages - Compute the total number of free highmem pages.
@@ -1371,14 +1433,19 @@ static unsigned int count_data_pages(void)
/*
* This is needed, because copy_page and memcpy are not usable for copying
- * task structs.
+ * task structs. Returns true if the page was filled with only zeros,
+ * otherwise false.
*/
-static inline void do_copy_page(long *dst, long *src)
+static inline bool do_copy_page(long *dst, long *src)
{
+ long z = 0;
int n;
- for (n = PAGE_SIZE / sizeof(long); n; n--)
+ for (n = PAGE_SIZE / sizeof(long); n; n--) {
+ z |= *src;
*dst++ = *src++;
+ }
+ return !z;
}
/**
@@ -1387,17 +1454,21 @@ static inline void do_copy_page(long *dst, long *src)
* Check if the page we are going to copy is marked as present in the kernel
* page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or
* CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present()
- * always returns 'true'.
+ * always returns 'true'. Returns true if the page was entirely composed of
+ * zeros, otherwise it will return false.
*/
-static void safe_copy_page(void *dst, struct page *s_page)
+static bool safe_copy_page(void *dst, struct page *s_page)
{
+ bool zeros_only;
+
if (kernel_page_present(s_page)) {
- do_copy_page(dst, page_address(s_page));
+ zeros_only = do_copy_page(dst, page_address(s_page));
} else {
hibernate_map_page(s_page);
- do_copy_page(dst, page_address(s_page));
+ zeros_only = do_copy_page(dst, page_address(s_page));
hibernate_unmap_page(s_page);
}
+ return zeros_only;
}
#ifdef CONFIG_HIGHMEM
@@ -1407,17 +1478,18 @@ static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn
saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
}
-static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
{
struct page *s_page, *d_page;
void *src, *dst;
+ bool zeros_only;
s_page = pfn_to_page(src_pfn);
d_page = pfn_to_page(dst_pfn);
if (PageHighMem(s_page)) {
src = kmap_atomic(s_page);
dst = kmap_atomic(d_page);
- do_copy_page(dst, src);
+ zeros_only = do_copy_page(dst, src);
kunmap_atomic(dst);
kunmap_atomic(src);
} else {
@@ -1426,30 +1498,39 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
* The page pointed to by src may contain some kernel
* data modified by kmap_atomic()
*/
- safe_copy_page(buffer, s_page);
+ zeros_only = safe_copy_page(buffer, s_page);
dst = kmap_atomic(d_page);
copy_page(dst, buffer);
kunmap_atomic(dst);
} else {
- safe_copy_page(page_address(d_page), s_page);
+ zeros_only = safe_copy_page(page_address(d_page), s_page);
}
}
+ return zeros_only;
}
#else
#define page_is_saveable(zone, pfn) saveable_page(zone, pfn)
-static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+static inline int copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
{
- safe_copy_page(page_address(pfn_to_page(dst_pfn)),
+ return safe_copy_page(page_address(pfn_to_page(dst_pfn)),
pfn_to_page(src_pfn));
}
#endif /* CONFIG_HIGHMEM */
-static void copy_data_pages(struct memory_bitmap *copy_bm,
- struct memory_bitmap *orig_bm)
+/*
+ * Copy data pages will copy all pages into pages pulled from the copy_bm.
+ * If a page was entirely filled with zeros it will be marked in the zero_bm.
+ *
+ * Returns the number of pages copied.
+ */
+static unsigned long copy_data_pages(struct memory_bitmap *copy_bm,
+ struct memory_bitmap *orig_bm,
+ struct memory_bitmap *zero_bm)
{
+ unsigned long copied_pages = 0;
struct zone *zone;
- unsigned long pfn;
+ unsigned long pfn, copy_pfn;
for_each_populated_zone(zone) {
unsigned long max_zone_pfn;
@@ -1462,18 +1543,29 @@ static void copy_data_pages(struct memory_bitmap *copy_bm,
}
memory_bm_position_reset(orig_bm);
memory_bm_position_reset(copy_bm);
+ copy_pfn = memory_bm_next_pfn(copy_bm);
for(;;) {
pfn = memory_bm_next_pfn(orig_bm);
if (unlikely(pfn == BM_END_OF_MAP))
break;
- copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
+ if (copy_data_page(copy_pfn, pfn)) {
+ memory_bm_set_bit(zero_bm, pfn);
+ /* Use this copy_pfn for a page that is not full of zeros */
+ continue;
+ }
+ copied_pages++;
+ copy_pfn = memory_bm_next_pfn(copy_bm);
}
+ return copied_pages;
}
/* Total number of image pages */
static unsigned int nr_copy_pages;
/* Number of pages needed for saving the original pfns of the image pages */
static unsigned int nr_meta_pages;
+/* Number of zero pages */
+static unsigned int nr_zero_pages;
+
/*
* Numbers of normal and highmem page frames allocated for hibernation image
* before suspending devices.
@@ -1494,6 +1586,9 @@ static struct memory_bitmap orig_bm;
*/
static struct memory_bitmap copy_bm;
+/* Memory bitmap which tracks which saveable pages were zero filled. */
+static struct memory_bitmap zero_bm;
+
/**
* swsusp_free - Free pages allocated for hibernation image.
*
@@ -1538,6 +1633,7 @@ loop:
out:
nr_copy_pages = 0;
nr_meta_pages = 0;
+ nr_zero_pages = 0;
restore_pblist = NULL;
buffer = NULL;
alloc_normal = 0;
@@ -1756,8 +1852,15 @@ int hibernate_preallocate_memory(void)
goto err_out;
}
+ error = memory_bm_create(&zero_bm, GFP_IMAGE, PG_ANY);
+ if (error) {
+ pr_err("Cannot allocate zero bitmap\n");
+ goto err_out;
+ }
+
alloc_normal = 0;
alloc_highmem = 0;
+ nr_zero_pages = 0;
/* Count the number of saveable data pages. */
save_highmem = count_highmem_pages();
@@ -2037,19 +2140,19 @@ asmlinkage __visible int swsusp_save(void)
* Kill them.
*/
drain_local_pages(NULL);
- copy_data_pages(&copy_bm, &orig_bm);
+ nr_copy_pages = copy_data_pages(&copy_bm, &orig_bm, &zero_bm);
/*
* End of critical section. From now on, we can write to memory,
* but we should not touch disk. This specially means we must _not_
* touch swap space! Except we must write out our image of course.
*/
-
nr_pages += nr_highmem;
- nr_copy_pages = nr_pages;
+ /* We don't actually copy the zero pages */
+ nr_zero_pages = nr_pages - nr_copy_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
- pr_info("Image created (%d pages copied)\n", nr_pages);
+ pr_info("Image created (%d pages copied, %d zero pages)\n", nr_copy_pages, nr_zero_pages);
return 0;
}
@@ -2094,15 +2197,22 @@ static int init_header(struct swsusp_info *info)
return init_header_complete(info);
}
+#define ENCODED_PFN_ZERO_FLAG ((unsigned long)1 << (BITS_PER_LONG - 1))
+#define ENCODED_PFN_MASK (~ENCODED_PFN_ZERO_FLAG)
+
/**
* pack_pfns - Prepare PFNs for saving.
* @bm: Memory bitmap.
* @buf: Memory buffer to store the PFNs in.
+ * @zero_bm: Memory bitmap containing PFNs of zero pages.
*
* PFNs corresponding to set bits in @bm are stored in the area of memory
- * pointed to by @buf (1 page at a time).
+ * pointed to by @buf (1 page at a time). Pages which were filled with only
+ * zeros will have the highest bit set in the packed format to distinguish
+ * them from PFNs which will be contained in the image file.
*/
-static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
+static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm,
+ struct memory_bitmap *zero_bm)
{
int j;
@@ -2110,6 +2220,8 @@ static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
buf[j] = memory_bm_next_pfn(bm);
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
+ if (memory_bm_test_bit(zero_bm, buf[j]))
+ buf[j] |= ENCODED_PFN_ZERO_FLAG;
}
}
@@ -2151,7 +2263,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
memory_bm_position_reset(&copy_bm);
} else if (handle->cur <= nr_meta_pages) {
clear_page(buffer);
- pack_pfns(buffer, &orig_bm);
+ pack_pfns(buffer, &orig_bm, &zero_bm);
} else {
struct page *page;
@@ -2247,24 +2359,35 @@ static int load_header(struct swsusp_info *info)
* unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap.
* @bm: Memory bitmap.
* @buf: Area of memory containing the PFNs.
+ * @zero_bm: Memory bitmap with the zero PFNs marked.
*
* For each element of the array pointed to by @buf (1 page at a time), set the
- * corresponding bit in @bm.
+ * corresponding bit in @bm. If the page was originally populated with only
+ * zeros then a corresponding bit will also be set in @zero_bm.
*/
-static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
+static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm,
+ struct memory_bitmap *zero_bm)
{
+ unsigned long decoded_pfn;
+ bool zero;
int j;
for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
- if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) {
- memory_bm_set_bit(bm, buf[j]);
+ zero = !!(buf[j] & ENCODED_PFN_ZERO_FLAG);
+ decoded_pfn = buf[j] & ENCODED_PFN_MASK;
+ if (pfn_valid(decoded_pfn) && memory_bm_pfn_present(bm, decoded_pfn)) {
+ memory_bm_set_bit(bm, decoded_pfn);
+ if (zero) {
+ memory_bm_set_bit(zero_bm, decoded_pfn);
+ nr_zero_pages++;
+ }
} else {
- if (!pfn_valid(buf[j]))
+ if (!pfn_valid(decoded_pfn))
pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n",
- (unsigned long long)PFN_PHYS(buf[j]));
+ (unsigned long long)PFN_PHYS(decoded_pfn));
return -EFAULT;
}
}
@@ -2486,6 +2609,7 @@ static inline void free_highmem_data(void) {}
* prepare_image - Make room for loading hibernation image.
* @new_bm: Uninitialized memory bitmap structure.
* @bm: Memory bitmap with unsafe pages marked.
+ * @zero_bm: Memory bitmap containing the zero pages.
*
* Use @bm to mark the pages that will be overwritten in the process of
* restoring the system memory state from the suspend image ("unsafe" pages)
@@ -2496,10 +2620,15 @@ static inline void free_highmem_data(void) {}
* pages will be used for just yet. Instead, we mark them all as allocated and
* create a lists of "safe" pages to be used later. On systems with high
* memory a list of "safe" highmem pages is created too.
+ *
+ * Because it was not known which pages were unsafe when @zero_bm was created,
+ * make a copy of it and recreate it within safe pages.
*/
-static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
+static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm,
+ struct memory_bitmap *zero_bm)
{
unsigned int nr_pages, nr_highmem;
+ struct memory_bitmap tmp;
struct linked_page *lp;
int error;
@@ -2516,6 +2645,24 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
duplicate_memory_bitmap(new_bm, bm);
memory_bm_free(bm, PG_UNSAFE_KEEP);
+
+ /* Make a copy of zero_bm so it can be created in safe pages */
+ error = memory_bm_create(&tmp, GFP_ATOMIC, PG_ANY);
+ if (error)
+ goto Free;
+
+ duplicate_memory_bitmap(&tmp, zero_bm);
+ memory_bm_free(zero_bm, PG_UNSAFE_KEEP);
+
+ /* Recreate zero_bm in safe pages */
+ error = memory_bm_create(zero_bm, GFP_ATOMIC, PG_SAFE);
+ if (error)
+ goto Free;
+
+ duplicate_memory_bitmap(zero_bm, &tmp);
+ memory_bm_free(&tmp, PG_UNSAFE_KEEP);
+ /* At this point zero_bm is in safe pages and it can be used for restoring. */
+
if (nr_highmem > 0) {
error = prepare_highmem_image(bm, &nr_highmem);
if (error)
@@ -2530,7 +2677,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
*
* nr_copy_pages cannot be less than allocated_unsafe_pages too.
*/
- nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
+ nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages;
nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
while (nr_pages > 0) {
lp = get_image_page(GFP_ATOMIC, PG_SAFE);
@@ -2543,7 +2690,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
nr_pages--;
}
/* Preallocate memory for the image */
- nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
+ nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages;
while (nr_pages > 0) {
lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
if (!lp) {
@@ -2631,8 +2778,9 @@ int snapshot_write_next(struct snapshot_handle *handle)
static struct chain_allocator ca;
int error = 0;
+next:
/* Check if we have already loaded the entire image */
- if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
+ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages)
return 0;
handle->sync_read = 1;
@@ -2657,19 +2805,26 @@ int snapshot_write_next(struct snapshot_handle *handle)
if (error)
return error;
+ error = memory_bm_create(&zero_bm, GFP_ATOMIC, PG_ANY);
+ if (error)
+ return error;
+
+ nr_zero_pages = 0;
+
hibernate_restore_protection_begin();
} else if (handle->cur <= nr_meta_pages + 1) {
- error = unpack_orig_pfns(buffer, &copy_bm);
+ error = unpack_orig_pfns(buffer, &copy_bm, &zero_bm);
if (error)
return error;
if (handle->cur == nr_meta_pages + 1) {
- error = prepare_image(&orig_bm, &copy_bm);
+ error = prepare_image(&orig_bm, &copy_bm, &zero_bm);
if (error)
return error;
chain_init(&ca, GFP_ATOMIC, PG_SAFE);
memory_bm_position_reset(&orig_bm);
+ memory_bm_position_reset(&zero_bm);
restore_pblist = NULL;
handle->buffer = get_buffer(&orig_bm, &ca);
handle->sync_read = 0;
@@ -2686,6 +2841,14 @@ int snapshot_write_next(struct snapshot_handle *handle)
handle->sync_read = 0;
}
handle->cur++;
+
+ /* Zero pages were not included in the image, memset it and move on. */
+ if (handle->cur > nr_meta_pages + 1 &&
+ memory_bm_test_bit(&zero_bm, memory_bm_get_current(&orig_bm))) {
+ memset(handle->buffer, 0, PAGE_SIZE);
+ goto next;
+ }
+
return PAGE_SIZE;
}
@@ -2702,7 +2865,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
copy_last_highmem_page();
hibernate_restore_protect_page(handle->buffer);
/* Do that only if we have loaded the image entirely */
- if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
+ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) {
memory_bm_recycle(&orig_bm);
free_highmem_data();
}
@@ -2711,7 +2874,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
int snapshot_image_loaded(struct snapshot_handle *handle)
{
return !(!nr_copy_pages || !last_highmem_page_copied() ||
- handle->cur <= nr_meta_pages + nr_copy_pages);
+ handle->cur <= nr_meta_pages + nr_copy_pages + nr_zero_pages);
}
#ifdef CONFIG_HIGHMEM
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 92e41ed292ad..f6ebcd00c410 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -356,14 +356,14 @@ static int swsusp_swap_check(void)
return res;
root_swap = res;
- hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE,
- NULL);
+ hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
+ BLK_OPEN_WRITE, NULL, NULL);
if (IS_ERR(hib_resume_bdev))
return PTR_ERR(hib_resume_bdev);
res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
if (res < 0)
- blkdev_put(hib_resume_bdev, FMODE_WRITE);
+ blkdev_put(hib_resume_bdev, NULL);
return res;
}
@@ -443,7 +443,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
err_rel:
release_swap_writer(handle);
err_close:
- swsusp_close(FMODE_WRITE);
+ swsusp_close(false);
return ret;
}
@@ -508,7 +508,7 @@ static int swap_writer_finish(struct swap_map_handle *handle,
if (error)
free_all_swap_pages(root_swap);
release_swap_writer(handle);
- swsusp_close(FMODE_WRITE);
+ swsusp_close(false);
return error;
}
@@ -1510,21 +1510,19 @@ end:
return error;
}
+static void *swsusp_holder;
+
/**
* swsusp_check - Check for swsusp signature in the resume device
*/
-int swsusp_check(void)
+int swsusp_check(bool snapshot_test)
{
+ void *holder = snapshot_test ? &swsusp_holder : NULL;
int error;
- void *holder;
- fmode_t mode = FMODE_READ;
- if (snapshot_test)
- mode |= FMODE_EXCL;
-
- hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
- mode, &holder);
+ hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ,
+ holder, NULL);
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
@@ -1551,7 +1549,7 @@ int swsusp_check(void)
put:
if (error)
- blkdev_put(hib_resume_bdev, mode);
+ blkdev_put(hib_resume_bdev, holder);
else
pr_debug("Image signature found, resuming\n");
} else {
@@ -1568,14 +1566,14 @@ put:
* swsusp_close - close swap device.
*/
-void swsusp_close(fmode_t mode)
+void swsusp_close(bool snapshot_test)
{
if (IS_ERR(hib_resume_bdev)) {
pr_debug("Image device not initialised\n");
return;
}
- blkdev_put(hib_resume_bdev, mode);
+ blkdev_put(hib_resume_bdev, snapshot_test ? &swsusp_holder : NULL);
}
/**
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 2a17704136f1..7d4979d5c3ce 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -103,3 +103,5 @@ struct printk_message {
u64 seq;
unsigned long dropped;
};
+
+bool other_cpu_in_panic(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 6a333adce3b3..96fc38cb2e84 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -88,7 +88,7 @@ EXPORT_SYMBOL(oops_in_progress);
static DEFINE_MUTEX(console_mutex);
/*
- * console_sem protects updates to console->seq and console_suspended,
+ * console_sem protects updates to console->seq
* and also provides serialization for console printing.
*/
static DEFINE_SEMAPHORE(console_sem, 1);
@@ -361,7 +361,7 @@ static bool panic_in_progress(void)
* paths in the console code where we end up in places I want
* locked without the console semaphore held).
*/
-static int console_locked, console_suspended;
+static int console_locked;
/*
* Array of consoles built from command line options (console=)
@@ -528,7 +528,7 @@ static u64 latched_seq_read_nolock(struct latched_seq *ls)
seq = raw_read_seqcount_latch(&ls->latch);
idx = seq & 0x1;
val = ls->val[idx];
- } while (read_seqcount_latch_retry(&ls->latch, seq));
+ } while (raw_read_seqcount_latch_retry(&ls->latch, seq));
return val;
}
@@ -538,12 +538,14 @@ char *log_buf_addr_get(void)
{
return log_buf;
}
+EXPORT_SYMBOL_GPL(log_buf_addr_get);
/* Return log buffer size */
u32 log_buf_len_get(void)
{
return log_buf_len;
}
+EXPORT_SYMBOL_GPL(log_buf_len_get);
/*
* Define how much of the log buffer we could take at maximum. The value
@@ -2308,7 +2310,11 @@ asmlinkage int vprintk_emit(int facility, int level,
preempt_enable();
}
- wake_up_klogd();
+ if (in_sched)
+ defer_console_output();
+ else
+ wake_up_klogd();
+
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
@@ -2547,22 +2553,46 @@ MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to hig
*/
void suspend_console(void)
{
+ struct console *con;
+
if (!console_suspend_enabled)
return;
pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
pr_flush(1000, true);
- console_lock();
- console_suspended = 1;
- up_console_sem();
+
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags | CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see that they are suspended so that it
+ * is guaranteed that all printing has stopped when this function
+ * completes.
+ */
+ synchronize_srcu(&console_srcu);
}
void resume_console(void)
{
+ struct console *con;
+
if (!console_suspend_enabled)
return;
- down_console_sem();
- console_suspended = 0;
- console_unlock();
+
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see they are no longer suspended so
+ * that they are guaranteed to wake up and resume printing.
+ */
+ synchronize_srcu(&console_srcu);
+
pr_flush(1000, true);
}
@@ -2585,6 +2615,26 @@ static int console_cpu_notify(unsigned int cpu)
return 0;
}
+/*
+ * Return true if a panic is in progress on a remote CPU.
+ *
+ * On true, the local CPU should immediately release any printing resources
+ * that may be needed by the panic CPU.
+ */
+bool other_cpu_in_panic(void)
+{
+ if (!panic_in_progress())
+ return false;
+
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return atomic_read(&panic_cpu) != raw_smp_processor_id();
+}
+
/**
* console_lock - block the console subsystem from printing
*
@@ -2597,9 +2647,11 @@ void console_lock(void)
{
might_sleep();
+ /* On panic, the console_lock must be left to the panic cpu. */
+ while (other_cpu_in_panic())
+ msleep(1000);
+
down_console_sem();
- if (console_suspended)
- return;
console_locked = 1;
console_may_schedule = 1;
}
@@ -2615,12 +2667,11 @@ EXPORT_SYMBOL(console_lock);
*/
int console_trylock(void)
{
- if (down_trylock_console_sem())
+ /* On panic, the console_lock must be left to the panic cpu. */
+ if (other_cpu_in_panic())
return 0;
- if (console_suspended) {
- up_console_sem();
+ if (down_trylock_console_sem())
return 0;
- }
console_locked = 1;
console_may_schedule = 0;
return 1;
@@ -2634,25 +2685,6 @@ int is_console_locked(void)
EXPORT_SYMBOL(is_console_locked);
/*
- * Return true when this CPU should unlock console_sem without pushing all
- * messages to the console. This reduces the chance that the console is
- * locked when the panic CPU tries to use it.
- */
-static bool abandon_console_lock_in_panic(void)
-{
- if (!panic_in_progress())
- return false;
-
- /*
- * We can use raw_smp_processor_id() here because it is impossible for
- * the task to be migrated to the panic_cpu, or away from it. If
- * panic_cpu has already been set, and we're not currently executing on
- * that CPU, then we never will be.
- */
- return atomic_read(&panic_cpu) != raw_smp_processor_id();
-}
-
-/*
* Check if the given console is currently capable and allowed to print
* records.
*
@@ -2665,6 +2697,9 @@ static inline bool console_is_usable(struct console *con)
if (!(flags & CON_ENABLED))
return false;
+ if ((flags & CON_SUSPENDED))
+ return false;
+
if (!con->write)
return false;
@@ -2948,7 +2983,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
any_progress = true;
/* Allow panic_cpu to take over the consoles safely. */
- if (abandon_console_lock_in_panic())
+ if (other_cpu_in_panic())
goto abandon;
if (do_cond_resched)
@@ -2983,11 +3018,6 @@ void console_unlock(void)
bool flushed;
u64 next_seq;
- if (console_suspended) {
- up_console_sem();
- return;
- }
-
/*
* Console drivers are called with interrupts disabled, so
* @console_may_schedule should be cleared before; however, we may
@@ -3045,10 +3075,28 @@ EXPORT_SYMBOL(console_conditional_schedule);
void console_unblank(void)
{
+ bool found_unblank = false;
struct console *c;
int cookie;
/*
+ * First check if there are any consoles implementing the unblank()
+ * callback. If not, there is no reason to continue and take the
+ * console lock, which in particular can be dangerous if
+ * @oops_in_progress is set.
+ */
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) {
+ found_unblank = true;
+ break;
+ }
+ }
+ console_srcu_read_unlock(cookie);
+ if (!found_unblank)
+ return;
+
+ /*
* Stop console printing because the unblank() callback may
* assume the console is not within its write() callback.
*
@@ -3056,6 +3104,16 @@ void console_unblank(void)
* In that case, attempt a trylock as best-effort.
*/
if (oops_in_progress) {
+ /* Semaphores are not NMI-safe. */
+ if (in_nmi())
+ return;
+
+ /*
+ * Attempting to trylock the console lock can deadlock
+ * if another CPU was stopped while modifying the
+ * semaphore. "Hope and pray" that this is not the
+ * current situation.
+ */
if (down_trylock_console_sem() != 0)
return;
} else
@@ -3085,14 +3143,24 @@ void console_unblank(void)
*/
void console_flush_on_panic(enum con_flush_mode mode)
{
+ bool handover;
+ u64 next_seq;
+
/*
- * If someone else is holding the console lock, trylock will fail
- * and may_schedule may be set. Ignore and proceed to unlock so
- * that messages are flushed out. As this can be called from any
- * context and we don't want to get preempted while flushing,
- * ensure may_schedule is cleared.
+ * Ignore the console lock and flush out the messages. Attempting a
+ * trylock would not be useful because:
+ *
+ * - if it is contended, it must be ignored anyway
+ * - console_lock() and console_trylock() block and fail
+ * respectively in panic for non-panic CPUs
+ * - semaphores are not NMI-safe
+ */
+
+ /*
+ * If another context is holding the console lock,
+ * @console_may_schedule might be set. Clear it so that
+ * this context does not call cond_resched() while flushing.
*/
- console_trylock();
console_may_schedule = 0;
if (mode == CONSOLE_REPLAY_ALL) {
@@ -3105,15 +3173,15 @@ void console_flush_on_panic(enum con_flush_mode mode)
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
/*
- * If the above console_trylock() failed, this is an
- * unsynchronized assignment. But in that case, the
+ * This is an unsynchronized assignment, but the
* kernel is in "hope and pray" mode anyway.
*/
c->seq = seq;
}
console_srcu_read_unlock(cookie);
}
- console_unlock();
+
+ console_flush_all(false, &next_seq, &handover);
}
/*
@@ -3679,8 +3747,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
/*
* Hold the console_lock to guarantee safe access to
- * console->seq and to prevent changes to @console_suspended
- * until all consoles have been processed.
+ * console->seq.
*/
console_lock();
@@ -3688,6 +3755,11 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
for_each_console_srcu(c) {
if (con && con != c)
continue;
+ /*
+ * If consoles are not usable, it cannot be expected
+ * that they make forward progress, so only increment
+ * @diff for usable consoles.
+ */
if (!console_is_usable(c))
continue;
printk_seq = c->seq;
@@ -3696,18 +3768,12 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
}
console_srcu_read_unlock(cookie);
- /*
- * If consoles are suspended, it cannot be expected that they
- * make forward progress, so timeout immediately. @diff is
- * still used to return a valid flush status.
- */
- if (console_suspended)
- remaining = 0;
- else if (diff != last_diff && reset_on_progress)
+ if (diff != last_diff && reset_on_progress)
remaining = timeout_ms;
console_unlock();
+ /* Note: @diff is 0 if there are no usable consoles. */
if (diff == 0 || remaining == 0)
break;
@@ -3741,7 +3807,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
* printer has been seen to make some forward progress.
*
* Context: Process context. May sleep while acquiring console lock.
- * Return: true if all enabled printers are caught up.
+ * Return: true if all usable printers are caught up.
*/
static bool pr_flush(int timeout_ms, bool reset_on_progress)
{
@@ -3798,11 +3864,33 @@ static void __wake_up_klogd(int val)
preempt_enable();
}
+/**
+ * wake_up_klogd - Wake kernel logging daemon
+ *
+ * Use this function when new records have been added to the ringbuffer
+ * and the console printing of those records has already occurred or is
+ * known to be handled by some other context. This function will only
+ * wake the logging daemon.
+ *
+ * Context: Any context.
+ */
void wake_up_klogd(void)
{
__wake_up_klogd(PRINTK_PENDING_WAKEUP);
}
+/**
+ * defer_console_output - Wake kernel logging daemon and trigger
+ * console printing in a deferred context
+ *
+ * Use this function when new records have been added to the ringbuffer,
+ * this context is responsible for console printing those records, but
+ * the current context is not allowed to perform the console printing.
+ * Trigger an irq_work context to perform the console printing. This
+ * function also wakes the logging daemon.
+ *
+ * Context: Any context.
+ */
void defer_console_output(void)
{
/*
@@ -3819,12 +3907,7 @@ void printk_trigger_flush(void)
int vprintk_deferred(const char *fmt, va_list args)
{
- int r;
-
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
- defer_console_output();
-
- return r;
+ return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
}
int _printk_deferred(const char *fmt, ...)
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index 2dc4d5a1f1ff..fde338606ce8 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -1735,7 +1735,7 @@ static bool copy_data(struct prb_data_ring *data_ring,
if (!buf || !buf_size)
return true;
- data_size = min_t(u16, buf_size, len);
+ data_size = min_t(unsigned int, buf_size, len);
memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */
return true;
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index ef0f9a2044da..6d10927a07d8 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -38,13 +38,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* Use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
*/
- if (this_cpu_read(printk_context) || in_nmi()) {
- int len;
-
- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
- defer_console_output();
- return len;
- }
+ if (this_cpu_read(printk_context) || in_nmi())
+ return vprintk_deferred(fmt, args);
/* No obstacles. */
return vprintk_default(fmt, args);
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 9071182b1284..bdd7eadb33d8 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -314,4 +314,22 @@ config RCU_LAZY
To save power, batch RCU callbacks and flush after delay, memory
pressure, or callback list growing too big.
+config RCU_DOUBLE_CHECK_CB_TIME
+ bool "RCU callback-batch backup time check"
+ depends on RCU_EXPERT
+ default n
+ help
+ Use this option to provide more precise enforcement of the
+ rcutree.rcu_resched_ns module parameter in situations where
+ a single RCU callback might run for hundreds of microseconds,
+ thus defeating the 32-callback batching used to amortize the
+ cost of the fine-grained but expensive local_clock() function.
+
+ This option rounds rcutree.rcu_resched_ns up to the next
+ jiffy, and overrides the 32-callback batching if this limit
+ is exceeded.
+
+ Say Y here if you need tighter callback-limit enforcement.
+ Say N here if you are unsure.
+
endmenu # "RCU Subsystem"
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 4a1b9622598b..98e13be411af 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -493,7 +493,6 @@ static inline void rcu_expedite_gp(void) { }
static inline void rcu_unexpedite_gp(void) { }
static inline void rcu_async_hurry(void) { }
static inline void rcu_async_relax(void) { }
-static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
@@ -508,9 +507,16 @@ void show_rcu_tasks_gp_kthreads(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void show_rcu_tasks_gp_kthreads(void) {}
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
-void rcu_request_urgent_qs_task(struct task_struct *t);
#endif /* #else #ifdef CONFIG_TINY_RCU */
+#ifdef CONFIG_TASKS_RCU
+struct task_struct *get_rcu_tasks_gp_kthread(void);
+#endif // # ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
+#endif // # ifdef CONFIG_TASKS_RUDE_RCU
+
#define RCU_SCHEDULER_INACTIVE 0
#define RCU_SCHEDULER_INIT 1
#define RCU_SCHEDULER_RUNNING 2
@@ -642,4 +648,10 @@ void show_rcu_tasks_trace_gp_kthread(void);
static inline void show_rcu_tasks_trace_gp_kthread(void) {}
#endif
+#ifdef CONFIG_TINY_RCU
+static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
+#else
+bool rcu_cpu_beenfullyonline(int cpu);
+#endif
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index e82ec9f9a5d8..ffdb30495e3c 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -84,15 +84,17 @@ MODULE_AUTHOR("Paul E. McKenney <[email protected]>");
#endif
torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
-torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
+torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
+torture_param(int, minruntime, 0, "Minimum run time (s)");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
torture_param(bool, shutdown, RCUSCALE_SHUTDOWN,
"Shutdown at end of scalability tests.");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
+torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable");
torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?");
torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate.");
torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?");
@@ -139,6 +141,7 @@ struct rcu_scale_ops {
void (*gp_barrier)(void);
void (*sync)(void);
void (*exp_sync)(void);
+ struct task_struct *(*rso_gp_kthread)(void);
const char *name;
};
@@ -295,6 +298,7 @@ static struct rcu_scale_ops tasks_ops = {
.gp_barrier = rcu_barrier_tasks,
.sync = synchronize_rcu_tasks,
.exp_sync = synchronize_rcu_tasks,
+ .rso_gp_kthread = get_rcu_tasks_gp_kthread,
.name = "tasks"
};
@@ -306,6 +310,44 @@ static struct rcu_scale_ops tasks_ops = {
#endif // #else // #ifdef CONFIG_TASKS_RCU
+#ifdef CONFIG_TASKS_RUDE_RCU
+
+/*
+ * Definitions for RCU-tasks-rude scalability testing.
+ */
+
+static int tasks_rude_scale_read_lock(void)
+{
+ return 0;
+}
+
+static void tasks_rude_scale_read_unlock(int idx)
+{
+}
+
+static struct rcu_scale_ops tasks_rude_ops = {
+ .ptype = RCU_TASKS_RUDE_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = tasks_rude_scale_read_lock,
+ .readunlock = tasks_rude_scale_read_unlock,
+ .get_gp_seq = rcu_no_completed,
+ .gp_diff = rcu_seq_diff,
+ .async = call_rcu_tasks_rude,
+ .gp_barrier = rcu_barrier_tasks_rude,
+ .sync = synchronize_rcu_tasks_rude,
+ .exp_sync = synchronize_rcu_tasks_rude,
+ .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread,
+ .name = "tasks-rude"
+};
+
+#define TASKS_RUDE_OPS &tasks_rude_ops,
+
+#else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#define TASKS_RUDE_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RUDE_RCU
+
#ifdef CONFIG_TASKS_TRACE_RCU
/*
@@ -334,6 +376,7 @@ static struct rcu_scale_ops tasks_tracing_ops = {
.gp_barrier = rcu_barrier_tasks_trace,
.sync = synchronize_rcu_tasks_trace,
.exp_sync = synchronize_rcu_tasks_trace,
+ .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread,
.name = "tasks-tracing"
};
@@ -410,10 +453,12 @@ rcu_scale_writer(void *arg)
{
int i = 0;
int i_max;
+ unsigned long jdone;
long me = (long)arg;
struct rcu_head *rhp = NULL;
bool started = false, done = false, alldone = false;
u64 t;
+ DEFINE_TORTURE_RANDOM(tr);
u64 *wdp;
u64 *wdpp = writer_durations[me];
@@ -424,7 +469,7 @@ rcu_scale_writer(void *arg)
sched_set_fifo_low(current);
if (holdoff)
- schedule_timeout_uninterruptible(holdoff * HZ);
+ schedule_timeout_idle(holdoff * HZ);
/*
* Wait until rcu_end_inkernel_boot() is called for normal GP tests
@@ -445,9 +490,12 @@ rcu_scale_writer(void *arg)
}
}
+ jdone = jiffies + minruntime * HZ;
do {
if (writer_holdoff)
udelay(writer_holdoff);
+ if (writer_holdoff_jiffies)
+ schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1);
wdp = &wdpp[i];
*wdp = ktime_get_mono_fast_ns();
if (gp_async) {
@@ -475,7 +523,7 @@ retry:
if (!started &&
atomic_read(&n_rcu_scale_writer_started) >= nrealwriters)
started = true;
- if (!done && i >= MIN_MEAS) {
+ if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) {
done = true;
sched_set_normal(current, 0);
pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n",
@@ -518,91 +566,8 @@ static void
rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
- "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n",
- scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown);
-}
-
-static void
-rcu_scale_cleanup(void)
-{
- int i;
- int j;
- int ngps = 0;
- u64 *wdp;
- u64 *wdpp;
-
- /*
- * Would like warning at start, but everything is expedited
- * during the mid-boot phase, so have to wait till the end.
- */
- if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
- SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
- if (rcu_gp_is_normal() && gp_exp)
- SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
- if (gp_exp && gp_async)
- SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
-
- if (torture_cleanup_begin())
- return;
- if (!cur_ops) {
- torture_cleanup_end();
- return;
- }
-
- if (reader_tasks) {
- for (i = 0; i < nrealreaders; i++)
- torture_stop_kthread(rcu_scale_reader,
- reader_tasks[i]);
- kfree(reader_tasks);
- }
-
- if (writer_tasks) {
- for (i = 0; i < nrealwriters; i++) {
- torture_stop_kthread(rcu_scale_writer,
- writer_tasks[i]);
- if (!writer_n_durations)
- continue;
- j = writer_n_durations[i];
- pr_alert("%s%s writer %d gps: %d\n",
- scale_type, SCALE_FLAG, i, j);
- ngps += j;
- }
- pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
- scale_type, SCALE_FLAG,
- t_rcu_scale_writer_started, t_rcu_scale_writer_finished,
- t_rcu_scale_writer_finished -
- t_rcu_scale_writer_started,
- ngps,
- rcuscale_seq_diff(b_rcu_gp_test_finished,
- b_rcu_gp_test_started));
- for (i = 0; i < nrealwriters; i++) {
- if (!writer_durations)
- break;
- if (!writer_n_durations)
- continue;
- wdpp = writer_durations[i];
- if (!wdpp)
- continue;
- for (j = 0; j < writer_n_durations[i]; j++) {
- wdp = &wdpp[j];
- pr_alert("%s%s %4d writer-duration: %5d %llu\n",
- scale_type, SCALE_FLAG,
- i, j, *wdp);
- if (j % 100 == 0)
- schedule_timeout_uninterruptible(1);
- }
- kfree(writer_durations[i]);
- }
- kfree(writer_tasks);
- kfree(writer_durations);
- kfree(writer_n_durations);
- }
-
- /* Do torture-type-specific cleanup operations. */
- if (cur_ops->cleanup != NULL)
- cur_ops->cleanup();
-
- torture_cleanup_end();
+ "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n",
+ scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown);
}
/*
@@ -625,20 +590,6 @@ static int compute_real(int n)
}
/*
- * RCU scalability shutdown kthread. Just waits to be awakened, then shuts
- * down system.
- */
-static int
-rcu_scale_shutdown(void *arg)
-{
- wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
- smp_mb(); /* Wake before output. */
- rcu_scale_cleanup();
- kernel_power_off();
- return -EINVAL;
-}
-
-/*
* kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number
* of iterations and measure total time and number of GP for all iterations to complete.
*/
@@ -653,6 +604,8 @@ static struct task_struct **kfree_reader_tasks;
static int kfree_nrealthreads;
static atomic_t n_kfree_scale_thread_started;
static atomic_t n_kfree_scale_thread_ended;
+static struct task_struct *kthread_tp;
+static u64 kthread_stime;
struct kfree_obj {
char kfree_obj[8];
@@ -798,6 +751,10 @@ kfree_scale_init(void)
unsigned long jif_start;
unsigned long orig_jif;
+ pr_alert("%s" SCALE_FLAG
+ "--- kfree_rcu_test: kfree_mult=%d kfree_by_call_rcu=%d kfree_nthreads=%d kfree_alloc_num=%d kfree_loops=%d kfree_rcu_test_double=%d kfree_rcu_test_single=%d\n",
+ scale_type, kfree_mult, kfree_by_call_rcu, kfree_nthreads, kfree_alloc_num, kfree_loops, kfree_rcu_test_double, kfree_rcu_test_single);
+
// Also, do a quick self-test to ensure laziness is as much as
// expected.
if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) {
@@ -874,13 +831,127 @@ unwind:
return firsterr;
}
+static void
+rcu_scale_cleanup(void)
+{
+ int i;
+ int j;
+ int ngps = 0;
+ u64 *wdp;
+ u64 *wdpp;
+
+ /*
+ * Would like warning at start, but everything is expedited
+ * during the mid-boot phase, so have to wait till the end.
+ */
+ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
+ SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ if (rcu_gp_is_normal() && gp_exp)
+ SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ if (gp_exp && gp_async)
+ SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
+
+ // If built-in, just report all of the GP kthread's CPU time.
+ if (IS_BUILTIN(CONFIG_RCU_SCALE_TEST) && !kthread_tp && cur_ops->rso_gp_kthread)
+ kthread_tp = cur_ops->rso_gp_kthread();
+ if (kthread_tp) {
+ u32 ns;
+ u64 us;
+
+ kthread_stime = kthread_tp->stime - kthread_stime;
+ us = div_u64_rem(kthread_stime, 1000, &ns);
+ pr_info("rcu_scale: Grace-period kthread CPU time: %llu.%03u us\n", us, ns);
+ show_rcu_gp_kthreads();
+ }
+ if (kfree_rcu_test) {
+ kfree_scale_cleanup();
+ return;
+ }
+
+ if (torture_cleanup_begin())
+ return;
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
+
+ if (reader_tasks) {
+ for (i = 0; i < nrealreaders; i++)
+ torture_stop_kthread(rcu_scale_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ }
+
+ if (writer_tasks) {
+ for (i = 0; i < nrealwriters; i++) {
+ torture_stop_kthread(rcu_scale_writer,
+ writer_tasks[i]);
+ if (!writer_n_durations)
+ continue;
+ j = writer_n_durations[i];
+ pr_alert("%s%s writer %d gps: %d\n",
+ scale_type, SCALE_FLAG, i, j);
+ ngps += j;
+ }
+ pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
+ scale_type, SCALE_FLAG,
+ t_rcu_scale_writer_started, t_rcu_scale_writer_finished,
+ t_rcu_scale_writer_finished -
+ t_rcu_scale_writer_started,
+ ngps,
+ rcuscale_seq_diff(b_rcu_gp_test_finished,
+ b_rcu_gp_test_started));
+ for (i = 0; i < nrealwriters; i++) {
+ if (!writer_durations)
+ break;
+ if (!writer_n_durations)
+ continue;
+ wdpp = writer_durations[i];
+ if (!wdpp)
+ continue;
+ for (j = 0; j < writer_n_durations[i]; j++) {
+ wdp = &wdpp[j];
+ pr_alert("%s%s %4d writer-duration: %5d %llu\n",
+ scale_type, SCALE_FLAG,
+ i, j, *wdp);
+ if (j % 100 == 0)
+ schedule_timeout_uninterruptible(1);
+ }
+ kfree(writer_durations[i]);
+ }
+ kfree(writer_tasks);
+ kfree(writer_durations);
+ kfree(writer_n_durations);
+ }
+
+ /* Do torture-type-specific cleanup operations. */
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
+
+ torture_cleanup_end();
+}
+
+/*
+ * RCU scalability shutdown kthread. Just waits to be awakened, then shuts
+ * down system.
+ */
+static int
+rcu_scale_shutdown(void *arg)
+{
+ wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
+ smp_mb(); /* Wake before output. */
+ rcu_scale_cleanup();
+ kernel_power_off();
+ return -EINVAL;
+}
+
static int __init
rcu_scale_init(void)
{
long i;
int firsterr = 0;
static struct rcu_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS
+ &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
};
if (!torture_init_begin(scale_type, verbose))
@@ -905,6 +976,11 @@ rcu_scale_init(void)
if (cur_ops->init)
cur_ops->init();
+ if (cur_ops->rso_gp_kthread) {
+ kthread_tp = cur_ops->rso_gp_kthread();
+ if (kthread_tp)
+ kthread_stime = kthread_tp->stime;
+ }
if (kfree_rcu_test)
return kfree_scale_init();
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 147551c23baf..ade42d6a9d9b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1581,6 +1581,7 @@ rcu_torture_writer(void *arg)
rcu_access_pointer(rcu_torture_current) !=
&rcu_tortures[i]) {
tracing_off();
+ show_rcu_gp_kthreads();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1876,7 +1877,7 @@ static int
rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
{
int mask = rcutorture_extend_mask_max();
- unsigned long randmask1 = torture_random(trsp) >> 8;
+ unsigned long randmask1 = torture_random(trsp);
unsigned long randmask2 = randmask1 >> 3;
unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
@@ -1935,7 +1936,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
if (!((mask - 1) & mask))
return rtrsp; /* Current RCU reader not extendable. */
/* Bias towards larger numbers of loops. */
- i = (torture_random(trsp) >> 3);
+ i = torture_random(trsp);
i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1;
for (j = 0; j < i; j++) {
mask = rcutorture_extend_mask(*readstate, trsp);
@@ -2136,7 +2137,7 @@ static int rcu_nocb_toggle(void *arg)
toggle_fuzz = NSEC_PER_USEC;
do {
r = torture_random(&rand);
- cpu = (r >> 4) % (maxcpu + 1);
+ cpu = (r >> 1) % (maxcpu + 1);
if (r & 0x1) {
rcu_nocb_cpu_offload(cpu);
atomic_long_inc(&n_nocb_offload);
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 1970ce5f22d4..91a0fd0d4d9a 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -528,6 +528,38 @@ static struct ref_scale_ops clock_ops = {
.name = "clock"
};
+static void ref_jiffies_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += jiffies;
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_jiffies_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += jiffies;
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static struct ref_scale_ops jiffies_ops = {
+ .readsection = ref_jiffies_section,
+ .delaysection = ref_jiffies_delay_section,
+ .name = "jiffies"
+};
+
////////////////////////////////////////////////////////////////////////
//
// Methods leveraging SLAB_TYPESAFE_BY_RCU.
@@ -1047,7 +1079,7 @@ ref_scale_init(void)
int firsterr = 0;
static struct ref_scale_ops *scale_ops[] = {
&rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
- &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops,
+ &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops,
&typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
};
@@ -1107,12 +1139,11 @@ ref_scale_init(void)
VERBOSE_SCALEOUT("Starting %d reader threads", nreaders);
for (i = 0; i < nreaders; i++) {
+ init_waitqueue_head(&reader_tasks[i].wq);
firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
reader_tasks[i].task);
if (torture_init_error(firsterr))
goto unwind;
-
- init_waitqueue_head(&(reader_tasks[i].wq));
}
// Main Task
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 5f4fc8184dd0..8d65f7d576a3 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -25,6 +25,8 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @cblist: Callback list.
* @lock: Lock protecting per-CPU callback list.
* @rtp_jiffies: Jiffies counter value for statistics.
+ * @lazy_timer: Timer to unlazify callbacks.
+ * @urgent_gp: Number of additional non-lazy grace periods.
* @rtp_n_lock_retries: Rough lock-contention statistic.
* @rtp_work: Work queue for invoking callbacks.
* @rtp_irq_work: IRQ work queue for deferred wakeups.
@@ -38,6 +40,8 @@ struct rcu_tasks_percpu {
raw_spinlock_t __private lock;
unsigned long rtp_jiffies;
unsigned long rtp_n_lock_retries;
+ struct timer_list lazy_timer;
+ unsigned int urgent_gp;
struct work_struct rtp_work;
struct irq_work rtp_irq_work;
struct rcu_head barrier_q_head;
@@ -51,7 +55,6 @@ struct rcu_tasks_percpu {
* @cbs_wait: RCU wait allowing a new callback to get kthread's attention.
* @cbs_gbl_lock: Lock protecting callback list.
* @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
- * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
* @gp_func: This flavor's grace-period-wait function.
* @gp_state: Grace period's most recent state transition (debugging).
* @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping.
@@ -61,6 +64,8 @@ struct rcu_tasks_percpu {
* @tasks_gp_seq: Number of grace periods completed since boot.
* @n_ipis: Number of IPIs sent to encourage grace periods to end.
* @n_ipis_fails: Number of IPI-send failures.
+ * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
+ * @lazy_jiffies: Number of jiffies to allow callbacks to be lazy.
* @pregp_func: This flavor's pre-grace-period function (optional).
* @pertask_func: This flavor's per-task scan function (optional).
* @postscan_func: This flavor's post-task scan function (optional).
@@ -92,6 +97,7 @@ struct rcu_tasks {
unsigned long n_ipis;
unsigned long n_ipis_fails;
struct task_struct *kthread_ptr;
+ unsigned long lazy_jiffies;
rcu_tasks_gp_func_t gp_func;
pregp_func_t pregp_func;
pertask_func_t pertask_func;
@@ -127,6 +133,7 @@ static struct rcu_tasks rt_name = \
.gp_func = gp, \
.call_func = call, \
.rtpcpu = &rt_name ## __percpu, \
+ .lazy_jiffies = DIV_ROUND_UP(HZ, 4), \
.name = n, \
.percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \
.percpu_enqueue_lim = 1, \
@@ -139,9 +146,7 @@ static struct rcu_tasks rt_name = \
#ifdef CONFIG_TASKS_RCU
/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
-#endif
-#ifdef CONFIG_TASKS_RCU
/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
@@ -171,6 +176,8 @@ static int rcu_task_contend_lim __read_mostly = 100;
module_param(rcu_task_contend_lim, int, 0444);
static int rcu_task_collapse_lim __read_mostly = 10;
module_param(rcu_task_collapse_lim, int, 0444);
+static int rcu_task_lazy_lim __read_mostly = 32;
+module_param(rcu_task_lazy_lim, int, 0444);
/* RCU tasks grace-period state for debugging. */
#define RTGS_INIT 0
@@ -229,7 +236,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
#endif /* #ifndef CONFIG_TINY_RCU */
// Initialize per-CPU callback lists for the specified flavor of
-// Tasks RCU.
+// Tasks RCU. Do not enqueue callbacks before this function is invoked.
static void cblist_init_generic(struct rcu_tasks *rtp)
{
int cpu;
@@ -237,11 +244,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
int lim;
int shift;
- raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
if (rcu_task_enqueue_lim < 0) {
rcu_task_enqueue_lim = 1;
rcu_task_cb_adjust = true;
- pr_info("%s: Setting adjustable number of callback queues.\n", __func__);
} else if (rcu_task_enqueue_lim == 0) {
rcu_task_enqueue_lim = 1;
}
@@ -261,18 +266,46 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
WARN_ON_ONCE(!rtpcp);
if (cpu)
raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock));
- raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ local_irq_save(flags); // serialize initialization
if (rcu_segcblist_empty(&rtpcp->cblist))
rcu_segcblist_init(&rtpcp->cblist);
+ local_irq_restore(flags);
INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
rtpcp->cpu = cpu;
rtpcp->rtpp = rtp;
if (!rtpcp->rtp_blkd_tasks.next)
INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
- raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
}
- raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
- pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim));
+
+ pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name,
+ data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust);
+}
+
+// Compute wakeup time for lazy callback timer.
+static unsigned long rcu_tasks_lazy_time(struct rcu_tasks *rtp)
+{
+ return jiffies + rtp->lazy_jiffies;
+}
+
+// Timer handler that unlazifies lazy callbacks.
+static void call_rcu_tasks_generic_timer(struct timer_list *tlp)
+{
+ unsigned long flags;
+ bool needwake = false;
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = from_timer(rtpcp, tlp, lazy_timer);
+
+ rtp = rtpcp->rtpp;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (!rcu_segcblist_empty(&rtpcp->cblist) && rtp->lazy_jiffies) {
+ if (!rtpcp->urgent_gp)
+ rtpcp->urgent_gp = 1;
+ needwake = true;
+ mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp));
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ if (needwake)
+ rcuwait_wake_up(&rtp->cbs_wait);
}
// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic().
@@ -291,6 +324,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
{
int chosen_cpu;
unsigned long flags;
+ bool havekthread = smp_load_acquire(&rtp->kthread_ptr);
int ideal_cpu;
unsigned long j;
bool needadjust = false;
@@ -315,12 +349,19 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids)
needadjust = true; // Defer adjustment to avoid deadlock.
}
- if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) {
- raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
- cblist_init_generic(rtp);
- raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ // Queuing callbacks before initialization not yet supported.
+ if (WARN_ON_ONCE(!rcu_segcblist_is_enabled(&rtpcp->cblist)))
+ rcu_segcblist_init(&rtpcp->cblist);
+ needwake = (func == wakeme_after_rcu) ||
+ (rcu_segcblist_n_cbs(&rtpcp->cblist) == rcu_task_lazy_lim);
+ if (havekthread && !needwake && !timer_pending(&rtpcp->lazy_timer)) {
+ if (rtp->lazy_jiffies)
+ mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp));
+ else
+ needwake = rcu_segcblist_empty(&rtpcp->cblist);
}
- needwake = rcu_segcblist_empty(&rtpcp->cblist);
+ if (needwake)
+ rtpcp->urgent_gp = 3;
rcu_segcblist_enqueue(&rtpcp->cblist, rhp);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
if (unlikely(needadjust)) {
@@ -414,9 +455,14 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
}
rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
(void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
- if (rcu_segcblist_pend_cbs(&rtpcp->cblist))
+ if (rtpcp->urgent_gp > 0 && rcu_segcblist_pend_cbs(&rtpcp->cblist)) {
+ if (rtp->lazy_jiffies)
+ rtpcp->urgent_gp--;
needgpcb |= 0x3;
- if (!rcu_segcblist_empty(&rtpcp->cblist))
+ } else if (rcu_segcblist_empty(&rtpcp->cblist)) {
+ rtpcp->urgent_gp = 0;
+ }
+ if (rcu_segcblist_ready_cbs(&rtpcp->cblist))
needgpcb |= 0x1;
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
}
@@ -463,6 +509,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
{
int cpu;
int cpunext;
+ int cpuwq;
unsigned long flags;
int len;
struct rcu_head *rhp;
@@ -473,11 +520,13 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
cpunext = cpu * 2 + 1;
if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
- queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work);
+ cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
+ queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
cpunext++;
if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
- queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work);
+ cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
+ queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
}
}
@@ -521,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
if (unlikely(midboot)) {
needgpcb = 0x2;
} else {
+ mutex_unlock(&rtp->tasks_gp_mutex);
set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
rcuwait_wait_event(&rtp->cbs_wait,
(needgpcb = rcu_tasks_need_gpcb(rtp)),
TASK_IDLE);
+ mutex_lock(&rtp->tasks_gp_mutex);
}
if (needgpcb & 0x2) {
@@ -545,11 +596,19 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
// RCU-tasks kthread that detects grace periods and invokes callbacks.
static int __noreturn rcu_tasks_kthread(void *arg)
{
+ int cpu;
struct rcu_tasks *rtp = arg;
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ timer_setup(&rtpcp->lazy_timer, call_rcu_tasks_generic_timer, 0);
+ rtpcp->urgent_gp = 1;
+ }
+
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
housekeeping_affine(current, HK_TYPE_RCU);
- WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start!
+ smp_store_release(&rtp->kthread_ptr, current); // Let GPs start!
/*
* Each pass through the following loop makes one check for
@@ -631,16 +690,22 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
int cpu;
bool havecbs = false;
+ bool haveurgent = false;
+ bool haveurgentcbs = false;
for_each_possible_cpu(cpu) {
struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
- if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) {
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)))
havecbs = true;
+ if (data_race(rtpcp->urgent_gp))
+ haveurgent = true;
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)) && data_race(rtpcp->urgent_gp))
+ haveurgentcbs = true;
+ if (havecbs && haveurgent && haveurgentcbs)
break;
- }
}
- pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n",
+ pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c%c%c l:%lu %s\n",
rtp->kname,
tasks_gp_state_getname(rtp), data_race(rtp->gp_state),
jiffies - data_race(rtp->gp_jiffies),
@@ -648,6 +713,9 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis),
".k"[!!data_race(rtp->kthread_ptr)],
".C"[havecbs],
+ ".u"[haveurgent],
+ ".U"[haveurgentcbs],
+ rtp->lazy_jiffies,
s);
}
#endif // #ifndef CONFIG_TINY_RCU
@@ -1016,11 +1084,16 @@ void rcu_barrier_tasks(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
+int rcu_tasks_lazy_ms = -1;
+module_param(rcu_tasks_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_kthread(void)
{
cblist_init_generic(&rcu_tasks);
rcu_tasks.gp_sleep = HZ / 10;
rcu_tasks.init_fract = HZ / 10;
+ if (rcu_tasks_lazy_ms >= 0)
+ rcu_tasks.lazy_jiffies = msecs_to_jiffies(rcu_tasks_lazy_ms);
rcu_tasks.pregp_func = rcu_tasks_pregp_step;
rcu_tasks.pertask_func = rcu_tasks_pertask;
rcu_tasks.postscan_func = rcu_tasks_postscan;
@@ -1038,6 +1111,12 @@ void show_rcu_tasks_classic_gp_kthread(void)
EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
+struct task_struct *get_rcu_tasks_gp_kthread(void)
+{
+ return rcu_tasks.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
+
/*
* Contribute to protect against tasklist scan blind spot while the
* task is exiting and may be removed from the tasklist. See
@@ -1169,10 +1248,15 @@ void rcu_barrier_tasks_rude(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude);
+int rcu_tasks_rude_lazy_ms = -1;
+module_param(rcu_tasks_rude_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_rude_kthread(void)
{
cblist_init_generic(&rcu_tasks_rude);
rcu_tasks_rude.gp_sleep = HZ / 10;
+ if (rcu_tasks_rude_lazy_ms >= 0)
+ rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms);
rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
return 0;
}
@@ -1184,6 +1268,13 @@ void show_rcu_tasks_rude_gp_kthread(void)
}
EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
+
+struct task_struct *get_rcu_tasks_rude_gp_kthread(void)
+{
+ return rcu_tasks_rude.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread);
+
#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
////////////////////////////////////////////////////////////////////////
@@ -1789,6 +1880,9 @@ void rcu_barrier_tasks_trace(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace);
+int rcu_tasks_trace_lazy_ms = -1;
+module_param(rcu_tasks_trace_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_trace_kthread(void)
{
cblist_init_generic(&rcu_tasks_trace);
@@ -1803,6 +1897,8 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
if (rcu_tasks_trace.init_fract <= 0)
rcu_tasks_trace.init_fract = 1;
}
+ if (rcu_tasks_trace_lazy_ms >= 0)
+ rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms);
rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step;
rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan;
rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace;
@@ -1826,6 +1922,12 @@ void show_rcu_tasks_trace_gp_kthread(void)
EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
+struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
+{
+ return rcu_tasks_trace.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread);
+
#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f52ff7241041..cb1caefa8bd0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -632,7 +632,7 @@ void __rcu_irq_enter_check_tick(void)
// prevents self-deadlock. So we can safely recheck under the lock.
// Note that the nohz_full state currently cannot change.
raw_spin_lock_rcu_node(rdp->mynode);
- if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
+ if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
// A nohz_full CPU is in the kernel and RCU needs a
// quiescent state. Turn on the tick!
WRITE_ONCE(rdp->rcu_forced_tick, true);
@@ -677,12 +677,16 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
}
/**
- * rcu_is_watching - see if RCU thinks that the current CPU is not idle
+ * rcu_is_watching - RCU read-side critical sections permitted on current CPU?
*
- * Return true if RCU is watching the running CPU, which means that this
- * CPU can safely enter RCU read-side critical sections. In other words,
- * if the current CPU is not in its idle loop or is in an interrupt or
- * NMI handler, return true.
+ * Return @true if RCU is watching the running CPU and @false otherwise.
+ * An @true return means that this CPU can safely enter RCU read-side
+ * critical sections.
+ *
+ * Although calls to rcu_is_watching() from most parts of the kernel
+ * will return @true, there are important exceptions. For example, if the
+ * current CPU is deep within its idle loop, in kernel entry/exit code,
+ * or offline, rcu_is_watching() will return @false.
*
* Make notrace because it can be called by the internal functions of
* ftrace, and making this notrace removes unnecessary recursion calls.
@@ -2046,19 +2050,35 @@ rcu_check_quiescent_state(struct rcu_data *rdp)
rcu_report_qs_rdp(rdp);
}
+/* Return true if callback-invocation time limit exceeded. */
+static bool rcu_do_batch_check_time(long count, long tlimit,
+ bool jlimit_check, unsigned long jlimit)
+{
+ // Invoke local_clock() only once per 32 consecutive callbacks.
+ return unlikely(tlimit) &&
+ (!likely(count & 31) ||
+ (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) &&
+ jlimit_check && time_after(jiffies, jlimit))) &&
+ local_clock() >= tlimit;
+}
+
/*
* Invoke any RCU callbacks that have made it to the end of their grace
* period. Throttle as specified by rdp->blimit.
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
+ long bl;
+ long count = 0;
int div;
bool __maybe_unused empty;
unsigned long flags;
- struct rcu_head *rhp;
+ unsigned long jlimit;
+ bool jlimit_check = false;
+ long pending;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
- long bl, count = 0;
- long pending, tlimit = 0;
+ struct rcu_head *rhp;
+ long tlimit = 0;
/* If no callbacks are ready, just return. */
if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
@@ -2082,11 +2102,15 @@ static void rcu_do_batch(struct rcu_data *rdp)
div = READ_ONCE(rcu_divisor);
div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
bl = max(rdp->blimit, pending >> div);
- if (in_serving_softirq() && unlikely(bl > 100)) {
+ if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) &&
+ (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) || unlikely(bl > 100))) {
+ const long npj = NSEC_PER_SEC / HZ;
long rrn = READ_ONCE(rcu_resched_ns);
rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
tlimit = local_clock() + rrn;
+ jlimit = jiffies + (rrn + npj + 1) / npj;
+ jlimit_check = true;
}
trace_rcu_batch_start(rcu_state.name,
rcu_segcblist_n_cbs(&rdp->cblist), bl);
@@ -2126,21 +2150,23 @@ static void rcu_do_batch(struct rcu_data *rdp)
* Make sure we don't spend too much time here and deprive other
* softirq vectors of CPU cycles.
*/
- if (unlikely(tlimit)) {
- /* only call local_clock() every 32 callbacks */
- if (likely((count & 31) || local_clock() < tlimit))
- continue;
- /* Exceeded the time limit, so leave. */
+ if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit))
break;
- }
} else {
- // In rcuoc context, so no worries about depriving
- // other softirq vectors of CPU cycles.
+ // In rcuc/rcuoc context, so no worries about
+ // depriving other softirq vectors of CPU cycles.
local_bh_enable();
lockdep_assert_irqs_enabled();
cond_resched_tasks_rcu_qs();
lockdep_assert_irqs_enabled();
local_bh_disable();
+ // But rcuc kthreads can delay quiescent-state
+ // reporting, so check time limits for them.
+ if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING &&
+ rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) {
+ rdp->rcu_cpu_has_work = 1;
+ break;
+ }
}
}
@@ -2459,12 +2485,12 @@ static void rcu_cpu_kthread(unsigned int cpu)
*statusp = RCU_KTHREAD_RUNNING;
local_irq_disable();
work = *workp;
- *workp = 0;
+ WRITE_ONCE(*workp, 0);
local_irq_enable();
if (work)
rcu_core();
local_bh_enable();
- if (*workp == 0) {
+ if (!READ_ONCE(*workp)) {
trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
*statusp = RCU_KTHREAD_WAITING;
return;
@@ -2756,7 +2782,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
*/
struct kvfree_rcu_bulk_data {
struct list_head list;
- unsigned long gp_snap;
+ struct rcu_gp_oldstate gp_snap;
unsigned long nr_records;
void *records[];
};
@@ -2773,6 +2799,7 @@ struct kvfree_rcu_bulk_data {
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
* @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
@@ -2780,6 +2807,7 @@ struct kvfree_rcu_bulk_data {
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
+ struct rcu_gp_oldstate head_free_gp_snap;
struct list_head bulk_head_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
@@ -2900,6 +2928,9 @@ drain_page_cache(struct kfree_rcu_cpu *krcp)
struct llist_node *page_list, *pos, *n;
int freed = 0;
+ if (!rcu_min_cached_objs)
+ return 0;
+
raw_spin_lock_irqsave(&krcp->lock, flags);
page_list = llist_del_all(&krcp->bkvcache);
WRITE_ONCE(krcp->nr_bkv_objs, 0);
@@ -2920,24 +2951,25 @@ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
unsigned long flags;
int i;
- debug_rcu_bhead_unqueue(bnode);
-
- rcu_lock_acquire(&rcu_callback_map);
- if (idx == 0) { // kmalloc() / kfree().
- trace_rcu_invoke_kfree_bulk_callback(
- rcu_state.name, bnode->nr_records,
- bnode->records);
-
- kfree_bulk(bnode->nr_records, bnode->records);
- } else { // vmalloc() / vfree().
- for (i = 0; i < bnode->nr_records; i++) {
- trace_rcu_invoke_kvfree_callback(
- rcu_state.name, bnode->records[i], 0);
-
- vfree(bnode->records[i]);
+ if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
+ debug_rcu_bhead_unqueue(bnode);
+ rcu_lock_acquire(&rcu_callback_map);
+ if (idx == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ rcu_state.name, bnode->nr_records,
+ bnode->records);
+
+ kfree_bulk(bnode->nr_records, bnode->records);
+ } else { // vmalloc() / vfree().
+ for (i = 0; i < bnode->nr_records; i++) {
+ trace_rcu_invoke_kvfree_callback(
+ rcu_state.name, bnode->records[i], 0);
+
+ vfree(bnode->records[i]);
+ }
}
+ rcu_lock_release(&rcu_callback_map);
}
- rcu_lock_release(&rcu_callback_map);
raw_spin_lock_irqsave(&krcp->lock, flags);
if (put_cached_bnode(krcp, bnode))
@@ -2984,6 +3016,7 @@ static void kfree_rcu_work(struct work_struct *work)
struct rcu_head *head;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
+ struct rcu_gp_oldstate head_gp_snap;
int i;
krwp = container_of(to_rcu_work(work),
@@ -2998,6 +3031,7 @@ static void kfree_rcu_work(struct work_struct *work)
// Channel 3.
head = krwp->head_free;
krwp->head_free = NULL;
+ head_gp_snap = krwp->head_free_gp_snap;
raw_spin_unlock_irqrestore(&krcp->lock, flags);
// Handle the first two channels.
@@ -3014,7 +3048,8 @@ static void kfree_rcu_work(struct work_struct *work)
* queued on a linked list through their rcu_head structures.
* This list is named "Channel 3".
*/
- kvfree_rcu_list(head);
+ if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
+ kvfree_rcu_list(head);
}
static bool
@@ -3081,7 +3116,7 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
INIT_LIST_HEAD(&bulk_ready[i]);
list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
- if (!poll_state_synchronize_rcu(bnode->gp_snap))
+ if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
break;
atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
@@ -3146,6 +3181,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
// objects queued on the linked list.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
+ get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
atomic_set(&krcp->head_count, 0);
WRITE_ONCE(krcp->head, NULL);
}
@@ -3194,7 +3230,7 @@ static void fill_page_cache_func(struct work_struct *work)
nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
1 : rcu_min_cached_objs;
- for (i = 0; i < nr_pages; i++) {
+ for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -3218,6 +3254,10 @@ static void fill_page_cache_func(struct work_struct *work)
static void
run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
+ // If cache disabled, bail out.
+ if (!rcu_min_cached_objs)
+ return;
+
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!atomic_xchg(&krcp->work_in_progress, 1)) {
if (atomic_read(&krcp->backoff_page_cache_fill)) {
@@ -3272,7 +3312,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
// scenarios.
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
- *krcp = krc_this_cpu_lock(flags);
+ raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
}
if (!bnode)
@@ -3285,7 +3325,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
// Finally insert and update the GP for this page.
bnode->records[bnode->nr_records++] = ptr;
- bnode->gp_snap = get_state_synchronize_rcu();
+ get_state_synchronize_rcu_full(&bnode->gp_snap);
atomic_inc(&(*krcp)->bulk_count[idx]);
return true;
@@ -4283,7 +4323,6 @@ int rcutree_prepare_cpu(unsigned int cpu)
*/
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- rdp->beenonline = true; /* We have now been online. */
rdp->gp_seq = READ_ONCE(rnp->gp_seq);
rdp->gp_seq_needed = rdp->gp_seq;
rdp->cpu_no_qs.b.norm = true;
@@ -4311,6 +4350,16 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
}
/*
+ * Has the specified (known valid) CPU ever been fully online?
+ */
+bool rcu_cpu_beenfullyonline(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return smp_load_acquire(&rdp->beenonline);
+}
+
+/*
* Near the end of the CPU-online process. Pretty much all services
* enabled, and the CPU is now very much alive.
*/
@@ -4368,15 +4417,16 @@ int rcutree_offline_cpu(unsigned int cpu)
* Note that this function is special in that it is invoked directly
* from the incoming CPU rather than from the cpuhp_step mechanism.
* This is because this function must be invoked at a precise location.
+ * This incoming CPU must not have enabled interrupts yet.
*/
void rcu_cpu_starting(unsigned int cpu)
{
- unsigned long flags;
unsigned long mask;
struct rcu_data *rdp;
struct rcu_node *rnp;
bool newcpu;
+ lockdep_assert_irqs_disabled();
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->cpu_started)
return;
@@ -4384,7 +4434,6 @@ void rcu_cpu_starting(unsigned int cpu)
rnp = rdp->mynode;
mask = rdp->grpmask;
- local_irq_save(flags);
arch_spin_lock(&rcu_state.ofl_lock);
rcu_dynticks_eqs_online();
raw_spin_lock(&rcu_state.barrier_lock);
@@ -4403,17 +4452,17 @@ void rcu_cpu_starting(unsigned int cpu)
/* An incoming CPU should never be blocking a grace period. */
if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
/* rcu_report_qs_rnp() *really* wants some flags to restore */
- unsigned long flags2;
+ unsigned long flags;
- local_irq_save(flags2);
+ local_irq_save(flags);
rcu_disable_urgency_upon_qs(rdp);
/* Report QS -after- changing ->qsmaskinitnext! */
- rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2);
+ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
} else {
raw_spin_unlock_rcu_node(rnp);
}
arch_spin_unlock(&rcu_state.ofl_lock);
- local_irq_restore(flags);
+ smp_store_release(&rdp->beenonline, true);
smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 3b7abb58157d..8239b39d945b 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -643,7 +643,7 @@ static void synchronize_rcu_expedited_wait(void)
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rnp->expmaskinit)],
"N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
- "D."[!!(rdp->cpu_no_qs.b.exp)]);
+ "D."[!!data_race(rdp->cpu_no_qs.b.exp)]);
}
}
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index f2280616f9d5..5598212d1f27 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -77,9 +77,9 @@ __setup("rcu_nocbs", rcu_nocb_setup);
static int __init parse_rcu_nocb_poll(char *arg)
{
rcu_nocb_poll = true;
- return 0;
+ return 1;
}
-early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
+__setup("rcu_nocb_poll", parse_rcu_nocb_poll);
/*
* Don't bother bypassing ->cblist if the call_rcu() rate is low.
@@ -1319,13 +1319,22 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
int cpu;
unsigned long count = 0;
+ if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask)))
+ return 0;
+
+ /* Protect rcu_nocb_mask against concurrent (de-)offloading. */
+ if (!mutex_trylock(&rcu_state.barrier_mutex))
+ return 0;
+
/* Snapshot count of all CPUs */
- for_each_possible_cpu(cpu) {
+ for_each_cpu(cpu, rcu_nocb_mask) {
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
count += READ_ONCE(rdp->lazy_len);
}
+ mutex_unlock(&rcu_state.barrier_mutex);
+
return count ? count : SHRINK_EMPTY;
}
@@ -1336,15 +1345,45 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
unsigned long flags;
unsigned long count = 0;
+ if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask)))
+ return 0;
+ /*
+ * Protect against concurrent (de-)offloading. Otherwise nocb locking
+ * may be ignored or imbalanced.
+ */
+ if (!mutex_trylock(&rcu_state.barrier_mutex)) {
+ /*
+ * But really don't insist if barrier_mutex is contended since we
+ * can't guarantee that it will never engage in a dependency
+ * chain involving memory allocation. The lock is seldom contended
+ * anyway.
+ */
+ return 0;
+ }
+
/* Snapshot count of all CPUs */
- for_each_possible_cpu(cpu) {
+ for_each_cpu(cpu, rcu_nocb_mask) {
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- int _count = READ_ONCE(rdp->lazy_len);
+ int _count;
+
+ if (WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)))
+ continue;
- if (_count == 0)
+ if (!READ_ONCE(rdp->lazy_len))
continue;
+
rcu_nocb_lock_irqsave(rdp, flags);
- WRITE_ONCE(rdp->lazy_len, 0);
+ /*
+ * Recheck under the nocb lock. Since we are not holding the bypass
+ * lock we may still race with increments from the enqueuer but still
+ * we know for sure if there is at least one lazy callback.
+ */
+ _count = READ_ONCE(rdp->lazy_len);
+ if (!_count) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue;
+ }
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
rcu_nocb_unlock_irqrestore(rdp, flags);
wake_nocb_gp(rdp, false);
sc->nr_to_scan -= _count;
@@ -1352,6 +1391,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
if (sc->nr_to_scan <= 0)
break;
}
+
+ mutex_unlock(&rcu_state.barrier_mutex);
+
return count ? count : SHRINK_STOP;
}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7b0fe741a088..41021080ad25 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -257,6 +257,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* GP should not be able to end until we report, so there should be
* no need to check for a subsequent expedited GP. (Though we are
* still in a quiescent state in any case.)
+ *
+ * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change.
*/
if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
@@ -941,7 +943,7 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- if (rdp->cpu_no_qs.b.exp)
+ if (READ_ONCE(rdp->cpu_no_qs.b.exp))
rcu_report_exp_rdp(rdp);
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b10b8349bb2a..6f06dc12904a 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -1035,7 +1035,7 @@ static bool sysrq_rcu;
module_param(sysrq_rcu, bool, 0444);
/* Dump grace-period-request information due to commandeered sysrq. */
-static void sysrq_show_rcu(int key)
+static void sysrq_show_rcu(u8 key)
{
show_rcu_gp_kthreads();
}
diff --git a/kernel/relay.c b/kernel/relay.c
index a80fa01042e9..83fe0325cde1 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -375,7 +375,7 @@ static struct dentry *relay_create_buf_file(struct rchan *chan,
*/
static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
{
- struct rchan_buf *buf = NULL;
+ struct rchan_buf *buf;
struct dentry *dentry;
if (chan->is_global)
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 5d113aa59e77..59032aaccd18 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -171,7 +171,8 @@ static void scf_torture_stats_print(void)
scfs.n_all_wait += scf_stats_p[i].n_all_wait;
}
if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) ||
- atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs))
+ atomic_read(&n_mb_out_errs) ||
+ (!IS_ENABLED(CONFIG_KASAN) && atomic_read(&n_alloc_errs)))
bangstr = "!!! ";
pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld single_rpc: %lld single_rpc_ofl: %lld many: %lld/%lld all: %lld/%lld ",
SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_resched,
@@ -312,6 +313,7 @@ static void scf_handler_1(void *scfc_in)
// Randomly do an smp_call_function*() invocation.
static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp)
{
+ bool allocfail = false;
uintptr_t cpu;
int ret = 0;
struct scf_check *scfcp = NULL;
@@ -323,8 +325,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
preempt_disable();
if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) {
scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC);
- if (WARN_ON_ONCE(!scfcp)) {
+ if (!scfcp) {
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_KASAN));
atomic_inc(&n_alloc_errs);
+ allocfail = true;
} else {
scfcp->scfc_cpu = -1;
scfcp->scfc_wait = scfsp->scfs_wait;
@@ -431,7 +435,9 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
cpus_read_unlock();
else
preempt_enable();
- if (!(torture_random(trsp) & 0xfff))
+ if (allocfail)
+ schedule_timeout_idle((1 + longwait) * HZ); // Let no-wait handlers complete.
+ else if (!(torture_random(trsp) & 0xfff))
schedule_timeout_uninterruptible(1);
}
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index b5cc2b53464d..3c6193de9cde 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -266,7 +266,7 @@ static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
s64 delta;
again:
- now = sched_clock();
+ now = sched_clock_noinstr();
delta = now - scd->tick_raw;
if (unlikely(delta < 0))
delta = 0;
@@ -287,28 +287,35 @@ again:
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
- if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock))
+ if (!raw_try_cmpxchg64(&scd->clock, &old_clock, clock))
goto again;
return clock;
}
-noinstr u64 local_clock(void)
+noinstr u64 local_clock_noinstr(void)
{
u64 clock;
if (static_branch_likely(&__sched_clock_stable))
- return sched_clock() + __sched_clock_offset;
+ return sched_clock_noinstr() + __sched_clock_offset;
if (!static_branch_likely(&sched_clock_running))
- return sched_clock();
+ return sched_clock_noinstr();
- preempt_disable_notrace();
clock = sched_clock_local(this_scd());
- preempt_enable_notrace();
return clock;
}
+
+u64 local_clock(void)
+{
+ u64 now;
+ preempt_disable_notrace();
+ now = local_clock_noinstr();
+ preempt_enable_notrace();
+ return now;
+}
EXPORT_SYMBOL_GPL(local_clock);
static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index d57a5c1c1cd9..3561ab533dd4 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -13,6 +13,23 @@
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
+static void complete_with_flags(struct completion *x, int wake_flags)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
+
+ if (x->done != UINT_MAX)
+ x->done++;
+ swake_up_locked(&x->wait, wake_flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+
+void complete_on_current_cpu(struct completion *x)
+{
+ return complete_with_flags(x, WF_CURRENT_CPU);
+}
+
/**
* complete: - signals a single thread waiting on this completion
* @x: holds the state of this particular completion
@@ -27,14 +44,7 @@
*/
void complete(struct completion *x)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&x->wait.lock, flags);
-
- if (x->done != UINT_MAX)
- x->done++;
- swake_up_locked(&x->wait);
- raw_spin_unlock_irqrestore(&x->wait.lock, flags);
+ complete_with_flags(x, 0);
}
EXPORT_SYMBOL(complete);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a68d1276bab0..2299a5cfbfb9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1097,25 +1097,22 @@ int get_nohz_timer_target(void)
hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
- rcu_read_lock();
+ guard(rcu)();
+
for_each_domain(cpu, sd) {
for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
if (cpu == i)
continue;
- if (!idle_cpu(i)) {
- cpu = i;
- goto unlock;
- }
+ if (!idle_cpu(i))
+ return i;
}
}
if (default_cpu == -1)
default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
- cpu = default_cpu;
-unlock:
- rcu_read_unlock();
- return cpu;
+
+ return default_cpu;
}
/*
@@ -1194,6 +1191,20 @@ static void nohz_csd_func(void *info)
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
+static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
+{
+ if (rq->nr_running != 1)
+ return false;
+
+ if (p->sched_class != &fair_sched_class)
+ return false;
+
+ if (!task_on_rq_queued(p))
+ return false;
+
+ return true;
+}
+
bool sched_can_stop_tick(struct rq *rq)
{
int fifo_nr_running;
@@ -1229,6 +1240,18 @@ bool sched_can_stop_tick(struct rq *rq)
if (rq->nr_running > 1)
return false;
+ /*
+ * If there is one task and it has CFS runtime bandwidth constraints
+ * and it's on the cpu now we don't want to stop the tick.
+ * This check prevents clearing the bit if a newly enqueued task here is
+ * dequeued by migrating while the constrained task continues to run.
+ * E.g. going from 2->1 without going through pick_next_task().
+ */
+ if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
+ if (cfs_task_bw_constrained(rq->curr))
+ return false;
+ }
+
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -1804,7 +1827,8 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
int old_min, old_max, old_min_rt;
int result;
- mutex_lock(&uclamp_mutex);
+ guard(mutex)(&uclamp_mutex);
+
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
@@ -1813,7 +1837,7 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
if (result)
goto undo;
if (!write)
- goto done;
+ return 0;
if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
@@ -1849,16 +1873,12 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
* Otherwise, keep it simple and do just a lazy update at each next
* task enqueue time.
*/
-
- goto done;
+ return 0;
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
-done:
- mutex_unlock(&uclamp_mutex);
-
return result;
}
#endif
@@ -2213,6 +2233,154 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
rq_clock_skip_update(rq);
}
+static __always_inline
+int __task_state_match(struct task_struct *p, unsigned int state)
+{
+ if (READ_ONCE(p->__state) & state)
+ return 1;
+
+#ifdef CONFIG_PREEMPT_RT
+ if (READ_ONCE(p->saved_state) & state)
+ return -1;
+#endif
+ return 0;
+}
+
+static __always_inline
+int task_state_match(struct task_struct *p, unsigned int state)
+{
+#ifdef CONFIG_PREEMPT_RT
+ int match;
+
+ /*
+ * Serialize against current_save_and_set_rtlock_wait_state() and
+ * current_restore_rtlock_saved_state().
+ */
+ raw_spin_lock_irq(&p->pi_lock);
+ match = __task_state_match(p, state);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return match;
+#else
+ return __task_state_match(p, state);
+#endif
+}
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero. When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count). If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
+{
+ int running, queued, match;
+ struct rq_flags rf;
+ unsigned long ncsw;
+ struct rq *rq;
+
+ for (;;) {
+ /*
+ * We do the initial early heuristics without holding
+ * any task-queue locks at all. We'll only try to get
+ * the runqueue lock when things look like they will
+ * work out!
+ */
+ rq = task_rq(p);
+
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since "task_on_cpu()" will
+ * return false if the runqueue has changed and p
+ * is actually now running somewhere else!
+ */
+ while (task_on_cpu(rq, p)) {
+ if (!task_state_match(p, match_state))
+ return 0;
+ cpu_relax();
+ }
+
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_rq_lock(p, &rf);
+ trace_sched_wait_task(p);
+ running = task_on_cpu(rq, p);
+ queued = task_on_rq_queued(p);
+ ncsw = 0;
+ if ((match = __task_state_match(p, match_state))) {
+ /*
+ * When matching on p->saved_state, consider this task
+ * still queued so it will wait.
+ */
+ if (match < 0)
+ queued = 1;
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+ }
+ task_rq_unlock(rq, p, &rf);
+
+ /*
+ * If it changed from the expected state, bail out now.
+ */
+ if (unlikely(!ncsw))
+ break;
+
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it was still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(queued)) {
+ ktime_t to = NSEC_PER_SEC / HZ;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
+ continue;
+ }
+
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
+
+ return ncsw;
+}
+
#ifdef CONFIG_SMP
static void
@@ -2398,7 +2566,6 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
if (!is_cpu_allowed(p, dest_cpu))
return rq;
- update_rq_clock(rq);
rq = move_queued_task(rq, rf, p, dest_cpu);
return rq;
@@ -2456,10 +2623,12 @@ static int migration_cpu_stop(void *data)
goto out;
}
- if (task_on_rq_queued(p))
+ if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
- else
+ } else {
p->wake_cpu = arg->dest_cpu;
+ }
/*
* XXX __migrate_task() can fail, at which point we might end
@@ -3264,7 +3433,6 @@ static int migrate_swap_stop(void *data)
{
struct migration_swap_arg *arg = data;
struct rq *src_rq, *dst_rq;
- int ret = -EAGAIN;
if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
return -EAGAIN;
@@ -3272,33 +3440,25 @@ static int migrate_swap_stop(void *data)
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
- double_raw_lock(&arg->src_task->pi_lock,
- &arg->dst_task->pi_lock);
- double_rq_lock(src_rq, dst_rq);
+ guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock);
+ guard(double_rq_lock)(src_rq, dst_rq);
if (task_cpu(arg->dst_task) != arg->dst_cpu)
- goto unlock;
+ return -EAGAIN;
if (task_cpu(arg->src_task) != arg->src_cpu)
- goto unlock;
+ return -EAGAIN;
if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
- goto unlock;
+ return -EAGAIN;
if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
- goto unlock;
+ return -EAGAIN;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
__migrate_swap_task(arg->dst_task, arg->src_cpu);
- ret = 0;
-
-unlock:
- double_rq_unlock(src_rq, dst_rq);
- raw_spin_unlock(&arg->dst_task->pi_lock);
- raw_spin_unlock(&arg->src_task->pi_lock);
-
- return ret;
+ return 0;
}
/*
@@ -3341,114 +3501,6 @@ out:
}
#endif /* CONFIG_NUMA_BALANCING */
-/*
- * wait_task_inactive - wait for a thread to unschedule.
- *
- * Wait for the thread to block in any of the states set in @match_state.
- * If it changes, i.e. @p might have woken up, then return zero. When we
- * succeed in waiting for @p to be off its CPU, we return a positive number
- * (its total switch count). If a second call a short while later returns the
- * same number, the caller can be sure that @p has remained unscheduled the
- * whole time.
- *
- * The caller must ensure that the task *will* unschedule sometime soon,
- * else this function might spin for a *long* time. This function can't
- * be called with interrupts off, or it may introduce deadlock with
- * smp_call_function() if an IPI is sent by the same process we are
- * waiting to become inactive.
- */
-unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
-{
- int running, queued;
- struct rq_flags rf;
- unsigned long ncsw;
- struct rq *rq;
-
- for (;;) {
- /*
- * We do the initial early heuristics without holding
- * any task-queue locks at all. We'll only try to get
- * the runqueue lock when things look like they will
- * work out!
- */
- rq = task_rq(p);
-
- /*
- * If the task is actively running on another CPU
- * still, just relax and busy-wait without holding
- * any locks.
- *
- * NOTE! Since we don't hold any locks, it's not
- * even sure that "rq" stays as the right runqueue!
- * But we don't care, since "task_on_cpu()" will
- * return false if the runqueue has changed and p
- * is actually now running somewhere else!
- */
- while (task_on_cpu(rq, p)) {
- if (!(READ_ONCE(p->__state) & match_state))
- return 0;
- cpu_relax();
- }
-
- /*
- * Ok, time to look more closely! We need the rq
- * lock now, to be *sure*. If we're wrong, we'll
- * just go back and repeat.
- */
- rq = task_rq_lock(p, &rf);
- trace_sched_wait_task(p);
- running = task_on_cpu(rq, p);
- queued = task_on_rq_queued(p);
- ncsw = 0;
- if (READ_ONCE(p->__state) & match_state)
- ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, p, &rf);
-
- /*
- * If it changed from the expected state, bail out now.
- */
- if (unlikely(!ncsw))
- break;
-
- /*
- * Was it really running after all now that we
- * checked with the proper locks actually held?
- *
- * Oops. Go back and try again..
- */
- if (unlikely(running)) {
- cpu_relax();
- continue;
- }
-
- /*
- * It's not enough that it's not actively running,
- * it must be off the runqueue _entirely_, and not
- * preempted!
- *
- * So if it was still runnable (but just not actively
- * running right now), it's preempted, and we should
- * yield - it could be a while.
- */
- if (unlikely(queued)) {
- ktime_t to = NSEC_PER_SEC / HZ;
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
- continue;
- }
-
- /*
- * Ahh, all good. It wasn't running, and it wasn't
- * runnable, which means that it will never become
- * running in the future either. We're all done!
- */
- break;
- }
-
- return ncsw;
-}
-
/***
* kick_process - kick a running thread to enter/exit the kernel
* @p: the to-be-kicked thread
@@ -3681,14 +3733,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
struct sched_domain *sd;
__schedstat_inc(p->stats.nr_wakeups_remote);
- rcu_read_lock();
+
+ guard(rcu)();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
__schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
- rcu_read_unlock();
}
if (wake_flags & WF_MIGRATED)
@@ -3887,21 +3939,13 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
-
- rcu_read_lock();
-
- if (!is_idle_task(rcu_dereference(rq->curr)))
- goto out;
- rq_lock_irqsave(rq, &rf);
- if (is_idle_task(rq->curr))
- resched_curr(rq);
- /* Else CPU is not idle, do nothing here: */
- rq_unlock_irqrestore(rq, &rf);
-
-out:
- rcu_read_unlock();
+ guard(rcu)();
+ if (is_idle_task(rcu_dereference(rq->curr))) {
+ guard(rq_lock_irqsave)(rq);
+ if (is_idle_task(rq->curr))
+ resched_curr(rq);
+ }
}
bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -4003,15 +4047,14 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
static __always_inline
bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
{
+ int match;
+
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
state != TASK_RTLOCK_WAIT);
}
- if (READ_ONCE(p->__state) & state) {
- *success = 1;
- return true;
- }
+ *success = !!(match = __task_state_match(p, state));
#ifdef CONFIG_PREEMPT_RT
/*
@@ -4027,12 +4070,10 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
* p::saved_state to TASK_RUNNING so any further tests will
* not result in false positives vs. @success
*/
- if (p->saved_state & state) {
+ if (match < 0)
p->saved_state = TASK_RUNNING;
- *success = 1;
- }
#endif
- return false;
+ return match > 0;
}
/*
@@ -4155,13 +4196,11 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
*/
-static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- unsigned long flags;
+ guard(preempt)();
int cpu, success = 0;
- preempt_disable();
if (p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -4188,129 +4227,127 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* reordered with p->state check below. This pairs with smp_store_mb()
* in set_current_state() that the waiting thread does.
*/
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- smp_mb__after_spinlock();
- if (!ttwu_state_match(p, state, &success))
- goto unlock;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ smp_mb__after_spinlock();
+ if (!ttwu_state_match(p, state, &success))
+ break;
- trace_sched_waking(p);
+ trace_sched_waking(p);
- /*
- * Ensure we load p->on_rq _after_ p->state, otherwise it would
- * be possible to, falsely, observe p->on_rq == 0 and get stuck
- * in smp_cond_load_acquire() below.
- *
- * sched_ttwu_pending() try_to_wake_up()
- * STORE p->on_rq = 1 LOAD p->state
- * UNLOCK rq->lock
- *
- * __schedule() (switch to task 'p')
- * LOCK rq->lock smp_rmb();
- * smp_mb__after_spinlock();
- * UNLOCK rq->lock
- *
- * [task p]
- * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
- *
- * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
- * __schedule(). See the comment for smp_mb__after_spinlock().
- *
- * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
- */
- smp_rmb();
- if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
- goto unlock;
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * STORE p->on_rq = 1 LOAD p->state
+ * UNLOCK rq->lock
+ *
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
+ */
+ smp_rmb();
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
+ break;
#ifdef CONFIG_SMP
- /*
- * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
- * possible to, falsely, observe p->on_cpu == 0.
- *
- * One must be running (->on_cpu == 1) in order to remove oneself
- * from the runqueue.
- *
- * __schedule() (switch to task 'p') try_to_wake_up()
- * STORE p->on_cpu = 1 LOAD p->on_rq
- * UNLOCK rq->lock
- *
- * __schedule() (put 'p' to sleep)
- * LOCK rq->lock smp_rmb();
- * smp_mb__after_spinlock();
- * STORE p->on_rq = 0 LOAD p->on_cpu
- *
- * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
- * __schedule(). See the comment for smp_mb__after_spinlock().
- *
- * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
- * care about it's own p->state. See the comment in __schedule().
- */
- smp_acquire__after_ctrl_dep();
+ /*
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ * possible to, falsely, observe p->on_cpu == 0.
+ *
+ * One must be running (->on_cpu == 1) in order to remove oneself
+ * from the runqueue.
+ *
+ * __schedule() (switch to task 'p') try_to_wake_up()
+ * STORE p->on_cpu = 1 LOAD p->on_rq
+ * UNLOCK rq->lock
+ *
+ * __schedule() (put 'p' to sleep)
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * STORE p->on_rq = 0 LOAD p->on_cpu
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * care about it's own p->state. See the comment in __schedule().
+ */
+ smp_acquire__after_ctrl_dep();
- /*
- * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
- * == 0), which means we need to do an enqueue, change p->state to
- * TASK_WAKING such that we can unlock p->pi_lock before doing the
- * enqueue, such as ttwu_queue_wakelist().
- */
- WRITE_ONCE(p->__state, TASK_WAKING);
+ /*
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
+ * == 0), which means we need to do an enqueue, change p->state to
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
+ * enqueue, such as ttwu_queue_wakelist().
+ */
+ WRITE_ONCE(p->__state, TASK_WAKING);
- /*
- * If the owning (remote) CPU is still in the middle of schedule() with
- * this task as prev, considering queueing p on the remote CPUs wake_list
- * which potentially sends an IPI instead of spinning on p->on_cpu to
- * let the waker make forward progress. This is safe because IRQs are
- * disabled and the IPI will deliver after on_cpu is cleared.
- *
- * Ensure we load task_cpu(p) after p->on_cpu:
- *
- * set_task_cpu(p, cpu);
- * STORE p->cpu = @cpu
- * __schedule() (switch to task 'p')
- * LOCK rq->lock
- * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
- * STORE p->on_cpu = 1 LOAD p->cpu
- *
- * to ensure we observe the correct CPU on which the task is currently
- * scheduling.
- */
- if (smp_load_acquire(&p->on_cpu) &&
- ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
- goto unlock;
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, considering queueing p on the remote CPUs wake_list
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
+ * let the waker make forward progress. This is safe because IRQs are
+ * disabled and the IPI will deliver after on_cpu is cleared.
+ *
+ * Ensure we load task_cpu(p) after p->on_cpu:
+ *
+ * set_task_cpu(p, cpu);
+ * STORE p->cpu = @cpu
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
+ * STORE p->on_cpu = 1 LOAD p->cpu
+ *
+ * to ensure we observe the correct CPU on which the task is currently
+ * scheduling.
+ */
+ if (smp_load_acquire(&p->on_cpu) &&
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
+ break;
- /*
- * If the owning (remote) CPU is still in the middle of schedule() with
- * this task as prev, wait until it's done referencing the task.
- *
- * Pairs with the smp_store_release() in finish_task().
- *
- * This ensures that tasks getting woken will be fully ordered against
- * their previous state and preserve Program Order.
- */
- smp_cond_load_acquire(&p->on_cpu, !VAL);
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, wait until it's done referencing the task.
+ *
+ * Pairs with the smp_store_release() in finish_task().
+ *
+ * This ensures that tasks getting woken will be fully ordered against
+ * their previous state and preserve Program Order.
+ */
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
- cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
- if (task_cpu(p) != cpu) {
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
+ cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
+ if (task_cpu(p) != cpu) {
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
- wake_flags |= WF_MIGRATED;
- psi_ttwu_dequeue(p);
- set_task_cpu(p, cpu);
- }
+ wake_flags |= WF_MIGRATED;
+ psi_ttwu_dequeue(p);
+ set_task_cpu(p, cpu);
+ }
#else
- cpu = task_cpu(p);
+ cpu = task_cpu(p);
#endif /* CONFIG_SMP */
- ttwu_queue(p, cpu, wake_flags);
-unlock:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ ttwu_queue(p, cpu, wake_flags);
+ }
out:
if (success)
ttwu_stat(p, task_cpu(p), wake_flags);
- preempt_enable();
return success;
}
@@ -4463,6 +4500,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+ p->se.vlag = 0;
+ p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -5458,23 +5497,20 @@ unsigned int nr_iowait(void)
void sched_exec(void)
{
struct task_struct *p = current;
- unsigned long flags;
+ struct migration_arg arg;
int dest_cpu;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
- if (dest_cpu == smp_processor_id())
- goto unlock;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
+ if (dest_cpu == smp_processor_id())
+ return;
- if (likely(cpu_active(dest_cpu))) {
- struct migration_arg arg = { p, dest_cpu };
+ if (unlikely(!cpu_active(dest_cpu)))
+ return;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
- return;
+ arg = (struct migration_arg){ p, dest_cpu };
}
-unlock:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}
#endif
@@ -5632,6 +5668,9 @@ void scheduler_tick(void)
perf_event_task_tick();
+ if (curr->flags & PF_WQ_WORKER)
+ wq_worker_tick(curr);
+
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
@@ -5681,9 +5720,6 @@ static void sched_tick_remote(struct work_struct *work)
struct tick_work *twork = container_of(dwork, struct tick_work, work);
int cpu = twork->cpu;
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr;
- struct rq_flags rf;
- u64 delta;
int os;
/*
@@ -5693,30 +5729,26 @@ static void sched_tick_remote(struct work_struct *work)
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
- if (!tick_nohz_tick_stopped_cpu(cpu))
- goto out_requeue;
+ if (tick_nohz_tick_stopped_cpu(cpu)) {
+ guard(rq_lock_irq)(rq);
+ struct task_struct *curr = rq->curr;
- rq_lock_irq(rq, &rf);
- curr = rq->curr;
- if (cpu_is_offline(cpu))
- goto out_unlock;
+ if (cpu_online(cpu)) {
+ update_rq_clock(rq);
- update_rq_clock(rq);
+ if (!is_idle_task(curr)) {
+ /*
+ * Make sure the next tick runs within a
+ * reasonable amount of time.
+ */
+ u64 delta = rq_clock_task(rq) - curr->se.exec_start;
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ }
+ curr->sched_class->task_tick(rq, curr, 0);
- if (!is_idle_task(curr)) {
- /*
- * Make sure the next tick runs within a reasonable
- * amount of time.
- */
- delta = rq_clock_task(rq) - curr->se.exec_start;
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ calc_load_nohz_remote(rq);
+ }
}
- curr->sched_class->task_tick(rq, curr, 0);
-
- calc_load_nohz_remote(rq);
-out_unlock:
- rq_unlock_irq(rq, &rf);
-out_requeue:
/*
* Run the remote tick once per second (1Hz). This arbitrary
@@ -6265,19 +6297,19 @@ static bool try_steal_cookie(int this, int that)
unsigned long cookie;
bool success = false;
- local_irq_disable();
- double_rq_lock(dst, src);
+ guard(irq)();
+ guard(double_rq_lock)(dst, src);
cookie = dst->core->core_cookie;
if (!cookie)
- goto unlock;
+ return false;
if (dst->curr != dst->idle)
- goto unlock;
+ return false;
p = sched_core_find(src, cookie);
if (!p)
- goto unlock;
+ return false;
do {
if (p == src->core_pick || p == src->curr)
@@ -6289,9 +6321,10 @@ static bool try_steal_cookie(int this, int that)
if (p->core_occupation > dst->idle->core_occupation)
goto next;
/*
- * sched_core_find() and sched_core_next() will ensure that task @p
- * is not throttled now, we also need to check whether the runqueue
- * of the destination CPU is being throttled.
+ * sched_core_find() and sched_core_next() will ensure
+ * that task @p is not throttled now, we also need to
+ * check whether the runqueue of the destination CPU is
+ * being throttled.
*/
if (sched_task_is_throttled(p, this))
goto next;
@@ -6309,10 +6342,6 @@ next:
p = sched_core_next(p, cookie);
} while (p);
-unlock:
- double_rq_unlock(dst, src);
- local_irq_enable();
-
return success;
}
@@ -6370,20 +6399,24 @@ static void queue_core_balance(struct rq *rq)
queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
}
+DEFINE_LOCK_GUARD_1(core_lock, int,
+ sched_core_lock(*_T->lock, &_T->flags),
+ sched_core_unlock(*_T->lock, &_T->flags),
+ unsigned long flags)
+
static void sched_core_cpu_starting(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
- unsigned long flags;
int t;
- sched_core_lock(cpu, &flags);
+ guard(core_lock)(&cpu);
WARN_ON_ONCE(rq->core != rq);
/* if we're the first, we'll be our own leader */
if (cpumask_weight(smt_mask) == 1)
- goto unlock;
+ return;
/* find the leader */
for_each_cpu(t, smt_mask) {
@@ -6397,7 +6430,7 @@ static void sched_core_cpu_starting(unsigned int cpu)
}
if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
- goto unlock;
+ return;
/* install and validate core_rq */
for_each_cpu(t, smt_mask) {
@@ -6408,29 +6441,25 @@ static void sched_core_cpu_starting(unsigned int cpu)
WARN_ON_ONCE(rq->core != core_rq);
}
-
-unlock:
- sched_core_unlock(cpu, &flags);
}
static void sched_core_cpu_deactivate(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
- unsigned long flags;
int t;
- sched_core_lock(cpu, &flags);
+ guard(core_lock)(&cpu);
/* if we're the last man standing, nothing to do */
if (cpumask_weight(smt_mask) == 1) {
WARN_ON_ONCE(rq->core != rq);
- goto unlock;
+ return;
}
/* if we're not the leader, nothing to do */
if (rq->core != rq)
- goto unlock;
+ return;
/* find a new leader */
for_each_cpu(t, smt_mask) {
@@ -6441,7 +6470,7 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
}
if (WARN_ON_ONCE(!core_rq)) /* impossible */
- goto unlock;
+ return;
/* copy the shared state to the new leader */
core_rq->core_task_seq = rq->core_task_seq;
@@ -6463,9 +6492,6 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
rq = cpu_rq(t);
rq->core = core_rq;
}
-
-unlock:
- sched_core_unlock(cpu, &flags);
}
static inline void sched_core_cpu_dying(unsigned int cpu)
@@ -6989,7 +7015,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
- WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
@@ -7342,6 +7368,19 @@ struct task_struct *idle_task(int cpu)
return cpu_rq(cpu)->idle;
}
+#ifdef CONFIG_SCHED_CORE
+int sched_core_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (sched_core_enabled(rq) && rq->curr == rq->idle)
+ return 1;
+
+ return idle_cpu(cpu);
+}
+
+#endif
+
#ifdef CONFIG_SMP
/*
* This function computes an effective utilization for the given CPU, to be
@@ -7590,6 +7629,7 @@ static int __sched_setscheduler(struct task_struct *p,
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
+ bool cpuset_locked = false;
/* The pi code expects interrupts enabled */
BUG_ON(pi && in_interrupt());
@@ -7639,8 +7679,14 @@ recheck:
return retval;
}
- if (pi)
- cpuset_read_lock();
+ /*
+ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
+ * information.
+ */
+ if (dl_policy(policy) || dl_policy(p->policy)) {
+ cpuset_locked = true;
+ cpuset_lock();
+ }
/*
* Make sure no PI-waiters arrive (or leave) while we are
@@ -7716,8 +7762,8 @@ change:
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
goto recheck;
}
@@ -7784,7 +7830,8 @@ change:
task_rq_unlock(rq, p, &rf);
if (pi) {
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
rt_mutex_adjust_pi(p);
}
@@ -7796,8 +7843,8 @@ change:
unlock:
task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
return retval;
}
@@ -9286,8 +9333,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-int task_can_attach(struct task_struct *p,
- const struct cpumask *cs_effective_cpus)
+int task_can_attach(struct task_struct *p)
{
int ret = 0;
@@ -9300,21 +9346,9 @@ int task_can_attach(struct task_struct *p,
* success of set_cpus_allowed_ptr() on all attached tasks
* before cpus_mask may be changed.
*/
- if (p->flags & PF_NO_SETAFFINITY) {
+ if (p->flags & PF_NO_SETAFFINITY)
ret = -EINVAL;
- goto out;
- }
-
- if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_effective_cpus)) {
- int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
- if (unlikely(cpu >= nr_cpu_ids))
- return -EINVAL;
- ret = dl_cpu_busy(cpu, p);
- }
-
-out:
return ret;
}
@@ -9548,6 +9582,7 @@ void set_rq_offline(struct rq *rq)
if (rq->online) {
const struct sched_class *class;
+ update_rq_clock(rq);
for_each_class(class) {
if (class->rq_offline)
class->rq_offline(rq);
@@ -9596,7 +9631,7 @@ static void cpuset_cpu_active(void)
static int cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- int ret = dl_cpu_busy(cpu, NULL);
+ int ret = dl_bw_check_overflow(cpu);
if (ret)
return ret;
@@ -9689,7 +9724,6 @@ int sched_cpu_deactivate(unsigned int cpu)
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
- update_rq_clock(rq);
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
@@ -9904,7 +9938,7 @@ void __init sched_init(void)
ptr += nr_cpu_ids * sizeof(void **);
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -11038,11 +11072,16 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
/*
* Ensure max(child_quota) <= parent_quota. On cgroup2,
- * always take the min. On cgroup1, only inherit when no
- * limit is set:
+ * always take the non-RUNTIME_INF min. On cgroup1, only
+ * inherit when no limit is set. In both cases this is used
+ * by the scheduler to determine if a given CFS task has a
+ * bandwidth constraint at some higher level.
*/
if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
- quota = min(quota, parent_quota);
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ else if (parent_quota != RUNTIME_INF)
+ quota = min(quota, parent_quota);
} else {
if (quota == RUNTIME_INF)
quota = parent_quota;
@@ -11103,6 +11142,27 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
return 0;
}
+
+static u64 throttled_time_self(struct task_group *tg)
+{
+ int i;
+ u64 total = 0;
+
+ for_each_possible_cpu(i) {
+ total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
+ }
+
+ return total;
+}
+
+static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg));
+
+ return 0;
+}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -11179,6 +11239,10 @@ static struct cftype cpu_legacy_files[] = {
.name = "stat",
.seq_show = cpu_cfs_stat_show,
},
+ {
+ .name = "stat.local",
+ .seq_show = cpu_cfs_local_stat_show,
+ },
#endif
#ifdef CONFIG_RT_GROUP_SCHED
{
@@ -11235,6 +11299,24 @@ static int cpu_extra_stat_show(struct seq_file *sf,
return 0;
}
+static int cpu_local_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ u64 throttled_self_usec;
+
+ throttled_self_usec = throttled_time_self(tg);
+ do_div(throttled_self_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, "throttled_usec %llu\n",
+ throttled_self_usec);
+ }
+#endif
+ return 0;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
@@ -11413,6 +11495,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
+ .css_local_stat_show = cpu_local_stat_show,
#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
#endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e3211455b203..4492608b7d7f 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -155,10 +155,11 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
+ unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu);
struct rq *rq = cpu_rq(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
+ sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util,
FREQUENCY_UTIL, NULL);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5a9a4b81c972..58b542bf2893 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -16,6 +16,8 @@
* Fabio Checconi <[email protected]>
*/
+#include <linux/cpuset.h>
+
/*
* Default limits for DL period; on the top end we guard against small util
* tasks still getting ridiculously long effective runtimes, on the bottom end we
@@ -489,13 +491,6 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
-void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
-{
- raw_spin_lock_init(&dl_b->dl_runtime_lock);
- dl_b->dl_period = period;
- dl_b->dl_runtime = runtime;
-}
-
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
@@ -1260,43 +1255,39 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
}
/*
- * This function implements the GRUB accounting rule:
- * according to the GRUB reclaiming algorithm, the runtime is
- * not decreased as "dq = -dt", but as
- * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
+ * This function implements the GRUB accounting rule. According to the
+ * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt",
+ * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt",
* where u is the utilization of the task, Umax is the maximum reclaimable
* utilization, Uinact is the (per-runqueue) inactive utilization, computed
* as the difference between the "total runqueue utilization" and the
- * runqueue active utilization, and Uextra is the (per runqueue) extra
+ * "runqueue active utilization", and Uextra is the (per runqueue) extra
* reclaimable utilization.
- * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
- * multiplied by 2^BW_SHIFT, the result has to be shifted right by
- * BW_SHIFT.
- * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
- * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
- * Since delta is a 64 bit variable, to have an overflow its value
- * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
- * So, overflow is not an issue here.
+ * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
+ * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw
+ * is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value should be
+ * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
+ * not an issue here.
*/
static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
{
- u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
u64 u_act;
- u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
+ u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
/*
- * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
- * we compare u_inact + rq->dl.extra_bw with
- * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
- * u_inact + rq->dl.extra_bw can be larger than
- * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
- * leading to wrong results)
+ * Instead of computing max{u, (u_max - u_inact - u_extra)}, we
+ * compare u_inact + u_extra with u_max - u, because u_inact + u_extra
+ * can be larger than u_max. So, u_max - u_inact - u_extra would be
+ * negative leading to wrong results.
*/
- if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
- u_act = u_act_min;
+ if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw)
+ u_act = dl_se->dl_bw;
else
- u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
+ u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw;
+ u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT;
return (delta * u_act) >> BW_SHIFT;
}
@@ -2596,6 +2587,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
if (task_on_rq_queued(p) && p->dl.dl_runtime)
task_non_contending(p);
+ /*
+ * In case a task is setscheduled out from SCHED_DEADLINE we need to
+ * keep track of that on its cpuset (for correct bandwidth tracking).
+ */
+ dec_dl_tasks_cs(p);
+
if (!task_on_rq_queued(p)) {
/*
* Inactive timer is armed. However, p is leaving DEADLINE and
@@ -2636,6 +2633,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p);
+ /*
+ * In case a task is setscheduled to SCHED_DEADLINE we need to keep
+ * track of that on its cpuset (for correct bandwidth tracking).
+ */
+ inc_dl_tasks_cs(p);
+
/* If p is not queued we will update its parameters at next wakeup. */
if (!task_on_rq_queued(p)) {
add_rq_bw(&p->dl, &rq->dl);
@@ -2795,12 +2798,12 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
{
if (global_rt_runtime() == RUNTIME_INF) {
dl_rq->bw_ratio = 1 << RATIO_SHIFT;
- dl_rq->extra_bw = 1 << BW_SHIFT;
+ dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT;
} else {
dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
- dl_rq->extra_bw = to_ratio(global_rt_period(),
- global_rt_runtime());
+ dl_rq->max_bw = dl_rq->extra_bw =
+ to_ratio(global_rt_period(), global_rt_runtime());
}
}
@@ -3044,26 +3047,38 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-int dl_cpu_busy(int cpu, struct task_struct *p)
+enum dl_bw_request {
+ dl_bw_req_check_overflow = 0,
+ dl_bw_req_alloc,
+ dl_bw_req_free
+};
+
+static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
{
- unsigned long flags, cap;
+ unsigned long flags;
struct dl_bw *dl_b;
- bool overflow;
+ bool overflow = 0;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cap = dl_bw_capacity(cpu);
- overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0);
- if (!overflow && p) {
- /*
- * We reserve space for this task in the destination
- * root_domain, as we can't fail after this point.
- * We will free resources in the source root_domain
- * later on (see set_cpus_allowed_dl()).
- */
- __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu));
+ if (req == dl_bw_req_free) {
+ __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
+ } else {
+ unsigned long cap = dl_bw_capacity(cpu);
+
+ overflow = __dl_overflow(dl_b, cap, 0, dl_bw);
+
+ if (req == dl_bw_req_alloc && !overflow) {
+ /*
+ * We reserve space in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
+ }
}
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
@@ -3071,6 +3086,21 @@ int dl_cpu_busy(int cpu, struct task_struct *p)
return overflow ? -EBUSY : 0;
}
+
+int dl_bw_check_overflow(int cpu)
+{
+ return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0);
+}
+
+int dl_bw_alloc(int cpu, u64 dl_bw)
+{
+ return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw);
+}
+
+void dl_bw_free(int cpu, u64 dl_bw)
+{
+ dl_bw_manage(dl_bw_req_free, cpu, dl_bw);
+}
#endif
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0b2340a79b65..4c3d0d9f3db6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -347,10 +347,7 @@ static __init int sched_init_debug(void)
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
#endif
- debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
- debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
- debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
- debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice);
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
@@ -427,6 +424,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent)
#undef SDM
debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
+ debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops);
}
void update_sched_domain_debugfs(void)
@@ -581,9 +579,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else
SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
+ SPLIT_NS(p->se.deadline),
+ SPLIT_NS(p->se.slice),
+ SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
@@ -626,10 +628,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
- spread, rq0_min_vruntime, spread0;
+ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
+ struct sched_entity *last, *first;
struct rq *rq = cpu_rq(cpu);
- struct sched_entity *last;
unsigned long flags;
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -643,26 +644,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_rq_lock_irqsave(rq, flags);
- if (rb_first_cached(&cfs_rq->tasks_timeline))
- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+ first = __pick_first_entity(cfs_rq);
+ if (first)
+ left_vruntime = first->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
- max_vruntime = last->vruntime;
+ right_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
raw_spin_rq_unlock_irqrestore(rq, flags);
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
- SPLIT_NS(MIN_vruntime));
+
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
+ SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
SPLIT_NS(min_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
- SPLIT_NS(max_vruntime));
- spread = max_vruntime - MIN_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
- SPLIT_NS(spread));
- spread0 = min_vruntime - rq0_min_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
- SPLIT_NS(spread0));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
+ SPLIT_NS(avg_vruntime(cfs_rq)));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
+ SPLIT_NS(right_vruntime));
+ spread = right_vruntime - left_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
@@ -777,7 +777,7 @@ static void print_cpu(struct seq_file *m, int cpu)
#define P(x) \
do { \
if (sizeof(rq->x) == 4) \
- SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
+ SEQ_printf(m, " .%-30s: %d\n", #x, (int)(rq->x)); \
else \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
} while (0)
@@ -863,10 +863,7 @@ static void sched_debug_header(struct seq_file *m)
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
- PN(sysctl_sched_latency);
- PN(sysctl_sched_min_granularity);
- PN(sysctl_sched_idle_min_granularity);
- PN(sysctl_sched_wakeup_granularity);
+ PN(sysctl_sched_base_slice);
P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
#undef PN
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 373ff5f55884..8dbff6e7ad4f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
#include <linux/psi.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
+#include <linux/rbtree_augmented.h>
#include <asm/switch_to.h>
@@ -57,22 +58,6 @@
#include "autogroup.h"
/*
- * Targeted preemption latency for CPU-bound tasks:
- *
- * NOTE: this latency value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS are of variable length
- * and have no persistent notion like in traditional, time-slice
- * based scheduling concepts.
- *
- * (to see the precise effective timeslice length of your workload,
- * run vmstat and monitor the context-switches (cs) field)
- *
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_latency = 6000000ULL;
-static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
-
-/*
* The initial- and re-scaling of tunables is configurable
*
* Options are:
@@ -90,21 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
-
-/*
- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
- * Applies only when SCHED_IDLE tasks compete with normal tasks.
- *
- * (default: 0.75 msec)
- */
-unsigned int sysctl_sched_idle_min_granularity = 750000ULL;
-
-/*
- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
- */
-static unsigned int sched_nr_latency = 8;
+unsigned int sysctl_sched_base_slice = 750000ULL;
+static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
/*
* After fork, child runs first. If set to 0 (default) then
@@ -112,18 +84,6 @@ static unsigned int sched_nr_latency = 8;
*/
unsigned int sysctl_sched_child_runs_first __read_mostly;
-/*
- * SCHED_OTHER wake-up granularity.
- *
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
- *
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
-
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
int sched_thermal_decay_shift;
@@ -277,9 +237,7 @@ static void update_sysctl(void)
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
- SET_SYSCTL(sched_min_granularity);
- SET_SYSCTL(sched_latency);
- SET_SYSCTL(sched_wakeup_granularity);
+ SET_SYSCTL(sched_base_slice);
#undef SET_SYSCTL
}
@@ -347,6 +305,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+/*
+ * delta /= w
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+ return delta;
+}
const struct sched_class fair_sched_class;
@@ -601,13 +569,198 @@ static inline bool entity_before(const struct sched_entity *a,
return (s64)(a->vruntime - b->vruntime) < 0;
}
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return (s64)(se->vruntime - cfs_rq->min_vruntime);
+}
+
#define __node_2_se(node) \
rb_entry((node), struct sched_entity, run_node)
+/*
+ * Compute virtual time from the per-task service numbers:
+ *
+ * Fair schedulers conserve lag:
+ *
+ * \Sum lag_i = 0
+ *
+ * Where lag_i is given by:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * Where S is the ideal service time and V is it's virtual time counterpart.
+ * Therefore:
+ *
+ * \Sum lag_i = 0
+ * \Sum w_i * (V - v_i) = 0
+ * \Sum w_i * V - w_i * v_i = 0
+ *
+ * From which we can solve an expression for V in v_i (which we have in
+ * se->vruntime):
+ *
+ * \Sum v_i * w_i \Sum v_i * w_i
+ * V = -------------- = --------------
+ * \Sum w_i W
+ *
+ * Specifically, this is the weighted average of all entity virtual runtimes.
+ *
+ * [[ NOTE: this is only equal to the ideal scheduler under the condition
+ * that join/leave operations happen at lag_i = 0, otherwise the
+ * virtual time has non-continguous motion equivalent to:
+ *
+ * V +-= lag_i / W
+ *
+ * Also see the comment in place_entity() that deals with this. ]]
+ *
+ * However, since v_i is u64, and the multiplcation could easily overflow
+ * transform it into a relative form that uses smaller quantities:
+ *
+ * Substitute: v_i == (v_i - v0) + v0
+ *
+ * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i
+ * V = ---------------------------- = --------------------- + v0
+ * W W
+ *
+ * Which we track using:
+ *
+ * v0 := cfs_rq->min_vruntime
+ * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
+ * \Sum w_i := cfs_rq->avg_load
+ *
+ * Since min_vruntime is a monotonic increasing variable that closely tracks
+ * the per-task service, these deltas: (v_i - v), will be in the order of the
+ * maximal (virtual) lag induced in the system due to quantisation.
+ *
+ * Also, we use scale_load_down() to reduce the size.
+ *
+ * As measured, the max (key * weight) value was ~44 bits for a kernel build.
+ */
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime += key * weight;
+ cfs_rq->avg_load += weight;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime -= key * weight;
+ cfs_rq->avg_load -= weight;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+ /*
+ * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
+ */
+ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+}
+
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ if (load)
+ avg = div_s64(avg, load);
+
+ return cfs_rq->min_vruntime + avg;
+}
+
+/*
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * However, since V is approximated by the weighted average of all entities it
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
+ * and end up with a larger lag than we started with.
+ *
+ * Limit this to either double the slice length with a minimum of TICK_NSEC
+ * since that is the timing granularity.
+ *
+ * EEVDF gives the following limit for a steady state system:
+ *
+ * -r_max < lag < max(r_max, q)
+ *
+ * XXX could add max_slice to the augmented data to track this.
+ */
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 lag, limit;
+
+ SCHED_WARN_ON(!se->on_rq);
+ lag = avg_vruntime(cfs_rq) - se->vruntime;
+
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+ se->vlag = clamp(lag, -limit, limit);
+}
+
+/*
+ * Entity is eligible once it received less service than it ought to have,
+ * eg. lag >= 0.
+ *
+ * lag_i = S - s_i = w_i*(V - v_i)
+ *
+ * lag_i >= 0 -> V >= v_i
+ *
+ * \Sum (v_i - v)*w_i
+ * V = ------------------ + v
+ * \Sum w_i
+ *
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ *
+ * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
+ * to the loss in precision caused by the division.
+ */
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ return avg >= entity_key(cfs_rq, se) * load;
+}
+
+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+ u64 min_vruntime = cfs_rq->min_vruntime;
+ /*
+ * open coded max_vruntime() to allow updating avg_vruntime
+ */
+ s64 delta = (s64)(vruntime - min_vruntime);
+ if (delta > 0) {
+ avg_vruntime_update(cfs_rq, delta);
+ min_vruntime = vruntime;
+ }
+ return min_vruntime;
+}
+
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;
- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
u64 vruntime = cfs_rq->min_vruntime;
@@ -618,9 +771,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}
- if (leftmost) { /* non-empty tree */
- struct sched_entity *se = __node_2_se(leftmost);
-
+ if (se) {
if (!curr)
vruntime = se->vruntime;
else
@@ -629,7 +780,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
/* ensure we never gain time by being placed backwards. */
u64_u32_store(cfs_rq->min_vruntime,
- max_vruntime(cfs_rq->min_vruntime, vruntime));
+ __update_min_vruntime(cfs_rq, vruntime));
}
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -637,17 +788,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
return entity_before(__node_2_se(a), __node_2_se(b));
}
+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+
+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (deadline_gt(min_deadline, se, rse))
+ se->min_deadline = rse->min_deadline;
+ }
+}
+
+/*
+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ */
+static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+{
+ u64 old_min_deadline = se->min_deadline;
+ struct rb_node *node = &se->run_node;
+
+ se->min_deadline = se->deadline;
+ __update_min_deadline(se, node->rb_right);
+ __update_min_deadline(se, node->rb_left);
+
+ return se->min_deadline == old_min_deadline;
+}
+
+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
+ run_node, min_deadline, min_deadline_update);
+
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
+ avg_vruntime_add(cfs_rq, se);
+ se->min_deadline = se->deadline;
+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ __entity_less, &min_deadline_cb);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ &min_deadline_cb);
+ avg_vruntime_sub(cfs_rq, se);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -660,14 +845,88 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
return __node_2_se(left);
}
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+/*
+ * Earliest Eligible Virtual Deadline First
+ *
+ * In order to provide latency guarantees for different request sizes
+ * EEVDF selects the best runnable task from two criteria:
+ *
+ * 1) the task must be eligible (must be owed service)
+ *
+ * 2) from those tasks that meet 1), we select the one
+ * with the earliest virtual deadline.
+ *
+ * We can do this in O(log n) time due to an augmented RB-tree. The
+ * tree keeps the entries sorted on service, but also functions as a
+ * heap based on the deadline by keeping:
+ *
+ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ *
+ * Which allows an EDF like search on (sub)trees.
+ */
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
- struct rb_node *next = rb_next(&se->run_node);
+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct sched_entity *curr = cfs_rq->curr;
+ struct sched_entity *best = NULL;
- if (!next)
- return NULL;
+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
+ curr = NULL;
+
+ /*
+ * Once selected, run a task until it either becomes non-eligible or
+ * until it gets a new slice. See the HACK in set_next_entity().
+ */
+ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ return curr;
+
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
+
+ /*
+ * If this entity is not eligible, try the left subtree.
+ */
+ if (!entity_eligible(cfs_rq, se)) {
+ node = node->rb_left;
+ continue;
+ }
+
+ /*
+ * If this entity has an earlier deadline than the previous
+ * best, take this one. If it also has the earliest deadline
+ * of its subtree, we're done.
+ */
+ if (!best || deadline_gt(deadline, best, se)) {
+ best = se;
+ if (best->deadline == best->min_deadline)
+ break;
+ }
+
+ /*
+ * If the earlest deadline in this subtree is in the fully
+ * eligible left half of our space, go there.
+ */
+ if (node->rb_left &&
+ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
+ node = node->rb_left;
+ continue;
+ }
+
+ node = node->rb_right;
+ }
- return __node_2_se(next);
+ if (!best || (curr && deadline_gt(deadline, best, curr)))
+ best = curr;
+
+ if (unlikely(!best)) {
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ if (left) {
+ pr_err("EEVDF scheduling fail, picking leftmost\n");
+ return left;
+ }
+ }
+
+ return best;
}
#ifdef CONFIG_SCHED_DEBUG
@@ -684,109 +943,51 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
/**************************************************************
* Scheduling class statistics methods:
*/
-
+#ifdef CONFIG_SMP
int sched_update_scaling(void)
{
unsigned int factor = get_update_sysctl_factor();
- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
- sysctl_sched_min_granularity);
-
#define WRT_SYSCTL(name) \
(normalized_sysctl_##name = sysctl_##name / (factor))
- WRT_SYSCTL(sched_min_granularity);
- WRT_SYSCTL(sched_latency);
- WRT_SYSCTL(sched_wakeup_granularity);
+ WRT_SYSCTL(sched_base_slice);
#undef WRT_SYSCTL
return 0;
}
#endif
+#endif
-/*
- * delta /= w
- */
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
-{
- if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
-
- return delta;
-}
-
-/*
- * The idea is to set a period in which each task runs once.
- *
- * When there are too many tasks (sched_nr_latency) we have to stretch
- * this period because otherwise the slices get too small.
- *
- * p = (nr <= nl) ? l : l*nr/nl
- */
-static u64 __sched_period(unsigned long nr_running)
-{
- if (unlikely(nr_running > sched_nr_latency))
- return nr_running * sysctl_sched_min_granularity;
- else
- return sysctl_sched_latency;
-}
-
-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
/*
- * We calculate the wall-time slice from the period by taking a part
- * proportional to the weight.
- *
- * s = p*P[w/rw]
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
+ * this is probably good enough.
*/
-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- unsigned int nr_running = cfs_rq->nr_running;
- struct sched_entity *init_se = se;
- unsigned int min_gran;
- u64 slice;
-
- if (sched_feat(ALT_PERIOD))
- nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
-
- slice = __sched_period(nr_running + !se->on_rq);
-
- for_each_sched_entity(se) {
- struct load_weight *load;
- struct load_weight lw;
- struct cfs_rq *qcfs_rq;
-
- qcfs_rq = cfs_rq_of(se);
- load = &qcfs_rq->load;
-
- if (unlikely(!se->on_rq)) {
- lw = qcfs_rq->load;
+ if ((s64)(se->vruntime - se->deadline) < 0)
+ return;
- update_load_add(&lw, se->load.weight);
- load = &lw;
- }
- slice = __calc_delta(slice, se->load.weight, load);
- }
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * sysctl_sched_base_slice.
+ */
+ se->slice = sysctl_sched_base_slice;
- if (sched_feat(BASE_SLICE)) {
- if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
- min_gran = sysctl_sched_idle_min_granularity;
- else
- min_gran = sysctl_sched_min_granularity;
+ /*
+ * EEVDF: vd_i = ve_i + r_i / w_i
+ */
+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
- slice = max_t(u64, slice, min_gran);
+ /*
+ * The task has consumed its request, reschedule.
+ */
+ if (cfs_rq->nr_running > 1) {
+ resched_curr(rq_of(cfs_rq));
+ clear_buddies(cfs_rq, se);
}
-
- return slice;
-}
-
-/*
- * We calculate the vruntime slice of a to-be-inserted task.
- *
- * vs = s/w
- */
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
#include "pelt.h"
@@ -921,6 +1122,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
@@ -1064,6 +1266,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+static inline bool is_core_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ int sibling;
+
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+ if (cpu == sibling)
+ continue;
+
+ if (!idle_cpu(sibling))
+ return false;
+ }
+#endif
+
+ return true;
+}
+
#ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2
@@ -1700,23 +1919,6 @@ struct numa_stats {
int idle_cpu;
};
-static inline bool is_core_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- int sibling;
-
- for_each_cpu(sibling, cpu_smt_mask(cpu)) {
- if (cpu == sibling)
- continue;
-
- if (!idle_cpu(sibling))
- return false;
- }
-#endif
-
- return true;
-}
-
struct task_numa_env {
struct task_struct *p;
@@ -3375,16 +3577,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
+ unsigned long old_weight = se->load.weight;
+
if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
+ else
+ avg_vruntime_sub(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
update_load_set(&se->load, weight);
+ if (!se->on_rq) {
+ /*
+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+ * we need to scale se->vlag when w_i changes.
+ */
+ se->vlag = div_s64(se->vlag * old_weight, weight);
+ } else {
+ s64 deadline = se->deadline - se->vruntime;
+ /*
+ * When the weight changes, the virtual time slope changes and
+ * we should adjust the relative virtual deadline accordingly.
+ */
+ deadline = div_s64(deadline * old_weight, weight);
+ se->deadline = se->vruntime + deadline;
+ }
+
#ifdef CONFIG_SMP
do {
u32 divider = get_pelt_divider(&se->avg);
@@ -3394,9 +3616,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
#endif
enqueue_load_avg(cfs_rq, se);
- if (se->on_rq)
+ if (se->on_rq) {
update_load_add(&cfs_rq->load, se->load.weight);
-
+ if (cfs_rq->curr != se)
+ avg_vruntime_add(cfs_rq, se);
+ }
}
void reweight_task(struct task_struct *p, int prio)
@@ -4692,159 +4916,125 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHED_DEBUG
- s64 d = se->vruntime - cfs_rq->min_vruntime;
-
- if (d < 0)
- d = -d;
-
- if (d > 3*sysctl_sched_latency)
- schedstat_inc(cfs_rq->nr_spread_over);
-#endif
-}
-
-static inline bool entity_is_long_sleeper(struct sched_entity *se)
+static void
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- struct cfs_rq *cfs_rq;
- u64 sleep_time;
-
- if (se->exec_start == 0)
- return false;
-
- cfs_rq = cfs_rq_of(se);
+ u64 vslice = calc_delta_fair(se->slice, se);
+ u64 vruntime = avg_vruntime(cfs_rq);
+ s64 lag = 0;
- sleep_time = rq_clock_task(rq_of(cfs_rq));
+ /*
+ * Due to how V is constructed as the weighted average of entities,
+ * adding tasks with positive lag, or removing tasks with negative lag
+ * will move 'time' backwards, this can screw around with the lag of
+ * other tasks.
+ *
+ * EEVDF: placement strategy #1 / #2
+ */
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
+ struct sched_entity *curr = cfs_rq->curr;
+ unsigned long load;
- /* Happen while migrating because of clock task divergence */
- if (sleep_time <= se->exec_start)
- return false;
+ lag = se->vlag;
- sleep_time -= se->exec_start;
- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
- return true;
+ /*
+ * If we want to place a task and preserve lag, we have to
+ * consider the effect of the new entity on the weighted
+ * average and compensate for this, otherwise lag can quickly
+ * evaporate.
+ *
+ * Lag is defined as:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * To avoid the 'w_i' term all over the place, we only track
+ * the virtual lag:
+ *
+ * vl_i = V - v_i <=> v_i = V - vl_i
+ *
+ * And we take V to be the weighted average of all v:
+ *
+ * V = (\Sum w_j*v_j) / W
+ *
+ * Where W is: \Sum w_j
+ *
+ * Then, the weighted average after adding an entity with lag
+ * vl_i is given by:
+ *
+ * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
+ * = (W*V + w_i*(V - vl_i)) / (W + w_i)
+ * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
+ * = (V*(W + w_i) - w_i*l) / (W + w_i)
+ * = V - w_i*vl_i / (W + w_i)
+ *
+ * And the actual lag after adding an entity with vl_i is:
+ *
+ * vl'_i = V' - v_i
+ * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
+ * = vl_i - w_i*vl_i / (W + w_i)
+ *
+ * Which is strictly less than vl_i. So in order to preserve lag
+ * we should inflate the lag before placement such that the
+ * effective lag after placement comes out right.
+ *
+ * As such, invert the above relation for vl'_i to get the vl_i
+ * we need to use such that the lag after placement is the lag
+ * we computed before dequeue.
+ *
+ * vl'_i = vl_i - w_i*vl_i / (W + w_i)
+ * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
+ *
+ * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
+ * = W*vl_i
+ *
+ * vl_i = (W + w_i)*vl'_i / W
+ */
+ load = cfs_rq->avg_load;
+ if (curr && curr->on_rq)
+ load += scale_load_down(curr->load.weight);
- return false;
-}
+ lag *= load + scale_load_down(se->load.weight);
+ if (WARN_ON_ONCE(!load))
+ load = 1;
+ lag = div_s64(lag, load);
+ }
-static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
-{
- u64 vruntime = cfs_rq->min_vruntime;
+ se->vruntime = vruntime - lag;
/*
- * The 'current' period is already promised to the current tasks,
- * however the extra weight of the new task will slow them down a
- * little, place the new task so that it fits in the slot that
- * stays open at the end.
+ * When joining the competition; the exisiting tasks will be,
+ * on average, halfway through their slice, as such start tasks
+ * off with half a slice to ease into the competition.
*/
- if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice(cfs_rq, se);
-
- /* sleeps up to a single latency don't count. */
- if (!initial) {
- unsigned long thresh;
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
+ vslice /= 2;
- if (se_is_idle(se))
- thresh = sysctl_sched_min_granularity;
- else
- thresh = sysctl_sched_latency;
-
- /*
- * Halve their sleep time's effect, to allow
- * for a gentler effect of sleepers:
- */
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
- thresh >>= 1;
-
- vruntime -= thresh;
- }
-
- /*
- * Pull vruntime of the entity being placed to the base level of
- * cfs_rq, to prevent boosting it if placed backwards.
- * However, min_vruntime can advance much faster than real time, with
- * the extreme being when an entity with the minimal weight always runs
- * on the cfs_rq. If the waking entity slept for a long time, its
- * vruntime difference from min_vruntime may overflow s64 and their
- * comparison may get inversed, so ignore the entity's original
- * vruntime in that case.
- * The maximal vruntime speedup is given by the ratio of normal to
- * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
- * When placing a migrated waking entity, its exec_start has been set
- * from a different rq. In order to take into account a possible
- * divergence between new and prev rq's clocks task because of irq and
- * stolen time, we take an additional margin.
- * So, cutting off on the sleep time of
- * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
- * should be safe.
- */
- if (entity_is_long_sleeper(se))
- se->vruntime = vruntime;
- else
- se->vruntime = max_vruntime(se->vruntime, vruntime);
+ /*
+ * EEVDF: vd_i = ve_i + r_i/w_i
+ */
+ se->deadline = se->vruntime + vslice;
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
static inline bool cfs_bandwidth_used(void);
-/*
- * MIGRATION
- *
- * dequeue
- * update_curr()
- * update_min_vruntime()
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way the vruntime transition between RQs is done when both
- * min_vruntime are up-to-date.
- *
- * WAKEUP (remote)
- *
- * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way we don't have the most up-to-date min_vruntime on the originating
- * CPU and an up-to-date min_vruntime on the destination CPU.
- */
-
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
bool curr = cfs_rq->curr == se;
/*
* If we're the current task, we must renormalise before calling
* update_curr().
*/
- if (renorm && curr)
- se->vruntime += cfs_rq->min_vruntime;
+ if (curr)
+ place_entity(cfs_rq, se, flags);
update_curr(cfs_rq);
/*
- * Otherwise, renormalise after, such that we're placed at the current
- * moment in time, instead of some random moment in the past. Being
- * placed in the past could significantly boost this task to the
- * fairness detriment of existing tasks.
- */
- if (renorm && !curr)
- se->vruntime += cfs_rq->min_vruntime;
-
- /*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
@@ -4855,37 +5045,46 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
se_update_runnable(se);
+ /*
+ * XXX update_load_avg() above will have attached us to the pelt sum;
+ * but update_cfs_group() here will re-adjust the weight and have to
+ * undo/redo all that. Seems wasteful.
+ */
update_cfs_group(se);
+
+ /*
+ * XXX now that the entity has been re-weighted, and it's lag adjusted,
+ * we can place the entity.
+ */
+ if (!curr)
+ place_entity(cfs_rq, se, flags);
+
account_entity_enqueue(cfs_rq, se);
- if (flags & ENQUEUE_WAKEUP)
- place_entity(cfs_rq, se, 0);
/* Entity has migrated, no longer consider this task hot */
if (flags & ENQUEUE_MIGRATED)
se->exec_start = 0;
check_schedstat_required();
update_stats_enqueue_fair(cfs_rq, se, flags);
- check_spread(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
if (cfs_rq->nr_running == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq))
+ if (!throttled_hierarchy(cfs_rq)) {
list_add_leaf_cfs_rq(cfs_rq);
- }
-}
-
-static void __clear_buddies_last(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last != se)
- break;
+ } else {
+#ifdef CONFIG_CFS_BANDWIDTH
+ struct rq *rq = rq_of(cfs_rq);
- cfs_rq->last = NULL;
+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+ cfs_rq->throttled_clock = rq_clock(rq);
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+#endif
+ }
}
}
@@ -4900,27 +5099,10 @@ static void __clear_buddies_next(struct sched_entity *se)
}
}
-static void __clear_buddies_skip(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip != se)
- break;
-
- cfs_rq->skip = NULL;
- }
-}
-
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->last == se)
- __clear_buddies_last(se);
-
if (cfs_rq->next == se)
__clear_buddies_next(se);
-
- if (cfs_rq->skip == se)
- __clear_buddies_skip(se);
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -4954,20 +5136,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
clear_buddies(cfs_rq, se);
+ update_entity_lag(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
- /*
- * Normalize after update_curr(); which will also have moved
- * min_vruntime if @se is the one holding it back. But before doing
- * update_min_vruntime() again, which will discount @se's position and
- * can move min_vruntime forward still more.
- */
- if (!(flags & DEQUEUE_SLEEP))
- se->vruntime -= cfs_rq->min_vruntime;
-
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
@@ -4986,52 +5160,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_idle_cfs_rq_clock_pelt(cfs_rq);
}
-/*
- * Preempt the current task with a newly woken task if needed:
- */
-static void
-check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
- unsigned long ideal_runtime, delta_exec;
- struct sched_entity *se;
- s64 delta;
-
- /*
- * When many tasks blow up the sched_period; it is possible that
- * sched_slice() reports unusually large results (when many tasks are
- * very light for example). Therefore impose a maximum.
- */
- ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
-
- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime) {
- resched_curr(rq_of(cfs_rq));
- /*
- * The current task ran long enough, ensure it doesn't get
- * re-elected due to buddy favours.
- */
- clear_buddies(cfs_rq, curr);
- return;
- }
-
- /*
- * Ensure that a task that missed wakeup preemption by a
- * narrow margin doesn't have to wait for a full slice.
- * This also mitigates buddy induced latencies under load.
- */
- if (delta_exec < sysctl_sched_min_granularity)
- return;
-
- se = __pick_first_entity(cfs_rq);
- delta = curr->vruntime - se->vruntime;
-
- if (delta < 0)
- return;
-
- if (delta > ideal_runtime)
- resched_curr(rq_of(cfs_rq));
-}
-
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -5047,6 +5175,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
+ /*
+ * HACK, stash a copy of deadline at the point of pick in vlag,
+ * which isn't used until dequeue.
+ */
+ se->vlag = se->deadline;
}
update_stats_curr_start(cfs_rq, se);
@@ -5070,9 +5203,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
@@ -5083,50 +5213,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- struct sched_entity *left = __pick_first_entity(cfs_rq);
- struct sched_entity *se;
-
/*
- * If curr is set we have to see if its left of the leftmost entity
- * still in the tree, provided there was anything in the tree at all.
+ * Enabling NEXT_BUDDY will affect latency but not fairness.
*/
- if (!left || (curr && entity_before(curr, left)))
- left = curr;
-
- se = left; /* ideally we run the leftmost entity */
-
- /*
- * Avoid running the skip buddy, if running something else can
- * be done without getting too unfair.
- */
- if (cfs_rq->skip && cfs_rq->skip == se) {
- struct sched_entity *second;
-
- if (se == curr) {
- second = __pick_first_entity(cfs_rq);
- } else {
- second = __pick_next_entity(se);
- if (!second || (curr && entity_before(curr, second)))
- second = curr;
- }
-
- if (second && wakeup_preempt_entity(second, left) < 1)
- se = second;
- }
-
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
- /*
- * Someone really wants this to run. If it's not unfair, run it.
- */
- se = cfs_rq->next;
- } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
- /*
- * Prefer last buddy, try to return the CPU to a preempted task.
- */
- se = cfs_rq->last;
- }
+ if (sched_feat(NEXT_BUDDY) &&
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+ return cfs_rq->next;
- return se;
+ return pick_eevdf(cfs_rq);
}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -5143,8 +5237,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- check_spread(cfs_rq, prev);
-
if (prev->on_rq) {
update_stats_wait_start_fair(cfs_rq, prev);
/* Put 'current' back into the tree. */
@@ -5185,9 +5277,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
-
- if (cfs_rq->nr_running > 1)
- check_preempt_tick(cfs_rq, curr);
}
@@ -5377,6 +5466,17 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
/* Add cfs_rq with load or one or more already running entities to the list */
if (!cfs_rq_is_decayed(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
+
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+
+ cfs_rq->throttled_clock_self = 0;
+
+ if (SCHED_WARN_ON((s64)delta < 0))
+ delta = 0;
+
+ cfs_rq->throttled_clock_self_time += delta;
+ }
}
return 0;
@@ -5391,6 +5491,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
if (!cfs_rq->throttle_count) {
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
+
+ SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_running)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
}
cfs_rq->throttle_count++;
@@ -5480,7 +5584,9 @@ done:
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq_clock(rq);
+ SCHED_WARN_ON(cfs_rq->throttled_clock);
+ if (cfs_rq->nr_running)
+ cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
@@ -5498,7 +5604,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
update_rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ if (cfs_rq->throttled_clock) {
+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ cfs_rq->throttled_clock = 0;
+ }
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
@@ -5577,6 +5686,14 @@ static void __cfsb_csd_unthrottle(void *arg)
rq_lock(rq, &rf);
/*
+ * Iterating over the list can trigger several call to
+ * update_rq_clock() in unthrottle_cfs_rq().
+ * Do it once and skip the potential next ones.
+ */
+ update_rq_clock(rq);
+ rq_clock_start_loop_update(rq);
+
+ /*
* Since we hold rq lock we're safe from concurrent manipulation of
* the CSD list. However, this RCU critical section annotates the
* fact that we pair with sched_free_group_rcu(), so that we cannot
@@ -5595,6 +5712,7 @@ static void __cfsb_csd_unthrottle(void *arg)
rcu_read_unlock();
+ rq_clock_stop_loop_update(rq);
rq_unlock(rq, &rf);
}
@@ -6005,13 +6123,14 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
{
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
cfs_b->burst = 0;
+ cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
@@ -6115,6 +6234,13 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
lockdep_assert_rq_held(rq);
+ /*
+ * The rq clock has already been updated in the
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
+
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
@@ -6137,7 +6263,49 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
+}
+
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+ if (!cfs_bandwidth_used())
+ return false;
+
+ if (cfs_rq->runtime_enabled ||
+ tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/* called from pick_next_task_fair() */
+static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq);
+
+ if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
+ return;
+
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ if (rq->nr_running != 1)
+ return;
+
+ /*
+ * We know there is only one task runnable and we've just picked it. The
+ * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will
+ * be otherwise able to stop the tick. Just need to check if we are using
+ * bandwidth control.
+ */
+ if (cfs_task_bw_constrained(p))
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
+#endif
#else /* CONFIG_CFS_BANDWIDTH */
@@ -6168,9 +6336,8 @@ static inline int throttled_lb_pair(struct task_group *tg,
return 0;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif
@@ -6181,9 +6348,18 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
-
+#ifdef CONFIG_CGROUP_SCHED
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+ return false;
+}
+#endif
#endif /* CONFIG_CFS_BANDWIDTH */
+#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
+static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
+#endif
+
/**************************************************
* CFS operations on tasks:
*/
@@ -6192,13 +6368,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
SCHED_WARN_ON(task_rq(p) != rq);
if (rq->cfs.h_nr_running > 1) {
- u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
s64 delta = slice - ran;
if (delta < 0) {
@@ -6222,8 +6397,7 @@ static void hrtick_update(struct rq *rq)
if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
return;
- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
- hrtick_start_fair(rq, curr);
+ hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
@@ -6264,17 +6438,6 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
-/*
- * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
- * of idle_nr_running, which does not consider idle descendants of normal
- * entities.
- */
-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->nr_running &&
- cfs_rq->nr_running == cfs_rq->idle_nr_running;
-}
-
#ifdef CONFIG_SMP
static int sched_idle_cpu(int cpu)
{
@@ -7047,7 +7210,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
util_min = uclamp_eff_value(p, UCLAMP_MIN);
util_max = uclamp_eff_value(p, UCLAMP_MAX);
- for_each_cpu_wrap(cpu, cpus, target + 1) {
+ for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
@@ -7156,7 +7319,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
- cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
+ cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
return recent_used_cpu;
}
@@ -7202,14 +7365,58 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
-/*
- * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
- * (@dst_cpu = -1) or migrated to @dst_cpu.
- */
-static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+/**
+ * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
+ * @cpu: the CPU to get the utilization for
+ * @p: task for which the CPU utilization should be predicted or NULL
+ * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
+ * @boost: 1 to enable boosting, otherwise 0
+ *
+ * The unit of the return value must be the same as the one of CPU capacity
+ * so that CPU utilization can be compared with CPU capacity.
+ *
+ * CPU utilization is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on that CPU.
+ * It represents the amount of CPU capacity currently used by CFS tasks in
+ * the range [0..max CPU capacity] with max CPU capacity being the CPU
+ * capacity at f_max.
+ *
+ * The estimated CPU utilization is defined as the maximum between CPU
+ * utilization and sum of the estimated utilization of the currently
+ * runnable tasks on that CPU. It preserves a utilization "snapshot" of
+ * previously-executed tasks, which helps better deduce how busy a CPU will
+ * be when a long-sleeping task wakes up. The contribution to CPU utilization
+ * of such a task would be significantly decayed at this point of time.
+ *
+ * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
+ * CPU contention for CFS tasks can be detected by CPU runnable > CPU
+ * utilization. Boosting is implemented in cpu_util() so that internal
+ * users (e.g. EAS) can use it next to external users (e.g. schedutil),
+ * latter via cpu_util_cfs_boost().
+ *
+ * CPU utilization can be higher than the current CPU capacity
+ * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
+ * of rounding errors as well as task migrations or wakeups of new tasks.
+ * CPU utilization has to be capped to fit into the [0..max CPU capacity]
+ * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
+ * could be seen as over-utilized even though CPU1 has 20% of spare CPU
+ * capacity. CPU utilization is allowed to overshoot current CPU capacity
+ * though since this is useful for predicting the CPU capacity required
+ * after task migrations (scheduler-driven DVFS).
+ *
+ * Return: (Boosted) (estimated) utilization for the specified CPU.
+ */
+static unsigned long
+cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
+ unsigned long runnable;
+
+ if (boost) {
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
+ util = max(util, runnable);
+ }
/*
* If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
@@ -7217,9 +7424,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
* contribution. In all the other cases @cpu is not impacted by the
* migration so its util_avg is already correct.
*/
- if (task_cpu(p) == cpu && dst_cpu != cpu)
+ if (p && task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util, task_util(p));
- else if (task_cpu(p) != cpu && dst_cpu == cpu)
+ else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) {
@@ -7255,7 +7462,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
- else if (unlikely(task_on_rq_queued(p) || current == p))
+ else if (p && unlikely(task_on_rq_queued(p) || current == p))
lsub_positive(&util_est, _task_util_est(p));
util = max(util, util_est);
@@ -7264,6 +7471,16 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
return min(util, capacity_orig_of(cpu));
}
+unsigned long cpu_util_cfs(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 0);
+}
+
+unsigned long cpu_util_cfs_boost(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 1);
+}
+
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested
@@ -7281,9 +7498,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- return cpu_util_cfs(cpu);
+ p = NULL;
- return cpu_util_next(cpu, p, -1);
+ return cpu_util(cpu, p, -1, 0);
}
/*
@@ -7330,7 +7547,7 @@ static inline void eenv_task_busy_time(struct energy_env *eenv,
* cpu_capacity.
*
* The contribution of the task @p for which we want to estimate the
- * energy cost is removed (by cpu_util_next()) and must be calculated
+ * energy cost is removed (by cpu_util()) and must be calculated
* separately (see eenv_task_busy_time). This ensures:
*
* - A stable PD utilization, no matter which CPU of that PD we want to place
@@ -7351,7 +7568,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
int cpu;
for_each_cpu(cpu, pd_cpus) {
- unsigned long util = cpu_util_next(cpu, p, -1);
+ unsigned long util = cpu_util(cpu, p, -1, 0);
busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
}
@@ -7375,8 +7592,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
for_each_cpu(cpu, pd_cpus) {
struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
- unsigned long util = cpu_util_next(cpu, p, dst_cpu);
- unsigned long cpu_util;
+ unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
+ unsigned long eff_util;
/*
* Performance domain frequency: utilization clamping
@@ -7385,8 +7602,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
- max_util = max(max_util, cpu_util);
+ eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+ max_util = max(max_util, eff_util);
}
return min(max_util, eenv->cpu_cap);
@@ -7521,7 +7738,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- util = cpu_util_next(cpu, p, cpu);
+ util = cpu_util(cpu, p, cpu, 0);
cpu_cap = capacity_of(cpu);
/*
@@ -7666,6 +7883,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
if (wake_flags & WF_TTWU) {
record_wakee(p);
+ if ((wake_flags & WF_CURRENT_CPU) &&
+ cpumask_test_cpu(cpu, p->cpus_ptr))
+ return cpu;
+
if (sched_energy_enabled()) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu);
if (new_cpu >= 0)
@@ -7723,18 +7944,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
struct sched_entity *se = &p->se;
- /*
- * As blocked tasks retain absolute vruntime the migration needs to
- * deal with this by subtracting the old and adding the new
- * min_vruntime -- the latter is done by enqueue_entity() when placing
- * the task on the new runqueue.
- */
- if (READ_ONCE(p->__state) == TASK_WAKING) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
- }
-
if (!task_on_rq_migrating(p)) {
remove_entity_load_avg(se);
@@ -7772,66 +7981,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
#endif /* CONFIG_SMP */
-static unsigned long wakeup_gran(struct sched_entity *se)
-{
- unsigned long gran = sysctl_sched_wakeup_granularity;
-
- /*
- * Since its curr running now, convert the gran from real-time
- * to virtual-time in his units.
- *
- * By using 'se' instead of 'curr' we penalize light tasks, so
- * they get preempted easier. That is, if 'se' < 'curr' then
- * the resulting gran will be larger, therefore penalizing the
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
- * be smaller, again penalizing the lighter task.
- *
- * This is especially important for buddies when the leftmost
- * task is higher priority than the buddy.
- */
- return calc_delta_fair(gran, se);
-}
-
-/*
- * Should 'se' preempt 'curr'.
- *
- * |s1
- * |s2
- * |s3
- * g
- * |<--->|c
- *
- * w(c, s1) = -1
- * w(c, s2) = 0
- * w(c, s3) = 1
- *
- */
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-{
- s64 gran, vdiff = curr->vruntime - se->vruntime;
-
- if (vdiff <= 0)
- return -1;
-
- gran = wakeup_gran(se);
- if (vdiff > gran)
- return 1;
-
- return 0;
-}
-
-static void set_last_buddy(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- if (SCHED_WARN_ON(!se->on_rq))
- return;
- if (se_is_idle(se))
- return;
- cfs_rq_of(se)->last = se;
- }
-}
-
static void set_next_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) {
@@ -7843,12 +7992,6 @@ static void set_next_buddy(struct sched_entity *se)
}
}
-static void set_skip_buddy(struct sched_entity *se)
-{
- for_each_sched_entity(se)
- cfs_rq_of(se)->skip = se;
-}
-
/*
* Preempt the current task with a newly woken task if needed:
*/
@@ -7857,7 +8000,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0;
int cse_is_idle, pse_is_idle;
@@ -7873,7 +8015,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
return;
- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
+ if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
set_next_buddy(pse);
next_buddy_marked = 1;
}
@@ -7918,35 +8060,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (cse_is_idle != pse_is_idle)
return;
- update_curr(cfs_rq_of(se));
- if (wakeup_preempt_entity(se, pse) == 1) {
- /*
- * Bias pick_next to pick the sched entity that is
- * triggering this preemption.
- */
- if (!next_buddy_marked)
- set_next_buddy(pse);
+ cfs_rq = cfs_rq_of(se);
+ update_curr(cfs_rq);
+
+ /*
+ * XXX pick_eevdf(cfs_rq) != se ?
+ */
+ if (pick_eevdf(cfs_rq) == pse)
goto preempt;
- }
return;
preempt:
resched_curr(rq);
- /*
- * Only set the backward buddy when the current task is still
- * on the rq. This can happen when a wakeup gets interleaved
- * with schedule on the ->pre_schedule() or idle_balance()
- * point, either of which can * drop the rq lock.
- *
- * Also, during early boot the idle thread is in the fair class,
- * for obvious reasons its a bad idea to schedule back to it.
- */
- if (unlikely(!se->on_rq || curr == rq->idle))
- return;
-
- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
- set_last_buddy(se);
}
#ifdef CONFIG_SMP
@@ -8097,6 +8223,7 @@ done: __maybe_unused;
hrtick_start_fair(rq, p);
update_misfit_status(p, rq);
+ sched_fair_update_stop_tick(rq, p);
return p;
@@ -8147,8 +8274,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
/*
* sched_yield() is very simple
- *
- * The magic of dealing with the ->skip buddy is in pick_next_entity.
*/
static void yield_task_fair(struct rq *rq)
{
@@ -8164,21 +8289,19 @@ static void yield_task_fair(struct rq *rq)
clear_buddies(cfs_rq, se);
- if (curr->policy != SCHED_BATCH) {
- update_rq_clock(rq);
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
- /*
- * Tell update_rq_clock() that we've just updated,
- * so we don't do microscopic update in schedule()
- * and double the fastpath cost.
- */
- rq_clock_skip_update(rq);
- }
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq);
- set_skip_buddy(se);
+ se->deadline += calc_delta_fair(se->slice, se);
}
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
@@ -8341,6 +8464,11 @@ enum group_type {
*/
group_misfit_task,
/*
+ * Balance SMT group that's fully busy. Can benefit from migration
+ * a task on SMT with busy sibling to another CPU on idle core.
+ */
+ group_smt_balance,
+ /*
* SD_ASYM_PACKING only: One local CPU with higher capacity is available,
* and the task should be migrated to it instead of running on the
* current CPU.
@@ -8421,8 +8549,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
* Buddy candidates are cache hot:
*/
if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
- (&p->se == cfs_rq_of(&p->se)->next ||
- &p->se == cfs_rq_of(&p->se)->last))
+ (&p->se == cfs_rq_of(&p->se)->next))
return 1;
if (sysctl_sched_migration_cost == -1)
@@ -9048,6 +9175,7 @@ struct sg_lb_stats {
unsigned int group_weight;
enum group_type group_type;
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
+ unsigned int group_smt_balance; /* Task on busy SMT be moved */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -9321,6 +9449,9 @@ group_type group_classify(unsigned int imbalance_pct,
if (sgs->group_asym_packing)
return group_asym_packing;
+ if (sgs->group_smt_balance)
+ return group_smt_balance;
+
if (sgs->group_misfit_task_load)
return group_misfit_task;
@@ -9331,98 +9462,128 @@ group_type group_classify(unsigned int imbalance_pct,
}
/**
- * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
- * @dst_cpu: Destination CPU of the load balancing
+ * sched_use_asym_prio - Check whether asym_packing priority must be used
+ * @sd: The scheduling domain of the load balancing
+ * @cpu: A CPU
+ *
+ * Always use CPU priority when balancing load between SMT siblings. When
+ * balancing load between cores, it is not sufficient that @cpu is idle. Only
+ * use CPU priority if the whole core is idle.
+ *
+ * Returns: True if the priority of @cpu must be followed. False otherwise.
+ */
+static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
+{
+ if (!sched_smt_active())
+ return true;
+
+ return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
+}
+
+/**
+ * sched_asym - Check if the destination CPU can do asym_packing load balance
+ * @env: The load balancing environment
* @sds: Load-balancing data with statistics of the local group
* @sgs: Load-balancing statistics of the candidate busiest group
- * @sg: The candidate busiest group
+ * @group: The candidate busiest group
*
- * Check the state of the SMT siblings of both @sds::local and @sg and decide
- * if @dst_cpu can pull tasks.
+ * @env::dst_cpu can do asym_packing if it has higher priority than the
+ * preferred CPU of @group.
*
- * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
- * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
- * only if @dst_cpu has higher priority.
+ * SMT is a special case. If we are balancing load between cores, @env::dst_cpu
+ * can do asym_packing balance only if all its SMT siblings are idle. Also, it
+ * can only do it if @group is an SMT group and has exactly on busy CPU. Larger
+ * imbalances in the number of CPUS are dealt with in find_busiest_group().
*
- * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
- * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
- * Bigger imbalances in the number of busy CPUs will be dealt with in
- * update_sd_pick_busiest().
+ * If we are balancing load within an SMT core, or at DIE domain level, always
+ * proceed.
*
- * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
- * of @dst_cpu are idle and @sg has lower priority.
- *
- * Return: true if @dst_cpu can pull tasks, false otherwise.
+ * Return: true if @env::dst_cpu can do with asym_packing load balance. False
+ * otherwise.
*/
-static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
- struct sg_lb_stats *sgs,
- struct sched_group *sg)
+static inline bool
+sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
+ struct sched_group *group)
{
-#ifdef CONFIG_SCHED_SMT
- bool local_is_smt, sg_is_smt;
- int sg_busy_cpus;
-
- local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
- sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
-
- sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
-
- if (!local_is_smt) {
- /*
- * If we are here, @dst_cpu is idle and does not have SMT
- * siblings. Pull tasks if candidate group has two or more
- * busy CPUs.
- */
- if (sg_busy_cpus >= 2) /* implies sg_is_smt */
- return true;
+ /* Ensure that the whole local core is idle, if applicable. */
+ if (!sched_use_asym_prio(env->sd, env->dst_cpu))
+ return false;
- /*
- * @dst_cpu does not have SMT siblings. @sg may have SMT
- * siblings and only one is busy. In such case, @dst_cpu
- * can help if it has higher priority and is idle (i.e.,
- * it has no running tasks).
- */
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
+ /*
+ * CPU priorities does not make sense for SMT cores with more than one
+ * busy sibling.
+ */
+ if (group->flags & SD_SHARE_CPUCAPACITY) {
+ if (sgs->group_weight - sgs->idle_cpus != 1)
+ return false;
}
- /* @dst_cpu has SMT siblings. */
+ return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
+}
- if (sg_is_smt) {
- int local_busy_cpus = sds->local->group_weight -
- sds->local_stat.idle_cpus;
- int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
+/* One group has more than one SMT CPU while the other group does not */
+static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
+ struct sched_group *sg2)
+{
+ if (!sg1 || !sg2)
+ return false;
- if (busy_cpus_delta == 1)
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
+ return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
+ (sg2->flags & SD_SHARE_CPUCAPACITY);
+}
+static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ if (env->idle == CPU_NOT_IDLE)
return false;
- }
/*
- * @sg does not have SMT siblings. Ensure that @sds::local does not end
- * up with more than one busy SMT sibling and only pull tasks if there
- * are not busy CPUs (i.e., no CPU has running tasks).
+ * For SMT source group, it is better to move a task
+ * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
+ * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
+ * will not be on.
*/
- if (!sds->local_stat.sum_nr_running)
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
+ if (group->flags & SD_SHARE_CPUCAPACITY &&
+ sgs->sum_h_nr_running > 1)
+ return true;
return false;
-#else
- /* Always return false so that callers deal with non-SMT cases. */
- return false;
-#endif
}
-static inline bool
-sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
- struct sched_group *group)
+static inline long sibling_imbalance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *local)
{
- /* Only do SMT checks if either local or candidate have SMT siblings */
- if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
- (group->flags & SD_SHARE_CPUCAPACITY))
- return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
+ int ncores_busiest, ncores_local;
+ long imbalance;
- return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
+ if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
+ return 0;
+
+ ncores_busiest = sds->busiest->cores;
+ ncores_local = sds->local->cores;
+
+ if (ncores_busiest == ncores_local) {
+ imbalance = busiest->sum_nr_running;
+ lsub_positive(&imbalance, local->sum_nr_running);
+ return imbalance;
+ }
+
+ /* Balance such that nr_running/ncores ratio are same on both groups */
+ imbalance = ncores_local * busiest->sum_nr_running;
+ lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
+ /* Normalize imbalance and do rounding on normalization */
+ imbalance = 2 * imbalance + ncores_local + ncores_busiest;
+ imbalance /= ncores_local + ncores_busiest;
+
+ /* Take advantage of resource in an empty sched group */
+ if (imbalance == 0 && local->sum_nr_running == 0 &&
+ busiest->sum_nr_running > 1)
+ imbalance = 2;
+
+ return imbalance;
}
static inline bool
@@ -9517,6 +9678,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_asym_packing = 1;
}
+ /* Check for loaded SMT group to be balanced to dst CPU */
+ if (!local_group && smt_balance(env, sgs, group))
+ sgs->group_smt_balance = 1;
+
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
/* Computing avg_load makes sense only when group is overloaded */
@@ -9601,6 +9766,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
return false;
break;
+ case group_smt_balance:
case group_fully_busy:
/*
* Select the fully busy group with highest avg_load. In
@@ -9610,14 +9776,38 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* contention when accessing shared HW resources.
*
* XXX for now avg_load is not computed and always 0 so we
- * select the 1st one.
+ * select the 1st one, except if @sg is composed of SMT
+ * siblings.
*/
- if (sgs->avg_load <= busiest->avg_load)
+
+ if (sgs->avg_load < busiest->avg_load)
return false;
+
+ if (sgs->avg_load == busiest->avg_load) {
+ /*
+ * SMT sched groups need more help than non-SMT groups.
+ * If @sg happens to also be SMT, either choice is good.
+ */
+ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
+ return false;
+ }
+
break;
case group_has_spare:
/*
+ * Do not pick sg with SMT CPUs over sg with pure CPUs,
+ * as we do not want to pull task off SMT core with one task
+ * and make the core idle.
+ */
+ if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
+ if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
+ return false;
+ else
+ return true;
+ }
+
+ /*
* Select not overloaded group with lowest number of idle cpus
* and highest number of running tasks. We could also compare
* the spare capacity which is more stable but it can end up
@@ -9813,6 +10003,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
case group_imbalanced:
case group_asym_packing:
+ case group_smt_balance:
/* Those types are not used in the slow wakeup path */
return false;
@@ -9944,6 +10135,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
case group_imbalanced:
case group_asym_packing:
+ case group_smt_balance:
/* Those type are not used in the slow wakeup path */
return NULL;
@@ -10088,7 +10280,6 @@ static void update_idle_cpu_scan(struct lb_env *env,
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
@@ -10129,8 +10320,13 @@ next_group:
sg = sg->next;
} while (sg != env->sd->groups);
- /* Tag domain that child domain prefers tasks go to siblings first */
- sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+ /*
+ * Indicate that the child domain of the busiest group prefers tasks
+ * go to a child's sibling domains first. NB the flags of a sched group
+ * are those of the child domain.
+ */
+ if (sds->busiest)
+ sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
if (env->sd->flags & SD_NUMA)
@@ -10194,6 +10390,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
return;
}
+ if (busiest->group_type == group_smt_balance) {
+ /* Reduce number of tasks sharing CPU capacity */
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ return;
+ }
+
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
@@ -10241,14 +10444,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
if (busiest->group_weight == 1 || sds->prefer_sibling) {
- unsigned int nr_diff = busiest->sum_nr_running;
/*
* When prefer sibling, evenly spread running tasks on
* groups.
*/
env->migration_type = migrate_task;
- lsub_positive(&nr_diff, local->sum_nr_running);
- env->imbalance = nr_diff;
+ env->imbalance = sibling_imbalance(env, sds, busiest, local);
} else {
/*
@@ -10440,22 +10641,32 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto out_balanced;
}
- /* Try to move all excess tasks to child's sibling domain */
+ /*
+ * Try to move all excess tasks to a sibling domain of the busiest
+ * group's child domain.
+ */
if (sds.prefer_sibling && local->group_type == group_has_spare &&
- busiest->sum_nr_running > local->sum_nr_running + 1)
+ sibling_imbalance(env, &sds, busiest, local) > 1)
goto force_balance;
if (busiest->group_type != group_overloaded) {
- if (env->idle == CPU_NOT_IDLE)
+ if (env->idle == CPU_NOT_IDLE) {
/*
* If the busiest group is not overloaded (and as a
* result the local one too) but this CPU is already
* busy, let another idle CPU try to pull task.
*/
goto out_balanced;
+ }
+
+ if (busiest->group_type == group_smt_balance &&
+ smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
+ /* Let non SMT CPU pull from SMT CPU sharing with sibling */
+ goto force_balance;
+ }
if (busiest->group_weight > 1 &&
- local->idle_cpus <= (busiest->idle_cpus + 1))
+ local->idle_cpus <= (busiest->idle_cpus + 1)) {
/*
* If the busiest group is not overloaded
* and there is no imbalance between this and busiest
@@ -10466,12 +10677,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* there is more than 1 CPU per group.
*/
goto out_balanced;
+ }
- if (busiest->sum_h_nr_running == 1)
+ if (busiest->sum_h_nr_running == 1) {
/*
* busiest doesn't have any tasks waiting to run
*/
goto out_balanced;
+ }
}
force_balance:
@@ -10542,8 +10755,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
nr_running == 1)
continue;
- /* Make sure we only pull tasks from a CPU of lower priority */
+ /*
+ * Make sure we only pull tasks from a CPU of lower priority
+ * when balancing between SMT siblings.
+ *
+ * If balancing between cores, let lower priority CPUs help
+ * SMT cores with more than one busy sibling.
+ */
if ((env->sd->flags & SD_ASYM_PACKING) &&
+ sched_use_asym_prio(env->sd, i) &&
sched_asym_prefer(i, env->dst_cpu) &&
nr_running == 1)
continue;
@@ -10581,7 +10801,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;
case migrate_util:
- util = cpu_util_cfs(i);
+ util = cpu_util_cfs_boost(i);
/*
* Don't try to pull utilization from a CPU with one
@@ -10632,12 +10852,19 @@ static inline bool
asym_active_balance(struct lb_env *env)
{
/*
- * ASYM_PACKING needs to force migrate tasks from busy but
- * lower priority CPUs in order to pack all tasks in the
- * highest priority CPUs.
+ * ASYM_PACKING needs to force migrate tasks from busy but lower
+ * priority CPUs in order to pack all tasks in the highest priority
+ * CPUs. When done between cores, do it only if the whole core if the
+ * whole core is idle.
+ *
+ * If @env::src_cpu is an SMT core with busy siblings, let
+ * the lower priority @env::dst_cpu help it. Do not follow
+ * CPU priority.
*/
return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
- sched_asym_prefer(env->dst_cpu, env->src_cpu);
+ sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+ !sched_use_asym_prio(env->sd, env->src_cpu));
}
static inline bool
@@ -10691,7 +10918,7 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
- int cpu;
+ int cpu, idle_smt = -1;
/*
* Ensure the balancing environment is consistent; can happen
@@ -10718,10 +10945,24 @@ static int should_we_balance(struct lb_env *env)
if (!idle_cpu(cpu))
continue;
+ /*
+ * Don't balance to idle SMT in busy core right away when
+ * balancing cores, but remember the first idle SMT CPU for
+ * later consideration. Find CPU on an idle core first.
+ */
+ if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
+ if (idle_smt == -1)
+ idle_smt = cpu;
+ continue;
+ }
+
/* Are we the first idle CPU? */
return cpu == env->dst_cpu;
}
+ if (idle_smt == env->dst_cpu)
+ return true;
+
/* Are we the first CPU of this group ? */
return group_balance_cpu(sg) == env->dst_cpu;
}
@@ -10744,7 +10985,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
- .dst_grpmask = sched_group_span(sd->groups),
+ .dst_grpmask = group_balance_mask(sd->groups),
.idle = idle,
.loop_break = SCHED_NR_MIGRATE_BREAK,
.cpus = cpus,
@@ -11371,9 +11612,13 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_PACKING; see if there's a more preferred CPU
* currently idle; in which case, kick the ILB to move tasks
* around.
+ *
+ * When balancing betwen cores, all the SMT siblings of the
+ * preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
- if (sched_asym_prefer(i, cpu)) {
+ if (sched_use_asym_prio(sd, i) &&
+ sched_asym_prefer(i, cpu)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -11930,8 +12175,8 @@ static void rq_offline_fair(struct rq *rq)
static inline bool
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
{
- u64 slice = sched_slice(cfs_rq_of(se), se);
u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
return (rtime * min_nr_tasks > slice);
}
@@ -12087,8 +12332,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
*/
static void task_fork_fair(struct task_struct *p)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
+ struct cfs_rq *cfs_rq;
struct rq *rq = this_rq();
struct rq_flags rf;
@@ -12097,22 +12342,9 @@ static void task_fork_fair(struct task_struct *p)
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (curr) {
+ if (curr)
update_curr(cfs_rq);
- se->vruntime = curr->vruntime;
- }
- place_entity(cfs_rq, se, 1);
-
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
- /*
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
- }
-
- se->vruntime -= cfs_rq->min_vruntime;
+ place_entity(cfs_rq, se, ENQUEUE_INITIAL);
rq_unlock(rq, &rf);
}
@@ -12141,34 +12373,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
check_preempt_curr(rq, p, 0);
}
-static inline bool vruntime_normalized(struct task_struct *p)
-{
- struct sched_entity *se = &p->se;
-
- /*
- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
- * the dequeue_entity(.flags=0) will already have normalized the
- * vruntime.
- */
- if (p->on_rq)
- return true;
-
- /*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
- * But there are some cases where it has already been normalized:
- *
- * - A forked child which is waiting for being woken up by
- * wake_up_new_task().
- * - A task which has been woken up by try_to_wake_up() and
- * waiting for actually being woken up by sched_ttwu_pending().
- */
- if (!se->sum_exec_runtime ||
- (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
- return true;
-
- return false;
-}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Propagate the changes of the sched_entity across the tg tree to make it
@@ -12239,16 +12443,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- if (!vruntime_normalized(p)) {
- /*
- * Fix up our vruntime so that the current sleep doesn't
- * cause 'unlimited' sleep bonus.
- */
- place_entity(cfs_rq, se, 0);
- se->vruntime -= cfs_rq->min_vruntime;
- }
detach_entity_cfs_rq(se);
}
@@ -12256,12 +12450,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
attach_entity_cfs_rq(se);
-
- if (!vruntime_normalized(p))
- se->vruntime += cfs_rq->min_vruntime;
}
static void switched_from_fair(struct rq *rq, struct task_struct *p)
@@ -12373,7 +12563,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
tg->shares = NICE_0_LOAD;
- init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+ init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
for_each_possible_cpu(i) {
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
@@ -12626,7 +12816,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+ rr_interval = NS_TO_JIFFIES(se->slice);
return rr_interval;
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee7f23c76bd3..f770168230ae 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -1,16 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Only give sleepers 50% of their service deficit. This allows
- * them to run sooner, but does not allow tons of sleepers to
- * rip the spread apart.
- */
-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
/*
- * Place new tasks ahead so that they do not starve already running
- * tasks
+ * Using the avg_vruntime, do the right thing and preserve lag across
+ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
-SCHED_FEAT(START_DEBIT, true)
+SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
+SCHED_FEAT(RUN_TO_PARITY, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
@@ -20,13 +16,6 @@ SCHED_FEAT(START_DEBIT, true)
SCHED_FEAT(NEXT_BUDDY, false)
/*
- * Prefer to schedule the task that ran last (when we did
- * wake-preempt) as that likely will touch the same data, increases
- * cache locality.
- */
-SCHED_FEAT(LAST_BUDDY, true)
-
-/*
* Consider buddies to be cache hot, decreases the likeliness of a
* cache buddy being migrated away, increases cache locality.
*/
@@ -99,5 +88,4 @@ SCHED_FEAT(UTIL_EST_FASTUP, true)
SCHED_FEAT(LATENCY_WARN, false)
-SCHED_FEAT(ALT_PERIOD, true)
-SCHED_FEAT(BASE_SLICE, true)
+SCHED_FEAT(HZ_BW, true)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index e072f6b31bf3..1d0f634725a6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -140,7 +140,7 @@
static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
-DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
+static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
@@ -160,7 +160,6 @@ __setup("psi=", setup_psi);
#define EXP_300s 2034 /* 1/exp(2s/300s) */
/* PSI trigger definitions */
-#define WINDOW_MIN_US 500000 /* Min window size is 500ms */
#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
@@ -494,8 +493,12 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
continue;
/* Generate an event */
- if (cmpxchg(&t->event, 0, 1) == 0)
- wake_up_interruptible(&t->event_wait);
+ if (cmpxchg(&t->event, 0, 1) == 0) {
+ if (t->of)
+ kernfs_notify(t->of->kn);
+ else
+ wake_up_interruptible(&t->event_wait);
+ }
t->last_event_time = now;
/* Reset threshold breach flag once event got generated */
t->pending_event = false;
@@ -1272,8 +1275,9 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
return 0;
}
-struct psi_trigger *psi_trigger_create(struct psi_group *group,
- char *buf, enum psi_res res, struct file *file)
+struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
+ enum psi_res res, struct file *file,
+ struct kernfs_open_file *of)
{
struct psi_trigger *t;
enum psi_states state;
@@ -1305,8 +1309,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
if (state >= PSI_NONIDLE)
return ERR_PTR(-EINVAL);
- if (window_us < WINDOW_MIN_US ||
- window_us > WINDOW_MAX_US)
+ if (window_us == 0 || window_us > WINDOW_MAX_US)
return ERR_PTR(-EINVAL);
/*
@@ -1333,7 +1336,9 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->event = 0;
t->last_event_time = 0;
- init_waitqueue_head(&t->event_wait);
+ t->of = of;
+ if (!of)
+ init_waitqueue_head(&t->event_wait);
t->pending_event = false;
t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
@@ -1390,7 +1395,10 @@ void psi_trigger_destroy(struct psi_trigger *t)
* being accessed later. Can happen if cgroup is deleted from under a
* polling process.
*/
- wake_up_pollfree(&t->event_wait);
+ if (t->of)
+ kernfs_notify(t->of->kn);
+ else
+ wake_up_interruptible(&t->event_wait);
if (t->aggregator == PSI_AVGS) {
mutex_lock(&group->avgs_lock);
@@ -1409,11 +1417,16 @@ void psi_trigger_destroy(struct psi_trigger *t)
group->rtpoll_nr_triggers[t->state]--;
if (!group->rtpoll_nr_triggers[t->state])
group->rtpoll_states &= ~(1 << t->state);
- /* reset min update period for the remaining triggers */
- list_for_each_entry(tmp, &group->rtpoll_triggers, node)
- period = min(period, div_u64(tmp->win.size,
- UPDATES_PER_WINDOW));
- group->rtpoll_min_period = period;
+ /*
+ * Reset min update period for the remaining triggers
+ * iff the destroying trigger had the min window size.
+ */
+ if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) {
+ list_for_each_entry(tmp, &group->rtpoll_triggers, node)
+ period = min(period, div_u64(tmp->win.size,
+ UPDATES_PER_WINDOW));
+ group->rtpoll_min_period = period;
+ }
/* Destroy rtpoll_task when the last trigger is destroyed */
if (group->rtpoll_states == 0) {
group->rtpoll_until = 0;
@@ -1462,7 +1475,10 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
if (!t)
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- poll_wait(file, &t->event_wait, wait);
+ if (t->of)
+ kernfs_generic_poll(t->of, wait);
+ else
+ poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1)
ret |= EPOLLPRI;
@@ -1532,7 +1548,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
return -EBUSY;
}
- new = psi_trigger_create(&psi_system, buf, res, file);
+ new = psi_trigger_create(&psi_system, buf, res, file, NULL);
if (IS_ERR(new)) {
mutex_unlock(&seq->lock);
return PTR_ERR(new);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 00e0e5074115..0597ba0f85ff 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -25,7 +25,7 @@ unsigned int sysctl_sched_rt_period = 1000000;
int sysctl_sched_rt_runtime = 950000;
#ifdef CONFIG_SYSCTL
-static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
@@ -3062,6 +3062,9 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
sched_rr_timeslice =
sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
msecs_to_jiffies(sysctl_sched_rr_timeslice);
+
+ if (sysctl_sched_rr_timeslice <= 0)
+ sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
}
mutex_unlock(&mutex);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec7b3e0a2b20..04846272409c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -286,12 +286,6 @@ struct rt_bandwidth {
void __dl_clear_params(struct task_struct *p);
-struct dl_bandwidth {
- raw_spinlock_t dl_runtime_lock;
- u64 dl_runtime;
- u64 dl_period;
-};
-
static inline int dl_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
@@ -330,7 +324,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern int dl_cpu_busy(int cpu, struct task_struct *p);
+extern int dl_bw_check_overflow(int cpu);
#ifdef CONFIG_CGROUP_SCHED
@@ -460,11 +454,12 @@ extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
struct sched_entity *parent);
-extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+extern bool cfs_task_bw_constrained(struct task_struct *p);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
@@ -500,6 +495,7 @@ static inline void set_task_rq_fair(struct sched_entity *se,
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
+static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; }
#endif /* CONFIG_CGROUP_SCHED */
@@ -554,6 +550,9 @@ struct cfs_rq {
unsigned int idle_nr_running; /* SCHED_IDLE */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
+ s64 avg_vruntime;
+ u64 avg_load;
+
u64 exec_clock;
u64 min_vruntime;
#ifdef CONFIG_SCHED_CORE
@@ -573,8 +572,6 @@ struct cfs_rq {
*/
struct sched_entity *curr;
struct sched_entity *next;
- struct sched_entity *last;
- struct sched_entity *skip;
#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
@@ -642,6 +639,8 @@ struct cfs_rq {
u64 throttled_clock;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
+ u64 throttled_clock_self;
+ u64 throttled_clock_self_time;
int throttled;
int throttle_count;
struct list_head throttled_list;
@@ -754,6 +753,12 @@ struct dl_rq {
u64 extra_bw;
/*
+ * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM
+ * tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB).
+ */
+ u64 max_bw;
+
+ /*
* Inverse of the fraction of CPU utilization that can be reclaimed
* by the GRUB algorithm.
*/
@@ -1245,6 +1250,7 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
bool fi);
+void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
/*
* Helpers to check if the CPU's core cookie matches with the task's cookie
@@ -1546,6 +1552,28 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq)
rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}
+/*
+ * During cpu offlining and rq wide unthrottling, we can trigger
+ * an update_rq_clock() for several cfs and rt runqueues (Typically
+ * when using list_for_each_entry_*)
+ * rq_clock_start_loop_update() can be called after updating the clock
+ * once and before iterating over the list to prevent multiple update.
+ * After the iterative traversal, we need to call rq_clock_stop_loop_update()
+ * to clear RQCF_ACT_SKIP of rq->clock_update_flags.
+ */
+static inline void rq_clock_start_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
+ rq->clock_update_flags |= RQCF_ACT_SKIP;
+}
+
+static inline void rq_clock_stop_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ rq->clock_update_flags &= ~RQCF_ACT_SKIP;
+}
+
struct rq_flags {
unsigned long flags;
struct pin_cookie cookie;
@@ -1678,6 +1706,21 @@ rq_unlock(struct rq *rq, struct rq_flags *rf)
raw_spin_rq_unlock(rq);
}
+DEFINE_LOCK_GUARD_1(rq_lock, struct rq,
+ rq_lock(_T->lock, &_T->rf),
+ rq_unlock(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+DEFINE_LOCK_GUARD_1(rq_lock_irq, struct rq,
+ rq_lock_irq(_T->lock, &_T->rf),
+ rq_unlock_irq(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq,
+ rq_lock_irqsave(_T->lock, &_T->rf),
+ rq_unlock_irqrestore(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
static inline struct rq *
this_rq_lock_irq(struct rq_flags *rf)
__acquires(rq->lock)
@@ -1772,6 +1815,13 @@ queue_balance_callback(struct rq *rq,
for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
+/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) |
+static const unsigned int SD_SHARED_CHILD_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
/**
* highest_flag_domain - Return highest sched_domain containing flag.
* @cpu: The CPU whose highest level of sched domain is to
@@ -1779,16 +1829,25 @@ queue_balance_callback(struct rq *rq,
* @flag: The flag to check for the highest sched_domain
* for the given CPU.
*
- * Returns the highest sched_domain of a CPU which contains the given flag.
+ * Returns the highest sched_domain of a CPU which contains @flag. If @flag has
+ * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag.
*/
static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
{
struct sched_domain *sd, *hsd = NULL;
for_each_domain(cpu, sd) {
- if (!(sd->flags & flag))
+ if (sd->flags & flag) {
+ hsd = sd;
+ continue;
+ }
+
+ /*
+ * Stop the search if @flag is known to be shared at lower
+ * levels. It will not be found further up.
+ */
+ if (flag & SD_SHARED_CHILD_MASK)
break;
- hsd = sd;
}
return hsd;
@@ -1844,6 +1903,7 @@ struct sched_group {
atomic_t ref;
unsigned int group_weight;
+ unsigned int cores;
struct sched_group_capacity *sgc;
int asym_prefer_cpu; /* CPU of highest priority in group */
int flags;
@@ -2093,12 +2153,13 @@ static inline int task_on_rq_migrating(struct task_struct *p)
}
/* Wake flags. The first three directly map to some SD flag value */
-#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
-#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
-#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
+#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
+#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
+#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
-#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
-#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
+#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
+#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
+#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
@@ -2157,6 +2218,7 @@ extern const u32 sched_prio_to_wmult[40];
#else
#define ENQUEUE_MIGRATED 0x00
#endif
+#define ENQUEUE_INITIAL 0x80
#define RETRY_TASK ((void *)-1UL)
@@ -2360,6 +2422,7 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
#endif
extern void schedule_idle(void);
+asmlinkage void schedule_user(void);
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
@@ -2378,7 +2441,6 @@ extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
@@ -2462,11 +2524,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_base_slice;
+
#ifdef CONFIG_SCHED_DEBUG
-extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
-extern unsigned int sysctl_sched_idle_min_granularity;
-extern unsigned int sysctl_sched_wakeup_granularity;
extern int sysctl_resched_latency_warn_ms;
extern int sysctl_resched_latency_warn_once;
@@ -2572,6 +2632,12 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
#endif
+#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
+__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
+static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \
+{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
+ _lock; return _t; }
+
#ifdef CONFIG_SMP
static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
@@ -2701,6 +2767,16 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}
+static inline void double_raw_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2)
+{
+ raw_spin_unlock(l1);
+ raw_spin_unlock(l2);
+}
+
+DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t,
+ double_raw_lock(_T->lock, _T->lock2),
+ double_raw_unlock(_T->lock, _T->lock2))
+
/*
* double_rq_unlock - safely unlock two runqueues
*
@@ -2758,6 +2834,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
#endif
+DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
+ double_rq_lock(_T->lock, _T->lock2),
+ double_rq_unlock(_T->lock, _T->lock2))
+
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
@@ -2946,53 +3026,9 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
return READ_ONCE(rq->avg_dl.util_avg);
}
-/**
- * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks.
- * @cpu: the CPU to get the utilization for.
- *
- * The unit of the return value must be the same as the one of CPU capacity
- * so that CPU utilization can be compared with CPU capacity.
- *
- * CPU utilization is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on that CPU.
- * It represents the amount of CPU capacity currently used by CFS tasks in
- * the range [0..max CPU capacity] with max CPU capacity being the CPU
- * capacity at f_max.
- *
- * The estimated CPU utilization is defined as the maximum between CPU
- * utilization and sum of the estimated utilization of the currently
- * runnable tasks on that CPU. It preserves a utilization "snapshot" of
- * previously-executed tasks, which helps better deduce how busy a CPU will
- * be when a long-sleeping task wakes up. The contribution to CPU utilization
- * of such a task would be significantly decayed at this point of time.
- *
- * CPU utilization can be higher than the current CPU capacity
- * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
- * of rounding errors as well as task migrations or wakeups of new tasks.
- * CPU utilization has to be capped to fit into the [0..max CPU capacity]
- * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
- * could be seen as over-utilized even though CPU1 has 20% of spare CPU
- * capacity. CPU utilization is allowed to overshoot current CPU capacity
- * though since this is useful for predicting the CPU capacity required
- * after task migrations (scheduler-driven DVFS).
- *
- * Return: (Estimated) utilization for the specified CPU.
- */
-static inline unsigned long cpu_util_cfs(int cpu)
-{
- struct cfs_rq *cfs_rq;
- unsigned long util;
-
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
- if (sched_feat(UTIL_EST)) {
- util = max_t(unsigned long, util,
- READ_ONCE(cfs_rq->avg.util_est.enqueued));
- }
-
- return min(util, capacity_orig_of(cpu));
-}
+extern unsigned long cpu_util_cfs(int cpu);
+extern unsigned long cpu_util_cfs_boost(int cpu);
static inline unsigned long cpu_util_rt(struct rq *rq)
{
@@ -3236,6 +3272,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
extern void swake_up_all_locked(struct swait_queue_head *q);
extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags);
+
#ifdef CONFIG_PREEMPT_DYNAMIC
extern int preempt_dynamic_mode;
extern int sched_dynamic_mode(const char *str);
@@ -3487,4 +3525,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif
+extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 76b9b796e695..72505cd3b60a 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -18,7 +18,7 @@ EXPORT_SYMBOL(__init_swait_queue_head);
* If for some reason it would return 0, that means the previously waiting
* task is already running, so it will observe condition true (or has already).
*/
-void swake_up_locked(struct swait_queue_head *q)
+void swake_up_locked(struct swait_queue_head *q, int wake_flags)
{
struct swait_queue *curr;
@@ -26,7 +26,7 @@ void swake_up_locked(struct swait_queue_head *q)
return;
curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
- wake_up_process(curr->task);
+ try_to_wake_up(curr->task, TASK_NORMAL, wake_flags);
list_del_init(&curr->task_list);
}
EXPORT_SYMBOL(swake_up_locked);
@@ -41,7 +41,7 @@ EXPORT_SYMBOL(swake_up_locked);
void swake_up_all_locked(struct swait_queue_head *q)
{
while (!list_empty(&q->task_list))
- swake_up_locked(q);
+ swake_up_locked(q, 0);
}
void swake_up_one(struct swait_queue_head *q)
@@ -49,7 +49,7 @@ void swake_up_one(struct swait_queue_head *q)
unsigned long flags;
raw_spin_lock_irqsave(&q->lock, flags);
- swake_up_locked(q);
+ swake_up_locked(q, 0);
raw_spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(swake_up_one);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6682535e37c8..05a5bc678c08 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -487,9 +487,9 @@ static void free_rootdomain(struct rcu_head *rcu)
void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
struct root_domain *old_rd = NULL;
- unsigned long flags;
+ struct rq_flags rf;
- raw_spin_rq_lock_irqsave(rq, flags);
+ rq_lock_irqsave(rq, &rf);
if (rq->rd) {
old_rd = rq->rd;
@@ -515,7 +515,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
- raw_spin_rq_unlock_irqrestore(rq, flags);
+ rq_unlock_irqrestore(rq, &rf);
if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);
@@ -719,8 +719,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
- if (parent->parent)
+
+ if (parent->parent) {
parent->parent->child = tmp;
+ parent->parent->groups->flags = tmp->flags;
+ }
+
/*
* Transfer SD_PREFER_SIBLING down in case of a
* degenerate parent; the spans match for this
@@ -1270,14 +1274,24 @@ build_sched_groups(struct sched_domain *sd, int cpu)
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
+ struct cpumask *mask = sched_domains_tmpmask2;
WARN_ON(!sg);
do {
- int cpu, max_cpu = -1;
+ int cpu, cores = 0, max_cpu = -1;
sg->group_weight = cpumask_weight(sched_group_span(sg));
+ cpumask_copy(mask, sched_group_span(sg));
+ for_each_cpu(cpu, mask) {
+ cores++;
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(mask, mask, cpu_smt_mask(cpu));
+#endif
+ }
+ sg->cores = cores;
+
if (!(sd->flags & SD_ASYM_PACKING))
goto next;
@@ -1676,7 +1690,7 @@ static struct sched_domain_topology_level *sched_domain_topology_saved;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->mask; tl++)
-void set_sched_topology(struct sched_domain_topology_level *tl)
+void __init set_sched_topology(struct sched_domain_topology_level *tl)
{
if (WARN_ON_ONCE(sched_smp_initialized))
return;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 133b74730738..802d98cf2de3 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -161,6 +161,11 @@ int __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
}
EXPORT_SYMBOL(__wake_up);
+void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key)
+{
+ __wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key);
+}
+
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
@@ -425,11 +430,6 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
}
EXPORT_SYMBOL(autoremove_wake_function);
-static inline bool is_kthread_should_stop(void)
-{
- return (current->flags & PF_KTHREAD) && kthread_should_stop();
-}
-
/*
* DEFINE_WAIT_FUNC(wait, woken_wake_func);
*
@@ -459,7 +459,7 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
* or woken_wake_function() sees our store to current->state.
*/
set_current_state(mode); /* A */
- if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park())
timeout = schedule_timeout(timeout);
__set_current_state(TASK_RUNNING);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index d3e584065c7f..255999ba9190 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -110,11 +110,13 @@ struct seccomp_knotif {
* @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
* is allowed.
* @ioctl_flags: The flags used for the seccomp_addfd ioctl.
+ * @setfd: whether or not SECCOMP_ADDFD_FLAG_SETFD was set during notify_addfd
* @ret: The return value of the installing process. It is set to the fd num
* upon success (>= 0).
* @completion: Indicates that the installing process has completed fd
* installation, or gone away (either due to successful
* reply, or signal)
+ * @list: list_head for chaining seccomp_kaddfd together.
*
*/
struct seccomp_kaddfd {
@@ -138,14 +140,17 @@ struct seccomp_kaddfd {
* structure is fairly large, we store the notification-specific stuff in a
* separate structure.
*
- * @request: A semaphore that users of this notification can wait on for
- * changes. Actual reads and writes are still controlled with
- * filter->notify_lock.
+ * @requests: A semaphore that users of this notification can wait on for
+ * changes. Actual reads and writes are still controlled with
+ * filter->notify_lock.
+ * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags.
* @next_id: The id of the next request.
* @notifications: A list of struct seccomp_knotif elements.
*/
+
struct notification {
- struct semaphore request;
+ atomic_t requests;
+ u32 flags;
u64 next_id;
struct list_head notifications;
};
@@ -555,6 +560,8 @@ static void __seccomp_filter_release(struct seccomp_filter *orig)
* drop its reference count, and notify
* about unused filters
*
+ * @tsk: task the filter should be released from.
+ *
* This function should only be called when the task is exiting as
* it detaches it from its filter tree. As such, READ_ONCE() and
* barriers are not needed here, as would normally be needed.
@@ -574,6 +581,8 @@ void seccomp_filter_release(struct task_struct *tsk)
/**
* seccomp_sync_threads: sets all threads to use current's filter
*
+ * @flags: SECCOMP_FILTER_FLAG_* flags to set during sync.
+ *
* Expects sighand and cred_guard_mutex locks to be held, and for
* seccomp_can_sync_threads() to have returned success already
* without dropping the locks.
@@ -1116,8 +1125,11 @@ static int seccomp_do_user_notification(int this_syscall,
list_add_tail(&n.list, &match->notif->notifications);
INIT_LIST_HEAD(&n.addfd);
- up(&match->notif->request);
- wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
+ atomic_inc(&match->notif->requests);
+ if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
+ else
+ wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
/*
* This is where we wait for a reply from userspace.
@@ -1450,6 +1462,37 @@ find_notification(struct seccomp_filter *filter, u64 id)
return NULL;
}
+static int recv_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
+ void *key)
+{
+ /* Avoid a wakeup if event not interesting for us. */
+ if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR)))
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+
+static int recv_wait_event(struct seccomp_filter *filter)
+{
+ DEFINE_WAIT_FUNC(wait, recv_wake_function);
+ int ret;
+
+ if (atomic_dec_if_positive(&filter->notif->requests) >= 0)
+ return 0;
+
+ for (;;) {
+ ret = prepare_to_wait_event(&filter->wqh, &wait, TASK_INTERRUPTIBLE);
+
+ if (atomic_dec_if_positive(&filter->notif->requests) >= 0)
+ break;
+
+ if (ret)
+ return ret;
+
+ schedule();
+ }
+ finish_wait(&filter->wqh, &wait);
+ return 0;
+}
static long seccomp_notify_recv(struct seccomp_filter *filter,
void __user *buf)
@@ -1467,7 +1510,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
memset(&unotif, 0, sizeof(unotif));
- ret = down_interruptible(&filter->notif->request);
+ ret = recv_wait_event(filter);
if (ret < 0)
return ret;
@@ -1515,7 +1558,8 @@ out:
if (should_sleep_killable(filter, knotif))
complete(&knotif->ready);
knotif->state = SECCOMP_NOTIFY_INIT;
- up(&filter->notif->request);
+ atomic_inc(&filter->notif->requests);
+ wake_up_poll(&filter->wqh, EPOLLIN | EPOLLRDNORM);
}
mutex_unlock(&filter->notify_lock);
}
@@ -1561,7 +1605,10 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
knotif->error = resp.error;
knotif->val = resp.val;
knotif->flags = resp.flags;
- complete(&knotif->ready);
+ if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ complete_on_current_cpu(&knotif->ready);
+ else
+ complete(&knotif->ready);
out:
mutex_unlock(&filter->notify_lock);
return ret;
@@ -1591,6 +1638,22 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
return ret;
}
+static long seccomp_notify_set_flags(struct seccomp_filter *filter,
+ unsigned long flags)
+{
+ long ret;
+
+ if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ return -EINVAL;
+
+ ret = mutex_lock_interruptible(&filter->notify_lock);
+ if (ret < 0)
+ return ret;
+ filter->notif->flags = flags;
+ mutex_unlock(&filter->notify_lock);
+ return 0;
+}
+
static long seccomp_notify_addfd(struct seccomp_filter *filter,
struct seccomp_notif_addfd __user *uaddfd,
unsigned int size)
@@ -1720,6 +1783,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
case SECCOMP_IOCTL_NOTIF_ID_VALID:
return seccomp_notify_id_valid(filter, buf);
+ case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
+ return seccomp_notify_set_flags(filter, arg);
}
/* Extensible Argument ioctls */
@@ -1777,7 +1842,6 @@ static struct file *init_listener(struct seccomp_filter *filter)
if (!filter->notif)
goto out;
- sema_init(&filter->notif->request, 0);
filter->notif->next_id = get_random_u64();
INIT_LIST_HEAD(&filter->notif->notifications);
diff --git a/kernel/signal.c b/kernel/signal.c
index 2547fa73bde5..09019017d669 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,6 +22,7 @@
#include <linux/sched/cputime.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
@@ -45,6 +46,7 @@
#include <linux/posix-timers.h>
#include <linux/cgroup.h>
#include <linux/audit.h>
+#include <linux/sysctl.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -561,6 +563,10 @@ bool unhandled_signal(struct task_struct *tsk, int sig)
if (handler != SIG_IGN && handler != SIG_DFL)
return false;
+ /* If dying, we handle all new signals by ignoring them */
+ if (fatal_signal_pending(tsk))
+ return false;
+
/* if ptraced, let the tracer determine */
return !tsk->ptrace;
}
@@ -1255,7 +1261,17 @@ int send_signal_locked(int sig, struct kernel_siginfo *info,
static void print_fatal_signal(int signr)
{
struct pt_regs *regs = task_pt_regs(current);
- pr_info("potentially unexpected fatal signal %d.\n", signr);
+ struct file *exe_file;
+
+ exe_file = get_task_exe_file(current);
+ if (exe_file) {
+ pr_info("%pD: %s: potentially unexpected fatal signal %d.\n",
+ exe_file, current->comm, signr);
+ fput(exe_file);
+ } else {
+ pr_info("%s: potentially unexpected fatal signal %d.\n",
+ current->comm, signr);
+ }
#if defined(__i386__) && !defined(__arch_um__)
pr_info("code at %08lx: ", regs->ip);
@@ -4773,6 +4789,28 @@ static inline void siginfo_buildtime_checks(void)
#endif
}
+#if defined(CONFIG_SYSCTL)
+static struct ctl_table signal_debug_table[] = {
+#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
+ {
+ .procname = "exception-trace",
+ .data = &show_unhandled_signals,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+ { }
+};
+
+static int __init init_signal_sysctls(void)
+{
+ register_sysctl_init("debug", signal_debug_table);
+ return 0;
+}
+early_initcall(init_signal_sysctls);
+#endif /* CONFIG_SYSCTL */
+
void __init signals_init(void)
{
siginfo_buildtime_checks();
diff --git a/kernel/smp.c b/kernel/smp.c
index ab3e5dad6cfe..8455a53465af 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -27,6 +27,9 @@
#include <linux/jump_label.h>
#include <trace/events/ipi.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/csd.h>
+#undef CREATE_TRACE_POINTS
#include "smpboot.h"
#include "sched/smp.h"
@@ -43,6 +46,8 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
+static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1);
+
static void __flush_smp_call_function_queue(bool warn_cpu_offline);
int smpcfd_prepare_cpu(unsigned int cpu)
@@ -121,6 +126,14 @@ send_call_function_ipi_mask(struct cpumask *mask)
arch_send_call_function_ipi_mask(mask);
}
+static __always_inline void
+csd_do_func(smp_call_func_t func, void *info, struct __call_single_data *csd)
+{
+ trace_csd_function_entry(func, csd);
+ func(info);
+ trace_csd_function_exit(func, csd);
+}
+
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled);
@@ -242,13 +255,15 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
*bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
}
if (cpu >= 0) {
- dump_cpu_task(cpu);
+ if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0))
+ dump_cpu_task(cpu);
if (!cpu_cur_csd) {
pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
arch_send_call_function_single_ipi(cpu);
}
}
- dump_stack();
+ if (firsttime)
+ dump_stack();
*ts1 = ts2;
return false;
@@ -329,7 +344,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
* even if we haven't sent the smp_call IPI yet (e.g. the stopper
* executes migration_cpu_stop() on the remote CPU).
*/
- if (trace_ipi_send_cpu_enabled()) {
+ if (trace_csd_queue_cpu_enabled()) {
call_single_data_t *csd;
smp_call_func_t func;
@@ -337,7 +352,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
func = CSD_TYPE(csd) == CSD_TYPE_TTWU ?
sched_ttwu_pending : csd->func;
- trace_ipi_send_cpu(cpu, _RET_IP_, func);
+ trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
}
/*
@@ -375,7 +390,7 @@ static int generic_exec_single(int cpu, struct __call_single_data *csd)
csd_lock_record(csd);
csd_unlock(csd);
local_irq_save(flags);
- func(info);
+ csd_do_func(func, info, NULL);
csd_lock_record(NULL);
local_irq_restore(flags);
return 0;
@@ -422,9 +437,14 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
struct llist_node *entry, *prev;
struct llist_head *head;
static bool warned;
+ atomic_t *tbt;
lockdep_assert_irqs_disabled();
+ /* Allow waiters to send backtrace NMI from here onwards */
+ tbt = this_cpu_ptr(&trigger_backtrace);
+ atomic_set_release(tbt, 1);
+
head = this_cpu_ptr(&call_single_queue);
entry = llist_del_all(head);
entry = llist_reverse_order(entry);
@@ -477,7 +497,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
}
csd_lock_record(csd);
- func(info);
+ csd_do_func(func, info, csd);
csd_unlock(csd);
csd_lock_record(NULL);
} else {
@@ -508,7 +528,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
csd_lock_record(csd);
csd_unlock(csd);
- func(info);
+ csd_do_func(func, info, csd);
csd_lock_record(NULL);
} else if (type == CSD_TYPE_IRQ_WORK) {
irq_work_single(csd);
@@ -522,8 +542,10 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
/*
* Third; only CSD_TYPE_TTWU is left, issue those.
*/
- if (entry)
- sched_ttwu_pending(entry);
+ if (entry) {
+ csd = llist_entry(entry, typeof(*csd), node.llist);
+ csd_do_func(sched_ttwu_pending, entry, csd);
+ }
}
@@ -728,7 +750,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
int cpu, last_cpu, this_cpu = smp_processor_id();
struct call_function_data *cfd;
bool wait = scf_flags & SCF_WAIT;
- int nr_cpus = 0, nr_queued = 0;
+ int nr_cpus = 0;
bool run_remote = false;
bool run_local = false;
@@ -786,22 +808,16 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
csd->node.src = smp_processor_id();
csd->node.dst = cpu;
#endif
+ trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
+
if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
nr_cpus++;
last_cpu = cpu;
}
- nr_queued++;
}
/*
- * Trace each smp_function_call_*() as an IPI, actual IPIs
- * will be traced with func==generic_smp_call_function_single_ipi().
- */
- if (nr_queued)
- trace_ipi_send_cpumask(cfd->cpumask, _RET_IP_, func);
-
- /*
* Choose the most efficient way to send an IPI. Note that the
* number of CPUs might be zero due to concurrent changes to the
* provided mask.
@@ -816,7 +832,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
unsigned long flags;
local_irq_save(flags);
- func(info);
+ csd_do_func(func, info, NULL);
local_irq_restore(flags);
}
@@ -892,7 +908,7 @@ EXPORT_SYMBOL(setup_max_cpus);
* SMP mode to <NUM>.
*/
-void __weak arch_disable_smp_support(void) { }
+void __weak __init arch_disable_smp_support(void) { }
static int __init nosmp(char *str)
{
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 2c7396da470c..f47d8f375946 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -325,166 +325,3 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
-
-static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
-
-/*
- * Called to poll specified CPU's state, for example, when waiting for
- * a CPU to come online.
- */
-int cpu_report_state(int cpu)
-{
- return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
-}
-
-/*
- * If CPU has died properly, set its state to CPU_UP_PREPARE and
- * return success. Otherwise, return -EBUSY if the CPU died after
- * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN
- * if cpu_wait_death() timed out and the CPU still hasn't gotten around
- * to dying. In the latter two cases, the CPU might not be set up
- * properly, but it is up to the arch-specific code to decide.
- * Finally, -EIO indicates an unanticipated problem.
- *
- * Note that it is permissible to omit this call entirely, as is
- * done in architectures that do no CPU-hotplug error checking.
- */
-int cpu_check_up_prepare(int cpu)
-{
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
- atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
- return 0;
- }
-
- switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {
-
- case CPU_POST_DEAD:
-
- /* The CPU died properly, so just start it up again. */
- atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
- return 0;
-
- case CPU_DEAD_FROZEN:
-
- /*
- * Timeout during CPU death, so let caller know.
- * The outgoing CPU completed its processing, but after
- * cpu_wait_death() timed out and reported the error. The
- * caller is free to proceed, in which case the state
- * will be reset properly by cpu_set_state_online().
- * Proceeding despite this -EBUSY return makes sense
- * for systems where the outgoing CPUs take themselves
- * offline, with no post-death manipulation required from
- * a surviving CPU.
- */
- return -EBUSY;
-
- case CPU_BROKEN:
-
- /*
- * The most likely reason we got here is that there was
- * a timeout during CPU death, and the outgoing CPU never
- * did complete its processing. This could happen on
- * a virtualized system if the outgoing VCPU gets preempted
- * for more than five seconds, and the user attempts to
- * immediately online that same CPU. Trying again later
- * might return -EBUSY above, hence -EAGAIN.
- */
- return -EAGAIN;
-
- case CPU_UP_PREPARE:
- /*
- * Timeout while waiting for the CPU to show up. Allow to try
- * again later.
- */
- return 0;
-
- default:
-
- /* Should not happen. Famous last words. */
- return -EIO;
- }
-}
-
-/*
- * Mark the specified CPU online.
- *
- * Note that it is permissible to omit this call entirely, as is
- * done in architectures that do no CPU-hotplug error checking.
- */
-void cpu_set_state_online(int cpu)
-{
- (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Wait for the specified CPU to exit the idle loop and die.
- */
-bool cpu_wait_death(unsigned int cpu, int seconds)
-{
- int jf_left = seconds * HZ;
- int oldstate;
- bool ret = true;
- int sleep_jf = 1;
-
- might_sleep();
-
- /* The outgoing CPU will normally get done quite quickly. */
- if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
- goto update_state_early;
- udelay(5);
-
- /* But if the outgoing CPU dawdles, wait increasingly long times. */
- while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
- schedule_timeout_uninterruptible(sleep_jf);
- jf_left -= sleep_jf;
- if (jf_left <= 0)
- break;
- sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
- }
-update_state_early:
- oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
-update_state:
- if (oldstate == CPU_DEAD) {
- /* Outgoing CPU died normally, update state. */
- smp_mb(); /* atomic_read() before update. */
- atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
- } else {
- /* Outgoing CPU still hasn't died, set state accordingly. */
- if (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
- &oldstate, CPU_BROKEN))
- goto update_state;
- ret = false;
- }
- return ret;
-}
-
-/*
- * Called by the outgoing CPU to report its successful death. Return
- * false if this report follows the surviving CPU's timing out.
- *
- * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
- * timed out. This approach allows architectures to omit calls to
- * cpu_check_up_prepare() and cpu_set_state_online() without defeating
- * the next cpu_wait_death()'s polling loop.
- */
-bool cpu_report_death(void)
-{
- int oldstate;
- int newstate;
- int cpu = smp_processor_id();
-
- oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
- do {
- if (oldstate != CPU_BROKEN)
- newstate = CPU_DEAD;
- else
- newstate = CPU_DEAD_FROZEN;
- } while (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
- &oldstate, newstate));
- return newstate == CPU_DEAD;
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 1b725510dd0f..210cf5f8d92c 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -80,21 +80,6 @@ static void wakeup_softirqd(void)
wake_up_process(tsk);
}
-/*
- * If ksoftirqd is scheduled, we do not want to process pending softirqs
- * right now. Let ksoftirqd handle this at its own rate, to get fairness,
- * unless we're doing some of the synchronous softirqs.
- */
-#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))
-static bool ksoftirqd_running(unsigned long pending)
-{
- struct task_struct *tsk = __this_cpu_read(ksoftirqd);
-
- if (pending & SOFTIRQ_NOW_MASK)
- return false;
- return tsk && task_is_running(tsk) && !__kthread_should_park(tsk);
-}
-
#ifdef CONFIG_TRACE_IRQFLAGS
DEFINE_PER_CPU(int, hardirqs_enabled);
DEFINE_PER_CPU(int, hardirq_context);
@@ -236,7 +221,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
goto out;
pending = local_softirq_pending();
- if (!pending || ksoftirqd_running(pending))
+ if (!pending)
goto out;
/*
@@ -432,9 +417,6 @@ static inline bool should_wake_ksoftirqd(void)
static inline void invoke_softirq(void)
{
- if (ksoftirqd_running(local_softirq_pending()))
- return;
-
if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
@@ -468,7 +450,7 @@ asmlinkage __visible void do_softirq(void)
pending = local_softirq_pending();
- if (pending && !ksoftirqd_running(pending))
+ if (pending)
do_softirq_own_stack();
local_irq_restore(flags);
@@ -630,7 +612,7 @@ static inline void tick_irq_exit(void)
int cpu = smp_processor_id();
/* Make sure that timer wheel updates are propagated */
- if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
+ if ((sched_core_idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
if (!in_hardirq())
tick_nohz_irq_exit();
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 339fee3eff6a..2410e3999ebe 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -140,6 +140,12 @@
#ifndef GET_TAGGED_ADDR_CTRL
# define GET_TAGGED_ADDR_CTRL() (-EINVAL)
#endif
+#ifndef RISCV_V_SET_CONTROL
+# define RISCV_V_SET_CONTROL(a) (-EINVAL)
+#endif
+#ifndef RISCV_V_GET_CONTROL
+# define RISCV_V_GET_CONTROL() (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -2529,11 +2535,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
else
return -EINVAL;
break;
- case PR_GET_AUXV:
- if (arg4 || arg5)
- return -EINVAL;
- error = prctl_get_auxv((void __user *)arg2, arg3);
- break;
default:
return -EINVAL;
}
@@ -2688,6 +2689,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_VMA:
error = prctl_set_vma(arg2, arg3, arg4, arg5);
break;
+ case PR_GET_AUXV:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = prctl_get_auxv((void __user *)arg2, arg3);
+ break;
#ifdef CONFIG_KSM
case PR_SET_MEMORY_MERGE:
if (arg3 || arg4 || arg5)
@@ -2708,6 +2714,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
break;
#endif
+ case PR_RISCV_V_SET_CONTROL:
+ error = RISCV_V_SET_CONTROL(arg2);
+ break;
+ case PR_RISCV_V_GET_CONTROL:
+ error = RISCV_V_GET_CONTROL();
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 860b2dcf3ac4..e137c1385c56 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -51,99 +51,35 @@ COND_SYSCALL_COMPAT(io_pgetevents);
COND_SYSCALL(io_uring_setup);
COND_SYSCALL(io_uring_enter);
COND_SYSCALL(io_uring_register);
-
-/* fs/xattr.c */
-
-/* fs/dcache.c */
-
-/* fs/cookies.c */
COND_SYSCALL(lookup_dcookie);
COND_SYSCALL_COMPAT(lookup_dcookie);
-
-/* fs/eventfd.c */
COND_SYSCALL(eventfd2);
-
-/* fs/eventfd.c */
COND_SYSCALL(epoll_create1);
COND_SYSCALL(epoll_ctl);
COND_SYSCALL(epoll_pwait);
COND_SYSCALL_COMPAT(epoll_pwait);
COND_SYSCALL(epoll_pwait2);
COND_SYSCALL_COMPAT(epoll_pwait2);
-
-/* fs/fcntl.c */
-
-/* fs/inotify_user.c */
COND_SYSCALL(inotify_init1);
COND_SYSCALL(inotify_add_watch);
COND_SYSCALL(inotify_rm_watch);
-
-/* fs/ioctl.c */
-
-/* fs/ioprio.c */
COND_SYSCALL(ioprio_set);
COND_SYSCALL(ioprio_get);
-
-/* fs/locks.c */
COND_SYSCALL(flock);
-
-/* fs/namei.c */
-
-/* fs/namespace.c */
-
-/* fs/nfsctl.c */
-
-/* fs/open.c */
-
-/* fs/pipe.c */
-
-/* fs/quota.c */
COND_SYSCALL(quotactl);
COND_SYSCALL(quotactl_fd);
-
-/* fs/readdir.c */
-
-/* fs/read_write.c */
-
-/* fs/sendfile.c */
-
-/* fs/select.c */
-
-/* fs/signalfd.c */
COND_SYSCALL(signalfd4);
COND_SYSCALL_COMPAT(signalfd4);
-
-/* fs/splice.c */
-
-/* fs/stat.c */
-
-/* fs/sync.c */
-
-/* fs/timerfd.c */
COND_SYSCALL(timerfd_create);
COND_SYSCALL(timerfd_settime);
COND_SYSCALL(timerfd_settime32);
COND_SYSCALL(timerfd_gettime);
COND_SYSCALL(timerfd_gettime32);
-
-/* fs/utimes.c */
-
-/* kernel/acct.c */
COND_SYSCALL(acct);
-
-/* kernel/capability.c */
COND_SYSCALL(capget);
COND_SYSCALL(capset);
-
-/* kernel/exec_domain.c */
-
-/* kernel/exit.c */
-
-/* kernel/fork.c */
/* __ARCH_WANT_SYS_CLONE3 */
COND_SYSCALL(clone3);
-
-/* kernel/futex/syscalls.c */
COND_SYSCALL(futex);
COND_SYSCALL(futex_time32);
COND_SYSCALL(set_robust_list);
@@ -151,29 +87,11 @@ COND_SYSCALL_COMPAT(set_robust_list);
COND_SYSCALL(get_robust_list);
COND_SYSCALL_COMPAT(get_robust_list);
COND_SYSCALL(futex_waitv);
-
-/* kernel/hrtimer.c */
-
-/* kernel/itimer.c */
-
-/* kernel/kexec.c */
COND_SYSCALL(kexec_load);
COND_SYSCALL_COMPAT(kexec_load);
-
-/* kernel/module.c */
COND_SYSCALL(init_module);
COND_SYSCALL(delete_module);
-
-/* kernel/posix-timers.c */
-
-/* kernel/printk.c */
COND_SYSCALL(syslog);
-
-/* kernel/ptrace.c */
-
-/* kernel/sched/core.c */
-
-/* kernel/sys.c */
COND_SYSCALL(setregid);
COND_SYSCALL(setgid);
COND_SYSCALL(setreuid);
@@ -186,12 +104,6 @@ COND_SYSCALL(setfsuid);
COND_SYSCALL(setfsgid);
COND_SYSCALL(setgroups);
COND_SYSCALL(getgroups);
-
-/* kernel/time.c */
-
-/* kernel/timer.c */
-
-/* ipc/mqueue.c */
COND_SYSCALL(mq_open);
COND_SYSCALL_COMPAT(mq_open);
COND_SYSCALL(mq_unlink);
@@ -203,8 +115,6 @@ COND_SYSCALL(mq_notify);
COND_SYSCALL_COMPAT(mq_notify);
COND_SYSCALL(mq_getsetattr);
COND_SYSCALL_COMPAT(mq_getsetattr);
-
-/* ipc/msg.c */
COND_SYSCALL(msgget);
COND_SYSCALL(old_msgctl);
COND_SYSCALL(msgctl);
@@ -214,8 +124,6 @@ COND_SYSCALL(msgrcv);
COND_SYSCALL_COMPAT(msgrcv);
COND_SYSCALL(msgsnd);
COND_SYSCALL_COMPAT(msgsnd);
-
-/* ipc/sem.c */
COND_SYSCALL(semget);
COND_SYSCALL(old_semctl);
COND_SYSCALL(semctl);
@@ -224,8 +132,6 @@ COND_SYSCALL_COMPAT(old_semctl);
COND_SYSCALL(semtimedop);
COND_SYSCALL(semtimedop_time32);
COND_SYSCALL(semop);
-
-/* ipc/shm.c */
COND_SYSCALL(shmget);
COND_SYSCALL(old_shmctl);
COND_SYSCALL(shmctl);
@@ -234,8 +140,6 @@ COND_SYSCALL_COMPAT(old_shmctl);
COND_SYSCALL(shmat);
COND_SYSCALL_COMPAT(shmat);
COND_SYSCALL(shmdt);
-
-/* net/socket.c */
COND_SYSCALL(socket);
COND_SYSCALL(socketpair);
COND_SYSCALL(bind);
@@ -256,30 +160,18 @@ COND_SYSCALL(sendmsg);
COND_SYSCALL_COMPAT(sendmsg);
COND_SYSCALL(recvmsg);
COND_SYSCALL_COMPAT(recvmsg);
-
-/* mm/filemap.c */
-
-/* mm/nommu.c, also with MMU */
COND_SYSCALL(mremap);
-
-/* security/keys/keyctl.c */
COND_SYSCALL(add_key);
COND_SYSCALL(request_key);
COND_SYSCALL(keyctl);
COND_SYSCALL_COMPAT(keyctl);
-
-/* security/landlock/syscalls.c */
COND_SYSCALL(landlock_create_ruleset);
COND_SYSCALL(landlock_add_rule);
COND_SYSCALL(landlock_restrict_self);
-
-/* arch/example/kernel/sys_example.c */
-
-/* mm/fadvise.c */
COND_SYSCALL(fadvise64_64);
COND_SYSCALL_COMPAT(fadvise64_64);
-/* mm/, CONFIG_MMU only */
+/* CONFIG_MMU only */
COND_SYSCALL(swapon);
COND_SYSCALL(swapoff);
COND_SYSCALL(mprotect);
@@ -299,6 +191,7 @@ COND_SYSCALL(set_mempolicy);
COND_SYSCALL(migrate_pages);
COND_SYSCALL(move_pages);
COND_SYSCALL(set_mempolicy_home_node);
+COND_SYSCALL(cachestat);
COND_SYSCALL(perf_event_open);
COND_SYSCALL(accept4);
@@ -381,6 +274,7 @@ COND_SYSCALL(vm86old);
COND_SYSCALL(modify_ldt);
COND_SYSCALL(vm86);
COND_SYSCALL(kexec_file_load);
+COND_SYSCALL(map_shadow_stack);
/* s390 */
COND_SYSCALL(s390_pci_mmio_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bfe53e835524..354a2d294f52 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1783,11 +1783,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = sysctl_max_threads,
},
{
- .procname = "usermodehelper",
- .mode = 0555,
- .child = usermodehelper_table,
- },
- {
.procname = "overflowuid",
.data = &overflowuid,
.maxlen = sizeof(int),
@@ -1962,13 +1957,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_KEYS
- {
- .procname = "keys",
- .mode = 0555,
- .child = key_sysctls,
- },
-#endif
#ifdef CONFIG_PERF_EVENTS
/*
* User-space scripts rely on the existence of this file
@@ -2120,13 +2108,6 @@ static struct ctl_table vm_table[] = {
},
#endif
{
- .procname = "lowmem_reserve_ratio",
- .data = &sysctl_lowmem_reserve_ratio,
- .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
- .mode = 0644,
- .proc_handler = lowmem_reserve_ratio_sysctl_handler,
- },
- {
.procname = "drop_caches",
.data = &sysctl_drop_caches,
.maxlen = sizeof(int),
@@ -2136,39 +2117,6 @@ static struct ctl_table vm_table[] = {
.extra2 = SYSCTL_FOUR,
},
{
- .procname = "min_free_kbytes",
- .data = &min_free_kbytes,
- .maxlen = sizeof(min_free_kbytes),
- .mode = 0644,
- .proc_handler = min_free_kbytes_sysctl_handler,
- .extra1 = SYSCTL_ZERO,
- },
- {
- .procname = "watermark_boost_factor",
- .data = &watermark_boost_factor,
- .maxlen = sizeof(watermark_boost_factor),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
- {
- .procname = "watermark_scale_factor",
- .data = &watermark_scale_factor,
- .maxlen = sizeof(watermark_scale_factor),
- .mode = 0644,
- .proc_handler = watermark_scale_factor_sysctl_handler,
- .extra1 = SYSCTL_ONE,
- .extra2 = SYSCTL_THREE_THOUSAND,
- },
- {
- .procname = "percpu_pagelist_high_fraction",
- .data = &percpu_pagelist_high_fraction,
- .maxlen = sizeof(percpu_pagelist_high_fraction),
- .mode = 0644,
- .proc_handler = percpu_pagelist_high_fraction_sysctl_handler,
- .extra1 = SYSCTL_ZERO,
- },
- {
.procname = "page_lock_unfairness",
.data = &sysctl_page_lock_unfairness,
.maxlen = sizeof(sysctl_page_lock_unfairness),
@@ -2223,24 +2171,6 @@ static struct ctl_table vm_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
- {
- .procname = "min_unmapped_ratio",
- .data = &sysctl_min_unmapped_ratio,
- .maxlen = sizeof(sysctl_min_unmapped_ratio),
- .mode = 0644,
- .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_HUNDRED,
- },
- {
- .procname = "min_slab_ratio",
- .data = &sysctl_min_slab_ratio,
- .maxlen = sizeof(sysctl_min_slab_ratio),
- .mode = 0644,
- .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_HUNDRED,
- },
#endif
#ifdef CONFIG_SMP
{
@@ -2267,15 +2197,6 @@ static struct ctl_table vm_table[] = {
.proc_handler = mmap_min_addr_handler,
},
#endif
-#ifdef CONFIG_NUMA
- {
- .procname = "numa_zonelist_order",
- .data = &numa_zonelist_order,
- .maxlen = NUMA_ZONELIST_ORDER_LEN,
- .mode = 0644,
- .proc_handler = numa_zonelist_order_handler,
- },
-#endif
#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
(defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
{
@@ -2331,34 +2252,10 @@ static struct ctl_table vm_table[] = {
{ }
};
-static struct ctl_table debug_table[] = {
-#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
- {
- .procname = "exception-trace",
- .data = &show_unhandled_signals,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
-#endif
- { }
-};
-
-static struct ctl_table dev_table[] = {
- { }
-};
-
-DECLARE_SYSCTL_BASE(kernel, kern_table);
-DECLARE_SYSCTL_BASE(vm, vm_table);
-DECLARE_SYSCTL_BASE(debug, debug_table);
-DECLARE_SYSCTL_BASE(dev, dev_table);
-
int __init sysctl_init_bases(void)
{
- register_sysctl_base(kernel);
- register_sysctl_base(vm);
- register_sysctl_base(debug);
- register_sysctl_base(dev);
+ register_sysctl_init("kernel", kern_table);
+ register_sysctl_init("vm", vm_table);
return 0;
}
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 82b28ab0f328..8d9f13d847f0 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -751,7 +751,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)
static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
ktime_t now)
{
- struct task_struct *task = (struct task_struct *)alarm->data;
+ struct task_struct *task = alarm->data;
alarm->data = NULL;
if (task)
@@ -847,7 +847,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
struct restart_block *restart = &current->restart_block;
struct alarm alarm;
ktime_t exp;
- int ret = 0;
+ int ret;
if (!alarmtimer_get_rtcdev())
return -EOPNOTSUPP;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 91836b727cef..c108ed8a9804 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -473,8 +473,8 @@ static void clocksource_watchdog(struct timer_list *unused)
/* Check the deviation from the watchdog clocksource. */
md = cs->uncertainty_margin + watchdog->uncertainty_margin;
if (abs(cs_nsec - wd_nsec) > md) {
- u64 cs_wd_msec;
- u64 wd_msec;
+ s64 cs_wd_msec;
+ s64 wd_msec;
u32 wd_rem;
pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
@@ -483,8 +483,8 @@ static void clocksource_watchdog(struct timer_list *unused)
watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
cs->name, cs_nsec, csnow, cslast, cs->mask);
- cs_wd_msec = div_u64_rem(cs_nsec - wd_nsec, 1000U * 1000U, &wd_rem);
- wd_msec = div_u64_rem(wd_nsec, 1000U * 1000U, &wd_rem);
+ cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem);
+ wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem);
pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
if (curr_clocksource == cs)
@@ -1480,7 +1480,7 @@ static int __init boot_override_clocksource(char* str)
{
mutex_lock(&clocksource_mutex);
if (str)
- strlcpy(override_name, str, sizeof(override_name));
+ strscpy(override_name, str, sizeof(override_name));
mutex_unlock(&clocksource_mutex);
return 1;
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index e8c08292defc..238262e4aba7 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -164,6 +164,7 @@ static inline bool is_migration_base(struct hrtimer_clock_base *base)
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
unsigned long *flags)
+ __acquires(&timer->base->lock)
{
struct hrtimer_clock_base *base;
@@ -280,6 +281,7 @@ static inline bool is_migration_base(struct hrtimer_clock_base *base)
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+ __acquires(&timer->base->cpu_base->lock)
{
struct hrtimer_clock_base *base = timer->base;
@@ -1013,6 +1015,7 @@ void hrtimers_resume_local(void)
*/
static inline
void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+ __releases(&timer->base->cpu_base->lock)
{
raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 808a247205a9..b924f0f096fa 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -35,20 +35,17 @@
#include "timekeeping.h"
#include "posix-timers.h"
-/*
- * Management arrays for POSIX timers. Timers are now kept in static hash table
- * with 512 entries.
- * Timer ids are allocated by local routine, which selects proper hash head by
- * key, constructed from current->signal address and per signal struct counter.
- * This keeps timer ids unique per process, but now they can intersect between
- * processes.
- */
+static struct kmem_cache *posix_timers_cache;
/*
- * Lets keep our timers in a slab cache :-)
+ * Timers are managed in a hash table for lockless lookup. The hash key is
+ * constructed from current::signal and the timer ID and the timer is
+ * matched against current::signal and the timer ID when walking the hash
+ * bucket list.
+ *
+ * This allows checkpoint/restore to reconstruct the exact timer IDs for
+ * a process.
*/
-static struct kmem_cache *posix_timers_cache;
-
static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
static DEFINE_SPINLOCK(hash_lock);
@@ -56,52 +53,12 @@ static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;
-/*
- * we assume that the new SIGEV_THREAD_ID shares no bits with the other
- * SIGEV values. Here we put out an error if this assumption fails.
- */
+/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
- ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
+ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif
-/*
- * The timer ID is turned into a timer address by idr_find().
- * Verifying a valid ID consists of:
- *
- * a) checking that idr_find() returns other than -1.
- * b) checking that the timer id matches the one in the timer itself.
- * c) that the timer owner is in the callers thread group.
- */
-
-/*
- * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
- * to implement others. This structure defines the various
- * clocks.
- *
- * RESOLUTION: Clock resolution is used to round up timer and interval
- * times, NOT to report clock times, which are reported with as
- * much resolution as the system can muster. In some cases this
- * resolution may depend on the underlying clock hardware and
- * may not be quantifiable until run time, and only then is the
- * necessary code is written. The standard says we should say
- * something about this issue in the documentation...
- *
- * FUNCTIONS: The CLOCKs structure defines possible functions to
- * handle various clock functions.
- *
- * The standard POSIX timer management code assumes the
- * following: 1.) The k_itimer struct (sched.h) is used for
- * the timer. 2.) The list, it_lock, it_clock, it_id and
- * it_pid fields are not modified by timer code.
- *
- * Permissions: It is assumed that the clock_settime() function defined
- * for each clock will take care of permission checks. Some
- * clocks may be set able by any user (i.e. local process
- * clocks) others not. Currently the only set able clock we
- * have is CLOCK_REALTIME and its high res counter part, both of
- * which we beg off on and pass to do_sys_settimeofday().
- */
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
#define lock_timer(tid, flags) \
@@ -121,9 +78,9 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head,
{
struct k_itimer *timer;
- hlist_for_each_entry_rcu(timer, head, t_hash,
- lockdep_is_held(&hash_lock)) {
- if ((timer->it_signal == sig) && (timer->it_id == id))
+ hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) {
+ /* timer->it_signal can be set concurrently */
+ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id))
return timer;
}
return NULL;
@@ -140,25 +97,30 @@ static struct k_itimer *posix_timer_by_id(timer_t id)
static int posix_timer_add(struct k_itimer *timer)
{
struct signal_struct *sig = current->signal;
- int first_free_id = sig->posix_timer_id;
struct hlist_head *head;
- int ret = -ENOENT;
+ unsigned int cnt, id;
- do {
+ /*
+ * FIXME: Replace this by a per signal struct xarray once there is
+ * a plan to handle the resulting CRIU regression gracefully.
+ */
+ for (cnt = 0; cnt <= INT_MAX; cnt++) {
spin_lock(&hash_lock);
- head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
- if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ id = sig->next_posix_timer_id;
+
+ /* Write the next ID back. Clamp it to the positive space */
+ sig->next_posix_timer_id = (id + 1) & INT_MAX;
+
+ head = &posix_timers_hashtable[hash(sig, id)];
+ if (!__posix_timers_find(head, sig, id)) {
hlist_add_head_rcu(&timer->t_hash, head);
- ret = sig->posix_timer_id;
+ spin_unlock(&hash_lock);
+ return id;
}
- if (++sig->posix_timer_id < 0)
- sig->posix_timer_id = 0;
- if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
- /* Loop over all possible ids completed */
- ret = -EAGAIN;
spin_unlock(&hash_lock);
- } while (ret == -ENOENT);
- return ret;
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
@@ -166,7 +128,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
spin_unlock_irqrestore(&timr->it_lock, flags);
}
-/* Get clock_realtime */
static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_real_ts64(tp);
@@ -178,7 +139,6 @@ static ktime_t posix_get_realtime_ktime(clockid_t which_clock)
return ktime_get_real();
}
-/* Set clock_realtime */
static int posix_clock_realtime_set(const clockid_t which_clock,
const struct timespec64 *tp)
{
@@ -191,9 +151,6 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
return do_adjtimex(t);
}
-/*
- * Get monotonic time for posix timers
- */
static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_ts64(tp);
@@ -206,9 +163,6 @@ static ktime_t posix_get_monotonic_ktime(clockid_t which_clock)
return ktime_get();
}
-/*
- * Get monotonic-raw time for posix timers
- */
static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_raw_ts64(tp);
@@ -216,7 +170,6 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
-
static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_coarse_real_ts64(tp);
@@ -267,9 +220,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
-/*
- * Initialize everything, well, just everything in Posix clocks/timers ;)
- */
static __init int init_posix_timers(void)
{
posix_timers_cache = kmem_cache_create("posix_timers_cache",
@@ -300,15 +250,9 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
}
/*
- * This function is exported for use by the signal deliver code. It is
- * called just prior to the info block being released and passes that
- * block to us. It's function is to update the overrun entry AND to
- * restart the timer. It should only be called if the timer is to be
- * restarted (i.e. we have flagged this in the sys_private entry of the
- * info block).
- *
- * To protect against the timer going away while the interrupt is queued,
- * we require that the it_requeue_pending flag be set.
+ * This function is called from the signal delivery code if
+ * info->si_sys_private is not zero, which indicates that the timer has to
+ * be rearmed. Restart the timer and update info::si_overrun.
*/
void posixtimer_rearm(struct kernel_siginfo *info)
{
@@ -357,18 +301,18 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
}
/*
- * This function gets called when a POSIX.1b interval timer expires. It
- * is used as a callback from the kernel internal timer. The
- * run_timer_list code ALWAYS calls with interrupts on.
-
- * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
+ * This function gets called when a POSIX.1b interval timer expires from
+ * the HRTIMER interrupt (soft interrupt on RT kernels).
+ *
+ * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI
+ * based timers.
*/
static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
{
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
struct k_itimer *timr;
unsigned long flags;
int si_private = 0;
- enum hrtimer_restart ret = HRTIMER_NORESTART;
timr = container_of(timer, struct k_itimer, it.real.timer);
spin_lock_irqsave(&timr->it_lock, flags);
@@ -379,9 +323,10 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
if (posix_timer_event(timr, si_private)) {
/*
- * signal was not sent because of sig_ignor
- * we will not get a call back to restart it AND
- * it should be restarted.
+ * The signal was not queued due to SIG_IGN. As a
+ * consequence the timer is not going to be rearmed from
+ * the signal delivery path. But as a real signal handler
+ * can be installed later the timer must be rearmed here.
*/
if (timr->it_interval != 0) {
ktime_t now = hrtimer_cb_get_time(timer);
@@ -390,34 +335,35 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
* FIXME: What we really want, is to stop this
* timer completely and restart it in case the
* SIG_IGN is removed. This is a non trivial
- * change which involves sighand locking
- * (sigh !), which we don't want to do late in
- * the release cycle.
+ * change to the signal handling code.
+ *
+ * For now let timers with an interval less than a
+ * jiffie expire every jiffie and recheck for a
+ * valid signal handler.
+ *
+ * This avoids interrupt starvation in case of a
+ * very small interval, which would expire the
+ * timer immediately again.
+ *
+ * Moving now ahead of time by one jiffie tricks
+ * hrtimer_forward() to expire the timer later,
+ * while it still maintains the overrun accuracy
+ * for the price of a slight inconsistency in the
+ * timer_gettime() case. This is at least better
+ * than a timer storm.
*
- * For now we just let timers with an interval
- * less than a jiffie expire every jiffie to
- * avoid softirq starvation in case of SIG_IGN
- * and a very small interval, which would put
- * the timer right back on the softirq pending
- * list. By moving now ahead of time we trick
- * hrtimer_forward() to expire the timer
- * later, while we still maintain the overrun
- * accuracy, but have some inconsistency in
- * the timer_gettime() case. This is at least
- * better than a starved softirq. A more
- * complex fix which solves also another related
- * inconsistency is already in the pipeline.
+ * Only required when high resolution timers are
+ * enabled as the periodic tick based timers are
+ * automatically aligned to the next tick.
*/
-#ifdef CONFIG_HIGH_RES_TIMERS
- {
- ktime_t kj = NSEC_PER_SEC / HZ;
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) {
+ ktime_t kj = TICK_NSEC;
if (timr->it_interval < kj)
now = ktime_add(now, kj);
}
-#endif
- timr->it_overrun += hrtimer_forward(timer, now,
- timr->it_interval);
+
+ timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval);
ret = HRTIMER_RESTART;
++timr->it_requeue_pending;
timr->it_active = 1;
@@ -454,8 +400,8 @@ static struct pid *good_sigevent(sigevent_t * event)
static struct k_itimer * alloc_posix_timer(void)
{
- struct k_itimer *tmr;
- tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
+ struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
+
if (!tmr)
return tmr;
if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
@@ -473,21 +419,21 @@ static void k_itimer_rcu_free(struct rcu_head *head)
kmem_cache_free(posix_timers_cache, tmr);
}
-#define IT_ID_SET 1
-#define IT_ID_NOT_SET 0
-static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
+static void posix_timer_free(struct k_itimer *tmr)
{
- if (it_id_set) {
- unsigned long flags;
- spin_lock_irqsave(&hash_lock, flags);
- hlist_del_rcu(&tmr->t_hash);
- spin_unlock_irqrestore(&hash_lock, flags);
- }
put_pid(tmr->it_pid);
sigqueue_free(tmr->sigq);
call_rcu(&tmr->rcu, k_itimer_rcu_free);
}
+static void posix_timer_unhash_and_free(struct k_itimer *tmr)
+{
+ spin_lock(&hash_lock);
+ hlist_del_rcu(&tmr->t_hash);
+ spin_unlock(&hash_lock);
+ posix_timer_free(tmr);
+}
+
static int common_timer_create(struct k_itimer *new_timer)
{
hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
@@ -501,7 +447,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct k_itimer *new_timer;
int error, new_timer_id;
- int it_id_set = IT_ID_NOT_SET;
if (!kc)
return -EINVAL;
@@ -513,13 +458,18 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
return -EAGAIN;
spin_lock_init(&new_timer->it_lock);
+
+ /*
+ * Add the timer to the hash table. The timer is not yet valid
+ * because new_timer::it_signal is still NULL. The timer id is also
+ * not yet visible to user space.
+ */
new_timer_id = posix_timer_add(new_timer);
if (new_timer_id < 0) {
- error = new_timer_id;
- goto out;
+ posix_timer_free(new_timer);
+ return new_timer_id;
}
- it_id_set = IT_ID_SET;
new_timer->it_id = (timer_t) new_timer_id;
new_timer->it_clock = which_clock;
new_timer->kclock = kc;
@@ -547,30 +497,33 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
new_timer->sigq->info.si_tid = new_timer->it_id;
new_timer->sigq->info.si_code = SI_TIMER;
- if (copy_to_user(created_timer_id,
- &new_timer_id, sizeof (new_timer_id))) {
+ if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) {
error = -EFAULT;
goto out;
}
-
+ /*
+ * After succesful copy out, the timer ID is visible to user space
+ * now but not yet valid because new_timer::signal is still NULL.
+ *
+ * Complete the initialization with the clock specific create
+ * callback.
+ */
error = kc->timer_create(new_timer);
if (error)
goto out;
spin_lock_irq(&current->sighand->siglock);
- new_timer->it_signal = current->signal;
+ /* This makes the timer valid in the hash table */
+ WRITE_ONCE(new_timer->it_signal, current->signal);
list_add(&new_timer->list, &current->signal->posix_timers);
spin_unlock_irq(&current->sighand->siglock);
-
- return 0;
/*
- * In the case of the timer belonging to another task, after
- * the task is unlocked, the timer is owned by the other task
- * and may cease to exist at any time. Don't use or modify
- * new_timer after the unlock call.
+ * After unlocking sighand::siglock @new_timer is subject to
+ * concurrent removal and cannot be touched anymore
*/
+ return 0;
out:
- release_posix_timer(new_timer, it_id_set);
+ posix_timer_unhash_and_free(new_timer);
return error;
}
@@ -604,13 +557,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
}
#endif
-/*
- * Locking issues: We need to protect the result of the id look up until
- * we get the timer locked down so it is not deleted under us. The
- * removal is done under the idr spinlock so we use that here to bridge
- * the find to the timer lock. To avoid a dead lock, the timer id MUST
- * be release with out holding the timer lock.
- */
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
@@ -622,10 +568,35 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
if ((unsigned long long)timer_id > INT_MAX)
return NULL;
+ /*
+ * The hash lookup and the timers are RCU protected.
+ *
+ * Timers are added to the hash in invalid state where
+ * timr::it_signal == NULL. timer::it_signal is only set after the
+ * rest of the initialization succeeded.
+ *
+ * Timer destruction happens in steps:
+ * 1) Set timr::it_signal to NULL with timr::it_lock held
+ * 2) Release timr::it_lock
+ * 3) Remove from the hash under hash_lock
+ * 4) Call RCU for removal after the grace period
+ *
+ * Holding rcu_read_lock() accross the lookup ensures that
+ * the timer cannot be freed.
+ *
+ * The lookup validates locklessly that timr::it_signal ==
+ * current::it_signal and timr::it_id == @timer_id. timr::it_id
+ * can't change, but timr::it_signal becomes NULL during
+ * destruction.
+ */
rcu_read_lock();
timr = posix_timer_by_id(timer_id);
if (timr) {
spin_lock_irqsave(&timr->it_lock, *flags);
+ /*
+ * Validate under timr::it_lock that timr::it_signal is
+ * still valid. Pairs with #1 above.
+ */
if (timr->it_signal == current->signal) {
rcu_read_unlock();
return timr;
@@ -652,20 +623,16 @@ static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now)
}
/*
- * Get the time remaining on a POSIX.1b interval timer. This function
- * is ALWAYS called with spin_lock_irq on the timer, thus it must not
- * mess with irq.
+ * Get the time remaining on a POSIX.1b interval timer.
*
- * We have a couple of messes to clean up here. First there is the case
- * of a timer that has a requeue pending. These timers should appear to
- * be in the timer list with an expiry as if we were to requeue them
- * now.
+ * Two issues to handle here:
*
- * The second issue is the SIGEV_NONE timer which may be active but is
- * not really ever put in the timer list (to save system resources).
- * This timer may be expired, and if so, we will do it here. Otherwise
- * it is the same as a requeue pending timer WRT to what we should
- * report.
+ * 1) The timer has a requeue pending. The return value must appear as
+ * if the timer has been requeued right now.
+ *
+ * 2) The timer is a SIGEV_NONE timer. These timers are never enqueued
+ * into the hrtimer queue and therefore never expired. Emulate expiry
+ * here taking #1 into account.
*/
void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
{
@@ -681,8 +648,12 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
cur_setting->it_interval = ktime_to_timespec64(iv);
} else if (!timr->it_active) {
/*
- * SIGEV_NONE oneshot timers are never queued. Check them
- * below.
+ * SIGEV_NONE oneshot timers are never queued and therefore
+ * timr->it_active is always false. The check below
+ * vs. remaining time will handle this case.
+ *
+ * For all other timers there is nothing to update here, so
+ * return.
*/
if (!sig_none)
return;
@@ -691,18 +662,29 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
now = kc->clock_get_ktime(timr->it_clock);
/*
- * When a requeue is pending or this is a SIGEV_NONE timer move the
- * expiry time forward by intervals, so expiry is > now.
+ * If this is an interval timer and either has requeue pending or
+ * is a SIGEV_NONE timer move the expiry time forward by intervals,
+ * so expiry is > now.
*/
if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none))
timr->it_overrun += kc->timer_forward(timr, now);
remaining = kc->timer_remaining(timr, now);
- /* Return 0 only, when the timer is expired and not pending */
+ /*
+ * As @now is retrieved before a possible timer_forward() and
+ * cannot be reevaluated by the compiler @remaining is based on the
+ * same @now value. Therefore @remaining is consistent vs. @now.
+ *
+ * Consequently all interval timers, i.e. @iv > 0, cannot have a
+ * remaining time <= 0 because timer_forward() guarantees to move
+ * them forward so that the next timer expiry is > @now.
+ */
if (remaining <= 0) {
/*
- * A single shot SIGEV_NONE timer must return 0, when
- * it is expired !
+ * A single shot SIGEV_NONE timer must return 0, when it is
+ * expired! Timers which have a real signal delivery mode
+ * must return a remaining time greater than 0 because the
+ * signal has not yet been delivered.
*/
if (!sig_none)
cur_setting->it_value.tv_nsec = 1;
@@ -711,11 +693,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
}
}
-/* Get the time remaining on a POSIX.1b interval timer. */
static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
{
- struct k_itimer *timr;
const struct k_clock *kc;
+ struct k_itimer *timr;
unsigned long flags;
int ret = 0;
@@ -765,20 +746,29 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
#endif
-/*
- * Get the number of overruns of a POSIX.1b interval timer. This is to
- * be the overrun of the timer last delivered. At the same time we are
- * accumulating overruns on the next timer. The overrun is frozen when
- * the signal is delivered, either at the notify time (if the info block
- * is not queued) or at the actual delivery time (as we are informed by
- * the call back to posixtimer_rearm(). So all we need to do is
- * to pick up the frozen overrun.
+/**
+ * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer
+ * @timer_id: The timer ID which identifies the timer
+ *
+ * The "overrun count" of a timer is one plus the number of expiration
+ * intervals which have elapsed between the first expiry, which queues the
+ * signal and the actual signal delivery. On signal delivery the "overrun
+ * count" is calculated and cached, so it can be returned directly here.
+ *
+ * As this is relative to the last queued signal the returned overrun count
+ * is meaningless outside of the signal delivery path and even there it
+ * does not accurately reflect the current state when user space evaluates
+ * it.
+ *
+ * Returns:
+ * -EINVAL @timer_id is invalid
+ * 1..INT_MAX The number of overruns related to the last delivered signal
*/
SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
{
struct k_itimer *timr;
- int overrun;
unsigned long flags;
+ int overrun;
timr = lock_timer(timer_id, &flags);
if (!timr)
@@ -831,10 +821,18 @@ static void common_timer_wait_running(struct k_itimer *timer)
}
/*
- * On PREEMPT_RT this prevent priority inversion against softirq kthread in
- * case it gets preempted while executing a timer callback. See comments in
- * hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a
- * cpu_relax().
+ * On PREEMPT_RT this prevents priority inversion and a potential livelock
+ * against the ksoftirqd thread in case that ksoftirqd gets preempted while
+ * executing a hrtimer callback.
+ *
+ * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this
+ * just results in a cpu_relax().
+ *
+ * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is
+ * just a cpu_relax(). With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this
+ * prevents spinning on an eventually scheduled out task and a livelock
+ * when the task which tries to delete or disarm the timer has preempted
+ * the task which runs the expiry in task work context.
*/
static struct k_itimer *timer_wait_running(struct k_itimer *timer,
unsigned long *flags)
@@ -943,8 +941,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
const struct __kernel_itimerspec __user *, new_setting,
struct __kernel_itimerspec __user *, old_setting)
{
- struct itimerspec64 new_spec, old_spec;
- struct itimerspec64 *rtn = old_setting ? &old_spec : NULL;
+ struct itimerspec64 new_spec, old_spec, *rtn;
int error = 0;
if (!new_setting)
@@ -953,6 +950,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
if (get_itimerspec64(&new_spec, new_setting))
return -EFAULT;
+ rtn = old_setting ? &old_spec : NULL;
error = do_timer_settime(timer_id, flags, &new_spec, rtn);
if (!error && old_setting) {
if (put_itimerspec64(&old_spec, old_setting))
@@ -1026,38 +1024,71 @@ retry_delete:
list_del(&timer->list);
spin_unlock(&current->sighand->siglock);
/*
- * This keeps any tasks waiting on the spin lock from thinking
- * they got something (see the lock code above).
+ * A concurrent lookup could check timer::it_signal lockless. It
+ * will reevaluate with timer::it_lock held and observe the NULL.
*/
- timer->it_signal = NULL;
+ WRITE_ONCE(timer->it_signal, NULL);
unlock_timer(timer, flags);
- release_posix_timer(timer, IT_ID_SET);
+ posix_timer_unhash_and_free(timer);
return 0;
}
/*
- * return timer owned by the process, used by exit_itimers
+ * Delete a timer if it is armed, remove it from the hash and schedule it
+ * for RCU freeing.
*/
static void itimer_delete(struct k_itimer *timer)
{
-retry_delete:
- spin_lock_irq(&timer->it_lock);
+ unsigned long flags;
+ /*
+ * irqsave is required to make timer_wait_running() work.
+ */
+ spin_lock_irqsave(&timer->it_lock, flags);
+
+retry_delete:
+ /*
+ * Even if the timer is not longer accessible from other tasks
+ * it still might be armed and queued in the underlying timer
+ * mechanism. Worse, that timer mechanism might run the expiry
+ * function concurrently.
+ */
if (timer_delete_hook(timer) == TIMER_RETRY) {
- spin_unlock_irq(&timer->it_lock);
+ /*
+ * Timer is expired concurrently, prevent livelocks
+ * and pointless spinning on RT.
+ *
+ * timer_wait_running() drops timer::it_lock, which opens
+ * the possibility for another task to delete the timer.
+ *
+ * That's not possible here because this is invoked from
+ * do_exit() only for the last thread of the thread group.
+ * So no other task can access and delete that timer.
+ */
+ if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer))
+ return;
+
goto retry_delete;
}
list_del(&timer->list);
- spin_unlock_irq(&timer->it_lock);
- release_posix_timer(timer, IT_ID_SET);
+ /*
+ * Setting timer::it_signal to NULL is technically not required
+ * here as nothing can access the timer anymore legitimately via
+ * the hash table. Set it to NULL nevertheless so that all deletion
+ * paths are consistent.
+ */
+ WRITE_ONCE(timer->it_signal, NULL);
+
+ spin_unlock_irqrestore(&timer->it_lock, flags);
+ posix_timer_unhash_and_free(timer);
}
/*
- * This is called by do_exit or de_thread, only when nobody else can
- * modify the signal->posix_timers list. Yet we need sighand->siglock
- * to prevent the race with /proc/pid/timers.
+ * Invoked from do_exit() when the last thread of a thread group exits.
+ * At that point no other task can access the timers of the dying
+ * task anymore.
*/
void exit_itimers(struct task_struct *tsk)
{
@@ -1067,10 +1098,12 @@ void exit_itimers(struct task_struct *tsk)
if (list_empty(&tsk->signal->posix_timers))
return;
+ /* Protect against concurrent read via /proc/$PID/timers */
spin_lock_irq(&tsk->sighand->siglock);
list_replace_init(&tsk->signal->posix_timers, &timers);
spin_unlock_irq(&tsk->sighand->siglock);
+ /* The timers are not longer accessible via tsk::signal */
while (!list_empty(&timers)) {
tmr = list_first_entry(&timers, struct k_itimer, list);
itimer_delete(tmr);
@@ -1089,6 +1122,10 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
if (get_timespec64(&new_tp, tp))
return -EFAULT;
+ /*
+ * Permission checks have to be done inside the clock specific
+ * setter callback.
+ */
return kc->clock_set(which_clock, &new_tp);
}
@@ -1139,6 +1176,79 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
return err;
}
+/**
+ * sys_clock_getres - Get the resolution of a clock
+ * @which_clock: The clock to get the resolution for
+ * @tp: Pointer to a a user space timespec64 for storage
+ *
+ * POSIX defines:
+ *
+ * "The clock_getres() function shall return the resolution of any
+ * clock. Clock resolutions are implementation-defined and cannot be set by
+ * a process. If the argument res is not NULL, the resolution of the
+ * specified clock shall be stored in the location pointed to by res. If
+ * res is NULL, the clock resolution is not returned. If the time argument
+ * of clock_settime() is not a multiple of res, then the value is truncated
+ * to a multiple of res."
+ *
+ * Due to the various hardware constraints the real resolution can vary
+ * wildly and even change during runtime when the underlying devices are
+ * replaced. The kernel also can use hardware devices with different
+ * resolutions for reading the time and for arming timers.
+ *
+ * The kernel therefore deviates from the POSIX spec in various aspects:
+ *
+ * 1) The resolution returned to user space
+ *
+ * For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI,
+ * CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALAREM and CLOCK_MONOTONIC_RAW
+ * the kernel differentiates only two cases:
+ *
+ * I) Low resolution mode:
+ *
+ * When high resolution timers are disabled at compile or runtime
+ * the resolution returned is nanoseconds per tick, which represents
+ * the precision at which timers expire.
+ *
+ * II) High resolution mode:
+ *
+ * When high resolution timers are enabled the resolution returned
+ * is always one nanosecond independent of the actual resolution of
+ * the underlying hardware devices.
+ *
+ * For CLOCK_*_ALARM the actual resolution depends on system
+ * state. When system is running the resolution is the same as the
+ * resolution of the other clocks. During suspend the actual
+ * resolution is the resolution of the underlying RTC device which
+ * might be way less precise than the clockevent device used during
+ * running state.
+ *
+ * For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution
+ * returned is always nanoseconds per tick.
+ *
+ * For CLOCK_PROCESS_CPUTIME and CLOCK_THREAD_CPUTIME the resolution
+ * returned is always one nanosecond under the assumption that the
+ * underlying scheduler clock has a better resolution than nanoseconds
+ * per tick.
+ *
+ * For dynamic POSIX clocks (PTP devices) the resolution returned is
+ * always one nanosecond.
+ *
+ * 2) Affect on sys_clock_settime()
+ *
+ * The kernel does not truncate the time which is handed in to
+ * sys_clock_settime(). The kernel internal timekeeping is always using
+ * nanoseconds precision independent of the clocksource device which is
+ * used to read the time from. The resolution of that device only
+ * affects the presicion of the time returned by sys_clock_gettime().
+ *
+ * Returns:
+ * 0 Success. @tp contains the resolution
+ * -EINVAL @which_clock is not a valid clock ID
+ * -EFAULT Copying the resolution to @tp faulted
+ * -ENODEV Dynamic POSIX clock is not backed by a device
+ * -EOPNOTSUPP Dynamic POSIX clock does not support getres()
+ */
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
struct __kernel_timespec __user *, tp)
{
@@ -1230,7 +1340,7 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
#endif
/*
- * nanosleep for monotonic and realtime clocks
+ * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI
*/
static int common_nsleep(const clockid_t which_clock, int flags,
const struct timespec64 *rqtp)
@@ -1242,8 +1352,13 @@ static int common_nsleep(const clockid_t which_clock, int flags,
which_clock);
}
+/*
+ * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME
+ *
+ * Absolute nanosleeps for these clocks are time-namespace adjusted.
+ */
static int common_nsleep_timens(const clockid_t which_clock, int flags,
- const struct timespec64 *rqtp)
+ const struct timespec64 *rqtp)
{
ktime_t texp = timespec64_to_ktime(*rqtp);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 8464c5acc913..68d6c1190ac7 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -64,7 +64,7 @@ static struct clock_data cd ____cacheline_aligned = {
.actual_read_sched_clock = jiffy_sched_clock_read,
};
-static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
+static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift)
{
return (cyc * mult) >> shift;
}
@@ -77,26 +77,36 @@ notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
notrace int sched_clock_read_retry(unsigned int seq)
{
- return read_seqcount_latch_retry(&cd.seq, seq);
+ return raw_read_seqcount_latch_retry(&cd.seq, seq);
}
-unsigned long long notrace sched_clock(void)
+unsigned long long noinstr sched_clock_noinstr(void)
{
- u64 cyc, res;
- unsigned int seq;
struct clock_read_data *rd;
+ unsigned int seq;
+ u64 cyc, res;
do {
- rd = sched_clock_read_begin(&seq);
+ seq = raw_read_seqcount_latch(&cd.seq);
+ rd = cd.read_data + (seq & 1);
cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
rd->sched_clock_mask;
res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
- } while (sched_clock_read_retry(seq));
+ } while (raw_read_seqcount_latch_retry(&cd.seq, seq));
return res;
}
+unsigned long long notrace sched_clock(void)
+{
+ unsigned long long ns;
+ preempt_disable_notrace();
+ ns = sched_clock_noinstr();
+ preempt_enable_notrace();
+ return ns;
+}
+
/*
* Updating the data required to read the clock.
*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 42c0be3080bd..87015e9deacc 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1041,11 +1041,11 @@ static bool report_idle_softirq(void)
return false;
}
- if (ratelimit < 10)
+ if (ratelimit >= 10)
return false;
/* On RT, softirqs handling may be waiting on some lock */
- if (!local_bh_blocked())
+ if (local_bh_blocked())
return false;
pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
diff --git a/kernel/time/time.c b/kernel/time/time.c
index f4198af60fee..642647f5046b 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -365,11 +365,14 @@ SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
}
#endif
-/*
- * Convert jiffies to milliseconds and back.
+/**
+ * jiffies_to_msecs - Convert jiffies to milliseconds
+ * @j: jiffies value
*
* Avoid unnecessary multiplications/divisions in the
- * two most common HZ cases:
+ * two most common HZ cases.
+ *
+ * Return: milliseconds value
*/
unsigned int jiffies_to_msecs(const unsigned long j)
{
@@ -388,6 +391,12 @@ unsigned int jiffies_to_msecs(const unsigned long j)
}
EXPORT_SYMBOL(jiffies_to_msecs);
+/**
+ * jiffies_to_usecs - Convert jiffies to microseconds
+ * @j: jiffies value
+ *
+ * Return: microseconds value
+ */
unsigned int jiffies_to_usecs(const unsigned long j)
{
/*
@@ -408,8 +417,15 @@ unsigned int jiffies_to_usecs(const unsigned long j)
}
EXPORT_SYMBOL(jiffies_to_usecs);
-/*
+/**
* mktime64 - Converts date to seconds.
+ * @year0: year to convert
+ * @mon0: month to convert
+ * @day: day to convert
+ * @hour: hour to convert
+ * @min: minute to convert
+ * @sec: second to convert
+ *
* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
* Assumes input in normal date format, i.e. 1980-12-31 23:59:59
* => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
@@ -427,6 +443,8 @@ EXPORT_SYMBOL(jiffies_to_usecs);
*
* An encoding of midnight at the end of the day as 24:00:00 - ie. midnight
* tomorrow - (allowable under ISO 8601) is supported.
+ *
+ * Return: seconds since the epoch time for the given input date
*/
time64_t mktime64(const unsigned int year0, const unsigned int mon0,
const unsigned int day, const unsigned int hour,
@@ -471,8 +489,7 @@ EXPORT_SYMBOL(ns_to_kernel_old_timeval);
* Set seconds and nanoseconds field of a timespec variable and
* normalize to the timespec storage format
*
- * Note: The tv_nsec part is always in the range of
- * 0 <= tv_nsec < NSEC_PER_SEC
+ * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC.
* For negative values only the tv_sec field is negative !
*/
void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
@@ -501,7 +518,7 @@ EXPORT_SYMBOL(set_normalized_timespec64);
* ns_to_timespec64 - Convert nanoseconds to timespec64
* @nsec: the nanoseconds value to be converted
*
- * Returns the timespec64 representation of the nsec parameter.
+ * Return: the timespec64 representation of the nsec parameter.
*/
struct timespec64 ns_to_timespec64(s64 nsec)
{
@@ -548,6 +565,8 @@ EXPORT_SYMBOL(ns_to_timespec64);
* runtime.
* The _msecs_to_jiffies helpers are the HZ dependent conversion
* routines found in include/linux/jiffies.h
+ *
+ * Return: jiffies value
*/
unsigned long __msecs_to_jiffies(const unsigned int m)
{
@@ -560,6 +579,12 @@ unsigned long __msecs_to_jiffies(const unsigned int m)
}
EXPORT_SYMBOL(__msecs_to_jiffies);
+/**
+ * __usecs_to_jiffies: - convert microseconds to jiffies
+ * @u: time in milliseconds
+ *
+ * Return: jiffies value
+ */
unsigned long __usecs_to_jiffies(const unsigned int u)
{
if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
@@ -568,7 +593,10 @@ unsigned long __usecs_to_jiffies(const unsigned int u)
}
EXPORT_SYMBOL(__usecs_to_jiffies);
-/*
+/**
+ * timespec64_to_jiffies - convert a timespec64 value to jiffies
+ * @value: pointer to &struct timespec64
+ *
* The TICK_NSEC - 1 rounds up the value to the next resolution. Note
* that a remainder subtract here would not do the right thing as the
* resolution values don't fall on second boundaries. I.e. the line:
@@ -582,8 +610,9 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
*
* The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
* value to a scaled second value.
+ *
+ * Return: jiffies value
*/
-
unsigned long
timespec64_to_jiffies(const struct timespec64 *value)
{
@@ -601,6 +630,11 @@ timespec64_to_jiffies(const struct timespec64 *value)
}
EXPORT_SYMBOL(timespec64_to_jiffies);
+/**
+ * jiffies_to_timespec64 - convert jiffies value to &struct timespec64
+ * @jiffies: jiffies value
+ * @value: pointer to &struct timespec64
+ */
void
jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
{
@@ -618,6 +652,13 @@ EXPORT_SYMBOL(jiffies_to_timespec64);
/*
* Convert jiffies/jiffies_64 to clock_t and back.
*/
+
+/**
+ * jiffies_to_clock_t - Convert jiffies to clock_t
+ * @x: jiffies value
+ *
+ * Return: jiffies converted to clock_t (CLOCKS_PER_SEC)
+ */
clock_t jiffies_to_clock_t(unsigned long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
@@ -632,6 +673,12 @@ clock_t jiffies_to_clock_t(unsigned long x)
}
EXPORT_SYMBOL(jiffies_to_clock_t);
+/**
+ * clock_t_to_jiffies - Convert clock_t to jiffies
+ * @x: clock_t value
+ *
+ * Return: clock_t value converted to jiffies
+ */
unsigned long clock_t_to_jiffies(unsigned long x)
{
#if (HZ % USER_HZ)==0
@@ -649,6 +696,12 @@ unsigned long clock_t_to_jiffies(unsigned long x)
}
EXPORT_SYMBOL(clock_t_to_jiffies);
+/**
+ * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t
+ * @x: jiffies_64 value
+ *
+ * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
+ */
u64 jiffies_64_to_clock_t(u64 x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
@@ -671,6 +724,12 @@ u64 jiffies_64_to_clock_t(u64 x)
}
EXPORT_SYMBOL(jiffies_64_to_clock_t);
+/**
+ * nsec_to_clock_t - Convert nsec value to clock_t
+ * @x: nsec value
+ *
+ * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
+ */
u64 nsec_to_clock_t(u64 x)
{
#if (NSEC_PER_SEC % USER_HZ) == 0
@@ -687,6 +746,12 @@ u64 nsec_to_clock_t(u64 x)
#endif
}
+/**
+ * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds
+ * @j: jiffies64 value
+ *
+ * Return: nanoseconds value
+ */
u64 jiffies64_to_nsecs(u64 j)
{
#if !(NSEC_PER_SEC % HZ)
@@ -697,6 +762,12 @@ u64 jiffies64_to_nsecs(u64 j)
}
EXPORT_SYMBOL(jiffies64_to_nsecs);
+/**
+ * jiffies64_to_msecs - Convert jiffies64 to milliseconds
+ * @j: jiffies64 value
+ *
+ * Return: milliseconds value
+ */
u64 jiffies64_to_msecs(const u64 j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
@@ -719,6 +790,8 @@ EXPORT_SYMBOL(jiffies64_to_msecs);
* note:
* NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
* ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ *
+ * Return: nsecs converted to jiffies64 value
*/
u64 nsecs_to_jiffies64(u64 n)
{
@@ -750,6 +823,8 @@ EXPORT_SYMBOL(nsecs_to_jiffies64);
* note:
* NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
* ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ *
+ * Return: nsecs converted to jiffies value
*/
unsigned long nsecs_to_jiffies(u64 n)
{
@@ -757,10 +832,16 @@ unsigned long nsecs_to_jiffies(u64 n)
}
EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
-/*
- * Add two timespec64 values and do a safety check for overflow.
+/**
+ * timespec64_add_safe - Add two timespec64 values and do a safety check
+ * for overflow.
+ * @lhs: first (left) timespec64 to add
+ * @rhs: second (right) timespec64 to add
+ *
* It's assumed that both values are valid (>= 0).
* And, each timespec64 is in normalized form.
+ *
+ * Return: sum of @lhs + @rhs
*/
struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
const struct timespec64 rhs)
@@ -778,6 +859,15 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
return res;
}
+/**
+ * get_timespec64 - get user's time value into kernel space
+ * @ts: destination &struct timespec64
+ * @uts: user's time value as &struct __kernel_timespec
+ *
+ * Handles compat or 32-bit modes.
+ *
+ * Return: %0 on success or negative errno on error
+ */
int get_timespec64(struct timespec64 *ts,
const struct __kernel_timespec __user *uts)
{
@@ -801,6 +891,14 @@ int get_timespec64(struct timespec64 *ts,
}
EXPORT_SYMBOL_GPL(get_timespec64);
+/**
+ * put_timespec64 - convert timespec64 value to __kernel_timespec format and
+ * copy the latter to userspace
+ * @ts: input &struct timespec64
+ * @uts: user's &struct __kernel_timespec
+ *
+ * Return: %0 on success or negative errno on error
+ */
int put_timespec64(const struct timespec64 *ts,
struct __kernel_timespec __user *uts)
{
@@ -839,6 +937,15 @@ static int __put_old_timespec32(const struct timespec64 *ts64,
return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
}
+/**
+ * get_old_timespec32 - get user's old-format time value into kernel space
+ * @ts: destination &struct timespec64
+ * @uts: user's old-format time value (&struct old_timespec32)
+ *
+ * Handles X86_X32_ABI compatibility conversion.
+ *
+ * Return: %0 on success or negative errno on error
+ */
int get_old_timespec32(struct timespec64 *ts, const void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
@@ -848,6 +955,16 @@ int get_old_timespec32(struct timespec64 *ts, const void __user *uts)
}
EXPORT_SYMBOL_GPL(get_old_timespec32);
+/**
+ * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and
+ * copy the latter to userspace
+ * @ts: input &struct timespec64
+ * @uts: user's &struct old_timespec32
+ *
+ * Handles X86_X32_ABI compatibility conversion.
+ *
+ * Return: %0 on success or negative errno on error
+ */
int put_old_timespec32(const struct timespec64 *ts, void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
@@ -857,6 +974,13 @@ int put_old_timespec32(const struct timespec64 *ts, void __user *uts)
}
EXPORT_SYMBOL_GPL(put_old_timespec32);
+/**
+ * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space
+ * @it: destination &struct itimerspec64
+ * @uit: user's &struct __kernel_itimerspec
+ *
+ * Return: %0 on success or negative errno on error
+ */
int get_itimerspec64(struct itimerspec64 *it,
const struct __kernel_itimerspec __user *uit)
{
@@ -872,6 +996,14 @@ int get_itimerspec64(struct itimerspec64 *it,
}
EXPORT_SYMBOL_GPL(get_itimerspec64);
+/**
+ * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format
+ * and copy the latter to userspace
+ * @it: input &struct itimerspec64
+ * @uit: user's &struct __kernel_itimerspec
+ *
+ * Return: %0 on success or negative errno on error
+ */
int put_itimerspec64(const struct itimerspec64 *it,
struct __kernel_itimerspec __user *uit)
{
@@ -887,6 +1019,13 @@ int put_itimerspec64(const struct itimerspec64 *it,
}
EXPORT_SYMBOL_GPL(put_itimerspec64);
+/**
+ * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space
+ * @its: destination &struct itimerspec64
+ * @uits: user's &struct old_itimerspec32
+ *
+ * Return: %0 on success or negative errno on error
+ */
int get_old_itimerspec32(struct itimerspec64 *its,
const struct old_itimerspec32 __user *uits)
{
@@ -898,6 +1037,14 @@ int get_old_itimerspec32(struct itimerspec64 *its,
}
EXPORT_SYMBOL_GPL(get_old_itimerspec32);
+/**
+ * put_old_itimerspec32 - convert &struct itimerspec64 to &struct
+ * old_itimerspec32 and copy the latter to userspace
+ * @its: input &struct itimerspec64
+ * @uits: user's &struct old_itimerspec32
+ *
+ * Return: %0 on success or negative errno on error
+ */
int put_old_itimerspec32(const struct itimerspec64 *its,
struct old_itimerspec32 __user *uits)
{
diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c
index 831e8e779ace..ca058c8af6ba 100644
--- a/kernel/time/time_test.c
+++ b/kernel/time/time_test.c
@@ -86,7 +86,7 @@ static void time64_to_tm_test_date_range(struct kunit *test)
}
static struct kunit_case time_test_cases[] = {
- KUNIT_CASE(time64_to_tm_test_date_range),
+ KUNIT_CASE_SLOW(time64_to_tm_test_date_range),
{}
};
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 09d594900ee0..266d02809dbb 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -450,7 +450,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
now += fast_tk_get_delta_ns(tkr);
- } while (read_seqcount_latch_retry(&tkf->seq, seq));
+ } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
return now;
}
@@ -566,7 +566,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
basem = ktime_to_ns(tkr->base);
baser = ktime_to_ns(tkr->base_real);
delta = fast_tk_get_delta_ns(tkr);
- } while (read_seqcount_latch_retry(&tkf->seq, seq));
+ } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
if (mono)
*mono = basem + delta;
diff --git a/kernel/torture.c b/kernel/torture.c
index 1a0519b836ac..b28b05bbef02 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -37,6 +37,7 @@
#include <linux/ktime.h>
#include <asm/byteorder.h>
#include <linux/torture.h>
+#include <linux/sched/rt.h>
#include "rcu/rcu.h"
MODULE_LICENSE("GPL");
@@ -54,6 +55,9 @@ module_param(verbose_sleep_frequency, int, 0444);
static int verbose_sleep_duration = 1;
module_param(verbose_sleep_duration, int, 0444);
+static int random_shuffle;
+module_param(random_shuffle, int, 0444);
+
static char *torture_type;
static int verbose;
@@ -88,8 +92,8 @@ int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_s
ktime_t hto = baset_ns;
if (trsp)
- hto += (torture_random(trsp) >> 3) % fuzzt_ns;
- set_current_state(TASK_UNINTERRUPTIBLE);
+ hto += torture_random(trsp) % fuzzt_ns;
+ set_current_state(TASK_IDLE);
return schedule_hrtimeout(&hto, HRTIMER_MODE_REL);
}
EXPORT_SYMBOL_GPL(torture_hrtimeout_ns);
@@ -350,22 +354,22 @@ torture_onoff(void *arg)
if (onoff_holdoff > 0) {
VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
- schedule_timeout_interruptible(onoff_holdoff);
+ torture_hrtimeout_jiffies(onoff_holdoff, &rand);
VERBOSE_TOROUT_STRING("torture_onoff end holdoff");
}
while (!torture_must_stop()) {
if (disable_onoff_at_boot && !rcu_inkernel_boot_has_ended()) {
- schedule_timeout_interruptible(HZ / 10);
+ torture_hrtimeout_jiffies(HZ / 10, &rand);
continue;
}
- cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
+ cpu = torture_random(&rand) % (maxcpu + 1);
if (!torture_offline(cpu,
&n_offline_attempts, &n_offline_successes,
&sum_offline, &min_offline, &max_offline))
torture_online(cpu,
&n_online_attempts, &n_online_successes,
&sum_online, &min_online, &max_online);
- schedule_timeout_interruptible(onoff_interval);
+ torture_hrtimeout_jiffies(onoff_interval, &rand);
}
stop:
@@ -518,6 +522,7 @@ static void torture_shuffle_task_unregister_all(void)
*/
static void torture_shuffle_tasks(void)
{
+ DEFINE_TORTURE_RANDOM(rand);
struct shuffle_task *stp;
cpumask_setall(shuffle_tmp_mask);
@@ -537,8 +542,10 @@ static void torture_shuffle_tasks(void)
cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
mutex_lock(&shuffle_task_mutex);
- list_for_each_entry(stp, &shuffle_task_list, st_l)
- set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
+ list_for_each_entry(stp, &shuffle_task_list, st_l) {
+ if (!random_shuffle || torture_random(&rand) & 0x1)
+ set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
+ }
mutex_unlock(&shuffle_task_mutex);
cpus_read_unlock();
@@ -550,9 +557,11 @@ static void torture_shuffle_tasks(void)
*/
static int torture_shuffle(void *arg)
{
+ DEFINE_TORTURE_RANDOM(rand);
+
VERBOSE_TOROUT_STRING("torture_shuffle task started");
do {
- schedule_timeout_interruptible(shuffle_interval);
+ torture_hrtimeout_jiffies(shuffle_interval, &rand);
torture_shuffle_tasks();
torture_shutdown_absorb("torture_shuffle");
} while (!torture_must_stop());
@@ -728,12 +737,12 @@ bool stutter_wait(const char *title)
cond_resched_tasks_rcu_qs();
spt = READ_ONCE(stutter_pause_test);
for (; spt; spt = READ_ONCE(stutter_pause_test)) {
- if (!ret) {
+ if (!ret && !rt_task(current)) {
sched_set_normal(current, MAX_NICE);
ret = true;
}
if (spt == 1) {
- schedule_timeout_interruptible(1);
+ torture_hrtimeout_jiffies(1, NULL);
} else if (spt == 2) {
while (READ_ONCE(stutter_pause_test)) {
if (!(i++ & 0xffff))
@@ -741,7 +750,7 @@ bool stutter_wait(const char *title)
cond_resched();
}
} else {
- schedule_timeout_interruptible(round_jiffies_relative(HZ));
+ torture_hrtimeout_jiffies(round_jiffies_relative(HZ), NULL);
}
torture_shutdown_absorb(title);
}
@@ -926,7 +935,7 @@ EXPORT_SYMBOL_GPL(torture_kthread_stopping);
* it starts, you will need to open-code your own.
*/
int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
- char *f, struct task_struct **tp)
+ char *f, struct task_struct **tp, void (*cbf)(struct task_struct *tp))
{
int ret = 0;
@@ -938,6 +947,10 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
*tp = NULL;
return ret;
}
+
+ if (cbf)
+ cbf(*tp);
+
wake_up_process(*tp); // Process is sleeping, so ordering provided.
torture_shuffle_task_register(*tp);
return ret;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8cf97fa4a4b3..61c541c36596 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -31,6 +31,9 @@ config HAVE_FUNCTION_GRAPH_TRACER
help
See Documentation/trace/ftrace-design.rst
+config HAVE_FUNCTION_GRAPH_RETVAL
+ bool
+
config HAVE_DYNAMIC_FTRACE
bool
help
@@ -227,6 +230,18 @@ config FUNCTION_GRAPH_TRACER
the return value. This is done by setting the current return
address on the current task structure into a stack of calls.
+config FUNCTION_GRAPH_RETVAL
+ bool "Kernel Function Graph Return Value"
+ depends on HAVE_FUNCTION_GRAPH_RETVAL
+ depends on FUNCTION_GRAPH_TRACER
+ default n
+ help
+ Support recording and printing the function return value when
+ using function graph tracer. It can be helpful to locate functions
+ that return errors. This feature is off by default, and you can
+ enable it via the trace option funcgraph-retval.
+ See Documentation/trace/ftrace.rst
+
config DYNAMIC_FTRACE
bool "enable/disable function tracing dynamically"
depends on FUNCTION_TRACER
@@ -650,6 +665,32 @@ config BLK_DEV_IO_TRACE
If unsure, say N.
+config FPROBE_EVENTS
+ depends on FPROBE
+ depends on HAVE_REGS_AND_STACK_ACCESS_API
+ bool "Enable fprobe-based dynamic events"
+ select TRACING
+ select PROBE_EVENTS
+ select DYNAMIC_EVENTS
+ default y
+ help
+ This allows user to add tracing events on the function entry and
+ exit via ftrace interface. The syntax is same as the kprobe events
+ and the kprobe events on function entry and exit will be
+ transparently converted to this fprobe events.
+
+config PROBE_EVENTS_BTF_ARGS
+ depends on HAVE_FUNCTION_ARG_ACCESS_API
+ depends on FPROBE_EVENTS || KPROBE_EVENTS
+ depends on DEBUG_INFO_BTF && BPF_SYSCALL
+ bool "Support BTF function arguments for probe events"
+ default y
+ help
+ The user can specify the arguments of the probe event using the names
+ of the arguments of the probed function, when the probe location is a
+ kernel function entry or a tracepoint.
+ This is available only if BTF (BPF Type Format) support is enabled.
+
config KPROBE_EVENTS
depends on KPROBES
depends on HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c6651e16b557..057cd975d014 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -99,11 +99,13 @@ obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
+obj-$(CONFIG_PROBE_EVENTS_BTF_ARGS) += trace_btf.o
obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
obj-$(CONFIG_FPROBE) += fprobe.o
obj-$(CONFIG_RETHOOK) += rethook.o
+obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
obj-$(CONFIG_RV) += rv/
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1f4b07da327a..a7264b2c17ad 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -23,6 +23,7 @@
#include <linux/sort.h>
#include <linux/key.h>
#include <linux/verification.h>
+#include <linux/namei.h>
#include <net/bpf_sk_storage.h>
@@ -86,6 +87,9 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx);
static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
+static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx);
+static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
+
/**
* trace_call_bpf - invoke BPF program
* @call: tracepoint event
@@ -223,17 +227,6 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto = {
.arg3_type = ARG_ANYTHING,
};
-static __always_inline int
-bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr)
-{
- int ret;
-
- ret = copy_from_kernel_nofault(dst, unsafe_ptr, size);
- if (unlikely(ret < 0))
- memset(dst, 0, size);
- return ret;
-}
-
BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
const void *, unsafe_ptr)
{
@@ -661,8 +654,7 @@ static DEFINE_PER_CPU(int, bpf_trace_nest_level);
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
- struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
- int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
+ struct bpf_trace_sample_data *sds;
struct perf_raw_record raw = {
.frag = {
.size = size,
@@ -670,7 +662,11 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
},
};
struct perf_sample_data *sd;
- int err;
+ int nest_level, err;
+
+ preempt_disable();
+ sds = this_cpu_ptr(&bpf_trace_sds);
+ nest_level = this_cpu_inc_return(bpf_trace_nest_level);
if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
err = -EBUSY;
@@ -688,9 +684,9 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
perf_sample_save_raw_data(sd, &raw);
err = __bpf_perf_event_output(regs, map, flags, sd);
-
out:
this_cpu_dec(bpf_trace_nest_level);
+ preempt_enable();
return err;
}
@@ -715,7 +711,6 @@ static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
- int nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
struct perf_raw_frag frag = {
.copy = ctx_copy,
.size = ctx_size,
@@ -732,8 +727,12 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
};
struct perf_sample_data *sd;
struct pt_regs *regs;
+ int nest_level;
u64 ret;
+ preempt_disable();
+ nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
+
if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) {
ret = -EBUSY;
goto out;
@@ -748,6 +747,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
ret = __bpf_perf_event_output(regs, map, flags, sd);
out:
this_cpu_dec(bpf_event_output_nest_level);
+ preempt_enable();
return ret;
}
@@ -1059,7 +1059,16 @@ static unsigned long get_entry_ip(unsigned long fentry_ip)
BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
{
- struct kprobe *kp = kprobe_running();
+ struct bpf_trace_run_ctx *run_ctx __maybe_unused;
+ struct kprobe *kp;
+
+#ifdef CONFIG_UPROBES
+ run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+ if (run_ctx->is_uprobe)
+ return ((struct uprobe_dispatch_data *)current->utask->vaddr)->bp_addr;
+#endif
+
+ kp = kprobe_running();
if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY))
return 0;
@@ -1098,6 +1107,30 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_func_ip_uprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_uprobe_multi_entry_ip(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_uprobe_multi = {
+ .func = bpf_get_func_ip_uprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_uprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_uprobe_multi_cookie(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_umulti = {
+ .func = bpf_get_attach_cookie_uprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
{
struct bpf_trace_run_ctx *run_ctx;
@@ -1359,9 +1392,9 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
}
return verify_pkcs7_signature(data_ptr->data,
- bpf_dynptr_get_size(data_ptr),
+ __bpf_dynptr_size(data_ptr),
sig_ptr->data,
- bpf_dynptr_get_size(sig_ptr),
+ __bpf_dynptr_size(sig_ptr),
trusted_keyring->key,
VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
NULL);
@@ -1540,13 +1573,17 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_override_return_proto;
#endif
case BPF_FUNC_get_func_ip:
- return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
- &bpf_get_func_ip_proto_kprobe_multi :
- &bpf_get_func_ip_proto_kprobe;
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ return &bpf_get_func_ip_proto_kprobe_multi;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+ return &bpf_get_func_ip_proto_uprobe_multi;
+ return &bpf_get_func_ip_proto_kprobe;
case BPF_FUNC_get_attach_cookie:
- return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
- &bpf_get_attach_cookie_proto_kmulti :
- &bpf_get_attach_cookie_proto_trace;
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ return &bpf_get_attach_cookie_proto_kmulti;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+ return &bpf_get_attach_cookie_proto_umulti;
+ return &bpf_get_attach_cookie_proto_trace;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -2369,9 +2406,13 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
if (is_tracepoint || is_syscall_tp) {
*buf = is_tracepoint ? event->tp_event->tp->name
: event->tp_event->name;
- *fd_type = BPF_FD_TYPE_TRACEPOINT;
- *probe_offset = 0x0;
- *probe_addr = 0x0;
+ /* We allow NULL pointer for tracepoint */
+ if (fd_type)
+ *fd_type = BPF_FD_TYPE_TRACEPOINT;
+ if (probe_offset)
+ *probe_offset = 0x0;
+ if (probe_addr)
+ *probe_addr = 0x0;
} else {
/* kprobe/uprobe */
err = -EOPNOTSUPP;
@@ -2384,7 +2425,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
#ifdef CONFIG_UPROBE_EVENTS
if (flags & TRACE_EVENT_FL_UPROBE)
err = bpf_get_uprobe_info(event, fd_type, buf,
- probe_offset,
+ probe_offset, probe_addr,
event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
}
@@ -2469,6 +2510,7 @@ struct bpf_kprobe_multi_link {
u32 cnt;
u32 mods_cnt;
struct module **mods;
+ u32 flags;
};
struct bpf_kprobe_multi_run_ctx {
@@ -2558,9 +2600,44 @@ static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
kfree(kmulti_link);
}
+static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs);
+ struct bpf_kprobe_multi_link *kmulti_link;
+ u32 ucount = info->kprobe_multi.count;
+ int err = 0, i;
+
+ if (!uaddrs ^ !ucount)
+ return -EINVAL;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ info->kprobe_multi.count = kmulti_link->cnt;
+ info->kprobe_multi.flags = kmulti_link->flags;
+
+ if (!uaddrs)
+ return 0;
+ if (ucount < kmulti_link->cnt)
+ err = -ENOSPC;
+ else
+ ucount = kmulti_link->cnt;
+
+ if (kallsyms_show_value(current_cred())) {
+ if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64)))
+ return -EFAULT;
+ } else {
+ for (i = 0; i < ucount; i++) {
+ if (put_user(0, uaddrs + i))
+ return -EFAULT;
+ }
+ }
+ return err;
+}
+
static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
.release = bpf_kprobe_multi_link_release,
.dealloc = bpf_kprobe_multi_link_dealloc,
+ .fill_link_info = bpf_kprobe_multi_link_fill_link_info,
};
static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv)
@@ -2652,7 +2729,8 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
static int
kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
- struct pt_regs *regs, void *data)
+ unsigned long ret_ip, struct pt_regs *regs,
+ void *data)
{
struct bpf_kprobe_multi_link *link;
@@ -2663,7 +2741,8 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
static void
kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
- struct pt_regs *regs, void *data)
+ unsigned long ret_ip, struct pt_regs *regs,
+ void *data)
{
struct bpf_kprobe_multi_link *link;
@@ -2872,6 +2951,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
link->addrs = addrs;
link->cookies = cookies;
link->cnt = cnt;
+ link->flags = flags;
if (cookies) {
/*
@@ -2922,3 +3002,301 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
return 0;
}
#endif
+
+#ifdef CONFIG_UPROBES
+struct bpf_uprobe_multi_link;
+
+struct bpf_uprobe {
+ struct bpf_uprobe_multi_link *link;
+ loff_t offset;
+ u64 cookie;
+ struct uprobe_consumer consumer;
+};
+
+struct bpf_uprobe_multi_link {
+ struct path path;
+ struct bpf_link link;
+ u32 cnt;
+ struct bpf_uprobe *uprobes;
+ struct task_struct *task;
+};
+
+struct bpf_uprobe_multi_run_ctx {
+ struct bpf_run_ctx run_ctx;
+ unsigned long entry_ip;
+ struct bpf_uprobe *uprobe;
+};
+
+static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes,
+ u32 cnt)
+{
+ u32 i;
+
+ for (i = 0; i < cnt; i++) {
+ uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
+ &uprobes[i].consumer);
+ }
+}
+
+static void bpf_uprobe_multi_link_release(struct bpf_link *link)
+{
+ struct bpf_uprobe_multi_link *umulti_link;
+
+ umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+ bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
+}
+
+static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_uprobe_multi_link *umulti_link;
+
+ umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+ if (umulti_link->task)
+ put_task_struct(umulti_link->task);
+ path_put(&umulti_link->path);
+ kvfree(umulti_link->uprobes);
+ kfree(umulti_link);
+}
+
+static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
+ .release = bpf_uprobe_multi_link_release,
+ .dealloc = bpf_uprobe_multi_link_dealloc,
+};
+
+static int uprobe_prog_run(struct bpf_uprobe *uprobe,
+ unsigned long entry_ip,
+ struct pt_regs *regs)
+{
+ struct bpf_uprobe_multi_link *link = uprobe->link;
+ struct bpf_uprobe_multi_run_ctx run_ctx = {
+ .entry_ip = entry_ip,
+ .uprobe = uprobe,
+ };
+ struct bpf_prog *prog = link->link.prog;
+ bool sleepable = prog->aux->sleepable;
+ struct bpf_run_ctx *old_run_ctx;
+ int err = 0;
+
+ if (link->task && current != link->task)
+ return 0;
+
+ if (sleepable)
+ rcu_read_lock_trace();
+ else
+ rcu_read_lock();
+
+ migrate_disable();
+
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ err = bpf_prog_run(link->link.prog, regs);
+ bpf_reset_run_ctx(old_run_ctx);
+
+ migrate_enable();
+
+ if (sleepable)
+ rcu_read_unlock_trace();
+ else
+ rcu_read_unlock();
+ return err;
+}
+
+static bool
+uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx,
+ struct mm_struct *mm)
+{
+ struct bpf_uprobe *uprobe;
+
+ uprobe = container_of(con, struct bpf_uprobe, consumer);
+ return uprobe->link->task->mm == mm;
+}
+
+static int
+uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
+{
+ struct bpf_uprobe *uprobe;
+
+ uprobe = container_of(con, struct bpf_uprobe, consumer);
+ return uprobe_prog_run(uprobe, instruction_pointer(regs), regs);
+}
+
+static int
+uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs)
+{
+ struct bpf_uprobe *uprobe;
+
+ uprobe = container_of(con, struct bpf_uprobe, consumer);
+ return uprobe_prog_run(uprobe, func, regs);
+}
+
+static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ struct bpf_uprobe_multi_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
+ return run_ctx->entry_ip;
+}
+
+static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ struct bpf_uprobe_multi_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
+ return run_ctx->uprobe->cookie;
+}
+
+int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_uprobe_multi_link *link = NULL;
+ unsigned long __user *uref_ctr_offsets;
+ unsigned long *ref_ctr_offsets = NULL;
+ struct bpf_link_primer link_primer;
+ struct bpf_uprobe *uprobes = NULL;
+ struct task_struct *task = NULL;
+ unsigned long __user *uoffsets;
+ u64 __user *ucookies;
+ void __user *upath;
+ u32 flags, cnt, i;
+ struct path path;
+ char *name;
+ pid_t pid;
+ int err;
+
+ /* no support for 32bit archs yet */
+ if (sizeof(u64) != sizeof(void *))
+ return -EOPNOTSUPP;
+
+ if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI)
+ return -EINVAL;
+
+ flags = attr->link_create.uprobe_multi.flags;
+ if (flags & ~BPF_F_UPROBE_MULTI_RETURN)
+ return -EINVAL;
+
+ /*
+ * path, offsets and cnt are mandatory,
+ * ref_ctr_offsets and cookies are optional
+ */
+ upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
+ uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets);
+ cnt = attr->link_create.uprobe_multi.cnt;
+
+ if (!upath || !uoffsets || !cnt)
+ return -EINVAL;
+
+ uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets);
+ ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies);
+
+ name = strndup_user(upath, PATH_MAX);
+ if (IS_ERR(name)) {
+ err = PTR_ERR(name);
+ return err;
+ }
+
+ err = kern_path(name, LOOKUP_FOLLOW, &path);
+ kfree(name);
+ if (err)
+ return err;
+
+ if (!d_is_reg(path.dentry)) {
+ err = -EBADF;
+ goto error_path_put;
+ }
+
+ pid = attr->link_create.uprobe_multi.pid;
+ if (pid) {
+ rcu_read_lock();
+ task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ rcu_read_unlock();
+ if (!task)
+ goto error_path_put;
+ }
+
+ err = -ENOMEM;
+
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ uprobes = kvcalloc(cnt, sizeof(*uprobes), GFP_KERNEL);
+
+ if (!uprobes || !link)
+ goto error_free;
+
+ if (uref_ctr_offsets) {
+ ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL);
+ if (!ref_ctr_offsets)
+ goto error_free;
+ }
+
+ for (i = 0; i < cnt; i++) {
+ if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
+ err = -EFAULT;
+ goto error_free;
+ }
+ if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) {
+ err = -EFAULT;
+ goto error_free;
+ }
+ if (__get_user(uprobes[i].offset, uoffsets + i)) {
+ err = -EFAULT;
+ goto error_free;
+ }
+
+ uprobes[i].link = link;
+
+ if (flags & BPF_F_UPROBE_MULTI_RETURN)
+ uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
+ else
+ uprobes[i].consumer.handler = uprobe_multi_link_handler;
+
+ if (pid)
+ uprobes[i].consumer.filter = uprobe_multi_link_filter;
+ }
+
+ link->cnt = cnt;
+ link->uprobes = uprobes;
+ link->path = path;
+ link->task = task;
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
+ &bpf_uprobe_multi_link_lops, prog);
+
+ for (i = 0; i < cnt; i++) {
+ err = uprobe_register_refctr(d_real_inode(link->path.dentry),
+ uprobes[i].offset,
+ ref_ctr_offsets ? ref_ctr_offsets[i] : 0,
+ &uprobes[i].consumer);
+ if (err) {
+ bpf_uprobe_unregister(&path, uprobes, i);
+ goto error_free;
+ }
+ }
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto error_free;
+
+ kvfree(ref_ctr_offsets);
+ return bpf_link_settle(&link_primer);
+
+error_free:
+ kvfree(ref_ctr_offsets);
+ kvfree(uprobes);
+ kfree(link);
+ if (task)
+ put_task_struct(task);
+error_path_put:
+ path_put(&path);
+ return err;
+}
+#else /* !CONFIG_UPROBES */
+int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
+static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+#endif /* CONFIG_UPROBES */
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 218cd95bf8e4..c83c005e654e 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -15,6 +15,7 @@
#include <trace/events/sched.h>
#include "ftrace_internal.h"
+#include "trace.h"
#ifdef CONFIG_DYNAMIC_FTRACE
#define ASSIGN_OPS_HASH(opsname, val) \
@@ -236,16 +237,23 @@ static struct notifier_block ftrace_suspend_notifier = {
.notifier_call = ftrace_suspend_notifier_call,
};
+/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */
+struct fgraph_ret_regs;
+
/*
* Send the trace to the ring-buffer.
* @return the original return address.
*/
-unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
+static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs,
+ unsigned long frame_pointer)
{
struct ftrace_graph_ret trace;
unsigned long ret;
ftrace_pop_return_trace(&trace, &ret, frame_pointer);
+#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+ trace.retval = fgraph_ret_regs_return_value(ret_regs);
+#endif
trace.rettime = trace_clock_local();
ftrace_graph_return(&trace);
/*
@@ -266,6 +274,23 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
return ret;
}
+/*
+ * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can
+ * leave only ftrace_return_to_handler(ret_regs).
+ */
+#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL
+unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs)
+{
+ return __ftrace_return_to_handler(ret_regs,
+ fgraph_ret_regs_frame_pointer(ret_regs));
+}
+#else
+unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
+{
+ return __ftrace_return_to_handler(NULL, frame_pointer);
+}
+#endif
+
/**
* ftrace_graph_get_ret_stack - return the entry of the shadow stack
* @task: The task to read the shadow stack from
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 18d36842faf5..3b21f4063258 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -46,7 +46,7 @@ static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
}
if (fp->entry_handler)
- ret = fp->entry_handler(fp, ip, ftrace_get_regs(fregs), entry_data);
+ ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data);
/* If entry_handler returns !0, nmissed is not counted. */
if (rh) {
@@ -100,19 +100,27 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
return;
}
+ /*
+ * This user handler is shared with other kprobes and is not expected to be
+ * called recursively. So if any other kprobe handler is running, this will
+ * exit as kprobe does. See the section 'Share the callbacks with kprobes'
+ * in Documentation/trace/fprobe.rst for more information.
+ */
if (unlikely(kprobe_running())) {
fp->nmissed++;
- return;
+ goto recursion_unlock;
}
kprobe_busy_begin();
__fprobe_handler(ip, parent_ip, ops, fregs);
kprobe_busy_end();
+
+recursion_unlock:
ftrace_test_recursion_unlock(bit);
}
static void fprobe_exit_handler(struct rethook_node *rh, void *data,
- struct pt_regs *regs)
+ unsigned long ret_ip, struct pt_regs *regs)
{
struct fprobe *fp = (struct fprobe *)data;
struct fprobe_rethook_node *fpr;
@@ -133,7 +141,7 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data,
return;
}
- fp->exit_handler(fp, fpr->entry_ip, regs,
+ fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs,
fp->entry_data_size ? (void *)fpr->data : NULL);
ftrace_test_recursion_unlock(bit);
}
@@ -348,6 +356,14 @@ int register_fprobe_syms(struct fprobe *fp, const char **syms, int num)
}
EXPORT_SYMBOL_GPL(register_fprobe_syms);
+bool fprobe_is_registered(struct fprobe *fp)
+{
+ if (!fp || (fp->ops.saved_func != fprobe_handler &&
+ fp->ops.saved_func != fprobe_kprobe_handler))
+ return false;
+ return true;
+}
+
/**
* unregister_fprobe() - Unregister fprobe from ftrace
* @fp: A fprobe data structure to be unregistered.
@@ -360,23 +376,19 @@ int unregister_fprobe(struct fprobe *fp)
{
int ret;
- if (!fp || (fp->ops.saved_func != fprobe_handler &&
- fp->ops.saved_func != fprobe_kprobe_handler))
+ if (!fprobe_is_registered(fp))
return -EINVAL;
- /*
- * rethook_free() starts disabling the rethook, but the rethook handlers
- * may be running on other processors at this point. To make sure that all
- * current running handlers are finished, call unregister_ftrace_function()
- * after this.
- */
if (fp->rethook)
- rethook_free(fp->rethook);
+ rethook_stop(fp->rethook);
ret = unregister_ftrace_function(&fp->ops);
if (ret < 0)
return ret;
+ if (fp->rethook)
+ rethook_free(fp->rethook);
+
ftrace_free_filter(&fp->ops);
return ret;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 764668467155..8de8bec5f366 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3305,6 +3305,22 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
return cnt;
}
+static void ftrace_free_pages(struct ftrace_page *pages)
+{
+ struct ftrace_page *pg = pages;
+
+ while (pg) {
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
+ pages = pg->next;
+ kfree(pg);
+ pg = pages;
+ ftrace_number_of_groups--;
+ }
+}
+
static struct ftrace_page *
ftrace_allocate_pages(unsigned long num_to_init)
{
@@ -3343,17 +3359,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
return start_pg;
free_pages:
- pg = start_pg;
- while (pg) {
- if (pg->records) {
- free_pages((unsigned long)pg->records, pg->order);
- ftrace_number_of_pages -= 1 << pg->order;
- }
- start_pg = pg->next;
- kfree(pg);
- pg = start_pg;
- ftrace_number_of_groups--;
- }
+ ftrace_free_pages(start_pg);
pr_info("ftrace: FAILED to allocate memory for functions\n");
return NULL;
}
@@ -3861,6 +3867,9 @@ static int t_show(struct seq_file *m, void *v)
if (!rec)
return 0;
+ if (iter->flags & FTRACE_ITER_ADDRS)
+ seq_printf(m, "%lx ", rec->ip);
+
if (print_rec(m, rec->ip)) {
/* This should only happen when a rec is disabled */
WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED));
@@ -3996,6 +4005,30 @@ ftrace_touched_open(struct inode *inode, struct file *file)
return 0;
}
+static int
+ftrace_avail_addrs_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_iterator *iter;
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+ if (!iter)
+ return -ENOMEM;
+
+ iter->pg = ftrace_pages_start;
+ iter->flags = FTRACE_ITER_ADDRS;
+ iter->ops = &global_ops;
+
+ return 0;
+}
+
/**
* ftrace_regex_open - initialize function tracer filter files
* @ops: The ftrace_ops that hold the hash filters
@@ -5743,7 +5776,7 @@ bool ftrace_filter_param __initdata;
static int __init set_ftrace_notrace(char *str)
{
ftrace_filter_param = true;
- strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+ strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_notrace=", set_ftrace_notrace);
@@ -5751,7 +5784,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace);
static int __init set_ftrace_filter(char *str)
{
ftrace_filter_param = true;
- strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+ strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_filter=", set_ftrace_filter);
@@ -5763,14 +5796,14 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
static int __init set_graph_function(char *str)
{
- strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
+ strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_graph_filter=", set_graph_function);
static int __init set_graph_notrace_function(char *str)
{
- strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
+ strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_graph_notrace=", set_graph_notrace_function);
@@ -5916,6 +5949,13 @@ static const struct file_operations ftrace_touched_fops = {
.release = seq_release_private,
};
+static const struct file_operations ftrace_avail_addrs_fops = {
+ .open = ftrace_avail_addrs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
@@ -6377,6 +6417,9 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
trace_create_file("available_filter_functions", TRACE_MODE_READ,
d_tracer, NULL, &ftrace_avail_fops);
+ trace_create_file("available_filter_functions_addrs", TRACE_MODE_READ,
+ d_tracer, NULL, &ftrace_avail_addrs_fops);
+
trace_create_file("enabled_functions", TRACE_MODE_READ,
d_tracer, NULL, &ftrace_enabled_fops);
@@ -6434,9 +6477,11 @@ static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
{
+ struct ftrace_page *pg_unuse = NULL;
struct ftrace_page *start_pg;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
+ unsigned long skipped = 0;
unsigned long count;
unsigned long *p;
unsigned long addr;
@@ -6499,8 +6544,10 @@ static int ftrace_process_locs(struct module *mod,
* object files to satisfy alignments.
* Skip any NULL pointers.
*/
- if (!addr)
+ if (!addr) {
+ skipped++;
continue;
+ }
end_offset = (pg->index+1) * sizeof(pg->records[0]);
if (end_offset > PAGE_SIZE << pg->order) {
@@ -6514,8 +6561,10 @@ static int ftrace_process_locs(struct module *mod,
rec->ip = addr;
}
- /* We should have used all pages */
- WARN_ON(pg->next);
+ if (pg->next) {
+ pg_unuse = pg->next;
+ pg->next = NULL;
+ }
/* Assign the last page to ftrace_pages */
ftrace_pages = pg;
@@ -6537,6 +6586,11 @@ static int ftrace_process_locs(struct module *mod,
out:
mutex_unlock(&ftrace_lock);
+ /* We should have used all pages unless we skipped some */
+ if (pg_unuse) {
+ WARN_ON(!skipped);
+ ftrace_free_pages(pg_unuse);
+ }
return ret;
}
@@ -6569,8 +6623,8 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum,
continue;
*value = op->trampoline;
*type = 't';
- strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN);
- strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN);
+ strscpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN);
+ strscpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN);
*exported = 0;
return 0;
}
@@ -6725,8 +6779,7 @@ void ftrace_release_mod(struct module *mod)
last_pg = &ftrace_pages_start;
for (pg = ftrace_pages_start; pg; pg = *last_pg) {
rec = &pg->records[0];
- if (within_module_core(rec->ip, mod) ||
- within_module_init(rec->ip, mod)) {
+ if (within_module(rec->ip, mod)) {
/*
* As core pages are first, the first
* page should never be a module page.
@@ -6798,8 +6851,7 @@ void ftrace_module_enable(struct module *mod)
* not part of this module, then skip this pg,
* which the "break" will do.
*/
- if (!within_module_core(rec->ip, mod) &&
- !within_module_init(rec->ip, mod))
+ if (!within_module(rec->ip, mod))
break;
/* Weak functions should still be ignored */
@@ -6933,7 +6985,7 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
if (off)
*off = addr - found_func->ip;
if (sym)
- strlcpy(sym, found_func->name, KSYM_NAME_LEN);
+ strscpy(sym, found_func->name, KSYM_NAME_LEN);
return found_func->name;
}
@@ -6987,8 +7039,8 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
*value = mod_func->ip;
*type = 'T';
- strlcpy(name, mod_func->name, KSYM_NAME_LEN);
- strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN);
+ strscpy(name, mod_func->name, KSYM_NAME_LEN);
+ strscpy(module_name, mod_map->mod->name, MODULE_NAME_LEN);
*exported = 1;
preempt_enable();
return 0;
@@ -7088,9 +7140,7 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
struct dyn_ftrace key;
struct ftrace_mod_map *mod_map = NULL;
struct ftrace_init_func *func, *func_next;
- struct list_head clear_hash;
-
- INIT_LIST_HEAD(&clear_hash);
+ LIST_HEAD(clear_hash);
key.ip = start;
key.flags = end; /* overload flags, as it is unsigned long */
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 382775edf690..5012c04f92c0 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -2,6 +2,9 @@
#ifndef _LINUX_KERNEL_FTRACE_INTERNAL_H
#define _LINUX_KERNEL_FTRACE_INTERNAL_H
+int __register_ftrace_function(struct ftrace_ops *ops);
+int __unregister_ftrace_function(struct ftrace_ops *ops);
+
#ifdef CONFIG_FUNCTION_TRACER
extern struct mutex ftrace_lock;
@@ -15,8 +18,6 @@ int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs);
#else /* !CONFIG_DYNAMIC_FTRACE */
-int __register_ftrace_function(struct ftrace_ops *ops);
-int __unregister_ftrace_function(struct ftrace_ops *ops);
/* Keep as macros so we do not need to define the commands */
# define ftrace_startup(ops, command) \
({ \
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index 60f6cb2b486b..5eb9b598f4e9 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -54,6 +54,19 @@ static void rethook_free_rcu(struct rcu_head *head)
}
/**
+ * rethook_stop() - Stop using a rethook.
+ * @rh: the struct rethook to stop.
+ *
+ * Stop using a rethook to prepare for freeing it. If you want to wait for
+ * all running rethook handler before calling rethook_free(), you need to
+ * call this first and wait RCU, and call rethook_free().
+ */
+void rethook_stop(struct rethook *rh)
+{
+ WRITE_ONCE(rh->handler, NULL);
+}
+
+/**
* rethook_free() - Free struct rethook.
* @rh: the struct rethook to be freed.
*
@@ -301,7 +314,8 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs,
break;
handler = READ_ONCE(rhn->rethook->handler);
if (handler)
- handler(rhn, rhn->rethook->data, regs);
+ handler(rhn, rhn->rethook->data,
+ correct_ret_addr, regs);
if (first == node)
break;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 834b361a4a66..78502d4c7214 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -523,6 +523,8 @@ struct ring_buffer_per_cpu {
rb_time_t before_stamp;
u64 event_stamp[MAX_NEST];
u64 read_stamp;
+ /* pages removed since last reset */
+ unsigned long pages_removed;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -536,6 +538,7 @@ struct trace_buffer {
unsigned flags;
int cpus;
atomic_t record_disabled;
+ atomic_t resizing;
cpumask_var_t cpumask;
struct lock_class_key *reader_lock_key;
@@ -558,6 +561,7 @@ struct ring_buffer_iter {
struct buffer_page *head_page;
struct buffer_page *cache_reader_page;
unsigned long cache_read;
+ unsigned long cache_pages_removed;
u64 read_stamp;
u64 page_stamp;
struct ring_buffer_event *event;
@@ -688,10 +692,7 @@ static void rb_time_set(rb_time_t *t, u64 val)
static inline bool
rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
{
- unsigned long ret;
-
- ret = local_cmpxchg(l, expect, set);
- return ret == expect;
+ return local_try_cmpxchg(l, &expect, set);
}
static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
@@ -748,9 +749,7 @@ static void rb_time_set(rb_time_t *t, u64 val)
static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
{
- u64 val;
- val = local64_cmpxchg(&t->time, expect, set);
- return val == expect;
+ return local64_try_cmpxchg(&t->time, &expect, set);
}
#endif
@@ -946,6 +945,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
/**
* ring_buffer_wake_waiters - wake up any waiters on this ring buffer
* @buffer: The ring buffer to wake waiters on
+ * @cpu: The CPU buffer to wake waiters on
*
* In the case of a file that represents a ring buffer is closing,
* it is prudent to wake up any waiters that are on this.
@@ -1489,14 +1489,11 @@ static bool rb_head_page_replace(struct buffer_page *old,
{
unsigned long *ptr = (unsigned long *)&old->list.prev->next;
unsigned long val;
- unsigned long ret;
val = *ptr & ~RB_FLAG_MASK;
val |= RB_PAGE_HEAD;
- ret = cmpxchg(ptr, val, (unsigned long)&new->list);
-
- return ret == val;
+ return try_cmpxchg(ptr, &val, (unsigned long)&new->list);
}
/*
@@ -1956,6 +1953,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
to_remove = rb_list_head(to_remove)->next;
head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
}
+ /* Read iterators need to reset themselves when some pages removed */
+ cpu_buffer->pages_removed += nr_removed;
next_page = rb_list_head(to_remove)->next;
@@ -1977,12 +1976,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
cpu_buffer->head_page = list_entry(next_page,
struct buffer_page, list);
- /*
- * change read pointer to make sure any read iterators reset
- * themselves
- */
- cpu_buffer->read = 0;
-
/* pages are removed, resume tracing and then free the pages */
atomic_dec(&cpu_buffer->record_disabled);
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
@@ -2167,7 +2160,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
-
+ atomic_inc(&buffer->resizing);
if (cpu_id == RING_BUFFER_ALL_CPUS) {
/*
@@ -2322,6 +2315,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
atomic_dec(&buffer->record_disabled);
}
+ atomic_dec(&buffer->resizing);
mutex_unlock(&buffer->mutex);
return 0;
@@ -2342,6 +2336,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
}
out_err_unlock:
+ atomic_dec(&buffer->resizing);
mutex_unlock(&buffer->mutex);
return err;
}
@@ -3000,7 +2995,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
{
unsigned long new_index, old_index;
struct buffer_page *bpage;
- unsigned long index;
unsigned long addr;
u64 write_stamp;
u64 delta;
@@ -3057,8 +3051,9 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
*/
old_index += write_mask;
new_index += write_mask;
- index = local_cmpxchg(&bpage->write, old_index, new_index);
- if (index == old_index) {
+
+ /* caution: old_index gets updated on cmpxchg failure */
+ if (local_try_cmpxchg(&bpage->write, &old_index, new_index)) {
/* update counters */
local_sub(event_length, &cpu_buffer->entries_bytes);
return true;
@@ -3373,7 +3368,6 @@ void ring_buffer_nest_end(struct trace_buffer *buffer)
/**
* ring_buffer_unlock_commit - commit a reserved
* @buffer: The buffer to commit to
- * @event: The event pointer to commit.
*
* This commits the data to the ring buffer, and releases any locks held.
*
@@ -4392,6 +4386,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
iter->cache_reader_page = iter->head_page;
iter->cache_read = cpu_buffer->read;
+ iter->cache_pages_removed = cpu_buffer->pages_removed;
if (iter->head) {
iter->read_stamp = cpu_buffer->read_stamp;
@@ -4846,12 +4841,13 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
buffer = cpu_buffer->buffer;
/*
- * Check if someone performed a consuming read to
- * the buffer. A consuming read invalidates the iterator
- * and we need to reset the iterator in this case.
+ * Check if someone performed a consuming read to the buffer
+ * or removed some pages from the buffer. In these cases,
+ * iterator was invalidated and we need to reset it.
*/
if (unlikely(iter->cache_read != cpu_buffer->read ||
- iter->cache_reader_page != cpu_buffer->reader_page))
+ iter->cache_reader_page != cpu_buffer->reader_page ||
+ iter->cache_pages_removed != cpu_buffer->pages_removed))
rb_iter_reset(iter);
again:
@@ -5242,28 +5238,34 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_size);
+static void rb_clear_buffer_page(struct buffer_page *page)
+{
+ local_set(&page->write, 0);
+ local_set(&page->entries, 0);
+ rb_init_page(page->page);
+ page->read = 0;
+}
+
static void
rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
+ struct buffer_page *page;
+
rb_head_page_deactivate(cpu_buffer);
cpu_buffer->head_page
= list_entry(cpu_buffer->pages, struct buffer_page, list);
- local_set(&cpu_buffer->head_page->write, 0);
- local_set(&cpu_buffer->head_page->entries, 0);
- local_set(&cpu_buffer->head_page->page->commit, 0);
-
- cpu_buffer->head_page->read = 0;
+ rb_clear_buffer_page(cpu_buffer->head_page);
+ list_for_each_entry(page, cpu_buffer->pages, list) {
+ rb_clear_buffer_page(page);
+ }
cpu_buffer->tail_page = cpu_buffer->head_page;
cpu_buffer->commit_page = cpu_buffer->head_page;
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
INIT_LIST_HEAD(&cpu_buffer->new_pages);
- local_set(&cpu_buffer->reader_page->write, 0);
- local_set(&cpu_buffer->reader_page->entries, 0);
- local_set(&cpu_buffer->reader_page->page->commit, 0);
- cpu_buffer->reader_page->read = 0;
+ rb_clear_buffer_page(cpu_buffer->reader_page);
local_set(&cpu_buffer->entries_bytes, 0);
local_set(&cpu_buffer->overrun, 0);
@@ -5289,6 +5291,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->last_overrun = 0;
rb_head_page_activate(cpu_buffer);
+ cpu_buffer->pages_removed = 0;
}
/* Must have disabled the cpu buffer then done a synchronize_rcu */
@@ -5347,7 +5350,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
/**
* ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
- * @cpu: The CPU buffer to be reset
*/
void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
{
@@ -5535,6 +5537,15 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (local_read(&cpu_buffer_b->committing))
goto out_dec;
+ /*
+ * When resize is in progress, we cannot swap it because
+ * it will mess the state of the cpu buffer.
+ */
+ if (atomic_read(&buffer_a->resizing))
+ goto out_dec;
+ if (atomic_read(&buffer_b->resizing))
+ goto out_dec;
+
buffer_a->buffers[cpu] = cpu_buffer_b;
buffer_b->buffers[cpu] = cpu_buffer_a;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 64a4dde073ef..2b4ded753367 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -199,7 +199,7 @@ static int boot_snapshot_index;
static int __init set_cmdline_ftrace(char *str)
{
- strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
+ strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
default_bootup_tracer = bootup_tracer_buf;
/* We are using ftrace early, expand it */
ring_buffer_expanded = true;
@@ -284,7 +284,7 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
{
- strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+ strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
return 1;
}
__setup("trace_options=", set_trace_boot_options);
@@ -294,7 +294,7 @@ static char *trace_boot_clock __initdata;
static int __init set_trace_boot_clock(char *str)
{
- strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
+ strscpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
trace_boot_clock = trace_boot_clock_buf;
return 1;
}
@@ -1928,9 +1928,10 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
* place on this CPU. We fail to record, but we reset
* the max trace buffer (no one writes directly to it)
* and flag that it failed.
+ * Another reason is resize is in progress.
*/
trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
- "Failed to swap buffers due to commit in progress\n");
+ "Failed to swap buffers due to commit or resize in progress\n");
}
WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
@@ -2546,7 +2547,7 @@ static void __trace_find_cmdline(int pid, char comm[])
if (map != NO_CMDLINE_MAP) {
tpid = savedcmd->map_cmdline_to_pid[map];
if (tpid == pid) {
- strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
+ strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
return;
}
}
@@ -3155,16 +3156,16 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
nr_entries = stack_trace_save(fstack->calls, size, skip);
}
- size = nr_entries * sizeof(unsigned long);
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
- (sizeof(*entry) - sizeof(entry->caller)) + size,
+ struct_size(entry, caller, nr_entries),
trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
- memcpy(&entry->caller, fstack->calls, size);
entry->size = nr_entries;
+ memcpy(&entry->caller, fstack->calls,
+ flex_array_size(entry, caller, nr_entries));
if (!call_filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
@@ -4188,15 +4189,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
loff_t l = 0;
int cpu;
- /*
- * copy the tracer to avoid using a global lock all around.
- * iter->trace is a copy of current_trace, the pointer to the
- * name may be used instead of a strcmp(), as iter->trace->name
- * will point to the same string as current_trace->name.
- */
mutex_lock(&trace_types_lock);
- if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
- *iter->trace = *tr->current_trace;
+ if (unlikely(tr->current_trace != iter->trace)) {
+ /* Close iter->trace before switching to the new current tracer */
+ if (iter->trace->close)
+ iter->trace->close(iter);
+ iter->trace = tr->current_trace;
+ /* Reopen the new current tracer */
+ if (iter->trace->open)
+ iter->trace->open(iter);
+ }
mutex_unlock(&trace_types_lock);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -4804,6 +4806,25 @@ static const struct seq_operations tracer_seq_ops = {
.show = s_show,
};
+/*
+ * Note, as iter itself can be allocated and freed in different
+ * ways, this function is only used to free its content, and not
+ * the iterator itself. The only requirement to all the allocations
+ * is that it must zero all fields (kzalloc), as freeing works with
+ * ethier allocated content or NULL.
+ */
+static void free_trace_iter_content(struct trace_iterator *iter)
+{
+ /* The fmt is either NULL, allocated or points to static_fmt_buf */
+ if (iter->fmt != static_fmt_buf)
+ kfree(iter->fmt);
+
+ kfree(iter->temp);
+ kfree(iter->buffer_iter);
+ mutex_destroy(&iter->mutex);
+ free_cpumask_var(iter->started);
+}
+
static struct trace_iterator *
__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
@@ -4845,16 +4866,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
iter->fmt = NULL;
iter->fmt_size = 0;
- /*
- * We make a copy of the current tracer to avoid concurrent
- * changes on it while we are reading.
- */
mutex_lock(&trace_types_lock);
- iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
- if (!iter->trace)
- goto fail;
-
- *iter->trace = *tr->current_trace;
+ iter->trace = tr->current_trace;
if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
goto fail;
@@ -4919,9 +4932,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
fail:
mutex_unlock(&trace_types_lock);
- kfree(iter->trace);
- kfree(iter->temp);
- kfree(iter->buffer_iter);
+ free_trace_iter_content(iter);
release:
seq_release_private(inode, file);
return ERR_PTR(-ENOMEM);
@@ -5000,12 +5011,7 @@ static int tracing_release(struct inode *inode, struct file *file)
mutex_unlock(&trace_types_lock);
- mutex_destroy(&iter->mutex);
- free_cpumask_var(iter->started);
- kfree(iter->fmt);
- kfree(iter->temp);
- kfree(iter->trace);
- kfree(iter->buffer_iter);
+ free_trace_iter_content(iter);
seq_release_private(inode, file);
return 0;
@@ -5199,7 +5205,7 @@ static const struct file_operations tracing_fops = {
.open = tracing_open,
.read = seq_read,
.read_iter = seq_read_iter,
- .splice_read = generic_file_splice_read,
+ .splice_read = copy_splice_read,
.write = tracing_write_stub,
.llseek = tracing_lseek,
.release = tracing_release,
@@ -5259,11 +5265,17 @@ int tracing_set_cpumask(struct trace_array *tr,
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu);
+#endif
}
if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu);
+#endif
}
}
arch_spin_unlock(&tr->max_lock);
@@ -5672,10 +5684,17 @@ static const char readme_msg[] =
" uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n"
"\t\t\t Write into this file to define/undefine new trace events.\n"
#endif
-#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
+#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) || \
+ defined(CONFIG_FPROBE_EVENTS)
"\t accepts: event-definitions (one definition per line)\n"
+#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
"\t Format: p[:[<group>/][<event>]] <place> [<args>]\n"
"\t r[maxactive][:[<group>/][<event>]] <place> [<args>]\n"
+#endif
+#ifdef CONFIG_FPROBE_EVENTS
+ "\t f[:[<group>/][<event>]] <func-name>[%return] [<args>]\n"
+ "\t t[:[<group>/][<event>]] <tracepoint> [<args>]\n"
+#endif
#ifdef CONFIG_HIST_TRIGGERS
"\t s:[synthetic/]<event> <field> [<field>]\n"
#endif
@@ -5691,7 +5710,12 @@ static const char readme_msg[] =
"\t args: <name>=fetcharg[:type]\n"
"\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n"
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
+ "\t <argname>[->field[->field|.field...]],\n"
+#else
+ "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
+#endif
#else
"\t $stack<index>, $stack, $retval, $comm,\n"
#endif
@@ -6276,6 +6300,15 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val)
per_cpu_ptr(buf->data, cpu)->entries = val;
}
+static void update_buffer_entries(struct array_buffer *buf, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0));
+ } else {
+ per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu);
+ }
+}
+
#ifdef CONFIG_TRACER_MAX_TRACE
/* resize @tr's buffer to the size of @size_tr's entries */
static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
@@ -6354,18 +6387,12 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
return ret;
}
- if (cpu == RING_BUFFER_ALL_CPUS)
- set_buffer_entries(&tr->max_buffer, size);
- else
- per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;
+ update_buffer_entries(&tr->max_buffer, cpu);
out:
#endif /* CONFIG_TRACER_MAX_TRACE */
- if (cpu == RING_BUFFER_ALL_CPUS)
- set_buffer_entries(&tr->array_buffer, size);
- else
- per_cpu_ptr(tr->array_buffer.data, cpu)->entries = size;
+ update_buffer_entries(&tr->array_buffer, cpu);
return ret;
}
@@ -6676,10 +6703,36 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
#endif
+static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ if (cpumask_empty(tr->pipe_cpumask)) {
+ cpumask_setall(tr->pipe_cpumask);
+ return 0;
+ }
+ } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
+ cpumask_set_cpu(cpu, tr->pipe_cpumask);
+ return 0;
+ }
+ return -EBUSY;
+}
+
+static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ WARN_ON(!cpumask_full(tr->pipe_cpumask));
+ cpumask_clear(tr->pipe_cpumask);
+ } else {
+ WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
+ cpumask_clear_cpu(cpu, tr->pipe_cpumask);
+ }
+}
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
+ int cpu;
int ret;
ret = tracing_check_open_get_tr(tr);
@@ -6687,13 +6740,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
return ret;
mutex_lock(&trace_types_lock);
+ cpu = tracing_get_cpu(inode);
+ ret = open_pipe_on_cpu(tr, cpu);
+ if (ret)
+ goto fail_pipe_on_cpu;
/* create a buffer to store the information to pass to userspace */
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
ret = -ENOMEM;
- __trace_array_put(tr);
- goto out;
+ goto fail_alloc_iter;
}
trace_seq_init(&iter->seq);
@@ -6716,7 +6772,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
iter->tr = tr;
iter->array_buffer = &tr->array_buffer;
- iter->cpu_file = tracing_get_cpu(inode);
+ iter->cpu_file = cpu;
mutex_init(&iter->mutex);
filp->private_data = iter;
@@ -6726,12 +6782,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);
tr->trace_ref++;
-out:
+
mutex_unlock(&trace_types_lock);
return ret;
fail:
kfree(iter);
+fail_alloc_iter:
+ close_pipe_on_cpu(tr, cpu);
+fail_pipe_on_cpu:
__trace_array_put(tr);
mutex_unlock(&trace_types_lock);
return ret;
@@ -6748,12 +6807,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
-
+ close_pipe_on_cpu(tr, iter->cpu_file);
mutex_unlock(&trace_types_lock);
- free_cpumask_var(iter->started);
- kfree(iter->fmt);
- mutex_destroy(&iter->mutex);
+ free_trace_iter_content(iter);
kfree(iter);
trace_array_put(tr);
@@ -7543,6 +7600,11 @@ out:
return ret;
}
+static void tracing_swap_cpu_buffer(void *tr)
+{
+ update_max_tr_single((struct trace_array *)tr, current, smp_processor_id());
+}
+
static ssize_t
tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -7601,13 +7663,15 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
break;
- local_irq_disable();
/* Now, we're going to swap */
- if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
+ local_irq_disable();
update_max_tr(tr, current, smp_processor_id(), NULL);
- else
- update_max_tr_single(tr, current, iter->cpu_file);
- local_irq_enable();
+ local_irq_enable();
+ } else {
+ smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
+ (void *)tr, 1);
+ }
break;
default:
if (tr->allocated_snapshot) {
@@ -8135,7 +8199,7 @@ static const struct file_operations tracing_err_log_fops = {
.open = tracing_err_log_open,
.write = tracing_err_log_write,
.read = seq_read,
- .llseek = seq_lseek,
+ .llseek = tracing_lseek,
.release = tracing_err_log_release,
};
@@ -9411,6 +9475,9 @@ static struct trace_array *trace_array_create(const char *name)
if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
goto out_free_tr;
+ if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
+ goto out_free_tr;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9452,6 +9519,7 @@ static struct trace_array *trace_array_create(const char *name)
out_free_tr:
ftrace_free_ftrace_ops(tr);
free_trace_buffers(tr);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -9554,6 +9622,7 @@ static int __remove_instance(struct trace_array *tr)
}
kfree(tr->topts);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -10351,12 +10420,14 @@ __init static int tracer_alloc_buffers(void)
if (trace_create_savedcmd() < 0)
goto out_free_temp_buffer;
+ if (!zalloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
+ goto out_free_savedcmd;
+
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
- goto out_free_savedcmd;
+ goto out_free_pipe_cpumask;
}
-
if (global_trace.buffer_disabled)
tracing_off();
@@ -10409,6 +10480,8 @@ __init static int tracer_alloc_buffers(void)
return 0;
+out_free_pipe_cpumask:
+ free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
free_saved_cmdlines_buffer(savedcmd);
out_free_temp_buffer:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 79bdefe9261b..5669dd1f90d9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -77,6 +77,16 @@ enum trace_type {
#undef __array
#define __array(type, item, size) type item[size];
+/*
+ * For backward compatibility, older user space expects to see the
+ * kernel_stack event with a fixed size caller field. But today the fix
+ * size is ignored by the kernel, and the real structure is dynamic.
+ * Expose to user space: "unsigned long caller[8];" but the real structure
+ * will be "unsigned long caller[] __counted_by(size)"
+ */
+#undef __stack_array
+#define __stack_array(type, item, size, field) type item[] __counted_by(field);
+
#undef __array_desc
#define __array_desc(type, container, item, size)
@@ -113,6 +123,8 @@ enum trace_type {
#define MEM_FAIL(condition, fmt, ...) \
DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__)
+#define FAULT_STRING "(fault)"
+
#define HIST_STACKTRACE_DEPTH 16
#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
#define HIST_STACKTRACE_SKIP 5
@@ -148,6 +160,17 @@ struct kretprobe_trace_entry_head {
unsigned long ret_ip;
};
+struct fentry_trace_entry_head {
+ struct trace_entry ent;
+ unsigned long ip;
+};
+
+struct fexit_trace_entry_head {
+ struct trace_entry ent;
+ unsigned long func;
+ unsigned long ret_ip;
+};
+
#define TRACE_BUF_SIZE 1024
struct trace_array;
@@ -364,6 +387,8 @@ struct trace_array {
struct list_head events;
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
+ /* one per_cpu trace_pipe can be opened by only one user */
+ cpumask_var_t pipe_cpumask;
int ref;
int trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
@@ -581,7 +606,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
void tracing_reset_online_cpus(struct array_buffer *buf);
-void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
@@ -682,7 +706,6 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
int trace_pid_show(struct seq_file *m, void *v);
-void trace_free_pid_list(struct trace_pid_list *pid_list);
int trace_pid_write(struct trace_pid_list *filtered_pids,
struct trace_pid_list **new_pid_list,
const char __user *ubuf, size_t cnt);
@@ -832,6 +855,8 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
#define TRACE_GRAPH_PRINT_TAIL 0x100
#define TRACE_GRAPH_SLEEP_TIME 0x200
#define TRACE_GRAPH_GRAPH_TIME 0x400
+#define TRACE_GRAPH_PRINT_RETVAL 0x800
+#define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
@@ -1280,6 +1305,14 @@ static inline void trace_branch_disable(void)
/* set ring buffers to default size if not already done so */
int tracing_update_buffers(void);
+union trace_synth_field {
+ u8 as_u8;
+ u16 as_u16;
+ u32 as_u32;
+ u64 as_u64;
+ struct trace_dynamic_info as_dynamic;
+};
+
struct ftrace_event_field {
struct list_head link;
const char *name;
@@ -1309,7 +1342,7 @@ struct trace_subsystem_dir {
struct list_head list;
struct event_subsystem *subsystem;
struct trace_array *tr;
- struct dentry *entry;
+ struct eventfs_file *ef;
int ref_count;
int nr_events;
};
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 778200dd8ede..7ccc7a8e155b 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -31,7 +31,7 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node)
/* Common ftrace options */
xbc_node_for_each_array_value(node, "options", anode, p) {
- if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) {
+ if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) {
pr_err("String is too long: %s\n", p);
continue;
}
@@ -87,7 +87,7 @@ trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node)
const char *p;
xbc_node_for_each_array_value(node, "events", anode, p) {
- if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) {
+ if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) {
pr_err("String is too long: %s\n", p);
continue;
}
@@ -486,7 +486,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
p = xbc_node_find_value(enode, "filter", NULL);
if (p && *p != '\0') {
- if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
+ if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0)
pr_err("filter string is too long: %s\n", p);
else if (apply_event_filter(file, buf) < 0)
pr_err("Failed to apply filter: %s\n", buf);
@@ -494,7 +494,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) {
xbc_node_for_each_array_value(enode, "actions", anode, p) {
- if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
+ if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0)
pr_err("action string is too long: %s\n", p);
else if (trigger_process_regex(file, buf) < 0)
pr_err("Failed to apply an action: %s\n", p);
diff --git a/kernel/trace/trace_btf.c b/kernel/trace/trace_btf.c
new file mode 100644
index 000000000000..ca224d53bfdc
--- /dev/null
+++ b/kernel/trace/trace_btf.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/btf.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "trace_btf.h"
+
+/*
+ * Find a function proto type by name, and return the btf_type with its btf
+ * in *@btf_p. Return NULL if not found.
+ * Note that caller has to call btf_put(*@btf_p) after using the btf_type.
+ */
+const struct btf_type *btf_find_func_proto(const char *func_name, struct btf **btf_p)
+{
+ const struct btf_type *t;
+ s32 id;
+
+ id = bpf_find_btf_id(func_name, BTF_KIND_FUNC, btf_p);
+ if (id < 0)
+ return NULL;
+
+ /* Get BTF_KIND_FUNC type */
+ t = btf_type_by_id(*btf_p, id);
+ if (!t || !btf_type_is_func(t))
+ goto err;
+
+ /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */
+ t = btf_type_by_id(*btf_p, t->type);
+ if (!t || !btf_type_is_func_proto(t))
+ goto err;
+
+ return t;
+err:
+ btf_put(*btf_p);
+ return NULL;
+}
+
+/*
+ * Get function parameter with the number of parameters.
+ * This can return NULL if the function has no parameters.
+ * It can return -EINVAL if the @func_proto is not a function proto type.
+ */
+const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s32 *nr)
+{
+ if (!btf_type_is_func_proto(func_proto))
+ return ERR_PTR(-EINVAL);
+
+ *nr = btf_type_vlen(func_proto);
+ if (*nr > 0)
+ return (const struct btf_param *)(func_proto + 1);
+ else
+ return NULL;
+}
+
+#define BTF_ANON_STACK_MAX 16
+
+struct btf_anon_stack {
+ u32 tid;
+ u32 offset;
+};
+
+/*
+ * Find a member of data structure/union by name and return it.
+ * Return NULL if not found, or -EINVAL if parameter is invalid.
+ * If the member is an member of anonymous union/structure, the offset
+ * of that anonymous union/structure is stored into @anon_offset. Caller
+ * can calculate the correct offset from the root data structure by
+ * adding anon_offset to the member's offset.
+ */
+const struct btf_member *btf_find_struct_member(struct btf *btf,
+ const struct btf_type *type,
+ const char *member_name,
+ u32 *anon_offset)
+{
+ struct btf_anon_stack *anon_stack;
+ const struct btf_member *member;
+ u32 tid, cur_offset = 0;
+ const char *name;
+ int i, top = 0;
+
+ anon_stack = kcalloc(BTF_ANON_STACK_MAX, sizeof(*anon_stack), GFP_KERNEL);
+ if (!anon_stack)
+ return ERR_PTR(-ENOMEM);
+
+retry:
+ if (!btf_type_is_struct(type)) {
+ member = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ for_each_member(i, type, member) {
+ if (!member->name_off) {
+ /* Anonymous union/struct: push it for later use */
+ type = btf_type_skip_modifiers(btf, member->type, &tid);
+ if (type && top < BTF_ANON_STACK_MAX) {
+ anon_stack[top].tid = tid;
+ anon_stack[top++].offset =
+ cur_offset + member->offset;
+ }
+ } else {
+ name = btf_name_by_offset(btf, member->name_off);
+ if (name && !strcmp(member_name, name)) {
+ if (anon_offset)
+ *anon_offset = cur_offset;
+ goto out;
+ }
+ }
+ }
+ if (top > 0) {
+ /* Pop from the anonymous stack and retry */
+ tid = anon_stack[--top].tid;
+ cur_offset = anon_stack[top].offset;
+ type = btf_type_by_id(btf, tid);
+ goto retry;
+ }
+ member = NULL;
+
+out:
+ kfree(anon_stack);
+ return member;
+}
+
diff --git a/kernel/trace/trace_btf.h b/kernel/trace/trace_btf.h
new file mode 100644
index 000000000000..4bc44bc261e6
--- /dev/null
+++ b/kernel/trace/trace_btf.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/btf.h>
+
+const struct btf_type *btf_find_func_proto(const char *func_name,
+ struct btf **btf_p);
+const struct btf_param *btf_get_func_param(const struct btf_type *func_proto,
+ s32 *nr);
+const struct btf_member *btf_find_struct_member(struct btf *btf,
+ const struct btf_type *type,
+ const char *member_name,
+ u32 *anon_offset);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index cd41e863b51c..c47422b20908 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -86,6 +86,30 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
);
/* Function return entry */
+#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+
+FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
+
+ TRACE_GRAPH_RET,
+
+ F_STRUCT(
+ __field_struct( struct ftrace_graph_ret, ret )
+ __field_packed( unsigned long, ret, func )
+ __field_packed( unsigned long, ret, retval )
+ __field_packed( int, ret, depth )
+ __field_packed( unsigned int, ret, overrun )
+ __field_packed( unsigned long long, ret, calltime)
+ __field_packed( unsigned long long, ret, rettime )
+ ),
+
+ F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx",
+ (void *)__entry->func, __entry->depth,
+ __entry->calltime, __entry->rettime,
+ __entry->depth, __entry->retval)
+);
+
+#else
+
FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
TRACE_GRAPH_RET,
@@ -105,6 +129,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
__entry->depth)
);
+#endif
+
/*
* Context switch trace entry - which task (and prio) we switched from/to:
*
@@ -164,7 +190,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
F_STRUCT(
__field( int, size )
- __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
+ __stack_array( unsigned long, caller, FTRACE_STACK_ENTRIES, size)
),
F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n"
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 67e854979d53..72714cbf475c 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -41,6 +41,10 @@ struct eprobe_data {
struct trace_eprobe *ep;
};
+
+#define for_each_trace_eprobe_tp(ep, _tp) \
+ list_for_each_entry(ep, trace_probe_probe_list(_tp), tp.list)
+
static int __trace_eprobe_create(int argc, const char *argv[]);
static void trace_event_probe_cleanup(struct trace_eprobe *ep)
@@ -227,37 +231,6 @@ error:
return ERR_PTR(ret);
}
-static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i)
-{
- struct probe_arg *parg = &ep->tp.args[i];
- struct ftrace_event_field *field;
- struct list_head *head;
- int ret = -ENOENT;
-
- head = trace_get_fields(ep->event);
- list_for_each_entry(field, head, link) {
- if (!strcmp(parg->code->data, field->name)) {
- kfree(parg->code->data);
- parg->code->data = field;
- return 0;
- }
- }
-
- /*
- * Argument not found on event. But allow for comm and COMM
- * to be used to get the current->comm.
- */
- if (strcmp(parg->code->data, "COMM") == 0 ||
- strcmp(parg->code->data, "comm") == 0) {
- parg->code->op = FETCH_OP_COMM;
- ret = 0;
- }
-
- kfree(parg->code->data);
- parg->code->data = NULL;
- return ret;
-}
-
static int eprobe_event_define_fields(struct trace_event_call *event_call)
{
struct eprobe_trace_entry_head field;
@@ -671,10 +644,11 @@ static int disable_eprobe(struct trace_eprobe *ep,
static int enable_trace_eprobe(struct trace_event_call *call,
struct trace_event_file *file)
{
- struct trace_probe *pos, *tp;
+ struct trace_probe *tp;
struct trace_eprobe *ep;
bool enabled;
int ret = 0;
+ int cnt = 0;
tp = trace_probe_primary_from_call(call);
if (WARN_ON_ONCE(!tp))
@@ -692,18 +666,29 @@ static int enable_trace_eprobe(struct trace_event_call *call,
if (enabled)
return 0;
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- ep = container_of(pos, struct trace_eprobe, tp);
+ for_each_trace_eprobe_tp(ep, tp) {
ret = enable_eprobe(ep, file);
if (ret)
break;
enabled = true;
+ cnt++;
}
if (ret) {
/* Failed to enable one of them. Roll back all */
- if (enabled)
- disable_eprobe(ep, file->tr);
+ if (enabled) {
+ /*
+ * It's a bug if one failed for something other than memory
+ * not being available but another eprobe succeeded.
+ */
+ WARN_ON_ONCE(ret != -ENOMEM);
+
+ for_each_trace_eprobe_tp(ep, tp) {
+ disable_eprobe(ep, file->tr);
+ if (!--cnt)
+ break;
+ }
+ }
if (file)
trace_probe_remove_file(tp, file);
else
@@ -716,7 +701,7 @@ static int enable_trace_eprobe(struct trace_event_call *call,
static int disable_trace_eprobe(struct trace_event_call *call,
struct trace_event_file *file)
{
- struct trace_probe *pos, *tp;
+ struct trace_probe *tp;
struct trace_eprobe *ep;
tp = trace_probe_primary_from_call(call);
@@ -733,10 +718,8 @@ static int disable_trace_eprobe(struct trace_event_call *call,
trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
if (!trace_probe_is_enabled(tp)) {
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- ep = container_of(pos, struct trace_eprobe, tp);
+ for_each_trace_eprobe_tp(ep, tp)
disable_eprobe(ep, file->tr);
- }
}
out:
@@ -817,23 +800,18 @@ find_and_get_event(const char *system, const char *event_name)
static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i)
{
- unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TPOINT;
+ struct traceprobe_parse_context ctx = {
+ .event = ep->event,
+ .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT,
+ };
int ret;
- ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags);
- if (ret)
- return ret;
-
- if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) {
- ret = trace_eprobe_tp_arg_update(ep, i);
- if (ret)
- trace_probe_log_err(0, BAD_ATTACH_ARG);
- }
-
+ ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx);
/* Handle symbols "@" */
if (!ret)
ret = traceprobe_update_arg(&ep->tp.args[i]);
+ traceprobe_finish_parse(&ctx);
return ret;
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 57e539d47989..ed367d713be0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -611,7 +611,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
{
struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
- unsigned long file_flags = file->flags;
int ret = 0;
int disable;
@@ -635,6 +634,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
break;
disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ /* Disable use of trace_buffered_event */
+ trace_buffered_event_disable();
} else
disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
@@ -673,6 +674,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (atomic_inc_return(&file->sm_ref) > 1)
break;
set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ /* Enable use of trace_buffered_event */
+ trace_buffered_event_enable();
}
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
@@ -712,15 +715,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
break;
}
- /* Enable or disable use of trace_buffered_event */
- if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) !=
- (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) {
- if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
- trace_buffered_event_enable();
- else
- trace_buffered_event_disable();
- }
-
return ret;
}
@@ -990,7 +984,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
return;
if (!--dir->nr_events) {
- tracefs_remove(dir->entry);
+ eventfs_remove(dir->ef);
list_del(&dir->list);
__put_system_dir(dir);
}
@@ -1011,7 +1005,7 @@ static void remove_event_file_dir(struct trace_event_file *file)
tracefs_remove(dir);
}
-
+ eventfs_remove(file->ef);
list_del(&file->list);
remove_subsystem(file->system);
free_event_filter(file->filter);
@@ -2297,13 +2291,13 @@ create_new_subsystem(const char *name)
return NULL;
}
-static struct dentry *
+static struct eventfs_file *
event_subsystem_dir(struct trace_array *tr, const char *name,
struct trace_event_file *file, struct dentry *parent)
{
struct event_subsystem *system, *iter;
struct trace_subsystem_dir *dir;
- struct dentry *entry;
+ int res;
/* First see if we did not already create this dir */
list_for_each_entry(dir, &tr->systems, list) {
@@ -2311,7 +2305,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
if (strcmp(system->name, name) == 0) {
dir->nr_events++;
file->system = dir;
- return dir->entry;
+ return dir->ef;
}
}
@@ -2335,8 +2329,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
} else
__get_system(system);
- dir->entry = tracefs_create_dir(name, parent);
- if (!dir->entry) {
+ dir->ef = eventfs_add_subsystem_dir(name, parent);
+ if (IS_ERR(dir->ef)) {
pr_warn("Failed to create system directory %s\n", name);
__put_system(system);
goto out_free;
@@ -2351,22 +2345,22 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
/* the ftrace system is special, do not create enable or filter files */
if (strcmp(name, "ftrace") != 0) {
- entry = tracefs_create_file("filter", TRACE_MODE_WRITE,
- dir->entry, dir,
+ res = eventfs_add_file("filter", TRACE_MODE_WRITE,
+ dir->ef, dir,
&ftrace_subsystem_filter_fops);
- if (!entry) {
+ if (res) {
kfree(system->filter);
system->filter = NULL;
pr_warn("Could not create tracefs '%s/filter' entry\n", name);
}
- trace_create_file("enable", TRACE_MODE_WRITE, dir->entry, dir,
+ eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir,
&ftrace_system_enable_fops);
}
list_add(&dir->list, &tr->systems);
- return dir->entry;
+ return dir->ef;
out_free:
kfree(dir);
@@ -2419,36 +2413,37 @@ static int
event_create_dir(struct dentry *parent, struct trace_event_file *file)
{
struct trace_event_call *call = file->event_call;
+ struct eventfs_file *ef_subsystem = NULL;
struct trace_array *tr = file->tr;
- struct dentry *d_events;
const char *name;
int ret;
/*
* If the trace point header did not define TRACE_SYSTEM
- * then the system would be called "TRACE_SYSTEM".
+ * then the system would be called "TRACE_SYSTEM". This should
+ * never happen.
*/
- if (strcmp(call->class->system, TRACE_SYSTEM) != 0) {
- d_events = event_subsystem_dir(tr, call->class->system, file, parent);
- if (!d_events)
- return -ENOMEM;
- } else
- d_events = parent;
+ if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0))
+ return -ENODEV;
+
+ ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent);
+ if (!ef_subsystem)
+ return -ENOMEM;
name = trace_event_name(call);
- file->dir = tracefs_create_dir(name, d_events);
- if (!file->dir) {
+ file->ef = eventfs_add_dir(name, ef_subsystem);
+ if (IS_ERR(file->ef)) {
pr_warn("Could not create tracefs '%s' directory\n", name);
return -1;
}
if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
- trace_create_file("enable", TRACE_MODE_WRITE, file->dir, file,
+ eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file,
&ftrace_enable_fops);
#ifdef CONFIG_PERF_EVENTS
if (call->event.type && call->class->reg)
- trace_create_file("id", TRACE_MODE_READ, file->dir,
+ eventfs_add_file("id", TRACE_MODE_READ, file->ef,
(void *)(long)call->event.type,
&ftrace_event_id_fops);
#endif
@@ -2464,27 +2459,27 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
* triggers or filters.
*/
if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
- trace_create_file("filter", TRACE_MODE_WRITE, file->dir,
+ eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef,
file, &ftrace_event_filter_fops);
- trace_create_file("trigger", TRACE_MODE_WRITE, file->dir,
+ eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef,
file, &event_trigger_fops);
}
#ifdef CONFIG_HIST_TRIGGERS
- trace_create_file("hist", TRACE_MODE_READ, file->dir, file,
+ eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file,
&event_hist_fops);
#endif
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
- trace_create_file("hist_debug", TRACE_MODE_READ, file->dir, file,
+ eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file,
&event_hist_debug_fops);
#endif
- trace_create_file("format", TRACE_MODE_READ, file->dir, call,
+ eventfs_add_file("format", TRACE_MODE_READ, file->ef, call,
&ftrace_event_format_fops);
#ifdef CONFIG_TRACE_EVENT_INJECT
if (call->event.type && call->class->reg)
- trace_create_file("inject", 0200, file->dir, file,
+ eventfs_add_file("inject", 0200, file->ef, file,
&event_inject_fops);
#endif
@@ -2833,7 +2828,7 @@ static __init int setup_trace_triggers(char *str)
char *buf;
int i;
- strlcpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
+ strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
ring_buffer_expanded = true;
disable_tracing_selftest("running event triggers");
@@ -3623,7 +3618,7 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
static __init int setup_trace_event(char *str)
{
- strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
+ strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
ring_buffer_expanded = true;
disable_tracing_selftest("running event tracing");
@@ -3637,21 +3632,22 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
{
struct dentry *d_events;
struct dentry *entry;
+ int error = 0;
entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
tr, &ftrace_set_event_fops);
if (!entry)
return -ENOMEM;
- d_events = tracefs_create_dir("events", parent);
- if (!d_events) {
+ d_events = eventfs_create_events_dir("events", parent);
+ if (IS_ERR(d_events)) {
pr_warn("Could not create tracefs 'events' directory\n");
return -ENOMEM;
}
- entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events,
+ error = eventfs_add_events_file("enable", TRACE_MODE_WRITE, d_events,
tr, &ftrace_tr_enable_fops);
- if (!entry)
+ if (error)
return -ENOMEM;
/* There are not as crucial, just warn if they are not created */
@@ -3664,11 +3660,11 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
&ftrace_set_event_notrace_pid_fops);
/* ring buffer internal formats */
- trace_create_file("header_page", TRACE_MODE_READ, d_events,
+ eventfs_add_events_file("header_page", TRACE_MODE_READ, d_events,
ring_buffer_print_page_header,
&ftrace_show_header_fops);
- trace_create_file("header_event", TRACE_MODE_READ, d_events,
+ eventfs_add_events_file("header_event", TRACE_MODE_READ, d_events,
ring_buffer_print_entry_header,
&ftrace_show_header_fops);
@@ -3756,7 +3752,7 @@ int event_trace_del_tracer(struct trace_array *tr)
down_write(&trace_event_sem);
__trace_remove_event_dirs(tr);
- tracefs_remove(tr->event_dir);
+ eventfs_remove_events_dir(tr->event_dir);
up_write(&trace_event_sem);
tr->event_dir = NULL;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 1dad64267878..33264e510d16 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -46,15 +46,19 @@ static const char * ops[] = { OPS };
enum filter_pred_fn {
FILTER_PRED_FN_NOP,
FILTER_PRED_FN_64,
+ FILTER_PRED_FN_64_CPUMASK,
FILTER_PRED_FN_S64,
FILTER_PRED_FN_U64,
FILTER_PRED_FN_32,
+ FILTER_PRED_FN_32_CPUMASK,
FILTER_PRED_FN_S32,
FILTER_PRED_FN_U32,
FILTER_PRED_FN_16,
+ FILTER_PRED_FN_16_CPUMASK,
FILTER_PRED_FN_S16,
FILTER_PRED_FN_U16,
FILTER_PRED_FN_8,
+ FILTER_PRED_FN_8_CPUMASK,
FILTER_PRED_FN_S8,
FILTER_PRED_FN_U8,
FILTER_PRED_FN_COMM,
@@ -64,21 +68,25 @@ enum filter_pred_fn {
FILTER_PRED_FN_PCHAR_USER,
FILTER_PRED_FN_PCHAR,
FILTER_PRED_FN_CPU,
+ FILTER_PRED_FN_CPU_CPUMASK,
+ FILTER_PRED_FN_CPUMASK,
+ FILTER_PRED_FN_CPUMASK_CPU,
FILTER_PRED_FN_FUNCTION,
FILTER_PRED_FN_,
FILTER_PRED_TEST_VISITED,
};
struct filter_pred {
- enum filter_pred_fn fn_num;
- u64 val;
- u64 val2;
- struct regex regex;
+ struct regex *regex;
+ struct cpumask *mask;
unsigned short *ops;
struct ftrace_event_field *field;
- int offset;
+ u64 val;
+ u64 val2;
+ enum filter_pred_fn fn_num;
+ int offset;
int not;
- int op;
+ int op;
};
/*
@@ -94,6 +102,8 @@ struct filter_pred {
C(TOO_MANY_OPEN, "Too many '('"), \
C(TOO_MANY_CLOSE, "Too few '('"), \
C(MISSING_QUOTE, "Missing matching quote"), \
+ C(MISSING_BRACE_OPEN, "Missing '{'"), \
+ C(MISSING_BRACE_CLOSE, "Missing '}'"), \
C(OPERAND_TOO_LONG, "Operand too long"), \
C(EXPECT_STRING, "Expecting string field"), \
C(EXPECT_DIGIT, "Expecting numeric field"), \
@@ -103,6 +113,7 @@ struct filter_pred {
C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \
C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \
C(INVALID_FILTER, "Meaningless filter expression"), \
+ C(INVALID_CPULIST, "Invalid cpulist"), \
C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \
C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \
C(NO_FUNCTION, "Function not found"), \
@@ -186,6 +197,15 @@ enum {
PROCESS_OR = 4,
};
+static void free_predicate(struct filter_pred *pred)
+{
+ if (pred) {
+ kfree(pred->regex);
+ kfree(pred->mask);
+ kfree(pred);
+ }
+}
+
/*
* Without going into a formal proof, this explains the method that is used in
* parsing the logical expressions.
@@ -623,12 +643,64 @@ out_free:
kfree(inverts);
if (prog_stack) {
for (i = 0; prog_stack[i].pred; i++)
- kfree(prog_stack[i].pred);
+ free_predicate(prog_stack[i].pred);
kfree(prog_stack);
}
return ERR_PTR(ret);
}
+static inline int
+do_filter_cpumask(int op, const struct cpumask *mask, const struct cpumask *cmp)
+{
+ switch (op) {
+ case OP_EQ:
+ return cpumask_equal(mask, cmp);
+ case OP_NE:
+ return !cpumask_equal(mask, cmp);
+ case OP_BAND:
+ return cpumask_intersects(mask, cmp);
+ default:
+ return 0;
+ }
+}
+
+/* Optimisation of do_filter_cpumask() for scalar fields */
+static inline int
+do_filter_scalar_cpumask(int op, unsigned int cpu, const struct cpumask *mask)
+{
+ /*
+ * Per the weight-of-one cpumask optimisations, the mask passed in this
+ * function has a weight >= 2, so it is never equal to a single scalar.
+ */
+ switch (op) {
+ case OP_EQ:
+ return false;
+ case OP_NE:
+ return true;
+ case OP_BAND:
+ return cpumask_test_cpu(cpu, mask);
+ default:
+ return 0;
+ }
+}
+
+static inline int
+do_filter_cpumask_scalar(int op, const struct cpumask *mask, unsigned int cpu)
+{
+ switch (op) {
+ case OP_EQ:
+ return cpumask_test_cpu(cpu, mask) &&
+ cpumask_nth(1, mask) >= nr_cpu_ids;
+ case OP_NE:
+ return !cpumask_test_cpu(cpu, mask) ||
+ cpumask_nth(1, mask) < nr_cpu_ids;
+ case OP_BAND:
+ return cpumask_test_cpu(cpu, mask);
+ default:
+ return 0;
+ }
+}
+
enum pred_cmp_types {
PRED_CMP_TYPE_NOP,
PRED_CMP_TYPE_LT,
@@ -672,6 +744,18 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
} \
}
+#define DEFINE_CPUMASK_COMPARISON_PRED(size) \
+static int filter_pred_##size##_cpumask(struct filter_pred *pred, void *event) \
+{ \
+ u##size *addr = (u##size *)(event + pred->offset); \
+ unsigned int cpu = *addr; \
+ \
+ if (cpu >= nr_cpu_ids) \
+ return 0; \
+ \
+ return do_filter_scalar_cpumask(pred->op, cpu, pred->mask); \
+}
+
#define DEFINE_EQUALITY_PRED(size) \
static int filter_pred_##size(struct filter_pred *pred, void *event) \
{ \
@@ -693,6 +777,11 @@ DEFINE_COMPARISON_PRED(u16);
DEFINE_COMPARISON_PRED(s8);
DEFINE_COMPARISON_PRED(u8);
+DEFINE_CPUMASK_COMPARISON_PRED(64);
+DEFINE_CPUMASK_COMPARISON_PRED(32);
+DEFINE_CPUMASK_COMPARISON_PRED(16);
+DEFINE_CPUMASK_COMPARISON_PRED(8);
+
DEFINE_EQUALITY_PRED(64);
DEFINE_EQUALITY_PRED(32);
DEFINE_EQUALITY_PRED(16);
@@ -750,7 +839,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event)
char *addr = (char *)(event + pred->offset);
int cmp, match;
- cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
+ cmp = pred->regex->match(addr, pred->regex, pred->regex->field_len);
match = cmp ^ pred->not;
@@ -763,7 +852,7 @@ static __always_inline int filter_pchar(struct filter_pred *pred, char *str)
int len;
len = strlen(str) + 1; /* including tailing '\0' */
- cmp = pred->regex.match(str, &pred->regex, len);
+ cmp = pred->regex->match(str, pred->regex, len);
match = cmp ^ pred->not;
@@ -813,7 +902,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
char *addr = (char *)(event + str_loc);
int cmp, match;
- cmp = pred->regex.match(addr, &pred->regex, str_len);
+ cmp = pred->regex->match(addr, pred->regex, str_len);
match = cmp ^ pred->not;
@@ -836,7 +925,7 @@ static int filter_pred_strrelloc(struct filter_pred *pred, void *event)
char *addr = (char *)(&item[1]) + str_loc;
int cmp, match;
- cmp = pred->regex.match(addr, &pred->regex, str_len);
+ cmp = pred->regex->match(addr, pred->regex, str_len);
match = cmp ^ pred->not;
@@ -869,12 +958,42 @@ static int filter_pred_cpu(struct filter_pred *pred, void *event)
}
}
+/* Filter predicate for current CPU vs user-provided cpumask */
+static int filter_pred_cpu_cpumask(struct filter_pred *pred, void *event)
+{
+ int cpu = raw_smp_processor_id();
+
+ return do_filter_scalar_cpumask(pred->op, cpu, pred->mask);
+}
+
+/* Filter predicate for cpumask field vs user-provided cpumask */
+static int filter_pred_cpumask(struct filter_pred *pred, void *event)
+{
+ u32 item = *(u32 *)(event + pred->offset);
+ int loc = item & 0xffff;
+ const struct cpumask *mask = (event + loc);
+ const struct cpumask *cmp = pred->mask;
+
+ return do_filter_cpumask(pred->op, mask, cmp);
+}
+
+/* Filter predicate for cpumask field vs user-provided scalar */
+static int filter_pred_cpumask_cpu(struct filter_pred *pred, void *event)
+{
+ u32 item = *(u32 *)(event + pred->offset);
+ int loc = item & 0xffff;
+ const struct cpumask *mask = (event + loc);
+ unsigned int cpu = pred->val;
+
+ return do_filter_cpumask_scalar(pred->op, mask, cpu);
+}
+
/* Filter predicate for COMM. */
static int filter_pred_comm(struct filter_pred *pred, void *event)
{
int cmp;
- cmp = pred->regex.match(current->comm, &pred->regex,
+ cmp = pred->regex->match(current->comm, pred->regex,
TASK_COMM_LEN);
return cmp ^ pred->not;
}
@@ -1004,7 +1123,7 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
static void filter_build_regex(struct filter_pred *pred)
{
- struct regex *r = &pred->regex;
+ struct regex *r = pred->regex;
char *search;
enum regex_type type = MATCH_FULL;
@@ -1169,7 +1288,7 @@ static void free_prog(struct event_filter *filter)
return;
for (i = 0; prog[i].pred; i++)
- kfree(prog[i].pred);
+ free_predicate(prog[i].pred);
kfree(prog);
}
@@ -1236,8 +1355,12 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
int filter_assign_type(const char *type)
{
- if (strstr(type, "__data_loc") && strstr(type, "char"))
- return FILTER_DYN_STRING;
+ if (strstr(type, "__data_loc")) {
+ if (strstr(type, "char"))
+ return FILTER_DYN_STRING;
+ if (strstr(type, "cpumask_t"))
+ return FILTER_CPUMASK;
+ }
if (strstr(type, "__rel_loc") && strstr(type, "char"))
return FILTER_RDYN_STRING;
@@ -1313,24 +1436,32 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event)
switch (pred->fn_num) {
case FILTER_PRED_FN_64:
return filter_pred_64(pred, event);
+ case FILTER_PRED_FN_64_CPUMASK:
+ return filter_pred_64_cpumask(pred, event);
case FILTER_PRED_FN_S64:
return filter_pred_s64(pred, event);
case FILTER_PRED_FN_U64:
return filter_pred_u64(pred, event);
case FILTER_PRED_FN_32:
return filter_pred_32(pred, event);
+ case FILTER_PRED_FN_32_CPUMASK:
+ return filter_pred_32_cpumask(pred, event);
case FILTER_PRED_FN_S32:
return filter_pred_s32(pred, event);
case FILTER_PRED_FN_U32:
return filter_pred_u32(pred, event);
case FILTER_PRED_FN_16:
return filter_pred_16(pred, event);
+ case FILTER_PRED_FN_16_CPUMASK:
+ return filter_pred_16_cpumask(pred, event);
case FILTER_PRED_FN_S16:
return filter_pred_s16(pred, event);
case FILTER_PRED_FN_U16:
return filter_pred_u16(pred, event);
case FILTER_PRED_FN_8:
return filter_pred_8(pred, event);
+ case FILTER_PRED_FN_8_CPUMASK:
+ return filter_pred_8_cpumask(pred, event);
case FILTER_PRED_FN_S8:
return filter_pred_s8(pred, event);
case FILTER_PRED_FN_U8:
@@ -1349,6 +1480,12 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event)
return filter_pred_pchar(pred, event);
case FILTER_PRED_FN_CPU:
return filter_pred_cpu(pred, event);
+ case FILTER_PRED_FN_CPU_CPUMASK:
+ return filter_pred_cpu_cpumask(pred, event);
+ case FILTER_PRED_FN_CPUMASK:
+ return filter_pred_cpumask(pred, event);
+ case FILTER_PRED_FN_CPUMASK_CPU:
+ return filter_pred_cpumask_cpu(pred, event);
case FILTER_PRED_FN_FUNCTION:
return filter_pred_function(pred, event);
case FILTER_PRED_TEST_VISITED:
@@ -1553,9 +1690,130 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- pred->regex.len = len;
- strncpy(pred->regex.pattern, str + s, len);
- pred->regex.pattern[len] = 0;
+ pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL);
+ if (!pred->regex)
+ goto err_mem;
+ pred->regex->len = len;
+ strncpy(pred->regex->pattern, str + s, len);
+ pred->regex->pattern[len] = 0;
+
+ } else if (!strncmp(str + i, "CPUS", 4)) {
+ unsigned int maskstart;
+ bool single;
+ char *tmp;
+
+ switch (field->filter_type) {
+ case FILTER_CPUMASK:
+ case FILTER_CPU:
+ case FILTER_OTHER:
+ break;
+ default:
+ parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+ goto err_free;
+ }
+
+ switch (op) {
+ case OP_EQ:
+ case OP_NE:
+ case OP_BAND:
+ break;
+ default:
+ parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+ goto err_free;
+ }
+
+ /* Skip CPUS */
+ i += 4;
+ if (str[i++] != '{') {
+ parse_error(pe, FILT_ERR_MISSING_BRACE_OPEN, pos + i);
+ goto err_free;
+ }
+ maskstart = i;
+
+ /* Walk the cpulist until closing } */
+ for (; str[i] && str[i] != '}'; i++)
+ ;
+
+ if (str[i] != '}') {
+ parse_error(pe, FILT_ERR_MISSING_BRACE_CLOSE, pos + i);
+ goto err_free;
+ }
+
+ if (maskstart == i) {
+ parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i);
+ goto err_free;
+ }
+
+ /* Copy the cpulist between { and } */
+ tmp = kmalloc((i - maskstart) + 1, GFP_KERNEL);
+ if (!tmp)
+ goto err_mem;
+
+ strscpy(tmp, str + maskstart, (i - maskstart) + 1);
+ pred->mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!pred->mask) {
+ kfree(tmp);
+ goto err_mem;
+ }
+
+ /* Now parse it */
+ if (cpulist_parse(tmp, pred->mask)) {
+ kfree(tmp);
+ parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i);
+ goto err_free;
+ }
+ kfree(tmp);
+
+ /* Move along */
+ i++;
+
+ /*
+ * Optimisation: if the user-provided mask has a weight of one
+ * then we can treat it as a scalar input.
+ */
+ single = cpumask_weight(pred->mask) == 1;
+ if (single) {
+ pred->val = cpumask_first(pred->mask);
+ kfree(pred->mask);
+ pred->mask = NULL;
+ }
+
+ if (field->filter_type == FILTER_CPUMASK) {
+ pred->fn_num = single ?
+ FILTER_PRED_FN_CPUMASK_CPU :
+ FILTER_PRED_FN_CPUMASK;
+ } else if (field->filter_type == FILTER_CPU) {
+ if (single) {
+ if (pred->op == OP_BAND)
+ pred->op = OP_EQ;
+
+ pred->fn_num = FILTER_PRED_FN_CPU;
+ } else {
+ pred->fn_num = FILTER_PRED_FN_CPU_CPUMASK;
+ }
+ } else if (single) {
+ if (pred->op == OP_BAND)
+ pred->op = OP_EQ;
+
+ pred->fn_num = select_comparison_fn(pred->op, field->size, false);
+ if (pred->op == OP_NE)
+ pred->not = 1;
+ } else {
+ switch (field->size) {
+ case 8:
+ pred->fn_num = FILTER_PRED_FN_64_CPUMASK;
+ break;
+ case 4:
+ pred->fn_num = FILTER_PRED_FN_32_CPUMASK;
+ break;
+ case 2:
+ pred->fn_num = FILTER_PRED_FN_16_CPUMASK;
+ break;
+ case 1:
+ pred->fn_num = FILTER_PRED_FN_8_CPUMASK;
+ break;
+ }
+ }
/* This is either a string, or an integer */
} else if (str[i] == '\'' || str[i] == '"') {
@@ -1597,9 +1855,12 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- pred->regex.len = len;
- strncpy(pred->regex.pattern, str + s, len);
- pred->regex.pattern[len] = 0;
+ pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL);
+ if (!pred->regex)
+ goto err_mem;
+ pred->regex->len = len;
+ strncpy(pred->regex->pattern, str + s, len);
+ pred->regex->pattern[len] = 0;
filter_build_regex(pred);
@@ -1608,7 +1869,7 @@ static int parse_pred(const char *str, void *data,
} else if (field->filter_type == FILTER_STATIC_STRING) {
pred->fn_num = FILTER_PRED_FN_STRING;
- pred->regex.field_len = field->size;
+ pred->regex->field_len = field->size;
} else if (field->filter_type == FILTER_DYN_STRING) {
pred->fn_num = FILTER_PRED_FN_STRLOC;
@@ -1691,10 +1952,10 @@ static int parse_pred(const char *str, void *data,
return i;
err_free:
- kfree(pred);
+ free_predicate(pred);
return -EINVAL;
err_mem:
- kfree(pred);
+ free_predicate(pred);
return -ENOMEM;
}
@@ -2287,8 +2548,8 @@ static int ftrace_function_set_filter_pred(struct filter_pred *pred,
return ret;
return __ftrace_function_set_filter(pred->op == OP_EQ,
- pred->regex.pattern,
- pred->regex.len,
+ pred->regex->pattern,
+ pred->regex->len,
data);
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index b97d3ad832f1..d06938ae0717 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -6663,13 +6663,16 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
if (get_named_trigger_data(trigger_data))
goto enable;
- if (has_hist_vars(hist_data))
- save_hist_vars(hist_data);
-
ret = create_actions(hist_data);
if (ret)
goto out_unreg;
+ if (has_hist_vars(hist_data) || hist_data->n_var_refs) {
+ ret = save_hist_vars(hist_data);
+ if (ret)
+ goto out_unreg;
+ }
+
ret = tracing_map_init(hist_data->map);
if (ret)
goto out_unreg;
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index d6b4935a78c0..abe805d471eb 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -217,7 +217,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
char *addr = (char *)(unsigned long) val;
if (field->filter_type == FILTER_STATIC_STRING) {
- strlcpy(entry + field->offset, addr, field->size);
+ strscpy(entry + field->offset, addr, field->size);
} else if (field->filter_type == FILTER_DYN_STRING ||
field->filter_type == FILTER_RDYN_STRING) {
int str_len = strlen(addr) + 1;
@@ -232,7 +232,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
}
entry = *pentry;
- strlcpy(entry + (entry_size - str_len), addr, str_len);
+ strscpy(entry + (entry_size - str_len), addr, str_len);
str_item = (u32 *)(entry + field->offset);
if (field->filter_type == FILTER_RDYN_STRING)
str_loc -= field->offset + field->size;
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index d6a70aff2410..9897d0bfcab7 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -127,7 +127,7 @@ static bool synth_event_match(const char *system, const char *event,
struct synth_trace_event {
struct trace_entry ent;
- u64 fields[];
+ union trace_synth_field fields[];
};
static int synth_event_define_fields(struct trace_event_call *call)
@@ -321,19 +321,19 @@ static const char *synth_field_fmt(char *type)
static void print_synth_event_num_val(struct trace_seq *s,
char *print_fmt, char *name,
- int size, u64 val, char *space)
+ int size, union trace_synth_field *val, char *space)
{
switch (size) {
case 1:
- trace_seq_printf(s, print_fmt, name, (u8)val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u8, space);
break;
case 2:
- trace_seq_printf(s, print_fmt, name, (u16)val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u16, space);
break;
case 4:
- trace_seq_printf(s, print_fmt, name, (u32)val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u32, space);
break;
default:
@@ -350,7 +350,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
struct synth_trace_event *entry;
struct synth_event *se;
- unsigned int i, n_u64;
+ unsigned int i, j, n_u64;
char print_fmt[32];
const char *fmt;
@@ -374,43 +374,28 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
/* parameter values */
if (se->fields[i]->is_string) {
if (se->fields[i]->is_dynamic) {
- u32 offset, data_offset;
- char *str_field;
-
- offset = (u32)entry->fields[n_u64];
- data_offset = offset & 0xffff;
-
- str_field = (char *)entry + data_offset;
+ union trace_synth_field *data = &entry->fields[n_u64];
trace_seq_printf(s, print_fmt, se->fields[i]->name,
STR_VAR_LEN_MAX,
- str_field,
+ (char *)entry + data->as_dynamic.offset,
i == se->n_fields - 1 ? "" : " ");
n_u64++;
} else {
trace_seq_printf(s, print_fmt, se->fields[i]->name,
STR_VAR_LEN_MAX,
- (char *)&entry->fields[n_u64],
+ (char *)&entry->fields[n_u64].as_u64,
i == se->n_fields - 1 ? "" : " ");
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
}
} else if (se->fields[i]->is_stack) {
- u32 offset, data_offset, len;
- unsigned long *p, *end;
-
- offset = (u32)entry->fields[n_u64];
- data_offset = offset & 0xffff;
- len = offset >> 16;
-
- p = (void *)entry + data_offset;
- end = (void *)p + len - (sizeof(long) - 1);
+ union trace_synth_field *data = &entry->fields[n_u64];
+ unsigned long *p = (void *)entry + data->as_dynamic.offset;
trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name);
-
- for (; *p && p < end; p++)
- trace_seq_printf(s, "=> %pS\n", (void *)*p);
+ for (j = 1; j < data->as_dynamic.len / sizeof(long); j++)
+ trace_seq_printf(s, "=> %pS\n", (void *)p[j]);
n_u64++;
-
} else {
struct trace_print_flags __flags[] = {
__def_gfpflag_names, {-1, NULL} };
@@ -419,13 +404,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
print_synth_event_num_val(s, print_fmt,
se->fields[i]->name,
se->fields[i]->size,
- entry->fields[n_u64],
+ &entry->fields[n_u64],
space);
if (strcmp(se->fields[i]->type, "gfp_t") == 0) {
trace_seq_puts(s, " (");
trace_print_flags_seq(s, "|",
- entry->fields[n_u64],
+ entry->fields[n_u64].as_u64,
__flags);
trace_seq_putc(s, ')');
}
@@ -454,21 +439,16 @@ static unsigned int trace_string(struct synth_trace_event *entry,
int ret;
if (is_dynamic) {
- u32 data_offset;
+ union trace_synth_field *data = &entry->fields[*n_u64];
- data_offset = struct_size(entry, fields, event->n_u64);
- data_offset += data_size;
-
- len = fetch_store_strlen((unsigned long)str_val);
-
- data_offset |= len << 16;
- *(u32 *)&entry->fields[*n_u64] = data_offset;
+ data->as_dynamic.offset = struct_size(entry, fields, event->n_u64) + data_size;
+ data->as_dynamic.len = fetch_store_strlen((unsigned long)str_val);
ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
(*n_u64)++;
} else {
- str_field = (char *)&entry->fields[*n_u64];
+ str_field = (char *)&entry->fields[*n_u64].as_u64;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if ((unsigned long)str_val < TASK_SIZE)
@@ -492,6 +472,7 @@ static unsigned int trace_stack(struct synth_trace_event *entry,
unsigned int data_size,
unsigned int *n_u64)
{
+ union trace_synth_field *data = &entry->fields[*n_u64];
unsigned int len;
u32 data_offset;
void *data_loc;
@@ -504,10 +485,6 @@ static unsigned int trace_stack(struct synth_trace_event *entry,
break;
}
- /* Include the zero'd element if it fits */
- if (len < HIST_STACKTRACE_DEPTH)
- len++;
-
len *= sizeof(long);
/* Find the dynamic section to copy the stack into. */
@@ -515,8 +492,9 @@ static unsigned int trace_stack(struct synth_trace_event *entry,
memcpy(data_loc, stack, len);
/* Fill in the field that holds the offset/len combo */
- data_offset |= len << 16;
- *(u32 *)&entry->fields[*n_u64] = data_offset;
+
+ data->as_dynamic.offset = data_offset;
+ data->as_dynamic.len = len;
(*n_u64)++;
@@ -550,7 +528,8 @@ static notrace void trace_event_raw_event_synth(void *__data,
str_val = (char *)(long)var_ref_vals[val_idx];
if (event->dynamic_fields[i]->is_stack) {
- len = *((unsigned long *)str_val);
+ /* reserve one extra element for size */
+ len = *((unsigned long *)str_val) + 1;
len *= sizeof(unsigned long);
} else {
len = fetch_store_strlen((unsigned long)str_val);
@@ -592,19 +571,19 @@ static notrace void trace_event_raw_event_synth(void *__data,
switch (field->size) {
case 1:
- *(u8 *)&entry->fields[n_u64] = (u8)val;
+ entry->fields[n_u64].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&entry->fields[n_u64] = (u16)val;
+ entry->fields[n_u64].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&entry->fields[n_u64] = (u32)val;
+ entry->fields[n_u64].as_u32 = (u32)val;
break;
default:
- entry->fields[n_u64] = val;
+ entry->fields[n_u64].as_u64 = val;
break;
}
n_u64++;
@@ -1230,6 +1209,7 @@ EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start);
* synth_event_gen_cmd_array_start - Start synthetic event command from an array
* @cmd: A pointer to the dynevent_cmd struct representing the new event
* @name: The name of the synthetic event
+ * @mod: The module creating the event, NULL if not created from a module
* @fields: An array of type/name field descriptions
* @n_fields: The number of field descriptions contained in the fields array
*
@@ -1790,19 +1770,19 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
switch (field->size) {
case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
+ state.entry->fields[n_u64].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
+ state.entry->fields[n_u64].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
+ state.entry->fields[n_u64].as_u32 = (u32)val;
break;
default:
- state.entry->fields[n_u64] = val;
+ state.entry->fields[n_u64].as_u64 = val;
break;
}
n_u64++;
@@ -1883,19 +1863,19 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
switch (field->size) {
case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
+ state.entry->fields[n_u64].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
+ state.entry->fields[n_u64].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
+ state.entry->fields[n_u64].as_u32 = (u32)val;
break;
default:
- state.entry->fields[n_u64] = val;
+ state.entry->fields[n_u64].as_u64 = val;
break;
}
n_u64++;
@@ -2030,19 +2010,19 @@ static int __synth_event_add_val(const char *field_name, u64 val,
} else {
switch (field->size) {
case 1:
- *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val;
+ trace_state->entry->fields[field->offset].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val;
+ trace_state->entry->fields[field->offset].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val;
+ trace_state->entry->fields[field->offset].as_u32 = (u32)val;
break;
default:
- trace_state->entry->fields[field->offset] = val;
+ trace_state->entry->fields[field->offset].as_u64 = val;
break;
}
}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index e535959939d3..46439e3bcec4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -31,7 +31,9 @@ void trigger_data_free(struct event_trigger_data *data)
/**
* event_triggers_call - Call triggers associated with a trace event
* @file: The trace_event_file associated with the event
+ * @buffer: The ring buffer that the event is being written to
* @rec: The trace entry for the event, NULL for unconditional invocation
+ * @event: The event meta data in the ring buffer
*
* For each trigger associated with an event, invoke the trigger
* function registered with the associated trigger command. If rec is
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 8df0550415e7..6f046650e527 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -498,7 +498,7 @@ static int user_event_enabler_write(struct user_event_mm *mm,
return -EBUSY;
ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
- &page, NULL, NULL);
+ &page, NULL);
if (unlikely(ret <= 0)) {
if (!fixup_fault)
@@ -1317,6 +1317,9 @@ static int user_field_set_string(struct ftrace_event_field *field,
pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name);
+ if (str_has_prefix(field->type, "struct "))
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " %d", field->size);
+
if (colon)
pos += snprintf(buf + pos, LEN_OR_ZERO, ";");
@@ -1325,14 +1328,14 @@ static int user_field_set_string(struct ftrace_event_field *field,
static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
{
- struct ftrace_event_field *field, *next;
+ struct ftrace_event_field *field;
struct list_head *head = &user->fields;
int pos = 0, depth = 0;
const char *str_func;
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
- list_for_each_entry_safe_reverse(field, next, head, link) {
+ list_for_each_entry_reverse(field, head, link) {
if (depth != 0)
pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
@@ -1344,7 +1347,7 @@ static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
- list_for_each_entry_safe_reverse(field, next, head, link) {
+ list_for_each_entry_reverse(field, head, link) {
if (user_field_is_dyn_string(field->type, &str_func))
pos += snprintf(buf + pos, LEN_OR_ZERO,
", %s(%s)", str_func, field->name);
@@ -1729,7 +1732,7 @@ static int user_event_create(const char *raw_command)
static int user_event_show(struct seq_file *m, struct dyn_event *ev)
{
struct user_event *user = container_of(ev, struct user_event, devent);
- struct ftrace_event_field *field, *next;
+ struct ftrace_event_field *field;
struct list_head *head;
int depth = 0;
@@ -1737,7 +1740,7 @@ static int user_event_show(struct seq_file *m, struct dyn_event *ev)
head = trace_get_fields(&user->call);
- list_for_each_entry_safe_reverse(field, next, head, link) {
+ list_for_each_entry_reverse(field, head, link) {
if (depth == 0)
seq_puts(m, " ");
else
@@ -1813,13 +1816,14 @@ out:
static bool user_fields_match(struct user_event *user, int argc,
const char **argv)
{
- struct ftrace_event_field *field, *next;
+ struct ftrace_event_field *field;
struct list_head *head = &user->fields;
int i = 0;
- list_for_each_entry_safe_reverse(field, next, head, link)
+ list_for_each_entry_reverse(field, head, link) {
if (!user_field_match(field, argc, argv, &i))
return false;
+ }
if (i != argc)
return false;
@@ -2096,7 +2100,8 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
if (unlikely(faulted))
return -EFAULT;
- }
+ } else
+ return -EBADF;
return ret;
}
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 58f3946081e2..1698fc22afa0 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -51,6 +51,9 @@ static int ftrace_event_register(struct trace_event_call *call,
#undef __array
#define __array(type, item, size) type item[size];
+#undef __stack_array
+#define __stack_array(type, item, size, field) __array(type, item, size)
+
#undef __array_desc
#define __array_desc(type, container, item, size) type item[size];
@@ -114,6 +117,9 @@ static void __always_unused ____ftrace_check_##name(void) \
is_signed_type(_type), .filter_type = FILTER_OTHER, \
.len = _len },
+#undef __stack_array
+#define __stack_array(_type, _item, _len, _field) __array(_type, _item, _len)
+
#undef __array_desc
#define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len)
@@ -149,6 +155,9 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \
#undef __array
#define __array(type, item, len)
+#undef __stack_array
+#define __stack_array(type, item, len, field)
+
#undef __array_desc
#define __array_desc(type, container, item, len)
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
new file mode 100644
index 000000000000..8bfe23af9c73
--- /dev/null
+++ b/kernel/trace/trace_fprobe.c
@@ -0,0 +1,1230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Fprobe-based tracing events
+ * Copyright (C) 2022 Google LLC.
+ */
+#define pr_fmt(fmt) "trace_fprobe: " fmt
+
+#include <linux/fprobe.h>
+#include <linux/module.h>
+#include <linux/rculist.h>
+#include <linux/security.h>
+#include <linux/tracepoint.h>
+#include <linux/uaccess.h>
+
+#include "trace_dynevent.h"
+#include "trace_probe.h"
+#include "trace_probe_kernel.h"
+#include "trace_probe_tmpl.h"
+
+#define FPROBE_EVENT_SYSTEM "fprobes"
+#define TRACEPOINT_EVENT_SYSTEM "tracepoints"
+#define RETHOOK_MAXACTIVE_MAX 4096
+
+static int trace_fprobe_create(const char *raw_command);
+static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev);
+static int trace_fprobe_release(struct dyn_event *ev);
+static bool trace_fprobe_is_busy(struct dyn_event *ev);
+static bool trace_fprobe_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev);
+
+static struct dyn_event_operations trace_fprobe_ops = {
+ .create = trace_fprobe_create,
+ .show = trace_fprobe_show,
+ .is_busy = trace_fprobe_is_busy,
+ .free = trace_fprobe_release,
+ .match = trace_fprobe_match,
+};
+
+/*
+ * Fprobe event core functions
+ */
+struct trace_fprobe {
+ struct dyn_event devent;
+ struct fprobe fp;
+ const char *symbol;
+ struct tracepoint *tpoint;
+ struct module *mod;
+ struct trace_probe tp;
+};
+
+static bool is_trace_fprobe(struct dyn_event *ev)
+{
+ return ev->ops == &trace_fprobe_ops;
+}
+
+static struct trace_fprobe *to_trace_fprobe(struct dyn_event *ev)
+{
+ return container_of(ev, struct trace_fprobe, devent);
+}
+
+/**
+ * for_each_trace_fprobe - iterate over the trace_fprobe list
+ * @pos: the struct trace_fprobe * for each entry
+ * @dpos: the struct dyn_event * to use as a loop cursor
+ */
+#define for_each_trace_fprobe(pos, dpos) \
+ for_each_dyn_event(dpos) \
+ if (is_trace_fprobe(dpos) && (pos = to_trace_fprobe(dpos)))
+
+static bool trace_fprobe_is_return(struct trace_fprobe *tf)
+{
+ return tf->fp.exit_handler != NULL;
+}
+
+static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf)
+{
+ return tf->tpoint != NULL;
+}
+
+static const char *trace_fprobe_symbol(struct trace_fprobe *tf)
+{
+ return tf->symbol ? tf->symbol : "unknown";
+}
+
+static bool trace_fprobe_is_busy(struct dyn_event *ev)
+{
+ struct trace_fprobe *tf = to_trace_fprobe(ev);
+
+ return trace_probe_is_enabled(&tf->tp);
+}
+
+static bool trace_fprobe_match_command_head(struct trace_fprobe *tf,
+ int argc, const char **argv)
+{
+ char buf[MAX_ARGSTR_LEN + 1];
+
+ if (!argc)
+ return true;
+
+ snprintf(buf, sizeof(buf), "%s", trace_fprobe_symbol(tf));
+ if (strcmp(buf, argv[0]))
+ return false;
+ argc--; argv++;
+
+ return trace_probe_match_command_args(&tf->tp, argc, argv);
+}
+
+static bool trace_fprobe_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct trace_fprobe *tf = to_trace_fprobe(ev);
+
+ if (event[0] != '\0' && strcmp(trace_probe_name(&tf->tp), event))
+ return false;
+
+ if (system && strcmp(trace_probe_group_name(&tf->tp), system))
+ return false;
+
+ return trace_fprobe_match_command_head(tf, argc, argv);
+}
+
+static bool trace_fprobe_is_registered(struct trace_fprobe *tf)
+{
+ return fprobe_is_registered(&tf->fp);
+}
+
+/*
+ * Note that we don't verify the fetch_insn code, since it does not come
+ * from user space.
+ */
+static int
+process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
+ void *base)
+{
+ struct pt_regs *regs = rec;
+ unsigned long val;
+ int ret;
+
+retry:
+ /* 1st stage: get value from context */
+ switch (code->op) {
+ case FETCH_OP_STACK:
+ val = regs_get_kernel_stack_nth(regs, code->param);
+ break;
+ case FETCH_OP_STACKP:
+ val = kernel_stack_pointer(regs);
+ break;
+ case FETCH_OP_RETVAL:
+ val = regs_return_value(regs);
+ break;
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+ case FETCH_OP_ARG:
+ val = regs_get_kernel_argument(regs, code->param);
+ break;
+#endif
+ case FETCH_NOP_SYMBOL: /* Ignore a place holder */
+ code++;
+ goto retry;
+ default:
+ ret = process_common_fetch_insn(code, &val);
+ if (ret < 0)
+ return ret;
+ }
+ code++;
+
+ return process_fetch_insn_bottom(code, val, dest, base);
+}
+NOKPROBE_SYMBOL(process_fetch_insn)
+
+/* function entry handler */
+static nokprobe_inline void
+__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
+ struct pt_regs *regs,
+ struct trace_event_file *trace_file)
+{
+ struct fentry_trace_entry_head *entry;
+ struct trace_event_call *call = trace_probe_event_call(&tf->tp);
+ struct trace_event_buffer fbuffer;
+ int dsize;
+
+ if (WARN_ON_ONCE(call != trace_file->event_call))
+ return;
+
+ if (trace_trigger_soft_disabled(trace_file))
+ return;
+
+ dsize = __get_data_size(&tf->tp, regs);
+
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry) + tf->tp.size + dsize);
+ if (!entry)
+ return;
+
+ fbuffer.regs = regs;
+ entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
+ entry->ip = entry_ip;
+ store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+
+ trace_event_buffer_commit(&fbuffer);
+}
+
+static void
+fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
+ struct pt_regs *regs)
+{
+ struct event_file_link *link;
+
+ trace_probe_for_each_link_rcu(link, &tf->tp)
+ __fentry_trace_func(tf, entry_ip, regs, link->file);
+}
+NOKPROBE_SYMBOL(fentry_trace_func);
+
+/* Kretprobe handler */
+static nokprobe_inline void
+__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
+ unsigned long ret_ip, struct pt_regs *regs,
+ struct trace_event_file *trace_file)
+{
+ struct fexit_trace_entry_head *entry;
+ struct trace_event_buffer fbuffer;
+ struct trace_event_call *call = trace_probe_event_call(&tf->tp);
+ int dsize;
+
+ if (WARN_ON_ONCE(call != trace_file->event_call))
+ return;
+
+ if (trace_trigger_soft_disabled(trace_file))
+ return;
+
+ dsize = __get_data_size(&tf->tp, regs);
+
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry) + tf->tp.size + dsize);
+ if (!entry)
+ return;
+
+ fbuffer.regs = regs;
+ entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
+ entry->func = entry_ip;
+ entry->ret_ip = ret_ip;
+ store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+
+ trace_event_buffer_commit(&fbuffer);
+}
+
+static void
+fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
+ unsigned long ret_ip, struct pt_regs *regs)
+{
+ struct event_file_link *link;
+
+ trace_probe_for_each_link_rcu(link, &tf->tp)
+ __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file);
+}
+NOKPROBE_SYMBOL(fexit_trace_func);
+
+#ifdef CONFIG_PERF_EVENTS
+
+static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
+ struct pt_regs *regs)
+{
+ struct trace_event_call *call = trace_probe_event_call(&tf->tp);
+ struct fentry_trace_entry_head *entry;
+ struct hlist_head *head;
+ int size, __size, dsize;
+ int rctx;
+
+ head = this_cpu_ptr(call->perf_events);
+ if (hlist_empty(head))
+ return 0;
+
+ dsize = __get_data_size(&tf->tp, regs);
+ __size = sizeof(*entry) + tf->tp.size + dsize;
+ size = ALIGN(__size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+
+ entry = perf_trace_buf_alloc(size, NULL, &rctx);
+ if (!entry)
+ return 0;
+
+ entry->ip = entry_ip;
+ memset(&entry[1], 0, dsize);
+ store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+ head, NULL);
+ return 0;
+}
+NOKPROBE_SYMBOL(fentry_perf_func);
+
+static void
+fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
+ unsigned long ret_ip, struct pt_regs *regs)
+{
+ struct trace_event_call *call = trace_probe_event_call(&tf->tp);
+ struct fexit_trace_entry_head *entry;
+ struct hlist_head *head;
+ int size, __size, dsize;
+ int rctx;
+
+ head = this_cpu_ptr(call->perf_events);
+ if (hlist_empty(head))
+ return;
+
+ dsize = __get_data_size(&tf->tp, regs);
+ __size = sizeof(*entry) + tf->tp.size + dsize;
+ size = ALIGN(__size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+
+ entry = perf_trace_buf_alloc(size, NULL, &rctx);
+ if (!entry)
+ return;
+
+ entry->func = entry_ip;
+ entry->ret_ip = ret_ip;
+ store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+ head, NULL);
+}
+NOKPROBE_SYMBOL(fexit_perf_func);
+#endif /* CONFIG_PERF_EVENTS */
+
+static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip,
+ unsigned long ret_ip, struct pt_regs *regs,
+ void *entry_data)
+{
+ struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
+ int ret = 0;
+
+ if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
+ fentry_trace_func(tf, entry_ip, regs);
+#ifdef CONFIG_PERF_EVENTS
+ if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
+ ret = fentry_perf_func(tf, entry_ip, regs);
+#endif
+ return ret;
+}
+NOKPROBE_SYMBOL(fentry_dispatcher);
+
+static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip,
+ unsigned long ret_ip, struct pt_regs *regs,
+ void *entry_data)
+{
+ struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
+
+ if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
+ fexit_trace_func(tf, entry_ip, ret_ip, regs);
+#ifdef CONFIG_PERF_EVENTS
+ if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
+ fexit_perf_func(tf, entry_ip, ret_ip, regs);
+#endif
+}
+NOKPROBE_SYMBOL(fexit_dispatcher);
+
+static void free_trace_fprobe(struct trace_fprobe *tf)
+{
+ if (tf) {
+ trace_probe_cleanup(&tf->tp);
+ kfree(tf->symbol);
+ kfree(tf);
+ }
+}
+
+/*
+ * Allocate new trace_probe and initialize it (including fprobe).
+ */
+static struct trace_fprobe *alloc_trace_fprobe(const char *group,
+ const char *event,
+ const char *symbol,
+ struct tracepoint *tpoint,
+ int maxactive,
+ int nargs, bool is_return)
+{
+ struct trace_fprobe *tf;
+ int ret = -ENOMEM;
+
+ tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL);
+ if (!tf)
+ return ERR_PTR(ret);
+
+ tf->symbol = kstrdup(symbol, GFP_KERNEL);
+ if (!tf->symbol)
+ goto error;
+
+ if (is_return)
+ tf->fp.exit_handler = fexit_dispatcher;
+ else
+ tf->fp.entry_handler = fentry_dispatcher;
+
+ tf->tpoint = tpoint;
+ tf->fp.nr_maxactive = maxactive;
+
+ ret = trace_probe_init(&tf->tp, event, group, false);
+ if (ret < 0)
+ goto error;
+
+ dyn_event_init(&tf->devent, &trace_fprobe_ops);
+ return tf;
+error:
+ free_trace_fprobe(tf);
+ return ERR_PTR(ret);
+}
+
+static struct trace_fprobe *find_trace_fprobe(const char *event,
+ const char *group)
+{
+ struct dyn_event *pos;
+ struct trace_fprobe *tf;
+
+ for_each_trace_fprobe(tf, pos)
+ if (strcmp(trace_probe_name(&tf->tp), event) == 0 &&
+ strcmp(trace_probe_group_name(&tf->tp), group) == 0)
+ return tf;
+ return NULL;
+}
+
+static inline int __enable_trace_fprobe(struct trace_fprobe *tf)
+{
+ if (trace_fprobe_is_registered(tf))
+ enable_fprobe(&tf->fp);
+
+ return 0;
+}
+
+static void __disable_trace_fprobe(struct trace_probe *tp)
+{
+ struct trace_fprobe *tf;
+
+ list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+ if (!trace_fprobe_is_registered(tf))
+ continue;
+ disable_fprobe(&tf->fp);
+ }
+}
+
+/*
+ * Enable trace_probe
+ * if the file is NULL, enable "perf" handler, or enable "trace" handler.
+ */
+static int enable_trace_fprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *tp;
+ struct trace_fprobe *tf;
+ bool enabled;
+ int ret = 0;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+ enabled = trace_probe_is_enabled(tp);
+
+ /* This also changes "enabled" state */
+ if (file) {
+ ret = trace_probe_add_file(tp, file);
+ if (ret)
+ return ret;
+ } else
+ trace_probe_set_flag(tp, TP_FLAG_PROFILE);
+
+ if (!enabled) {
+ list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+ /* TODO: check the fprobe is gone */
+ __enable_trace_fprobe(tf);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Disable trace_probe
+ * if the file is NULL, disable "perf" handler, or disable "trace" handler.
+ */
+static int disable_trace_fprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ if (file) {
+ if (!trace_probe_get_file_link(tp, file))
+ return -ENOENT;
+ if (!trace_probe_has_single_file(tp))
+ goto out;
+ trace_probe_clear_flag(tp, TP_FLAG_TRACE);
+ } else
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+
+ if (!trace_probe_is_enabled(tp))
+ __disable_trace_fprobe(tp);
+
+ out:
+ if (file)
+ /*
+ * Synchronization is done in below function. For perf event,
+ * file == NULL and perf_trace_event_unreg() calls
+ * tracepoint_synchronize_unregister() to ensure synchronize
+ * event. We don't need to care about it.
+ */
+ trace_probe_remove_file(tp, file);
+
+ return 0;
+}
+
+/* Event entry printers */
+static enum print_line_t
+print_fentry_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct fentry_trace_entry_head *field;
+ struct trace_seq *s = &iter->seq;
+ struct trace_probe *tp;
+
+ field = (struct fentry_trace_entry_head *)iter->ent;
+ tp = trace_probe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (WARN_ON_ONCE(!tp))
+ goto out;
+
+ trace_seq_printf(s, "%s: (", trace_probe_name(tp));
+
+ if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+ goto out;
+
+ trace_seq_putc(s, ')');
+
+ if (trace_probe_print_args(s, tp->args, tp->nr_args,
+ (u8 *)&field[1], field) < 0)
+ goto out;
+
+ trace_seq_putc(s, '\n');
+ out:
+ return trace_handle_return(s);
+}
+
+static enum print_line_t
+print_fexit_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct fexit_trace_entry_head *field;
+ struct trace_seq *s = &iter->seq;
+ struct trace_probe *tp;
+
+ field = (struct fexit_trace_entry_head *)iter->ent;
+ tp = trace_probe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (WARN_ON_ONCE(!tp))
+ goto out;
+
+ trace_seq_printf(s, "%s: (", trace_probe_name(tp));
+
+ if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+ goto out;
+
+ trace_seq_puts(s, " <- ");
+
+ if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
+ goto out;
+
+ trace_seq_putc(s, ')');
+
+ if (trace_probe_print_args(s, tp->args, tp->nr_args,
+ (u8 *)&field[1], field) < 0)
+ goto out;
+
+ trace_seq_putc(s, '\n');
+
+ out:
+ return trace_handle_return(s);
+}
+
+static int fentry_event_define_fields(struct trace_event_call *event_call)
+{
+ int ret;
+ struct fentry_trace_entry_head field;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(event_call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENOENT;
+
+ DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+
+ return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
+}
+
+static int fexit_event_define_fields(struct trace_event_call *event_call)
+{
+ int ret;
+ struct fexit_trace_entry_head field;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(event_call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENOENT;
+
+ DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
+ DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
+
+ return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
+}
+
+static struct trace_event_functions fentry_funcs = {
+ .trace = print_fentry_event
+};
+
+static struct trace_event_functions fexit_funcs = {
+ .trace = print_fexit_event
+};
+
+static struct trace_event_fields fentry_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = fentry_event_define_fields },
+ {}
+};
+
+static struct trace_event_fields fexit_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = fexit_event_define_fields },
+ {}
+};
+
+static int fprobe_register(struct trace_event_call *event,
+ enum trace_reg type, void *data);
+
+static inline void init_trace_event_call(struct trace_fprobe *tf)
+{
+ struct trace_event_call *call = trace_probe_event_call(&tf->tp);
+
+ if (trace_fprobe_is_return(tf)) {
+ call->event.funcs = &fexit_funcs;
+ call->class->fields_array = fexit_fields_array;
+ } else {
+ call->event.funcs = &fentry_funcs;
+ call->class->fields_array = fentry_fields_array;
+ }
+
+ call->flags = TRACE_EVENT_FL_FPROBE;
+ call->class->reg = fprobe_register;
+}
+
+static int register_fprobe_event(struct trace_fprobe *tf)
+{
+ init_trace_event_call(tf);
+
+ return trace_probe_register_event_call(&tf->tp);
+}
+
+static int unregister_fprobe_event(struct trace_fprobe *tf)
+{
+ return trace_probe_unregister_event_call(&tf->tp);
+}
+
+/* Internal register function - just handle fprobe and flags */
+static int __register_trace_fprobe(struct trace_fprobe *tf)
+{
+ int i, ret;
+
+ /* Should we need new LOCKDOWN flag for fprobe? */
+ ret = security_locked_down(LOCKDOWN_KPROBES);
+ if (ret)
+ return ret;
+
+ if (trace_fprobe_is_registered(tf))
+ return -EINVAL;
+
+ for (i = 0; i < tf->tp.nr_args; i++) {
+ ret = traceprobe_update_arg(&tf->tp.args[i]);
+ if (ret)
+ return ret;
+ }
+
+ /* Set/clear disabled flag according to tp->flag */
+ if (trace_probe_is_enabled(&tf->tp))
+ tf->fp.flags &= ~FPROBE_FL_DISABLED;
+ else
+ tf->fp.flags |= FPROBE_FL_DISABLED;
+
+ if (trace_fprobe_is_tracepoint(tf)) {
+ struct tracepoint *tpoint = tf->tpoint;
+ unsigned long ip = (unsigned long)tpoint->probestub;
+ /*
+ * Here, we do 2 steps to enable fprobe on a tracepoint.
+ * At first, put __probestub_##TP function on the tracepoint
+ * and put a fprobe on the stub function.
+ */
+ ret = tracepoint_probe_register_prio_may_exist(tpoint,
+ tpoint->probestub, NULL, 0);
+ if (ret < 0)
+ return ret;
+ return register_fprobe_ips(&tf->fp, &ip, 1);
+ }
+
+ /* TODO: handle filter, nofilter or symbol list */
+ return register_fprobe(&tf->fp, tf->symbol, NULL);
+}
+
+/* Internal unregister function - just handle fprobe and flags */
+static void __unregister_trace_fprobe(struct trace_fprobe *tf)
+{
+ if (trace_fprobe_is_registered(tf)) {
+ unregister_fprobe(&tf->fp);
+ memset(&tf->fp, 0, sizeof(tf->fp));
+ if (trace_fprobe_is_tracepoint(tf)) {
+ tracepoint_probe_unregister(tf->tpoint,
+ tf->tpoint->probestub, NULL);
+ tf->tpoint = NULL;
+ tf->mod = NULL;
+ }
+ }
+}
+
+/* TODO: make this trace_*probe common function */
+/* Unregister a trace_probe and probe_event */
+static int unregister_trace_fprobe(struct trace_fprobe *tf)
+{
+ /* If other probes are on the event, just unregister fprobe */
+ if (trace_probe_has_sibling(&tf->tp))
+ goto unreg;
+
+ /* Enabled event can not be unregistered */
+ if (trace_probe_is_enabled(&tf->tp))
+ return -EBUSY;
+
+ /* If there's a reference to the dynamic event */
+ if (trace_event_dyn_busy(trace_probe_event_call(&tf->tp)))
+ return -EBUSY;
+
+ /* Will fail if probe is being used by ftrace or perf */
+ if (unregister_fprobe_event(tf))
+ return -EBUSY;
+
+unreg:
+ __unregister_trace_fprobe(tf);
+ dyn_event_remove(&tf->devent);
+ trace_probe_unlink(&tf->tp);
+
+ return 0;
+}
+
+static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig,
+ struct trace_fprobe *comp)
+{
+ struct trace_probe_event *tpe = orig->tp.event;
+ int i;
+
+ list_for_each_entry(orig, &tpe->probes, tp.list) {
+ if (strcmp(trace_fprobe_symbol(orig),
+ trace_fprobe_symbol(comp)))
+ continue;
+
+ /*
+ * trace_probe_compare_arg_type() ensured that nr_args and
+ * each argument name and type are same. Let's compare comm.
+ */
+ for (i = 0; i < orig->tp.nr_args; i++) {
+ if (strcmp(orig->tp.args[i].comm,
+ comp->tp.args[i].comm))
+ break;
+ }
+
+ if (i == orig->tp.nr_args)
+ return true;
+ }
+
+ return false;
+}
+
+static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
+{
+ int ret;
+
+ if (trace_fprobe_is_return(tf) != trace_fprobe_is_return(to) ||
+ trace_fprobe_is_tracepoint(tf) != trace_fprobe_is_tracepoint(to)) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, DIFF_PROBE_TYPE);
+ return -EEXIST;
+ }
+ ret = trace_probe_compare_arg_type(&tf->tp, &to->tp);
+ if (ret) {
+ /* Note that argument starts index = 2 */
+ trace_probe_log_set_index(ret + 1);
+ trace_probe_log_err(0, DIFF_ARG_TYPE);
+ return -EEXIST;
+ }
+ if (trace_fprobe_has_same_fprobe(to, tf)) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, SAME_PROBE);
+ return -EEXIST;
+ }
+
+ /* Append to existing event */
+ ret = trace_probe_append(&tf->tp, &to->tp);
+ if (ret)
+ return ret;
+
+ ret = __register_trace_fprobe(tf);
+ if (ret)
+ trace_probe_unlink(&tf->tp);
+ else
+ dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp));
+
+ return ret;
+}
+
+/* Register a trace_probe and probe_event */
+static int register_trace_fprobe(struct trace_fprobe *tf)
+{
+ struct trace_fprobe *old_tf;
+ int ret;
+
+ mutex_lock(&event_mutex);
+
+ old_tf = find_trace_fprobe(trace_probe_name(&tf->tp),
+ trace_probe_group_name(&tf->tp));
+ if (old_tf) {
+ ret = append_trace_fprobe(tf, old_tf);
+ goto end;
+ }
+
+ /* Register new event */
+ ret = register_fprobe_event(tf);
+ if (ret) {
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ } else
+ pr_warn("Failed to register probe event(%d)\n", ret);
+ goto end;
+ }
+
+ /* Register fprobe */
+ ret = __register_trace_fprobe(tf);
+ if (ret < 0)
+ unregister_fprobe_event(tf);
+ else
+ dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp));
+
+end:
+ mutex_unlock(&event_mutex);
+ return ret;
+}
+
+#ifdef CONFIG_MODULES
+static int __tracepoint_probe_module_cb(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct tp_module *tp_mod = data;
+ struct trace_fprobe *tf;
+ struct dyn_event *pos;
+
+ if (val != MODULE_STATE_GOING)
+ return NOTIFY_DONE;
+
+ mutex_lock(&event_mutex);
+ for_each_trace_fprobe(tf, pos) {
+ if (tp_mod->mod == tf->mod) {
+ tracepoint_probe_unregister(tf->tpoint,
+ tf->tpoint->probestub, NULL);
+ tf->tpoint = NULL;
+ tf->mod = NULL;
+ }
+ }
+ mutex_unlock(&event_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block tracepoint_module_nb = {
+ .notifier_call = __tracepoint_probe_module_cb,
+};
+#endif /* CONFIG_MODULES */
+
+struct __find_tracepoint_cb_data {
+ const char *tp_name;
+ struct tracepoint *tpoint;
+};
+
+static void __find_tracepoint_cb(struct tracepoint *tp, void *priv)
+{
+ struct __find_tracepoint_cb_data *data = priv;
+
+ if (!data->tpoint && !strcmp(data->tp_name, tp->name))
+ data->tpoint = tp;
+}
+
+static struct tracepoint *find_tracepoint(const char *tp_name)
+{
+ struct __find_tracepoint_cb_data data = {
+ .tp_name = tp_name,
+ };
+
+ for_each_kernel_tracepoint(__find_tracepoint_cb, &data);
+
+ return data.tpoint;
+}
+
+static int parse_symbol_and_return(int argc, const char *argv[],
+ char **symbol, bool *is_return,
+ bool is_tracepoint)
+{
+ char *tmp = strchr(argv[1], '%');
+ int i;
+
+ if (tmp) {
+ int len = tmp - argv[1];
+
+ if (!is_tracepoint && !strcmp(tmp, "%return")) {
+ *is_return = true;
+ } else {
+ trace_probe_log_err(len, BAD_ADDR_SUFFIX);
+ return -EINVAL;
+ }
+ *symbol = kmemdup_nul(argv[1], len, GFP_KERNEL);
+ } else
+ *symbol = kstrdup(argv[1], GFP_KERNEL);
+ if (!*symbol)
+ return -ENOMEM;
+
+ if (*is_return)
+ return 0;
+
+ /* If there is $retval, this should be a return fprobe. */
+ for (i = 2; i < argc; i++) {
+ tmp = strstr(argv[i], "$retval");
+ if (tmp && !isalnum(tmp[7]) && tmp[7] != '_') {
+ *is_return = true;
+ /*
+ * NOTE: Don't check is_tracepoint here, because it will
+ * be checked when the argument is parsed.
+ */
+ break;
+ }
+ }
+ return 0;
+}
+
+static int __trace_fprobe_create(int argc, const char *argv[])
+{
+ /*
+ * Argument syntax:
+ * - Add fentry probe:
+ * f[:[GRP/][EVENT]] [MOD:]KSYM [FETCHARGS]
+ * - Add fexit probe:
+ * f[N][:[GRP/][EVENT]] [MOD:]KSYM%return [FETCHARGS]
+ * - Add tracepoint probe:
+ * t[:[GRP/][EVENT]] TRACEPOINT [FETCHARGS]
+ *
+ * Fetch args:
+ * $retval : fetch return value
+ * $stack : fetch stack address
+ * $stackN : fetch Nth entry of stack (N:0-)
+ * $argN : fetch Nth argument (N:1-)
+ * $comm : fetch current task comm
+ * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
+ * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
+ * Dereferencing memory fetch:
+ * +|-offs(ARG) : fetch memory at ARG +|- offs address.
+ * Alias name of args:
+ * NAME=FETCHARG : set NAME as alias of FETCHARG.
+ * Type of args:
+ * FETCHARG:TYPE : use TYPE instead of unsigned long.
+ */
+ struct trace_fprobe *tf = NULL;
+ int i, len, new_argc = 0, ret = 0;
+ bool is_return = false;
+ char *symbol = NULL;
+ const char *event = NULL, *group = FPROBE_EVENT_SYSTEM;
+ const char **new_argv = NULL;
+ int maxactive = 0;
+ char buf[MAX_EVENT_NAME_LEN];
+ char gbuf[MAX_EVENT_NAME_LEN];
+ char sbuf[KSYM_NAME_LEN];
+ char abuf[MAX_BTF_ARGS_LEN];
+ bool is_tracepoint = false;
+ struct tracepoint *tpoint = NULL;
+ struct traceprobe_parse_context ctx = {
+ .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE,
+ };
+
+ if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2)
+ return -ECANCELED;
+
+ if (argv[0][0] == 't') {
+ is_tracepoint = true;
+ group = TRACEPOINT_EVENT_SYSTEM;
+ }
+
+ trace_probe_log_init("trace_fprobe", argc, argv);
+
+ event = strchr(&argv[0][1], ':');
+ if (event)
+ event++;
+
+ if (isdigit(argv[0][1])) {
+ if (event)
+ len = event - &argv[0][1] - 1;
+ else
+ len = strlen(&argv[0][1]);
+ if (len > MAX_EVENT_NAME_LEN - 1) {
+ trace_probe_log_err(1, BAD_MAXACT);
+ goto parse_error;
+ }
+ memcpy(buf, &argv[0][1], len);
+ buf[len] = '\0';
+ ret = kstrtouint(buf, 0, &maxactive);
+ if (ret || !maxactive) {
+ trace_probe_log_err(1, BAD_MAXACT);
+ goto parse_error;
+ }
+ /* fprobe rethook instances are iterated over via a list. The
+ * maximum should stay reasonable.
+ */
+ if (maxactive > RETHOOK_MAXACTIVE_MAX) {
+ trace_probe_log_err(1, MAXACT_TOO_BIG);
+ goto parse_error;
+ }
+ }
+
+ trace_probe_log_set_index(1);
+
+ /* a symbol(or tracepoint) must be specified */
+ ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint);
+ if (ret < 0)
+ goto parse_error;
+
+ if (!is_return && maxactive) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(1, BAD_MAXACT_TYPE);
+ goto parse_error;
+ }
+
+ trace_probe_log_set_index(0);
+ if (event) {
+ ret = traceprobe_parse_event_name(&event, &group, gbuf,
+ event - argv[0]);
+ if (ret)
+ goto parse_error;
+ }
+
+ if (!event) {
+ /* Make a new event name */
+ if (is_tracepoint)
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s",
+ isdigit(*symbol) ? "_" : "", symbol);
+ else
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol,
+ is_return ? "exit" : "entry");
+ sanitize_event_name(buf);
+ event = buf;
+ }
+
+ if (is_return)
+ ctx.flags |= TPARG_FL_RETURN;
+ else
+ ctx.flags |= TPARG_FL_FENTRY;
+
+ if (is_tracepoint) {
+ ctx.flags |= TPARG_FL_TPOINT;
+ tpoint = find_tracepoint(symbol);
+ if (!tpoint) {
+ trace_probe_log_set_index(1);
+ trace_probe_log_err(0, NO_TRACEPOINT);
+ goto parse_error;
+ }
+ ctx.funcname = kallsyms_lookup(
+ (unsigned long)tpoint->probestub,
+ NULL, NULL, NULL, sbuf);
+ } else
+ ctx.funcname = symbol;
+
+ argc -= 2; argv += 2;
+ new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
+ abuf, MAX_BTF_ARGS_LEN, &ctx);
+ if (IS_ERR(new_argv)) {
+ ret = PTR_ERR(new_argv);
+ new_argv = NULL;
+ goto out;
+ }
+ if (new_argv) {
+ argc = new_argc;
+ argv = new_argv;
+ }
+
+ /* setup a probe */
+ tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive,
+ argc, is_return);
+ if (IS_ERR(tf)) {
+ ret = PTR_ERR(tf);
+ /* This must return -ENOMEM, else there is a bug */
+ WARN_ON_ONCE(ret != -ENOMEM);
+ goto out; /* We know tf is not allocated */
+ }
+
+ if (is_tracepoint)
+ tf->mod = __module_text_address(
+ (unsigned long)tf->tpoint->probestub);
+
+ /* parse arguments */
+ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ trace_probe_log_set_index(i + 2);
+ ctx.offset = 0;
+ ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx);
+ if (ret)
+ goto error; /* This can be -ENOMEM */
+ }
+
+ ret = traceprobe_set_print_fmt(&tf->tp,
+ is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL);
+ if (ret < 0)
+ goto error;
+
+ ret = register_trace_fprobe(tf);
+ if (ret) {
+ trace_probe_log_set_index(1);
+ if (ret == -EILSEQ)
+ trace_probe_log_err(0, BAD_INSN_BNDRY);
+ else if (ret == -ENOENT)
+ trace_probe_log_err(0, BAD_PROBE_ADDR);
+ else if (ret != -ENOMEM && ret != -EEXIST)
+ trace_probe_log_err(0, FAIL_REG_PROBE);
+ goto error;
+ }
+
+out:
+ traceprobe_finish_parse(&ctx);
+ trace_probe_log_clear();
+ kfree(new_argv);
+ kfree(symbol);
+ return ret;
+
+parse_error:
+ ret = -EINVAL;
+error:
+ free_trace_fprobe(tf);
+ goto out;
+}
+
+static int trace_fprobe_create(const char *raw_command)
+{
+ return trace_probe_create(raw_command, __trace_fprobe_create);
+}
+
+static int trace_fprobe_release(struct dyn_event *ev)
+{
+ struct trace_fprobe *tf = to_trace_fprobe(ev);
+ int ret = unregister_trace_fprobe(tf);
+
+ if (!ret)
+ free_trace_fprobe(tf);
+ return ret;
+}
+
+static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct trace_fprobe *tf = to_trace_fprobe(ev);
+ int i;
+
+ if (trace_fprobe_is_tracepoint(tf))
+ seq_putc(m, 't');
+ else
+ seq_putc(m, 'f');
+ if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive)
+ seq_printf(m, "%d", tf->fp.nr_maxactive);
+ seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp),
+ trace_probe_name(&tf->tp));
+
+ seq_printf(m, " %s%s", trace_fprobe_symbol(tf),
+ trace_fprobe_is_return(tf) ? "%return" : "");
+
+ for (i = 0; i < tf->tp.nr_args; i++)
+ seq_printf(m, " %s=%s", tf->tp.args[i].name, tf->tp.args[i].comm);
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+/*
+ * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex.
+ */
+static int fprobe_register(struct trace_event_call *event,
+ enum trace_reg type, void *data)
+{
+ struct trace_event_file *file = data;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return enable_trace_fprobe(event, file);
+ case TRACE_REG_UNREGISTER:
+ return disable_trace_fprobe(event, file);
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return enable_trace_fprobe(event, NULL);
+ case TRACE_REG_PERF_UNREGISTER:
+ return disable_trace_fprobe(event, NULL);
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ return 0;
+#endif
+ }
+ return 0;
+}
+
+/*
+ * Register dynevent at core_initcall. This allows kernel to setup fprobe
+ * events in postcore_initcall without tracefs.
+ */
+static __init int init_fprobe_trace_early(void)
+{
+ int ret;
+
+ ret = dyn_event_register(&trace_fprobe_ops);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_MODULES
+ ret = register_tracepoint_module_notifier(&tracepoint_module_nb);
+ if (ret)
+ return ret;
+#endif
+
+ return 0;
+}
+core_initcall(init_fprobe_trace_early);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 203204cadf92..c35fbaab2a47 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -58,6 +58,12 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
/* Display function name after trailing } */
{ TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
+#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+ /* Display function return value ? */
+ { TRACER_OPT(funcgraph-retval, TRACE_GRAPH_PRINT_RETVAL) },
+ /* Display function return value in hexadecimal format ? */
+ { TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) },
+#endif
/* Include sleep time (scheduled out) between entry and return */
{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
@@ -619,6 +625,56 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
trace_seq_puts(s, "| ");
}
+#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+
+#define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL
+
+static void print_graph_retval(struct trace_seq *s, unsigned long retval,
+ bool leaf, void *func, bool hex_format)
+{
+ unsigned long err_code = 0;
+
+ if (retval == 0 || hex_format)
+ goto done;
+
+ /* Check if the return value matches the negative format */
+ if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) &&
+ (((u64)retval) >> 32) == 0) {
+ /* sign extension */
+ err_code = (unsigned long)(s32)retval;
+ } else {
+ err_code = retval;
+ }
+
+ if (!IS_ERR_VALUE(err_code))
+ err_code = 0;
+
+done:
+ if (leaf) {
+ if (hex_format || (err_code == 0))
+ trace_seq_printf(s, "%ps(); /* = 0x%lx */\n",
+ func, retval);
+ else
+ trace_seq_printf(s, "%ps(); /* = %ld */\n",
+ func, err_code);
+ } else {
+ if (hex_format || (err_code == 0))
+ trace_seq_printf(s, "} /* %ps = 0x%lx */\n",
+ func, retval);
+ else
+ trace_seq_printf(s, "} /* %ps = %ld */\n",
+ func, err_code);
+ }
+}
+
+#else
+
+#define __TRACE_GRAPH_PRINT_RETVAL 0
+
+#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0)
+
+#endif
+
/* Case of a leaf function on its call entry */
static enum print_line_t
print_graph_entry_leaf(struct trace_iterator *iter,
@@ -663,7 +719,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
trace_seq_putc(s, ' ');
- trace_seq_printf(s, "%ps();\n", (void *)call->func);
+ /*
+ * Write out the function return value if the option function-retval is
+ * enabled.
+ */
+ if (flags & __TRACE_GRAPH_PRINT_RETVAL)
+ print_graph_retval(s, graph_ret->retval, true, (void *)call->func,
+ !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
+ else
+ trace_seq_printf(s, "%ps();\n", (void *)call->func);
print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
cpu, iter->ent->pid, flags);
@@ -942,16 +1006,25 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
trace_seq_putc(s, ' ');
/*
- * If the return function does not have a matching entry,
- * then the entry was lost. Instead of just printing
- * the '}' and letting the user guess what function this
- * belongs to, write out the function name. Always do
- * that if the funcgraph-tail option is enabled.
+ * Always write out the function name and its return value if the
+ * function-retval option is enabled.
*/
- if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
- trace_seq_puts(s, "}\n");
- else
- trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
+ if (flags & __TRACE_GRAPH_PRINT_RETVAL) {
+ print_graph_retval(s, trace->retval, false, (void *)trace->func,
+ !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
+ } else {
+ /*
+ * If the return function does not have a matching entry,
+ * then the entry was lost. Instead of just printing
+ * the '}' and letting the user guess what function this
+ * belongs to, write out the function name. Always do
+ * that if the funcgraph-tail option is enabled.
+ */
+ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
+ trace_seq_puts(s, "}\n");
+ else
+ trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
+ }
/* Overrun */
if (flags & TRACE_GRAPH_PRINT_OVERRUN)
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 2f37a6e68aa9..b791524a6536 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -635,7 +635,7 @@ static int s_mode_show(struct seq_file *s, void *v)
else
seq_printf(s, "%s", thread_mode_str[mode]);
- if (mode != MODE_MAX)
+ if (mode < MODE_MAX - 1) /* if mode is any but last */
seq_puts(s, " ");
return 0;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 590b3d51afae..ba37f768e2f2 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -231,7 +231,8 @@ static void irqsoff_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
-
+ else
+ iter->private = NULL;
}
static void irqsoff_trace_close(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 59cda19a9033..3d7a180a8427 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -30,7 +30,7 @@ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata;
static int __init set_kprobe_boot_events(char *str)
{
- strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
+ strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
disable_tracing_selftest("running kprobe events");
return 1;
@@ -732,9 +732,10 @@ static int __trace_kprobe_create(int argc, const char *argv[])
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
struct trace_kprobe *tk = NULL;
- int i, len, ret = 0;
+ int i, len, new_argc = 0, ret = 0;
bool is_return = false;
char *symbol = NULL, *tmp = NULL;
+ const char **new_argv = NULL;
const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
enum probe_print_type ptype;
int maxactive = 0;
@@ -742,7 +743,8 @@ static int __trace_kprobe_create(int argc, const char *argv[])
void *addr = NULL;
char buf[MAX_EVENT_NAME_LEN];
char gbuf[MAX_EVENT_NAME_LEN];
- unsigned int flags = TPARG_FL_KERNEL;
+ char abuf[MAX_BTF_ARGS_LEN];
+ struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
switch (argv[0][0]) {
case 'r':
@@ -764,7 +766,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (isdigit(argv[0][1])) {
if (!is_return) {
- trace_probe_log_err(1, MAXACT_NO_KPROBE);
+ trace_probe_log_err(1, BAD_MAXACT_TYPE);
goto parse_error;
}
if (event)
@@ -823,10 +825,10 @@ static int __trace_kprobe_create(int argc, const char *argv[])
goto parse_error;
}
if (is_return)
- flags |= TPARG_FL_RETURN;
+ ctx.flags |= TPARG_FL_RETURN;
ret = kprobe_on_func_entry(NULL, symbol, offset);
- if (ret == 0)
- flags |= TPARG_FL_FENTRY;
+ if (ret == 0 && !is_return)
+ ctx.flags |= TPARG_FL_FENTRY;
/* Defer the ENOENT case until register kprobe */
if (ret == -EINVAL && is_return) {
trace_probe_log_err(0, BAD_RETPROBE);
@@ -854,21 +856,35 @@ static int __trace_kprobe_create(int argc, const char *argv[])
event = buf;
}
+ argc -= 2; argv += 2;
+ ctx.funcname = symbol;
+ new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
+ abuf, MAX_BTF_ARGS_LEN, &ctx);
+ if (IS_ERR(new_argv)) {
+ ret = PTR_ERR(new_argv);
+ new_argv = NULL;
+ goto out;
+ }
+ if (new_argv) {
+ argc = new_argc;
+ argv = new_argv;
+ }
+
/* setup a probe */
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
- argc - 2, is_return);
+ argc, is_return);
if (IS_ERR(tk)) {
ret = PTR_ERR(tk);
/* This must return -ENOMEM, else there is a bug */
WARN_ON_ONCE(ret != -ENOMEM);
goto out; /* We know tk is not allocated */
}
- argc -= 2; argv += 2;
/* parse arguments */
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
trace_probe_log_set_index(i + 2);
- ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags);
+ ctx.offset = 0;
+ ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx);
if (ret)
goto error; /* This can be -ENOMEM */
}
@@ -891,7 +907,9 @@ static int __trace_kprobe_create(int argc, const char *argv[])
}
out:
+ traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
+ kfree(new_argv);
kfree(symbol);
return ret;
@@ -1544,15 +1562,10 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
*fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE
: BPF_FD_TYPE_KPROBE;
- if (tk->symbol) {
- *symbol = tk->symbol;
- *probe_offset = tk->rp.kp.offset;
- *probe_addr = 0;
- } else {
- *symbol = NULL;
- *probe_offset = 0;
- *probe_addr = (unsigned long)tk->rp.kp.addr;
- }
+ *probe_offset = tk->rp.kp.offset;
+ *probe_addr = kallsyms_show_value(current_cred()) ?
+ (unsigned long)tk->rp.kp.addr : 0;
+ *symbol = tk->symbol;
return 0;
}
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_kprobe_selftest.c b/kernel/trace/trace_kprobe_selftest.c
index 16548ee4c8c6..3851cd1e6a62 100644
--- a/kernel/trace/trace_kprobe_selftest.c
+++ b/kernel/trace/trace_kprobe_selftest.c
@@ -1,4 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+
+#include "trace_kprobe_selftest.h"
+
/*
* Function used during the kprobe self test. This function is in a separate
* compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index e97e3fa5cbed..bd0d01d00fb9 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -181,6 +181,7 @@ struct osn_irq {
#define IRQ_CONTEXT 0
#define THREAD_CONTEXT 1
+#define THREAD_URET 2
/*
* sofirq runtime info.
*/
@@ -238,6 +239,7 @@ struct timerlat_variables {
u64 abs_period;
bool tracing_thread;
u64 count;
+ bool uthread_migrate;
};
static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var);
@@ -1181,6 +1183,78 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
osn_var->thread.arrival_time = 0;
}
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * osnoise_stop_exception - Stop tracing and the tracer.
+ */
+static __always_inline void osnoise_stop_exception(char *msg, int cpu)
+{
+ struct osnoise_instance *inst;
+ struct trace_array *tr;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ tr = inst->tr;
+ trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
+ "stop tracing hit on cpu %d due to exception: %s\n",
+ smp_processor_id(),
+ msg);
+
+ if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options))
+ panic("tracer hit on cpu %d due to exception: %s\n",
+ smp_processor_id(),
+ msg);
+
+ tracer_tracing_off(tr);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * trace_sched_migrate_callback - sched:sched_migrate_task trace event handler
+ *
+ * his function is hooked to the sched:sched_migrate_task trace event, and monitors
+ * timerlat user-space thread migration.
+ */
+static void trace_sched_migrate_callback(void *data, struct task_struct *p, int dest_cpu)
+{
+ struct osnoise_variables *osn_var;
+ long cpu = task_cpu(p);
+
+ osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
+ if (osn_var->pid == p->pid && dest_cpu != cpu) {
+ per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1;
+ osnoise_taint("timerlat user-thread migrated\n");
+ osnoise_stop_exception("timerlat user-thread migrated", cpu);
+ }
+}
+
+static int register_migration_monitor(void)
+{
+ int ret = 0;
+
+ /*
+ * Timerlat thread migration check is only required when running timerlat in user-space.
+ * Thus, enable callback only if timerlat is set with no workload.
+ */
+ if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options))
+ ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+
+ return ret;
+}
+
+static void unregister_migration_monitor(void)
+{
+ if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options))
+ unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+}
+#else
+static int register_migration_monitor(void)
+{
+ return 0;
+}
+static void unregister_migration_monitor(void) {}
+#endif
/*
* trace_sched_switch - sched:sched_switch trace event handler
*
@@ -1204,7 +1278,7 @@ trace_sched_switch_callback(void *data, bool preempt,
}
/*
- * hook_thread_events - Hook the insturmentation for thread noise
+ * hook_thread_events - Hook the instrumentation for thread noise
*
* Hook the osnoise tracer callbacks to handle the noise from other
* threads on the necessary kernel events.
@@ -1217,11 +1291,19 @@ static int hook_thread_events(void)
if (ret)
return -EINVAL;
+ ret = register_migration_monitor();
+ if (ret)
+ goto out_unreg;
+
return 0;
+
+out_unreg:
+ unregister_trace_sched_switch(trace_sched_switch_callback, NULL);
+ return -EINVAL;
}
/*
- * unhook_thread_events - *nhook the insturmentation for thread noise
+ * unhook_thread_events - unhook the instrumentation for thread noise
*
* Unook the osnoise tracer callbacks to handle the noise from other
* threads on the necessary kernel events.
@@ -1229,6 +1311,7 @@ static int hook_thread_events(void)
static void unhook_thread_events(void)
{
unregister_trace_sched_switch(trace_sched_switch_callback, NULL);
+ unregister_migration_monitor();
}
/*
@@ -1286,6 +1369,22 @@ static __always_inline void osnoise_stop_tracing(void)
}
/*
+ * osnoise_has_tracing_on - Check if there is at least one instance on
+ */
+static __always_inline int osnoise_has_tracing_on(void)
+{
+ struct osnoise_instance *inst;
+ int trace_is_on = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list)
+ trace_is_on += tracer_tracing_is_on(inst->tr);
+ rcu_read_unlock();
+
+ return trace_is_on;
+}
+
+/*
* notify_new_max_latency - Notify a new max latency via fsnotify interface.
*/
static void notify_new_max_latency(u64 latency)
@@ -1517,13 +1616,16 @@ static struct cpumask save_cpumask;
/*
* osnoise_sleep - sleep until the next period
*/
-static void osnoise_sleep(void)
+static void osnoise_sleep(bool skip_period)
{
u64 interval;
ktime_t wake_time;
mutex_lock(&interface_lock);
- interval = osnoise_data.sample_period - osnoise_data.sample_runtime;
+ if (skip_period)
+ interval = osnoise_data.sample_period;
+ else
+ interval = osnoise_data.sample_period - osnoise_data.sample_runtime;
mutex_unlock(&interface_lock);
/*
@@ -1546,6 +1648,39 @@ static void osnoise_sleep(void)
}
/*
+ * osnoise_migration_pending - checks if the task needs to migrate
+ *
+ * osnoise/timerlat threads are per-cpu. If there is a pending request to
+ * migrate the thread away from the current CPU, something bad has happened.
+ * Play the good citizen and leave.
+ *
+ * Returns 0 if it is safe to continue, 1 otherwise.
+ */
+static inline int osnoise_migration_pending(void)
+{
+ if (!current->migration_pending)
+ return 0;
+
+ /*
+ * If migration is pending, there is a task waiting for the
+ * tracer to enable migration. The tracer does not allow migration,
+ * thus: taint and leave to unblock the blocked thread.
+ */
+ osnoise_taint("migration requested to osnoise threads, leaving.");
+
+ /*
+ * Unset this thread from the threads managed by the interface.
+ * The tracers are responsible for cleaning their env before
+ * exiting.
+ */
+ mutex_lock(&interface_lock);
+ this_cpu_osn_var()->kthread = NULL;
+ mutex_unlock(&interface_lock);
+
+ return 1;
+}
+
+/*
* osnoise_main - The osnoise detection kernel thread
*
* Calls run_osnoise() function to measure the osnoise for the configured runtime,
@@ -1553,12 +1688,35 @@ static void osnoise_sleep(void)
*/
static int osnoise_main(void *data)
{
+ unsigned long flags;
+
+ /*
+ * This thread was created pinned to the CPU using PF_NO_SETAFFINITY.
+ * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread.
+ *
+ * To work around this limitation, disable migration and remove the
+ * flag.
+ */
+ migrate_disable();
+ raw_spin_lock_irqsave(&current->pi_lock, flags);
+ current->flags &= ~(PF_NO_SETAFFINITY);
+ raw_spin_unlock_irqrestore(&current->pi_lock, flags);
while (!kthread_should_stop()) {
+ if (osnoise_migration_pending())
+ break;
+
+ /* skip a period if tracing is off on all instances */
+ if (!osnoise_has_tracing_on()) {
+ osnoise_sleep(true);
+ continue;
+ }
+
run_osnoise();
- osnoise_sleep();
+ osnoise_sleep(false);
}
+ migrate_enable();
return 0;
}
@@ -1706,6 +1864,7 @@ static int timerlat_main(void *data)
struct timerlat_variables *tlat = this_cpu_tmr_var();
struct timerlat_sample s;
struct sched_param sp;
+ unsigned long flags;
u64 now, diff;
/*
@@ -1714,6 +1873,18 @@ static int timerlat_main(void *data)
sp.sched_priority = DEFAULT_TIMERLAT_PRIO;
sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+ /*
+ * This thread was created pinned to the CPU using PF_NO_SETAFFINITY.
+ * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread.
+ *
+ * To work around this limitation, disable migration and remove the
+ * flag.
+ */
+ migrate_disable();
+ raw_spin_lock_irqsave(&current->pi_lock, flags);
+ current->flags &= ~(PF_NO_SETAFFINITY);
+ raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+
tlat->count = 0;
tlat->tracing_thread = false;
@@ -1731,6 +1902,7 @@ static int timerlat_main(void *data)
osn_var->sampling = 1;
while (!kthread_should_stop()) {
+
now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
diff = now - tlat->abs_period;
@@ -1749,10 +1921,14 @@ static int timerlat_main(void *data)
if (time_to_us(diff) >= osnoise_data.stop_tracing_total)
osnoise_stop_tracing();
+ if (osnoise_migration_pending())
+ break;
+
wait_next_period(tlat);
}
hrtimer_cancel(&tlat->timer);
+ migrate_enable();
return 0;
}
#else /* CONFIG_TIMERLAT_TRACER */
@@ -1771,10 +1947,24 @@ static void stop_kthread(unsigned int cpu)
kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
if (kthread) {
- kthread_stop(kthread);
+ if (test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ kthread_stop(kthread);
+ } else {
+ /*
+ * This is a user thread waiting on the timerlat_fd. We need
+ * to close all users, and the best way to guarantee this is
+ * by killing the thread. NOTE: this is a purpose specific file.
+ */
+ kill_pid(kthread->thread_pid, SIGKILL, 1);
+ put_task_struct(kthread);
+ }
per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
} else {
+ /* if no workload, just return */
if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ /*
+ * This is set in the osnoise tracer case.
+ */
per_cpu(per_cpu_osnoise_var, cpu).sampling = false;
barrier();
return;
@@ -1819,7 +2009,6 @@ static int start_kthread(unsigned int cpu)
barrier();
return 0;
}
-
snprintf(comm, 24, "osnoise/%d", cpu);
}
@@ -1848,6 +2037,11 @@ static int start_per_cpu_kthreads(void)
int retval = 0;
int cpu;
+ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ if (timerlat_enabled())
+ return 0;
+ }
+
cpus_read_lock();
/*
* Run only on online CPUs in which osnoise is allowed to run.
@@ -2188,6 +2382,223 @@ err_free:
return err;
}
+#ifdef CONFIG_TIMERLAT_TRACER
+static int timerlat_fd_open(struct inode *inode, struct file *file)
+{
+ struct osnoise_variables *osn_var;
+ struct timerlat_variables *tlat;
+ long cpu = (long) inode->i_cdev;
+
+ mutex_lock(&interface_lock);
+
+ /*
+ * This file is accessible only if timerlat is enabled, and
+ * NO_OSNOISE_WORKLOAD is set.
+ */
+ if (!timerlat_enabled() || test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ mutex_unlock(&interface_lock);
+ return -EINVAL;
+ }
+
+ migrate_disable();
+
+ osn_var = this_cpu_osn_var();
+
+ /*
+ * The osn_var->pid holds the single access to this file.
+ */
+ if (osn_var->pid) {
+ mutex_unlock(&interface_lock);
+ migrate_enable();
+ return -EBUSY;
+ }
+
+ /*
+ * timerlat tracer is a per-cpu tracer. Check if the user-space too
+ * is pinned to a single CPU. The tracer laters monitor if the task
+ * migrates and then disables tracer if it does. However, it is
+ * worth doing this basic acceptance test to avoid obviusly wrong
+ * setup.
+ */
+ if (current->nr_cpus_allowed > 1 || cpu != smp_processor_id()) {
+ mutex_unlock(&interface_lock);
+ migrate_enable();
+ return -EPERM;
+ }
+
+ /*
+ * From now on, it is good to go.
+ */
+ file->private_data = inode->i_cdev;
+
+ get_task_struct(current);
+
+ osn_var->kthread = current;
+ osn_var->pid = current->pid;
+
+ /*
+ * Setup is done.
+ */
+ mutex_unlock(&interface_lock);
+
+ tlat = this_cpu_tmr_var();
+ tlat->count = 0;
+
+ migrate_enable();
+ return 0;
+};
+
+/*
+ * timerlat_fd_read - Read function for "timerlat_fd" file
+ * @file: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * Prints 1 on timerlat, the number of interferences on osnoise, -1 on error.
+ */
+static ssize_t
+timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
+ loff_t *ppos)
+{
+ long cpu = (long) file->private_data;
+ struct osnoise_variables *osn_var;
+ struct timerlat_variables *tlat;
+ struct timerlat_sample s;
+ s64 diff;
+ u64 now;
+
+ migrate_disable();
+
+ tlat = this_cpu_tmr_var();
+
+ /*
+ * While in user-space, the thread is migratable. There is nothing
+ * we can do about it.
+ * So, if the thread is running on another CPU, stop the machinery.
+ */
+ if (cpu == smp_processor_id()) {
+ if (tlat->uthread_migrate) {
+ migrate_enable();
+ return -EINVAL;
+ }
+ } else {
+ per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1;
+ osnoise_taint("timerlat user thread migrate\n");
+ osnoise_stop_tracing();
+ migrate_enable();
+ return -EINVAL;
+ }
+
+ osn_var = this_cpu_osn_var();
+
+ /*
+ * The timerlat in user-space runs in a different order:
+ * the read() starts from the execution of the previous occurrence,
+ * sleeping for the next occurrence.
+ *
+ * So, skip if we are entering on read() before the first wakeup
+ * from timerlat IRQ:
+ */
+ if (likely(osn_var->sampling)) {
+ now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
+ diff = now - tlat->abs_period;
+
+ /*
+ * it was not a timer firing, but some other signal?
+ */
+ if (diff < 0)
+ goto out;
+
+ s.seqnum = tlat->count;
+ s.timer_latency = diff;
+ s.context = THREAD_URET;
+
+ trace_timerlat_sample(&s);
+
+ notify_new_max_latency(diff);
+
+ tlat->tracing_thread = false;
+ if (osnoise_data.stop_tracing_total)
+ if (time_to_us(diff) >= osnoise_data.stop_tracing_total)
+ osnoise_stop_tracing();
+ } else {
+ tlat->tracing_thread = false;
+ tlat->kthread = current;
+
+ hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
+ tlat->timer.function = timerlat_irq;
+
+ /* Annotate now to drift new period */
+ tlat->abs_period = hrtimer_cb_get_time(&tlat->timer);
+
+ osn_var->sampling = 1;
+ }
+
+ /* wait for the next period */
+ wait_next_period(tlat);
+
+ /* This is the wakeup from this cycle */
+ now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
+ diff = now - tlat->abs_period;
+
+ /*
+ * it was not a timer firing, but some other signal?
+ */
+ if (diff < 0)
+ goto out;
+
+ s.seqnum = tlat->count;
+ s.timer_latency = diff;
+ s.context = THREAD_CONTEXT;
+
+ trace_timerlat_sample(&s);
+
+ if (osnoise_data.stop_tracing_total) {
+ if (time_to_us(diff) >= osnoise_data.stop_tracing_total) {
+ timerlat_dump_stack(time_to_us(diff));
+ notify_new_max_latency(diff);
+ osnoise_stop_tracing();
+ }
+ }
+
+out:
+ migrate_enable();
+ return 0;
+}
+
+static int timerlat_fd_release(struct inode *inode, struct file *file)
+{
+ struct osnoise_variables *osn_var;
+ struct timerlat_variables *tlat_var;
+ long cpu = (long) file->private_data;
+
+ migrate_disable();
+ mutex_lock(&interface_lock);
+
+ osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
+ tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
+
+ hrtimer_cancel(&tlat_var->timer);
+ memset(tlat_var, 0, sizeof(*tlat_var));
+
+ osn_var->sampling = 0;
+ osn_var->pid = 0;
+
+ /*
+ * We are leaving, not being stopped... see stop_kthread();
+ */
+ if (osn_var->kthread) {
+ put_task_struct(osn_var->kthread);
+ osn_var->kthread = NULL;
+ }
+
+ mutex_unlock(&interface_lock);
+ migrate_enable();
+ return 0;
+}
+#endif
+
/*
* osnoise/runtime_us: cannot be greater than the period.
*/
@@ -2251,6 +2662,13 @@ static struct trace_min_max_param timerlat_period = {
.max = &timerlat_max_period,
.min = &timerlat_min_period,
};
+
+static const struct file_operations timerlat_fd_fops = {
+ .open = timerlat_fd_open,
+ .read = timerlat_fd_read,
+ .release = timerlat_fd_release,
+ .llseek = generic_file_llseek,
+};
#endif
static const struct file_operations cpus_fops = {
@@ -2288,18 +2706,63 @@ static int init_timerlat_stack_tracefs(struct dentry *top_dir)
}
#endif /* CONFIG_STACKTRACE */
+static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir)
+{
+ struct dentry *timerlat_fd;
+ struct dentry *per_cpu;
+ struct dentry *cpu_dir;
+ char cpu_str[30]; /* see trace.c: tracing_init_tracefs_percpu() */
+ long cpu;
+
+ /*
+ * Why not using tracing instance per_cpu/ dir?
+ *
+ * Because osnoise/timerlat have a single workload, having
+ * multiple files like these are wast of memory.
+ */
+ per_cpu = tracefs_create_dir("per_cpu", top_dir);
+ if (!per_cpu)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ snprintf(cpu_str, 30, "cpu%ld", cpu);
+ cpu_dir = tracefs_create_dir(cpu_str, per_cpu);
+ if (!cpu_dir)
+ goto out_clean;
+
+ timerlat_fd = trace_create_file("timerlat_fd", TRACE_MODE_READ,
+ cpu_dir, NULL, &timerlat_fd_fops);
+ if (!timerlat_fd)
+ goto out_clean;
+
+ /* Record the CPU */
+ d_inode(timerlat_fd)->i_cdev = (void *)(cpu);
+ }
+
+ return 0;
+
+out_clean:
+ tracefs_remove(per_cpu);
+ return -ENOMEM;
+}
+
/*
* init_timerlat_tracefs - A function to initialize the timerlat interface files
*/
static int init_timerlat_tracefs(struct dentry *top_dir)
{
struct dentry *tmp;
+ int retval;
tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir,
&timerlat_period, &trace_min_max_fops);
if (!tmp)
return -ENOMEM;
+ retval = osnoise_create_cpu_timerlat_fd(top_dir);
+ if (retval)
+ return retval;
+
return init_timerlat_stack_tracefs(top_dir);
}
#else /* CONFIG_TIMERLAT_TRACER */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 1e33f367783e..db575094c498 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1446,6 +1446,8 @@ static struct trace_event trace_osnoise_event = {
};
/* TRACE_TIMERLAT */
+
+static char *timerlat_lat_context[] = {"irq", "thread", "user-ret"};
static enum print_line_t
trace_timerlat_print(struct trace_iterator *iter, int flags,
struct trace_event *event)
@@ -1458,7 +1460,7 @@ trace_timerlat_print(struct trace_iterator *iter, int flags,
trace_seq_printf(s, "#%-5u context %6s timer_latency %9llu ns\n",
field->seqnum,
- field->context ? "thread" : "irq",
+ timerlat_lat_context[field->context],
field->timer_latency);
return trace_handle_return(s);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 2d2616678295..4dc74d73fc1d 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -11,6 +11,9 @@
*/
#define pr_fmt(fmt) "trace_probe: " fmt
+#include <linux/bpf.h>
+#include "trace_btf.h"
+
#include "trace_probe.h"
#undef C
@@ -65,7 +68,7 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent)
int len = *(u32 *)data >> 16;
if (!len)
- trace_seq_puts(s, "(fault)");
+ trace_seq_puts(s, FAULT_STRING);
else
trace_seq_printf(s, "\"%s\"",
(const char *)get_loc_data(data, ent));
@@ -254,7 +257,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
trace_probe_log_err(offset, GROUP_TOO_LONG);
return -EINVAL;
}
- strlcpy(buf, event, slash - event + 1);
+ strscpy(buf, event, slash - event + 1);
if (!is_good_system_name(buf)) {
trace_probe_log_err(offset, BAD_GROUP_NAME);
return -EINVAL;
@@ -283,69 +286,574 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
return 0;
}
+static int parse_trace_event_arg(char *arg, struct fetch_insn *code,
+ struct traceprobe_parse_context *ctx)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+
+ head = trace_get_fields(ctx->event);
+ list_for_each_entry(field, head, link) {
+ if (!strcmp(arg, field->name)) {
+ code->op = FETCH_OP_TP_ARG;
+ code->data = field;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
+
+static u32 btf_type_int(const struct btf_type *t)
+{
+ return *(u32 *)(t + 1);
+}
+
+static bool btf_type_is_char_ptr(struct btf *btf, const struct btf_type *type)
+{
+ const struct btf_type *real_type;
+ u32 intdata;
+ s32 tid;
+
+ real_type = btf_type_skip_modifiers(btf, type->type, &tid);
+ if (!real_type)
+ return false;
+
+ if (BTF_INFO_KIND(real_type->info) != BTF_KIND_INT)
+ return false;
+
+ intdata = btf_type_int(real_type);
+ return !(BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED)
+ && BTF_INT_BITS(intdata) == 8;
+}
+
+static bool btf_type_is_char_array(struct btf *btf, const struct btf_type *type)
+{
+ const struct btf_type *real_type;
+ const struct btf_array *array;
+ u32 intdata;
+ s32 tid;
+
+ if (BTF_INFO_KIND(type->info) != BTF_KIND_ARRAY)
+ return false;
+
+ array = (const struct btf_array *)(type + 1);
+
+ real_type = btf_type_skip_modifiers(btf, array->type, &tid);
+
+ intdata = btf_type_int(real_type);
+ return !(BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED)
+ && BTF_INT_BITS(intdata) == 8;
+}
+
+static int check_prepare_btf_string_fetch(char *typename,
+ struct fetch_insn **pcode,
+ struct traceprobe_parse_context *ctx)
+{
+ struct btf *btf = ctx->btf;
+
+ if (!btf || !ctx->last_type)
+ return 0;
+
+ /* char [] does not need any change. */
+ if (btf_type_is_char_array(btf, ctx->last_type))
+ return 0;
+
+ /* char * requires dereference the pointer. */
+ if (btf_type_is_char_ptr(btf, ctx->last_type)) {
+ struct fetch_insn *code = *pcode + 1;
+
+ if (code->op == FETCH_OP_END) {
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
+ return -E2BIG;
+ }
+ if (typename[0] == 'u')
+ code->op = FETCH_OP_UDEREF;
+ else
+ code->op = FETCH_OP_DEREF;
+ code->offset = 0;
+ *pcode = code;
+ return 0;
+ }
+ /* Other types are not available for string */
+ trace_probe_log_err(ctx->offset, BAD_TYPE4STR);
+ return -EINVAL;
+}
+
+static const char *fetch_type_from_btf_type(struct btf *btf,
+ const struct btf_type *type,
+ struct traceprobe_parse_context *ctx)
+{
+ u32 intdata;
+
+ /* TODO: const char * could be converted as a string */
+ switch (BTF_INFO_KIND(type->info)) {
+ case BTF_KIND_ENUM:
+ /* enum is "int", so convert to "s32" */
+ return "s32";
+ case BTF_KIND_ENUM64:
+ return "s64";
+ case BTF_KIND_PTR:
+ /* pointer will be converted to "x??" */
+ if (IS_ENABLED(CONFIG_64BIT))
+ return "x64";
+ else
+ return "x32";
+ case BTF_KIND_INT:
+ intdata = btf_type_int(type);
+ if (BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) {
+ switch (BTF_INT_BITS(intdata)) {
+ case 8:
+ return "s8";
+ case 16:
+ return "s16";
+ case 32:
+ return "s32";
+ case 64:
+ return "s64";
+ }
+ } else { /* unsigned */
+ switch (BTF_INT_BITS(intdata)) {
+ case 8:
+ return "u8";
+ case 16:
+ return "u16";
+ case 32:
+ return "u32";
+ case 64:
+ return "u64";
+ }
+ /* bitfield, size is encoded in the type */
+ ctx->last_bitsize = BTF_INT_BITS(intdata);
+ ctx->last_bitoffs += BTF_INT_OFFSET(intdata);
+ return "u64";
+ }
+ }
+ /* TODO: support other types */
+
+ return NULL;
+}
+
+static int query_btf_context(struct traceprobe_parse_context *ctx)
+{
+ const struct btf_param *param;
+ const struct btf_type *type;
+ struct btf *btf;
+ s32 nr;
+
+ if (ctx->btf)
+ return 0;
+
+ if (!ctx->funcname)
+ return -EINVAL;
+
+ type = btf_find_func_proto(ctx->funcname, &btf);
+ if (!type)
+ return -ENOENT;
+
+ ctx->btf = btf;
+ ctx->proto = type;
+
+ /* ctx->params is optional, since func(void) will not have params. */
+ nr = 0;
+ param = btf_get_func_param(type, &nr);
+ if (!IS_ERR_OR_NULL(param)) {
+ /* Hide the first 'data' argument of tracepoint */
+ if (ctx->flags & TPARG_FL_TPOINT) {
+ nr--;
+ param++;
+ }
+ }
+
+ if (nr > 0) {
+ ctx->nr_params = nr;
+ ctx->params = param;
+ } else {
+ ctx->nr_params = 0;
+ ctx->params = NULL;
+ }
+
+ return 0;
+}
+
+static void clear_btf_context(struct traceprobe_parse_context *ctx)
+{
+ if (ctx->btf) {
+ btf_put(ctx->btf);
+ ctx->btf = NULL;
+ ctx->proto = NULL;
+ ctx->params = NULL;
+ ctx->nr_params = 0;
+ }
+}
+
+/* Return 1 if the field separater is arrow operator ('->') */
+static int split_next_field(char *varname, char **next_field,
+ struct traceprobe_parse_context *ctx)
+{
+ char *field;
+ int ret = 0;
+
+ field = strpbrk(varname, ".-");
+ if (field) {
+ if (field[0] == '-' && field[1] == '>') {
+ field[0] = '\0';
+ field += 2;
+ ret = 1;
+ } else if (field[0] == '.') {
+ field[0] = '\0';
+ field += 1;
+ } else {
+ trace_probe_log_err(ctx->offset + field - varname, BAD_HYPHEN);
+ return -EINVAL;
+ }
+ *next_field = field;
+ }
+
+ return ret;
+}
+
+/*
+ * Parse the field of data structure. The @type must be a pointer type
+ * pointing the target data structure type.
+ */
+static int parse_btf_field(char *fieldname, const struct btf_type *type,
+ struct fetch_insn **pcode, struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *code = *pcode;
+ const struct btf_member *field;
+ u32 bitoffs, anon_offs;
+ char *next;
+ int is_ptr;
+ s32 tid;
+
+ do {
+ /* Outer loop for solving arrow operator ('->') */
+ if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
+ trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
+ return -EINVAL;
+ }
+ /* Convert a struct pointer type to a struct type */
+ type = btf_type_skip_modifiers(ctx->btf, type->type, &tid);
+ if (!type) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return -EINVAL;
+ }
+
+ bitoffs = 0;
+ do {
+ /* Inner loop for solving dot operator ('.') */
+ next = NULL;
+ is_ptr = split_next_field(fieldname, &next, ctx);
+ if (is_ptr < 0)
+ return is_ptr;
+
+ anon_offs = 0;
+ field = btf_find_struct_member(ctx->btf, type, fieldname,
+ &anon_offs);
+ if (!field) {
+ trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
+ return -ENOENT;
+ }
+ /* Add anonymous structure/union offset */
+ bitoffs += anon_offs;
+
+ /* Accumulate the bit-offsets of the dot-connected fields */
+ if (btf_type_kflag(type)) {
+ bitoffs += BTF_MEMBER_BIT_OFFSET(field->offset);
+ ctx->last_bitsize = BTF_MEMBER_BITFIELD_SIZE(field->offset);
+ } else {
+ bitoffs += field->offset;
+ ctx->last_bitsize = 0;
+ }
+
+ type = btf_type_skip_modifiers(ctx->btf, field->type, &tid);
+ if (!type) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return -EINVAL;
+ }
+
+ ctx->offset += next - fieldname;
+ fieldname = next;
+ } while (!is_ptr && fieldname);
+
+ if (++code == end) {
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
+ return -EINVAL;
+ }
+ code->op = FETCH_OP_DEREF; /* TODO: user deref support */
+ code->offset = bitoffs / 8;
+ *pcode = code;
+
+ ctx->last_bitoffs = bitoffs % 8;
+ ctx->last_type = type;
+ } while (fieldname);
+
+ return 0;
+}
+
+static int parse_btf_arg(char *varname,
+ struct fetch_insn **pcode, struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *code = *pcode;
+ const struct btf_param *params;
+ const struct btf_type *type;
+ char *field = NULL;
+ int i, is_ptr, ret;
+ u32 tid;
+
+ if (WARN_ON_ONCE(!ctx->funcname))
+ return -EINVAL;
+
+ is_ptr = split_next_field(varname, &field, ctx);
+ if (is_ptr < 0)
+ return is_ptr;
+ if (!is_ptr && field) {
+ /* dot-connected field on an argument is not supported. */
+ trace_probe_log_err(ctx->offset + field - varname,
+ NOSUP_DAT_ARG);
+ return -EOPNOTSUPP;
+ }
+
+ if (ctx->flags & TPARG_FL_RETURN) {
+ if (strcmp(varname, "$retval") != 0) {
+ trace_probe_log_err(ctx->offset, NO_BTFARG);
+ return -ENOENT;
+ }
+ code->op = FETCH_OP_RETVAL;
+ /* Check whether the function return type is not void */
+ if (query_btf_context(ctx) == 0) {
+ if (ctx->proto->type == 0) {
+ trace_probe_log_err(ctx->offset, NO_RETVAL);
+ return -ENOENT;
+ }
+ tid = ctx->proto->type;
+ goto found;
+ }
+ if (field) {
+ trace_probe_log_err(ctx->offset + field - varname,
+ NO_BTF_ENTRY);
+ return -ENOENT;
+ }
+ return 0;
+ }
+
+ if (!ctx->btf) {
+ ret = query_btf_context(ctx);
+ if (ret < 0 || ctx->nr_params == 0) {
+ trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
+ return PTR_ERR(params);
+ }
+ }
+ params = ctx->params;
+
+ for (i = 0; i < ctx->nr_params; i++) {
+ const char *name = btf_name_by_offset(ctx->btf, params[i].name_off);
+
+ if (name && !strcmp(name, varname)) {
+ code->op = FETCH_OP_ARG;
+ if (ctx->flags & TPARG_FL_TPOINT)
+ code->param = i + 1;
+ else
+ code->param = i;
+ tid = params[i].type;
+ goto found;
+ }
+ }
+ trace_probe_log_err(ctx->offset, NO_BTFARG);
+ return -ENOENT;
+
+found:
+ type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
+ if (!type) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return -EINVAL;
+ }
+ /* Initialize the last type information */
+ ctx->last_type = type;
+ ctx->last_bitoffs = 0;
+ ctx->last_bitsize = 0;
+ if (field) {
+ ctx->offset += field - varname;
+ return parse_btf_field(field, type, pcode, end, ctx);
+ }
+ return 0;
+}
+
+static const struct fetch_type *find_fetch_type_from_btf_type(
+ struct traceprobe_parse_context *ctx)
+{
+ struct btf *btf = ctx->btf;
+ const char *typestr = NULL;
+
+ if (btf && ctx->last_type)
+ typestr = fetch_type_from_btf_type(btf, ctx->last_type, ctx);
+
+ return find_fetch_type(typestr, ctx->flags);
+}
+
+static int parse_btf_bitfield(struct fetch_insn **pcode,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *code = *pcode;
+
+ if ((ctx->last_bitsize % 8 == 0) && ctx->last_bitoffs == 0)
+ return 0;
+
+ code++;
+ if (code->op != FETCH_OP_NOP) {
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
+ return -EINVAL;
+ }
+ *pcode = code;
+
+ code->op = FETCH_OP_MOD_BF;
+ code->lshift = 64 - (ctx->last_bitsize + ctx->last_bitoffs);
+ code->rshift = 64 - ctx->last_bitsize;
+ code->basesize = 64 / 8;
+ return 0;
+}
+
+#else
+static void clear_btf_context(struct traceprobe_parse_context *ctx)
+{
+ ctx->btf = NULL;
+}
+
+static int query_btf_context(struct traceprobe_parse_context *ctx)
+{
+ return -EOPNOTSUPP;
+}
+
+static int parse_btf_arg(char *varname,
+ struct fetch_insn **pcode, struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
+{
+ trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
+ return -EOPNOTSUPP;
+}
+
+static int parse_btf_bitfield(struct fetch_insn **pcode,
+ struct traceprobe_parse_context *ctx)
+{
+ trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
+ return -EOPNOTSUPP;
+}
+
+#define find_fetch_type_from_btf_type(ctx) \
+ find_fetch_type(NULL, ctx->flags)
+
+static int check_prepare_btf_string_fetch(char *typename,
+ struct fetch_insn **pcode,
+ struct traceprobe_parse_context *ctx)
+{
+ return 0;
+}
+
+#endif
+
#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
-static int parse_probe_vars(char *arg, const struct fetch_type *t,
- struct fetch_insn *code, unsigned int flags, int offs)
+/* Parse $vars. @orig_arg points '$', which syncs to @ctx->offset */
+static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
+ struct fetch_insn **pcode,
+ struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
{
+ struct fetch_insn *code = *pcode;
+ int err = TP_ERR_BAD_VAR;
+ char *arg = orig_arg + 1;
unsigned long param;
int ret = 0;
int len;
- if (flags & TPARG_FL_TPOINT) {
+ if (ctx->flags & TPARG_FL_TEVENT) {
if (code->data)
return -EFAULT;
- code->data = kstrdup(arg, GFP_KERNEL);
- if (!code->data)
- return -ENOMEM;
- code->op = FETCH_OP_TP_ARG;
- } else if (strcmp(arg, "retval") == 0) {
- if (flags & TPARG_FL_RETURN) {
+ ret = parse_trace_event_arg(arg, code, ctx);
+ if (!ret)
+ return 0;
+ if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
+ code->op = FETCH_OP_COMM;
+ return 0;
+ }
+ /* backward compatibility */
+ ctx->offset = 0;
+ goto inval;
+ }
+
+ if (str_has_prefix(arg, "retval")) {
+ if (!(ctx->flags & TPARG_FL_RETURN)) {
+ err = TP_ERR_RETVAL_ON_PROBE;
+ goto inval;
+ }
+ if (!(ctx->flags & TPARG_FL_KERNEL) ||
+ !IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS)) {
code->op = FETCH_OP_RETVAL;
- } else {
- trace_probe_log_err(offs, RETVAL_ON_PROBE);
- ret = -EINVAL;
+ return 0;
}
- } else if ((len = str_has_prefix(arg, "stack"))) {
+ return parse_btf_arg(orig_arg, pcode, end, ctx);
+ }
+
+ len = str_has_prefix(arg, "stack");
+ if (len) {
+
if (arg[len] == '\0') {
code->op = FETCH_OP_STACKP;
- } else if (isdigit(arg[len])) {
+ return 0;
+ }
+
+ if (isdigit(arg[len])) {
ret = kstrtoul(arg + len, 10, &param);
- if (ret) {
- goto inval_var;
- } else if ((flags & TPARG_FL_KERNEL) &&
- param > PARAM_MAX_STACK) {
- trace_probe_log_err(offs, BAD_STACK_NUM);
- ret = -EINVAL;
- } else {
- code->op = FETCH_OP_STACK;
- code->param = (unsigned int)param;
+ if (ret)
+ goto inval;
+
+ if ((ctx->flags & TPARG_FL_KERNEL) &&
+ param > PARAM_MAX_STACK) {
+ err = TP_ERR_BAD_STACK_NUM;
+ goto inval;
}
- } else
- goto inval_var;
- } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
+ code->op = FETCH_OP_STACK;
+ code->param = (unsigned int)param;
+ return 0;
+ }
+ goto inval;
+ }
+
+ if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
code->op = FETCH_OP_COMM;
+ return 0;
+ }
+
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
- } else if (((flags & TPARG_FL_MASK) ==
- (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) &&
- (len = str_has_prefix(arg, "arg"))) {
+ len = str_has_prefix(arg, "arg");
+ if (len && tparg_is_function_entry(ctx->flags)) {
ret = kstrtoul(arg + len, 10, &param);
- if (ret) {
- goto inval_var;
- } else if (!param || param > PARAM_MAX_STACK) {
- trace_probe_log_err(offs, BAD_ARG_NUM);
- return -EINVAL;
+ if (ret)
+ goto inval;
+
+ if (!param || param > PARAM_MAX_STACK) {
+ err = TP_ERR_BAD_ARG_NUM;
+ goto inval;
}
+
code->op = FETCH_OP_ARG;
code->param = (unsigned int)param - 1;
+ /*
+ * The tracepoint probe will probe a stub function, and the
+ * first parameter of the stub is a dummy and should be ignored.
+ */
+ if (ctx->flags & TPARG_FL_TPOINT)
+ code->param++;
+ return 0;
+ }
#endif
- } else
- goto inval_var;
- return ret;
-
-inval_var:
- trace_probe_log_err(offs, BAD_VAR);
+inval:
+ __trace_probe_log_err(ctx->offset, err);
return -EINVAL;
}
@@ -378,7 +886,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs)
static int
parse_probe_arg(char *arg, const struct fetch_type *type,
struct fetch_insn **pcode, struct fetch_insn *end,
- unsigned int flags, int offs)
+ struct traceprobe_parse_context *ctx)
{
struct fetch_insn *code = *pcode;
unsigned long param;
@@ -389,13 +897,13 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
switch (arg[0]) {
case '$':
- ret = parse_probe_vars(arg + 1, type, code, flags, offs);
+ ret = parse_probe_vars(arg, type, pcode, end, ctx);
break;
case '%': /* named register */
- if (flags & TPARG_FL_TPOINT) {
- /* eprobes do not handle registers */
- trace_probe_log_err(offs, BAD_VAR);
+ if (ctx->flags & (TPARG_FL_TEVENT | TPARG_FL_FPROBE)) {
+ /* eprobe and fprobe do not handle registers */
+ trace_probe_log_err(ctx->offset, BAD_VAR);
break;
}
ret = regs_query_register_offset(arg + 1);
@@ -404,14 +912,14 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
code->param = (unsigned int)ret;
ret = 0;
} else
- trace_probe_log_err(offs, BAD_REG_NAME);
+ trace_probe_log_err(ctx->offset, BAD_REG_NAME);
break;
case '@': /* memory, file-offset or symbol */
if (isdigit(arg[1])) {
ret = kstrtoul(arg + 1, 0, &param);
if (ret) {
- trace_probe_log_err(offs, BAD_MEM_ADDR);
+ trace_probe_log_err(ctx->offset, BAD_MEM_ADDR);
break;
}
/* load address */
@@ -419,13 +927,13 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
code->immediate = param;
} else if (arg[1] == '+') {
/* kprobes don't support file offsets */
- if (flags & TPARG_FL_KERNEL) {
- trace_probe_log_err(offs, FILE_ON_KPROBE);
+ if (ctx->flags & TPARG_FL_KERNEL) {
+ trace_probe_log_err(ctx->offset, FILE_ON_KPROBE);
return -EINVAL;
}
ret = kstrtol(arg + 2, 0, &offset);
if (ret) {
- trace_probe_log_err(offs, BAD_FILE_OFFS);
+ trace_probe_log_err(ctx->offset, BAD_FILE_OFFS);
break;
}
@@ -433,8 +941,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
code->immediate = (unsigned long)offset; // imm64?
} else {
/* uprobes don't support symbols */
- if (!(flags & TPARG_FL_KERNEL)) {
- trace_probe_log_err(offs, SYM_ON_UPROBE);
+ if (!(ctx->flags & TPARG_FL_KERNEL)) {
+ trace_probe_log_err(ctx->offset, SYM_ON_UPROBE);
return -EINVAL;
}
/* Preserve symbol for updating */
@@ -443,7 +951,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
if (!code->data)
return -ENOMEM;
if (++code == end) {
- trace_probe_log_err(offs, TOO_MANY_OPS);
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
return -EINVAL;
}
code->op = FETCH_OP_IMM;
@@ -451,7 +959,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
}
/* These are fetching from memory */
if (++code == end) {
- trace_probe_log_err(offs, TOO_MANY_OPS);
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
return -EINVAL;
}
*pcode = code;
@@ -470,47 +978,51 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
arg++; /* Skip '+', because kstrtol() rejects it. */
tmp = strchr(arg, '(');
if (!tmp) {
- trace_probe_log_err(offs, DEREF_NEED_BRACE);
+ trace_probe_log_err(ctx->offset, DEREF_NEED_BRACE);
return -EINVAL;
}
*tmp = '\0';
ret = kstrtol(arg, 0, &offset);
if (ret) {
- trace_probe_log_err(offs, BAD_DEREF_OFFS);
+ trace_probe_log_err(ctx->offset, BAD_DEREF_OFFS);
break;
}
- offs += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0);
+ ctx->offset += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0);
arg = tmp + 1;
tmp = strrchr(arg, ')');
if (!tmp) {
- trace_probe_log_err(offs + strlen(arg),
+ trace_probe_log_err(ctx->offset + strlen(arg),
DEREF_OPEN_BRACE);
return -EINVAL;
} else {
- const struct fetch_type *t2 = find_fetch_type(NULL, flags);
+ const struct fetch_type *t2 = find_fetch_type(NULL, ctx->flags);
+ int cur_offs = ctx->offset;
*tmp = '\0';
- ret = parse_probe_arg(arg, t2, &code, end, flags, offs);
+ ret = parse_probe_arg(arg, t2, &code, end, ctx);
if (ret)
break;
+ ctx->offset = cur_offs;
if (code->op == FETCH_OP_COMM ||
code->op == FETCH_OP_DATA) {
- trace_probe_log_err(offs, COMM_CANT_DEREF);
+ trace_probe_log_err(ctx->offset, COMM_CANT_DEREF);
return -EINVAL;
}
if (++code == end) {
- trace_probe_log_err(offs, TOO_MANY_OPS);
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
return -EINVAL;
}
*pcode = code;
code->op = deref;
code->offset = offset;
+ /* Reset the last type if used */
+ ctx->last_type = NULL;
}
break;
case '\\': /* Immediate value */
if (arg[1] == '"') { /* Immediate string */
- ret = __parse_imm_string(arg + 2, &tmp, offs + 2);
+ ret = __parse_imm_string(arg + 2, &tmp, ctx->offset + 2);
if (ret)
break;
code->op = FETCH_OP_DATA;
@@ -518,15 +1030,24 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
} else {
ret = str_to_immediate(arg + 1, &code->immediate);
if (ret)
- trace_probe_log_err(offs + 1, BAD_IMM);
+ trace_probe_log_err(ctx->offset + 1, BAD_IMM);
else
code->op = FETCH_OP_IMM;
}
break;
+ default:
+ if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */
+ if (!tparg_is_function_entry(ctx->flags)) {
+ trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
+ return -EINVAL;
+ }
+ ret = parse_btf_arg(arg, pcode, end, ctx);
+ break;
+ }
}
if (!ret && code->op == FETCH_OP_NOP) {
/* Parsed, but do not find fetch method */
- trace_probe_log_err(offs, BAD_FETCH_ARG);
+ trace_probe_log_err(ctx->offset, BAD_FETCH_ARG);
ret = -EINVAL;
}
return ret;
@@ -571,12 +1092,13 @@ static int __parse_bitfield_probe_arg(const char *bf,
/* String length checking wrapper */
static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
- struct probe_arg *parg, unsigned int flags, int offset)
+ struct probe_arg *parg,
+ struct traceprobe_parse_context *ctx)
{
struct fetch_insn *code, *scode, *tmp = NULL;
char *t, *t2, *t3;
- char *arg;
int ret, len;
+ char *arg;
arg = kstrdup(argv, GFP_KERNEL);
if (!arg)
@@ -585,10 +1107,10 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
ret = -EINVAL;
len = strlen(arg);
if (len > MAX_ARGSTR_LEN) {
- trace_probe_log_err(offset, ARG_TOO_LONG);
+ trace_probe_log_err(ctx->offset, ARG_TOO_LONG);
goto out;
} else if (len == 0) {
- trace_probe_log_err(offset, NO_ARG_BODY);
+ trace_probe_log_err(ctx->offset, NO_ARG_BODY);
goto out;
}
@@ -606,23 +1128,24 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
*t2++ = '\0';
t3 = strchr(t2, ']');
if (!t3) {
- offset += t2 + strlen(t2) - arg;
- trace_probe_log_err(offset,
+ int offs = t2 + strlen(t2) - arg;
+
+ trace_probe_log_err(ctx->offset + offs,
ARRAY_NO_CLOSE);
goto out;
} else if (t3[1] != '\0') {
- trace_probe_log_err(offset + t3 + 1 - arg,
+ trace_probe_log_err(ctx->offset + t3 + 1 - arg,
BAD_ARRAY_SUFFIX);
goto out;
}
*t3 = '\0';
if (kstrtouint(t2, 0, &parg->count) || !parg->count) {
- trace_probe_log_err(offset + t2 - arg,
+ trace_probe_log_err(ctx->offset + t2 - arg,
BAD_ARRAY_NUM);
goto out;
}
if (parg->count > MAX_ARRAY_LEN) {
- trace_probe_log_err(offset + t2 - arg,
+ trace_probe_log_err(ctx->offset + t2 - arg,
ARRAY_TOO_BIG);
goto out;
}
@@ -633,17 +1156,17 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
* Since $comm and immediate string can not be dereferenced,
* we can find those by strcmp. But ignore for eprobes.
*/
- if (!(flags & TPARG_FL_TPOINT) &&
+ if (!(ctx->flags & TPARG_FL_TEVENT) &&
(strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 ||
strncmp(arg, "\\\"", 2) == 0)) {
/* The type of $comm must be "string", and not an array. */
if (parg->count || (t && strcmp(t, "string")))
goto out;
- parg->type = find_fetch_type("string", flags);
+ parg->type = find_fetch_type("string", ctx->flags);
} else
- parg->type = find_fetch_type(t, flags);
+ parg->type = find_fetch_type(t, ctx->flags);
if (!parg->type) {
- trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE);
+ trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE);
goto out;
}
parg->offset = *size;
@@ -664,11 +1187,24 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
goto out;
code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
+ ctx->last_type = NULL;
ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
- flags, offset);
+ ctx);
if (ret)
goto fail;
+ /* Update storing type if BTF is available */
+ if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
+ ctx->last_type) {
+ if (!t) {
+ parg->type = find_fetch_type_from_btf_type(ctx);
+ } else if (strstr(t, "string")) {
+ ret = check_prepare_btf_string_fetch(t, &code, ctx);
+ if (ret)
+ goto fail;
+ }
+ }
+
ret = -EINVAL;
/* Store operation */
if (parg->type->is_string) {
@@ -676,7 +1212,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK &&
code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG &&
code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) {
- trace_probe_log_err(offset + (t ? (t - arg) : 0),
+ trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
BAD_SYMSTRING);
goto fail;
}
@@ -684,7 +1220,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
- trace_probe_log_err(offset + (t ? (t - arg) : 0),
+ trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
BAD_STRING);
goto fail;
}
@@ -703,7 +1239,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
*/
code++;
if (code->op != FETCH_OP_NOP) {
- trace_probe_log_err(offset, TOO_MANY_OPS);
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
goto fail;
}
}
@@ -726,7 +1262,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
} else {
code++;
if (code->op != FETCH_OP_NOP) {
- trace_probe_log_err(offset, TOO_MANY_OPS);
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
goto fail;
}
code->op = FETCH_OP_ST_RAW;
@@ -737,9 +1273,14 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
if (t != NULL) {
ret = __parse_bitfield_probe_arg(t, parg->type, &code);
if (ret) {
- trace_probe_log_err(offset + t - arg, BAD_BITFIELD);
+ trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD);
goto fail;
}
+ } else if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
+ ctx->last_type) {
+ ret = parse_btf_bitfield(&code, ctx);
+ if (ret)
+ goto fail;
}
ret = -EINVAL;
/* Loop(Array) operation */
@@ -747,13 +1288,13 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
if (scode->op != FETCH_OP_ST_MEM &&
scode->op != FETCH_OP_ST_STRING &&
scode->op != FETCH_OP_ST_USTRING) {
- trace_probe_log_err(offset + (t ? (t - arg) : 0),
+ trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
BAD_STRING);
goto fail;
}
code++;
if (code->op != FETCH_OP_NOP) {
- trace_probe_log_err(offset, TOO_MANY_OPS);
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
goto fail;
}
code->op = FETCH_OP_LP_ARRAY;
@@ -801,8 +1342,35 @@ static int traceprobe_conflict_field_name(const char *name,
return 0;
}
+static char *generate_probe_arg_name(const char *arg, int idx)
+{
+ char *name = NULL;
+ const char *end;
+
+ /*
+ * If argument name is omitted, try arg as a name (BTF variable)
+ * or "argN".
+ */
+ if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS)) {
+ end = strchr(arg, ':');
+ if (!end)
+ end = arg + strlen(arg);
+
+ name = kmemdup_nul(arg, end - arg, GFP_KERNEL);
+ if (!name || !is_good_name(name)) {
+ kfree(name);
+ name = NULL;
+ }
+ }
+
+ if (!name)
+ name = kasprintf(GFP_KERNEL, "arg%d", idx + 1);
+
+ return name;
+}
+
int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg,
- unsigned int flags)
+ struct traceprobe_parse_context *ctx)
{
struct probe_arg *parg = &tp->args[i];
const char *body;
@@ -822,8 +1390,7 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg,
parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL);
body++;
} else {
- /* If argument name is omitted, set "argN" */
- parg->name = kasprintf(GFP_KERNEL, "arg%d", i + 1);
+ parg->name = generate_probe_arg_name(arg, i);
body = arg;
}
if (!parg->name)
@@ -837,9 +1404,9 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg,
trace_probe_log_err(0, USED_ARG_NAME);
return -EINVAL;
}
+ ctx->offset = body - arg;
/* Parse fetch argument */
- return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags,
- body - arg);
+ return traceprobe_parse_probe_arg_body(body, &tp->size, parg, ctx);
}
void traceprobe_free_probe_arg(struct probe_arg *arg)
@@ -858,6 +1425,151 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
kfree(arg->fmt);
}
+static int argv_has_var_arg(int argc, const char *argv[], int *args_idx,
+ struct traceprobe_parse_context *ctx)
+{
+ int i, found = 0;
+
+ for (i = 0; i < argc; i++)
+ if (str_has_prefix(argv[i], "$arg")) {
+ trace_probe_log_set_index(i + 2);
+
+ if (!tparg_is_function_entry(ctx->flags)) {
+ trace_probe_log_err(0, NOFENTRY_ARGS);
+ return -EINVAL;
+ }
+
+ if (isdigit(argv[i][4])) {
+ found = 1;
+ continue;
+ }
+
+ if (argv[i][4] != '*') {
+ trace_probe_log_err(0, BAD_VAR);
+ return -EINVAL;
+ }
+
+ if (*args_idx >= 0 && *args_idx < argc) {
+ trace_probe_log_err(0, DOUBLE_ARGS);
+ return -EINVAL;
+ }
+ found = 1;
+ *args_idx = i;
+ }
+
+ return found;
+}
+
+static int sprint_nth_btf_arg(int idx, const char *type,
+ char *buf, int bufsize,
+ struct traceprobe_parse_context *ctx)
+{
+ const char *name;
+ int ret;
+
+ if (idx >= ctx->nr_params) {
+ trace_probe_log_err(0, NO_BTFARG);
+ return -ENOENT;
+ }
+ name = btf_name_by_offset(ctx->btf, ctx->params[idx].name_off);
+ if (!name) {
+ trace_probe_log_err(0, NO_BTF_ENTRY);
+ return -ENOENT;
+ }
+ ret = snprintf(buf, bufsize, "%s%s", name, type);
+ if (ret >= bufsize) {
+ trace_probe_log_err(0, ARGS_2LONG);
+ return -E2BIG;
+ }
+ return ret;
+}
+
+/* Return new_argv which must be freed after use */
+const char **traceprobe_expand_meta_args(int argc, const char *argv[],
+ int *new_argc, char *buf, int bufsize,
+ struct traceprobe_parse_context *ctx)
+{
+ const struct btf_param *params = NULL;
+ int i, j, n, used, ret, args_idx = -1;
+ const char **new_argv = NULL;
+
+ ret = argv_has_var_arg(argc, argv, &args_idx, ctx);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (!ret) {
+ *new_argc = argc;
+ return NULL;
+ }
+
+ ret = query_btf_context(ctx);
+ if (ret < 0 || ctx->nr_params == 0) {
+ if (args_idx != -1) {
+ /* $arg* requires BTF info */
+ trace_probe_log_err(0, NOSUP_BTFARG);
+ return (const char **)params;
+ }
+ *new_argc = argc;
+ return NULL;
+ }
+
+ if (args_idx >= 0)
+ *new_argc = argc + ctx->nr_params - 1;
+ else
+ *new_argc = argc;
+
+ new_argv = kcalloc(*new_argc, sizeof(char *), GFP_KERNEL);
+ if (!new_argv)
+ return ERR_PTR(-ENOMEM);
+
+ used = 0;
+ for (i = 0, j = 0; i < argc; i++) {
+ trace_probe_log_set_index(i + 2);
+ if (i == args_idx) {
+ for (n = 0; n < ctx->nr_params; n++) {
+ ret = sprint_nth_btf_arg(n, "", buf + used,
+ bufsize - used, ctx);
+ if (ret < 0)
+ goto error;
+
+ new_argv[j++] = buf + used;
+ used += ret + 1;
+ }
+ continue;
+ }
+
+ if (str_has_prefix(argv[i], "$arg")) {
+ char *type = NULL;
+
+ n = simple_strtoul(argv[i] + 4, &type, 10);
+ if (type && !(*type == ':' || *type == '\0')) {
+ trace_probe_log_err(0, BAD_VAR);
+ ret = -ENOENT;
+ goto error;
+ }
+ /* Note: $argN starts from $arg1 */
+ ret = sprint_nth_btf_arg(n - 1, type, buf + used,
+ bufsize - used, ctx);
+ if (ret < 0)
+ goto error;
+ new_argv[j++] = buf + used;
+ used += ret + 1;
+ } else
+ new_argv[j++] = argv[i];
+ }
+
+ return new_argv;
+
+error:
+ kfree(new_argv);
+ return ERR_PTR(ret);
+}
+
+void traceprobe_finish_parse(struct traceprobe_parse_context *ctx)
+{
+ clear_btf_context(ctx);
+}
+
int traceprobe_update_arg(struct probe_arg *arg)
{
struct fetch_insn *code = arg->code;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 6a4ecfb1da43..02b432ae7513 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -23,6 +23,7 @@
#include <linux/limits.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
+#include <linux/btf.h>
#include <asm/bitsperlong.h>
#include "trace.h"
@@ -32,7 +33,9 @@
#define MAX_ARGSTR_LEN 63
#define MAX_ARRAY_LEN 64
#define MAX_ARG_NAME_LEN 32
+#define MAX_BTF_ARGS_LEN 128
#define MAX_STRING_SIZE PATH_MAX
+#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN)
/* Reserved field names */
#define FIELD_STRING_IP "__probe_ip"
@@ -357,19 +360,58 @@ int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_a
#define trace_probe_for_each_link_rcu(pos, tp) \
list_for_each_entry_rcu(pos, &(tp)->event->files, list)
+/*
+ * The flags used for parsing trace_probe arguments.
+ * TPARG_FL_RETURN, TPARG_FL_FENTRY and TPARG_FL_TEVENT are mutually exclusive.
+ * TPARG_FL_KERNEL and TPARG_FL_USER are also mutually exclusive.
+ * TPARG_FL_FPROBE and TPARG_FL_TPOINT are optional but it should be with
+ * TPARG_FL_KERNEL.
+ */
#define TPARG_FL_RETURN BIT(0)
#define TPARG_FL_KERNEL BIT(1)
#define TPARG_FL_FENTRY BIT(2)
-#define TPARG_FL_TPOINT BIT(3)
+#define TPARG_FL_TEVENT BIT(3)
#define TPARG_FL_USER BIT(4)
-#define TPARG_FL_MASK GENMASK(4, 0)
+#define TPARG_FL_FPROBE BIT(5)
+#define TPARG_FL_TPOINT BIT(6)
+#define TPARG_FL_LOC_MASK GENMASK(4, 0)
+
+static inline bool tparg_is_function_entry(unsigned int flags)
+{
+ return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY);
+}
+
+struct traceprobe_parse_context {
+ struct trace_event_call *event;
+ /* BTF related parameters */
+ const char *funcname; /* Function name in BTF */
+ const struct btf_type *proto; /* Prototype of the function */
+ const struct btf_param *params; /* Parameter of the function */
+ s32 nr_params; /* The number of the parameters */
+ struct btf *btf; /* The BTF to be used */
+ const struct btf_type *last_type; /* Saved type */
+ u32 last_bitoffs; /* Saved bitoffs */
+ u32 last_bitsize; /* Saved bitsize */
+ unsigned int flags;
+ int offset;
+};
extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
- const char *argv, unsigned int flags);
+ const char *argv,
+ struct traceprobe_parse_context *ctx);
+const char **traceprobe_expand_meta_args(int argc, const char *argv[],
+ int *new_argc, char *buf, int bufsize,
+ struct traceprobe_parse_context *ctx);
extern int traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);
+/*
+ * If either traceprobe_parse_probe_arg() or traceprobe_expand_meta_args() is called,
+ * this MUST be called for clean up the context and return a resource.
+ */
+void traceprobe_finish_parse(struct traceprobe_parse_context *ctx);
+
extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
char *buf, int offset);
@@ -404,11 +446,12 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \
C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \
C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \
- C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \
+ C(BAD_MAXACT_TYPE, "Maxactive is only for function exit"), \
C(BAD_MAXACT, "Invalid maxactive number"), \
C(MAXACT_TOO_BIG, "Maxactive is too big"), \
C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \
C(BAD_RETPROBE, "Retprobe address must be an function entry"), \
+ C(NO_TRACEPOINT, "Tracepoint is not found"), \
C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \
C(NO_GROUP_NAME, "Group name is not specified"), \
C(GROUP_TOO_LONG, "Group name is too long"), \
@@ -418,6 +461,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \
C(EVENT_EXIST, "Given group/event name is already used by another event"), \
C(RETVAL_ON_PROBE, "$retval is not available on probe"), \
+ C(NO_RETVAL, "This function returns 'void' type"), \
C(BAD_STACK_NUM, "Invalid stack number"), \
C(BAD_ARG_NUM, "Invalid argument number"), \
C(BAD_VAR, "Invalid $-valiable specified"), \
@@ -456,7 +500,21 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NO_EVENT_INFO, "This requires both group and event name to attach"),\
C(BAD_ATTACH_EVENT, "Attached event does not exist"),\
C(BAD_ATTACH_ARG, "Attached event does not have this field"),\
- C(NO_EP_FILTER, "No filter rule after 'if'"),
+ C(NO_EP_FILTER, "No filter rule after 'if'"), \
+ C(NOSUP_BTFARG, "BTF is not available or not supported"), \
+ C(NO_BTFARG, "This variable is not found at this probe point"),\
+ C(NO_BTF_ENTRY, "No BTF entry for this probe point"), \
+ C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\
+ C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \
+ C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \
+ C(ARGS_2LONG, "$arg* failed because the argument list is too long"), \
+ C(ARGIDX_2BIG, "$argN index is too big"), \
+ C(NO_PTR_STRCT, "This is not a pointer to union/structure."), \
+ C(NOSUP_DAT_ARG, "Non pointer structure/union argument is not supported."),\
+ C(BAD_HYPHEN, "Failed to parse single hyphen. Forgot '>'?"), \
+ C(NO_BTF_FIELD, "This field is not found."), \
+ C(BAD_BTF_TID, "Failed to get BTF type info."),\
+ C(BAD_TYPE4STR, "This type does not fit for string."),
#undef C
#define C(a, b) TP_ERR_##a
@@ -480,3 +538,8 @@ void __trace_probe_log_err(int offset, int err);
#define trace_probe_log_err(offs, err) \
__trace_probe_log_err(offs, TP_ERR_##err)
+
+struct uprobe_dispatch_data {
+ struct trace_uprobe *tu;
+ unsigned long bp_addr;
+};
diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h
index c4e1d4c03a85..bb723eefd7b7 100644
--- a/kernel/trace/trace_probe_kernel.h
+++ b/kernel/trace/trace_probe_kernel.h
@@ -2,8 +2,6 @@
#ifndef __TRACE_PROBE_KERNEL_H_
#define __TRACE_PROBE_KERNEL_H_
-#define FAULT_STRING "(fault)"
-
/*
* This depends on trace_probe.h, but can not include it due to
* the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c.
@@ -15,16 +13,8 @@ static nokprobe_inline int
fetch_store_strlen_user(unsigned long addr)
{
const void __user *uaddr = (__force const void __user *)addr;
- int ret;
- ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
- /*
- * strnlen_user_nofault returns zero on fault, insert the
- * FAULT_STRING when that occurs.
- */
- if (ret <= 0)
- return strlen(FAULT_STRING) + 1;
- return ret;
+ return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
}
/* Return the length of string -- including null terminal byte */
@@ -44,18 +34,14 @@ fetch_store_strlen(unsigned long addr)
len++;
} while (c && ret == 0 && len < MAX_STRING_SIZE);
- /* For faults, return enough to hold the FAULT_STRING */
- return (ret < 0) ? strlen(FAULT_STRING) + 1 : len;
+ return (ret < 0) ? ret : len;
}
-static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len)
+static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base)
{
- if (ret >= 0) {
- *(u32 *)dest = make_data_loc(ret, __dest - base);
- } else {
- strscpy(__dest, FAULT_STRING, len);
- ret = strlen(__dest) + 1;
- }
+ if (ret < 0)
+ ret = 0;
+ *(u32 *)dest = make_data_loc(ret, __dest - base);
}
/*
@@ -76,7 +62,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
__dest = get_loc_data(dest, base);
ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
- set_data_loc(ret, dest, __dest, base, maxlen);
+ set_data_loc(ret, dest, __dest, base);
return ret;
}
@@ -107,7 +93,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
* probing.
*/
ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
- set_data_loc(ret, dest, __dest, base, maxlen);
+ set_data_loc(ret, dest, __dest, base);
return ret;
}
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 00707630788d..3935b347f874 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -156,11 +156,11 @@ stage3:
code++;
goto array;
case FETCH_OP_ST_USTRING:
- ret += fetch_store_strlen_user(val + code->offset);
+ ret = fetch_store_strlen_user(val + code->offset);
code++;
goto array;
case FETCH_OP_ST_SYMSTR:
- ret += fetch_store_symstrlen(val + code->offset);
+ ret = fetch_store_symstrlen(val + code->offset);
code++;
goto array;
default:
@@ -204,6 +204,8 @@ stage3:
array:
/* the last stage: Loop on array */
if (code->op == FETCH_OP_LP_ARRAY) {
+ if (ret < 0)
+ ret = 0;
total += ret;
if (++i < code->param) {
code = s3;
@@ -265,9 +267,7 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec,
if (unlikely(arg->dynamic))
*dl = make_data_loc(maxlen, dyndata - base);
ret = process_fetch_insn(arg->code, rec, dl, base);
- if (unlikely(ret < 0 && arg->dynamic)) {
- *dl = make_data_loc(0, dyndata - base);
- } else {
+ if (arg->dynamic && likely(ret > 0)) {
dyndata += ret;
maxlen -= ret;
}
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 330aee1c1a49..0469a04a355f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -168,6 +168,8 @@ static void wakeup_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
+ else
+ iter->private = NULL;
}
static void wakeup_trace_close(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index e5e299260d0c..bac06ee3b98b 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -131,6 +131,7 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
* trace_seq_vprintf - sequence printing of trace information
* @s: trace sequence descriptor
* @fmt: printf format string
+ * @args: Arguments for the format string
*
* The tracer may use either sequence operations or its own
* copy to user routines. To simplify formatting of a trace
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 942ddbdace4a..de753403cdaf 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -555,12 +555,15 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
struct syscall_trace_enter *rec)
{
struct syscall_tp_t {
- unsigned long long regs;
+ struct trace_entry ent;
unsigned long syscall_nr;
unsigned long args[SYSCALL_DEFINE_MAXARGS];
- } param;
+ } __aligned(8) param;
int i;
+ BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
+
+ /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
for (i = 0; i < sys_data->nb_args; i++)
@@ -657,11 +660,12 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
struct syscall_trace_exit *rec)
{
struct syscall_tp_t {
- unsigned long long regs;
+ struct trace_entry ent;
unsigned long syscall_nr;
unsigned long ret;
- } param;
+ } __aligned(8) param;
+ /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
param.ret = rec->ret;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8b92e34ff0c8..99c051de412a 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -88,11 +88,6 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
static int register_uprobe_event(struct trace_uprobe *tu);
static int unregister_uprobe_event(struct trace_uprobe *tu);
-struct uprobe_dispatch_data {
- struct trace_uprobe *tu;
- unsigned long bp_addr;
-};
-
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
static int uretprobe_dispatcher(struct uprobe_consumer *con,
unsigned long func, struct pt_regs *regs);
@@ -170,7 +165,8 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
*/
ret++;
*(u32 *)dest = make_data_loc(ret, (void *)dst - base);
- }
+ } else
+ *(u32 *)dest = make_data_loc(0, (void *)dst - base);
return ret;
}
@@ -686,10 +682,13 @@ static int __trace_uprobe_create(int argc, const char **argv)
/* parse arguments */
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ struct traceprobe_parse_context ctx = {
+ .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER,
+ };
+
trace_probe_log_set_index(i + 2);
- ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i],
- (is_return ? TPARG_FL_RETURN : 0) |
- TPARG_FL_USER);
+ ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx);
+ traceprobe_finish_parse(&ctx);
if (ret)
goto error;
}
@@ -1349,7 +1348,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
if (bpf_prog_array_valid(call)) {
u32 ret;
- ret = bpf_prog_run_array_sleepable(call->prog_array, regs, bpf_prog_run);
+ ret = bpf_prog_run_array_uprobe(call->prog_array, regs, bpf_prog_run);
if (!ret)
return;
}
@@ -1415,7 +1414,7 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
const char **filename, u64 *probe_offset,
- bool perf_type_tracepoint)
+ u64 *probe_addr, bool perf_type_tracepoint)
{
const char *pevent = trace_event_name(event->tp_event);
const char *group = event->tp_event->class->system;
@@ -1432,6 +1431,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
: BPF_FD_TYPE_UPROBE;
*filename = tu->filename;
*probe_offset = tu->offset;
+ *probe_addr = 0;
return 0;
}
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 2c765ee2a4d4..99c37eeebc16 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -272,10 +272,6 @@ extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
-extern void tracing_map_set_field_descr(struct tracing_map *map,
- unsigned int i,
- unsigned int key_offset,
- tracing_map_cmp_fn_t cmp_fn);
extern int
tracing_map_sort_entries(struct tracing_map *map,
struct tracing_map_sort_key *sort_keys,
diff --git a/kernel/ucount.c b/kernel/ucount.c
index ee8e57fd6f90..4aa6166cb856 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -104,7 +104,8 @@ bool setup_userns_sysctls(struct user_namespace *ns)
for (i = 0; i < UCOUNT_COUNTS; i++) {
tbl[i].data = &ns->ucount_max[i];
}
- ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl);
+ ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl,
+ ARRAY_SIZE(user_table));
}
if (!ns->sysctls) {
kfree(tbl);
@@ -364,7 +365,7 @@ static __init int user_namespace_sysctl_init(void)
* default set so that registrations in the child sets work
* properly.
*/
- user_header = register_sysctl("user", empty);
+ user_header = register_sysctl_sz("user", empty, 0);
kmemleak_ignore(user_header);
BUG_ON(!user_header);
BUG_ON(!setup_userns_sysctls(&init_user_ns));
diff --git a/kernel/umh.c b/kernel/umh.c
index 60aa9e764a38..1b13c5d34624 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -494,6 +494,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
}
EXPORT_SYMBOL(call_usermodehelper);
+#if defined(CONFIG_SYSCTL)
static int proc_cap_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -544,7 +545,7 @@ static int proc_cap_handler(struct ctl_table *table, int write,
return 0;
}
-struct ctl_table usermodehelper_table[] = {
+static struct ctl_table usermodehelper_table[] = {
{
.procname = "bset",
.data = &usermodehelper_bset,
@@ -561,3 +562,11 @@ struct ctl_table usermodehelper_table[] = {
},
{ }
};
+
+static int __init init_umh_sysctls(void)
+{
+ register_sysctl_init("kernel/usermodehelper", usermodehelper_table);
+ return 0;
+}
+early_initcall(init_umh_sysctls);
+#endif /* CONFIG_SYSCTL */
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index e91cb4c2833f..d0b6b390ee42 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -42,7 +42,7 @@ MODULE_AUTHOR("Red Hat, Inc.");
static inline bool lock_wqueue(struct watch_queue *wqueue)
{
spin_lock_bh(&wqueue->lock);
- if (unlikely(wqueue->defunct)) {
+ if (unlikely(!wqueue->pipe)) {
spin_unlock_bh(&wqueue->lock);
return false;
}
@@ -104,9 +104,6 @@ static bool post_one_notification(struct watch_queue *wqueue,
unsigned int head, tail, mask, note, offset, len;
bool done = false;
- if (!pipe)
- return false;
-
spin_lock_irq(&pipe->rd_wait.lock);
mask = pipe->ring_size - 1;
@@ -603,8 +600,11 @@ void watch_queue_clear(struct watch_queue *wqueue)
rcu_read_lock();
spin_lock_bh(&wqueue->lock);
- /* Prevent new notifications from being stored. */
- wqueue->defunct = true;
+ /*
+ * This pipe can be freed by callers like free_pipe_info().
+ * Removing this reference also prevents new notifications.
+ */
+ wqueue->pipe = NULL;
while (!hlist_empty(&wqueue->watches)) {
watch = hlist_entry(wqueue->watches.first, struct watch, queue_node);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8e61f21e7e33..d145305d95fe 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,20 +29,18 @@
static DEFINE_MUTEX(watchdog_mutex);
-#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
-# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED)
-# define NMI_WATCHDOG_DEFAULT 1
+#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HARDLOCKUP_DETECTOR_SPARC64)
+# define WATCHDOG_HARDLOCKUP_DEFAULT 1
#else
-# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED)
-# define NMI_WATCHDOG_DEFAULT 0
+# define WATCHDOG_HARDLOCKUP_DEFAULT 0
#endif
unsigned long __read_mostly watchdog_enabled;
int __read_mostly watchdog_user_enabled = 1;
-int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
-int __read_mostly soft_watchdog_user_enabled = 1;
+static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
+static int __read_mostly watchdog_softlockup_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
-static int __read_mostly nmi_watchdog_available;
+static int __read_mostly watchdog_hardlockup_available;
struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -68,7 +66,7 @@ unsigned int __read_mostly hardlockup_panic =
*/
void __init hardlockup_detector_disable(void)
{
- nmi_watchdog_user_enabled = 0;
+ watchdog_hardlockup_user_enabled = 0;
}
static int __init hardlockup_panic_setup(char *str)
@@ -78,54 +76,158 @@ static int __init hardlockup_panic_setup(char *str)
else if (!strncmp(str, "nopanic", 7))
hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
- nmi_watchdog_user_enabled = 0;
+ watchdog_hardlockup_user_enabled = 0;
else if (!strncmp(str, "1", 1))
- nmi_watchdog_user_enabled = 1;
+ watchdog_hardlockup_user_enabled = 1;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-/*
- * These functions can be overridden if an architecture implements its
- * own hardlockup detector.
- *
- * watchdog_nmi_enable/disable can be implemented to start and stop when
- * softlockup watchdog start and stop. The arch must select the
- * SOFTLOCKUP_DETECTOR Kconfig.
- */
-int __weak watchdog_nmi_enable(unsigned int cpu)
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER)
+
+static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
+static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
+static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched);
+static unsigned long watchdog_hardlockup_all_cpu_dumped;
+
+notrace void arch_touch_nmi_watchdog(void)
{
- hardlockup_detector_perf_enable();
- return 0;
+ /*
+ * Using __raw here because some code paths have
+ * preemption enabled. If preemption is enabled
+ * then interrupts should be enabled too, in which
+ * case we shouldn't have to worry about the watchdog
+ * going off.
+ */
+ raw_cpu_write(watchdog_hardlockup_touched, true);
+}
+EXPORT_SYMBOL(arch_touch_nmi_watchdog);
+
+void watchdog_hardlockup_touch_cpu(unsigned int cpu)
+{
+ per_cpu(watchdog_hardlockup_touched, cpu) = true;
}
-void __weak watchdog_nmi_disable(unsigned int cpu)
+static bool is_hardlockup(unsigned int cpu)
{
- hardlockup_detector_perf_disable();
+ int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
+
+ if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
+ return true;
+
+ /*
+ * NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE
+ * for hrtimer_interrupts_saved. hrtimer_interrupts_saved is
+ * written/read by a single CPU.
+ */
+ per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+
+ return false;
+}
+
+static void watchdog_hardlockup_kick(void)
+{
+ int new_interrupts;
+
+ new_interrupts = atomic_inc_return(this_cpu_ptr(&hrtimer_interrupts));
+ watchdog_buddy_check_hardlockup(new_interrupts);
+}
+
+void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
+{
+ if (per_cpu(watchdog_hardlockup_touched, cpu)) {
+ per_cpu(watchdog_hardlockup_touched, cpu) = false;
+ return;
+ }
+
+ /*
+ * Check for a hardlockup by making sure the CPU's timer
+ * interrupt is incrementing. The timer interrupt should have
+ * fired multiple times before we overflow'd. If it hasn't
+ * then this is a good indication the cpu is stuck
+ */
+ if (is_hardlockup(cpu)) {
+ unsigned int this_cpu = smp_processor_id();
+
+ /* Only print hardlockups once. */
+ if (per_cpu(watchdog_hardlockup_warned, cpu))
+ return;
+
+ pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
+ print_modules();
+ print_irqtrace_events(current);
+ if (cpu == this_cpu) {
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+ } else {
+ trigger_single_cpu_backtrace(cpu);
+ }
+
+ /*
+ * Perform multi-CPU dump only once to avoid multiple
+ * hardlockups generating interleaving traces
+ */
+ if (sysctl_hardlockup_all_cpu_backtrace &&
+ !test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped))
+ trigger_allbutcpu_cpu_backtrace(cpu);
+
+ if (hardlockup_panic)
+ nmi_panic(regs, "Hard LOCKUP");
+
+ per_cpu(watchdog_hardlockup_warned, cpu) = true;
+ } else {
+ per_cpu(watchdog_hardlockup_warned, cpu) = false;
+ }
}
-/* Return 0, if a NMI watchdog is available. Error code otherwise */
-int __weak __init watchdog_nmi_probe(void)
+#else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */
+
+static inline void watchdog_hardlockup_kick(void) { }
+
+#endif /* !CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */
+
+/*
+ * These functions can be overridden based on the configured hardlockdup detector.
+ *
+ * watchdog_hardlockup_enable/disable can be implemented to start and stop when
+ * softlockup watchdog start and stop. The detector must select the
+ * SOFTLOCKUP_DETECTOR Kconfig.
+ */
+void __weak watchdog_hardlockup_enable(unsigned int cpu) { }
+
+void __weak watchdog_hardlockup_disable(unsigned int cpu) { }
+
+/*
+ * Watchdog-detector specific API.
+ *
+ * Return 0 when hardlockup watchdog is available, negative value otherwise.
+ * Note that the negative value means that a delayed probe might
+ * succeed later.
+ */
+int __weak __init watchdog_hardlockup_probe(void)
{
- return hardlockup_detector_perf_init();
+ return -ENODEV;
}
/**
- * watchdog_nmi_stop - Stop the watchdog for reconfiguration
+ * watchdog_hardlockup_stop - Stop the watchdog for reconfiguration
*
* The reconfiguration steps are:
- * watchdog_nmi_stop();
+ * watchdog_hardlockup_stop();
* update_variables();
- * watchdog_nmi_start();
+ * watchdog_hardlockup_start();
*/
-void __weak watchdog_nmi_stop(void) { }
+void __weak watchdog_hardlockup_stop(void) { }
/**
- * watchdog_nmi_start - Start the watchdog after reconfiguration
+ * watchdog_hardlockup_start - Start the watchdog after reconfiguration
*
- * Counterpart to watchdog_nmi_stop().
+ * Counterpart to watchdog_hardlockup_stop().
*
* The following variables have been updated in update_variables() and
* contain the currently valid configuration:
@@ -133,23 +235,23 @@ void __weak watchdog_nmi_stop(void) { }
* - watchdog_thresh
* - watchdog_cpumask
*/
-void __weak watchdog_nmi_start(void) { }
+void __weak watchdog_hardlockup_start(void) { }
/**
* lockup_detector_update_enable - Update the sysctl enable bit
*
- * Caller needs to make sure that the NMI/perf watchdogs are off, so this
- * can't race with watchdog_nmi_disable().
+ * Caller needs to make sure that the hard watchdogs are off, so this
+ * can't race with watchdog_hardlockup_disable().
*/
static void lockup_detector_update_enable(void)
{
watchdog_enabled = 0;
if (!watchdog_user_enabled)
return;
- if (nmi_watchdog_available && nmi_watchdog_user_enabled)
- watchdog_enabled |= NMI_WATCHDOG_ENABLED;
- if (soft_watchdog_user_enabled)
- watchdog_enabled |= SOFT_WATCHDOG_ENABLED;
+ if (watchdog_hardlockup_available && watchdog_hardlockup_user_enabled)
+ watchdog_enabled |= WATCHDOG_HARDLOCKUP_ENABLED;
+ if (watchdog_softlockup_user_enabled)
+ watchdog_enabled |= WATCHDOG_SOFTOCKUP_ENABLED;
}
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
@@ -179,8 +281,6 @@ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(unsigned long, watchdog_report_ts);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static unsigned long soft_lockup_nmi_warn;
static int __init nowatchdog_setup(char *str)
@@ -192,7 +292,7 @@ __setup("nowatchdog", nowatchdog_setup);
static int __init nosoftlockup_setup(char *str)
{
- soft_watchdog_user_enabled = 0;
+ watchdog_softlockup_user_enabled = 0;
return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
@@ -306,7 +406,7 @@ static int is_softlockup(unsigned long touch_ts,
unsigned long period_ts,
unsigned long now)
{
- if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
+ if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) {
/* Warn about unreasonable delays. */
if (time_after(now, period_ts + get_softlockup_thresh()))
return now - touch_ts;
@@ -315,22 +415,6 @@ static int is_softlockup(unsigned long touch_ts,
}
/* watchdog detector functions */
-bool is_hardlockup(void)
-{
- unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
-
- if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
- return true;
-
- __this_cpu_write(hrtimer_interrupts_saved, hrint);
- return false;
-}
-
-static void watchdog_interrupt_count(void)
-{
- __this_cpu_inc(hrtimer_interrupts);
-}
-
static DEFINE_PER_CPU(struct completion, softlockup_completion);
static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
@@ -361,8 +445,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
if (!watchdog_enabled)
return HRTIMER_NORESTART;
- /* kick the hardlockup detector */
- watchdog_interrupt_count();
+ watchdog_hardlockup_kick();
/* kick the softlockup detector */
if (completion_done(this_cpu_ptr(&softlockup_completion))) {
@@ -435,7 +518,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
dump_stack();
if (softlockup_all_cpu_backtrace) {
- trigger_allbutself_cpu_backtrace();
+ trigger_allbutcpu_cpu_backtrace(smp_processor_id());
clear_bit_unlock(0, &soft_lockup_nmi_warn);
}
@@ -458,7 +541,7 @@ static void watchdog_enable(unsigned int cpu)
complete(done);
/*
- * Start the timer first to prevent the NMI watchdog triggering
+ * Start the timer first to prevent the hardlockup watchdog triggering
* before the timer has a chance to fire.
*/
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
@@ -468,9 +551,9 @@ static void watchdog_enable(unsigned int cpu)
/* Initialize timestamp */
update_touch_ts();
- /* Enable the perf event */
- if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
- watchdog_nmi_enable(cpu);
+ /* Enable the hardlockup detector */
+ if (watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)
+ watchdog_hardlockup_enable(cpu);
}
static void watchdog_disable(unsigned int cpu)
@@ -480,11 +563,11 @@ static void watchdog_disable(unsigned int cpu)
WARN_ON_ONCE(cpu != smp_processor_id());
/*
- * Disable the perf event first. That prevents that a large delay
- * between disabling the timer and disabling the perf event causes
- * the perf NMI to detect a false positive.
+ * Disable the hardlockup detector first. That prevents that a large
+ * delay between disabling the timer and disabling the hardlockup
+ * detector causes a false positive.
*/
- watchdog_nmi_disable(cpu);
+ watchdog_hardlockup_disable(cpu);
hrtimer_cancel(hrtimer);
wait_for_completion(this_cpu_ptr(&softlockup_completion));
}
@@ -540,7 +623,7 @@ int lockup_detector_offline_cpu(unsigned int cpu)
static void __lockup_detector_reconfigure(void)
{
cpus_read_lock();
- watchdog_nmi_stop();
+ watchdog_hardlockup_stop();
softlockup_stop_all();
set_sample_period();
@@ -548,7 +631,7 @@ static void __lockup_detector_reconfigure(void)
if (watchdog_enabled && watchdog_thresh)
softlockup_start_all();
- watchdog_nmi_start();
+ watchdog_hardlockup_start();
cpus_read_unlock();
/*
* Must be called outside the cpus locked section to prevent
@@ -589,9 +672,9 @@ static __init void lockup_detector_setup(void)
static void __lockup_detector_reconfigure(void)
{
cpus_read_lock();
- watchdog_nmi_stop();
+ watchdog_hardlockup_stop();
lockup_detector_update_enable();
- watchdog_nmi_start();
+ watchdog_hardlockup_start();
cpus_read_unlock();
}
void lockup_detector_reconfigure(void)
@@ -646,14 +729,14 @@ static void proc_watchdog_update(void)
/*
* common function for watchdog, nmi_watchdog and soft_watchdog parameter
*
- * caller | table->data points to | 'which'
- * -------------------|----------------------------|--------------------------
- * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED |
- * | | SOFT_WATCHDOG_ENABLED
- * -------------------|----------------------------|--------------------------
- * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED
- * -------------------|----------------------------|--------------------------
- * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
+ * caller | table->data points to | 'which'
+ * -------------------|----------------------------------|-------------------------------
+ * proc_watchdog | watchdog_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED |
+ * | | WATCHDOG_SOFTOCKUP_ENABLED
+ * -------------------|----------------------------------|-------------------------------
+ * proc_nmi_watchdog | watchdog_hardlockup_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED
+ * -------------------|----------------------------------|-------------------------------
+ * proc_soft_watchdog | watchdog_softlockup_user_enabled | WATCHDOG_SOFTOCKUP_ENABLED
*/
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
@@ -685,7 +768,8 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
int proc_watchdog(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
+ return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED |
+ WATCHDOG_SOFTOCKUP_ENABLED,
table, write, buffer, lenp, ppos);
}
@@ -695,9 +779,9 @@ int proc_watchdog(struct ctl_table *table, int write,
int proc_nmi_watchdog(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- if (!nmi_watchdog_available && write)
+ if (!watchdog_hardlockup_available && write)
return -ENOTSUPP;
- return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
+ return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED,
table, write, buffer, lenp, ppos);
}
@@ -707,7 +791,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write,
int proc_soft_watchdog(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- return proc_watchdog_common(SOFT_WATCHDOG_ENABLED,
+ return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED,
table, write, buffer, lenp, ppos);
}
@@ -774,15 +858,6 @@ static struct ctl_table watchdog_sysctls[] = {
.extra2 = (void *)&sixty,
},
{
- .procname = "nmi_watchdog",
- .data = &nmi_watchdog_user_enabled,
- .maxlen = sizeof(int),
- .mode = NMI_WATCHDOG_SYSCTL_PERM,
- .proc_handler = proc_nmi_watchdog,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
.procname = "watchdog_cpumask",
.data = &watchdog_cpumask_bits,
.maxlen = NR_CPUS,
@@ -792,7 +867,7 @@ static struct ctl_table watchdog_sysctls[] = {
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
{
.procname = "soft_watchdog",
- .data = &soft_watchdog_user_enabled,
+ .data = &watchdog_softlockup_user_enabled,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_soft_watchdog,
@@ -845,14 +920,90 @@ static struct ctl_table watchdog_sysctls[] = {
{}
};
+static struct ctl_table watchdog_hardlockup_sysctl[] = {
+ {
+ .procname = "nmi_watchdog",
+ .data = &watchdog_hardlockup_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_nmi_watchdog,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
static void __init watchdog_sysctl_init(void)
{
register_sysctl_init("kernel", watchdog_sysctls);
+
+ if (watchdog_hardlockup_available)
+ watchdog_hardlockup_sysctl[0].mode = 0644;
+ register_sysctl_init("kernel", watchdog_hardlockup_sysctl);
}
+
#else
#define watchdog_sysctl_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
+static void __init lockup_detector_delay_init(struct work_struct *work);
+static bool allow_lockup_detector_init_retry __initdata;
+
+static struct work_struct detector_work __initdata =
+ __WORK_INITIALIZER(detector_work, lockup_detector_delay_init);
+
+static void __init lockup_detector_delay_init(struct work_struct *work)
+{
+ int ret;
+
+ ret = watchdog_hardlockup_probe();
+ if (ret) {
+ pr_info("Delayed init of the lockup detector failed: %d\n", ret);
+ pr_info("Hard watchdog permanently disabled\n");
+ return;
+ }
+
+ allow_lockup_detector_init_retry = false;
+
+ watchdog_hardlockup_available = true;
+ lockup_detector_setup();
+}
+
+/*
+ * lockup_detector_retry_init - retry init lockup detector if possible.
+ *
+ * Retry hardlockup detector init. It is useful when it requires some
+ * functionality that has to be initialized later on a particular
+ * platform.
+ */
+void __init lockup_detector_retry_init(void)
+{
+ /* Must be called before late init calls */
+ if (!allow_lockup_detector_init_retry)
+ return;
+
+ schedule_work(&detector_work);
+}
+
+/*
+ * Ensure that optional delayed hardlockup init is proceed before
+ * the init code and memory is freed.
+ */
+static int __init lockup_detector_check(void)
+{
+ /* Prevent any later retry. */
+ allow_lockup_detector_init_retry = false;
+
+ /* Make sure no work is pending. */
+ flush_work(&detector_work);
+
+ watchdog_sysctl_init();
+
+ return 0;
+
+}
+late_initcall_sync(lockup_detector_check);
+
void __init lockup_detector_init(void)
{
if (tick_nohz_full_enabled())
@@ -861,8 +1012,10 @@ void __init lockup_detector_init(void)
cpumask_copy(&watchdog_cpumask,
housekeeping_cpumask(HK_TYPE_TIMER));
- if (!watchdog_nmi_probe())
- nmi_watchdog_available = true;
+ if (!watchdog_hardlockup_probe())
+ watchdog_hardlockup_available = true;
+ else
+ allow_lockup_detector_init_retry = true;
+
lockup_detector_setup();
- watchdog_sysctl_init();
}
diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c
new file mode 100644
index 000000000000..34dbfe091f4b
--- /dev/null
+++ b/kernel/watchdog_buddy.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/nmi.h>
+#include <linux/percpu-defs.h>
+
+static cpumask_t __read_mostly watchdog_cpus;
+
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+ unsigned int next_cpu;
+
+ next_cpu = cpumask_next(cpu, &watchdog_cpus);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(&watchdog_cpus);
+
+ if (next_cpu == cpu)
+ return nr_cpu_ids;
+
+ return next_cpu;
+}
+
+int __init watchdog_hardlockup_probe(void)
+{
+ return 0;
+}
+
+void watchdog_hardlockup_enable(unsigned int cpu)
+{
+ unsigned int next_cpu;
+
+ /*
+ * The new CPU will be marked online before the hrtimer interrupt
+ * gets a chance to run on it. If another CPU tests for a
+ * hardlockup on the new CPU before it has run its the hrtimer
+ * interrupt, it will get a false positive. Touch the watchdog on
+ * the new CPU to delay the check for at least 3 sampling periods
+ * to guarantee one hrtimer has run on the new CPU.
+ */
+ watchdog_hardlockup_touch_cpu(cpu);
+
+ /*
+ * We are going to check the next CPU. Our watchdog_hrtimer
+ * need not be zero if the CPU has already been online earlier.
+ * Touch the watchdog on the next CPU to avoid false positive
+ * if we try to check it in less then 3 interrupts.
+ */
+ next_cpu = watchdog_next_cpu(cpu);
+ if (next_cpu < nr_cpu_ids)
+ watchdog_hardlockup_touch_cpu(next_cpu);
+
+ /*
+ * Makes sure that watchdog is touched on this CPU before
+ * other CPUs could see it in watchdog_cpus. The counter
+ * part is in watchdog_buddy_check_hardlockup().
+ */
+ smp_wmb();
+
+ cpumask_set_cpu(cpu, &watchdog_cpus);
+}
+
+void watchdog_hardlockup_disable(unsigned int cpu)
+{
+ unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+ /*
+ * Offlining this CPU will cause the CPU before this one to start
+ * checking the one after this one. If this CPU just finished checking
+ * the next CPU and updating hrtimer_interrupts_saved, and then the
+ * previous CPU checks it within one sample period, it will trigger a
+ * false positive. Touch the watchdog on the next CPU to prevent it.
+ */
+ if (next_cpu < nr_cpu_ids)
+ watchdog_hardlockup_touch_cpu(next_cpu);
+
+ /*
+ * Makes sure that watchdog is touched on the next CPU before
+ * this CPU disappear in watchdog_cpus. The counter part is in
+ * watchdog_buddy_check_hardlockup().
+ */
+ smp_wmb();
+
+ cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+
+void watchdog_buddy_check_hardlockup(int hrtimer_interrupts)
+{
+ unsigned int next_cpu;
+
+ /*
+ * Test for hardlockups every 3 samples. The sample period is
+ * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
+ * watchdog_thresh (over by 20%).
+ */
+ if (hrtimer_interrupts % 3 != 0)
+ return;
+
+ /* check for a hardlockup on the next CPU */
+ next_cpu = watchdog_next_cpu(smp_processor_id());
+ if (next_cpu >= nr_cpu_ids)
+ return;
+
+ /*
+ * Make sure that the watchdog was touched on next CPU when
+ * watchdog_next_cpu() returned another one because of
+ * a change in watchdog_hardlockup_enable()/disable().
+ */
+ smp_rmb();
+
+ watchdog_hardlockup_check(next_cpu, NULL);
+}
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_perf.c
index 247bf0b1582c..8ea00c4a24b2 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_perf.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Detect hard lockups on a system
+ * Detect hard lockups on a system using perf
*
* started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
*
@@ -20,28 +20,12 @@
#include <asm/irq_regs.h>
#include <linux/perf_event.h>
-static DEFINE_PER_CPU(bool, hard_watchdog_warn);
-static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
static DEFINE_PER_CPU(struct perf_event *, dead_event);
static struct cpumask dead_events_mask;
-static unsigned long hardlockup_allcpu_dumped;
static atomic_t watchdog_cpus = ATOMIC_INIT(0);
-notrace void arch_touch_nmi_watchdog(void)
-{
- /*
- * Using __raw here because some code paths have
- * preemption enabled. If preemption is enabled
- * then interrupts should be enabled too, in which
- * case we shouldn't have to worry about the watchdog
- * going off.
- */
- raw_cpu_write(watchdog_nmi_touch, true);
-}
-EXPORT_SYMBOL(arch_touch_nmi_watchdog);
-
#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
static DEFINE_PER_CPU(ktime_t, last_timestamp);
static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
@@ -114,61 +98,24 @@ static void watchdog_overflow_callback(struct perf_event *event,
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
- if (__this_cpu_read(watchdog_nmi_touch) == true) {
- __this_cpu_write(watchdog_nmi_touch, false);
- return;
- }
-
if (!watchdog_check_timestamp())
return;
- /* check for a hardlockup
- * This is done by making sure our timer interrupt
- * is incrementing. The timer interrupt should have
- * fired multiple times before we overflow'd. If it hasn't
- * then this is a good indication the cpu is stuck
- */
- if (is_hardlockup()) {
- int this_cpu = smp_processor_id();
-
- /* only print hardlockups once */
- if (__this_cpu_read(hard_watchdog_warn) == true)
- return;
-
- pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
- this_cpu);
- print_modules();
- print_irqtrace_events(current);
- if (regs)
- show_regs(regs);
- else
- dump_stack();
-
- /*
- * Perform all-CPU dump only once to avoid multiple hardlockups
- * generating interleaving traces
- */
- if (sysctl_hardlockup_all_cpu_backtrace &&
- !test_and_set_bit(0, &hardlockup_allcpu_dumped))
- trigger_allbutself_cpu_backtrace();
-
- if (hardlockup_panic)
- nmi_panic(regs, "Hard LOCKUP");
-
- __this_cpu_write(hard_watchdog_warn, true);
- return;
- }
-
- __this_cpu_write(hard_watchdog_warn, false);
- return;
+ watchdog_hardlockup_check(smp_processor_id(), regs);
}
static int hardlockup_detector_event_create(void)
{
- unsigned int cpu = smp_processor_id();
+ unsigned int cpu;
struct perf_event_attr *wd_attr;
struct perf_event *evt;
+ /*
+ * Preemption is not disabled because memory will be allocated.
+ * Ensure CPU-locality by calling this in per-CPU kthread.
+ */
+ WARN_ON(!is_percpu_thread());
+ cpu = raw_smp_processor_id();
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
@@ -185,10 +132,14 @@ static int hardlockup_detector_event_create(void)
}
/**
- * hardlockup_detector_perf_enable - Enable the local event
+ * watchdog_hardlockup_enable - Enable the local event
+ *
+ * @cpu: The CPU to enable hard lockup on.
*/
-void hardlockup_detector_perf_enable(void)
+void watchdog_hardlockup_enable(unsigned int cpu)
{
+ WARN_ON_ONCE(cpu != smp_processor_id());
+
if (hardlockup_detector_event_create())
return;
@@ -200,12 +151,16 @@ void hardlockup_detector_perf_enable(void)
}
/**
- * hardlockup_detector_perf_disable - Disable the local event
+ * watchdog_hardlockup_disable - Disable the local event
+ *
+ * @cpu: The CPU to enable hard lockup on.
*/
-void hardlockup_detector_perf_disable(void)
+void watchdog_hardlockup_disable(unsigned int cpu)
{
struct perf_event *event = this_cpu_read(watchdog_ev);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+
if (event) {
perf_event_disable(event);
this_cpu_write(watchdog_ev, NULL);
@@ -268,7 +223,7 @@ void __init hardlockup_detector_perf_restart(void)
lockdep_assert_cpus_held();
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED))
return;
for_each_online_cpu(cpu) {
@@ -279,12 +234,22 @@ void __init hardlockup_detector_perf_restart(void)
}
}
+bool __weak __init arch_perf_nmi_is_available(void)
+{
+ return true;
+}
+
/**
- * hardlockup_detector_perf_init - Probe whether NMI event is available at all
+ * watchdog_hardlockup_probe - Probe whether NMI event is available at all
*/
-int __init hardlockup_detector_perf_init(void)
+int __init watchdog_hardlockup_probe(void)
{
- int ret = hardlockup_detector_event_create();
+ int ret;
+
+ if (!arch_perf_nmi_is_available())
+ return -ENODEV;
+
+ ret = hardlockup_detector_event_create();
if (ret) {
pr_info("Perf NMI watchdog permanently disabled\n");
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c913e333cce8..c85825e17df8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -52,6 +52,7 @@
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>
+#include <linux/delay.h>
#include "workqueue_internal.h"
@@ -121,10 +122,11 @@ enum {
*
* L: pool->lock protected. Access with pool->lock held.
*
- * X: During normal operation, modification requires pool->lock and should
- * be done only from local cpu. Either disabling preemption on local
- * cpu or grabbing pool->lock is enough for read access. If
- * POOL_DISASSOCIATED is set, it's identical to L.
+ * K: Only modified by worker while holding pool->lock. Can be safely read by
+ * self, while holding pool->lock or from IRQ context if %current is the
+ * kworker.
+ *
+ * S: Only modified by worker self.
*
* A: wq_pool_attach_mutex protected.
*
@@ -153,7 +155,7 @@ struct worker_pool {
int cpu; /* I: the associated cpu */
int node; /* I: the associated node ID */
int id; /* I: pool ID */
- unsigned int flags; /* X: flags */
+ unsigned int flags; /* L: flags */
unsigned long watchdog_ts; /* L: watchdog timestamp */
bool cpu_stall; /* WD: stalled cpu bound pool */
@@ -200,6 +202,23 @@ struct worker_pool {
};
/*
+ * Per-pool_workqueue statistics. These can be monitored using
+ * tools/workqueue/wq_monitor.py.
+ */
+enum pool_workqueue_stats {
+ PWQ_STAT_STARTED, /* work items started execution */
+ PWQ_STAT_COMPLETED, /* work items completed execution */
+ PWQ_STAT_CPU_TIME, /* total CPU time consumed */
+ PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
+ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */
+ PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */
+ PWQ_STAT_MAYDAY, /* maydays to rescuer */
+ PWQ_STAT_RESCUED, /* linked work items executed by rescuer */
+
+ PWQ_NR_STATS,
+};
+
+/*
* The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
* of work_struct->data are used for flags and the remaining high bits
* point to the pwq; thus, pwqs need to be aligned at two's power of the
@@ -236,13 +255,15 @@ struct pool_workqueue {
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
+ u64 stats[PWQ_NR_STATS];
+
/*
- * Release of unbound pwq is punted to system_wq. See put_pwq()
- * and pwq_unbound_release_workfn() for details. pool_workqueue
- * itself is also RCU protected so that the first pwq can be
- * determined without grabbing wq->mutex.
+ * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
+ * and pwq_release_workfn() for details. pool_workqueue itself is also
+ * RCU protected so that the first pwq can be determined without
+ * grabbing wq->mutex.
*/
- struct work_struct unbound_release_work;
+ struct kthread_work release_work;
struct rcu_head rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);
@@ -301,17 +322,43 @@ struct workqueue_struct {
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
- struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
- struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
+ struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
};
static struct kmem_cache *pwq_cache;
-static cpumask_var_t *wq_numa_possible_cpumask;
- /* possible CPUs of each node */
+/*
+ * Each pod type describes how CPUs should be grouped for unbound workqueues.
+ * See the comment above workqueue_attrs->affn_scope.
+ */
+struct wq_pod_type {
+ int nr_pods; /* number of pods */
+ cpumask_var_t *pod_cpus; /* pod -> cpus */
+ int *pod_node; /* pod -> node */
+ int *cpu_pod; /* cpu -> pod */
+};
+
+static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
+static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;
-static bool wq_disable_numa;
-module_param_named(disable_numa, wq_disable_numa, bool, 0444);
+static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
+ [WQ_AFFN_DFL] = "default",
+ [WQ_AFFN_CPU] = "cpu",
+ [WQ_AFFN_SMT] = "smt",
+ [WQ_AFFN_CACHE] = "cache",
+ [WQ_AFFN_NUMA] = "numa",
+ [WQ_AFFN_SYSTEM] = "system",
+};
+
+/*
+ * Per-cpu work items which run for longer than the following threshold are
+ * automatically considered CPU intensive and excluded from concurrency
+ * management to prevent them from noticeably delaying other per-cpu work items.
+ * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
+ * The actual value is initialized in wq_cpu_intensive_thresh_init().
+ */
+static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
+module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
@@ -319,10 +366,8 @@ module_param_named(power_efficient, wq_power_efficient, bool, 0444);
static bool wq_online; /* can kworkers be created yet? */
-static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
-
-/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
-static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
+/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
+static struct workqueue_attrs *wq_update_pod_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
@@ -336,6 +381,9 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */
/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
+/* for further constrain wq_unbound_cpumask by cmdline parameter*/
+static struct cpumask wq_cmdline_cpumask __initdata;
+
/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);
@@ -365,6 +413,13 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
+/*
+ * I: kthread_worker to release pwq's. pwq release needs to be bounced to a
+ * process context while holding a pool lock. Bounce to a dedicated kthread
+ * worker to avoid A-A deadlocks.
+ */
+static struct kthread_worker *pwq_release_worker;
+
struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -571,35 +626,6 @@ static int worker_pool_assign_id(struct worker_pool *pool)
return ret;
}
-/**
- * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
- * @wq: the target workqueue
- * @node: the node ID
- *
- * This must be called with any of wq_pool_mutex, wq->mutex or RCU
- * read locked.
- * If the pwq needs to be used beyond the locking in effect, the caller is
- * responsible for guaranteeing that the pwq stays online.
- *
- * Return: The unbound pool_workqueue for @node.
- */
-static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
- int node)
-{
- assert_rcu_or_wq_mutex_or_pool_mutex(wq);
-
- /*
- * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
- * delayed item is pending. The plan is to keep CPU -> NODE
- * mapping valid and stable across CPU on/offlines. Once that
- * happens, this workaround can be removed.
- */
- if (unlikely(node == NUMA_NO_NODE))
- return wq->dfl_pwq;
-
- return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
-}
-
static unsigned int work_color_to_flags(int color)
{
return color << WORK_STRUCT_COLOR_SHIFT;
@@ -790,11 +816,6 @@ static bool work_is_canceling(struct work_struct *work)
* they're being called with pool->lock held.
*/
-static bool __need_more_worker(struct worker_pool *pool)
-{
- return !pool->nr_running;
-}
-
/*
* Need to wake up a worker? Called from anything but currently
* running workers.
@@ -805,7 +826,7 @@ static bool __need_more_worker(struct worker_pool *pool)
*/
static bool need_more_worker(struct worker_pool *pool)
{
- return !list_empty(&pool->worklist) && __need_more_worker(pool);
+ return !list_empty(&pool->worklist) && !pool->nr_running;
}
/* Can I start working? Called from busy but !running workers. */
@@ -836,153 +857,18 @@ static bool too_many_workers(struct worker_pool *pool)
return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
-/*
- * Wake up functions.
- */
-
-/* Return the first idle worker. Called with pool->lock held. */
-static struct worker *first_idle_worker(struct worker_pool *pool)
-{
- if (unlikely(list_empty(&pool->idle_list)))
- return NULL;
-
- return list_first_entry(&pool->idle_list, struct worker, entry);
-}
-
-/**
- * wake_up_worker - wake up an idle worker
- * @pool: worker pool to wake worker from
- *
- * Wake up the first idle worker of @pool.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock).
- */
-static void wake_up_worker(struct worker_pool *pool)
-{
- struct worker *worker = first_idle_worker(pool);
-
- if (likely(worker))
- wake_up_process(worker->task);
-}
-
-/**
- * wq_worker_running - a worker is running again
- * @task: task waking up
- *
- * This function is called when a worker returns from schedule()
- */
-void wq_worker_running(struct task_struct *task)
-{
- struct worker *worker = kthread_data(task);
-
- if (!worker->sleeping)
- return;
-
- /*
- * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
- * and the nr_running increment below, we may ruin the nr_running reset
- * and leave with an unexpected pool->nr_running == 1 on the newly unbound
- * pool. Protect against such race.
- */
- preempt_disable();
- if (!(worker->flags & WORKER_NOT_RUNNING))
- worker->pool->nr_running++;
- preempt_enable();
- worker->sleeping = 0;
-}
-
-/**
- * wq_worker_sleeping - a worker is going to sleep
- * @task: task going to sleep
- *
- * This function is called from schedule() when a busy worker is
- * going to sleep.
- */
-void wq_worker_sleeping(struct task_struct *task)
-{
- struct worker *worker = kthread_data(task);
- struct worker_pool *pool;
-
- /*
- * Rescuers, which may not have all the fields set up like normal
- * workers, also reach here, let's not access anything before
- * checking NOT_RUNNING.
- */
- if (worker->flags & WORKER_NOT_RUNNING)
- return;
-
- pool = worker->pool;
-
- /* Return if preempted before wq_worker_running() was reached */
- if (worker->sleeping)
- return;
-
- worker->sleeping = 1;
- raw_spin_lock_irq(&pool->lock);
-
- /*
- * Recheck in case unbind_workers() preempted us. We don't
- * want to decrement nr_running after the worker is unbound
- * and nr_running has been reset.
- */
- if (worker->flags & WORKER_NOT_RUNNING) {
- raw_spin_unlock_irq(&pool->lock);
- return;
- }
-
- pool->nr_running--;
- if (need_more_worker(pool))
- wake_up_worker(pool);
- raw_spin_unlock_irq(&pool->lock);
-}
-
-/**
- * wq_worker_last_func - retrieve worker's last work function
- * @task: Task to retrieve last work function of.
- *
- * Determine the last function a worker executed. This is called from
- * the scheduler to get a worker's last known identity.
- *
- * CONTEXT:
- * raw_spin_lock_irq(rq->lock)
- *
- * This function is called during schedule() when a kworker is going
- * to sleep. It's used by psi to identify aggregation workers during
- * dequeuing, to allow periodic aggregation to shut-off when that
- * worker is the last task in the system or cgroup to go to sleep.
- *
- * As this function doesn't involve any workqueue-related locking, it
- * only returns stable values when called from inside the scheduler's
- * queuing and dequeuing paths, when @task, which must be a kworker,
- * is guaranteed to not be processing any works.
- *
- * Return:
- * The last work function %current executed as a worker, NULL if it
- * hasn't executed any work yet.
- */
-work_func_t wq_worker_last_func(struct task_struct *task)
-{
- struct worker *worker = kthread_data(task);
-
- return worker->last_func;
-}
-
/**
* worker_set_flags - set worker flags and adjust nr_running accordingly
* @worker: self
* @flags: flags to set
*
* Set @flags in @worker->flags and adjust nr_running accordingly.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
- WARN_ON_ONCE(worker->task != current);
+ lockdep_assert_held(&pool->lock);
/* If transitioning into NOT_RUNNING, adjust nr_running. */
if ((flags & WORKER_NOT_RUNNING) &&
@@ -999,16 +885,13 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags)
* @flags: flags to clear
*
* Clear @flags in @worker->flags and adjust nr_running accordingly.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
unsigned int oflags = worker->flags;
- WARN_ON_ONCE(worker->task != current);
+ lockdep_assert_held(&pool->lock);
worker->flags &= ~flags;
@@ -1022,6 +905,69 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
pool->nr_running++;
}
+/* Return the first idle worker. Called with pool->lock held. */
+static struct worker *first_idle_worker(struct worker_pool *pool)
+{
+ if (unlikely(list_empty(&pool->idle_list)))
+ return NULL;
+
+ return list_first_entry(&pool->idle_list, struct worker, entry);
+}
+
+/**
+ * worker_enter_idle - enter idle state
+ * @worker: worker which is entering idle state
+ *
+ * @worker is entering idle state. Update stats and idle timer if
+ * necessary.
+ *
+ * LOCKING:
+ * raw_spin_lock_irq(pool->lock).
+ */
+static void worker_enter_idle(struct worker *worker)
+{
+ struct worker_pool *pool = worker->pool;
+
+ if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
+ WARN_ON_ONCE(!list_empty(&worker->entry) &&
+ (worker->hentry.next || worker->hentry.pprev)))
+ return;
+
+ /* can't use worker_set_flags(), also called from create_worker() */
+ worker->flags |= WORKER_IDLE;
+ pool->nr_idle++;
+ worker->last_active = jiffies;
+
+ /* idle_list is LIFO */
+ list_add(&worker->entry, &pool->idle_list);
+
+ if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+ mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
+
+ /* Sanity check nr_running. */
+ WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
+}
+
+/**
+ * worker_leave_idle - leave idle state
+ * @worker: worker which is leaving idle state
+ *
+ * @worker is leaving idle state. Update stats.
+ *
+ * LOCKING:
+ * raw_spin_lock_irq(pool->lock).
+ */
+static void worker_leave_idle(struct worker *worker)
+{
+ struct worker_pool *pool = worker->pool;
+
+ if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
+ return;
+ worker_clr_flags(worker, WORKER_IDLE);
+ pool->nr_idle--;
+ list_del_init(&worker->entry);
+}
+
/**
* find_worker_executing_work - find worker which is executing a work
* @pool: pool of interest
@@ -1075,13 +1021,10 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
* @head: target list to append @work to
* @nextp: out parameter for nested worklist walking
*
- * Schedule linked works starting from @work to @head. Work series to
- * be scheduled starts at @work and includes any consecutive work with
- * WORK_STRUCT_LINKED set in its predecessor.
- *
- * If @nextp is not NULL, it's updated to point to the next work of
- * the last scheduled work. This allows move_linked_works() to be
- * nested inside outer list_for_each_entry_safe().
+ * Schedule linked works starting from @work to @head. Work series to be
+ * scheduled starts at @work and includes any consecutive work with
+ * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on
+ * @nextp.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
@@ -1111,6 +1054,348 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
}
/**
+ * assign_work - assign a work item and its linked work items to a worker
+ * @work: work to assign
+ * @worker: worker to assign to
+ * @nextp: out parameter for nested worklist walking
+ *
+ * Assign @work and its linked work items to @worker. If @work is already being
+ * executed by another worker in the same pool, it'll be punted there.
+ *
+ * If @nextp is not NULL, it's updated to point to the next work of the last
+ * scheduled work. This allows assign_work() to be nested inside
+ * list_for_each_entry_safe().
+ *
+ * Returns %true if @work was successfully assigned to @worker. %false if @work
+ * was punted to another worker already executing it.
+ */
+static bool assign_work(struct work_struct *work, struct worker *worker,
+ struct work_struct **nextp)
+{
+ struct worker_pool *pool = worker->pool;
+ struct worker *collision;
+
+ lockdep_assert_held(&pool->lock);
+
+ /*
+ * A single work shouldn't be executed concurrently by multiple workers.
+ * __queue_work() ensures that @work doesn't jump to a different pool
+ * while still running in the previous pool. Here, we should ensure that
+ * @work is not executed concurrently by multiple workers from the same
+ * pool. Check whether anyone is already processing the work. If so,
+ * defer the work to the currently executing one.
+ */
+ collision = find_worker_executing_work(pool, work);
+ if (unlikely(collision)) {
+ move_linked_works(work, &collision->scheduled, nextp);
+ return false;
+ }
+
+ move_linked_works(work, &worker->scheduled, nextp);
+ return true;
+}
+
+/**
+ * kick_pool - wake up an idle worker if necessary
+ * @pool: pool to kick
+ *
+ * @pool may have pending work items. Wake up worker if necessary. Returns
+ * whether a worker was woken up.
+ */
+static bool kick_pool(struct worker_pool *pool)
+{
+ struct worker *worker = first_idle_worker(pool);
+ struct task_struct *p;
+
+ lockdep_assert_held(&pool->lock);
+
+ if (!need_more_worker(pool) || !worker)
+ return false;
+
+ p = worker->task;
+
+#ifdef CONFIG_SMP
+ /*
+ * Idle @worker is about to execute @work and waking up provides an
+ * opportunity to migrate @worker at a lower cost by setting the task's
+ * wake_cpu field. Let's see if we want to move @worker to improve
+ * execution locality.
+ *
+ * We're waking the worker that went idle the latest and there's some
+ * chance that @worker is marked idle but hasn't gone off CPU yet. If
+ * so, setting the wake_cpu won't do anything. As this is a best-effort
+ * optimization and the race window is narrow, let's leave as-is for
+ * now. If this becomes pronounced, we can skip over workers which are
+ * still on cpu when picking an idle worker.
+ *
+ * If @pool has non-strict affinity, @worker might have ended up outside
+ * its affinity scope. Repatriate.
+ */
+ if (!pool->attrs->affn_strict &&
+ !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
+ struct work_struct *work = list_first_entry(&pool->worklist,
+ struct work_struct, entry);
+ p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask);
+ get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
+ }
+#endif
+ wake_up_process(p);
+ return true;
+}
+
+#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
+
+/*
+ * Concurrency-managed per-cpu work items that hog CPU for longer than
+ * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
+ * which prevents them from stalling other concurrency-managed work items. If a
+ * work function keeps triggering this mechanism, it's likely that the work item
+ * should be using an unbound workqueue instead.
+ *
+ * wq_cpu_intensive_report() tracks work functions which trigger such conditions
+ * and report them so that they can be examined and converted to use unbound
+ * workqueues as appropriate. To avoid flooding the console, each violating work
+ * function is tracked and reported with exponential backoff.
+ */
+#define WCI_MAX_ENTS 128
+
+struct wci_ent {
+ work_func_t func;
+ atomic64_t cnt;
+ struct hlist_node hash_node;
+};
+
+static struct wci_ent wci_ents[WCI_MAX_ENTS];
+static int wci_nr_ents;
+static DEFINE_RAW_SPINLOCK(wci_lock);
+static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));
+
+static struct wci_ent *wci_find_ent(work_func_t func)
+{
+ struct wci_ent *ent;
+
+ hash_for_each_possible_rcu(wci_hash, ent, hash_node,
+ (unsigned long)func) {
+ if (ent->func == func)
+ return ent;
+ }
+ return NULL;
+}
+
+static void wq_cpu_intensive_report(work_func_t func)
+{
+ struct wci_ent *ent;
+
+restart:
+ ent = wci_find_ent(func);
+ if (ent) {
+ u64 cnt;
+
+ /*
+ * Start reporting from the fourth time and back off
+ * exponentially.
+ */
+ cnt = atomic64_inc_return_relaxed(&ent->cnt);
+ if (cnt >= 4 && is_power_of_2(cnt))
+ printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
+ ent->func, wq_cpu_intensive_thresh_us,
+ atomic64_read(&ent->cnt));
+ return;
+ }
+
+ /*
+ * @func is a new violation. Allocate a new entry for it. If wcn_ents[]
+ * is exhausted, something went really wrong and we probably made enough
+ * noise already.
+ */
+ if (wci_nr_ents >= WCI_MAX_ENTS)
+ return;
+
+ raw_spin_lock(&wci_lock);
+
+ if (wci_nr_ents >= WCI_MAX_ENTS) {
+ raw_spin_unlock(&wci_lock);
+ return;
+ }
+
+ if (wci_find_ent(func)) {
+ raw_spin_unlock(&wci_lock);
+ goto restart;
+ }
+
+ ent = &wci_ents[wci_nr_ents++];
+ ent->func = func;
+ atomic64_set(&ent->cnt, 1);
+ hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);
+
+ raw_spin_unlock(&wci_lock);
+}
+
+#else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
+static void wq_cpu_intensive_report(work_func_t func) {}
+#endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
+
+/**
+ * wq_worker_running - a worker is running again
+ * @task: task waking up
+ *
+ * This function is called when a worker returns from schedule()
+ */
+void wq_worker_running(struct task_struct *task)
+{
+ struct worker *worker = kthread_data(task);
+
+ if (!READ_ONCE(worker->sleeping))
+ return;
+
+ /*
+ * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
+ * and the nr_running increment below, we may ruin the nr_running reset
+ * and leave with an unexpected pool->nr_running == 1 on the newly unbound
+ * pool. Protect against such race.
+ */
+ preempt_disable();
+ if (!(worker->flags & WORKER_NOT_RUNNING))
+ worker->pool->nr_running++;
+ preempt_enable();
+
+ /*
+ * CPU intensive auto-detection cares about how long a work item hogged
+ * CPU without sleeping. Reset the starting timestamp on wakeup.
+ */
+ worker->current_at = worker->task->se.sum_exec_runtime;
+
+ WRITE_ONCE(worker->sleeping, 0);
+}
+
+/**
+ * wq_worker_sleeping - a worker is going to sleep
+ * @task: task going to sleep
+ *
+ * This function is called from schedule() when a busy worker is
+ * going to sleep.
+ */
+void wq_worker_sleeping(struct task_struct *task)
+{
+ struct worker *worker = kthread_data(task);
+ struct worker_pool *pool;
+
+ /*
+ * Rescuers, which may not have all the fields set up like normal
+ * workers, also reach here, let's not access anything before
+ * checking NOT_RUNNING.
+ */
+ if (worker->flags & WORKER_NOT_RUNNING)
+ return;
+
+ pool = worker->pool;
+
+ /* Return if preempted before wq_worker_running() was reached */
+ if (READ_ONCE(worker->sleeping))
+ return;
+
+ WRITE_ONCE(worker->sleeping, 1);
+ raw_spin_lock_irq(&pool->lock);
+
+ /*
+ * Recheck in case unbind_workers() preempted us. We don't
+ * want to decrement nr_running after the worker is unbound
+ * and nr_running has been reset.
+ */
+ if (worker->flags & WORKER_NOT_RUNNING) {
+ raw_spin_unlock_irq(&pool->lock);
+ return;
+ }
+
+ pool->nr_running--;
+ if (kick_pool(pool))
+ worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;
+
+ raw_spin_unlock_irq(&pool->lock);
+}
+
+/**
+ * wq_worker_tick - a scheduler tick occurred while a kworker is running
+ * @task: task currently running
+ *
+ * Called from scheduler_tick(). We're in the IRQ context and the current
+ * worker's fields which follow the 'K' locking rule can be accessed safely.
+ */
+void wq_worker_tick(struct task_struct *task)
+{
+ struct worker *worker = kthread_data(task);
+ struct pool_workqueue *pwq = worker->current_pwq;
+ struct worker_pool *pool = worker->pool;
+
+ if (!pwq)
+ return;
+
+ pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;
+
+ if (!wq_cpu_intensive_thresh_us)
+ return;
+
+ /*
+ * If the current worker is concurrency managed and hogged the CPU for
+ * longer than wq_cpu_intensive_thresh_us, it's automatically marked
+ * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
+ *
+ * Set @worker->sleeping means that @worker is in the process of
+ * switching out voluntarily and won't be contributing to
+ * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
+ * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
+ * double decrements. The task is releasing the CPU anyway. Let's skip.
+ * We probably want to make this prettier in the future.
+ */
+ if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
+ worker->task->se.sum_exec_runtime - worker->current_at <
+ wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
+ return;
+
+ raw_spin_lock(&pool->lock);
+
+ worker_set_flags(worker, WORKER_CPU_INTENSIVE);
+ wq_cpu_intensive_report(worker->current_func);
+ pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
+
+ if (kick_pool(pool))
+ pwq->stats[PWQ_STAT_CM_WAKEUP]++;
+
+ raw_spin_unlock(&pool->lock);
+}
+
+/**
+ * wq_worker_last_func - retrieve worker's last work function
+ * @task: Task to retrieve last work function of.
+ *
+ * Determine the last function a worker executed. This is called from
+ * the scheduler to get a worker's last known identity.
+ *
+ * CONTEXT:
+ * raw_spin_lock_irq(rq->lock)
+ *
+ * This function is called during schedule() when a kworker is going
+ * to sleep. It's used by psi to identify aggregation workers during
+ * dequeuing, to allow periodic aggregation to shut-off when that
+ * worker is the last task in the system or cgroup to go to sleep.
+ *
+ * As this function doesn't involve any workqueue-related locking, it
+ * only returns stable values when called from inside the scheduler's
+ * queuing and dequeuing paths, when @task, which must be a kworker,
+ * is guaranteed to not be processing any works.
+ *
+ * Return:
+ * The last work function %current executed as a worker, NULL if it
+ * hasn't executed any work yet.
+ */
+work_func_t wq_worker_last_func(struct task_struct *task)
+{
+ struct worker *worker = kthread_data(task);
+
+ return worker->last_func;
+}
+
+/**
* get_pwq - get an extra reference on the specified pool_workqueue
* @pwq: pool_workqueue to get
*
@@ -1136,17 +1421,11 @@ static void put_pwq(struct pool_workqueue *pwq)
lockdep_assert_held(&pwq->pool->lock);
if (likely(--pwq->refcnt))
return;
- if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
- return;
/*
- * @pwq can't be released under pool->lock, bounce to
- * pwq_unbound_release_workfn(). This never recurses on the same
- * pool->lock as this path is taken only for unbound workqueues and
- * the release work item is scheduled on a per-cpu workqueue. To
- * avoid lockdep warning, unbound pool->locks are given lockdep
- * subclass of 1 in get_unbound_pool().
+ * @pwq can't be released under pool->lock, bounce to a dedicated
+ * kthread_worker to avoid A-A deadlocks.
*/
- schedule_work(&pwq->unbound_release_work);
+ kthread_queue_work(pwq_release_worker, &pwq->release_work);
}
/**
@@ -1362,7 +1641,7 @@ fail:
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
struct list_head *head, unsigned int extra_flags)
{
- struct worker_pool *pool = pwq->pool;
+ debug_work_activate(work);
/* record the work call stack in order to print it in KASAN reports */
kasan_record_aux_stack_noalloc(work);
@@ -1371,9 +1650,6 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
set_work_pwq(work, pwq, extra_flags);
list_add_tail(&work->entry, head);
get_pwq(pwq);
-
- if (__need_more_worker(pool))
- wake_up_worker(pool);
}
/*
@@ -1427,8 +1703,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
struct pool_workqueue *pwq;
- struct worker_pool *last_pool;
- struct list_head *worklist;
+ struct worker_pool *last_pool, *pool;
unsigned int work_flags;
unsigned int req_cpu = cpu;
@@ -1452,23 +1727,23 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
rcu_read_lock();
retry:
/* pwq which will be used unless @work is executing elsewhere */
- if (wq->flags & WQ_UNBOUND) {
- if (req_cpu == WORK_CPU_UNBOUND)
+ if (req_cpu == WORK_CPU_UNBOUND) {
+ if (wq->flags & WQ_UNBOUND)
cpu = wq_select_unbound_cpu(raw_smp_processor_id());
- pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
- } else {
- if (req_cpu == WORK_CPU_UNBOUND)
+ else
cpu = raw_smp_processor_id();
- pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
}
+ pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ pool = pwq->pool;
+
/*
* If @work was previously on a different pool, it might still be
* running there, in which case the work needs to be queued on that
* pool to guarantee non-reentrancy.
*/
last_pool = get_work_pool(work);
- if (last_pool && last_pool != pwq->pool) {
+ if (last_pool && last_pool != pool) {
struct worker *worker;
raw_spin_lock(&last_pool->lock);
@@ -1477,26 +1752,27 @@ retry:
if (worker && worker->current_pwq->wq == wq) {
pwq = worker->current_pwq;
+ pool = pwq->pool;
+ WARN_ON_ONCE(pool != last_pool);
} else {
/* meh... not running there, queue here */
raw_spin_unlock(&last_pool->lock);
- raw_spin_lock(&pwq->pool->lock);
+ raw_spin_lock(&pool->lock);
}
} else {
- raw_spin_lock(&pwq->pool->lock);
+ raw_spin_lock(&pool->lock);
}
/*
- * pwq is determined and locked. For unbound pools, we could have
- * raced with pwq release and it could already be dead. If its
- * refcnt is zero, repeat pwq selection. Note that pwqs never die
- * without another pwq replacing it in the numa_pwq_tbl or while
- * work items are executing on it, so the retrying is guaranteed to
- * make forward-progress.
+ * pwq is determined and locked. For unbound pools, we could have raced
+ * with pwq release and it could already be dead. If its refcnt is zero,
+ * repeat pwq selection. Note that unbound pwqs never die without
+ * another pwq replacing it in cpu_pwq or while work items are executing
+ * on it, so the retrying is guaranteed to make forward-progress.
*/
if (unlikely(!pwq->refcnt)) {
if (wq->flags & WQ_UNBOUND) {
- raw_spin_unlock(&pwq->pool->lock);
+ raw_spin_unlock(&pool->lock);
cpu_relax();
goto retry;
}
@@ -1515,21 +1791,20 @@ retry:
work_flags = work_color_to_flags(pwq->work_color);
if (likely(pwq->nr_active < pwq->max_active)) {
+ if (list_empty(&pool->worklist))
+ pool->watchdog_ts = jiffies;
+
trace_workqueue_activate_work(work);
pwq->nr_active++;
- worklist = &pwq->pool->worklist;
- if (list_empty(worklist))
- pwq->pool->watchdog_ts = jiffies;
+ insert_work(pwq, work, &pool->worklist, work_flags);
+ kick_pool(pool);
} else {
work_flags |= WORK_STRUCT_INACTIVE;
- worklist = &pwq->inactive_works;
+ insert_work(pwq, work, &pwq->inactive_works, work_flags);
}
- debug_work_activate(work);
- insert_work(pwq, work, worklist, work_flags);
-
out:
- raw_spin_unlock(&pwq->pool->lock);
+ raw_spin_unlock(&pool->lock);
rcu_read_unlock();
}
@@ -1542,6 +1817,8 @@ out:
* We queue the work to a specific CPU, the caller must ensure it
* can't go away. Callers that fail to ensure that the specified
* CPU cannot go away will execute on a randomly chosen CPU.
+ * But note well that callers specifying a CPU that never has been
+ * online will get a splat.
*
* Return: %false if @work was already on a queue, %true otherwise.
*/
@@ -1564,7 +1841,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
EXPORT_SYMBOL(queue_work_on);
/**
- * workqueue_select_cpu_near - Select a CPU based on NUMA node
+ * select_numa_node_cpu - Select a CPU based on NUMA node
* @node: NUMA node ID that we want to select a CPU from
*
* This function will attempt to find a "random" cpu available on a given
@@ -1572,14 +1849,10 @@ EXPORT_SYMBOL(queue_work_on);
* WORK_CPU_UNBOUND indicating that we should just schedule to any
* available CPU if we need to schedule this work.
*/
-static int workqueue_select_cpu_near(int node)
+static int select_numa_node_cpu(int node)
{
int cpu;
- /* No point in doing this if NUMA isn't enabled for workqueues */
- if (!wq_numa_enabled)
- return WORK_CPU_UNBOUND;
-
/* Delay binding to CPU if node is not valid or online */
if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
return WORK_CPU_UNBOUND;
@@ -1636,7 +1909,7 @@ bool queue_work_node(int node, struct workqueue_struct *wq,
local_irq_save(flags);
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
- int cpu = workqueue_select_cpu_near(node);
+ int cpu = select_numa_node_cpu(node);
__queue_work(cpu, wq, work);
ret = true;
@@ -1791,60 +2064,6 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
}
EXPORT_SYMBOL(queue_rcu_work);
-/**
- * worker_enter_idle - enter idle state
- * @worker: worker which is entering idle state
- *
- * @worker is entering idle state. Update stats and idle timer if
- * necessary.
- *
- * LOCKING:
- * raw_spin_lock_irq(pool->lock).
- */
-static void worker_enter_idle(struct worker *worker)
-{
- struct worker_pool *pool = worker->pool;
-
- if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
- WARN_ON_ONCE(!list_empty(&worker->entry) &&
- (worker->hentry.next || worker->hentry.pprev)))
- return;
-
- /* can't use worker_set_flags(), also called from create_worker() */
- worker->flags |= WORKER_IDLE;
- pool->nr_idle++;
- worker->last_active = jiffies;
-
- /* idle_list is LIFO */
- list_add(&worker->entry, &pool->idle_list);
-
- if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
- mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
-
- /* Sanity check nr_running. */
- WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
-}
-
-/**
- * worker_leave_idle - leave idle state
- * @worker: worker which is leaving idle state
- *
- * @worker is leaving idle state. Update stats.
- *
- * LOCKING:
- * raw_spin_lock_irq(pool->lock).
- */
-static void worker_leave_idle(struct worker *worker)
-{
- struct worker_pool *pool = worker->pool;
-
- if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
- return;
- worker_clr_flags(worker, WORKER_IDLE);
- pool->nr_idle--;
- list_del_init(&worker->entry);
-}
-
static struct worker *alloc_worker(int node)
{
struct worker *worker;
@@ -1860,6 +2079,14 @@ static struct worker *alloc_worker(int node)
return worker;
}
+static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)
+{
+ if (pool->cpu < 0 && pool->attrs->affn_strict)
+ return pool->attrs->__pod_cpumask;
+ else
+ return pool->attrs->cpumask;
+}
+
/**
* worker_attach_to_pool() - attach a worker to a pool
* @worker: worker to be attached
@@ -1885,7 +2112,7 @@ static void worker_attach_to_pool(struct worker *worker,
kthread_set_per_cpu(worker->task, pool->cpu);
if (worker->rescue_wq)
- set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+ set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool));
list_add_tail(&worker->node, &pool->workers);
worker->pool = pool;
@@ -1977,16 +2204,25 @@ static struct worker *create_worker(struct worker_pool *pool)
}
set_user_nice(worker->task, pool->attrs->nice);
- kthread_bind_mask(worker->task, pool->attrs->cpumask);
+ kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
/* start the newly created worker */
raw_spin_lock_irq(&pool->lock);
+
worker->pool->nr_workers++;
worker_enter_idle(worker);
+ kick_pool(pool);
+
+ /*
+ * @worker is waiting on a completion in kthread() and will trigger hung
+ * check if not woken up soon. As kick_pool() might not have waken it
+ * up, wake it up explicitly once more.
+ */
wake_up_process(worker->task);
+
raw_spin_unlock_irq(&pool->lock);
return worker;
@@ -2114,9 +2350,8 @@ static void idle_worker_timeout(struct timer_list *t)
static void idle_cull_fn(struct work_struct *work)
{
struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
- struct list_head cull_list;
+ LIST_HEAD(cull_list);
- INIT_LIST_HEAD(&cull_list);
/*
* Grabbing wq_pool_attach_mutex here ensures an already-running worker
* cannot proceed beyong worker_detach_from_pool() in its self-destruct
@@ -2166,6 +2401,7 @@ static void send_mayday(struct work_struct *work)
get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
+ pwq->stats[PWQ_STAT_MAYDAY]++;
}
}
@@ -2303,9 +2539,7 @@ __acquires(&pool->lock)
{
struct pool_workqueue *pwq = get_work_pwq(work);
struct worker_pool *pool = worker->pool;
- bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
unsigned long work_data;
- struct worker *collision;
#ifdef CONFIG_LOCKDEP
/*
* It is permissible to free the struct work_struct from
@@ -2322,24 +2556,13 @@ __acquires(&pool->lock)
WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
raw_smp_processor_id() != pool->cpu);
- /*
- * A single work shouldn't be executed concurrently by
- * multiple workers on a single cpu. Check whether anyone is
- * already processing the work. If so, defer the work to the
- * currently executing one.
- */
- collision = find_worker_executing_work(pool, work);
- if (unlikely(collision)) {
- move_linked_works(work, &collision->scheduled, NULL);
- return;
- }
-
/* claim and dequeue */
debug_work_deactivate(work);
hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
worker->current_work = work;
worker->current_func = work->func;
worker->current_pwq = pwq;
+ worker->current_at = worker->task->se.sum_exec_runtime;
work_data = *work_data_bits(work);
worker->current_color = get_work_color(work_data);
@@ -2357,18 +2580,16 @@ __acquires(&pool->lock)
* of concurrency management and the next code block will chain
* execution of the pending work items.
*/
- if (unlikely(cpu_intensive))
+ if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
worker_set_flags(worker, WORKER_CPU_INTENSIVE);
/*
- * Wake up another worker if necessary. The condition is always
- * false for normal per-cpu workers since nr_running would always
- * be >= 1 at this point. This is used to chain execution of the
- * pending work items for WORKER_NOT_RUNNING workers such as the
- * UNBOUND and CPU_INTENSIVE ones.
+ * Kick @pool if necessary. It's always noop for per-cpu worker pools
+ * since nr_running would always be >= 1 at this point. This is used to
+ * chain execution of the pending work items for WORKER_NOT_RUNNING
+ * workers such as the UNBOUND and CPU_INTENSIVE ones.
*/
- if (need_more_worker(pool))
- wake_up_worker(pool);
+ kick_pool(pool);
/*
* Record the last pool and clear PENDING which should be the last
@@ -2378,6 +2599,7 @@ __acquires(&pool->lock)
*/
set_work_pool_and_clear_pending(work, pool->id);
+ pwq->stats[PWQ_STAT_STARTED]++;
raw_spin_unlock_irq(&pool->lock);
lock_map_acquire(&pwq->wq->lockdep_map);
@@ -2411,6 +2633,7 @@ __acquires(&pool->lock)
* point will only record its address.
*/
trace_workqueue_execute_end(work, worker->current_func);
+ pwq->stats[PWQ_STAT_COMPLETED]++;
lock_map_release(&lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
@@ -2435,9 +2658,12 @@ __acquires(&pool->lock)
raw_spin_lock_irq(&pool->lock);
- /* clear cpu intensive status */
- if (unlikely(cpu_intensive))
- worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
+ /*
+ * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
+ * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
+ * wq_cpu_intensive_thresh_us. Clear it.
+ */
+ worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
/* tag the worker for identification in schedule() */
worker->last_func = worker->current_func;
@@ -2465,9 +2691,15 @@ __acquires(&pool->lock)
*/
static void process_scheduled_works(struct worker *worker)
{
- while (!list_empty(&worker->scheduled)) {
- struct work_struct *work = list_first_entry(&worker->scheduled,
- struct work_struct, entry);
+ struct work_struct *work;
+ bool first = true;
+
+ while ((work = list_first_entry_or_null(&worker->scheduled,
+ struct work_struct, entry))) {
+ if (first) {
+ worker->pool->watchdog_ts = jiffies;
+ first = false;
+ }
process_one_work(worker, work);
}
}
@@ -2548,17 +2780,8 @@ recheck:
list_first_entry(&pool->worklist,
struct work_struct, entry);
- pool->watchdog_ts = jiffies;
-
- if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
- /* optimization path, not strictly necessary */
- process_one_work(worker, work);
- if (unlikely(!list_empty(&worker->scheduled)))
- process_scheduled_works(worker);
- } else {
- move_linked_works(work, &worker->scheduled, NULL);
+ if (assign_work(work, worker, NULL))
process_scheduled_works(worker);
- }
} while (keep_working(pool));
worker_set_flags(worker, WORKER_PREP);
@@ -2602,7 +2825,6 @@ static int rescuer_thread(void *__rescuer)
{
struct worker *rescuer = __rescuer;
struct workqueue_struct *wq = rescuer->rescue_wq;
- struct list_head *scheduled = &rescuer->scheduled;
bool should_stop;
set_user_nice(current, RESCUER_NICE_LEVEL);
@@ -2633,7 +2855,6 @@ repeat:
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
- bool first = true;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -2648,17 +2869,14 @@ repeat:
* Slurp in all works issued via this workqueue and
* process'em.
*/
- WARN_ON_ONCE(!list_empty(scheduled));
+ WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
list_for_each_entry_safe(work, n, &pool->worklist, entry) {
- if (get_work_pwq(work) == pwq) {
- if (first)
- pool->watchdog_ts = jiffies;
- move_linked_works(work, scheduled, &n);
- }
- first = false;
+ if (get_work_pwq(work) == pwq &&
+ assign_work(work, rescuer, &n))
+ pwq->stats[PWQ_STAT_RESCUED]++;
}
- if (!list_empty(scheduled)) {
+ if (!list_empty(&rescuer->scheduled)) {
process_scheduled_works(rescuer);
/*
@@ -2691,12 +2909,10 @@ repeat:
put_pwq(pwq);
/*
- * Leave this pool. If need_more_worker() is %true, notify a
- * regular worker; otherwise, we end up with 0 concurrency
- * and stalling the execution.
+ * Leave this pool. Notify regular workers; otherwise, we end up
+ * with 0 concurrency and stalling the execution.
*/
- if (need_more_worker(pool))
- wake_up_worker(pool);
+ kick_pool(pool);
raw_spin_unlock_irq(&pool->lock);
@@ -2831,7 +3047,6 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
pwq->nr_in_flight[work_color]++;
work_flags |= work_color_to_flags(work_color);
- debug_work_activate(&barr->work);
insert_work(pwq, &barr->work, head, work_flags);
}
@@ -3494,6 +3709,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
{
if (attrs) {
free_cpumask_var(attrs->cpumask);
+ free_cpumask_var(attrs->__pod_cpumask);
kfree(attrs);
}
}
@@ -3515,8 +3731,11 @@ struct workqueue_attrs *alloc_workqueue_attrs(void)
goto fail;
if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))
goto fail;
+ if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL))
+ goto fail;
cpumask_copy(attrs->cpumask, cpu_possible_mask);
+ attrs->affn_scope = WQ_AFFN_DFL;
return attrs;
fail:
free_workqueue_attrs(attrs);
@@ -3528,12 +3747,26 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
{
to->nice = from->nice;
cpumask_copy(to->cpumask, from->cpumask);
+ cpumask_copy(to->__pod_cpumask, from->__pod_cpumask);
+ to->affn_strict = from->affn_strict;
+
/*
- * Unlike hash and equality test, this function doesn't ignore
- * ->no_numa as it is used for both pool and wq attrs. Instead,
- * get_unbound_pool() explicitly clears ->no_numa after copying.
+ * Unlike hash and equality test, copying shouldn't ignore wq-only
+ * fields as copying is used for both pool and wq attrs. Instead,
+ * get_unbound_pool() explicitly clears the fields.
*/
- to->no_numa = from->no_numa;
+ to->affn_scope = from->affn_scope;
+ to->ordered = from->ordered;
+}
+
+/*
+ * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the
+ * comments in 'struct workqueue_attrs' definition.
+ */
+static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs)
+{
+ attrs->affn_scope = WQ_AFFN_NR_TYPES;
+ attrs->ordered = false;
}
/* hash value of the content of @attr */
@@ -3544,6 +3777,9 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
hash = jhash_1word(attrs->nice, hash);
hash = jhash(cpumask_bits(attrs->cpumask),
BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+ hash = jhash(cpumask_bits(attrs->__pod_cpumask),
+ BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+ hash = jhash_1word(attrs->affn_strict, hash);
return hash;
}
@@ -3555,9 +3791,57 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
return false;
if (!cpumask_equal(a->cpumask, b->cpumask))
return false;
+ if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask))
+ return false;
+ if (a->affn_strict != b->affn_strict)
+ return false;
return true;
}
+/* Update @attrs with actually available CPUs */
+static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,
+ const cpumask_t *unbound_cpumask)
+{
+ /*
+ * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If
+ * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to
+ * @unbound_cpumask.
+ */
+ cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask);
+ if (unlikely(cpumask_empty(attrs->cpumask)))
+ cpumask_copy(attrs->cpumask, unbound_cpumask);
+}
+
+/* find wq_pod_type to use for @attrs */
+static const struct wq_pod_type *
+wqattrs_pod_type(const struct workqueue_attrs *attrs)
+{
+ enum wq_affn_scope scope;
+ struct wq_pod_type *pt;
+
+ /* to synchronize access to wq_affn_dfl */
+ lockdep_assert_held(&wq_pool_mutex);
+
+ if (attrs->affn_scope == WQ_AFFN_DFL)
+ scope = wq_affn_dfl;
+ else
+ scope = attrs->affn_scope;
+
+ pt = &wq_pod_types[scope];
+
+ if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) &&
+ likely(pt->nr_pods))
+ return pt;
+
+ /*
+ * Before workqueue_init_topology(), only SYSTEM is available which is
+ * initialized in workqueue_init_early().
+ */
+ pt = &wq_pod_types[WQ_AFFN_SYSTEM];
+ BUG_ON(!pt->nr_pods);
+ return pt;
+}
+
/**
* init_worker_pool - initialize a newly zalloc'd worker_pool
* @pool: worker_pool to initialize
@@ -3596,6 +3880,9 @@ static int init_worker_pool(struct worker_pool *pool)
pool->attrs = alloc_workqueue_attrs();
if (!pool->attrs)
return -ENOMEM;
+
+ wqattrs_clear_for_pool(pool->attrs);
+
return 0;
}
@@ -3643,12 +3930,8 @@ static void rcu_free_wq(struct rcu_head *rcu)
container_of(rcu, struct workqueue_struct, rcu);
wq_free_lockdep(wq);
-
- if (!(wq->flags & WQ_UNBOUND))
- free_percpu(wq->cpu_pwqs);
- else
- free_workqueue_attrs(wq->unbound_attrs);
-
+ free_percpu(wq->cpu_pwq);
+ free_workqueue_attrs(wq->unbound_attrs);
kfree(wq);
}
@@ -3675,10 +3958,8 @@ static void rcu_free_pool(struct rcu_head *rcu)
static void put_unbound_pool(struct worker_pool *pool)
{
DECLARE_COMPLETION_ONSTACK(detach_completion);
- struct list_head cull_list;
struct worker *worker;
-
- INIT_LIST_HEAD(&cull_list);
+ LIST_HEAD(cull_list);
lockdep_assert_held(&wq_pool_mutex);
@@ -3762,10 +4043,10 @@ static void put_unbound_pool(struct worker_pool *pool)
*/
static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
+ struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA];
u32 hash = wqattrs_hash(attrs);
struct worker_pool *pool;
- int node;
- int target_node = NUMA_NO_NODE;
+ int pod, node = NUMA_NO_NODE;
lockdep_assert_held(&wq_pool_mutex);
@@ -3777,31 +4058,22 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
}
}
- /* if cpumask is contained inside a NUMA node, we belong to that node */
- if (wq_numa_enabled) {
- for_each_node(node) {
- if (cpumask_subset(attrs->cpumask,
- wq_numa_possible_cpumask[node])) {
- target_node = node;
- break;
- }
+ /* If __pod_cpumask is contained inside a NUMA pod, that's our node */
+ for (pod = 0; pod < pt->nr_pods; pod++) {
+ if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) {
+ node = pt->pod_node[pod];
+ break;
}
}
/* nope, create a new one */
- pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node);
if (!pool || init_worker_pool(pool) < 0)
goto fail;
- lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
+ pool->node = node;
copy_workqueue_attrs(pool->attrs, attrs);
- pool->node = target_node;
-
- /*
- * no_numa isn't a worker_pool attribute, always clear it. See
- * 'struct workqueue_attrs' comments for detail.
- */
- pool->attrs->no_numa = false;
+ wqattrs_clear_for_pool(pool->attrs);
if (worker_pool_assign_id(pool) < 0)
goto fail;
@@ -3827,34 +4099,33 @@ static void rcu_free_pwq(struct rcu_head *rcu)
}
/*
- * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
- * and needs to be destroyed.
+ * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero
+ * refcnt and needs to be destroyed.
*/
-static void pwq_unbound_release_workfn(struct work_struct *work)
+static void pwq_release_workfn(struct kthread_work *work)
{
struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
- unbound_release_work);
+ release_work);
struct workqueue_struct *wq = pwq->wq;
struct worker_pool *pool = pwq->pool;
bool is_last = false;
/*
- * when @pwq is not linked, it doesn't hold any reference to the
+ * When @pwq is not linked, it doesn't hold any reference to the
* @wq, and @wq is invalid to access.
*/
if (!list_empty(&pwq->pwqs_node)) {
- if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
- return;
-
mutex_lock(&wq->mutex);
list_del_rcu(&pwq->pwqs_node);
is_last = list_empty(&wq->pwqs);
mutex_unlock(&wq->mutex);
}
- mutex_lock(&wq_pool_mutex);
- put_unbound_pool(pool);
- mutex_unlock(&wq_pool_mutex);
+ if (wq->flags & WQ_UNBOUND) {
+ mutex_lock(&wq_pool_mutex);
+ put_unbound_pool(pool);
+ mutex_unlock(&wq_pool_mutex);
+ }
call_rcu(&pwq->rcu, rcu_free_pwq);
@@ -3898,24 +4169,13 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
* is updated and visible.
*/
if (!freezable || !workqueue_freezing) {
- bool kick = false;
-
pwq->max_active = wq->saved_max_active;
while (!list_empty(&pwq->inactive_works) &&
- pwq->nr_active < pwq->max_active) {
+ pwq->nr_active < pwq->max_active)
pwq_activate_first_inactive(pwq);
- kick = true;
- }
- /*
- * Need to kick a worker after thawed or an unbound wq's
- * max_active is bumped. In realtime scenarios, always kicking a
- * worker will cause interference on the isolated cpu cores, so
- * let's kick iff work items were activated.
- */
- if (kick)
- wake_up_worker(pwq->pool);
+ kick_pool(pwq->pool);
} else {
pwq->max_active = 0;
}
@@ -3938,7 +4198,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
INIT_LIST_HEAD(&pwq->inactive_works);
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
- INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
+ kthread_init_work(&pwq->release_work, pwq_release_workfn);
}
/* sync @pwq with the current state of its associated wq and link it */
@@ -3986,61 +4246,49 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
}
/**
- * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
+ * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
* @attrs: the wq_attrs of the default pwq of the target workqueue
- * @node: the target NUMA node
+ * @cpu: the target CPU
* @cpu_going_down: if >= 0, the CPU to consider as offline
- * @cpumask: outarg, the resulting cpumask
- *
- * Calculate the cpumask a workqueue with @attrs should use on @node. If
- * @cpu_going_down is >= 0, that cpu is considered offline during
- * calculation. The result is stored in @cpumask.
*
- * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
- * enabled and @node has online CPUs requested by @attrs, the returned
- * cpumask is the intersection of the possible CPUs of @node and
- * @attrs->cpumask.
+ * Calculate the cpumask a workqueue with @attrs should use on @pod. If
+ * @cpu_going_down is >= 0, that cpu is considered offline during calculation.
+ * The result is stored in @attrs->__pod_cpumask.
*
- * The caller is responsible for ensuring that the cpumask of @node stays
- * stable.
+ * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled
+ * and @pod has online CPUs requested by @attrs, the returned cpumask is the
+ * intersection of the possible CPUs of @pod and @attrs->cpumask.
*
- * Return: %true if the resulting @cpumask is different from @attrs->cpumask,
- * %false if equal.
+ * The caller is responsible for ensuring that the cpumask of @pod stays stable.
*/
-static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
- int cpu_going_down, cpumask_t *cpumask)
+static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
+ int cpu_going_down)
{
- if (!wq_numa_enabled || attrs->no_numa)
- goto use_dfl;
+ const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
+ int pod = pt->cpu_pod[cpu];
- /* does @node have any online CPUs @attrs wants? */
- cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
+ /* does @pod have any online CPUs @attrs wants? */
+ cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);
+ cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask);
if (cpu_going_down >= 0)
- cpumask_clear_cpu(cpu_going_down, cpumask);
+ cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask);
- if (cpumask_empty(cpumask))
- goto use_dfl;
+ if (cpumask_empty(attrs->__pod_cpumask)) {
+ cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
+ return;
+ }
- /* yeap, return possible CPUs in @node that @attrs wants */
- cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+ /* yeap, return possible CPUs in @pod that @attrs wants */
+ cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]);
- if (cpumask_empty(cpumask)) {
+ if (cpumask_empty(attrs->__pod_cpumask))
pr_warn_once("WARNING: workqueue cpumask: online intersect > "
"possible intersect\n");
- return false;
- }
-
- return !cpumask_equal(cpumask, attrs->cpumask);
-
-use_dfl:
- cpumask_copy(cpumask, attrs->cpumask);
- return false;
}
-/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
-static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
- int node,
- struct pool_workqueue *pwq)
+/* install @pwq into @wq's cpu_pwq and return the old pwq */
+static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
+ int cpu, struct pool_workqueue *pwq)
{
struct pool_workqueue *old_pwq;
@@ -4050,8 +4298,8 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
/* link_pwq() can handle duplicate calls */
link_pwq(pwq);
- old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
- rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
+ old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq);
return old_pwq;
}
@@ -4068,10 +4316,10 @@ struct apply_wqattrs_ctx {
static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
{
if (ctx) {
- int node;
+ int cpu;
- for_each_node(node)
- put_pwq_unlocked(ctx->pwq_tbl[node]);
+ for_each_possible_cpu(cpu)
+ put_pwq_unlocked(ctx->pwq_tbl[cpu]);
put_pwq_unlocked(ctx->dfl_pwq);
free_workqueue_attrs(ctx->attrs);
@@ -4087,76 +4335,64 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
const cpumask_var_t unbound_cpumask)
{
struct apply_wqattrs_ctx *ctx;
- struct workqueue_attrs *new_attrs, *tmp_attrs;
- int node;
+ struct workqueue_attrs *new_attrs;
+ int cpu;
lockdep_assert_held(&wq_pool_mutex);
- ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);
+ if (WARN_ON(attrs->affn_scope < 0 ||
+ attrs->affn_scope >= WQ_AFFN_NR_TYPES))
+ return ERR_PTR(-EINVAL);
+
+ ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL);
new_attrs = alloc_workqueue_attrs();
- tmp_attrs = alloc_workqueue_attrs();
- if (!ctx || !new_attrs || !tmp_attrs)
+ if (!ctx || !new_attrs)
goto out_free;
/*
- * Calculate the attrs of the default pwq with unbound_cpumask
- * which is wq_unbound_cpumask or to set to wq_unbound_cpumask.
- * If the user configured cpumask doesn't overlap with the
- * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
- */
- copy_workqueue_attrs(new_attrs, attrs);
- cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask);
- if (unlikely(cpumask_empty(new_attrs->cpumask)))
- cpumask_copy(new_attrs->cpumask, unbound_cpumask);
-
- /*
- * We may create multiple pwqs with differing cpumasks. Make a
- * copy of @new_attrs which will be modified and used to obtain
- * pools.
- */
- copy_workqueue_attrs(tmp_attrs, new_attrs);
-
- /*
* If something goes wrong during CPU up/down, we'll fall back to
* the default pwq covering whole @attrs->cpumask. Always create
* it even if we don't use it immediately.
*/
+ copy_workqueue_attrs(new_attrs, attrs);
+ wqattrs_actualize_cpumask(new_attrs, unbound_cpumask);
+ cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
if (!ctx->dfl_pwq)
goto out_free;
- for_each_node(node) {
- if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
- ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
- if (!ctx->pwq_tbl[node])
- goto out_free;
- } else {
+ for_each_possible_cpu(cpu) {
+ if (new_attrs->ordered) {
ctx->dfl_pwq->refcnt++;
- ctx->pwq_tbl[node] = ctx->dfl_pwq;
+ ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
+ } else {
+ wq_calc_pod_cpumask(new_attrs, cpu, -1);
+ ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
+ if (!ctx->pwq_tbl[cpu])
+ goto out_free;
}
}
/* save the user configured attrs and sanitize it. */
copy_workqueue_attrs(new_attrs, attrs);
cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+ cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
ctx->attrs = new_attrs;
ctx->wq = wq;
- free_workqueue_attrs(tmp_attrs);
return ctx;
out_free:
- free_workqueue_attrs(tmp_attrs);
free_workqueue_attrs(new_attrs);
apply_wqattrs_cleanup(ctx);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
{
- int node;
+ int cpu;
/* all pwqs have been created successfully, let's install'em */
mutex_lock(&ctx->wq->mutex);
@@ -4164,9 +4400,9 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
/* save the previous pwq and install the new one */
- for_each_node(node)
- ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
- ctx->pwq_tbl[node]);
+ for_each_possible_cpu(cpu)
+ ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,
+ ctx->pwq_tbl[cpu]);
/* @dfl_pwq might not have been used, ensure it's linked */
link_pwq(ctx->dfl_pwq);
@@ -4206,8 +4442,8 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
}
ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
- if (!ctx)
- return -ENOMEM;
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
/* the ctx has been prepared successfully, let's commit it */
apply_wqattrs_commit(ctx);
@@ -4221,12 +4457,11 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
* @wq: the target workqueue
* @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
*
- * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
- * machines, this function maps a separate pwq to each NUMA node with
- * possibles CPUs in @attrs->cpumask so that work items are affine to the
- * NUMA node it was issued on. Older pwqs are released as in-flight work
- * items finish. Note that a work item which repeatedly requeues itself
- * back-to-back will stay on its current pwq.
+ * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps
+ * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that
+ * work items are affine to the pod it was issued on. Older pwqs are released as
+ * in-flight work items finish. Note that a work item which repeatedly requeues
+ * itself back-to-back will stay on its current pwq.
*
* Performs GFP_KERNEL allocations.
*
@@ -4249,40 +4484,37 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
}
/**
- * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
+ * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug
* @wq: the target workqueue
- * @cpu: the CPU coming up or going down
+ * @cpu: the CPU to update pool association for
+ * @hotplug_cpu: the CPU coming up or going down
* @online: whether @cpu is coming up or going down
*
* This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
- * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of
+ * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of
* @wq accordingly.
*
- * If NUMA affinity can't be adjusted due to memory allocation failure, it
- * falls back to @wq->dfl_pwq which may not be optimal but is always
- * correct.
- *
- * Note that when the last allowed CPU of a NUMA node goes offline for a
- * workqueue with a cpumask spanning multiple nodes, the workers which were
- * already executing the work items for the workqueue will lose their CPU
- * affinity and may execute on any CPU. This is similar to how per-cpu
- * workqueues behave on CPU_DOWN. If a workqueue user wants strict
- * affinity, it's the user's responsibility to flush the work item from
- * CPU_DOWN_PREPARE.
+ *
+ * If pod affinity can't be adjusted due to memory allocation failure, it falls
+ * back to @wq->dfl_pwq which may not be optimal but is always correct.
+ *
+ * Note that when the last allowed CPU of a pod goes offline for a workqueue
+ * with a cpumask spanning multiple pods, the workers which were already
+ * executing the work items for the workqueue will lose their CPU affinity and
+ * may execute on any CPU. This is similar to how per-cpu workqueues behave on
+ * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
+ * responsibility to flush the work item from CPU_DOWN_PREPARE.
*/
-static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
- bool online)
+static void wq_update_pod(struct workqueue_struct *wq, int cpu,
+ int hotplug_cpu, bool online)
{
- int node = cpu_to_node(cpu);
- int cpu_off = online ? -1 : cpu;
+ int off_cpu = online ? -1 : hotplug_cpu;
struct pool_workqueue *old_pwq = NULL, *pwq;
struct workqueue_attrs *target_attrs;
- cpumask_t *cpumask;
lockdep_assert_held(&wq_pool_mutex);
- if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) ||
- wq->unbound_attrs->no_numa)
+ if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered)
return;
/*
@@ -4290,36 +4522,29 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
* Let's use a preallocated one. The following buf is protected by
* CPU hotplug exclusion.
*/
- target_attrs = wq_update_unbound_numa_attrs_buf;
- cpumask = target_attrs->cpumask;
+ target_attrs = wq_update_pod_attrs_buf;
copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
- pwq = unbound_pwq_by_node(wq, node);
+ wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);
- /*
- * Let's determine what needs to be done. If the target cpumask is
- * different from the default pwq's, we need to compare it to @pwq's
- * and create a new one if they don't match. If the target cpumask
- * equals the default pwq's, the default pwq should be used.
- */
- if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
- if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
- return;
- } else {
- goto use_dfl_pwq;
- }
+ /* nothing to do if the target cpumask matches the current pwq */
+ wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
+ pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu),
+ lockdep_is_held(&wq_pool_mutex));
+ if (wqattrs_equal(target_attrs, pwq->pool->attrs))
+ return;
/* create a new pwq */
pwq = alloc_unbound_pwq(wq, target_attrs);
if (!pwq) {
- pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
+ pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n",
wq->name);
goto use_dfl_pwq;
}
/* Install the new pwq. */
mutex_lock(&wq->mutex);
- old_pwq = numa_pwq_tbl_install(wq, node, pwq);
+ old_pwq = install_unbound_pwq(wq, cpu, pwq);
goto out_unlock;
use_dfl_pwq:
@@ -4327,7 +4552,7 @@ use_dfl_pwq:
raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
get_pwq(wq->dfl_pwq);
raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
- old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
+ old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq);
out_unlock:
mutex_unlock(&wq->mutex);
put_pwq_unlocked(old_pwq);
@@ -4338,21 +4563,26 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
bool highpri = wq->flags & WQ_HIGHPRI;
int cpu, ret;
- if (!(wq->flags & WQ_UNBOUND)) {
- wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
- if (!wq->cpu_pwqs)
- return -ENOMEM;
+ wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
+ if (!wq->cpu_pwq)
+ goto enomem;
+ if (!(wq->flags & WQ_UNBOUND)) {
for_each_possible_cpu(cpu) {
- struct pool_workqueue *pwq =
- per_cpu_ptr(wq->cpu_pwqs, cpu);
- struct worker_pool *cpu_pools =
- per_cpu(cpu_worker_pools, cpu);
+ struct pool_workqueue **pwq_p =
+ per_cpu_ptr(wq->cpu_pwq, cpu);
+ struct worker_pool *pool =
+ &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]);
+
+ *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
+ pool->node);
+ if (!*pwq_p)
+ goto enomem;
- init_pwq(pwq, wq, &cpu_pools[highpri]);
+ init_pwq(*pwq_p, wq, pool);
mutex_lock(&wq->mutex);
- link_pwq(pwq);
+ link_pwq(*pwq_p);
mutex_unlock(&wq->mutex);
}
return 0;
@@ -4371,18 +4601,25 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
cpus_read_unlock();
return ret;
+
+enomem:
+ if (wq->cpu_pwq) {
+ for_each_possible_cpu(cpu)
+ kfree(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ free_percpu(wq->cpu_pwq);
+ wq->cpu_pwq = NULL;
+ }
+ return -ENOMEM;
}
static int wq_clamp_max_active(int max_active, unsigned int flags,
const char *name)
{
- int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
-
- if (max_active < 1 || max_active > lim)
+ if (max_active < 1 || max_active > WQ_MAX_ACTIVE)
pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
- max_active, name, 1, lim);
+ max_active, name, 1, WQ_MAX_ACTIVE);
- return clamp_val(max_active, 1, lim);
+ return clamp_val(max_active, 1, WQ_MAX_ACTIVE);
}
/*
@@ -4405,7 +4642,7 @@ static int init_rescuer(struct workqueue_struct *wq)
}
rescuer->rescue_wq = wq;
- rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
+ rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name);
if (IS_ERR(rescuer->task)) {
ret = PTR_ERR(rescuer->task);
pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe",
@@ -4426,17 +4663,15 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
unsigned int flags,
int max_active, ...)
{
- size_t tbl_size = 0;
va_list args;
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
/*
- * Unbound && max_active == 1 used to imply ordered, which is no
- * longer the case on NUMA machines due to per-node pools. While
+ * Unbound && max_active == 1 used to imply ordered, which is no longer
+ * the case on many machines due to per-pod pools. While
* alloc_ordered_workqueue() is the right way to create an ordered
- * workqueue, keep the previous behavior to avoid subtle breakages
- * on NUMA.
+ * workqueue, keep the previous behavior to avoid subtle breakages.
*/
if ((flags & WQ_UNBOUND) && max_active == 1)
flags |= __WQ_ORDERED;
@@ -4446,10 +4681,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
flags |= WQ_UNBOUND;
/* allocate wq and format name */
- if (flags & WQ_UNBOUND)
- tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
-
- wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
+ wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return NULL;
@@ -4544,7 +4776,7 @@ static bool pwq_busy(struct pool_workqueue *pwq)
void destroy_workqueue(struct workqueue_struct *wq)
{
struct pool_workqueue *pwq;
- int node;
+ int cpu;
/*
* Remove it from sysfs first so that sanity check failure doesn't
@@ -4603,33 +4835,23 @@ void destroy_workqueue(struct workqueue_struct *wq)
list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
- if (!(wq->flags & WQ_UNBOUND)) {
- wq_unregister_lockdep(wq);
- /*
- * The base ref is never dropped on per-cpu pwqs. Directly
- * schedule RCU free.
- */
- call_rcu(&wq->rcu, rcu_free_wq);
- } else {
- /*
- * We're the sole accessor of @wq at this point. Directly
- * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
- * @wq will be freed when the last pwq is released.
- */
- for_each_node(node) {
- pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
- RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
- put_pwq_unlocked(pwq);
- }
+ /*
+ * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq
+ * to put the base refs. @wq will be auto-destroyed from the last
+ * pwq_put. RCU read lock prevents @wq from going away from under us.
+ */
+ rcu_read_lock();
- /*
- * Put dfl_pwq. @wq may be freed any time after dfl_pwq is
- * put. Don't access it afterwards.
- */
- pwq = wq->dfl_pwq;
- wq->dfl_pwq = NULL;
+ for_each_possible_cpu(cpu) {
+ pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL);
put_pwq_unlocked(pwq);
}
+
+ put_pwq_unlocked(wq->dfl_pwq);
+ wq->dfl_pwq = NULL;
+
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(destroy_workqueue);
@@ -4706,10 +4928,11 @@ bool current_is_workqueue_rescuer(void)
* unreliable and only useful as advisory hints or for debugging.
*
* If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
- * Note that both per-cpu and unbound workqueues may be associated with
- * multiple pool_workqueues which have separate congested states. A
- * workqueue being congested on one CPU doesn't mean the workqueue is also
- * contested on other CPUs / NUMA nodes.
+ *
+ * With the exception of ordered workqueues, all workqueues have per-cpu
+ * pool_workqueues, each with its own congested state. A workqueue being
+ * congested on one CPU doesn't mean that the workqueue is contested on any
+ * other CPUs.
*
* Return:
* %true if congested, %false otherwise.
@@ -4725,12 +4948,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
if (cpu == WORK_CPU_UNBOUND)
cpu = smp_processor_id();
- if (!(wq->flags & WQ_UNBOUND))
- pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
- else
- pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
-
+ pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
ret = !list_empty(&pwq->inactive_works);
+
preempt_enable();
rcu_read_unlock();
@@ -5205,7 +5425,7 @@ static void unbind_workers(int cpu)
* worker blocking could lead to lengthy stalls. Kick off
* unbound chain execution of currently pending work items.
*/
- wake_up_worker(pool);
+ kick_pool(pool);
raw_spin_unlock_irq(&pool->lock);
@@ -5238,7 +5458,7 @@ static void rebind_workers(struct worker_pool *pool)
for_each_pool_worker(worker, pool) {
kthread_set_per_cpu(worker->task, pool->cpu);
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
- pool->attrs->cpumask) < 0);
+ pool_allowed_cpus(pool)) < 0);
}
raw_spin_lock_irq(&pool->lock);
@@ -5332,9 +5552,18 @@ int workqueue_online_cpu(unsigned int cpu)
mutex_unlock(&wq_pool_attach_mutex);
}
- /* update NUMA affinity of unbound workqueues */
- list_for_each_entry(wq, &workqueues, list)
- wq_update_unbound_numa(wq, cpu, true);
+ /* update pod affinity of unbound workqueues */
+ list_for_each_entry(wq, &workqueues, list) {
+ struct workqueue_attrs *attrs = wq->unbound_attrs;
+
+ if (attrs) {
+ const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
+ int tcpu;
+
+ for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
+ wq_update_pod(wq, tcpu, cpu, true);
+ }
+ }
mutex_unlock(&wq_pool_mutex);
return 0;
@@ -5350,10 +5579,19 @@ int workqueue_offline_cpu(unsigned int cpu)
unbind_workers(cpu);
- /* update NUMA affinity of unbound workqueues */
+ /* update pod affinity of unbound workqueues */
mutex_lock(&wq_pool_mutex);
- list_for_each_entry(wq, &workqueues, list)
- wq_update_unbound_numa(wq, cpu, false);
+ list_for_each_entry(wq, &workqueues, list) {
+ struct workqueue_attrs *attrs = wq->unbound_attrs;
+
+ if (attrs) {
+ const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
+ int tcpu;
+
+ for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
+ wq_update_pod(wq, tcpu, cpu, false);
+ }
+ }
mutex_unlock(&wq_pool_mutex);
return 0;
@@ -5549,8 +5787,8 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
continue;
ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
- if (!ctx) {
- ret = -ENOMEM;
+ if (IS_ERR(ctx)) {
+ ret = PTR_ERR(ctx);
break;
}
@@ -5608,21 +5846,72 @@ out_unlock:
return ret;
}
+static int parse_affn_scope(const char *val)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) {
+ if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i])))
+ return i;
+ }
+ return -EINVAL;
+}
+
+static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
+{
+ struct workqueue_struct *wq;
+ int affn, cpu;
+
+ affn = parse_affn_scope(val);
+ if (affn < 0)
+ return affn;
+ if (affn == WQ_AFFN_DFL)
+ return -EINVAL;
+
+ cpus_read_lock();
+ mutex_lock(&wq_pool_mutex);
+
+ wq_affn_dfl = affn;
+
+ list_for_each_entry(wq, &workqueues, list) {
+ for_each_online_cpu(cpu) {
+ wq_update_pod(wq, cpu, cpu, true);
+ }
+ }
+
+ mutex_unlock(&wq_pool_mutex);
+ cpus_read_unlock();
+
+ return 0;
+}
+
+static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp)
+{
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]);
+}
+
+static const struct kernel_param_ops wq_affn_dfl_ops = {
+ .set = wq_affn_dfl_set,
+ .get = wq_affn_dfl_get,
+};
+
+module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644);
+
#ifdef CONFIG_SYSFS
/*
* Workqueues with WQ_SYSFS flag set is visible to userland via
* /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
* following attributes.
*
- * per_cpu RO bool : whether the workqueue is per-cpu or unbound
- * max_active RW int : maximum number of in-flight work items
+ * per_cpu RO bool : whether the workqueue is per-cpu or unbound
+ * max_active RW int : maximum number of in-flight work items
*
* Unbound workqueues have the following extra attributes.
*
- * pool_ids RO int : the associated pool IDs for each node
- * nice RW int : nice value of the workers
- * cpumask RW mask : bitmask of allowed CPUs for the workers
- * numa RW bool : whether enable NUMA affinity
+ * nice RW int : nice value of the workers
+ * cpumask RW mask : bitmask of allowed CPUs for the workers
+ * affinity_scope RW str : worker CPU affinity scope (cache, numa, none)
+ * affinity_strict RW bool : worker CPU affinity is strict
*/
struct wq_device {
struct workqueue_struct *wq;
@@ -5675,28 +5964,6 @@ static struct attribute *wq_sysfs_attrs[] = {
};
ATTRIBUTE_GROUPS(wq_sysfs);
-static ssize_t wq_pool_ids_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- const char *delim = "";
- int node, written = 0;
-
- cpus_read_lock();
- rcu_read_lock();
- for_each_node(node) {
- written += scnprintf(buf + written, PAGE_SIZE - written,
- "%s%d:%d", delim, node,
- unbound_pwq_by_node(wq, node)->pool->id);
- delim = " ";
- }
- written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
- rcu_read_unlock();
- cpus_read_unlock();
-
- return written;
-}
-
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -5787,50 +6054,84 @@ out_unlock:
return ret ?: count;
}
-static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
- char *buf)
+static ssize_t wq_affn_scope_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
int written;
mutex_lock(&wq->mutex);
- written = scnprintf(buf, PAGE_SIZE, "%d\n",
- !wq->unbound_attrs->no_numa);
+ if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL)
+ written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
+ wq_affn_names[WQ_AFFN_DFL],
+ wq_affn_names[wq_affn_dfl]);
+ else
+ written = scnprintf(buf, PAGE_SIZE, "%s\n",
+ wq_affn_names[wq->unbound_attrs->affn_scope]);
mutex_unlock(&wq->mutex);
return written;
}
-static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t wq_affn_scope_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
struct workqueue_struct *wq = dev_to_wq(dev);
struct workqueue_attrs *attrs;
- int v, ret = -ENOMEM;
+ int affn, ret = -ENOMEM;
- apply_wqattrs_lock();
+ affn = parse_affn_scope(buf);
+ if (affn < 0)
+ return affn;
+ apply_wqattrs_lock();
attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- goto out_unlock;
-
- ret = -EINVAL;
- if (sscanf(buf, "%d", &v) == 1) {
- attrs->no_numa = !v;
+ if (attrs) {
+ attrs->affn_scope = affn;
ret = apply_workqueue_attrs_locked(wq, attrs);
}
+ apply_wqattrs_unlock();
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
-out_unlock:
+static ssize_t wq_affinity_strict_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n",
+ wq->unbound_attrs->affn_strict);
+}
+
+static ssize_t wq_affinity_strict_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int v, ret = -ENOMEM;
+
+ if (sscanf(buf, "%d", &v) != 1)
+ return -EINVAL;
+
+ apply_wqattrs_lock();
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (attrs) {
+ attrs->affn_strict = (bool)v;
+ ret = apply_workqueue_attrs_locked(wq, attrs);
+ }
apply_wqattrs_unlock();
free_workqueue_attrs(attrs);
return ret ?: count;
}
static struct device_attribute wq_sysfs_unbound_attrs[] = {
- __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
- __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+ __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store),
+ __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store),
__ATTR_NULL,
};
@@ -6196,62 +6497,19 @@ static inline void wq_watchdog_init(void) { }
#endif /* CONFIG_WQ_WATCHDOG */
-static void __init wq_numa_init(void)
-{
- cpumask_var_t *tbl;
- int node, cpu;
-
- if (num_possible_nodes() <= 1)
- return;
-
- if (wq_disable_numa) {
- pr_info("workqueue: NUMA affinity support disabled\n");
- return;
- }
-
- for_each_possible_cpu(cpu) {
- if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) {
- pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
- return;
- }
- }
-
- wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs();
- BUG_ON(!wq_update_unbound_numa_attrs_buf);
-
- /*
- * We want masks of possible CPUs of each node which isn't readily
- * available. Build one from cpu_to_node() which should have been
- * fully initialized by now.
- */
- tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL);
- BUG_ON(!tbl);
-
- for_each_node(node)
- BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
- node_online(node) ? node : NUMA_NO_NODE));
-
- for_each_possible_cpu(cpu) {
- node = cpu_to_node(cpu);
- cpumask_set_cpu(cpu, tbl[node]);
- }
-
- wq_numa_possible_cpumask = tbl;
- wq_numa_enabled = true;
-}
-
/**
* workqueue_init_early - early init for workqueue subsystem
*
- * This is the first half of two-staged workqueue subsystem initialization
- * and invoked as soon as the bare basics - memory allocation, cpumasks and
- * idr are up. It sets up all the data structures and system workqueues
- * and allows early boot code to create workqueues and queue/cancel work
- * items. Actual work item execution starts only after kthreads can be
- * created and scheduled right before early initcalls.
+ * This is the first step of three-staged workqueue subsystem initialization and
+ * invoked as soon as the bare basics - memory allocation, cpumasks and idr are
+ * up. It sets up all the data structures and system workqueues and allows early
+ * boot code to create workqueues and queue/cancel work items. Actual work item
+ * execution starts only after kthreads can be created and scheduled right
+ * before early initcalls.
*/
void __init workqueue_init_early(void)
{
+ struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
int i, cpu;
@@ -6261,8 +6519,30 @@ void __init workqueue_init_early(void)
cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ if (!cpumask_empty(&wq_cmdline_cpumask))
+ cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask);
+
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
+ wq_update_pod_attrs_buf = alloc_workqueue_attrs();
+ BUG_ON(!wq_update_pod_attrs_buf);
+
+ /* initialize WQ_AFFN_SYSTEM pods */
+ pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
+ pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
+ pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
+ BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);
+
+ BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));
+
+ wq_update_pod_attrs_buf = alloc_workqueue_attrs();
+ BUG_ON(!wq_update_pod_attrs_buf);
+
+ pt->nr_pods = 1;
+ cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
+ pt->pod_node[0] = NUMA_NO_NODE;
+ pt->cpu_pod[0] = 0;
+
/* initialize CPU pools */
for_each_possible_cpu(cpu) {
struct worker_pool *pool;
@@ -6272,7 +6552,9 @@ void __init workqueue_init_early(void)
BUG_ON(init_worker_pool(pool));
pool->cpu = cpu;
cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
+ cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu));
pool->attrs->nice = std_nice[i++];
+ pool->attrs->affn_strict = true;
pool->node = cpu_to_node(cpu);
/* alloc pool ID */
@@ -6293,11 +6575,10 @@ void __init workqueue_init_early(void)
/*
* An ordered wq should have only one pwq as ordering is
* guaranteed by max_active which is enforced by pwqs.
- * Turn off NUMA so that dfl_pwq is used for all nodes.
*/
BUG_ON(!(attrs = alloc_workqueue_attrs()));
attrs->nice = std_nice[i];
- attrs->no_numa = true;
+ attrs->ordered = true;
ordered_wq_attrs[i] = attrs;
}
@@ -6305,7 +6586,7 @@ void __init workqueue_init_early(void)
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
- WQ_UNBOUND_MAX_ACTIVE);
+ WQ_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
@@ -6319,14 +6600,53 @@ void __init workqueue_init_early(void)
!system_freezable_power_efficient_wq);
}
+static void __init wq_cpu_intensive_thresh_init(void)
+{
+ unsigned long thresh;
+ unsigned long bogo;
+
+ /* if the user set it to a specific value, keep it */
+ if (wq_cpu_intensive_thresh_us != ULONG_MAX)
+ return;
+
+ pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
+ BUG_ON(IS_ERR(pwq_release_worker));
+
+ /*
+ * The default of 10ms is derived from the fact that most modern (as of
+ * 2023) processors can do a lot in 10ms and that it's just below what
+ * most consider human-perceivable. However, the kernel also runs on a
+ * lot slower CPUs including microcontrollers where the threshold is way
+ * too low.
+ *
+ * Let's scale up the threshold upto 1 second if BogoMips is below 4000.
+ * This is by no means accurate but it doesn't have to be. The mechanism
+ * is still useful even when the threshold is fully scaled up. Also, as
+ * the reports would usually be applicable to everyone, some machines
+ * operating on longer thresholds won't significantly diminish their
+ * usefulness.
+ */
+ thresh = 10 * USEC_PER_MSEC;
+
+ /* see init/calibrate.c for lpj -> BogoMIPS calculation */
+ bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1);
+ if (bogo < 4000)
+ thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC);
+
+ pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",
+ loops_per_jiffy, bogo, thresh);
+
+ wq_cpu_intensive_thresh_us = thresh;
+}
+
/**
* workqueue_init - bring workqueue subsystem fully online
*
- * This is the latter half of two-staged workqueue subsystem initialization
- * and invoked as soon as kthreads can be created and scheduled.
- * Workqueues have been created and work items queued on them, but there
- * are no kworkers executing the work items yet. Populate the worker pools
- * with the initial workers and enable future kworker creations.
+ * This is the second step of three-staged workqueue subsystem initialization
+ * and invoked as soon as kthreads can be created and scheduled. Workqueues have
+ * been created and work items queued on them, but there are no kworkers
+ * executing the work items yet. Populate the worker pools with the initial
+ * workers and enable future kworker creations.
*/
void __init workqueue_init(void)
{
@@ -6334,19 +6654,14 @@ void __init workqueue_init(void)
struct worker_pool *pool;
int cpu, bkt;
- /*
- * It'd be simpler to initialize NUMA in workqueue_init_early() but
- * CPU to node mapping may not be available that early on some
- * archs such as power and arm64. As per-cpu pools created
- * previously could be missing node hint and unbound pools NUMA
- * affinity, fix them up.
- *
- * Also, while iterating workqueues, create rescuers if requested.
- */
- wq_numa_init();
+ wq_cpu_intensive_thresh_init();
mutex_lock(&wq_pool_mutex);
+ /*
+ * Per-cpu pools created earlier could be missing node hint. Fix them
+ * up. Also, create a rescuer for workqueues that requested it.
+ */
for_each_possible_cpu(cpu) {
for_each_cpu_worker_pool(pool, cpu) {
pool->node = cpu_to_node(cpu);
@@ -6354,7 +6669,6 @@ void __init workqueue_init(void)
}
list_for_each_entry(wq, &workqueues, list) {
- wq_update_unbound_numa(wq, smp_processor_id(), true);
WARN(init_rescuer(wq),
"workqueue: failed to create early rescuer for %s",
wq->name);
@@ -6378,9 +6692,114 @@ void __init workqueue_init(void)
}
/*
- * Despite the naming, this is a no-op function which is here only for avoiding
- * link error. Since compile-time warning may fail to catch, we will need to
- * emit run-time warning from __flush_workqueue().
+ * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to
+ * @cpu_shares_pod(). Each subset of CPUs that share a pod is assigned a unique
+ * and consecutive pod ID. The rest of @pt is initialized accordingly.
+ */
+static void __init init_pod_type(struct wq_pod_type *pt,
+ bool (*cpus_share_pod)(int, int))
+{
+ int cur, pre, cpu, pod;
+
+ pt->nr_pods = 0;
+
+ /* init @pt->cpu_pod[] according to @cpus_share_pod() */
+ pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
+ BUG_ON(!pt->cpu_pod);
+
+ for_each_possible_cpu(cur) {
+ for_each_possible_cpu(pre) {
+ if (pre >= cur) {
+ pt->cpu_pod[cur] = pt->nr_pods++;
+ break;
+ }
+ if (cpus_share_pod(cur, pre)) {
+ pt->cpu_pod[cur] = pt->cpu_pod[pre];
+ break;
+ }
+ }
+ }
+
+ /* init the rest to match @pt->cpu_pod[] */
+ pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
+ pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL);
+ BUG_ON(!pt->pod_cpus || !pt->pod_node);
+
+ for (pod = 0; pod < pt->nr_pods; pod++)
+ BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL));
+
+ for_each_possible_cpu(cpu) {
+ cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]);
+ pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu);
+ }
+}
+
+static bool __init cpus_dont_share(int cpu0, int cpu1)
+{
+ return false;
+}
+
+static bool __init cpus_share_smt(int cpu0, int cpu1)
+{
+#ifdef CONFIG_SCHED_SMT
+ return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));
+#else
+ return false;
+#endif
+}
+
+static bool __init cpus_share_numa(int cpu0, int cpu1)
+{
+ return cpu_to_node(cpu0) == cpu_to_node(cpu1);
+}
+
+/**
+ * workqueue_init_topology - initialize CPU pods for unbound workqueues
+ *
+ * This is the third step of there-staged workqueue subsystem initialization and
+ * invoked after SMP and topology information are fully initialized. It
+ * initializes the unbound CPU pods accordingly.
*/
-void __warn_flushing_systemwide_wq(void) { }
+void __init workqueue_init_topology(void)
+{
+ struct workqueue_struct *wq;
+ int cpu;
+
+ init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
+ init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
+ init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
+ init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);
+
+ mutex_lock(&wq_pool_mutex);
+
+ /*
+ * Workqueues allocated earlier would have all CPUs sharing the default
+ * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU
+ * combinations to apply per-pod sharing.
+ */
+ list_for_each_entry(wq, &workqueues, list) {
+ for_each_online_cpu(cpu) {
+ wq_update_pod(wq, cpu, cpu, true);
+ }
+ }
+
+ mutex_unlock(&wq_pool_mutex);
+}
+
+void __warn_flushing_systemwide_wq(void)
+{
+ pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n");
+ dump_stack();
+}
EXPORT_SYMBOL(__warn_flushing_systemwide_wq);
+
+static int __init workqueue_unbound_cpus_setup(char *str)
+{
+ if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) {
+ cpumask_clear(&wq_cmdline_cpumask);
+ pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n");
+ }
+
+ return 1;
+}
+__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index e00b1204a8e9..f6275944ada7 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -28,13 +28,18 @@ struct worker {
struct hlist_node hentry; /* L: while busy */
};
- struct work_struct *current_work; /* L: work being processed */
- work_func_t current_func; /* L: current_work's fn */
- struct pool_workqueue *current_pwq; /* L: current_work's pwq */
- unsigned int current_color; /* L: current_work's color */
- struct list_head scheduled; /* L: scheduled works */
+ struct work_struct *current_work; /* K: work being processed and its */
+ work_func_t current_func; /* K: function */
+ struct pool_workqueue *current_pwq; /* K: pwq */
+ u64 current_at; /* K: runtime at start or last wakeup */
+ unsigned int current_color; /* K: color */
+
+ int sleeping; /* S: is worker sleeping? */
- /* 64 bytes boundary on 64bit, 32 on 32bit */
+ /* used by the scheduler to determine a worker's last known identity */
+ work_func_t last_func; /* K: last work's fn */
+
+ struct list_head scheduled; /* L: scheduled works */
struct task_struct *task; /* I: worker task */
struct worker_pool *pool; /* A: the associated pool */
@@ -42,10 +47,9 @@ struct worker {
struct list_head node; /* A: anchored at pool->workers */
/* A: runs through worker->node */
- unsigned long last_active; /* L: last active timestamp */
- unsigned int flags; /* X: flags */
+ unsigned long last_active; /* K: last active timestamp */
+ unsigned int flags; /* L: flags */
int id; /* I: worker id */
- int sleeping; /* None */
/*
* Opaque string set with work_set_desc(). Printed out with task
@@ -55,9 +59,6 @@ struct worker {
/* used only by rescuers to point to the target workqueue */
struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
-
- /* used by the scheduler to determine a worker's last known identity */
- work_func_t last_func;
};
/**
@@ -76,6 +77,7 @@ static inline struct worker *current_wq_worker(void)
*/
void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task);
+void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */